rdig 0.1.0
- data/CHANGES +2 -0
- data/LICENSE +20 -0
- data/README +61 -0
- data/TODO +0 -0
- data/bin/rdig +32 -0
- data/doc/examples/config.rb +53 -0
- data/install.rb +89 -0
- data/lib/htmlentities/CHANGES +21 -0
- data/lib/htmlentities/COPYING +7 -0
- data/lib/htmlentities/README +15 -0
- data/lib/htmlentities/htmlentities.rb +281 -0
- data/lib/rdig.rb +243 -0
- data/lib/rdig/content_extractors.rb +145 -0
- data/lib/rdig/crawler.rb +176 -0
- data/lib/rdig/highlight.rb +24 -0
- data/lib/rdig/http_client.rb +22 -0
- data/lib/rdig/index.rb +39 -0
- data/lib/rdig/search.rb +77 -0
- data/lib/rdig/url_filters.rb +171 -0
- data/rakefile +325 -0
- data/test/fixtures/html/custom_tag_selectors.html +25 -0
- data/test/fixtures/html/entities.html +15 -0
- data/test/fixtures/html/simple.html +17 -0
- data/test/test_helper.rb +18 -0
- data/test/unit/etag_filter_test.rb +23 -0
- data/test/unit/html_content_extractor_test.rb +64 -0
- data/test/unit/url_filters_test.rb +96 -0
- metadata +102 -0
data/lib/rdig.rb
ADDED
@@ -0,0 +1,243 @@
#!/usr/bin/env ruby

#--
# Copyright (c) 2006 Jens Kraemer
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#++
#

RDIGVERSION = '0.1.0'

require 'thread'
require 'thwait'
require 'singleton'
require 'monitor'
require 'ostruct'
require 'uri'
require 'cgi'
require 'set'
require 'net/http'
require 'getoptlong'

begin
  require 'rubyful_soup'
  require 'ferret'
rescue LoadError
  require 'rubygems'
  require 'rubyful_soup'
  require 'ferret'
end

require 'htmlentities/htmlentities'

require 'rdig/http_client'
require 'rdig/content_extractors'
require 'rdig/url_filters'
require 'rdig/search'
require 'rdig/index'
require 'rdig/crawler'

$KCODE = 'u'
require 'jcode'

# See README for basic usage information
module RDig

  class << self

    # the filter chain each URL has to run through before being crawled.
    def filter_chain
      @filter_chain ||= [
        { :maximum_redirect_filter => :max_redirects },
        :fix_relative_uri,
        :normalize_uri,
        { :hostname_filter => :include_hosts },
        { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
        { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
        RDig::UrlFilters::VisitedUrlFilter
      ]
    end

    def application
      @application ||= Application.new
    end

    def searcher
      @searcher ||= Search::Searcher.new(config.ferret)
    end

    # RDig configuration
    #
    # may be used with a block:
    #   RDig.configuration do |config| ...
    #
    # see doc/examples/config.rb for a commented example configuration
    def configuration
      if block_given?
        yield configuration
      else
        @config ||= OpenStruct.new(
          :crawler => OpenStruct.new(
            :start_urls        => [ "http://localhost:3000/" ],
            :include_hosts     => [ "localhost" ],
            :include_documents => nil,
            :exclude_documents => nil,
            :index_document    => nil,
            :num_threads       => 2,
            :max_redirects     => 5,
            :wait_before_leave => 10
          ),
          :content_extraction => OpenStruct.new(
            # settings for html content extraction
            :html => OpenStruct.new(
              # select the html element that contains the content to index
              # by default, we index everything inside the body tag:
              :content_tag_selector => lambda { |tagsoup|
                tagsoup.html.body
              },
              # select the html element containing the title
              :title_tag_selector => lambda { |tagsoup|
                tagsoup.html.head.title
              }
            )
          ),
          :ferret => OpenStruct.new(
            :path                => "index/",
            :create              => true,
            :handle_parse_errors => true,
            :analyzer            => Ferret::Analysis::StandardAnalyzer.new,
            :occur_default       => Ferret::Search::BooleanClause::Occur::MUST
          )
        )
      end
    end
    alias config configuration

  end

  class Application

    OPTIONS = [
      ['--config',  '-c', GetoptLong::REQUIRED_ARGUMENT,
        "Read application configuration from CONFIG."],
      ['--help',    '-h', GetoptLong::NO_ARGUMENT,
        "Display this help message."],
      ['--query',   '-q', GetoptLong::REQUIRED_ARGUMENT,
        "Execute QUERY."],
      ['--version', '-v', GetoptLong::NO_ARGUMENT,
        "Display the program version."],
    ]

    # Application options from the command line
    def options
      @options ||= OpenStruct.new
    end

    # Display the program usage line.
    def usage
      puts "rdig -c configfile {options}"
    end

    # Display the rdig command line help.
    def help
      usage
      puts
      puts "Options are ..."
      puts
      OPTIONS.sort.each do |long, short, mode, desc|
        if mode == GetoptLong::REQUIRED_ARGUMENT
          if desc =~ /\b([A-Z]{2,})\b/
            long = long + "=#{$1}"
          end
        end
        printf "  %-20s (%s)\n", long, short
        printf "      %s\n", desc
      end
    end

    # Return a list of the command line options supported by the
    # program.
    def command_line_options
      OPTIONS.collect { |lst| lst[0..-2] }
    end

    # Handle the option given by +opt+ and +value+.
    def do_option(opt, value)
      case opt
      when '--help'
        help
        exit
      when '--config'
        options.config_file = value
      when '--query'
        options.query = value
      when '--version'
        puts "rdig, version #{RDIGVERSION}"
        exit
      else
        fail "Unknown option: #{opt}"
      end
    end

    # Read and handle the command line options.
    def handle_options
      opts = GetoptLong.new(*command_line_options)
      opts.each { |opt, value| do_option(opt, value) }
    end

    # Load the configuration file given on the command line.
    def load_configfile
      load File.expand_path(options.config_file)
    end

    # Run the +rdig+ application.
    def run
      handle_options
      begin
        load_configfile
      rescue
        puts $!.backtrace
        fail "No config file found!\n#{$!}"
      end

      if options.query
        # query the index
        puts "executing query >#{options.query}<"
        results = RDig.searcher.search(options.query)
        puts "total results: #{results[:hitcount]}"
        results[:list].each { |result|
          puts <<-EOF
#{result[:url]}
#{result[:title]}
#{result[:extract]}

          EOF
        }
      else
        # rebuild the index
        @crawler = Crawler.new
        @crawler.run
      end
    end
  end
end
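
Since RDig.configuration yields the same OpenStruct that holds the defaults above, a config file passed via --config only needs to override what differs. A minimal sketch, with a made-up host and index path (see doc/examples/config.rb for the commented reference configuration):

  # hypothetical config.rb -- the host and the index path are placeholders
  RDig.configuration do |cfg|
    cfg.crawler.start_urls    = [ 'http://www.example.com/' ]
    cfg.crawler.include_hosts = [ 'www.example.com' ]
    cfg.crawler.num_threads   = 4
    cfg.ferret.path           = 'example-index/'
  end

With such a file in place, "rdig -c config.rb" crawls and indexes the configured site, while "rdig -c config.rb -q 'some term'" queries the resulting index instead of rebuilding it.
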
data/lib/rdig/content_extractors.rb
ADDED
@@ -0,0 +1,145 @@
# override some methods concerned with entity resolving
# to convert entities to strings
class BeautifulStoneSoup
  # resolve unknown html entities using the htmlentities lib
  alias :orig_unknown_entityref :unknown_entityref
  def unknown_entityref(ref)
    if HTMLEntities::MAP.has_key?(ref)
      handle_data [HTMLEntities::MAP[ref]].pack('U')
    else
      orig_unknown_entityref ref
    end
  end

  # resolve numeric entities to utf8.
  # hexadecimal references are handled first so the decimal pass
  # cannot misinterpret the digits inside an x... reference.
  def handle_charref(ref)
    handle_data( ref.gsub(/^x([0-9a-f]{1,6})/i) {
      [$1.to_i(16)].pack('U')
    }.gsub(/^([0-9]{1,7})/) {
      [$1.to_i].pack('U')
    } )
  end
end

module RDig

  # Contains classes which are used for extracting content and meta data from
  # various content types.
  #
  # TODO: support at least pdf, too.
  module ContentExtractors

    # process the given +content+ depending on its +content_type+.
    def ContentExtractors.process(content, content_type)
      case content_type
      when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
        return HtmlContentExtractor.process(content)
      else
        puts "unable to handle content type #{content_type}"
      end
      return nil
    end

    # extracts title, content and links from html documents
    class HtmlContentExtractor

      # returns:
      # { :content => 'extracted clear text',
      #   :title   => 'Title',
      #   :links   => [array of urls] }
      def self.process(content)
        result = { }
        tag_soup = BeautifulSoup.new(content)
        result[:title]   = extract_title(tag_soup)
        result[:links]   = extract_links(tag_soup)
        result[:content] = extract_content(tag_soup)
        return result
      end

      # Extracts textual content from the HTML tree.
      #
      # - First, the root element to use is determined using the
      #   +content_element+ method, which itself uses the content_tag_selector
      #   from RDig.configuration.
      # - Then, this element is processed by +extract_text+, which will give
      #   all textual content contained in the root element and all its
      #   children.
      def self.extract_content(tag_soup)
        content = ''
        content_element(tag_soup).children { |child|
          extract_text(child, content)
        }
        return content.strip
      end

      # extracts the href attributes of all a tags, except
      # internal links like <a href="#top">
      def self.extract_links(tagsoup)
        tagsoup.find_all('a').map { |link|
          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
        }.compact
      end

      # Extracts the title from the given html tree
      def self.extract_title(tagsoup)
        the_title_tag = title_tag(tagsoup)
        if the_title_tag.is_a? String
          the_title_tag
        else
          extract_text(the_title_tag).strip if the_title_tag
        end
      end

      # Recursively extracts all text contained in the given element,
      # and appends it to content.
      def self.extract_text(element, content='')
        if element.is_a? NavigableString
          value = strip_comments(element)
          value.strip!
          unless value.empty?
            content << value
            content << ' '
          end
        elsif element.string # it's a Tag, and it has some content string
          value = element.string.strip
          unless value.empty?
            content << value
            content << ' '
          end
        else
          element.children { |child|
            extract_text(child, content)
          }
        end
      end

      # Returns the element to extract the title from.
      #
      # This may also return a string, e.g. an attribute value selected from
      # a meta tag.
      def self.title_tag(tagsoup)
        if RDig.config.content_extraction.html.title_tag_selector
          RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
        else
          tagsoup.html.head.title
        end
      end

      # Retrieve the root element to extract document content from
      def self.content_element(tagsoup)
        if RDig.config.content_extraction.html.content_tag_selector
          RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
        else
          tagsoup.html.body
        end
      end

      # Return the given string minus all html comments
      def self.strip_comments(string)
        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
      end
    end

  end
end
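
To make the result shape concrete, a small usage sketch (assuming rubyful_soup is installed and the default tag selectors are in effect; the HTML snippet and expected values are illustrative, not taken from the gem's test fixtures):

  require 'rdig'

  # made-up document for illustration
  html = '<html><head><title>Hi</title></head>' +
         '<body><h1>Hello</h1><a href="/more.html">more</a></body></html>'

  result = RDig::ContentExtractors.process(html, 'text/html')
  result[:title]    # => "Hi"
  result[:links]    # => ["/more.html"]
  result[:content]  # => "Hello more"

Content types other than html/xml fall through to nil, which the crawler below treats as "nothing to index".
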
data/lib/rdig/crawler.rb
ADDED
@@ -0,0 +1,176 @@
module RDig


  class Crawler

    def initialize
      @documents = Queue.new
      @etag_filter = ETagFilter.new
    end

    def run
      @indexer = Index::Indexer.new(RDig.config.ferret)
      filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
      RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }

      num_threads = RDig.config.crawler.num_threads
      group = ThreadsWait.new
      num_threads.times { |i|
        group.join_nowait Thread.new("fetcher #{i}") {
          filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
          while (doc = @documents.pop) != :exit
            process_document doc, filterchain
          end
        }
      }

      # dilemma: suppose we have 1 start url and two threads t1 and t2:
      # t1 pops the start url from the queue, which now is empty.
      # as the queue is empty now, t2 blocks until t1 adds the links
      # retrieved from its document.
      #
      # But we also need the 'queue empty' condition as a sign for us to
      # stop waiting for new entries.

      # check every now and then for an empty queue
      sleep_interval = RDig.config.crawler.wait_before_leave
      begin
        sleep sleep_interval
      end until @documents.empty?
      # nothing to do any more, tell the threads to exit
      num_threads.times { @documents << :exit }

      puts "waiting for threads to finish..."
      group.all_waits
    ensure
      @indexer.close if @indexer
    end

    def process_document(doc, filterchain)
      doc.fetch
      return unless @etag_filter.apply(doc)
      case doc.status
      when :success
        if doc.content
          # add links from this document to the queue
          if doc.content[:links]
            doc.content[:links].each { |url| add_url(url, filterchain, doc) }
          end
          @indexer << doc
        #else
          #puts "success but no content: #{doc.uri.to_s}"
        end
      when :redirect
        # links contains the url we were redirected to
        doc.content[:links].each { |url| add_url(url, filterchain, doc) }
      end
    rescue
      puts "error processing document #{doc.uri.to_s}: #{$!}"
    end


    # pipes a new document pointing to url through the filter chain;
    # if it survives that, it gets added to the documents queue for further
    # processing
    def add_url(url, filterchain, referring_document = nil)
      return if url.nil? || url.empty?
      if referring_document
        doc = Document.new(url, referring_document.uri)
        # keep the redirect count
        if referring_document.status == :redirect
          doc.redirections = referring_document.redirections + 1
        end
      else
        doc = Document.new(url)
      end

      doc = filterchain.apply(doc)

      if doc
        puts "added url #{url}"
      #else
        #puts "skipping url #{url}"
      end
      @documents << doc if doc
    end

  end


  class Document
    include HttpClient

    attr_reader :content
    attr_reader :content_type
    attr_reader :uri
    attr_reader :referring_uri
    attr_reader :status
    attr_reader :etag
    attr_accessor :redirections

    # url: url of this document, may be relative to the referring doc or host.
    # referrer: uri of the document we retrieved this link from
    def initialize(url, referrer = nil)
      @redirections = 0
      begin
        @uri = URI.parse(url)
      rescue URI::InvalidURIError
        raise "Cannot create document using invalid URL: #{url}"
      end
      @referring_uri = referrer
    end

    def has_content?
      !self.content.nil?
    end

    def title; @content[:title] end
    def body;  @content[:content] end
    def url;   @uri.to_s end

    def fetch
      puts "fetching #{@uri.to_s}"
      response = do_get(@uri)
      case response
      when Net::HTTPSuccess
        @content_type = response['content-type']
        @raw_body = response.body
        @etag = response['etag']
        # todo externalize this (another chain?)
        @content = ContentExtractors.process(@raw_body, @content_type)
        @status = :success
      when Net::HTTPRedirection
        @status = :redirect
        @content = { :links => [ response['location'] ] }
      else
        puts "don't know what to do with response: #{response}"
      end
    end

  end

  # Checks fetched documents' ETag headers against the set of ETags
  # of the documents already indexed.
  # This is supposed to help against double-indexing documents which can
  # be reached via different URLs (think http://host.com/ and
  # http://host.com/index.html ).
  # Documents without an ETag are allowed to pass through.
  class ETagFilter
    include MonitorMixin

    def initialize
      @etags = Set.new
      super
    end

    def apply(document)
      return document unless document.etag
      synchronize do
        @etags.add?(document.etag) ? document : nil
      end
    end
  end

end
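
The ETagFilter contract is small enough to exercise on its own. A minimal sketch; Doc is a hypothetical stand-in for anything responding to #etag, not a class shipped with the gem:

  require 'rdig'

  Doc = Struct.new(:etag)            # hypothetical test double
  filter = RDig::ETagFilter.new

  filter.apply(Doc.new('"abc123"'))  # => the document (ETag seen for the first time)
  filter.apply(Doc.new('"abc123"'))  # => nil (duplicate ETag, would be skipped)
  filter.apply(Doc.new(nil))         # => the document (no ETag always passes)
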