rdig 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rdig.rb ADDED
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #--
4
+ # Copyright (c) 2006 Jens Kraemer
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining
7
+ # a copy of this software and associated documentation files (the
8
+ # "Software"), to deal in the Software without restriction, including
9
+ # without limitation the rights to use, copy, modify, merge, publish,
10
+ # distribute, sublicense, and/or sell copies of the Software, and to
11
+ # permit persons to whom the Software is furnished to do so, subject to
12
+ # the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be
15
+ # included in all copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
+ #++
25
+ #
26
+
27
+ RDIGVERSION = '0.1.0'
28
+
29
+
30
+ require 'thread'
31
+ require 'thwait'
32
+ require 'singleton'
33
+ require 'monitor'
34
+ require 'ostruct'
35
+ require 'uri'
36
+ require 'cgi'
37
+ require 'set'
38
+ require 'net/http'
39
+ require 'getoptlong'
40
+
41
+ begin
42
+ require 'rubyful_soup'
43
+ require 'ferret'
44
+ rescue LoadError
45
+ require 'rubygems'
46
+ require 'rubyful_soup'
47
+ require 'ferret'
48
+ end
49
+
50
+ require 'htmlentities/htmlentities'
51
+
52
+ require 'rdig/http_client'
53
+ require 'rdig/content_extractors'
54
+ require 'rdig/url_filters'
55
+ require 'rdig/search'
56
+ require 'rdig/index'
57
+ require 'rdig/crawler'
58
+
59
+ $KCODE = 'u'
60
+ require 'jcode'
61
+
62
+ # See README for basic usage information
63
+ module RDig
64
+
65
class << self

  # The chain of filters every discovered URL has to pass before being
  # crawled. Entries are filter specs understood by
  # UrlFilters::FilterChain: a bare symbol or class names a filter; a
  # single-entry hash maps a filter to the crawler configuration field
  # that holds its argument.
  def filter_chain
    @filter_chain ||= [
      { :maximum_redirect_filter => :max_redirects },
      :fix_relative_uri,
      :normalize_uri,
      { :hostname_filter => :include_hosts },
      { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
      { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
      RDig::UrlFilters::VisitedUrlFilter
    ]
  end

  # Lazily created command line application instance.
  def application
    @application ||= Application.new
  end

  # Lazily created Searcher, configured from the ferret section of the
  # configuration.
  def searcher
    @searcher ||= Search::Searcher.new(config.ferret)
  end

  # RDig configuration.
  #
  # May be used with a block:
  #   RDig.configuration do |config| ...
  #
  # See doc/examples/config.rb for a commented example configuration.
  def configuration
    if block_given?
      yield configuration
    else
      @config ||= default_configuration
    end
  end
  alias config configuration

  private

  # Builds the tree of configuration defaults. Field semantics are
  # documented in doc/examples/config.rb.
  def default_configuration
    crawler_defaults = OpenStruct.new(
      :start_urls        => [ "http://localhost:3000/" ],
      :include_hosts     => [ "localhost" ],
      :include_documents => nil,
      :exclude_documents => nil,
      :index_document    => nil,
      :num_threads       => 2,
      :max_redirects     => 5,
      :wait_before_leave => 10
    )
    # settings for html content extraction
    html_defaults = OpenStruct.new(
      # selects the html element that contains the content to index;
      # by default everything inside the body tag is indexed
      :content_tag_selector => lambda { |tagsoup| tagsoup.html.body },
      # selects the html element containing the title
      :title_tag_selector   => lambda { |tagsoup| tagsoup.html.head.title }
    )
    ferret_defaults = OpenStruct.new(
      :path                => "index/",
      :create              => true,
      :handle_parse_errors => true,
      :analyzer            => Ferret::Analysis::StandardAnalyzer.new,
      :occur_default       => Ferret::Search::BooleanClause::Occur::MUST
    )
    OpenStruct.new(
      :crawler            => crawler_defaults,
      :content_extraction => OpenStruct.new(:html => html_defaults),
      :ferret             => ferret_defaults
    )
  end

end
136
+
137
# Command line front end: parses options, loads the user supplied
# configuration file and either runs a query against the index or
# rebuilds the index by crawling.
class Application

  # [long flag, short flag, GetoptLong mode, description]
  OPTIONS = [
    ['--config', '-c', GetoptLong::REQUIRED_ARGUMENT,
      "Read application configuration from CONFIG."],
    ['--help', '-h', GetoptLong::NO_ARGUMENT,
      "Display this help message."],
    ['--query', '-q', GetoptLong::REQUIRED_ARGUMENT,
      "Execute QUERY."],
    ['--version', '-v', GetoptLong::NO_ARGUMENT,
      "Display the program version."],
  ].freeze

  # Application options from the command line (an OpenStruct with
  # config_file and query fields, filled by do_option).
  def options
    @options ||= OpenStruct.new
  end

  # Display the program usage line.
  def usage
    puts "rdig -c configfile {options}"
  end

  # Display the command line help.
  def help
    usage
    puts
    puts "Options are ..."
    puts
    OPTIONS.sort.each do |long, short, mode, desc|
      if mode == GetoptLong::REQUIRED_ARGUMENT
        # append the argument placeholder (the upper-case word in the
        # description) to the long flag, e.g. "--config=CONFIG"
        if desc =~ /\b([A-Z]{2,})\b/
          long = long + "=#{$1}"
        end
      end
      printf "  %-20s (%s)\n", long, short
      printf "      %s\n", desc
    end
  end

  # Return a list of the command line options supported by the
  # program, in the [long, short, mode] triple form GetoptLong expects.
  def command_line_options
    OPTIONS.collect { |lst| lst[0..-2] }
  end

  # Handle a single parsed option +opt+ with its +value+.
  # Raises (via fail) on unknown options.
  def do_option(opt, value)
    case opt
    when '--help'
      help
      exit
    when '--config'
      options.config_file = value
    when '--query'
      options.query = value
    when '--version'
      puts "rdig, version #{RDIGVERSION}"
      exit
    else
      fail "Unknown option: #{opt}"
    end
  end

  # Read and handle the command line options.
  def handle_options
    opts = GetoptLong.new(*command_line_options)
    opts.each { |opt, value| do_option(opt, value) }
  end

  # Load the configuration file given on the command line.
  def load_configfile
    load File.expand_path(options.config_file)
  end

  # Run the +rdig+ application: with --query, search the existing
  # index and print the results; otherwise rebuild the index.
  def run
    handle_options
    begin
      load_configfile
    rescue
      puts $!.backtrace
      fail "No Configfile found!\n#{$!}"
    end

    if options.query
      # query the index
      puts "executing query >#{options.query}<"
      results = RDig.searcher.search(options.query)
      puts "total results: #{results[:hitcount]}"
      results[:list].each { |result|
        puts <<-EOF
  #{result[:url]}
    #{result[:title]}
    #{result[:extract]}

        EOF
      }
    else
      # rebuild index
      @crawler = Crawler.new
      @crawler.run
    end
  end
end
243
+ end
@@ -0,0 +1,145 @@
1
# Override some methods concerned with entity resolving
# to convert entities to plain (UTF-8) strings while parsing.
class BeautifulStoneSoup
  # resolve unknown named html entities using the htmlentities lib
  alias :orig_unknown_entityref :unknown_entityref
  def unknown_entityref(ref)
    if HTMLEntities::MAP.has_key?(ref)
      handle_data [HTMLEntities::MAP[ref]].pack('U')
    else
      orig_unknown_entityref ref
    end
  end

  # Resolve numeric character references to UTF-8.
  # +ref+ arrives without the leading '#', so decimal refs look like
  # "8217" and hex refs like "x2019".
  #
  # NOTE(review): the previous implementation applied the decimal gsub
  # first, which consumed the digits inside hex references
  # ("x27" became "x" + pack(27)) and garbled every hex charref.
  # The hex form must be checked first.
  def handle_charref(ref)
    if ref =~ /\Ax([0-9a-f]{1,6})\z/i
      handle_data [$1.to_i(16)].pack('U')
    elsif ref =~ /\A[0-9]{1,7}\z/
      handle_data [ref.to_i].pack('U')
    else
      # malformed reference: pass it through untouched
      handle_data ref
    end
  end
end
23
+
24
+ module RDig
25
+
26
+ # Contains Classes which are used for extracting content and meta data from
27
+ # various content types.
28
+ #
29
+ # TODO: support at least pdf, too.
30
+ module ContentExtractors
31
+
32
# Dispatch the given +content+ to the extractor responsible for its
# +content_type+. Returns the extractor's result hash, or nil when no
# extractor is registered for that type.
def ContentExtractors.process(content, content_type)
  if content_type =~ /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
    HtmlContentExtractor.process(content)
  else
    puts "unable to handle content type #{content_type}"
    nil
  end
end
42
+
43
# Extracts title, content and links from html documents.
#
# The elements to take title and content from are configurable via
# RDig.configuration.content_extraction.html (content_tag_selector and
# title_tag_selector).
class HtmlContentExtractor

  # matches html comments, across line breaks
  COMMENT_RE = /<!--.*?-->/m

  # Process the given html document.
  #
  # returns:
  # { :title   => 'Title',
  #   :content => 'extracted clear text',
  #   :links   => [array of urls] }
  def self.process(content)
    result = { }
    tag_soup = BeautifulSoup.new(content)
    result[:title] = extract_title(tag_soup)
    result[:links] = extract_links(tag_soup)
    result[:content] = extract_content(tag_soup)
    return result
  end

  # Extracts textual content from the HTML tree.
  #
  # - First, the root element to use is determined using the
  #   +content_element+ method, which itself uses the content_tag_selector
  #   from RDig.configuration.
  # - Then, this element is processed by +extract_text+, which will give
  #   all textual content contained in the root element and all it's
  #   children.
  def self.extract_content(tag_soup)
    content = ''
    content_element(tag_soup).children { |child|
      extract_text(child, content)
    }
    return content.strip
  end

  # extracts the href attributes of all a tags, except
  # internal links like <a href="#top">
  def self.extract_links(tagsoup)
    tagsoup.find_all('a').map { |link|
      CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
    }.compact
  end

  # Extracts the title from the given html tree.
  # The configured title_tag_selector may yield a tag to extract text
  # from, or directly a string (e.g. a meta tag attribute value).
  def self.extract_title(tagsoup)
    the_title_tag = title_tag(tagsoup)
    if the_title_tag.is_a? String
      the_title_tag
    else
      extract_text(the_title_tag).strip if the_title_tag
    end
  end

  # Recursively extracts all text contained in the given element,
  # and appends it to content.
  def self.extract_text(element, content='')
    if element.is_a? NavigableString
      value = strip_comments(element)
      value.strip!
      unless value.empty?
        content << value
        content << ' '
      end
    elsif element.string # it's a Tag, and it has some content string
      value = element.string.strip
      unless value.empty?
        content << value
        content << ' '
      end
    else
      element.children { |child|
        extract_text(child, content)
      }
    end
  end

  # Returns the element to extract the title from.
  #
  # This may return a string, e.g. an attribute value selected from a meta
  # tag, too.
  def self.title_tag(tagsoup)
    if RDig.config.content_extraction.html.title_tag_selector
      RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
    else
      tagsoup.html.head.title
    end
  end

  # Retrieve the root element to extract document content from
  def self.content_element(tagsoup)
    if RDig.config.content_extraction.html.content_tag_selector
      RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
    else
      tagsoup.html.body
    end
  end

  # Return the given string minus all html comments.
  # (Uses a regexp literal: the old 3-argument Regexp.new with a kcode
  # string is deprecated/removed in modern Ruby.)
  def self.strip_comments(string)
    string.gsub(COMMENT_RE, '')
  end
end
143
+
144
+ end
145
+ end
@@ -0,0 +1,176 @@
1
+ module RDig
2
+
3
+
4
# Multi-threaded crawler: fetches the configured start urls, follows
# links that survive the filter chain, and feeds successfully fetched
# documents to the indexer.
class Crawler

  def initialize
    # queue of Document instances waiting to be processed; the :exit
    # marker tells worker threads to terminate
    @documents = Queue.new
    @etag_filter = ETagFilter.new
  end

  # Builds the index: seeds the queue with the configured start urls,
  # spawns the fetcher threads and waits until the queue stays empty.
  def run
    @indexer = Index::Indexer.new(RDig.config.ferret)
    filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
    RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }

    num_threads = RDig.config.crawler.num_threads
    group = ThreadsWait.new
    num_threads.times { |i|
      group.join_nowait Thread.new("fetcher #{i}") {
        # each thread uses its own filter chain instance
        filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
        while (doc = @documents.pop) != :exit
          process_document doc, filterchain
        end
      }
    }

    # dilemma: suppose we have 1 start url and two threads t1 and t2:
    # t1 pops the start url from the queue which now is empty
    # as the queue is empty now, t2 blocks until t1 adds the links
    # retrieved from his document.
    #
    # But we need the 'queue empty' condition as a sign for us to stop
    # waiting for new entries, too.

    # check every now and then for an empty queue
    sleep_interval = RDig.config.crawler.wait_before_leave
    begin
      sleep sleep_interval
    end until @documents.empty?
    # nothing to do any more, tell the threads to exit
    num_threads.times { @documents << :exit }

    puts "waiting for threads to finish..."
    group.all_waits
  ensure
    @indexer.close if @indexer
  end

  # Fetches +doc+ and, depending on the resulting status, queues newly
  # discovered links and hands the document to the indexer.
  # Errors are logged and swallowed so a broken document does not kill
  # the fetcher thread.
  #
  # NOTE(review): links used to be enqueued a second time, before the
  # etag/status checks, which duplicated every add_url call and crashed
  # (rescued) on failed fetches where doc.content is nil. That premature
  # pass has been removed; the :success/:redirect branches below are the
  # single place links get queued.
  def process_document(doc, filterchain)
    doc.fetch
    # drop documents whose ETag we have already seen (same content
    # reachable via another url)
    return unless @etag_filter.apply(doc)
    case doc.status
    when :success
      if doc.content
        if doc.content[:links]
          doc.content[:links].each { |url| add_url(url, filterchain, doc) }
        end
        @indexer << doc
      #else
        #puts "success but no content: #{doc.uri.to_s}"
      end
    when :redirect
      # links contains the url we were redirected to
      doc.content[:links].each { |url| add_url(url, filterchain, doc) }
    end
  rescue
    puts "error processing document #{doc.uri.to_s}: #{$!}"
  end


  # pipes a new document pointing to url through the filter chain,
  # if it survives that, it gets added to the documents queue for further
  # processing
  def add_url(url, filterchain, referring_document = nil)
    return if url.nil? || url.empty?
    if referring_document
      doc = Document.new(url, referring_document.uri)
      # keep redirect count
      if referring_document.status == :redirect
        doc.redirections = referring_document.redirections + 1
      end
    else
      doc = Document.new(url)
    end

    doc = filterchain.apply(doc)

    if doc
      puts "added url #{url}"
    #else
      #puts "skipping url #{url}"
    end
    @documents << doc if doc
  end

end
99
+
100
+
101
# A single document in the crawl: wraps the url, the fetched raw body,
# the extracted content and the http status of the fetch.
class Document
  include HttpClient

  attr_reader :content
  attr_reader :content_type
  attr_reader :uri
  attr_reader :referring_uri
  attr_reader :status
  attr_reader :etag
  attr_accessor :redirections

  # url:      url of this document, may be relative to the referring
  #           doc or host.
  # referrer: uri of the document we retrieved this link from
  #
  # Raises when +url+ cannot be parsed as a URI.
  def initialize(url, referrer = nil)
    @redirections = 0
    @referring_uri = referrer
    begin
      @uri = URI.parse(url)
    rescue URI::InvalidURIError
      raise "Cannot create document using invalid URL: #{url}"
    end
  end

  # true once a fetch produced extracted content
  def has_content?
    !self.content.nil?
  end

  def title; @content[:title] end
  def body; @content[:content] end
  def url; @uri.to_s end

  # Retrieves the document via http and runs the body through content
  # extraction. Sets @status to :success or :redirect; on a redirect
  # the target url is stored as the single entry of @content[:links].
  def fetch
    puts "fetching #{@uri.to_s}"
    response = do_get(@uri)
    if response.is_a?(Net::HTTPSuccess)
      @content_type = response['content-type']
      @raw_body = response.body
      @etag = response['etag']
      # todo externalize this (another chain ?)
      @content = ContentExtractors.process(@raw_body, @content_type)
      @status = :success
    elsif response.is_a?(Net::HTTPRedirection)
      @status = :redirect
      @content = { :links => [ response['location'] ] }
    else
      puts "don't know what to do with response: #{response}"
    end
  end

end
153
+
154
# Checks fetched documents' E-Tag headers against the list of E-Tags
# of the documents already indexed.
# This is supposed to help against double-indexing documents which can
# be reached via different URLs (think http://host.com/ and
# http://host.com/index.html )
# Documents without ETag are allowed to pass through
class ETagFilter
  include MonitorMixin

  def initialize
    @etags = Set.new
    super
  end

  # Returns +document+ when its ETag has not been seen before (or when
  # it carries none); returns nil for an already-known ETag.
  # Thread-safe via MonitorMixin.
  def apply(document)
    etag = document.etag
    return document unless etag
    first_time = synchronize { @etags.add?(etag) }
    first_time ? document : nil
  end
end
175
+
176
+ end