rdig 0.1.0

data/lib/rdig.rb ADDED
#!/usr/bin/env ruby

#--
# Copyright (c) 2006 Jens Kraemer
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#++
#
RDIGVERSION = '0.1.0'

require 'thread'
require 'thwait'
require 'singleton'
require 'monitor'
require 'ostruct'
require 'uri'
require 'cgi'
require 'set'
require 'net/http'
require 'getoptlong'

begin
  require 'rubyful_soup'
  require 'ferret'
rescue LoadError
  require 'rubygems'
  require 'rubyful_soup'
  require 'ferret'
end

require 'htmlentities/htmlentities'

require 'rdig/http_client'
require 'rdig/content_extractors'
require 'rdig/url_filters'
require 'rdig/search'
require 'rdig/index'
require 'rdig/crawler'

$KCODE = 'u'
require 'jcode'
# See README for basic usage information
module RDig

  class << self

    # the filter chain each URL has to run through before being crawled.
    def filter_chain
      @filter_chain ||= [
        { :maximum_redirect_filter => :max_redirects },
        :fix_relative_uri,
        :normalize_uri,
        { :hostname_filter => :include_hosts },
        { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
        { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
        RDig::UrlFilters::VisitedUrlFilter
      ]
    end

    def application
      @application ||= Application.new
    end

    def searcher
      @searcher ||= Search::Searcher.new(config.ferret)
    end

    # RDig configuration
    #
    # may be used with a block:
    #   RDig.configuration do |config| ...
    #
    # see doc/examples/config.rb for a commented example configuration
    def configuration
      if block_given?
        yield configuration
      else
        @config ||= OpenStruct.new(
          :crawler => OpenStruct.new(
            :start_urls => [ "http://localhost:3000/" ],
            :include_hosts => [ "localhost" ],
            :include_documents => nil,
            :exclude_documents => nil,
            :index_document => nil,
            :num_threads => 2,
            :max_redirects => 5,
            :wait_before_leave => 10
          ),
          :content_extraction => OpenStruct.new(
            # settings for html content extraction
            :html => OpenStruct.new(
              # select the html element that contains the content to index
              # by default, we index all inside the body tag:
              :content_tag_selector => lambda { |tagsoup|
                tagsoup.html.body
              },
              # select the html element containing the title
              :title_tag_selector => lambda { |tagsoup|
                tagsoup.html.head.title
              }
            )
          ),
          :ferret => OpenStruct.new(
            :path => "index/",
            :create => true,
            :handle_parse_errors => true,
            :analyzer => Ferret::Analysis::StandardAnalyzer.new,
            :occur_default => Ferret::Search::BooleanClause::Occur::MUST
          )
        )
      end
    end
    alias config configuration

  end
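  # Example: a configuration block as described above, overriding the
  # defaults before a crawl. A minimal sketch; host and path values are
  # placeholders.
  #
  #   RDig.configuration do |config|
  #     config.crawler.start_urls = [ 'http://www.example.com/' ]
  #     config.crawler.include_hosts = [ 'www.example.com' ]
  #     config.ferret.path = 'example_index/'
  #   end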

  class Application

    OPTIONS = [
      ['--config', '-c', GetoptLong::REQUIRED_ARGUMENT,
       "Read application configuration from CONFIG."],
      ['--help', '-h', GetoptLong::NO_ARGUMENT,
       "Display this help message."],
      ['--query', '-q', GetoptLong::REQUIRED_ARGUMENT,
       "Execute QUERY."],
      ['--version', '-v', GetoptLong::NO_ARGUMENT,
       "Display the program version."],
    ]

    # Application options from the command line
    def options
      @options ||= OpenStruct.new
    end

    # Display the program usage line.
    def usage
      puts "rdig -c configfile {options}"
    end

    # Display the rdig command line help.
    def help
      usage
      puts
      puts "Options are ..."
      puts
      OPTIONS.sort.each do |long, short, mode, desc|
        if mode == GetoptLong::REQUIRED_ARGUMENT
          if desc =~ /\b([A-Z]{2,})\b/
            long = long + "=#{$1}"
          end
        end
        printf " %-20s (%s)\n", long, short
        printf " %s\n", desc
      end
    end

    # Return a list of the command line options supported by the
    # program.
    def command_line_options
      OPTIONS.collect { |lst| lst[0..-2] }
    end

    # Handle the option given by +opt+ and +value+.
    def do_option(opt, value)
      case opt
      when '--help'
        help
        exit
      when '--config'
        options.config_file = value
      when '--query'
        options.query = value
      when '--version'
        puts "rdig, version #{RDIGVERSION}"
        exit
      else
        fail "Unknown option: #{opt}"
      end
    end

    # Read and handle the command line options.
    def handle_options
      opts = GetoptLong.new(*command_line_options)
      opts.each { |opt, value| do_option(opt, value) }
    end

    # Load the configuration file given on the command line.
    def load_configfile
      load File.expand_path(options.config_file)
    end

    # Run the +rdig+ application.
    def run
      handle_options
      begin
        load_configfile
      rescue
        puts $!.backtrace
        fail "No config file found!\n#{$!}"
      end

      if options.query
        # query the index
        puts "executing query >#{options.query}<"
        results = RDig.searcher.search(options.query)
        puts "total results: #{results[:hitcount]}"
        results[:list].each { |result|
          puts <<-EOF
  #{result[:url]}
  #{result[:title]}
  #{result[:extract]}

          EOF
        }
      else
        # rebuild the index
        @crawler = Crawler.new
        @crawler.run
      end
    end
  end
end
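Given the options table above, typical invocations look like this (the configuration file name is a placeholder; see doc/examples/config.rb for its contents):

  rdig -c config.rb                  # crawl and (re)build the index
  rdig -c config.rb -q 'some query'  # search the index built with that config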
data/lib/rdig/content_extractors.rb ADDED
# Override some methods concerned with entity resolving
# to convert entities to strings.
class BeautifulStoneSoup
  # resolve unknown html entities using the htmlentities lib
  alias :orig_unknown_entityref :unknown_entityref
  def unknown_entityref(ref)
    if HTMLEntities::MAP.has_key?(ref)
      handle_data [HTMLEntities::MAP[ref]].pack('U')
    else
      orig_unknown_entityref ref
    end
  end

  # resolve numeric entities to utf8. +ref+ arrives without the leading
  # '&#' and trailing ';', i.e. as a decimal number or as 'x' followed
  # by hex digits.
  def handle_charref(ref)
    handle_data(
      if ref =~ /\Ax([0-9a-f]{1,6})\z/i
        [$1.to_i(16)].pack('U')  # hexadecimal character reference
      else
        [ref.to_i].pack('U')     # decimal character reference
      end
    )
  end
end
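# The Array#pack('U') idiom used above turns a Unicode codepoint into
# its UTF-8 string, which is what makes the entity resolution work.
# For example:
#
#   [233].pack('U')   # => "é"  (decimal reference &#233;)
#   [0xe9].pack('U')  # => "é"  (hex reference &#xe9;)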

module RDig

  # Contains classes which are used for extracting content and meta data
  # from various content types.
  #
  # TODO: support at least pdf, too.
  module ContentExtractors

    # process the given +content+ depending on its +content_type+.
    def ContentExtractors.process(content, content_type)
      case content_type
      when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
        return HtmlContentExtractor.process(content)
      else
        puts "unable to handle content type #{content_type}"
      end
      return nil
    end
    # extracts title, content and links from html documents
    class HtmlContentExtractor

      # returns:
      # { :content => 'extracted clear text',
      #   :title => 'Title',
      #   :links => [array of urls] }
      def self.process(content)
        result = { }
        tag_soup = BeautifulSoup.new(content)
        result[:title] = extract_title(tag_soup)
        result[:links] = extract_links(tag_soup)
        result[:content] = extract_content(tag_soup)
        return result
      end
      # Extracts textual content from the HTML tree.
      #
      # - First, the root element to use is determined using the
      #   +content_element+ method, which itself uses the content_tag_selector
      #   from RDig.configuration.
      # - Then, this element is processed by +extract_text+, which will give
      #   all textual content contained in the root element and all its
      #   children.
      def self.extract_content(tag_soup)
        content = ''
        content_element(tag_soup).children { |child|
          extract_text(child, content)
        }
        return content.strip
      end
      # extracts the href attributes of all a tags, except
      # internal links like <a href="#top">
      def self.extract_links(tagsoup)
        tagsoup.find_all('a').map { |link|
          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
        }.compact
      end
      # Extracts the title from the given html tree
      def self.extract_title(tagsoup)
        the_title_tag = title_tag(tagsoup)
        if the_title_tag.is_a? String
          the_title_tag
        else
          extract_text(the_title_tag).strip if the_title_tag
        end
      end
      # Recursively extracts all text contained in the given element,
      # and appends it to content.
      def self.extract_text(element, content='')
        if element.is_a? NavigableString
          value = strip_comments(element)
          value.strip!
          unless value.empty?
            content << value
            content << ' '
          end
        elsif element.string # it's a Tag, and it has some content string
          value = element.string.strip
          unless value.empty?
            content << value
            content << ' '
          end
        else
          element.children { |child|
            extract_text(child, content)
          }
        end
      end
      # Returns the element to extract the title from.
      #
      # This may also return a string, e.g. an attribute value selected
      # from a meta tag.
      def self.title_tag(tagsoup)
        if RDig.config.content_extraction.html.title_tag_selector
          RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
        else
          tagsoup.html.head.title
        end
      end

      # Retrieve the root element to extract document content from
      def self.content_element(tagsoup)
        if RDig.config.content_extraction.html.content_tag_selector
          RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
        else
          tagsoup.html.body
        end
      end

      # Return the given string minus all html comments
      def self.strip_comments(string)
        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
      end
    end

  end
end
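Fed a small document, the extractor returns the hash documented on HtmlContentExtractor.process. A sketch, assuming RDig is loaded so the default tag selectors from RDig.configuration apply:

  html = '<html><head><title>Hello</title></head>' +
         '<body><p>Hello <a href="/world.html">world</a>!</p></body></html>'

  result = RDig::ContentExtractors.process(html, 'text/html')
  result[:title]    # => "Hello"
  result[:links]    # => ["/world.html"]
  result[:content]  # => "Hello world !" (roughly; text nodes joined with spaces)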
data/lib/rdig/crawler.rb ADDED
module RDig

  class Crawler

    def initialize
      @documents = Queue.new
      @etag_filter = ETagFilter.new
    end

    def run
      @indexer = Index::Indexer.new(RDig.config.ferret)
      filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
      RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }

      num_threads = RDig.config.crawler.num_threads
      group = ThreadsWait.new
      num_threads.times { |i|
        group.join_nowait Thread.new("fetcher #{i}") {
          filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
          while (doc = @documents.pop) != :exit
            process_document doc, filterchain
          end
        }
      }
      # Dilemma: suppose we have one start url and two threads t1 and t2:
      # t1 pops the start url from the queue, which now is empty.
      # As the queue is empty now, t2 blocks until t1 adds the links
      # retrieved from its document.
      #
      # But we need the 'queue empty' condition as a sign for us to stop
      # waiting for new entries, too.

      # check every now and then for an empty queue
      sleep_interval = RDig.config.crawler.wait_before_leave
      begin
        sleep sleep_interval
      end until @documents.empty?
      # nothing to do any more, tell the threads to exit
      num_threads.times { @documents << :exit }

      puts "waiting for threads to finish..."
      group.all_waits
    ensure
      @indexer.close if @indexer
    end
    def process_document(doc, filterchain)
      doc.fetch
      return unless @etag_filter.apply(doc)
      case doc.status
      when :success
        if doc.content
          # add links from this document to the queue
          if doc.content[:links]
            doc.content[:links].each { |url| add_url(url, filterchain, doc) }
          end
          @indexer << doc
        #else
          #puts "success but no content: #{doc.uri.to_s}"
        end
      when :redirect
        # links contains the url we were redirected to
        doc.content[:links].each { |url| add_url(url, filterchain, doc) }
      end
    rescue
      puts "error processing document #{doc.uri.to_s}: #{$!}"
    end

    # Pipes a new document pointing to url through the filter chain;
    # if it survives that, it gets added to the documents queue for
    # further processing.
    def add_url(url, filterchain, referring_document = nil)
      return if url.nil? || url.empty?
      if referring_document
        doc = Document.new(url, referring_document.uri)
        # keep redirect count
        if referring_document.status == :redirect
          doc.redirections = referring_document.redirections + 1
        end
      else
        doc = Document.new(url)
      end

      doc = filterchain.apply(doc)

      if doc
        puts "added url #{url}"
      #else
        #puts "skipping url #{url}"
      end
      @documents << doc if doc
    end

  end
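  # Example: Application#run drives a full crawl with just two lines; the
  # same works from any script once a configuration has been loaded
  # (a sketch, the start url is a placeholder):
  #
  #   RDig.configuration { |c| c.crawler.start_urls = [ 'http://example.com/' ] }
  #   RDig::Crawler.new.run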

  class Document
    include HttpClient

    attr_reader :content
    attr_reader :content_type
    attr_reader :uri
    attr_reader :referring_uri
    attr_reader :status
    attr_reader :etag
    attr_accessor :redirections

    # url: url of this document, may be relative to the referring doc or host.
    # referrer: uri of the document we retrieved this link from
    def initialize(url, referrer = nil)
      @redirections = 0
      begin
        @uri = URI.parse(url)
      rescue URI::InvalidURIError
        raise "Cannot create document using invalid URL: #{url}"
      end
      @referring_uri = referrer
    end

    def has_content?
      !self.content.nil?
    end

    def title; @content[:title] end
    def body; @content[:content] end
    def url; @uri.to_s end
    def fetch
      puts "fetching #{@uri.to_s}"
      response = do_get(@uri)
      case response
      when Net::HTTPSuccess
        @content_type = response['content-type']
        @raw_body = response.body
        @etag = response['etag']
        # TODO: externalize this (another chain?)
        @content = ContentExtractors.process(@raw_body, @content_type)
        @status = :success
      when Net::HTTPRedirection
        @status = :redirect
        @content = { :links => [ response['location'] ] }
      else
        puts "don't know what to do with response: #{response}"
      end
    end

  end
  # Checks fetched documents' ETag headers against the list of ETags
  # of the documents already indexed.
  # This is supposed to help against double-indexing documents which can
  # be reached via different URLs (think http://host.com/ and
  # http://host.com/index.html).
  # Documents without an ETag are allowed to pass through.
  class ETagFilter
    include MonitorMixin

    def initialize
      @etags = Set.new
      super
    end

    def apply(document)
      return document unless document.etag
      synchronize do
        @etags.add?(document.etag) ? document : nil
      end
    end
  end

end
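The deduplication in ETagFilter#apply hinges on Set#add?, which returns nil when the element is already in the set. A sketch with hypothetical OpenStruct stand-ins for documents, assuming RDig is loaded (only the etag reader matters here):

  filter = RDig::ETagFilter.new
  doc_a = OpenStruct.new(:etag => '"abc123"')
  doc_b = OpenStruct.new(:etag => '"abc123"')  # same ETag, reached via another URL

  filter.apply(doc_a)  # => doc_a (first time this ETag is seen)
  filter.apply(doc_b)  # => nil   (duplicate; the crawler skips it)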