rdig 0.3.2 → 0.3.3

@@ -27,7 +27,40 @@ RDig.configuration do |cfg|
  # cfg.verbose = false

  # content extraction options
+ cfg.content_extraction = OpenStruct.new(

+   # HPRICOT configuration
+   # this is the html parser used by default from RDig 0.3.3 upwards.
+   # Hpricot by far outperforms Rubyful Soup, and is at least as flexible when
+   # it comes to selection of portions of the html documents.
+   :hpricot => OpenStruct.new(
+     # css selector for the element containing the page title
+     :title_tag_selector => 'title',
+     # might also be a proc returning either an element or a string:
+     # :title_tag_selector => lambda { |hpricot_doc| ... }
+     :content_tag_selector => 'body'
+     # might also be a proc returning either an element or a string:
+     # :content_tag_selector => lambda { |hpricot_doc| ... }
+   )
+
+   # RUBYFUL SOUP
+   # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
+   # RDig's default html parser up to version 0.3.2. To use it, comment the
+   # hpricot config above, and uncomment the following:
+   #
+   # :rubyful_soup => OpenStruct.new(
+   #   # select the html element that contains the content to index
+   #   # by default, we index all inside the body tag:
+   #   :content_tag_selector => lambda { |tagsoup|
+   #     tagsoup.html.body
+   #   },
+   #   # select the html element containing the title
+   #   :title_tag_selector => lambda { |tagsoup|
+   #     tagsoup.html.head.title
+   #   }
+   # )
+ )
+
  # provide a method that returns the title of an html document
  # this method may either return a tag to extract the title from,
  # or a ready-to-index string.
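
Both selectors accept either a plain CSS selector string or a proc that receives the parsed Hpricot document, so a site-specific config can mix the two. A minimal sketch of a 0.3.3-style configuration along the lines of the options above — the URL, index path and the div[@id='main'] selector are made-up examples, not part of the gem:

    RDig.configuration do |cfg|
      cfg.crawler.start_urls = [ 'http://www.example.com/' ]   # hypothetical site
      cfg.index.path         = '/tmp/example-index'            # hypothetical index location

      cfg.content_extraction = OpenStruct.new(
        :hpricot => OpenStruct.new(
          # a plain CSS selector string ...
          :title_tag_selector   => 'title',
          # ... or a proc that receives the Hpricot document and returns an element:
          :content_tag_selector => lambda { |doc| doc.at("div[@id='main']") }
        )
      )
    end
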
@@ -24,7 +24,7 @@
  #++
  #

- RDIGVERSION = '0.3.2'
+ RDIGVERSION = '0.3.3'


  require 'thread'
@@ -42,22 +42,12 @@ require 'open-uri'

  begin
    require 'ferret'
-   require 'rubyful_soup'
  rescue LoadError
    require 'rubygems'
    require 'ferret'
-   require 'rubyful_soup'
  end

  require 'htmlentities/htmlentities'
-
- require 'rdig/content_extractors'
- require 'rdig/url_filters'
- require 'rdig/search'
- require 'rdig/index'
- require 'rdig/file'
- require 'rdig/documents'
- require 'rdig/crawler'


  $KCODE = 'u'
@@ -124,25 +114,37 @@ module RDig
        :wait_before_leave => 10
      ),
      :content_extraction => OpenStruct.new(
-       # settings for html content extraction
-       :html => OpenStruct.new(
-         # select the html element that contains the content to index
-         # by default, we index all inside the body tag:
-         :content_tag_selector => lambda { |tagsoup|
-           tagsoup.html.body
-         },
-         # select the html element containing the title
-         :title_tag_selector => lambda { |tagsoup|
-           tagsoup.html.head.title
-         }
+       # settings for html content extraction (hpricot)
+       :hpricot => OpenStruct.new(
+         # css selector for the element containing the page title
+         :title_tag_selector => 'title',
+         # might also be a proc returning either an element or a string:
+         # :title_tag_selector => lambda { |hpricot_doc| ... }
+         :content_tag_selector => 'body'
+         # might also be a proc returning either an element or a string:
+         # :content_tag_selector => lambda { |hpricot_doc| ... }
        )
+       #,
+       # # settings for html content extraction (RubyfulSoup)
+       # :rubyful_soup => OpenStruct.new(
+       #   # select the html element that contains the content to index
+       #   # by default, we index all inside the body tag:
+       #   :content_tag_selector => lambda { |tagsoup|
+       #     tagsoup.html.body
+       #   },
+       #   # select the html element containing the title
+       #   :title_tag_selector => lambda { |tagsoup|
+       #     tagsoup.html.head.title
+       #   }
+       # )
      ),
      :index => OpenStruct.new(
        :path => "index/",
        :create => true,
        :handle_parse_errors => true,
        :analyzer => Ferret::Analysis::StandardAnalyzer.new,
-       :occur_default => :must
+       :occur_default => :must,
+       :default_field => '*'
      )
    )
  end
@@ -261,3 +263,13 @@ module RDig
      end
    end
  end
+
+ require 'rdig/content_extractors'
+ require 'rdig/url_filters'
+ require 'rdig/search'
+ require 'rdig/index'
+ require 'rdig/file'
+ require 'rdig/documents'
+ require 'rdig/crawler'
+
+
@@ -1,26 +1,3 @@
- # override some methods concered with entity resolving
- # to convert them to strings
- class BeautifulStoneSoup
-   # resolve unknown html entities using the htmlentities lib
-   alias :orig_unknown_entityref :unknown_entityref
-   def unknown_entityref(ref)
-     if HTMLEntities::MAP.has_key?(ref)
-       handle_data [HTMLEntities::MAP[ref]].pack('U')
-     else
-       orig_unknown_entityref ref
-     end
-   end
-
-   # resolve numeric entities to utf8
-   def handle_charref(ref)
-     handle_data( ref.gsub(/([0-9]{1,7})/) {
-       [$1.to_i].pack('U')
-     }.gsub(/x([0-9a-f]{1,6})/i) {
-       [$1.to_i(16)].pack('U')
-     } )
-   end
- end
-
  module RDig

    # Contains classes which are used for extracting content and meta data from
@@ -30,15 +7,6 @@ module RDig
      # process the given +content+ depending on it's +content_type+.
      def self.process(content, content_type)
        ContentExtractor.process(content, content_type)
-       # case content_type
-       #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-       #  return HtmlContentExtractor.process(content)
-       #when /^application\/.+pdf/
-       #  return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
-       #else
-       #  puts "unable to handle content type #{content_type}"
-       #end
-       #return nil
      end

      # Base class for Content Extractors.
@@ -48,7 +16,7 @@ module RDig

        def self.inherited(extractor)
          super(extractor)
-         puts("discovered content extractor class: #{extractor}")
+         puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
          self.extractors << extractor
        end

@@ -72,7 +40,7 @@ module RDig
        end

        def can_do(content_type)
-         content_type =~ @pattern
+         @pattern && content_type =~ @pattern
        end
      end

@@ -104,197 +72,14 @@ module RDig
        end
      end

-     # Extract text from pdf content.
-     #
-     # Requires the pdftotext and pdfinfo utilities from the
-     # xpdf-utils package
-     # (on debian and friends do 'apt-get install xpdf-utils')
-     #
-     class PdfContentExtractor < ContentExtractor
-       include ExternalAppHelper
-
-       def initialize(config)
-         super(config)
-         @pattern = /^application\/pdf/
-         @pdftotext = 'pdftotext'
-         @pdfinfo = 'pdfinfo'
-         @available = true
-         [ @pdftotext, @pdfinfo].each { |program|
-           unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
-             @available = false
-             break
-           end
-         }
-       end
-
-       def process(content)
-         result = {}
-         as_file(content) do |file|
-           result[:content] = get_content(file.path).strip
-           result[:title] = get_title(file.path)
-         end
-         result
-       end
-
-       def get_content(path_to_tempfile)
-         %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
-       end
-
-       # extracts the title from pdf meta data
-       # needs pdfinfo
-       # returns the title or nil if no title was found
-       def get_title(path_to_tempfile)
-         %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
-       rescue
-       end
-     end
-
-     # Extract text from word documents
-     #
-     # Requires the wvHtml utility
-     # (on debian and friends do 'apt-get install wv')
-     class WordContentExtractor < ContentExtractor
-       include ExternalAppHelper
-
-       def initialize(config)
-         super(config)
-         @wvhtml = 'wvHtml'
-         @pattern = /^application\/msword/
-         # html extractor for parsing wvHtml output
-         @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
-           :html => OpenStruct.new(
-             :content_tag_selector => lambda { |tagsoup|
-               tagsoup.html.body
-             },
-             :title_tag_selector => lambda { |tagsoup|
-               tagsoup.html.head.title
-             }
-         )))
-
-         # TODO: besser: if $?.exitstatus == 127 (not found)
-         @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
-       end
-
-       def process(content)
-         result = {}
-         as_file(content) do |file|
-           result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
-         end
-         return result || {}
-       end
-
-     end
-
-     # extracts title, content and links from html documents
-     class HtmlContentExtractor < ContentExtractor
-
-       def initialize(config)
-         super(config)
-         @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-       end
-
-       # returns:
-       # { :content => 'extracted clear text',
-       #   :meta => { :title => 'Title' },
-       #   :links => [array of urls] }
-       def process(content)
-         result = { }
-         tag_soup = BeautifulSoup.new(content)
-         result[:title] = extract_title(tag_soup)
-         result[:links] = extract_links(tag_soup)
-         result[:content] = extract_content(tag_soup)
-         return result
-       end
-
-       # Extracts textual content from the HTML tree.
-       #
-       # - First, the root element to use is determined using the
-       #   +content_element+ method, which itself uses the content_tag_selector
-       #   from RDig.configuration.
-       # - Then, this element is processed by +extract_text+, which will give
-       #   all textual content contained in the root element and all it's
-       #   children.
-       def extract_content(tag_soup)
-         content = ''
-         ce = content_element(tag_soup)
-         ce.children { |child|
-           extract_text(child, content)
-         } unless ce.nil?
-         return content.strip
-       end
-
-       # extracts the href attributes of all a tags, except
-       # internal links like <a href="#top">
-       def extract_links(tagsoup)
-         tagsoup.find_all('a').map { |link|
-           CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
-         }.compact
-       end
-
-       # Extracts the title from the given html tree
-       def extract_title(tagsoup)
-         the_title_tag = title_tag(tagsoup)
-         if the_title_tag.is_a? String
-           the_title_tag
-         else
-           title = ''
-           extract_text(the_title_tag, title)
-           title.strip
-         end
-       end
-
-       # Recursively extracts all text contained in the given element,
-       # and appends it to content.
-       def extract_text(element, content='')
-         return nil if element.nil?
-         if element.is_a? NavigableString
-           value = strip_comments(element)
-           value.strip!
-           unless value.empty?
-             content << value
-             content << ' '
-           end
-         elsif element.string # it's a Tag, and it has some content string
-           # skip inline scripts and styles
-           return nil if element.name =~ /^(script|style)$/i
-           value = element.string.strip
-           unless value.empty?
-             content << value
-             content << ' '
-           end
-         else
-           element.children { |child|
-             extract_text(child, content)
-           }
-         end
-       end
-
-       # Returns the element to extract the title from.
-       #
-       # This may return a string, e.g. an attribute value selected from a meta
-       # tag, too.
-       def title_tag(tagsoup)
-         if @config.html.title_tag_selector
-           @config.html.title_tag_selector.call(tagsoup)
-         else
-           tagsoup.html.head.title
-         end
-       end
-
-       # Retrieve the root element to extract document content from
-       def content_element(tagsoup)
-         if @config.html.content_tag_selector
-           @config.html.content_tag_selector.call(tagsoup)
-         else
-           tagsoup.html.body
-         end
-       end
-
-       # Return the given string minus all html comments
-       def strip_comments(string)
-         string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
-       end
-     end
+   end
+ end

+ # load content extractors
+ Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
+   begin
+     require f
+   rescue
+     puts "error loading #{f}: #{$!}"
    end
  end
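
Because ContentExtractor.inherited registers every subclass and the Dir glob above pulls in everything under content_extractors/, supporting another format only takes a new file in that directory. A hypothetical plain-text extractor, modelled on the extractors that follow and assuming the dispatcher hands each discovered class its section of RDig's content_extraction config:

    # hypothetical file: lib/rdig/content_extractors/plain_text.rb
    module RDig
      module ContentExtractors

        # discovered via ContentExtractor.inherited as soon as the file is required
        class PlainTextContentExtractor < ContentExtractor
          def initialize(config)
            super(config)
            # can_do matches document content types against @pattern
            @pattern = /^text\/plain/
          end

          def process(content)
            { :content => content.strip, :title => nil, :links => [] }
          end
        end

      end
    end
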
@@ -0,0 +1,41 @@
+ module RDig
+   module ContentExtractors
+
+     # Extract text from word documents
+     #
+     # Requires the wvHtml utility
+     # (on debian and friends do 'apt-get install wv')
+     class WordContentExtractor < ContentExtractor
+       include ExternalAppHelper
+
+       def initialize(config)
+         super(config)
+         @wvhtml = 'wvHtml'
+         @pattern = /^application\/msword/
+         # html extractor for parsing wvHtml output
+         @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
+           :rubyful_soup => OpenStruct.new(
+             :content_tag_selector => lambda { |tagsoup|
+               tagsoup.html.body
+             },
+             :title_tag_selector => lambda { |tagsoup|
+               tagsoup.html.head.title
+             }
+         )))
+
+         # TODO: better: if $?.exitstatus == 127 (not found)
+         @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
+       end
+
+       def process(content)
+         result = {}
+         as_file(content) do |file|
+           result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
+         end
+         return result || {}
+       end
+
+     end
+
+   end
+ end
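
Callers normally go through the generic dispatcher instead of instantiating extractors directly; for a Word document that looks roughly like the sketch below (the file path is made up, and wvHtml has to be installed, otherwise the extractor marks itself unavailable):

    require 'rdig'

    data   = File.read('fixtures/sample.doc')   # hypothetical .doc file
    result = RDig::ContentExtractors.process(data, 'application/msword')
    # expected shape: { :title => ..., :content => ..., :links => [...] };
    # may come back nil/empty when no extractor felt responsible
    puts result[:title] if result
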
@@ -0,0 +1,99 @@
+ begin
+   require 'hpricot'
+ rescue LoadError
+   require 'rubygems'
+   require 'hpricot'
+ end
+
+ module RDig
+   module ContentExtractors
+
+     # extracts title, content and links from html documents using the hpricot library
+     class HpricotContentExtractor < ContentExtractor
+
+       def initialize(config)
+         super(config.hpricot)
+         # if not configured, refuse to handle any content:
+         @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.hpricot
+       end
+
+       # returns:
+       # { :content => 'extracted clear text',
+       #   :title => 'Title',
+       #   :links => [array of urls] }
+       def process(content)
+         doc = Hpricot(content)
+         {
+           :title => extract_title(doc).decode_entities,
+           :links => extract_links(doc),
+           :content => extract_content(doc).decode_entities
+         }
+       end
+
+       # Extracts textual content from the HTML tree.
+       #
+       # - First, the root element to use is determined using the
+       #   +content_element+ method, which itself uses the content_tag_selector
+       #   from RDig.configuration.
+       # - Then, this element is processed by +extract_text+, which will give
+       #   all textual content contained in the root element and all it's
+       #   children.
+       def extract_content(doc)
+         content = ''
+         ce = content_element(doc)
+         content = strip_tags(strip_comments(ce.inner_html)) if ce
+         # (ce/'h1, h2, h3, h4, h5, h6, p, li, dt, dd, td, address, option, ').each do |child|
+         #   extract_text child, content
+         return content.strip
+       end
+
+       # extracts the href attributes of all a tags, except
+       # internal links like <a href="#top">
+       def extract_links(doc)
+         (doc/'a').map { |link|
+           href = link['href']
+           CGI.unescapeHTML(href) if href && href !~ /^#/
+         }.compact
+       end
+
+       # Extracts the title from the given html tree
+       def extract_title(doc)
+         the_title_tag = title_tag(doc)
+         return the_title_tag unless the_title_tag.respond_to? :inner_html
+         strip_tags(the_title_tag.inner_html)
+       end
+
+       # Returns the element to extract the title from.
+       #
+       # This may return a string, e.g. an attribute value selected from a meta
+       # tag, too.
+       def title_tag(doc)
+         tag_from_config(doc, :title_tag_selector) || doc.at('title')
+       end
+
+       # Retrieve the root element to extract document content from
+       def content_element(doc)
+         tag_from_config(doc, :content_tag_selector) || doc.at('body')
+       end
+
+       def tag_from_config(doc, config_key)
+         cfg = @config.send(config_key)
+         cfg.is_a?(String) ? doc/cfg : cfg.call(doc) if cfg
+       end
+
+       # Return the given string minus all html comments
+       def strip_comments(string)
+         string.gsub Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), ''
+       end
+       def strip_tags(string)
+         string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
+                                 Regexp::MULTILINE, 'u'), ''
+         string.gsub! Regexp.new('<.+?>',
+                                 Regexp::MULTILINE, 'u'), ''
+         string.gsub Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+       end
+
+     end
+
+   end
+ end
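
tag_from_config is what makes the string-vs-proc configuration work: a String is run through Hpricot's doc/selector search, anything else is treated as a callable. Driving the extractor standalone, the way the unit tests do, looks roughly like this — selectors and markup are examples only:

    require 'rdig'

    config = OpenStruct.new(:hpricot => OpenStruct.new(
      :title_tag_selector   => 'title',                                        # String => doc/'title'
      :content_tag_selector => lambda { |doc| doc.at("div[@id='content']") }   # proc   => called with the doc
    ))
    extractor = RDig::ContentExtractors::HpricotContentExtractor.new(config)

    html   = '<html><head><title>Hi</title></head><body><div id="content">Hello <b>world</b></div></body></html>'
    result = extractor.process(html)
    # expected: result[:title] == 'Hi', result[:content] == 'Hello world', result[:links] == []
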
@@ -0,0 +1,49 @@
+ module RDig
+   module ContentExtractors
+     # Extract text from pdf content.
+     #
+     # Requires the pdftotext and pdfinfo utilities from the
+     # xpdf-utils package
+     # (on debian and friends do 'apt-get install xpdf-utils')
+     #
+     class PdfContentExtractor < ContentExtractor
+       include ExternalAppHelper
+
+       def initialize(config)
+         super(config)
+         @pattern = /^application\/pdf/
+         @pdftotext = 'pdftotext'
+         @pdfinfo = 'pdfinfo'
+         @available = true
+         [ @pdftotext, @pdfinfo].each { |program|
+           unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
+             @available = false
+             break
+           end
+         }
+       end
+
+       def process(content)
+         result = {}
+         as_file(content) do |file|
+           result[:content] = get_content(file.path).strip
+           result[:title] = get_title(file.path)
+         end
+         result
+       end
+
+       def get_content(path_to_tempfile)
+         %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
+       end
+
+       # extracts the title from pdf meta data
+       # needs pdfinfo
+       # returns the title or nil if no title was found
+       def get_title(path_to_tempfile)
+         %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
+       rescue
+       end
+     end
+
+   end
+ end
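
Availability is probed by running both tools with -h and checking the banner; per document the extractor then does roughly the equivalent of the following shell calls (the path is a stand-in for the temporary file yielded by as_file):

    path  = '/tmp/rdig-doc.pdf'                     # illustrative temp file path
    text  = `pdftotext -enc UTF-8 '#{path}' -`      # full text on stdout
    info  = `pdfinfo -enc UTF-8 '#{path}'`
    title = info =~ /title:\s+(.*)$/i ? $1.strip : nil
    { :content => text.strip, :title => title }
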
@@ -0,0 +1,147 @@
+ begin
+   require 'rubyful_soup'
+ rescue LoadError
+   require 'rubygems'
+   require 'rubyful_soup'
+ end
+
+ # override some methods concerned with entity resolving
+ # to convert them to strings
+ class BeautifulStoneSoup
+   # resolve unknown html entities using the htmlentities lib
+   alias :orig_unknown_entityref :unknown_entityref
+   def unknown_entityref(ref)
+     if HTMLEntities::MAP.has_key?(ref)
+       handle_data [HTMLEntities::MAP[ref]].pack('U')
+     else
+       orig_unknown_entityref ref
+     end
+   end
+
+   # resolve numeric entities to utf8
+   def handle_charref(ref)
+     handle_data( ref.gsub(/([0-9]{1,7})/) {
+       [$1.to_i].pack('U')
+     }.gsub(/x([0-9a-f]{1,6})/i) {
+       [$1.to_i(16)].pack('U')
+     } )
+   end
+ end
+
+ module RDig
+   module ContentExtractors
+
+     # extracts title, content and links from html documents
+     class RubyfulSoupContentExtractor < ContentExtractor
+
+       def initialize(config)
+         super(config.rubyful_soup)
+         # if not configured, refuse to handle any content:
+         @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.rubyful_soup
+       end
+
+       # returns:
+       # { :content => 'extracted clear text',
+       #   :meta => { :title => 'Title' },
+       #   :links => [array of urls] }
+       def process(content)
+         result = { }
+         tag_soup = BeautifulSoup.new(content)
+         result[:title] = extract_title(tag_soup)
+         result[:links] = extract_links(tag_soup)
+         result[:content] = extract_content(tag_soup)
+         return result
+       end
+
+       # Extracts textual content from the HTML tree.
+       #
+       # - First, the root element to use is determined using the
+       #   +content_element+ method, which itself uses the content_tag_selector
+       #   from RDig.configuration.
+       # - Then, this element is processed by +extract_text+, which will give
+       #   all textual content contained in the root element and all it's
+       #   children.
+       def extract_content(tag_soup)
+         content = ''
+         ce = content_element(tag_soup)
+         ce.children { |child|
+           extract_text(child, content)
+         } unless ce.nil?
+         return content.strip
+       end
+
+       # extracts the href attributes of all a tags, except
+       # internal links like <a href="#top">
+       def extract_links(tagsoup)
+         tagsoup.find_all('a').map { |link|
+           CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
+         }.compact
+       end
+
+       # Extracts the title from the given html tree
+       def extract_title(tagsoup)
+         the_title_tag = title_tag(tagsoup)
+         if the_title_tag.is_a? String
+           the_title_tag
+         else
+           title = ''
+           extract_text(the_title_tag, title)
+           title.strip
+         end
+       end
+
+       # Recursively extracts all text contained in the given element,
+       # and appends it to content.
+       def extract_text(element, content='')
+         return nil if element.nil?
+         if element.is_a? NavigableString
+           value = strip_comments(element)
+           value.strip!
+           unless value.empty?
+             content << value
+             content << ' '
+           end
+         elsif element.string # it's a Tag, and it has some content string
+           # skip inline scripts and styles
+           return nil if element.name =~ /^(script|style)$/i
+           value = element.string.strip
+           unless value.empty?
+             content << value
+             content << ' '
+           end
+         else
+           element.children { |child|
+             extract_text(child, content)
+           }
+         end
+       end
+
+       # Returns the element to extract the title from.
+       #
+       # This may return a string, e.g. an attribute value selected from a meta
+       # tag, too.
+       def title_tag(tagsoup)
+         if @config.title_tag_selector
+           @config.title_tag_selector.call(tagsoup)
+         else
+           tagsoup.html.head.title
+         end
+       end
+
+       # Retrieve the root element to extract document content from
+       def content_element(tagsoup)
+         if @config.content_tag_selector
+           @config.content_tag_selector.call(tagsoup)
+         else
+           tagsoup.html.body
+         end
+       end
+
+       # Return the given string minus all html comments
+       def strip_comments(string)
+         string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
+       end
+     end
+
+   end
+ end
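
The BeautifulStoneSoup patch is carried over from content_extractors.rb: named entities are looked up in the bundled HTMLEntities::MAP, and numeric references are packed straight into UTF-8, e.g.:

    # what handle_charref does with "&#8364;" and "&#x20ac;":
    [8364].pack('U')    # => "€"
    [0x20ac].pack('U')  # => "€"
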
@@ -12,7 +12,7 @@ module RDig
    # takes the ferret section of the rdig configuration as a parameter.
    def initialize(settings)
      @ferret_config = settings
-     @query_parser = Ferret::QueryParser.new('*', settings.marshal_dump)
+     @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
      ferret_searcher
    end

@@ -24,8 +24,8 @@ module RDig
        @ferret_searcher = nil
      end
      unless @ferret_searcher
-       @ferret_searcher = IndexSearcher.new(@ferret_config.path)
-       @query_parser.fields = @ferret_searcher.reader.get_field_names.to_a
+       @ferret_searcher = Ferret::Search::Searcher.new(@ferret_config.path)
+       @query_parser.fields = @ferret_searcher.reader.field_names.to_a
      end
      @ferret_searcher
    end
@@ -36,23 +36,23 @@ module RDig
    # for more information on queries.
    # A Ferret::Search::Query instance may be given, too.
    #
-   # Otions are:
-   # first_doc:: first document in result list to retrieve (0-based). The default is 0.
-   # num_docs:: number of documents to retrieve. The default is 10.
+   # Some of the more often used options are:
+   # offset:: first document in result list to retrieve (0-based). The default is 0.
+   # limit:: number of documents to retrieve. The default is 10.
+   # Please see the Ferret::Search::Searcher API for more options.
    def search(query, options={})
      result = {}
      query = query_parser.parse(query) if query.is_a?(String)
      puts "Query: #{query}"
-     hits = ferret_searcher.search(query, options)
-     result[:hitcount] = hits.total_hits
      results = []
-     hits.each { |doc_id,score|
-       doc = ferret_searcher.reader.get_document doc_id
+     searcher = ferret_searcher
+     result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
+       doc = searcher[doc_id]
        results << { :score => score,
-                    :title => doc['title'],
-                    :url => doc['url'],
-                    :extract => build_extract(doc['data']) }
-     }
+                    :title => doc[:title],
+                    :url => doc[:url],
+                    :extract => build_extract(doc[:data]) }
+     end
      result[:list] = results
      result
    end
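
With the switch to Ferret::Search::Searcher#search_each the pagination options follow Ferret's naming, and stored fields are now read back with symbol keys. A usage sketch — the query string and option values are just examples:

    require 'rdig'
    # assumes an index has already been built for the configured site
    result = RDig.searcher.search('ruby indexing', :offset => 10, :limit => 10)

    puts "#{result[:hitcount]} documents found"
    result[:list].each do |hit|
      puts "#{hit[:score]}  #{hit[:title]}  #{hit[:url]}"
    end
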
data/rakefile CHANGED
@@ -125,12 +125,16 @@ else
    to help building a site search for web sites or intranets. Internally,
    Ferret is used for the full text indexing. After creating a config file
    for your site, the index can be built with a single call to rdig.
+   For HTML page crawling, hpricot and rubyful_soup are supported.
  EOF

  #### Dependencies and requirements.

  s.add_dependency('ferret', '>= 0.10.0')
- s.add_dependency('rubyful_soup', '>= 1.0.4')
+ # TODO: check if there is anything like 'suggested' instead of required, or
+ # ORed dependencies...
+ #s.add_dependency('rubyful_soup', '>= 1.0.4')
+ #s.add_dependency('hpricot', '>= 0.4')
  #s.requirements << ""

  #### Which files are to be included in this gem? Everything! (Except CVS directories.)
@@ -0,0 +1,77 @@
+ require 'test_helper'
+ class HpricotContentExtractorTest < Test::Unit::TestCase
+   include TestHelper
+
+   def setup
+     @config = RDig.config.content_extraction.hpricot.clone
+     @extractor = ContentExtractors::HpricotContentExtractor.new(OpenStruct.new(:hpricot => @config))
+     @nbsp = [160].pack('U') # non breaking space
+   end
+
+   def test_can_do
+     assert !@extractor.can_do('application/pdf')
+     assert !@extractor.can_do('application/msword')
+     assert @extractor.can_do('text/html')
+     assert @extractor.can_do('text/xml')
+     assert @extractor.can_do('application/xml')
+     assert @extractor.can_do('application/xhtml+xml')
+   end
+
+   def test_simple
+     result = ContentExtractors.process(html_doc('simple'), 'text/html')
+     assert_not_nil result
+     assert_equal 'Sample Title', result[:title]
+     assert_not_nil result[:content]
+     assert_not_nil result[:links]
+     assert_equal 1, result[:links].size
+     assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
+     assert_equal 'http://test.host/affe.html', result[:links].first
+   end
+
+   def test_entities
+     result = @extractor.process(html_doc('entities'))
+     assert_equal 'Sample & Title', result[:title]
+     assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
+     assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
+     assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
+   end
+
+   def test_custom_content_element
+     @config.title_tag_selector = lambda do |doc|
+       doc.at("h1[@class='title']")
+     end
+     @config.content_tag_selector = lambda do |doc|
+       doc.at("div[@id='content']")
+     end
+     result = @extractor.process(html_doc('custom_tag_selectors'))
+     assert_equal 'Sample Title in h1', result[:title]
+     assert_equal 'Affe Real content is here.', result[:content]
+     # check if links are collected outside the content tag, too:
+     assert_equal 3, result[:links].size
+     assert_equal 'http://test.host/outside.html', result[:links].first
+     assert_equal '/inside.html', result[:links][1]
+     assert_equal '/footer.html', result[:links][2]
+   end
+
+
+   def test_title_from_dcmeta
+     @config.title_tag_selector = lambda do |doc|
+       doc.at("meta[@name='DC.title']")['content']
+     end
+     result = @extractor.process(html_doc('custom_tag_selectors'))
+     assert_equal 'Title from DC meta data', result[:title]
+   end
+
+   def test_preprocessed_title
+     @config.title_tag_selector = lambda do |doc|
+       title = doc.at("meta[@name='DC.title']")['content']
+       # use only a portion of the title tag's contents if it matches our
+       # regexp:
+       (title =~ /^(.*)meta data$/ ? $1 : title).strip
+     end
+     result = @extractor.process(html_doc('custom_tag_selectors'))
+     assert_equal 'Title from DC', result[:title]
+   end
+
+ end
+
@@ -1,10 +1,16 @@
  require 'test_helper'
- class HtmlContentExtractorTest < Test::Unit::TestCase
+ class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
    include TestHelper

    def setup
-     @config = OpenStruct.new(:html => RDig.config.content_extraction.html.clone)
-     @extractor = ContentExtractors::HtmlContentExtractor.new(@config)
+     @config = OpenStruct.new(
+       :content_tag_selector => lambda { |tagsoup|
+         tagsoup.html.body
+       },
+       :title_tag_selector => lambda { |tagsoup|
+         tagsoup.html.head.title
+       })
+     @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
      @nbsp = [160].pack('U') # non breaking space
    end

@@ -37,10 +43,10 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
    end

    def test_custom_content_element
-     @config.html.title_tag_selector = lambda do |tagsoup|
+     @config.title_tag_selector = lambda do |tagsoup|
        tagsoup.find('h1', :attrs => { 'class', 'title' })
      end
-     @config.html.content_tag_selector = lambda do |tagsoup|
+     @config.content_tag_selector = lambda do |tagsoup|
        tagsoup.find('div', :attrs => { 'id', 'content' })
      end
      result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -55,7 +61,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase


    def test_title_from_dcmeta
-     @config.html.title_tag_selector = lambda do |tagsoup|
+     @config.title_tag_selector = lambda do |tagsoup|
        tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
      end
      result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -63,7 +69,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
    end

    def test_preprocessed_title
-     @config.html.title_tag_selector = lambda do |tagsoup|
+     @config.title_tag_selector = lambda do |tagsoup|
        title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
        # use only a portion of the title tag's contents if it matches our
        # regexp:
@@ -0,0 +1,35 @@
+ require 'test_helper'
+ class SearcherTest < Test::Unit::TestCase
+   include TestHelper
+
+   def setup
+     @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
+     index_dir = 'tmp/test-index'
+     Dir.mkdir index_dir unless File.directory? index_dir
+     RDig.configuration do |cfg|
+       @old_crawler_cfg = cfg.crawler.clone
+       cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
+       cfg.crawler.num_threads = 1
+       cfg.crawler.wait_before_leave = 1
+       cfg.index.path = index_dir
+       cfg.verbose = true
+     end
+     crawler = Crawler.new
+     crawler.run
+   end
+
+   def teardown
+     RDig.configuration do |cfg|
+       cfg.crawler = @old_crawler_cfg
+     end
+   end
+
+   def test_search
+     result = RDig.searcher.search 'some sample text'
+     assert_equal 3, result[:hitcount]
+     assert_equal 3, result[:list].size
+   end
+
+ end
+
+
metadata CHANGED
@@ -3,15 +3,15 @@ rubygems_version: 0.9.0
  specification_version: 1
  name: rdig
  version: !ruby/object:Gem::Version
-   version: 0.3.2
- date: 2006-10-09 00:00:00 +02:00
+   version: 0.3.3
+ date: 2006-10-23 00:00:00 +02:00
  summary: Ruby based web site indexing and searching library.
  require_paths:
  - lib
  email: jk@jkraemer.net
  homepage: http://rdig.rubyforge.org/
  rubyforge_project: rdig
- description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig.
+ description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
  autorequire:
  default_executable: rdig
  bindir: bin
@@ -33,6 +33,7 @@ files:
  - lib/rdig
  - lib/htmlentities
  - lib/rdig.rb
+ - lib/rdig/content_extractors
  - lib/rdig/crawler.rb
  - lib/rdig/search.rb
  - lib/rdig/highlight.rb
@@ -41,6 +42,10 @@ files:
  - lib/rdig/content_extractors.rb
  - lib/rdig/documents.rb
  - lib/rdig/file.rb
+ - lib/rdig/content_extractors/rubyful_soup.rb
+ - lib/rdig/content_extractors/doc.rb
+ - lib/rdig/content_extractors/hpricot.rb
+ - lib/rdig/content_extractors/pdf.rb
  - lib/htmlentities/CHANGES
  - lib/htmlentities/COPYING
  - lib/htmlentities/README
@@ -50,8 +55,10 @@ files:
  - test/test_helper.rb
  - test/unit/etag_filter_test.rb
  - test/unit/url_filters_test.rb
- - test/unit/html_content_extractor_test.rb
+ - test/unit/searcher_test.rb
+ - test/unit/rubyful_soup_content_extractor_test.rb
  - test/unit/pdf_content_extractor_test.rb
+ - test/unit/hpricot_content_extractor_test.rb
  - test/unit/word_content_extractor_test.rb
  - test/unit/file_document_test.rb
  - test/unit/crawler_fs_test.rb
@@ -100,12 +107,3 @@ dependencies:
      - !ruby/object:Gem::Version
        version: 0.10.0
    version:
- - !ruby/object:Gem::Dependency
-   name: rubyful_soup
-   version_requirement:
-   version_requirements: !ruby/object:Gem::Version::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: 1.0.4
-   version: