rdig 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,7 +27,40 @@ RDig.configuration do |cfg|
27
27
  # cfg.verbose = false
28
28
 
29
29
  # content extraction options
30
+ cfg.content_extraction = OpenStruct.new(
30
31
 
32
+ # HPRICOT configuration
33
+ # this is the html parser used by default from RDig 0.3.3 upwards.
34
+ # Hpricot by far outperforms Rubyful Soup, and is at least as flexible when
35
+ # it comes to selection of portions of the html documents.
36
+ :hpricot => OpenStruct.new(
37
+ # css selector for the element containing the page title
38
+ :title_tag_selector => 'title',
39
+ # might also be a proc returning either an element or a string:
40
+ # :title_tag_selector => lambda { |hpricot_doc| ... }
41
+ :content_tag_selector => 'body'
42
+ # might also be a proc returning either an element or a string:
43
+ # :content_tag_selector => lambda { |hpricot_doc| ... }
44
+ )
45
+
46
+ # RUBYFUL SOUP
47
+ # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
48
+ # RDig's default html parser up to version 0.3.2. To use it, comment the
49
+ # hpricot config above, and uncomment the following:
50
+ #
51
+ # :rubyful_soup => OpenStruct.new(
52
+ # # select the html element that contains the content to index
53
+ # # by default, we index all inside the body tag:
54
+ # :content_tag_selector => lambda { |tagsoup|
55
+ # tagsoup.html.body
56
+ # },
57
+ # # select the html element containing the title
58
+ # :title_tag_selector => lambda { |tagsoup|
59
+ # tagsoup.html.head.title
60
+ # }
61
+ # )
62
+ )
63
+
31
64
  # provide a method that returns the title of an html document
32
65
  # this method may either return a tag to extract the title from,
33
66
  # or a ready-to-index string.
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.3.2'
27
+ RDIGVERSION = '0.3.3'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -42,22 +42,12 @@ require 'open-uri'
42
42
 
43
43
  begin
44
44
  require 'ferret'
45
- require 'rubyful_soup'
46
45
  rescue LoadError
47
46
  require 'rubygems'
48
47
  require 'ferret'
49
- require 'rubyful_soup'
50
48
  end
51
49
 
52
50
  require 'htmlentities/htmlentities'
53
-
54
- require 'rdig/content_extractors'
55
- require 'rdig/url_filters'
56
- require 'rdig/search'
57
- require 'rdig/index'
58
- require 'rdig/file'
59
- require 'rdig/documents'
60
- require 'rdig/crawler'
61
51
 
62
52
 
63
53
  $KCODE = 'u'
@@ -124,25 +114,37 @@ module RDig
124
114
  :wait_before_leave => 10
125
115
  ),
126
116
  :content_extraction => OpenStruct.new(
127
- # settings for html content extraction
128
- :html => OpenStruct.new(
129
- # select the html element that contains the content to index
130
- # by default, we index all inside the body tag:
131
- :content_tag_selector => lambda { |tagsoup|
132
- tagsoup.html.body
133
- },
134
- # select the html element containing the title
135
- :title_tag_selector => lambda { |tagsoup|
136
- tagsoup.html.head.title
137
- }
117
+ # settings for html content extraction (hpricot)
118
+ :hpricot => OpenStruct.new(
119
+ # css selector for the element containing the page title
120
+ :title_tag_selector => 'title',
121
+ # might also be a proc returning either an element or a string:
122
+ # :title_tag_selector => lambda { |hpricot_doc| ... }
123
+ :content_tag_selector => 'body'
124
+ # might also be a proc returning either an element or a string:
125
+ # :content_tag_selector => lambda { |hpricot_doc| ... }
138
126
  )
127
+ #,
128
+ # # settings for html content extraction (RubyfulSoup)
129
+ # :rubyful_soup => OpenStruct.new(
130
+ # # select the html element that contains the content to index
131
+ # # by default, we index all inside the body tag:
132
+ # :content_tag_selector => lambda { |tagsoup|
133
+ # tagsoup.html.body
134
+ # },
135
+ # # select the html element containing the title
136
+ # :title_tag_selector => lambda { |tagsoup|
137
+ # tagsoup.html.head.title
138
+ # }
139
+ # )
139
140
  ),
140
141
  :index => OpenStruct.new(
141
142
  :path => "index/",
142
143
  :create => true,
143
144
  :handle_parse_errors => true,
144
145
  :analyzer => Ferret::Analysis::StandardAnalyzer.new,
145
- :occur_default => :must
146
+ :occur_default => :must,
147
+ :default_field => '*'
146
148
  )
147
149
  )
148
150
  end
@@ -261,3 +263,13 @@ module RDig
261
263
  end
262
264
  end
263
265
  end
266
+
267
+ require 'rdig/content_extractors'
268
+ require 'rdig/url_filters'
269
+ require 'rdig/search'
270
+ require 'rdig/index'
271
+ require 'rdig/file'
272
+ require 'rdig/documents'
273
+ require 'rdig/crawler'
274
+
275
+
@@ -1,26 +1,3 @@
1
- # override some methods concered with entity resolving
2
- # to convert them to strings
3
- class BeautifulStoneSoup
4
- # resolve unknown html entities using the htmlentities lib
5
- alias :orig_unknown_entityref :unknown_entityref
6
- def unknown_entityref(ref)
7
- if HTMLEntities::MAP.has_key?(ref)
8
- handle_data [HTMLEntities::MAP[ref]].pack('U')
9
- else
10
- orig_unknown_entityref ref
11
- end
12
- end
13
-
14
- # resolve numeric entities to utf8
15
- def handle_charref(ref)
16
- handle_data( ref.gsub(/([0-9]{1,7})/) {
17
- [$1.to_i].pack('U')
18
- }.gsub(/x([0-9a-f]{1,6})/i) {
19
- [$1.to_i(16)].pack('U')
20
- } )
21
- end
22
- end
23
-
24
1
  module RDig
25
2
 
26
3
  # Contains classes which are used for extracting content and meta data from
@@ -30,15 +7,6 @@ module RDig
30
7
  # process the given +content+ depending on its +content_type+.
31
8
  def self.process(content, content_type)
32
9
  ContentExtractor.process(content, content_type)
33
- # case content_type
34
- #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
35
- # return HtmlContentExtractor.process(content)
36
- #when /^application\/.+pdf/
37
- # return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
38
- #else
39
- # puts "unable to handle content type #{content_type}"
40
- #end
41
- #return nil
42
10
  end
43
11
 
44
12
  # Base class for Content Extractors.
@@ -48,7 +16,7 @@ module RDig
48
16
 
49
17
  def self.inherited(extractor)
50
18
  super(extractor)
51
- puts("discovered content extractor class: #{extractor}")
19
+ puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
52
20
  self.extractors << extractor
53
21
  end
54
22
 
@@ -72,7 +40,7 @@ module RDig
72
40
  end
73
41
 
74
42
  def can_do(content_type)
75
- content_type =~ @pattern
43
+ @pattern && content_type =~ @pattern
76
44
  end
77
45
  end
78
46
 
@@ -104,197 +72,14 @@ module RDig
104
72
  end
105
73
  end
106
74
 
107
- # Extract text from pdf content.
108
- #
109
- # Requires the pdftotext and pdfinfo utilities from the
110
- # xpdf-utils package
111
- # (on debian and friends do 'apt-get install xpdf-utils')
112
- #
113
- class PdfContentExtractor < ContentExtractor
114
- include ExternalAppHelper
115
-
116
- def initialize(config)
117
- super(config)
118
- @pattern = /^application\/pdf/
119
- @pdftotext = 'pdftotext'
120
- @pdfinfo = 'pdfinfo'
121
- @available = true
122
- [ @pdftotext, @pdfinfo].each { |program|
123
- unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
124
- @available = false
125
- break
126
- end
127
- }
128
- end
129
-
130
- def process(content)
131
- result = {}
132
- as_file(content) do |file|
133
- result[:content] = get_content(file.path).strip
134
- result[:title] = get_title(file.path)
135
- end
136
- result
137
- end
138
-
139
- def get_content(path_to_tempfile)
140
- %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
141
- end
142
-
143
- # extracts the title from pdf meta data
144
- # needs pdfinfo
145
- # returns the title or nil if no title was found
146
- def get_title(path_to_tempfile)
147
- %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
148
- rescue
149
- end
150
- end
151
-
152
- # Extract text from word documents
153
- #
154
- # Requires the wvHtml utility
155
- # (on debian and friends do 'apt-get install wv')
156
- class WordContentExtractor < ContentExtractor
157
- include ExternalAppHelper
158
-
159
- def initialize(config)
160
- super(config)
161
- @wvhtml = 'wvHtml'
162
- @pattern = /^application\/msword/
163
- # html extractor for parsing wvHtml output
164
- @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
165
- :html => OpenStruct.new(
166
- :content_tag_selector => lambda { |tagsoup|
167
- tagsoup.html.body
168
- },
169
- :title_tag_selector => lambda { |tagsoup|
170
- tagsoup.html.head.title
171
- }
172
- )))
173
-
174
- # TODO: besser: if $?.exitstatus == 127 (not found)
175
- @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
176
- end
177
-
178
- def process(content)
179
- result = {}
180
- as_file(content) do |file|
181
- result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
182
- end
183
- return result || {}
184
- end
185
-
186
- end
187
-
188
- # extracts title, content and links from html documents
189
- class HtmlContentExtractor < ContentExtractor
190
-
191
- def initialize(config)
192
- super(config)
193
- @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
194
- end
195
-
196
- # returns:
197
- # { :content => 'extracted clear text',
198
- # :meta => { :title => 'Title' },
199
- # :links => [array of urls] }
200
- def process(content)
201
- result = { }
202
- tag_soup = BeautifulSoup.new(content)
203
- result[:title] = extract_title(tag_soup)
204
- result[:links] = extract_links(tag_soup)
205
- result[:content] = extract_content(tag_soup)
206
- return result
207
- end
208
-
209
- # Extracts textual content from the HTML tree.
210
- #
211
- # - First, the root element to use is determined using the
212
- # +content_element+ method, which itself uses the content_tag_selector
213
- # from RDig.configuration.
214
- # - Then, this element is processed by +extract_text+, which will give
215
- # all textual content contained in the root element and all it's
216
- # children.
217
- def extract_content(tag_soup)
218
- content = ''
219
- ce = content_element(tag_soup)
220
- ce.children { |child|
221
- extract_text(child, content)
222
- } unless ce.nil?
223
- return content.strip
224
- end
225
-
226
- # extracts the href attributes of all a tags, except
227
- # internal links like <a href="#top">
228
- def extract_links(tagsoup)
229
- tagsoup.find_all('a').map { |link|
230
- CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
231
- }.compact
232
- end
233
-
234
- # Extracts the title from the given html tree
235
- def extract_title(tagsoup)
236
- the_title_tag = title_tag(tagsoup)
237
- if the_title_tag.is_a? String
238
- the_title_tag
239
- else
240
- title = ''
241
- extract_text(the_title_tag, title)
242
- title.strip
243
- end
244
- end
245
-
246
- # Recursively extracts all text contained in the given element,
247
- # and appends it to content.
248
- def extract_text(element, content='')
249
- return nil if element.nil?
250
- if element.is_a? NavigableString
251
- value = strip_comments(element)
252
- value.strip!
253
- unless value.empty?
254
- content << value
255
- content << ' '
256
- end
257
- elsif element.string # it's a Tag, and it has some content string
258
- # skip inline scripts and styles
259
- return nil if element.name =~ /^(script|style)$/i
260
- value = element.string.strip
261
- unless value.empty?
262
- content << value
263
- content << ' '
264
- end
265
- else
266
- element.children { |child|
267
- extract_text(child, content)
268
- }
269
- end
270
- end
271
-
272
- # Returns the element to extract the title from.
273
- #
274
- # This may return a string, e.g. an attribute value selected from a meta
275
- # tag, too.
276
- def title_tag(tagsoup)
277
- if @config.html.title_tag_selector
278
- @config.html.title_tag_selector.call(tagsoup)
279
- else
280
- tagsoup.html.head.title
281
- end
282
- end
283
-
284
- # Retrieve the root element to extract document content from
285
- def content_element(tagsoup)
286
- if @config.html.content_tag_selector
287
- @config.html.content_tag_selector.call(tagsoup)
288
- else
289
- tagsoup.html.body
290
- end
291
- end
292
-
293
- # Return the given string minus all html comments
294
- def strip_comments(string)
295
- string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
296
- end
297
- end
75
+ end
76
+ end
298
77
 
78
# Load every content extractor found below the content_extractors/
# directory that sits next to this file.
#
# Extractors may depend on libraries that are not installed (their files
# require those libraries at load time). A failing +require+ raises
# LoadError, which is a ScriptError and therefore NOT caught by a bare
# +rescue+ -- so it is named explicitly here. Any problem while loading a
# single extractor is reported and the remaining extractors are still loaded.
Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
  begin
    require f
  rescue LoadError, StandardError
    puts "error loading #{f}: #{$!}"
  end
end
@@ -0,0 +1,41 @@
1
module RDig
  module ContentExtractors

    # Content extractor for MS Word documents.
    #
    # Depends on the external wvHtml utility
    # (on debian and friends do 'apt-get install wv'). The html produced by
    # wvHtml is parsed by a RubyfulSoupContentExtractor.
    class WordContentExtractor < ContentExtractor
      include ExternalAppHelper

      # Sets up the content type pattern, the html extractor used for the
      # converter's output, and probes whether wvHtml can be run.
      def initialize(config)
        super(config)
        @wvhtml = 'wvHtml'
        @pattern = /^application\/msword/

        # html extractor for parsing wvHtml output
        soup_settings = OpenStruct.new(
          :content_tag_selector => lambda { |tagsoup| tagsoup.html.body },
          :title_tag_selector   => lambda { |tagsoup| tagsoup.html.head.title }
        )
        @html_extractor = RubyfulSoupContentExtractor.new(
          OpenStruct.new(:rubyful_soup => soup_settings)
        )

        # TODO: better: if $?.exitstatus == 127 (not found)
        help_output = %x{#{@wvhtml} -h 2>&1}
        @available = help_output =~ /Dom Lachowicz/
      end

      # Converts +content+ to html with wvHtml and returns whatever the html
      # extractor makes of it, or an empty hash if that yields nothing.
      #
      # NOTE(review): +as_file+ comes from ExternalAppHelper (not visible
      # here); presumably it yields a temporary file holding +content+.
      def process(content)
        extracted = {}
        as_file(content) do |file|
          html = %x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -}
          extracted = @html_extractor.process(html)
        end
        extracted || {}
      end

    end

  end
end
@@ -0,0 +1,99 @@
1
+ begin
2
+ require 'hpricot'
3
+ rescue LoadError
4
+ require 'rubygems'
5
+ require 'hpricot'
6
+ end
7
+
8
module RDig
  module ContentExtractors

    # Extracts title, content and links from html documents using the
    # Hpricot library.
    #
    # The extractor only claims html/xml content types when the content
    # extraction configuration has an +hpricot+ section.
    class HpricotContentExtractor < ContentExtractor

      # Patterns used to reduce markup to plain text (multiline, utf-8).
      # They are literals so they are compiled only once, and because the
      # three-argument Regexp.new form is not accepted by newer Ruby versions.
      COMMENT_PATTERN      = /<!--.*?-->/mu
      SCRIPT_STYLE_PATTERN = /<(script|style).*?>.*?<\/(script|style).*?>/mu
      TAG_PATTERN          = /<.+?>/mu
      WHITESPACE_PATTERN   = /\s+/mu

      # config:: content extraction settings; only the +hpricot+ member is
      #          handed on to the base class.
      def initialize(config)
        super(config.hpricot)
        # if not configured, refuse to handle any content:
        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.hpricot
      end

      # Parses +content+ and returns:
      # { :content => 'extracted clear text',
      #   :title => 'Title',
      #   :links => [array of urls] }
      def process(content)
        doc = Hpricot(content)
        {
          :title   => extract_title(doc).decode_entities,
          :links   => extract_links(doc),
          :content => extract_content(doc).decode_entities
        }
      end

      # Extracts textual content from the HTML tree.
      #
      # - First, the root element to use is determined using the
      #   +content_element+ method, which itself uses the content_tag_selector
      #   from the configuration.
      # - Then comments and tags are removed from that element's inner html.
      #
      # Returns an empty string if no content element was found.
      def extract_content(doc)
        ce = content_element(doc)
        return '' unless ce
        strip_tags(strip_comments(ce.inner_html)).strip
      end

      # Extracts the href attributes of all a tags, except
      # internal links like <a href="#top">
      def extract_links(doc)
        (doc/'a').map { |link|
          href = link['href']
          CGI.unescapeHTML(href) if href && href !~ /^#/
        }.compact
      end

      # Extracts the title from the given html tree.
      #
      # The title selector may yield an element (its inner html is used,
      # minus tags), a ready-to-use string (returned as is), or nothing at
      # all. In the latter case an empty string is returned, so +process+
      # does not fail on documents lacking a title.
      def extract_title(doc)
        the_title_tag = title_tag(doc)
        return '' if the_title_tag.nil?
        return the_title_tag unless the_title_tag.respond_to? :inner_html
        strip_tags(the_title_tag.inner_html)
      end

      # Returns the element to extract the title from.
      #
      # This may return a string, e.g. an attribute value selected from a meta
      # tag, too.
      def title_tag(doc)
        tag_from_config(doc, :title_tag_selector) || doc.at('title')
      end

      # Retrieve the root element to extract document content from
      def content_element(doc)
        tag_from_config(doc, :content_tag_selector) || doc.at('body')
      end

      # Applies the selector stored under +config_key+ to +doc+.
      #
      # A String selector is used to search +doc+, anything else is expected
      # to respond to +call+ and is invoked with +doc+. Returns nil if no
      # selector is configured.
      def tag_from_config(doc, config_key)
        selector = @config.send(config_key)
        return nil unless selector
        selector.is_a?(String) ? (doc/selector) : selector.call(doc)
      end

      # Return the given string minus all html comments
      def strip_comments(string)
        string.gsub(COMMENT_PATTERN, '')
      end

      # Returns a copy of +string+ with script/style elements (including
      # their content) and all remaining tags removed, and every run of
      # whitespace collapsed into a single blank. The argument itself is
      # left untouched.
      def strip_tags(string)
        string.gsub(SCRIPT_STYLE_PATTERN, '').
               gsub(TAG_PATTERN, '').
               gsub(WHITESPACE_PATTERN, ' ')
      end

    end

  end
end
@@ -0,0 +1,49 @@
1
module RDig
  module ContentExtractors

    # Content extractor for pdf documents.
    #
    # Depends on the pdftotext and pdfinfo utilities from the
    # xpdf-utils package
    # (on debian and friends do 'apt-get install xpdf-utils')
    class PdfContentExtractor < ContentExtractor
      include ExternalAppHelper

      # Sets up the content type pattern and checks whether both helper
      # programs answer with the expected help output; checking stops at the
      # first program that does not.
      def initialize(config)
        super(config)
        @pattern = /^application\/pdf/
        @pdftotext = 'pdftotext'
        @pdfinfo = 'pdfinfo'
        @available = [@pdftotext, @pdfinfo].all? do |program|
          %x{#{program} -h 2>&1} =~ /Copyright 1996/
        end
      end

      # Returns a hash with the stripped text (:content) and the title
      # (:title, may be nil) of the pdf given as +content+.
      #
      # NOTE(review): +as_file+ comes from ExternalAppHelper (not visible
      # here); presumably it yields a temporary file holding +content+.
      def process(content)
        extracted = {}
        as_file(content) do |file|
          pdf_path = file.path
          extracted[:content] = get_content(pdf_path).strip
          extracted[:title] = get_title(pdf_path)
        end
        extracted
      end

      # runs pdftotext on the given file and returns its output
      def get_content(path_to_tempfile)
        %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
      end

      # extracts the title from pdf meta data
      # needs pdfinfo
      # returns the title or nil if no title was found (or an error occurred)
      def get_title(path_to_tempfile)
        info = %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'}
        if info =~ /title:\s+(.*)$/i
          $1.strip
        else
          nil
        end
      rescue
        nil
      end
    end

  end
end
@@ -0,0 +1,147 @@
1
+ begin
2
+ require 'rubyful_soup'
3
+ rescue LoadError
4
+ require 'rubygems'
5
+ require 'rubyful_soup'
6
+ end
7
+
8
# Override some methods concerned with entity resolving
# to convert entities to (utf-8) strings.
class BeautifulStoneSoup
  # keep the parser's own handler reachable as a fallback
  alias :orig_unknown_entityref :unknown_entityref

  # Resolve unknown html entities using the htmlentities lib: if +ref+ is a
  # key of HTMLEntities::MAP, the mapped value is packed as a utf-8
  # character and passed to +handle_data+. Otherwise the original
  # implementation takes over.
  def unknown_entityref(ref)
    if HTMLEntities::MAP.has_key?(ref)
      handle_data [HTMLEntities::MAP[ref]].pack('U')
    else
      orig_unknown_entityref ref
    end
  end

  # Resolve numeric entities to utf8.
  #
  # Both the hexadecimal ("x41") and the decimal ("65") notation are
  # handled in one pass. The hexadecimal alternative has to be tried first:
  # replacing decimal digit runs beforehand would consume the digits of a
  # hexadecimal reference ("x41" would end up as "x)" instead of "A").
  def handle_charref(ref)
    handle_data(ref.gsub(/x([0-9a-f]{1,6})|([0-9]{1,7})/i) {
      [$1 ? $1.to_i(16) : $2.to_i].pack('U')
    })
  end
end
30
+
31
module RDig
  module ContentExtractors

    # Pulls title, links and plain text out of html documents by means of
    # the Rubyful Soup parser.
    class RubyfulSoupContentExtractor < ContentExtractor

      # config:: content extraction settings; only the +rubyful_soup+ member
      #          is handed on to the base class.
      def initialize(config)
        soup_config = config.rubyful_soup
        super(soup_config)
        # if not configured, refuse to handle any content:
        if soup_config
          @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
        end
      end

      # Parses +content+ and returns:
      # { :title => 'Title',
      #   :links => [array of urls],
      #   :content => 'extracted clear text' }
      def process(content)
        soup = BeautifulSoup.new(content)
        {
          :title   => extract_title(soup),
          :links   => extract_links(soup),
          :content => extract_content(soup)
        }
      end

      # Collects the text found below the content root element.
      #
      # The root element is looked up via +content_element+ (which honours
      # the configured content_tag_selector); each of its children is then
      # run through +extract_text+. Yields an empty string if there is no
      # root element.
      def extract_content(tag_soup)
        text = ''
        root = content_element(tag_soup)
        unless root.nil?
          root.children { |node| extract_text(node, text) }
        end
        text.strip
      end

      # Returns the (html-unescaped) href values of all a tags, leaving out
      # page internal links like <a href="#top">
      def extract_links(tagsoup)
        tagsoup.find_all('a').map { |anchor|
          href = anchor['href']
          CGI.unescapeHTML(href) if href && href !~ /^#/
        }.compact
      end

      # Determines the document title: a String delivered by +title_tag+ is
      # used as is, anything else is reduced to its text.
      def extract_title(tagsoup)
        candidate = title_tag(tagsoup)
        return candidate if candidate.is_a?(String)
        text = ''
        extract_text(candidate, text)
        text.strip
      end

      # Walks +element+ recursively and appends every piece of text found
      # to +content+, each followed by a blank.
      def extract_text(element, content='')
        return nil if element.nil?
        if element.is_a?(NavigableString)
          append_text(content, strip_comments(element).strip)
        elsif element.string # it's a Tag, and it has some content string
          # inline scripts and styles are not indexed
          return nil if element.name =~ /^(script|style)$/i
          append_text(content, element.string.strip)
        else
          element.children { |child| extract_text(child, content) }
        end
      end

      # Returns what the title is taken from: the result of the configured
      # title_tag_selector, or the document's title tag if none is set.
      #
      # A selector may just as well return a string, e.g. an attribute value
      # picked from a meta tag.
      def title_tag(tagsoup)
        selector = @config.title_tag_selector
        selector ? selector.call(tagsoup) : tagsoup.html.head.title
      end

      # Returns the root element for content extraction: the result of the
      # configured content_tag_selector, or the body tag if none is set.
      def content_element(tagsoup)
        selector = @config.content_tag_selector
        selector ? selector.call(tagsoup) : tagsoup.html.body
      end

      # Returns a copy of the given string without any html comments
      def strip_comments(string)
        comment_pattern = Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u')
        string.gsub(comment_pattern, '')
      end

      private

      # appends +value+ plus a separating blank to +content+, unless
      # +value+ is empty
      def append_text(content, value)
        content << value << ' ' unless value.empty?
      end
    end

  end
end
@@ -12,7 +12,7 @@ module RDig
12
12
  # takes the ferret section of the rdig configuration as a parameter.
13
13
  def initialize(settings)
14
14
  @ferret_config = settings
15
- @query_parser = Ferret::QueryParser.new('*', settings.marshal_dump)
15
+ @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
16
16
  ferret_searcher
17
17
  end
18
18
 
@@ -24,8 +24,8 @@ module RDig
24
24
  @ferret_searcher = nil
25
25
  end
26
26
  unless @ferret_searcher
27
- @ferret_searcher = IndexSearcher.new(@ferret_config.path)
28
- @query_parser.fields = @ferret_searcher.reader.get_field_names.to_a
27
+ @ferret_searcher = Ferret::Search::Searcher.new(@ferret_config.path)
28
+ @query_parser.fields = @ferret_searcher.reader.field_names.to_a
29
29
  end
30
30
  @ferret_searcher
31
31
  end
@@ -36,23 +36,23 @@ module RDig
36
36
  # for more information on queries.
37
37
  # A Ferret::Search::Query instance may be given, too.
38
38
  #
39
- # Otions are:
40
- # first_doc:: first document in result list to retrieve (0-based). The default is 0.
41
- # num_docs:: number of documents to retrieve. The default is 10.
39
+ # Some of the more frequently used options are:
40
+ # offset:: first document in result list to retrieve (0-based). The default is 0.
41
+ # limit:: number of documents to retrieve. The default is 10.
42
+ # Please see the Ferret::Search::Searcher API for more options.
42
43
  def search(query, options={})
43
44
  result = {}
44
45
  query = query_parser.parse(query) if query.is_a?(String)
45
46
  puts "Query: #{query}"
46
- hits = ferret_searcher.search(query, options)
47
- result[:hitcount] = hits.total_hits
48
47
  results = []
49
- hits.each { |doc_id,score|
50
- doc = ferret_searcher.reader.get_document doc_id
48
+ searcher = ferret_searcher
49
+ result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
50
+ doc = searcher[doc_id]
51
51
  results << { :score => score,
52
- :title => doc['title'],
53
- :url => doc['url'],
54
- :extract => build_extract(doc['data']) }
55
- }
52
+ :title => doc[:title],
53
+ :url => doc[:url],
54
+ :extract => build_extract(doc[:data]) }
55
+ end
56
56
  result[:list] = results
57
57
  result
58
58
  end
data/rakefile CHANGED
@@ -125,12 +125,16 @@ else
125
125
  to help building a site search for web sites or intranets. Internally,
126
126
  Ferret is used for the full text indexing. After creating a config file
127
127
  for your site, the index can be built with a single call to rdig.
128
+ For HTML page crawling, hpricot and rubyful_soup are supported.
128
129
  EOF
129
130
 
130
131
  #### Dependencies and requirements.
131
132
 
132
133
  s.add_dependency('ferret', '>= 0.10.0')
133
- s.add_dependency('rubyful_soup', '>= 1.0.4')
134
+ # TODO: check if there is anything like 'suggested' instead of required, or
135
+ # ORed dependencies...
136
+ #s.add_dependency('rubyful_soup', '>= 1.0.4')
137
+ #s.add_dependency('hpricot', '>= 0.4')
134
138
  #s.requirements << ""
135
139
 
136
140
  #### Which files are to be included in this gem? Everything! (Except CVS directories.)
@@ -0,0 +1,77 @@
1
+ require 'test_helper'
2
# Unit tests for the Hpricot based html content extractor.
class HpricotContentExtractorTest < Test::Unit::TestCase
  include TestHelper

  # Every test works on its own copy of the default hpricot settings, so
  # selectors changed by a test do not touch the global configuration.
  def setup
    @config = RDig.config.content_extraction.hpricot.clone
    extraction_settings = OpenStruct.new(:hpricot => @config)
    @extractor = ContentExtractors::HpricotContentExtractor.new(extraction_settings)
    @nbsp = [160].pack('U') # non breaking space
  end

  # only html and xml content types are accepted
  def test_can_do
    %w(application/pdf application/msword).each do |content_type|
      assert !@extractor.can_do(content_type)
    end
    %w(text/html text/xml application/xml application/xhtml+xml).each do |content_type|
      assert @extractor.can_do(content_type)
    end
  end

  # goes through ContentExtractors.process instead of the extractor instance
  def test_simple
    result = ContentExtractors.process(html_doc('simple'), 'text/html')
    assert_not_nil result
    assert_equal 'Sample Title', result[:title]
    assert_not_nil result[:content]
    links = result[:links]
    assert_not_nil links
    assert_equal 1, links.size
    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
    assert_equal 'http://test.host/affe.html', links.first
  end

  # entities in title, links and content have to be decoded
  def test_entities
    result = @extractor.process(html_doc('entities'))
    links = result[:links]
    assert_equal 'Sample & Title', result[:title]
    assert_equal 'http://test.host/affe.html?b=a&c=d', links.first
    assert_equal 'http://test.host/affe2.html?b=a&c=d', links.last
    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
  end

  # title and content are taken from elements chosen by custom selectors
  def test_custom_content_element
    @config.title_tag_selector = lambda { |doc| doc.at("h1[@class='title']") }
    @config.content_tag_selector = lambda { |doc| doc.at("div[@id='content']") }
    result = @extractor.process(html_doc('custom_tag_selectors'))
    assert_equal 'Sample Title in h1', result[:title]
    assert_equal 'Affe Real content is here.', result[:content]
    # check if links are collected outside the content tag, too:
    links = result[:links]
    assert_equal 3, links.size
    assert_equal 'http://test.host/outside.html', links.first
    assert_equal '/inside.html', links[1]
    assert_equal '/footer.html', links[2]
  end


  # a selector may return a plain string (here: a meta tag's attribute)
  def test_title_from_dcmeta
    @config.title_tag_selector = lambda { |doc|
      doc.at("meta[@name='DC.title']")['content']
    }
    result = @extractor.process(html_doc('custom_tag_selectors'))
    assert_equal 'Title from DC meta data', result[:title]
  end

  # a selector may post-process the string it returns
  def test_preprocessed_title
    @config.title_tag_selector = lambda { |doc|
      title = doc.at("meta[@name='DC.title']")['content']
      # use only a portion of the title tag's contents if it matches our
      # regexp:
      (title =~ /^(.*)meta data$/ ? $1 : title).strip
    }
    result = @extractor.process(html_doc('custom_tag_selectors'))
    assert_equal 'Title from DC', result[:title]
  end

end
77
+
@@ -1,10 +1,16 @@
1
1
  require 'test_helper'
2
- class HtmlContentExtractorTest < Test::Unit::TestCase
2
+ class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
3
3
  include TestHelper
4
4
 
5
5
  def setup
6
- @config = OpenStruct.new(:html => RDig.config.content_extraction.html.clone)
7
- @extractor = ContentExtractors::HtmlContentExtractor.new(@config)
6
+ @config = OpenStruct.new(
7
+ :content_tag_selector => lambda { |tagsoup|
8
+ tagsoup.html.body
9
+ },
10
+ :title_tag_selector => lambda { |tagsoup|
11
+ tagsoup.html.head.title
12
+ })
13
+ @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
8
14
  @nbsp = [160].pack('U') # non breaking space
9
15
  end
10
16
 
@@ -37,10 +43,10 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
37
43
  end
38
44
 
39
45
  def test_custom_content_element
40
- @config.html.title_tag_selector = lambda do |tagsoup|
46
+ @config.title_tag_selector = lambda do |tagsoup|
41
47
  tagsoup.find('h1', :attrs => { 'class', 'title' })
42
48
  end
43
- @config.html.content_tag_selector = lambda do |tagsoup|
49
+ @config.content_tag_selector = lambda do |tagsoup|
44
50
  tagsoup.find('div', :attrs => { 'id', 'content' })
45
51
  end
46
52
  result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -55,7 +61,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
55
61
 
56
62
 
57
63
  def test_title_from_dcmeta
58
- @config.html.title_tag_selector = lambda do |tagsoup|
64
+ @config.title_tag_selector = lambda do |tagsoup|
59
65
  tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
60
66
  end
61
67
  result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -63,7 +69,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
63
69
  end
64
70
 
65
71
  def test_preprocessed_title
66
- @config.html.title_tag_selector = lambda do |tagsoup|
72
+ @config.title_tag_selector = lambda do |tagsoup|
67
73
  title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
68
74
  # use only a portion of the title tag's contents if it matches our
69
75
  # regexp:
@@ -0,0 +1,35 @@
1
+ require 'test_helper'
2
# Crawls the fixture directory into a scratch index and runs a search
# against it.
class SearcherTest < Test::Unit::TestCase
  include TestHelper

  # Points the crawler at the fixtures, redirects the index to a scratch
  # directory and builds the index. Every setting changed here is
  # remembered so that +teardown+ can put it back.
  def setup
    @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
    index_dir = 'tmp/test-index'
    Dir.mkdir index_dir unless File.directory? index_dir
    RDig.configuration do |cfg|
      @old_crawler_cfg = cfg.crawler.clone
      @old_index_path = cfg.index.path
      @old_verbose = cfg.verbose
      cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
      cfg.crawler.num_threads = 1
      cfg.crawler.wait_before_leave = 1
      cfg.index.path = index_dir
      cfg.verbose = true
    end
    crawler = Crawler.new
    crawler.run
  end

  # Restores everything +setup+ changed (crawler section, index path and
  # verbosity), so this test's settings do not leak into other tests.
  def teardown
    RDig.configuration do |cfg|
      cfg.crawler = @old_crawler_cfg
      cfg.index.path = @old_index_path
      cfg.verbose = @old_verbose
    end
  end

  # the reported hit count has to match the number of listed results
  def test_search
    result = RDig.searcher.search 'some sample text'
    assert_equal 3, result[:hitcount]
    assert_equal 3, result[:list].size
  end

end
34
+
35
+
metadata CHANGED
@@ -3,15 +3,15 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: rdig
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.2
7
- date: 2006-10-09 00:00:00 +02:00
6
+ version: 0.3.3
7
+ date: 2006-10-23 00:00:00 +02:00
8
8
  summary: Ruby based web site indexing and searching library.
9
9
  require_paths:
10
10
  - lib
11
11
  email: jk@jkraemer.net
12
12
  homepage: http://rdig.rubyforge.org/
13
13
  rubyforge_project: rdig
14
- description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig.
14
+ description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
15
15
  autorequire:
16
16
  default_executable: rdig
17
17
  bindir: bin
@@ -33,6 +33,7 @@ files:
33
33
  - lib/rdig
34
34
  - lib/htmlentities
35
35
  - lib/rdig.rb
36
+ - lib/rdig/content_extractors
36
37
  - lib/rdig/crawler.rb
37
38
  - lib/rdig/search.rb
38
39
  - lib/rdig/highlight.rb
@@ -41,6 +42,10 @@ files:
41
42
  - lib/rdig/content_extractors.rb
42
43
  - lib/rdig/documents.rb
43
44
  - lib/rdig/file.rb
45
+ - lib/rdig/content_extractors/rubyful_soup.rb
46
+ - lib/rdig/content_extractors/doc.rb
47
+ - lib/rdig/content_extractors/hpricot.rb
48
+ - lib/rdig/content_extractors/pdf.rb
44
49
  - lib/htmlentities/CHANGES
45
50
  - lib/htmlentities/COPYING
46
51
  - lib/htmlentities/README
@@ -50,8 +55,10 @@ files:
50
55
  - test/test_helper.rb
51
56
  - test/unit/etag_filter_test.rb
52
57
  - test/unit/url_filters_test.rb
53
- - test/unit/html_content_extractor_test.rb
58
+ - test/unit/searcher_test.rb
59
+ - test/unit/rubyful_soup_content_extractor_test.rb
54
60
  - test/unit/pdf_content_extractor_test.rb
61
+ - test/unit/hpricot_content_extractor_test.rb
55
62
  - test/unit/word_content_extractor_test.rb
56
63
  - test/unit/file_document_test.rb
57
64
  - test/unit/crawler_fs_test.rb
@@ -100,12 +107,3 @@ dependencies:
100
107
  - !ruby/object:Gem::Version
101
108
  version: 0.10.0
102
109
  version:
103
- - !ruby/object:Gem::Dependency
104
- name: rubyful_soup
105
- version_requirement:
106
- version_requirements: !ruby/object:Gem::Version::Requirement
107
- requirements:
108
- - - ">="
109
- - !ruby/object:Gem::Version
110
- version: 1.0.4
111
- version: