rdig 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/doc/examples/config.rb +33 -0
- data/lib/rdig.rb +35 -23
- data/lib/rdig/content_extractors.rb +10 -225
- data/lib/rdig/content_extractors/doc.rb +41 -0
- data/lib/rdig/content_extractors/hpricot.rb +99 -0
- data/lib/rdig/content_extractors/pdf.rb +49 -0
- data/lib/rdig/content_extractors/rubyful_soup.rb +147 -0
- data/lib/rdig/search.rb +14 -14
- data/rakefile +5 -1
- data/test/unit/hpricot_content_extractor_test.rb +77 -0
- data/test/unit/{html_content_extractor_test.rb → rubyful_soup_content_extractor_test.rb} +13 -7
- data/test/unit/searcher_test.rb +35 -0
- metadata +11 -13
data/doc/examples/config.rb
CHANGED
@@ -27,7 +27,40 @@ RDig.configuration do |cfg|
   # cfg.verbose = false
 
   # content extraction options
+  cfg.content_extraction = OpenStruct.new(
 
+    # HPRICOT configuration
+    # this is the html parser used by default from RDig 0.3.3 upwards.
+    # Hpricot by far outperforms Rubyful Soup, and is at least as flexible when
+    # it comes to selection of portions of the html documents.
+    :hpricot => OpenStruct.new(
+      # css selector for the element containing the page title
+      :title_tag_selector => 'title',
+      # might also be a proc returning either an element or a string:
+      # :title_tag_selector => lambda { |hpricot_doc| ... }
+      :content_tag_selector => 'body'
+      # might also be a proc returning either an element or a string:
+      # :content_tag_selector => lambda { |hpricot_doc| ... }
+    )
+
+    # RUBYFUL SOUP
+    # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
+    # RDig's default html parser up to version 0.3.2. To use it, comment the
+    # hpricot config above, and uncomment the following:
+    #
+    # :rubyful_soup => OpenStruct.new(
+    #   # select the html element that contains the content to index
+    #   # by default, we index all inside the body tag:
+    #   :content_tag_selector => lambda { |tagsoup|
+    #     tagsoup.html.body
+    #   },
+    #   # select the html element containing the title
+    #   :title_tag_selector => lambda { |tagsoup|
+    #     tagsoup.html.head.title
+    #   }
+    # )
+  )
+
   # provide a method that returns the title of an html document
   # this method may either return a tag to extract the title from,
   # or a ready-to-index string.
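For reference, a minimal 0.3.3 setup using the new Hpricot extractor could look like the sketch below. The selector values mirror the example config above; the start URL and index path are placeholders, and the proc form follows the commented hint in the diff.

RDig.configuration do |cfg|
  cfg.crawler.start_urls = ['http://www.example.com/']   # placeholder site
  cfg.index.path = '/tmp/example-index/'                 # placeholder index location
  # a string value is used as a css selector:
  cfg.content_extraction.hpricot.title_tag_selector = 'title'
  # a proc receives the Hpricot document and returns an element or a string:
  cfg.content_extraction.hpricot.content_tag_selector = lambda { |doc|
    doc.at("div[@id='content']")
  }
end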
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
 #++
 #
 
-RDIGVERSION = '0.3.2'
+RDIGVERSION = '0.3.3'
 
 
 require 'thread'
@@ -42,22 +42,12 @@ require 'open-uri'
 
 begin
   require 'ferret'
-  require 'rubyful_soup'
 rescue LoadError
   require 'rubygems'
   require 'ferret'
-  require 'rubyful_soup'
 end
 
 require 'htmlentities/htmlentities'
-
-require 'rdig/content_extractors'
-require 'rdig/url_filters'
-require 'rdig/search'
-require 'rdig/index'
-require 'rdig/file'
-require 'rdig/documents'
-require 'rdig/crawler'
 
 
 $KCODE = 'u'
@@ -124,25 +114,37 @@ module RDig
        :wait_before_leave => 10
      ),
      :content_extraction => OpenStruct.new(
-        # settings for html content extraction
-        :html => OpenStruct.new(
-          # select the html element that contains the content to index
-          # by default, we index all inside the body tag:
-          :content_tag_selector => lambda { |tagsoup|
-            tagsoup.html.body
-          },
-          # select the html element containing the title
-          :title_tag_selector => lambda { |tagsoup|
-            tagsoup.html.head.title
-          }
+        # settings for html content extraction (hpricot)
+        :hpricot => OpenStruct.new(
+          # css selector for the element containing the page title
+          :title_tag_selector => 'title',
+          # might also be a proc returning either an element or a string:
+          # :title_tag_selector => lambda { |hpricot_doc| ... }
+          :content_tag_selector => 'body'
+          # might also be a proc returning either an element or a string:
+          # :content_tag_selector => lambda { |hpricot_doc| ... }
        )
+        #,
+        # # settings for html content extraction (RubyfulSoup)
+        # :rubyful_soup => OpenStruct.new(
+        #   # select the html element that contains the content to index
+        #   # by default, we index all inside the body tag:
+        #   :content_tag_selector => lambda { |tagsoup|
+        #     tagsoup.html.body
+        #   },
+        #   # select the html element containing the title
+        #   :title_tag_selector => lambda { |tagsoup|
+        #     tagsoup.html.head.title
+        #   }
+        # )
      ),
      :index => OpenStruct.new(
        :path => "index/",
        :create => true,
        :handle_parse_errors => true,
        :analyzer => Ferret::Analysis::StandardAnalyzer.new,
-        :occur_default => :must
+        :occur_default => :must,
+        :default_field => '*'
      )
    )
  end
@@ -261,3 +263,13 @@ module RDig
    end
  end
 end
+
+require 'rdig/content_extractors'
+require 'rdig/url_filters'
+require 'rdig/search'
+require 'rdig/index'
+require 'rdig/file'
+require 'rdig/documents'
+require 'rdig/crawler'
+
+
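The defaults above can be overridden per project via RDig.configuration; a small sketch (index path is a placeholder) showing how the new :default_field => '*' setting might be narrowed to a single field:

RDig.configuration do |cfg|
  cfg.index.path = '/var/rdig/index/'   # placeholder location
  # search only the title field instead of all fields ('*'):
  cfg.index.default_field = 'title'
end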
data/lib/rdig/content_extractors.rb
CHANGED
@@ -1,26 +1,3 @@
-# override some methods concered with entity resolving
-# to convert them to strings
-class BeautifulStoneSoup
-  # resolve unknown html entities using the htmlentities lib
-  alias :orig_unknown_entityref :unknown_entityref
-  def unknown_entityref(ref)
-    if HTMLEntities::MAP.has_key?(ref)
-      handle_data [HTMLEntities::MAP[ref]].pack('U')
-    else
-      orig_unknown_entityref ref
-    end
-  end
-
-  # resolve numeric entities to utf8
-  def handle_charref(ref)
-    handle_data( ref.gsub(/([0-9]{1,7})/) {
-      [$1.to_i].pack('U')
-    }.gsub(/x([0-9a-f]{1,6})/i) {
-      [$1.to_i(16)].pack('U')
-    } )
-  end
-end
-
 module RDig
 
   # Contains classes which are used for extracting content and meta data from
@@ -30,15 +7,6 @@ module RDig
     # process the given +content+ depending on it's +content_type+.
     def self.process(content, content_type)
       ContentExtractor.process(content, content_type)
-      # case content_type
-      #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-      #  return HtmlContentExtractor.process(content)
-      #when /^application\/.+pdf/
-      #  return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
-      #else
-      #  puts "unable to handle content type #{content_type}"
-      #end
-      #return nil
     end
 
     # Base class for Content Extractors.
@@ -48,7 +16,7 @@ module RDig
 
       def self.inherited(extractor)
        super(extractor)
-        puts("discovered content extractor class: #{extractor}")
+        puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
        self.extractors << extractor
      end
 
@@ -72,7 +40,7 @@ module RDig
      end
 
      def can_do(content_type)
-        content_type =~ @pattern
+        @pattern && content_type =~ @pattern
      end
    end
 
@@ -104,197 +72,14 @@ module RDig
      end
    end
 
-
-
-    # Requires the pdftotext and pdfinfo utilities from the
-    # xpdf-utils package
-    # (on debian and friends do 'apt-get install xpdf-utils')
-    #
-    class PdfContentExtractor < ContentExtractor
-      include ExternalAppHelper
-
-      def initialize(config)
-        super(config)
-        @pattern = /^application\/pdf/
-        @pdftotext = 'pdftotext'
-        @pdfinfo = 'pdfinfo'
-        @available = true
-        [ @pdftotext, @pdfinfo].each { |program|
-          unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
-            @available = false
-            break
-          end
-        }
-      end
-
-      def process(content)
-        result = {}
-        as_file(content) do |file|
-          result[:content] = get_content(file.path).strip
-          result[:title] = get_title(file.path)
-        end
-        result
-      end
-
-      def get_content(path_to_tempfile)
-        %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
-      end
-
-      # extracts the title from pdf meta data
-      # needs pdfinfo
-      # returns the title or nil if no title was found
-      def get_title(path_to_tempfile)
-        %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
-      rescue
-      end
-    end
-
-    # Extract text from word documents
-    #
-    # Requires the wvHtml utility
-    # (on debian and friends do 'apt-get install wv')
-    class WordContentExtractor < ContentExtractor
-      include ExternalAppHelper
-
-      def initialize(config)
-        super(config)
-        @wvhtml = 'wvHtml'
-        @pattern = /^application\/msword/
-        # html extractor for parsing wvHtml output
-        @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
-          :html => OpenStruct.new(
-            :content_tag_selector => lambda { |tagsoup|
-              tagsoup.html.body
-            },
-            :title_tag_selector => lambda { |tagsoup|
-              tagsoup.html.head.title
-            }
-        )))
-
-        # TODO: besser: if $?.exitstatus == 127 (not found)
-        @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
-      end
-
-      def process(content)
-        result = {}
-        as_file(content) do |file|
-          result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
-        end
-        return result || {}
-      end
-
-    end
-
-    # extracts title, content and links from html documents
-    class HtmlContentExtractor < ContentExtractor
-
-      def initialize(config)
-        super(config)
-        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-      end
-
-      # returns:
-      # { :content => 'extracted clear text',
-      #   :meta => { :title => 'Title' },
-      #   :links => [array of urls] }
-      def process(content)
-        result = { }
-        tag_soup = BeautifulSoup.new(content)
-        result[:title] = extract_title(tag_soup)
-        result[:links] = extract_links(tag_soup)
-        result[:content] = extract_content(tag_soup)
-        return result
-      end
-
-      # Extracts textual content from the HTML tree.
-      #
-      # - First, the root element to use is determined using the
-      # +content_element+ method, which itself uses the content_tag_selector
-      # from RDig.configuration.
-      # - Then, this element is processed by +extract_text+, which will give
-      # all textual content contained in the root element and all it's
-      # children.
-      def extract_content(tag_soup)
-        content = ''
-        ce = content_element(tag_soup)
-        ce.children { |child|
-          extract_text(child, content)
-        } unless ce.nil?
-        return content.strip
-      end
-
-      # extracts the href attributes of all a tags, except
-      # internal links like <a href="#top">
-      def extract_links(tagsoup)
-        tagsoup.find_all('a').map { |link|
-          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
-        }.compact
-      end
-
-      # Extracts the title from the given html tree
-      def extract_title(tagsoup)
-        the_title_tag = title_tag(tagsoup)
-        if the_title_tag.is_a? String
-          the_title_tag
-        else
-          title = ''
-          extract_text(the_title_tag, title)
-          title.strip
-        end
-      end
-
-      # Recursively extracts all text contained in the given element,
-      # and appends it to content.
-      def extract_text(element, content='')
-        return nil if element.nil?
-        if element.is_a? NavigableString
-          value = strip_comments(element)
-          value.strip!
-          unless value.empty?
-            content << value
-            content << ' '
-          end
-        elsif element.string # it's a Tag, and it has some content string
-          # skip inline scripts and styles
-          return nil if element.name =~ /^(script|style)$/i
-          value = element.string.strip
-          unless value.empty?
-            content << value
-            content << ' '
-          end
-        else
-          element.children { |child|
-            extract_text(child, content)
-          }
-        end
-      end
-
-      # Returns the element to extract the title from.
-      #
-      # This may return a string, e.g. an attribute value selected from a meta
-      # tag, too.
-      def title_tag(tagsoup)
-        if @config.html.title_tag_selector
-          @config.html.title_tag_selector.call(tagsoup)
-        else
-          tagsoup.html.head.title
-        end
-      end
-
-      # Retrieve the root element to extract document content from
-      def content_element(tagsoup)
-        if @config.html.content_tag_selector
-          @config.html.content_tag_selector.call(tagsoup)
-        else
-          tagsoup.html.body
-        end
-      end
-
-      # Return the given string minus all html comments
-      def strip_comments(string)
-        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
-      end
-    end
+  end
+end
 
+# load content extractors
+Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
+  begin
+    require f
+  rescue
+    puts "error loading #{f}: #{$!}"
   end
 end
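With this rewrite, extractor classes are no longer hard-wired: every file under rdig/content_extractors/ is required at load time, and ContentExtractor.inherited registers each subclass, with can_do matching content types against @pattern. A hedged sketch of a custom extractor that would be picked up by this mechanism (the file name, class name and MIME type are made up for illustration):

# lib/rdig/content_extractors/plain_text.rb -- hypothetical add-on file
module RDig
  module ContentExtractors
    # registered automatically by ContentExtractor.inherited when the
    # Dir[...] loop above requires this file
    class PlainTextContentExtractor < ContentExtractor
      def initialize(config)
        super(config)
        @pattern = /^text\/plain/   # can_do matches content types against this
      end

      def process(content)
        # extractors return a hash of extracted fields
        { :content => content, :title => nil, :links => [] }
      end
    end
  end
end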
data/lib/rdig/content_extractors/doc.rb
ADDED
@@ -0,0 +1,41 @@
+module RDig
+  module ContentExtractors
+
+    # Extract text from word documents
+    #
+    # Requires the wvHtml utility
+    # (on debian and friends do 'apt-get install wv')
+    class WordContentExtractor < ContentExtractor
+      include ExternalAppHelper
+
+      def initialize(config)
+        super(config)
+        @wvhtml = 'wvHtml'
+        @pattern = /^application\/msword/
+        # html extractor for parsing wvHtml output
+        @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
+          :rubyful_soup => OpenStruct.new(
+            :content_tag_selector => lambda { |tagsoup|
+              tagsoup.html.body
+            },
+            :title_tag_selector => lambda { |tagsoup|
+              tagsoup.html.head.title
+            }
+        )))
+
+        # TODO: better: if $?.exitstatus == 127 (not found)
+        @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
+      end
+
+      def process(content)
+        result = {}
+        as_file(content) do |file|
+          result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
+        end
+        return result || {}
+      end
+
+    end
+
+  end
+end
data/lib/rdig/content_extractors/hpricot.rb
ADDED
@@ -0,0 +1,99 @@
+begin
+  require 'hpricot'
+rescue LoadError
+  require 'rubygems'
+  require 'hpricot'
+end
+
+module RDig
+  module ContentExtractors
+
+    # extracts title, content and links from html documents using the hpricot library
+    class HpricotContentExtractor < ContentExtractor
+
+      def initialize(config)
+        super(config.hpricot)
+        # if not configured, refuse to handle any content:
+        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.hpricot
+      end
+
+      # returns:
+      # { :content => 'extracted clear text',
+      #   :title => 'Title',
+      #   :links => [array of urls] }
+      def process(content)
+        doc = Hpricot(content)
+        {
+          :title => extract_title(doc).decode_entities,
+          :links => extract_links(doc),
+          :content => extract_content(doc).decode_entities
+        }
+      end
+
+      # Extracts textual content from the HTML tree.
+      #
+      # - First, the root element to use is determined using the
+      # +content_element+ method, which itself uses the content_tag_selector
+      # from RDig.configuration.
+      # - Then, this element is processed by +extract_text+, which will give
+      # all textual content contained in the root element and all it's
+      # children.
+      def extract_content(doc)
+        content = ''
+        ce = content_element(doc)
+        content = strip_tags(strip_comments(ce.inner_html)) if ce
+        # (ce/'h1, h2, h3, h4, h5, h6, p, li, dt, dd, td, address, option, ').each do |child|
+        #   extract_text child, content
+        return content.strip
+      end
+
+      # extracts the href attributes of all a tags, except
+      # internal links like <a href="#top">
+      def extract_links(doc)
+        (doc/'a').map { |link|
+          href = link['href']
+          CGI.unescapeHTML(href) if href && href !~ /^#/
+        }.compact
+      end
+
+      # Extracts the title from the given html tree
+      def extract_title(doc)
+        the_title_tag = title_tag(doc)
+        return the_title_tag unless the_title_tag.respond_to? :inner_html
+        strip_tags(the_title_tag.inner_html)
+      end
+
+      # Returns the element to extract the title from.
+      #
+      # This may return a string, e.g. an attribute value selected from a meta
+      # tag, too.
+      def title_tag(doc)
+        tag_from_config(doc, :title_tag_selector) || doc.at('title')
+      end
+
+      # Retrieve the root element to extract document content from
+      def content_element(doc)
+        tag_from_config(doc, :content_tag_selector) || doc.at('body')
+      end
+
+      def tag_from_config(doc, config_key)
+        cfg = @config.send(config_key)
+        cfg.is_a?(String) ? doc/cfg : cfg.call(doc) if cfg
+      end
+
+      # Return the given string minus all html comments
+      def strip_comments(string)
+        string.gsub Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), ''
+      end
+      def strip_tags(string)
+        string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
+          Regexp::MULTILINE, 'u'), ''
+        string.gsub! Regexp.new('<.+?>',
+          Regexp::MULTILINE, 'u'), ''
+        string.gsub Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+      end
+
+    end
+
+  end
+end
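The new extractor can also be driven standalone, mirroring the hpricot unit test later in this diff. A sketch: the sample markup is made up, and the expected values are inferred from the extraction code above rather than quoted from the package.

require 'rdig'

config = OpenStruct.new(:hpricot => OpenStruct.new(
  :title_tag_selector   => 'title',
  :content_tag_selector => 'body'))
extractor = RDig::ContentExtractors::HpricotContentExtractor.new(config)

html = '<html><head><title>Hi</title></head><body><a href="/a.html">A</a> text</body></html>'
result = extractor.process(html)
result[:title]    # expected: "Hi"
result[:links]    # expected: ["/a.html"]
result[:content]  # expected: "A text"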
data/lib/rdig/content_extractors/pdf.rb
ADDED
@@ -0,0 +1,49 @@
+module RDig
+  module ContentExtractors
+    # Extract text from pdf content.
+    #
+    # Requires the pdftotext and pdfinfo utilities from the
+    # xpdf-utils package
+    # (on debian and friends do 'apt-get install xpdf-utils')
+    #
+    class PdfContentExtractor < ContentExtractor
+      include ExternalAppHelper
+
+      def initialize(config)
+        super(config)
+        @pattern = /^application\/pdf/
+        @pdftotext = 'pdftotext'
+        @pdfinfo = 'pdfinfo'
+        @available = true
+        [ @pdftotext, @pdfinfo].each { |program|
+          unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
+            @available = false
+            break
+          end
+        }
+      end
+
+      def process(content)
+        result = {}
+        as_file(content) do |file|
+          result[:content] = get_content(file.path).strip
+          result[:title] = get_title(file.path)
+        end
+        result
+      end
+
+      def get_content(path_to_tempfile)
+        %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
+      end
+
+      # extracts the title from pdf meta data
+      # needs pdfinfo
+      # returns the title or nil if no title was found
+      def get_title(path_to_tempfile)
+        %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
+      rescue
+      end
+    end
+
+  end
+end
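Both helpers are plain shell-outs, so the extractor's work per document can be reproduced directly; roughly (the path is a placeholder):

text  = %x{pdftotext -enc UTF-8 '/tmp/doc.pdf' -}   # page text on stdout
info  = %x{pdfinfo -enc UTF-8 '/tmp/doc.pdf'}       # meta data, including the Title: line
title = $1.strip if info =~ /title:\s+(.*)$/i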
data/lib/rdig/content_extractors/rubyful_soup.rb
ADDED
@@ -0,0 +1,147 @@
+begin
+  require 'rubyful_soup'
+rescue LoadError
+  require 'rubygems'
+  require 'rubyful_soup'
+end
+
+# override some methods concered with entity resolving
+# to convert them to strings
+class BeautifulStoneSoup
+  # resolve unknown html entities using the htmlentities lib
+  alias :orig_unknown_entityref :unknown_entityref
+  def unknown_entityref(ref)
+    if HTMLEntities::MAP.has_key?(ref)
+      handle_data [HTMLEntities::MAP[ref]].pack('U')
+    else
+      orig_unknown_entityref ref
+    end
+  end
+
+  # resolve numeric entities to utf8
+  def handle_charref(ref)
+    handle_data( ref.gsub(/([0-9]{1,7})/) {
+      [$1.to_i].pack('U')
+    }.gsub(/x([0-9a-f]{1,6})/i) {
+      [$1.to_i(16)].pack('U')
+    } )
+  end
+end
+
+module RDig
+  module ContentExtractors
+
+    # extracts title, content and links from html documents
+    class RubyfulSoupContentExtractor < ContentExtractor
+
+      def initialize(config)
+        super(config.rubyful_soup)
+        # if not configured, refuse to handle any content:
+        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.rubyful_soup
+      end
+
+      # returns:
+      # { :content => 'extracted clear text',
+      #   :meta => { :title => 'Title' },
+      #   :links => [array of urls] }
+      def process(content)
+        result = { }
+        tag_soup = BeautifulSoup.new(content)
+        result[:title] = extract_title(tag_soup)
+        result[:links] = extract_links(tag_soup)
+        result[:content] = extract_content(tag_soup)
+        return result
+      end
+
+      # Extracts textual content from the HTML tree.
+      #
+      # - First, the root element to use is determined using the
+      # +content_element+ method, which itself uses the content_tag_selector
+      # from RDig.configuration.
+      # - Then, this element is processed by +extract_text+, which will give
+      # all textual content contained in the root element and all it's
+      # children.
+      def extract_content(tag_soup)
+        content = ''
+        ce = content_element(tag_soup)
+        ce.children { |child|
+          extract_text(child, content)
+        } unless ce.nil?
+        return content.strip
+      end
+
+      # extracts the href attributes of all a tags, except
+      # internal links like <a href="#top">
+      def extract_links(tagsoup)
+        tagsoup.find_all('a').map { |link|
+          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
+        }.compact
+      end
+
+      # Extracts the title from the given html tree
+      def extract_title(tagsoup)
+        the_title_tag = title_tag(tagsoup)
+        if the_title_tag.is_a? String
+          the_title_tag
+        else
+          title = ''
+          extract_text(the_title_tag, title)
+          title.strip
+        end
+      end
+
+      # Recursively extracts all text contained in the given element,
+      # and appends it to content.
+      def extract_text(element, content='')
+        return nil if element.nil?
+        if element.is_a? NavigableString
+          value = strip_comments(element)
+          value.strip!
+          unless value.empty?
+            content << value
+            content << ' '
+          end
+        elsif element.string # it's a Tag, and it has some content string
+          # skip inline scripts and styles
+          return nil if element.name =~ /^(script|style)$/i
+          value = element.string.strip
+          unless value.empty?
+            content << value
+            content << ' '
+          end
+        else
+          element.children { |child|
+            extract_text(child, content)
+          }
+        end
+      end
+
+      # Returns the element to extract the title from.
+      #
+      # This may return a string, e.g. an attribute value selected from a meta
+      # tag, too.
+      def title_tag(tagsoup)
+        if @config.title_tag_selector
+          @config.title_tag_selector.call(tagsoup)
+        else
+          tagsoup.html.head.title
+        end
+      end
+
+      # Retrieve the root element to extract document content from
+      def content_element(tagsoup)
+        if @config.content_tag_selector
+          @config.content_tag_selector.call(tagsoup)
+        else
+          tagsoup.html.body
+        end
+      end
+
+      # Return the given string minus all html comments
+      def strip_comments(string)
+        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
+      end
+    end
+
+  end
+end
data/lib/rdig/search.rb
CHANGED
@@ -12,7 +12,7 @@ module RDig
     # takes the ferret section of the rdig configuration as a parameter.
     def initialize(settings)
       @ferret_config = settings
-      @query_parser = Ferret::QueryParser.new(
+      @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
       ferret_searcher
     end
 
@@ -24,8 +24,8 @@ module RDig
         @ferret_searcher = nil
       end
       unless @ferret_searcher
-        @ferret_searcher =
-        @query_parser.fields = @ferret_searcher.reader.
+        @ferret_searcher = Ferret::Search::Searcher.new(@ferret_config.path)
+        @query_parser.fields = @ferret_searcher.reader.field_names.to_a
       end
       @ferret_searcher
     end
@@ -36,23 +36,23 @@ module RDig
     # for more information on queries.
     # A Ferret::Search::Query instance may be given, too.
     #
-    #
-    #
-    #
+    # Some of the more often used otions are:
+    # offset:: first document in result list to retrieve (0-based). The default is 0.
+    # limit:: number of documents to retrieve. The default is 10.
+    # Please see the Ferret::Search::Searcher API for more options.
     def search(query, options={})
       result = {}
       query = query_parser.parse(query) if query.is_a?(String)
       puts "Query: #{query}"
-      hits = ferret_searcher.search(query, options)
-      result[:hitcount] = hits.total_hits
       results = []
-
-
+      searcher = ferret_searcher
+      result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
+        doc = searcher[doc_id]
        results << { :score => score,
-
-
-
-
+                     :title => doc[:title],
+                     :url => doc[:url],
+                     :extract => build_extract(doc[:data]) }
+      end
       result[:list] = results
       result
     end
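Putting the reworked searcher to use looks roughly like this sketch; the index path and query are placeholders, while the :offset/:limit options and the result keys come from the code above and from searcher_test.rb further down.

require 'rdig'

RDig.configuration { |cfg| cfg.index.path = '/path/to/index/' }  # placeholder path

result = RDig.searcher.search('ferret', :offset => 0, :limit => 10)
puts "#{result[:hitcount]} hits"
result[:list].each do |hit|
  puts "#{hit[:score]} #{hit[:url]} #{hit[:title]}"
end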
data/rakefile
CHANGED
@@ -125,12 +125,16 @@ else
     to help building a site search for web sites or intranets. Internally,
     Ferret is used for the full text indexing. After creating a config file
     for your site, the index can be built with a single call to rdig.
+    For HTML page crawling, hpricot and rubyful_soup are supported.
   EOF
 
   #### Dependencies and requirements.
 
   s.add_dependency('ferret', '>= 0.10.0')
-  s.add_dependency('rubyful_soup', '>= 1.0.4')
+  # TODO: check if there is anything like 'suggested' instead of required, or
+  # ORed dependencies...
+  #s.add_dependency('rubyful_soup', '>= 1.0.4')
+  #s.add_dependency('hpricot', '>= 0.4')
   #s.requirements << ""
 
   #### Which files are to be included in this gem? Everything! (Except CVS directories.)
data/test/unit/hpricot_content_extractor_test.rb
ADDED
@@ -0,0 +1,77 @@
+require 'test_helper'
+class HpricotContentExtractorTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @config = RDig.config.content_extraction.hpricot.clone
+    @extractor = ContentExtractors::HpricotContentExtractor.new(OpenStruct.new(:hpricot => @config))
+    @nbsp = [160].pack('U') # non breaking space
+  end
+
+  def test_can_do
+    assert !@extractor.can_do('application/pdf')
+    assert !@extractor.can_do('application/msword')
+    assert @extractor.can_do('text/html')
+    assert @extractor.can_do('text/xml')
+    assert @extractor.can_do('application/xml')
+    assert @extractor.can_do('application/xhtml+xml')
+  end
+
+  def test_simple
+    result = ContentExtractors.process(html_doc('simple'), 'text/html')
+    assert_not_nil result
+    assert_equal 'Sample Title', result[:title]
+    assert_not_nil result[:content]
+    assert_not_nil result[:links]
+    assert_equal 1, result[:links].size
+    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
+    assert_equal 'http://test.host/affe.html', result[:links].first
+  end
+
+  def test_entities
+    result = @extractor.process(html_doc('entities'))
+    assert_equal 'Sample & Title', result[:title]
+    assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
+    assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
+    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
+  end
+
+  def test_custom_content_element
+    @config.title_tag_selector = lambda do |doc|
+      doc.at("h1[@class='title']")
+    end
+    @config.content_tag_selector = lambda do |doc|
+      doc.at("div[@id='content']")
+    end
+    result = @extractor.process(html_doc('custom_tag_selectors'))
+    assert_equal 'Sample Title in h1', result[:title]
+    assert_equal 'Affe Real content is here.', result[:content]
+    # check if links are collected outside the content tag, too:
+    assert_equal 3, result[:links].size
+    assert_equal 'http://test.host/outside.html', result[:links].first
+    assert_equal '/inside.html', result[:links][1]
+    assert_equal '/footer.html', result[:links][2]
+  end
+
+
+  def test_title_from_dcmeta
+    @config.title_tag_selector = lambda do |doc|
+      doc.at("meta[@name='DC.title']")['content']
+    end
+    result = @extractor.process(html_doc('custom_tag_selectors'))
+    assert_equal 'Title from DC meta data', result[:title]
+  end
+
+  def test_preprocessed_title
+    @config.title_tag_selector = lambda do |doc|
+      title = doc.at("meta[@name='DC.title']")['content']
+      # use only a portion of the title tag's contents if it matches our
+      # regexp:
+      (title =~ /^(.*)meta data$/ ? $1 : title).strip
+    end
+    result = @extractor.process(html_doc('custom_tag_selectors'))
+    assert_equal 'Title from DC', result[:title]
+  end
+
+end
+
data/test/unit/{html_content_extractor_test.rb → rubyful_soup_content_extractor_test.rb}
RENAMED
@@ -1,10 +1,16 @@
 require 'test_helper'
-class HtmlContentExtractorTest < Test::Unit::TestCase
+class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
   include TestHelper
 
   def setup
-    @config =
-
+    @config = OpenStruct.new(
+      :content_tag_selector => lambda { |tagsoup|
+        tagsoup.html.body
+      },
+      :title_tag_selector => lambda { |tagsoup|
+        tagsoup.html.head.title
+      })
+    @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
     @nbsp = [160].pack('U') # non breaking space
   end
 
@@ -37,10 +43,10 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   end
 
   def test_custom_content_element
-    @config.
+    @config.title_tag_selector = lambda do |tagsoup|
       tagsoup.find('h1', :attrs => { 'class', 'title' })
     end
-    @config.
+    @config.content_tag_selector = lambda do |tagsoup|
       tagsoup.find('div', :attrs => { 'id', 'content' })
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -55,7 +61,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
 
 
   def test_title_from_dcmeta
-    @config.
+    @config.title_tag_selector = lambda do |tagsoup|
       tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -63,7 +69,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   end
 
   def test_preprocessed_title
-    @config.
+    @config.title_tag_selector = lambda do |tagsoup|
       title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
       # use only a portion of the title tag's contents if it matches our
       # regexp:
data/test/unit/searcher_test.rb
ADDED
@@ -0,0 +1,35 @@
+require 'test_helper'
+class SearcherTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
+    index_dir = 'tmp/test-index'
+    Dir.mkdir index_dir unless File.directory? index_dir
+    RDig.configuration do |cfg|
+      @old_crawler_cfg = cfg.crawler.clone
+      cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
+      cfg.crawler.num_threads = 1
+      cfg.crawler.wait_before_leave = 1
+      cfg.index.path = index_dir
+      cfg.verbose = true
+    end
+    crawler = Crawler.new
+    crawler.run
+  end
+
+  def teardown
+    RDig.configuration do |cfg|
+      cfg.crawler = @old_crawler_cfg
+    end
+  end
+
+  def test_search
+    result = RDig.searcher.search 'some sample text'
+    assert_equal 3, result[:hitcount]
+    assert_equal 3, result[:list].size
+  end
+
+end
+
+
metadata
CHANGED
@@ -3,15 +3,15 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.3.2
-date: 2006-10-
+  version: 0.3.3
+date: 2006-10-23 00:00:00 +02:00
 summary: Ruby based web site indexing and searching library.
 require_paths:
 - lib
 email: jk@jkraemer.net
 homepage: http://rdig.rubyforge.org/
 rubyforge_project: rdig
-description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig.
+description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
 autorequire:
 default_executable: rdig
 bindir: bin
@@ -33,6 +33,7 @@ files:
 - lib/rdig
 - lib/htmlentities
 - lib/rdig.rb
+- lib/rdig/content_extractors
 - lib/rdig/crawler.rb
 - lib/rdig/search.rb
 - lib/rdig/highlight.rb
@@ -41,6 +42,10 @@ files:
 - lib/rdig/content_extractors.rb
 - lib/rdig/documents.rb
 - lib/rdig/file.rb
+- lib/rdig/content_extractors/rubyful_soup.rb
+- lib/rdig/content_extractors/doc.rb
+- lib/rdig/content_extractors/hpricot.rb
+- lib/rdig/content_extractors/pdf.rb
 - lib/htmlentities/CHANGES
 - lib/htmlentities/COPYING
 - lib/htmlentities/README
@@ -50,8 +55,10 @@ files:
 - test/test_helper.rb
 - test/unit/etag_filter_test.rb
 - test/unit/url_filters_test.rb
-- test/unit/html_content_extractor_test.rb
+- test/unit/searcher_test.rb
+- test/unit/rubyful_soup_content_extractor_test.rb
 - test/unit/pdf_content_extractor_test.rb
+- test/unit/hpricot_content_extractor_test.rb
 - test/unit/word_content_extractor_test.rb
 - test/unit/file_document_test.rb
 - test/unit/crawler_fs_test.rb
@@ -100,12 +107,3 @@ dependencies:
       - !ruby/object:Gem::Version
        version: 0.10.0
  version:
-- !ruby/object:Gem::Dependency
-  name: rubyful_soup
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 1.0.4
-  version: