rdig 0.3.2 → 0.3.3
- data/doc/examples/config.rb +33 -0
- data/lib/rdig.rb +35 -23
- data/lib/rdig/content_extractors.rb +10 -225
- data/lib/rdig/content_extractors/doc.rb +41 -0
- data/lib/rdig/content_extractors/hpricot.rb +99 -0
- data/lib/rdig/content_extractors/pdf.rb +49 -0
- data/lib/rdig/content_extractors/rubyful_soup.rb +147 -0
- data/lib/rdig/search.rb +14 -14
- data/rakefile +5 -1
- data/test/unit/hpricot_content_extractor_test.rb +77 -0
- data/test/unit/{html_content_extractor_test.rb → rubyful_soup_content_extractor_test.rb} +13 -7
- data/test/unit/searcher_test.rb +35 -0
- metadata +11 -13
data/doc/examples/config.rb
CHANGED
@@ -27,7 +27,40 @@ RDig.configuration do |cfg|
   # cfg.verbose = false

   # content extraction options
+  cfg.content_extraction = OpenStruct.new(

+    # HPRICOT configuration
+    # this is the html parser used by default from RDig 0.3.3 upwards.
+    # Hpricot by far outperforms Rubyful Soup, and is at least as flexible when
+    # it comes to selection of portions of the html documents.
+    :hpricot => OpenStruct.new(
+      # css selector for the element containing the page title
+      :title_tag_selector => 'title',
+      # might also be a proc returning either an element or a string:
+      # :title_tag_selector => lambda { |hpricot_doc| ... }
+      :content_tag_selector => 'body'
+      # might also be a proc returning either an element or a string:
+      # :content_tag_selector => lambda { |hpricot_doc| ... }
+    )
+
+    # RUBYFUL SOUP
+    # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
+    # RDig's default html parser up to version 0.3.2. To use it, comment the
+    # hpricot config above, and uncomment the following:
+    #
+    # :rubyful_soup => OpenStruct.new(
+    #   # select the html element that contains the content to index
+    #   # by default, we index all inside the body tag:
+    #   :content_tag_selector => lambda { |tagsoup|
+    #     tagsoup.html.body
+    #   },
+    #   # select the html element containing the title
+    #   :title_tag_selector => lambda { |tagsoup|
+    #     tagsoup.html.head.title
+    #   }
+    # )
+  )
+
   # provide a method that returns the title of an html document
   # this method may either return a tag to extract the title from,
   # or a ready-to-index string.
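Pulled out of the diff, the new defaults amount to a configuration along the lines of the sketch below. The start URL, the index path and the DC.title proc are illustrative assumptions, not part of the shipped example; the proc shows the "proc returning either an element or a string" variant mentioned in the comments above.

require 'rdig'

RDig.configuration do |cfg|
  cfg.crawler.start_urls = [ 'http://www.example.com/' ]   # assumed site
  cfg.index.path         = '/tmp/rdig-index'               # assumed index location

  cfg.content_extraction = OpenStruct.new(
    :hpricot => OpenStruct.new(
      # hypothetical proc selector: prefer a DC.title meta tag, fall back to <title>
      :title_tag_selector => lambda { |doc|
        meta = doc.at("meta[@name='DC.title']")
        meta ? meta['content'] : doc.at('title')
      },
      # plain css selector string, as in the shipped example
      :content_tag_selector => 'body'
    )
  )
end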
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
 #++
 #

-RDIGVERSION = '0.3.2'
+RDIGVERSION = '0.3.3'


 require 'thread'
@@ -42,22 +42,12 @@ require 'open-uri'

 begin
   require 'ferret'
-  require 'rubyful_soup'
 rescue LoadError
   require 'rubygems'
   require 'ferret'
-  require 'rubyful_soup'
 end

 require 'htmlentities/htmlentities'
-
-require 'rdig/content_extractors'
-require 'rdig/url_filters'
-require 'rdig/search'
-require 'rdig/index'
-require 'rdig/file'
-require 'rdig/documents'
-require 'rdig/crawler'


 $KCODE = 'u'
@@ -124,25 +114,37 @@ module RDig
        :wait_before_leave => 10
      ),
      :content_extraction => OpenStruct.new(
-       # settings for html content extraction
-       :html => OpenStruct.new(
-         # select the html element that contains the content to index
-         # by default, we index all inside the body tag:
-         :content_tag_selector => lambda { |tagsoup|
-           tagsoup.html.body
-         },
-         # select the html element containing the title
-         :title_tag_selector => lambda { |tagsoup|
-           tagsoup.html.head.title
-         }
+       # settings for html content extraction (hpricot)
+       :hpricot => OpenStruct.new(
+         # css selector for the element containing the page title
+         :title_tag_selector => 'title',
+         # might also be a proc returning either an element or a string:
+         # :title_tag_selector => lambda { |hpricot_doc| ... }
+         :content_tag_selector => 'body'
+         # might also be a proc returning either an element or a string:
+         # :content_tag_selector => lambda { |hpricot_doc| ... }
        )
+       #,
+       # # settings for html content extraction (RubyfulSoup)
+       # :rubyful_soup => OpenStruct.new(
+       #   # select the html element that contains the content to index
+       #   # by default, we index all inside the body tag:
+       #   :content_tag_selector => lambda { |tagsoup|
+       #     tagsoup.html.body
+       #   },
+       #   # select the html element containing the title
+       #   :title_tag_selector => lambda { |tagsoup|
+       #     tagsoup.html.head.title
+       #   }
+       # )
      ),
      :index => OpenStruct.new(
        :path => "index/",
        :create => true,
        :handle_parse_errors => true,
        :analyzer => Ferret::Analysis::StandardAnalyzer.new,
-       :occur_default => :must
+       :occur_default => :must,
+       :default_field => '*'
      )
    )
  end
@@ -261,3 +263,13 @@ module RDig
    end
  end
 end
+
+require 'rdig/content_extractors'
+require 'rdig/url_filters'
+require 'rdig/search'
+require 'rdig/index'
+require 'rdig/file'
+require 'rdig/documents'
+require 'rdig/crawler'
+
+
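Because the built-in defaults now configure only :hpricot, a site that wants to keep using RubyfulSoup under 0.3.3 has to replace the content_extraction struct in its own config file. A minimal sketch, mirroring the commented-out block in doc/examples/config.rb:

RDig.configuration do |cfg|
  # swap the hpricot defaults for the RubyfulSoup selectors that were the
  # defaults up to 0.3.2
  cfg.content_extraction = OpenStruct.new(
    :rubyful_soup => OpenStruct.new(
      :content_tag_selector => lambda { |tagsoup| tagsoup.html.body },
      :title_tag_selector   => lambda { |tagsoup| tagsoup.html.head.title }
    )
  )
end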
data/lib/rdig/content_extractors.rb
CHANGED
@@ -1,26 +1,3 @@
-# override some methods concered with entity resolving
-# to convert them to strings
-class BeautifulStoneSoup
-  # resolve unknown html entities using the htmlentities lib
-  alias :orig_unknown_entityref :unknown_entityref
-  def unknown_entityref(ref)
-    if HTMLEntities::MAP.has_key?(ref)
-      handle_data [HTMLEntities::MAP[ref]].pack('U')
-    else
-      orig_unknown_entityref ref
-    end
-  end
-
-  # resolve numeric entities to utf8
-  def handle_charref(ref)
-    handle_data( ref.gsub(/([0-9]{1,7})/) {
-      [$1.to_i].pack('U')
-    }.gsub(/x([0-9a-f]{1,6})/i) {
-      [$1.to_i(16)].pack('U')
-    } )
-  end
-end
-
 module RDig

   # Contains classes which are used for extracting content and meta data from
@@ -30,15 +7,6 @@ module RDig
     # process the given +content+ depending on it's +content_type+.
     def self.process(content, content_type)
       ContentExtractor.process(content, content_type)
-      # case content_type
-      #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-      #  return HtmlContentExtractor.process(content)
-      #when /^application\/.+pdf/
-      #  return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
-      #else
-      #  puts "unable to handle content type #{content_type}"
-      #end
-      #return nil
     end

     # Base class for Content Extractors.
@@ -48,7 +16,7 @@ module RDig

       def self.inherited(extractor)
         super(extractor)
-        puts("discovered content extractor class: #{extractor}")
+        puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
         self.extractors << extractor
       end

@@ -72,7 +40,7 @@ module RDig
       end

       def can_do(content_type)
-        content_type =~ @pattern
+        @pattern && content_type =~ @pattern
       end
     end

@@ -104,197 +72,14 @@ module RDig
       end
     end

-
-
-    # Requires the pdftotext and pdfinfo utilities from the
-    # xpdf-utils package
-    # (on debian and friends do 'apt-get install xpdf-utils')
-    #
-    class PdfContentExtractor < ContentExtractor
-      include ExternalAppHelper
-
-      def initialize(config)
-        super(config)
-        @pattern = /^application\/pdf/
-        @pdftotext = 'pdftotext'
-        @pdfinfo = 'pdfinfo'
-        @available = true
-        [ @pdftotext, @pdfinfo].each { |program|
-          unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
-            @available = false
-            break
-          end
-        }
-      end
-
-      def process(content)
-        result = {}
-        as_file(content) do |file|
-          result[:content] = get_content(file.path).strip
-          result[:title] = get_title(file.path)
-        end
-        result
-      end
-
-      def get_content(path_to_tempfile)
-        %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
-      end
-
-      # extracts the title from pdf meta data
-      # needs pdfinfo
-      # returns the title or nil if no title was found
-      def get_title(path_to_tempfile)
-        %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
-      rescue
-      end
-    end
-
-    # Extract text from word documents
-    #
-    # Requires the wvHtml utility
-    # (on debian and friends do 'apt-get install wv')
-    class WordContentExtractor < ContentExtractor
-      include ExternalAppHelper
-
-      def initialize(config)
-        super(config)
-        @wvhtml = 'wvHtml'
-        @pattern = /^application\/msword/
-        # html extractor for parsing wvHtml output
-        @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
-          :html => OpenStruct.new(
-            :content_tag_selector => lambda { |tagsoup|
-              tagsoup.html.body
-            },
-            :title_tag_selector => lambda { |tagsoup|
-              tagsoup.html.head.title
-            }
-        )))
-
-        # TODO: besser: if $?.exitstatus == 127 (not found)
-        @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
-      end
-
-      def process(content)
-        result = {}
-        as_file(content) do |file|
-          result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
-        end
-        return result || {}
-      end
-
-    end
-
-    # extracts title, content and links from html documents
-    class HtmlContentExtractor < ContentExtractor
-
-      def initialize(config)
-        super(config)
-        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-      end
-
-      # returns:
-      # { :content => 'extracted clear text',
-      #   :meta => { :title => 'Title' },
-      #   :links => [array of urls] }
-      def process(content)
-        result = { }
-        tag_soup = BeautifulSoup.new(content)
-        result[:title] = extract_title(tag_soup)
-        result[:links] = extract_links(tag_soup)
-        result[:content] = extract_content(tag_soup)
-        return result
-      end
-
-      # Extracts textual content from the HTML tree.
-      #
-      # - First, the root element to use is determined using the
-      #   +content_element+ method, which itself uses the content_tag_selector
-      #   from RDig.configuration.
-      # - Then, this element is processed by +extract_text+, which will give
-      #   all textual content contained in the root element and all it's
-      #   children.
-      def extract_content(tag_soup)
-        content = ''
-        ce = content_element(tag_soup)
-        ce.children { |child|
-          extract_text(child, content)
-        } unless ce.nil?
-        return content.strip
-      end
-
-      # extracts the href attributes of all a tags, except
-      # internal links like <a href="#top">
-      def extract_links(tagsoup)
-        tagsoup.find_all('a').map { |link|
-          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
-        }.compact
-      end
-
-      # Extracts the title from the given html tree
-      def extract_title(tagsoup)
-        the_title_tag = title_tag(tagsoup)
-        if the_title_tag.is_a? String
-          the_title_tag
-        else
-          title = ''
-          extract_text(the_title_tag, title)
-          title.strip
-        end
-      end
-
-      # Recursively extracts all text contained in the given element,
-      # and appends it to content.
-      def extract_text(element, content='')
-        return nil if element.nil?
-        if element.is_a? NavigableString
-          value = strip_comments(element)
-          value.strip!
-          unless value.empty?
-            content << value
-            content << ' '
-          end
-        elsif element.string # it's a Tag, and it has some content string
-          # skip inline scripts and styles
-          return nil if element.name =~ /^(script|style)$/i
-          value = element.string.strip
-          unless value.empty?
-            content << value
-            content << ' '
-          end
-        else
-          element.children { |child|
-            extract_text(child, content)
-          }
-        end
-      end
-
-      # Returns the element to extract the title from.
-      #
-      # This may return a string, e.g. an attribute value selected from a meta
-      # tag, too.
-      def title_tag(tagsoup)
-        if @config.html.title_tag_selector
-          @config.html.title_tag_selector.call(tagsoup)
-        else
-          tagsoup.html.head.title
-        end
-      end
-
-      # Retrieve the root element to extract document content from
-      def content_element(tagsoup)
-        if @config.html.content_tag_selector
-          @config.html.content_tag_selector.call(tagsoup)
-        else
-          tagsoup.html.body
-        end
-      end
-
-      # Return the given string minus all html comments
-      def strip_comments(string)
-        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
-      end
-    end
+  end
+end

+# load content extractors
+Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
+  begin
+    require f
+  rescue
+    puts "error loading #{f}: #{$!}"
   end
 end
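Since extractors are now registered through ContentExtractor.inherited and every file under lib/rdig/content_extractors/ is required automatically, supporting another format boils down to dropping in one subclass. A sketch of what such a file could look like; the plain-text extractor is hypothetical and not part of the gem:

# hypothetical lib/rdig/content_extractors/plain_text.rb
module RDig
  module ContentExtractors

    # Handles text/plain content. The subclass is picked up automatically
    # because ContentExtractor.inherited adds it to the extractor registry.
    class PlainTextContentExtractor < ContentExtractor
      def initialize(config)
        super(config)
        # can_do matches incoming content types against @pattern
        @pattern = /^text\/plain/
      end

      def process(content)
        { :content => content.strip, :title => nil, :links => [] }
      end
    end

  end
end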
data/lib/rdig/content_extractors/doc.rb
ADDED
@@ -0,0 +1,41 @@
+module RDig
+  module ContentExtractors
+
+    # Extract text from word documents
+    #
+    # Requires the wvHtml utility
+    # (on debian and friends do 'apt-get install wv')
+    class WordContentExtractor < ContentExtractor
+      include ExternalAppHelper
+
+      def initialize(config)
+        super(config)
+        @wvhtml = 'wvHtml'
+        @pattern = /^application\/msword/
+        # html extractor for parsing wvHtml output
+        @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
+          :rubyful_soup => OpenStruct.new(
+            :content_tag_selector => lambda { |tagsoup|
+              tagsoup.html.body
+            },
+            :title_tag_selector => lambda { |tagsoup|
+              tagsoup.html.head.title
+            }
+        )))
+
+        # TODO: better: if $?.exitstatus == 127 (not found)
+        @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
+      end
+
+      def process(content)
+        result = {}
+        as_file(content) do |file|
+          result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
+        end
+        return result || {}
+      end
+
+    end
+
+  end
+end
data/lib/rdig/content_extractors/hpricot.rb
ADDED
@@ -0,0 +1,99 @@
+begin
+  require 'hpricot'
+rescue LoadError
+  require 'rubygems'
+  require 'hpricot'
+end
+
+module RDig
+  module ContentExtractors
+
+    # extracts title, content and links from html documents using the hpricot library
+    class HpricotContentExtractor < ContentExtractor
+
+      def initialize(config)
+        super(config.hpricot)
+        # if not configured, refuse to handle any content:
+        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.hpricot
+      end
+
+      # returns:
+      # { :content => 'extracted clear text',
+      #   :title => 'Title',
+      #   :links => [array of urls] }
+      def process(content)
+        doc = Hpricot(content)
+        {
+          :title => extract_title(doc).decode_entities,
+          :links => extract_links(doc),
+          :content => extract_content(doc).decode_entities
+        }
+      end
+
+      # Extracts textual content from the HTML tree.
+      #
+      # - First, the root element to use is determined using the
+      #   +content_element+ method, which itself uses the content_tag_selector
+      #   from RDig.configuration.
+      # - Then, this element is processed by +extract_text+, which will give
+      #   all textual content contained in the root element and all it's
+      #   children.
+      def extract_content(doc)
+        content = ''
+        ce = content_element(doc)
+        content = strip_tags(strip_comments(ce.inner_html)) if ce
+        # (ce/'h1, h2, h3, h4, h5, h6, p, li, dt, dd, td, address, option, ').each do |child|
+        #   extract_text child, content
+        return content.strip
+      end
+
+      # extracts the href attributes of all a tags, except
+      # internal links like <a href="#top">
+      def extract_links(doc)
+        (doc/'a').map { |link|
+          href = link['href']
+          CGI.unescapeHTML(href) if href && href !~ /^#/
+        }.compact
+      end
+
+      # Extracts the title from the given html tree
+      def extract_title(doc)
+        the_title_tag = title_tag(doc)
+        return the_title_tag unless the_title_tag.respond_to? :inner_html
+        strip_tags(the_title_tag.inner_html)
+      end
+
+      # Returns the element to extract the title from.
+      #
+      # This may return a string, e.g. an attribute value selected from a meta
+      # tag, too.
+      def title_tag(doc)
+        tag_from_config(doc, :title_tag_selector) || doc.at('title')
+      end
+
+      # Retrieve the root element to extract document content from
+      def content_element(doc)
+        tag_from_config(doc, :content_tag_selector) || doc.at('body')
+      end
+
+      def tag_from_config(doc, config_key)
+        cfg = @config.send(config_key)
+        cfg.is_a?(String) ? doc/cfg : cfg.call(doc) if cfg
+      end
+
+      # Return the given string minus all html comments
+      def strip_comments(string)
+        string.gsub Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), ''
+      end
+      def strip_tags(string)
+        string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
+                                Regexp::MULTILINE, 'u'), ''
+        string.gsub! Regexp.new('<.+?>',
+                                Regexp::MULTILINE, 'u'), ''
+        string.gsub Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+      end
+
+    end
+
+  end
+end
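Outside of a crawl the new extractor can be driven directly, much like the unit test does. A small sketch with a made-up HTML string; the expected values in the trailing comments follow from the strip_tags/decode_entities logic above:

require 'rdig'

config    = OpenStruct.new(:hpricot => OpenStruct.new(
              :title_tag_selector   => 'title',
              :content_tag_selector => 'body'))
extractor = RDig::ContentExtractors::HpricotContentExtractor.new(config)

html   = "<html><head><title>Hello</title></head>" \
         "<body><p>Some <a href='/next.html'>text</a></p></body></html>"
result = extractor.process(html)

result[:title]    # => "Hello"
result[:content]  # => "Some text"
result[:links]    # => ["/next.html"]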
data/lib/rdig/content_extractors/pdf.rb
ADDED
@@ -0,0 +1,49 @@
+module RDig
+  module ContentExtractors
+    # Extract text from pdf content.
+    #
+    # Requires the pdftotext and pdfinfo utilities from the
+    # xpdf-utils package
+    # (on debian and friends do 'apt-get install xpdf-utils')
+    #
+    class PdfContentExtractor < ContentExtractor
+      include ExternalAppHelper
+
+      def initialize(config)
+        super(config)
+        @pattern = /^application\/pdf/
+        @pdftotext = 'pdftotext'
+        @pdfinfo = 'pdfinfo'
+        @available = true
+        [ @pdftotext, @pdfinfo].each { |program|
+          unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
+            @available = false
+            break
+          end
+        }
+      end
+
+      def process(content)
+        result = {}
+        as_file(content) do |file|
+          result[:content] = get_content(file.path).strip
+          result[:title] = get_title(file.path)
+        end
+        result
+      end
+
+      def get_content(path_to_tempfile)
+        %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
+      end
+
+      # extracts the title from pdf meta data
+      # needs pdfinfo
+      # returns the title or nil if no title was found
+      def get_title(path_to_tempfile)
+        %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
+      rescue
+      end
+    end
+
+  end
+end
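PDF handling still depends on the external pdftotext and pdfinfo binaries; when they are missing the extractor marks itself unavailable and is skipped. Dispatch by content type goes through ContentExtractors.process, roughly as sketched here (a local PDF file and installed xpdf-utils are assumed):

require 'rdig'

pdf_data = File.read('sample.pdf')   # assumed local file
result   = RDig::ContentExtractors.process(pdf_data, 'application/pdf')
if result
  puts result[:title]     # from pdfinfo meta data, may be nil
  puts result[:content]   # plain text produced by pdftotext
end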
data/lib/rdig/content_extractors/rubyful_soup.rb
ADDED
@@ -0,0 +1,147 @@
+begin
+  require 'rubyful_soup'
+rescue LoadError
+  require 'rubygems'
+  require 'rubyful_soup'
+end
+
+# override some methods concered with entity resolving
+# to convert them to strings
+class BeautifulStoneSoup
+  # resolve unknown html entities using the htmlentities lib
+  alias :orig_unknown_entityref :unknown_entityref
+  def unknown_entityref(ref)
+    if HTMLEntities::MAP.has_key?(ref)
+      handle_data [HTMLEntities::MAP[ref]].pack('U')
+    else
+      orig_unknown_entityref ref
+    end
+  end
+
+  # resolve numeric entities to utf8
+  def handle_charref(ref)
+    handle_data( ref.gsub(/([0-9]{1,7})/) {
+      [$1.to_i].pack('U')
+    }.gsub(/x([0-9a-f]{1,6})/i) {
+      [$1.to_i(16)].pack('U')
+    } )
+  end
+end
+
+module RDig
+  module ContentExtractors
+
+    # extracts title, content and links from html documents
+    class RubyfulSoupContentExtractor < ContentExtractor
+
+      def initialize(config)
+        super(config.rubyful_soup)
+        # if not configured, refuse to handle any content:
+        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.rubyful_soup
+      end
+
+      # returns:
+      # { :content => 'extracted clear text',
+      #   :meta => { :title => 'Title' },
+      #   :links => [array of urls] }
+      def process(content)
+        result = { }
+        tag_soup = BeautifulSoup.new(content)
+        result[:title] = extract_title(tag_soup)
+        result[:links] = extract_links(tag_soup)
+        result[:content] = extract_content(tag_soup)
+        return result
+      end
+
+      # Extracts textual content from the HTML tree.
+      #
+      # - First, the root element to use is determined using the
+      #   +content_element+ method, which itself uses the content_tag_selector
+      #   from RDig.configuration.
+      # - Then, this element is processed by +extract_text+, which will give
+      #   all textual content contained in the root element and all it's
+      #   children.
+      def extract_content(tag_soup)
+        content = ''
+        ce = content_element(tag_soup)
+        ce.children { |child|
+          extract_text(child, content)
+        } unless ce.nil?
+        return content.strip
+      end
+
+      # extracts the href attributes of all a tags, except
+      # internal links like <a href="#top">
+      def extract_links(tagsoup)
+        tagsoup.find_all('a').map { |link|
+          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
+        }.compact
+      end
+
+      # Extracts the title from the given html tree
+      def extract_title(tagsoup)
+        the_title_tag = title_tag(tagsoup)
+        if the_title_tag.is_a? String
+          the_title_tag
+        else
+          title = ''
+          extract_text(the_title_tag, title)
+          title.strip
+        end
+      end
+
+      # Recursively extracts all text contained in the given element,
+      # and appends it to content.
+      def extract_text(element, content='')
+        return nil if element.nil?
+        if element.is_a? NavigableString
+          value = strip_comments(element)
+          value.strip!
+          unless value.empty?
+            content << value
+            content << ' '
+          end
+        elsif element.string # it's a Tag, and it has some content string
+          # skip inline scripts and styles
+          return nil if element.name =~ /^(script|style)$/i
+          value = element.string.strip
+          unless value.empty?
+            content << value
+            content << ' '
+          end
+        else
+          element.children { |child|
+            extract_text(child, content)
+          }
+        end
+      end
+
+      # Returns the element to extract the title from.
+      #
+      # This may return a string, e.g. an attribute value selected from a meta
+      # tag, too.
+      def title_tag(tagsoup)
+        if @config.title_tag_selector
+          @config.title_tag_selector.call(tagsoup)
+        else
+          tagsoup.html.head.title
+        end
+      end
+
+      # Retrieve the root element to extract document content from
+      def content_element(tagsoup)
+        if @config.content_tag_selector
+          @config.content_tag_selector.call(tagsoup)
+        else
+          tagsoup.html.body
+        end
+      end
+
+      # Return the given string minus all html comments
+      def strip_comments(string)
+        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
+      end
+    end
+
+  end
+end
data/lib/rdig/search.rb
CHANGED
@@ -12,7 +12,7 @@ module RDig
     # takes the ferret section of the rdig configuration as a parameter.
     def initialize(settings)
       @ferret_config = settings
-      @query_parser = Ferret::QueryParser.new(
+      @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
       ferret_searcher
     end

@@ -24,8 +24,8 @@ module RDig
         @ferret_searcher = nil
       end
       unless @ferret_searcher
-        @ferret_searcher =
-        @query_parser.fields = @ferret_searcher.reader.
+        @ferret_searcher = Ferret::Search::Searcher.new(@ferret_config.path)
+        @query_parser.fields = @ferret_searcher.reader.field_names.to_a
       end
       @ferret_searcher
     end
@@ -36,23 +36,23 @@ module RDig
     # for more information on queries.
     # A Ferret::Search::Query instance may be given, too.
     #
-    #
-    #
-    #
+    # Some of the more often used otions are:
+    # offset:: first document in result list to retrieve (0-based). The default is 0.
+    # limit:: number of documents to retrieve. The default is 10.
+    # Please see the Ferret::Search::Searcher API for more options.
     def search(query, options={})
       result = {}
       query = query_parser.parse(query) if query.is_a?(String)
       puts "Query: #{query}"
-      hits = ferret_searcher.search(query, options)
-      result[:hitcount] = hits.total_hits
       results = []
-
-
+      searcher = ferret_searcher
+      result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
+        doc = searcher[doc_id]
       results << { :score => score,
-
-
-
-
+                   :title => doc[:title],
+                   :url => doc[:url],
+                   :extract => build_extract(doc[:data]) }
+      end
       result[:list] = results
       result
     end
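With hits now collected through search_each, the searcher returns the total hit count together with the result list, and paging works through the offset/limit options documented in the new comments. A usage sketch (query string and index contents are assumed):

require 'rdig'
# assumes RDig.configuration has already pointed cfg.index.path at an existing index

result = RDig.searcher.search('ruby indexing', :limit => 10, :offset => 0)

puts "#{result[:hitcount]} documents matched"
result[:list].each do |hit|
  puts "#{hit[:score]}  #{hit[:title]}  #{hit[:url]}"
  puts hit[:extract]
end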
data/rakefile
CHANGED
@@ -125,12 +125,16 @@ else
     to help building a site search for web sites or intranets. Internally,
     Ferret is used for the full text indexing. After creating a config file
     for your site, the index can be built with a single call to rdig.
+    For HTML page crawling, hpricot and rubyful_soup are supported.
   EOF

   #### Dependencies and requirements.

   s.add_dependency('ferret', '>= 0.10.0')
-
+  # TODO: check if there is anything like 'suggested' instead of required, or
+  # ORed dependencies...
+  #s.add_dependency('rubyful_soup', '>= 1.0.4')
+  #s.add_dependency('hpricot', '>= 0.4')
   #s.requirements << ""

   #### Which files are to be included in this gem? Everything! (Except CVS directories.)
data/test/unit/hpricot_content_extractor_test.rb
ADDED
@@ -0,0 +1,77 @@
+require 'test_helper'
+class HpricotContentExtractorTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @config = RDig.config.content_extraction.hpricot.clone
+    @extractor = ContentExtractors::HpricotContentExtractor.new(OpenStruct.new(:hpricot => @config))
+    @nbsp = [160].pack('U') # non breaking space
+  end
+
+  def test_can_do
+    assert !@extractor.can_do('application/pdf')
+    assert !@extractor.can_do('application/msword')
+    assert @extractor.can_do('text/html')
+    assert @extractor.can_do('text/xml')
+    assert @extractor.can_do('application/xml')
+    assert @extractor.can_do('application/xhtml+xml')
+  end
+
+  def test_simple
+    result = ContentExtractors.process(html_doc('simple'), 'text/html')
+    assert_not_nil result
+    assert_equal 'Sample Title', result[:title]
+    assert_not_nil result[:content]
+    assert_not_nil result[:links]
+    assert_equal 1, result[:links].size
+    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
+    assert_equal 'http://test.host/affe.html', result[:links].first
+  end
+
+  def test_entities
+    result = @extractor.process(html_doc('entities'))
+    assert_equal 'Sample & Title', result[:title]
+    assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
+    assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
+    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
+  end
+
+  def test_custom_content_element
+    @config.title_tag_selector = lambda do |doc|
+      doc.at("h1[@class='title']")
+    end
+    @config.content_tag_selector = lambda do |doc|
+      doc.at("div[@id='content']")
+    end
+    result = @extractor.process(html_doc('custom_tag_selectors'))
+    assert_equal 'Sample Title in h1', result[:title]
+    assert_equal 'Affe Real content is here.', result[:content]
+    # check if links are collected outside the content tag, too:
+    assert_equal 3, result[:links].size
+    assert_equal 'http://test.host/outside.html', result[:links].first
+    assert_equal '/inside.html', result[:links][1]
+    assert_equal '/footer.html', result[:links][2]
+  end
+
+
+  def test_title_from_dcmeta
+    @config.title_tag_selector = lambda do |doc|
+      doc.at("meta[@name='DC.title']")['content']
+    end
+    result = @extractor.process(html_doc('custom_tag_selectors'))
+    assert_equal 'Title from DC meta data', result[:title]
+  end
+
+  def test_preprocessed_title
+    @config.title_tag_selector = lambda do |doc|
+      title = doc.at("meta[@name='DC.title']")['content']
+      # use only a portion of the title tag's contents if it matches our
+      # regexp:
+      (title =~ /^(.*)meta data$/ ? $1 : title).strip
+    end
+    result = @extractor.process(html_doc('custom_tag_selectors'))
+    assert_equal 'Title from DC', result[:title]
+  end
+
+end
+
data/test/unit/{html_content_extractor_test.rb → rubyful_soup_content_extractor_test.rb}
RENAMED
@@ -1,10 +1,16 @@
 require 'test_helper'
-class HtmlContentExtractorTest < Test::Unit::TestCase
+class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
   include TestHelper

   def setup
-    @config =
-
+    @config = OpenStruct.new(
+      :content_tag_selector => lambda { |tagsoup|
+        tagsoup.html.body
+      },
+      :title_tag_selector => lambda { |tagsoup|
+        tagsoup.html.head.title
+      })
+    @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
     @nbsp = [160].pack('U') # non breaking space
   end

@@ -37,10 +43,10 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   end

   def test_custom_content_element
-    @config.
+    @config.title_tag_selector = lambda do |tagsoup|
       tagsoup.find('h1', :attrs => { 'class', 'title' })
     end
-    @config.
+    @config.content_tag_selector = lambda do |tagsoup|
       tagsoup.find('div', :attrs => { 'id', 'content' })
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -55,7 +61,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase


   def test_title_from_dcmeta
-    @config.
+    @config.title_tag_selector = lambda do |tagsoup|
       tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -63,7 +69,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   end

   def test_preprocessed_title
-    @config.
+    @config.title_tag_selector = lambda do |tagsoup|
       title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
       # use only a portion of the title tag's contents if it matches our
       # regexp:
data/test/unit/searcher_test.rb
ADDED
@@ -0,0 +1,35 @@
+require 'test_helper'
+class SearcherTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
+    index_dir = 'tmp/test-index'
+    Dir.mkdir index_dir unless File.directory? index_dir
+    RDig.configuration do |cfg|
+      @old_crawler_cfg = cfg.crawler.clone
+      cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
+      cfg.crawler.num_threads = 1
+      cfg.crawler.wait_before_leave = 1
+      cfg.index.path = index_dir
+      cfg.verbose = true
+    end
+    crawler = Crawler.new
+    crawler.run
+  end
+
+  def teardown
+    RDig.configuration do |cfg|
+      cfg.crawler = @old_crawler_cfg
+    end
+  end
+
+  def test_search
+    result = RDig.searcher.search 'some sample text'
+    assert_equal 3, result[:hitcount]
+    assert_equal 3, result[:list].size
+  end
+
+end
+
+
metadata
CHANGED
@@ -3,15 +3,15 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.3.2
-date: 2006-10-
+  version: 0.3.3
+date: 2006-10-23 00:00:00 +02:00
 summary: Ruby based web site indexing and searching library.
 require_paths:
 - lib
 email: jk@jkraemer.net
 homepage: http://rdig.rubyforge.org/
 rubyforge_project: rdig
-description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig.
+description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
 autorequire:
 default_executable: rdig
 bindir: bin
@@ -33,6 +33,7 @@ files:
 - lib/rdig
 - lib/htmlentities
 - lib/rdig.rb
+- lib/rdig/content_extractors
 - lib/rdig/crawler.rb
 - lib/rdig/search.rb
 - lib/rdig/highlight.rb
@@ -41,6 +42,10 @@ files:
 - lib/rdig/content_extractors.rb
 - lib/rdig/documents.rb
 - lib/rdig/file.rb
+- lib/rdig/content_extractors/rubyful_soup.rb
+- lib/rdig/content_extractors/doc.rb
+- lib/rdig/content_extractors/hpricot.rb
+- lib/rdig/content_extractors/pdf.rb
 - lib/htmlentities/CHANGES
 - lib/htmlentities/COPYING
 - lib/htmlentities/README
@@ -50,8 +55,10 @@ files:
 - test/test_helper.rb
 - test/unit/etag_filter_test.rb
 - test/unit/url_filters_test.rb
-- test/unit/html_content_extractor_test.rb
+- test/unit/searcher_test.rb
+- test/unit/rubyful_soup_content_extractor_test.rb
 - test/unit/pdf_content_extractor_test.rb
+- test/unit/hpricot_content_extractor_test.rb
 - test/unit/word_content_extractor_test.rb
 - test/unit/file_document_test.rb
 - test/unit/crawler_fs_test.rb
@@ -100,12 +107,3 @@ dependencies:
   - !ruby/object:Gem::Version
     version: 0.10.0
   version:
-- !ruby/object:Gem::Dependency
-  name: rubyful_soup
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 1.0.4
-    version: