scrubber-scrubyt 0.4.11
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +343 -0
- data/COPYING +340 -0
- data/README +99 -0
- data/Rakefile +101 -0
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/constraint.rb +169 -0
- data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
- data/lib/scrubyt/core/scraping/pattern.rb +359 -0
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
- data/lib/scrubyt/core/shared/extractor.rb +167 -0
- data/lib/scrubyt/logging.rb +154 -0
- data/lib/scrubyt/output/post_processor.rb +139 -0
- data/lib/scrubyt/output/result.rb +44 -0
- data/lib/scrubyt/output/result_dumper.rb +154 -0
- data/lib/scrubyt/output/result_node.rb +140 -0
- data/lib/scrubyt/output/scrubyt_result.rb +42 -0
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
- data/lib/scrubyt/utils/shared_utils.rb +58 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
- data/lib/scrubyt/utils/xpathutils.rb +202 -0
- data/lib/scrubyt.rb +43 -0
- data/test/blackbox_test.rb +60 -0
- data/test/blackbox_tests/basic/multi_root.rb +6 -0
- data/test/blackbox_tests/basic/simple.rb +5 -0
- data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
- data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
- data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
- data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
- metadata +115 -0
@@ -0,0 +1,359 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
|
4
|
+
module Scrubyt
|
5
|
+
##
|
6
|
+
#=<tt>Group more filters into one</tt>
|
7
|
+
#
|
8
|
+
#Serves as an umbrella for filters which are conceptually extracting
|
9
|
+
#the same thing - for example a price or a title or ...
|
10
|
+
#
|
11
|
+
#Sometimes the same piece of information can not be extracted with one filter
|
12
|
+
#across more result instances (for example a price has an XPath in record n,
|
13
|
+
#but since in record n+1 has a discount price as well, the real price is pushed
|
14
|
+
#to a different XPath etc) - in this case the more filters which extract the same
|
15
|
+
#thing are held in the same pattern.
|
16
|
+
class Pattern
|
17
|
+
#Type of the pattern;
|
18
|
+
|
19
|
+
# TODO: Update documentation
|
20
|
+
|
21
|
+
# # a root pattern represents a (surprise!) root pattern
|
22
|
+
# PATTERN_TYPE_ROOT = :PATTERN_TYPE_ROOT
|
23
|
+
# # a tree pattern represents a HTML region
|
24
|
+
# PATTERN_TYPE_TREE = :PATTERN_TYPE_TREE
|
25
|
+
# # represents an attribute of the node extracted by the parent pattern
|
26
|
+
# PATTERN_TYPE_ATTRIBUTE = :PATTERN_TYPE_ATTRIBUTE
|
27
|
+
# # represents a pattern which filters its output with a regexp
|
28
|
+
# PATTERN_TYPE_REGEXP = :PATTERN_TYPE_REGEXP
|
29
|
+
# # represents a pattern which crawls to the detail page and extracts information from there
|
30
|
+
# PATTERN_TYPE_DETAIL_PAGE = :PATTERN_TYPE_DETAIL_PAGE
|
31
|
+
# # represents a download pattern
|
32
|
+
# PATTERN_TYPE_DOWNLOAD = :PATTERN_TYPE_DOWNLOAD
|
33
|
+
# # write out the HTML subtree beginning at the matched element
|
34
|
+
# PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
|
35
|
+
|
36
|
+
VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
|
37
|
+
|
38
|
+
# :determine - default value, represent that type of example need determine
|
39
|
+
# :string - represent node with example type EXAMPLE_TYPE_STRING
|
40
|
+
VALID_PATTERN_EXAMPLE_TYPES = [:determine, :xpath]
|
41
|
+
|
42
|
+
#The pattern can be either a model pattern (in this case it is
|
43
|
+
#written to the output) or a temp pattern (in this case it is skipped)
|
44
|
+
#Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
|
45
|
+
#is considered to be a model pattern
|
46
|
+
|
47
|
+
#Model pattern are shown in the output
|
48
|
+
# OUTPUT_TYPE_MODEL = :OUTPUT_TYPE_MODEL
|
49
|
+
# #Temp patterns are skipped in the output (their ancestors are appended to the parent
|
50
|
+
# #of the pattern which was skipped)
|
51
|
+
# OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
|
52
|
+
|
53
|
+
VALID_OUTPUT_TYPES = [:model, :temp, :page_list]
|
54
|
+
|
55
|
+
#These options can be set upon wrapper creation
|
56
|
+
PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve, :except, :example_type]
|
57
|
+
VALID_OPTIONS = PATTERN_OPTIONS + Scrubyt::CompoundExample::DESCRIPTORS + Scrubyt::ResultNode::OUTPUT_OPTIONS
|
58
|
+
|
59
|
+
attr_accessor(:name, :options, :children, :constraints, :filters, :parent, :extractor,
|
60
|
+
:indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls)
|
61
|
+
|
62
|
+
attr_reader(:next_page_url, :result_indexer)
|
63
|
+
|
64
|
+
option_reader(:type => :tree, :output_type => :model, :generalize => false,
|
65
|
+
:write_text => lambda { @children.size == 0 }, :limit => nil,
|
66
|
+
:default => nil, :resolve => :full, :except => [], :example_type => :determine)
|
67
|
+
|
68
|
+
def initialize(name, args=[], extractor=nil, parent=nil, &block)
|
69
|
+
#init attributes
|
70
|
+
@name = name
|
71
|
+
@extractor = extractor
|
72
|
+
@parent = parent
|
73
|
+
@options = {}
|
74
|
+
@children = []
|
75
|
+
@filters = []
|
76
|
+
@constraints = []
|
77
|
+
@modifier_calls = []
|
78
|
+
|
79
|
+
#grab any examples that are defined
|
80
|
+
examples = look_for_examples(args)
|
81
|
+
|
82
|
+
#parse the options hash if provided
|
83
|
+
parse_options_hash(args[-1]) if args[-1].is_a? Hash
|
84
|
+
|
85
|
+
#perform checks for special cases
|
86
|
+
examples = check_if_shortcut_pattern() if examples == nil
|
87
|
+
check_if_detail_page(block)
|
88
|
+
@options[:output_type] = :page_list if name == 'page_list'
|
89
|
+
|
90
|
+
#create filters
|
91
|
+
if examples == nil
|
92
|
+
@filters << Scrubyt::BaseFilter.create(self) #create a default filter
|
93
|
+
else
|
94
|
+
examples.each do |example|
|
95
|
+
@filters << Scrubyt::BaseFilter.create(self,example) #create a filter
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
#by default, generalize the root pattern, but only in the case if
|
100
|
+
#@generalize was not set up explicitly
|
101
|
+
if @options[:generalize].nil?
|
102
|
+
@options[:generalize] = true if parent.nil?
|
103
|
+
@options[:generalize] = false if ((filters[0].example.is_a? String) && (filters[0].example =~ /.+\[[a-zA-Z].+\]$/))
|
104
|
+
end
|
105
|
+
|
106
|
+
#parse child patterns if available
|
107
|
+
parse_child_patterns(&block) if ( !block.nil? && type != :detail_page )
|
108
|
+
|
109
|
+
#tree pattern only (TODO: subclass?)
|
110
|
+
if type == :tree
|
111
|
+
#generate xpaths and regexps
|
112
|
+
@filters.each do |filter|
|
113
|
+
filter.generate_XPath_for_example(false) unless @name == 'next_page'
|
114
|
+
filter.generate_regexp_for_example
|
115
|
+
end
|
116
|
+
#when the xpaths of this pattern have been created, its children can make their xpaths relative
|
117
|
+
xpaths = @filters.collect { |filter| filter.xpath }
|
118
|
+
@children.each do |child|
|
119
|
+
child.generate_relative_XPaths xpaths
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
def generate_relative_XPaths(parent_xpaths)
|
125
|
+
return if type != :tree
|
126
|
+
raise ArgumentError.new if parent_xpaths.size != 1 && parent_xpaths.size != @filters.size #TODO: should be checked earlier with proper error message
|
127
|
+
@filters.each_index do |index|
|
128
|
+
@filters[index].generate_relative_XPath parent_xpaths[parent_xpaths.size == 1 ? 0 : index]
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
#Shortcut patterns, as their name says, are a shortcut for creating patterns
|
133
|
+
#from predefined rules; for example:
|
134
|
+
#
|
135
|
+
# detail_url
|
136
|
+
#
|
137
|
+
# is equivalent to
|
138
|
+
#
|
139
|
+
# detail_url 'href', type => :attribute
|
140
|
+
#
|
141
|
+
#i.e. the system figures out on its own that because of the postfix, the
|
142
|
+
#example should be looked up (but it should never override the user input!)
|
143
|
+
#another example (will be available later):
|
144
|
+
#
|
145
|
+
# every_img
|
146
|
+
#
|
147
|
+
# is equivalent to
|
148
|
+
#
|
149
|
+
# every_img '//img'
|
150
|
+
#
|
151
|
+
def check_if_shortcut_pattern()
|
152
|
+
if @name =~ /.+_url/
|
153
|
+
@options[:type] = :attribute
|
154
|
+
['href']
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
#Check whether the currently created pattern is a detail pattern (i.e. it references
|
159
|
+
#a subextractor). Also check if the currently created pattern is
|
160
|
+
#an ancestor of a detail pattern , and store this in a hash if yes (to be able to
|
161
|
+
#traverse the pattern structure on detail pages as well).
|
162
|
+
def check_if_detail_page(block)
|
163
|
+
if @name =~ /.+_detail/
|
164
|
+
@options[:type] = :detail_page
|
165
|
+
@referenced_extractor = block
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
def parent_of_leaf
|
170
|
+
@children.inject(false) { |is_parent_of_leaf, child| is_parent_of_leaf || child.children.empty? }
|
171
|
+
end
|
172
|
+
|
173
|
+
def filter_count
|
174
|
+
@filters.size
|
175
|
+
end
|
176
|
+
|
177
|
+
def parse_child_patterns(&block)
|
178
|
+
context = Object.new
|
179
|
+
context.instance_eval do
|
180
|
+
def current=(value)
|
181
|
+
@current = value
|
182
|
+
end
|
183
|
+
def method_missing(method_name, *args, &block)
|
184
|
+
if method_name.to_s[0..0] == '_'
|
185
|
+
#add hash option
|
186
|
+
key = method_name.to_s[1..-1].to_sym
|
187
|
+
check_option(key)
|
188
|
+
args.each do |arg|
|
189
|
+
current_value = @current.options[key]
|
190
|
+
if current_value.nil?
|
191
|
+
@current.options[key] = arg
|
192
|
+
else
|
193
|
+
@current.options[key] = [current_value] if !current_value.is_a Array
|
194
|
+
@current.options[key] << arg
|
195
|
+
end
|
196
|
+
end
|
197
|
+
else
|
198
|
+
#create child pattern
|
199
|
+
child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
|
200
|
+
@current.children << child
|
201
|
+
child
|
202
|
+
end
|
203
|
+
end
|
204
|
+
end
|
205
|
+
context.current = self
|
206
|
+
context.instance_eval(&block)
|
207
|
+
end
|
208
|
+
|
209
|
+
#Dispatcher function; The class was already too big so I have decided to factor
|
210
|
+
#out some methods based on their functionality (like output, adding constraints)
|
211
|
+
#to utility classes.
|
212
|
+
#
|
213
|
+
#The second function besides dispatching is to lookup the results in an evaluated
|
214
|
+
#wrapper, for example
|
215
|
+
#
|
216
|
+
# camera_data.item[1].item_name[0]
|
217
|
+
def method_missing(method_name, *args, &block)
|
218
|
+
if @extractor.evaluating_extractor_definition
|
219
|
+
@modifier_calls << [method_name, [:array, *args.collect { |arg| [:lit, arg] }]]
|
220
|
+
end
|
221
|
+
|
222
|
+
case method_name.to_s
|
223
|
+
when 'select_indices'
|
224
|
+
@result_indexer = Scrubyt::ResultIndexer.new(*args)
|
225
|
+
return self
|
226
|
+
when /^ensure_/
|
227
|
+
@constraints << Scrubyt::ConstraintAdder.send(method_name, *args)
|
228
|
+
return self #To make chaining possible
|
229
|
+
else
|
230
|
+
@children.each { |child| return child if child.name == method_name.to_s }
|
231
|
+
end
|
232
|
+
|
233
|
+
raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
|
234
|
+
end
|
235
|
+
|
236
|
+
def evaluate(source, filter_indices)
|
237
|
+
if type == :detail_page # DIRTY!
|
238
|
+
return @filters[0].evaluate(source)
|
239
|
+
end
|
240
|
+
|
241
|
+
#we apply all filters if filter_indices is nil
|
242
|
+
indices_to_evaluate = filter_indices.nil? ? 0...@filters.size : filter_indices
|
243
|
+
#stores the results of all filters
|
244
|
+
all_filter_results = []
|
245
|
+
#remembers which filters have retured a certain result
|
246
|
+
indices_mapping = {}
|
247
|
+
#evaluate filters and collect filter results
|
248
|
+
indices_to_evaluate.each do |filter_index|
|
249
|
+
filter = @filters[filter_index]
|
250
|
+
filter_results = filter.evaluate(source)
|
251
|
+
filter_results.each do |result|
|
252
|
+
#add result to list if not already there
|
253
|
+
all_filter_results << result if all_filter_results.index(result).nil?
|
254
|
+
#add the current filter's index to the mapping
|
255
|
+
(indices_mapping[result] ||= []) << filter_index
|
256
|
+
end
|
257
|
+
end
|
258
|
+
|
259
|
+
#apply constraints
|
260
|
+
if @constraints.size > 0
|
261
|
+
all_filter_results = all_filter_results.select do |result|
|
262
|
+
@constraints.inject(true) { |accepted, constraint| accepted && constraint.check(result) }
|
263
|
+
end
|
264
|
+
end
|
265
|
+
#apply indexer
|
266
|
+
all_filter_results = @result_indexer.select_indices_to_extract(all_filter_results) if !@result_indexer.nil?
|
267
|
+
|
268
|
+
#create result nodes and evaluate children
|
269
|
+
result_nodes = []
|
270
|
+
all_filter_results.each do |result|
|
271
|
+
#create result node
|
272
|
+
node = ResultNode.new(@name, result, @options)
|
273
|
+
node.generated_by_leaf = (@children.size == 0)
|
274
|
+
#evaluate children
|
275
|
+
@children.each do |child|
|
276
|
+
raise if self.filter_count != 1 && child.filter_count != self.filter_count
|
277
|
+
if self.filter_count == 1
|
278
|
+
#evaluate all child filters
|
279
|
+
node.push(*child.evaluate(result, nil))
|
280
|
+
else
|
281
|
+
#evaluate appropriate child filters
|
282
|
+
node.push(*child.evaluate(result, indices_mapping[result]))
|
283
|
+
end
|
284
|
+
end
|
285
|
+
#apply child constraints (ensure_presence_of_pattern)
|
286
|
+
required_child_names = @constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN }.map {|c| c.target}
|
287
|
+
unless required_child_names.empty?
|
288
|
+
check = lambda { |node_to_check|
|
289
|
+
required_child_names.delete node_to_check.name
|
290
|
+
node_to_check.each { |child| check.call child }
|
291
|
+
}
|
292
|
+
check.call node
|
293
|
+
end
|
294
|
+
next unless required_child_names.empty?
|
295
|
+
#add the current result node to the list
|
296
|
+
result_nodes << node
|
297
|
+
end
|
298
|
+
if result_nodes.empty?
|
299
|
+
result_nodes << ResultNode.new(@name, @options[:default], @options) if @options[:default]
|
300
|
+
end
|
301
|
+
case output_type
|
302
|
+
when :model
|
303
|
+
return result_nodes
|
304
|
+
when :page_list
|
305
|
+
result_nodes.each do |result_node|
|
306
|
+
@extractor.add_to_next_page_list result_node
|
307
|
+
end
|
308
|
+
return []
|
309
|
+
end
|
310
|
+
end
|
311
|
+
|
312
|
+
private
|
313
|
+
def parse_options_hash(hash)
|
314
|
+
#merge provided hash
|
315
|
+
@options.merge!(hash)
|
316
|
+
#check if valid
|
317
|
+
hash.each { |key, value| check_option(key.to_sym) }
|
318
|
+
raise "Invalid pattern type: #{type.to_s}" if VALID_PATTERN_TYPES.index(type.to_sym).nil?
|
319
|
+
raise "Invalid output type: #{output_type.to_s}" if VALID_OUTPUT_TYPES.index(output_type.to_sym).nil?
|
320
|
+
raise "Invalid example type: #{example_type.to_s}" if VALID_PATTERN_EXAMPLE_TYPES.index(example_type.to_sym).nil?
|
321
|
+
end
|
322
|
+
|
323
|
+
def check_option(option)
|
324
|
+
raise "Unknown pattern option: #{option.to_s}" if VALID_OPTIONS.index(option).nil?
|
325
|
+
end
|
326
|
+
|
327
|
+
def look_for_examples(args)
|
328
|
+
if (args[0].is_a? String)
|
329
|
+
examples = args.select {|e| e.is_a? String}
|
330
|
+
#Check if all the String parameters are really the first
|
331
|
+
#parameters
|
332
|
+
args[0..examples.size-1].each do |example|
|
333
|
+
if !example.is_a? String
|
334
|
+
puts 'FATAL: Problem with example specification'
|
335
|
+
end
|
336
|
+
end
|
337
|
+
elsif (args[0].is_a? Regexp)
|
338
|
+
examples = args.select {|e| e.is_a? Regexp}
|
339
|
+
#Check if all the String parameters are really the first
|
340
|
+
#parameters
|
341
|
+
args[0..examples.size].each do |example|
|
342
|
+
if !example.is_a? Regexp
|
343
|
+
puts 'FATAL: Problem with example specification'
|
344
|
+
end
|
345
|
+
end
|
346
|
+
@options[:type] = :regexp
|
347
|
+
elsif (args[0].is_a? Hash)
|
348
|
+
examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
|
349
|
+
examples = nil if examples == []
|
350
|
+
elsif (args[0].is_a? Proc)
|
351
|
+
examples = [args[0]]
|
352
|
+
end
|
353
|
+
|
354
|
+
@has_examples = !examples.nil?
|
355
|
+
examples
|
356
|
+
end
|
357
|
+
|
358
|
+
end #end of class Pattern
|
359
|
+
end #end of module Scrubyt
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Apply different functions on the input document</tt>
|
4
|
+
#Before the document is passed to Hpricot for parsing, we may need
|
5
|
+
#to do different stuff with it which are clumsy/not appropriate/impossible
|
6
|
+
#to do once the document is loaded.
|
7
|
+
class PreFilterDocument
|
8
|
+
#Replace <br/> tags with newlines
|
9
|
+
def self.br_to_newline(doc)
|
10
|
+
doc.gsub(/<br[ \/]*>/i, "\r\n")
|
11
|
+
doc = doc.tr("\240"," ")
|
12
|
+
end #end of function br_to_newline
|
13
|
+
end #end of class PreFilterDocument
|
14
|
+
end #end of module Scrubyt
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Selecting results based on indices</tt>
|
4
|
+
#
|
5
|
+
#If the result is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
|
6
|
+
#probably with a variable count of results (like tags, authors etc.), you may need just
|
7
|
+
#specific elements - like the last one, every third one, or at specific indices.
|
8
|
+
#In this case you should use the select_indices syntax.
|
9
|
+
class ResultIndexer
|
10
|
+
attr_reader :indices_to_extract
|
11
|
+
|
12
|
+
def initialize(*args)
|
13
|
+
select_indices(*args)
|
14
|
+
end
|
15
|
+
|
16
|
+
##
|
17
|
+
#Perform selection of the desires result instances, based on their indices
|
18
|
+
def select_indices_to_extract(ary)
|
19
|
+
return ary if @indices_to_extract == nil
|
20
|
+
to_keep = []
|
21
|
+
@indices_to_extract.each {|e|
|
22
|
+
if e.is_a? Symbol
|
23
|
+
case e
|
24
|
+
when :first
|
25
|
+
to_keep << 0
|
26
|
+
when :last
|
27
|
+
to_keep << ary.size-1
|
28
|
+
when :all_but_last
|
29
|
+
(0..ary.size-2).each {|i| to_keep << i}
|
30
|
+
when :all_but_first
|
31
|
+
(1..ary.size-1).each {|i| to_keep << i}
|
32
|
+
when :every_even
|
33
|
+
(0..ary.size).each {|i| to_keep << i if (i % 2 == 1)}
|
34
|
+
when :every_odd
|
35
|
+
(0..ary.size).each {|i| to_keep << i if (i % 2 == 0)}
|
36
|
+
when :every_second
|
37
|
+
(0..ary.size).each {|i| to_keep << i if (i % 2 == 0)}
|
38
|
+
when :every_third
|
39
|
+
(0..ary.size).each {|i| to_keep << i if (i % 3 == 0)}
|
40
|
+
when :every_fourth
|
41
|
+
(0..ary.size).each {|i| to_keep << i if (i % 4 == 0)}
|
42
|
+
end
|
43
|
+
end
|
44
|
+
}
|
45
|
+
@indices_to_extract.each {|i| to_keep << i if !i.is_a? Symbol}
|
46
|
+
to_keep.sort!
|
47
|
+
ary.reject! {|e| !to_keep.include? ary.index(e)}
|
48
|
+
ary
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
##
|
53
|
+
#Do not return the whole result set, just specified indices - like
|
54
|
+
#first,last, every odd index, indices from [1..3] etc.
|
55
|
+
#
|
56
|
+
#This method can accept:
|
57
|
+
#- a range, like (2..3)
|
58
|
+
#- an array of indices, like [1,2,3]
|
59
|
+
#- specified set of keywords:
|
60
|
+
# - :first
|
61
|
+
# - :last
|
62
|
+
# - :every_even
|
63
|
+
# - :every_odd
|
64
|
+
# (there can be more of these keywords in one select_indices call)
|
65
|
+
def select_indices(*args)
|
66
|
+
indices_to_grab = args[0]
|
67
|
+
case indices_to_grab.class.to_s
|
68
|
+
when "Range"
|
69
|
+
@indices_to_extract = indices_to_grab.to_a
|
70
|
+
when "Array"
|
71
|
+
nested_arrays = []
|
72
|
+
indices_to_grab.each {|e|
|
73
|
+
if e.is_a? Array
|
74
|
+
nested_arrays << e
|
75
|
+
elsif e.is_a? Range
|
76
|
+
nested_arrays << e.to_a
|
77
|
+
end
|
78
|
+
}
|
79
|
+
@indices_to_extract = indices_to_grab
|
80
|
+
nested_arrays.each {|a| a.each {|e| @indices_to_extract << e if !@indices_to_extract.include? e }}
|
81
|
+
@indices_to_extract.reject! {|e| ((e.is_a? Range) || (e.is_a? Array)) }
|
82
|
+
when "Symbol"
|
83
|
+
#parse this when we already have the results
|
84
|
+
@indices_to_extract = [indices_to_grab]
|
85
|
+
else
|
86
|
+
puts "Invalid index specification"
|
87
|
+
end
|
88
|
+
end #end of function select_indices
|
89
|
+
end #end of class ResultIndexer
|
90
|
+
end #end of module Scrubyt
|
@@ -0,0 +1,167 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Driving the whole extraction process</tt>
|
4
|
+
#
|
5
|
+
#Extractor is a performer class - it gets an extractor definition and carries
|
6
|
+
#out the actions and evaluates the wrappers sequentially.
|
7
|
+
#
|
8
|
+
#Originally also the navigation actions were here, but since the class got too
|
9
|
+
#big, they were factored out to an own class, NavigationAction.
|
10
|
+
class Extractor
|
11
|
+
include FetchAction
|
12
|
+
|
13
|
+
attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url
|
14
|
+
|
15
|
+
#The definition of the extractor is passed through this method
|
16
|
+
def self.define(mode=nil, &extractor_definition)
|
17
|
+
if mode.is_a?(Hash)
|
18
|
+
if mode[:agent]==:firefox
|
19
|
+
FetchAction.class_eval do
|
20
|
+
include Navigation::Firewatir
|
21
|
+
end
|
22
|
+
else
|
23
|
+
FetchAction.class_eval do
|
24
|
+
include Navigation::Mechanize
|
25
|
+
end
|
26
|
+
end
|
27
|
+
else
|
28
|
+
FetchAction.class_eval do
|
29
|
+
include Navigation::Mechanize
|
30
|
+
end
|
31
|
+
end
|
32
|
+
extractor = self.new(mode, extractor_definition)
|
33
|
+
extractor.result
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.load(filename)
|
37
|
+
define(&eval(IO.read(filename)))
|
38
|
+
end
|
39
|
+
|
40
|
+
def initialize(mode, extractor_definition)
|
41
|
+
@mode = mode
|
42
|
+
@root_patterns = []
|
43
|
+
@next_page_pattern = nil
|
44
|
+
# @hpricot_doc = nil
|
45
|
+
# @hpricot_doc_url = nil
|
46
|
+
@evaluating_extractor_definition = false
|
47
|
+
@next_page_list = []
|
48
|
+
@processed_pages = []
|
49
|
+
|
50
|
+
backtrace = SharedUtils.get_backtrace
|
51
|
+
parts = backtrace[1].split(':')
|
52
|
+
source_file = parts[0]
|
53
|
+
|
54
|
+
Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
|
55
|
+
|
56
|
+
@evaluating_extractor_definition = true
|
57
|
+
context = Object.new
|
58
|
+
context.extend NavigationActions
|
59
|
+
context.instance_eval do
|
60
|
+
def extractor=(value)
|
61
|
+
@extractor = value
|
62
|
+
end
|
63
|
+
|
64
|
+
def next_page(*args)
|
65
|
+
@extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
|
66
|
+
end
|
67
|
+
|
68
|
+
def method_missing(method_name, *args, &block)
|
69
|
+
root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
|
70
|
+
@extractor.root_patterns << root_pattern
|
71
|
+
root_pattern
|
72
|
+
end
|
73
|
+
end
|
74
|
+
context.extractor = self
|
75
|
+
context.instance_eval(&extractor_definition)
|
76
|
+
@evaluating_extractor_definition = false
|
77
|
+
|
78
|
+
if @root_patterns.empty?
|
79
|
+
# TODO: this should be an exception
|
80
|
+
Scrubyt.log :ERROR, 'No extractor defined, exiting...'
|
81
|
+
exit
|
82
|
+
end
|
83
|
+
|
84
|
+
#Once all is set up, evaluate the extractor from the root pattern!
|
85
|
+
root_results = evaluate_extractor
|
86
|
+
|
87
|
+
@result = ScrubytResult.new('root')
|
88
|
+
@result.push(*root_results)
|
89
|
+
@result.root_patterns = @root_patterns
|
90
|
+
@result.source_file = source_file
|
91
|
+
@result.source_proc = extractor_definition
|
92
|
+
|
93
|
+
#Return the root pattern
|
94
|
+
Scrubyt.log :INFO, 'Extraction finished succesfully!'
|
95
|
+
end
|
96
|
+
|
97
|
+
def get_hpricot_doc
|
98
|
+
FetchAction.get_hpricot_doc
|
99
|
+
end
|
100
|
+
|
101
|
+
def get_current_doc_url
|
102
|
+
FetchAction.get_current_doc_url
|
103
|
+
end
|
104
|
+
|
105
|
+
def get_detail_pattern_relations
|
106
|
+
@detail_pattern_relations
|
107
|
+
end
|
108
|
+
|
109
|
+
def get_mode
|
110
|
+
@mode
|
111
|
+
end
|
112
|
+
|
113
|
+
def get_original_host_name
|
114
|
+
@original_host_name
|
115
|
+
end
|
116
|
+
|
117
|
+
def add_to_next_page_list(result_node)
|
118
|
+
if result_node.result.is_a? Hpricot::Elem
|
119
|
+
node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
|
120
|
+
return if node == nil || node.attributes['href'] == nil
|
121
|
+
href = node.attributes['href'].gsub('&') {'&'}
|
122
|
+
elsif result_node.result.is_a? String
|
123
|
+
href = result_node.result
|
124
|
+
end
|
125
|
+
url = href #TODO need absolute address here 1/4
|
126
|
+
@next_page_list << url
|
127
|
+
end
|
128
|
+
|
129
|
+
def evaluate_extractor
|
130
|
+
root_results = []
|
131
|
+
current_page_count = 1
|
132
|
+
catch :quit_next_page_loop do
|
133
|
+
loop do
|
134
|
+
url = get_current_doc_url #TODO need absolute address here 2/4
|
135
|
+
@processed_pages << url
|
136
|
+
@root_patterns.each do |root_pattern|
|
137
|
+
root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
|
138
|
+
end
|
139
|
+
|
140
|
+
while @processed_pages.include? url #TODO need absolute address here 3/4
|
141
|
+
if !@next_page_pattern.nil?
|
142
|
+
throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
|
143
|
+
throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
|
144
|
+
xpath = @next_page_pattern.filters[0].xpath
|
145
|
+
node = (get_hpricot_doc/xpath).map.last
|
146
|
+
node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
|
147
|
+
throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
|
148
|
+
href = node.attributes['href'].gsub('&') {'&'}
|
149
|
+
throw :quit_next_page_loop if href == nil
|
150
|
+
url = href #TODO need absolute address here 4/4
|
151
|
+
else
|
152
|
+
throw :quit_next_page_loop if @next_page_list.empty?
|
153
|
+
url = @next_page_list.pop
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
restore_host_name
|
158
|
+
FetchAction.fetch(url)
|
159
|
+
|
160
|
+
current_page_count += 1
|
161
|
+
end
|
162
|
+
end
|
163
|
+
root_results
|
164
|
+
end
|
165
|
+
|
166
|
+
end
|
167
|
+
end
|