scrubber-scrubyt 0.4.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/CHANGELOG +343 -0
  2. data/COPYING +340 -0
  3. data/README +99 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
  6. data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
  7. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  8. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  9. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  10. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  11. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  13. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  14. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  15. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  16. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  17. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  18. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  19. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  20. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  21. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  22. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  23. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  24. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  25. data/lib/scrubyt/core/shared/extractor.rb +167 -0
  26. data/lib/scrubyt/logging.rb +154 -0
  27. data/lib/scrubyt/output/post_processor.rb +139 -0
  28. data/lib/scrubyt/output/result.rb +44 -0
  29. data/lib/scrubyt/output/result_dumper.rb +154 -0
  30. data/lib/scrubyt/output/result_node.rb +140 -0
  31. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  32. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  33. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  34. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  35. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  36. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  37. data/lib/scrubyt.rb +43 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +115 -0
@@ -0,0 +1,359 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+
4
+ module Scrubyt
5
+ ##
6
+ #=<tt>Group more filters into one</tt>
7
+ #
8
+ #Serves as an umbrella for filters which are conceptually extracting
9
+ #the same thing - for example a price or a title or ...
10
+ #
11
+ #Sometimes the same piece of information can not be extracted with one filter
12
+ #across more result instances (for example a price has an XPath in record n,
13
+ #but since in record n+1 has a discount price as well, the real price is pushed
14
+ #to a different XPath etc) - in this case the more filters which extract the same
15
+ #thing are held in the same pattern.
16
+ class Pattern
17
+ #Type of the pattern;
18
+
19
+ # TODO: Update documentation
20
+
21
+ # # a root pattern represents a (surprise!) root pattern
22
+ # PATTERN_TYPE_ROOT = :PATTERN_TYPE_ROOT
23
+ # # a tree pattern represents a HTML region
24
+ # PATTERN_TYPE_TREE = :PATTERN_TYPE_TREE
25
+ # # represents an attribute of the node extracted by the parent pattern
26
+ # PATTERN_TYPE_ATTRIBUTE = :PATTERN_TYPE_ATTRIBUTE
27
+ # # represents a pattern which filters its output with a regexp
28
+ # PATTERN_TYPE_REGEXP = :PATTERN_TYPE_REGEXP
29
+ # # represents a pattern which crawls to the detail page and extracts information from there
30
+ # PATTERN_TYPE_DETAIL_PAGE = :PATTERN_TYPE_DETAIL_PAGE
31
+ # # represents a download pattern
32
+ # PATTERN_TYPE_DOWNLOAD = :PATTERN_TYPE_DOWNLOAD
33
+ # # write out the HTML subtree beginning at the matched element
34
+ # PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
35
+
36
+ VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
37
+
38
+ # :determine - default value, represent that type of example need determine
39
+ # :string - represent node with example type EXAMPLE_TYPE_STRING
40
+ VALID_PATTERN_EXAMPLE_TYPES = [:determine, :xpath]
41
+
42
+ #The pattern can be either a model pattern (in this case it is
43
+ #written to the output) or a temp pattern (in this case it is skipped)
44
+ #Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
45
+ #is considered to be a model pattern
46
+
47
+ #Model pattern are shown in the output
48
+ # OUTPUT_TYPE_MODEL = :OUTPUT_TYPE_MODEL
49
+ # #Temp patterns are skipped in the output (their ancestors are appended to the parent
50
+ # #of the pattern which was skipped)
51
+ # OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
52
+
53
+ VALID_OUTPUT_TYPES = [:model, :temp, :page_list]
54
+
55
+ #These options can be set upon wrapper creation
56
+ PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve, :except, :example_type]
57
+ VALID_OPTIONS = PATTERN_OPTIONS + Scrubyt::CompoundExample::DESCRIPTORS + Scrubyt::ResultNode::OUTPUT_OPTIONS
58
+
59
+ attr_accessor(:name, :options, :children, :constraints, :filters, :parent, :extractor,
60
+ :indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls)
61
+
62
+ attr_reader(:next_page_url, :result_indexer)
63
+
64
+ option_reader(:type => :tree, :output_type => :model, :generalize => false,
65
+ :write_text => lambda { @children.size == 0 }, :limit => nil,
66
+ :default => nil, :resolve => :full, :except => [], :example_type => :determine)
67
+
68
# Builds a pattern named +name+.
#
# +args+ may start with examples (strings, regexps, compound-example
# hashes or a proc) and may end with an options hash. +extractor+ is the
# owning extractor, +parent+ the enclosing pattern (nil for a root
# pattern) and +block+ the nested definition: child patterns or, for a
# detail pattern, the sub-extractor.
def initialize(name, args=[], extractor=nil, parent=nil, &block)
  @name = name
  @extractor = extractor
  @parent = parent
  @options = {}
  @children = []
  @filters = []
  @constraints = []
  @modifier_calls = []

  # examples come first; a trailing hash carries the options
  examples = look_for_examples(args)
  trailing_argument = args[-1]
  parse_options_hash(trailing_argument) if trailing_argument.is_a?(Hash)

  # special cases: shortcut names, detail pages and the page list
  examples = check_if_shortcut_pattern if examples.nil?
  check_if_detail_page(block)
  @options[:output_type] = :page_list if name == 'page_list'

  # one filter per example, or a single default filter without examples
  if examples.nil?
    @filters << Scrubyt::BaseFilter.create(self)
  else
    examples.each { |example| @filters << Scrubyt::BaseFilter.create(self, example) }
  end

  # generalize root patterns by default, unless the option was given
  # explicitly; a string example ending in "[<letter>...]" switches it off
  if @options[:generalize].nil?
    @options[:generalize] = true if parent.nil?
    first_example = filters[0].example
    if first_example.is_a?(String) && first_example =~ /.+\[[a-zA-Z].+\]$/
      @options[:generalize] = false
    end
  end

  # a detail pattern keeps its block as sub-extractor instead of children
  parse_child_patterns(&block) if !block.nil? && type != :detail_page

  # tree pattern only (TODO: subclass?)
  if type == :tree
    @filters.each do |filter|
      filter.generate_XPath_for_example(false) unless @name == 'next_page'
      filter.generate_regexp_for_example
    end
    # now that this pattern has its xpaths, the children can become relative
    own_xpaths = @filters.collect { |filter| filter.xpath }
    @children.each { |child| child.generate_relative_XPaths own_xpaths }
  end
end
123
+
124
# Makes the XPath of every filter relative to the parent's XPath(s).
#
# +parent_xpaths+ must contain either a single XPath (shared by all
# filters) or exactly one XPath per filter of this pattern. Does nothing
# for non-tree patterns.
#
# Raises ArgumentError when the number of parent XPaths fits neither case.
def generate_relative_XPaths(parent_xpaths)
  return if type != :tree

  shared_parent = parent_xpaths.size == 1
  unless shared_parent || parent_xpaths.size == @filters.size
    #TODO: should be checked earlier
    raise ArgumentError,
          "pattern '#{@name}' has #{@filters.size} filter(s) but its parent supplies " \
          "#{parent_xpaths.size} XPath(s); expected 1 or #{@filters.size}"
  end

  @filters.each_with_index do |filter, index|
    filter.generate_relative_XPath(shared_parent ? parent_xpaths[0] : parent_xpaths[index])
  end
end
131
+
132
#Shortcut patterns are patterns created from predefined rules, based on
#the name of the pattern alone. For example
#
#  detail_url
#
#is equivalent to
#
#  detail_url 'href', :type => :attribute
#
#This method is only consulted when no example was given, so it never
#overrides user input. It returns the implied examples (['href']) and
#switches the pattern type to :attribute when the name matches
#/.+_url/; otherwise it returns nil and leaves the options alone.
def check_if_shortcut_pattern()
  return unless @name =~ /.+_url/

  @options[:type] = :attribute
  ['href']
end
157
+
158
#Marks the pattern as a detail pattern when its name matches /.+_detail/:
#the type becomes :detail_page and +block+ is stored as the referenced
#(sub)extractor instead of being parsed as child patterns.
#Returns the stored block, or nil when the name does not match.
def check_if_detail_page(block)
  return unless @name =~ /.+_detail/

  @options[:type] = :detail_page
  @referenced_extractor = block
end
168
+
169
# True when at least one direct child has no children of its own.
def parent_of_leaf
  @children.any? { |child| child.children.empty? }
end
172
+
173
# Number of filters held by this pattern.
def filter_count
  @filters.length
end
176
+
177
# Evaluates +block+ in a throw-away context object whose method_missing
# turns every call into either an option or a child pattern of this
# pattern:
#
# * a call starting with an underscore (e.g. <tt>_limit 3</tt>) sets the
#   option named after the rest of the method name; repeated values for
#   the same option are collected into an array
# * any other call creates a child Scrubyt::Pattern with that name
def parse_child_patterns(&block)
  context = Object.new
  context.instance_eval do
    def current=(value)
      @current = value
    end
    def method_missing(method_name, *args, &block)
      name = method_name.to_s
      if name[0..0] == '_'
        #add hash option
        key = name[1..-1].to_sym
        # check_option is a private method of the pattern, not of this
        # context; calling it bare would fall back into method_missing and
        # create a bogus child pattern called "check_option"
        @current.send(:check_option, key)
        args.each do |arg|
          current_value = @current.options[key]
          if current_value.nil?
            @current.options[key] = arg
          else
            @current.options[key] = [current_value] unless current_value.is_a?(Array)
            @current.options[key] << arg
          end
        end
      else
        #create child pattern
        child = Scrubyt::Pattern.new(name, args, @current.extractor, @current, &block)
        @current.children << child
        child
      end
    end
  end
  context.current = self
  context.instance_eval(&block)
end
208
+
209
#Dispatcher function. Some functionality (indexing, adding constraints)
#lives in utility classes and is reached through here:
#
#* +select_indices+ installs a Scrubyt::ResultIndexer built from the arguments
#* <tt>ensure_*</tt> adds the constraint created by Scrubyt::ConstraintAdder
#
#Both return +self+ to make chaining possible. Any other name is looked
#up among the child patterns, which allows navigating an evaluated wrapper:
#
#  camera_data.item[1].item_name[0]
#
#While the extractor definition is being evaluated, every call is also
#recorded in @modifier_calls. Raises NoMethodError if nothing matches.
def method_missing(method_name, *args, &block)
  if @extractor.evaluating_extractor_definition
    literal_args = args.collect { |arg| [:lit, arg] }
    @modifier_calls << [method_name, [:array, *literal_args]]
  end

  requested = method_name.to_s
  if requested == 'select_indices'
    @result_indexer = Scrubyt::ResultIndexer.new(*args)
    return self
  elsif requested =~ /^ensure_/
    @constraints << Scrubyt::ConstraintAdder.send(method_name, *args)
    return self
  end

  matching_child = @children.find { |child| child.name == requested }
  return matching_child if matching_child

  raise NoMethodError.new(requested, requested, args)
end
235
+
236
# Evaluates this pattern (and, recursively, its children) on +source+.
#
# +filter_indices+ selects which filters to run; nil runs all of them.
# Results are de-duplicated, filtered by the constraints and by the result
# indexer (if any), then wrapped in ResultNode objects that receive the
# results of the child patterns.
#
# Returns the result nodes for :model patterns. For :page_list patterns
# the nodes are handed to the extractor and [] is returned. Any other
# output type yields nil.
def evaluate(source, filter_indices)
  # DIRTY! detail pages are delegated entirely to the first filter
  return @filters[0].evaluate(source) if type == :detail_page

  # nil means "run every filter"
  indices_to_evaluate = filter_indices.nil? ? (0...@filters.size) : filter_indices
  all_filter_results = []
  # remembers which filters returned a given result
  indices_mapping = {}
  indices_to_evaluate.each do |filter_index|
    @filters[filter_index].evaluate(source).each do |result|
      all_filter_results << result unless all_filter_results.include?(result)
      (indices_mapping[result] ||= []) << filter_index
    end
  end

  # keep only results accepted by every constraint
  unless @constraints.empty?
    all_filter_results = all_filter_results.select do |result|
      @constraints.all? { |constraint| constraint.check(result) }
    end
  end
  unless @result_indexer.nil?
    all_filter_results = @result_indexer.select_indices_to_extract(all_filter_results)
  end

  result_nodes = []
  all_filter_results.each do |result|
    node = ResultNode.new(@name, result, @options)
    node.generated_by_leaf = @children.empty?

    @children.each do |child|
      raise if filter_count != 1 && child.filter_count != filter_count
      # with a single filter every child filter applies; otherwise only the
      # child filters matching the filters that produced this result
      child_filter_indices = filter_count == 1 ? nil : indices_mapping[result]
      node.push(*child.evaluate(result, child_filter_indices))
    end

    # child constraints (ensure_presence_of_pattern): every required name
    # must occur somewhere in the generated result subtree
    still_missing = @constraints.select { |c|
      c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
    }.map { |c| c.target }
    unless still_missing.empty?
      tick_off = lambda { |visited|
        still_missing.delete visited.name
        visited.each { |descendant| tick_off.call descendant }
      }
      tick_off.call node
    end

    result_nodes << node if still_missing.empty?
  end

  # fall back to the default value when nothing was extracted
  if result_nodes.empty? && @options[:default]
    result_nodes << ResultNode.new(@name, @options[:default], @options)
  end

  case output_type
  when :model
    return result_nodes
  when :page_list
    result_nodes.each { |result_node| @extractor.add_to_next_page_list result_node }
    return []
  end
end
311
+
312
+ private
313
# Merges +hash+ into the pattern options, then validates every given key
# and the resulting pattern, output and example types.
# Raises a RuntimeError naming the offending value if validation fails.
def parse_options_hash(hash)
  @options.merge!(hash)
  hash.each_key { |key| check_option(key.to_sym) }
  raise "Invalid pattern type: #{type.to_s}" unless VALID_PATTERN_TYPES.include?(type.to_sym)
  raise "Invalid output type: #{output_type.to_s}" unless VALID_OUTPUT_TYPES.include?(output_type.to_sym)
  raise "Invalid example type: #{example_type.to_s}" unless VALID_PATTERN_EXAMPLE_TYPES.include?(example_type.to_sym)
end
322
+
323
# Raises a RuntimeError unless +option+ is listed in VALID_OPTIONS.
def check_option(option)
  raise "Unknown pattern option: #{option.to_s}" unless VALID_OPTIONS.include?(option)
end
326
+
327
# Extracts the examples from the pattern arguments. The class of the first
# argument decides what counts as an example:
#
# * String - every String argument
# * Regexp - every Regexp argument; the pattern type becomes :regexp
# * Hash   - every hash recognised by CompoundExample.compound_example?
#            (nil if there is none)
# * Proc   - the proc itself
#
# Returns the array of examples, or nil when there are none, and remembers
# in @has_examples whether any were found.
def look_for_examples(args)
  examples =
    case args[0]
    when String
      collect_leading_examples(args, String)
    when Regexp
      @options[:type] = :regexp
      collect_leading_examples(args, Regexp)
    when Hash
      compound = args.select { |e| e.is_a?(Hash) && CompoundExample.compound_example?(e) }
      compound.empty? ? nil : compound
    when Proc
      [args[0]]
    end

  @has_examples = !examples.nil?
  examples
end

# Selects all arguments of class +klass+ and complains (on stdout) about
# every other argument found among the first examples.size arguments,
# i.e. when the examples are not all in front.
def collect_leading_examples(args, klass)
  examples = args.select { |e| e.is_a?(klass) }
  # exclusive range: look at exactly as many leading arguments as there are
  # examples, so a trailing options hash is not reported
  args[0...examples.size].each do |candidate|
    puts 'FATAL: Problem with example specification' unless candidate.is_a?(klass)
  end
  examples
end
357
+
358
+ end #end of class Pattern
359
+ end #end of module Scrubyt
@@ -0,0 +1,14 @@
1
module Scrubyt
  ##
  #=<tt>Apply different functions on the input document</tt>
  #Before the document is passed to Hpricot for parsing, we may need
  #to do things with it which are clumsy/not appropriate/impossible
  #to do once the document is loaded.
  class PreFilterDocument
    # U+00A0 (no-break space) in UTF-8, built with pack so the source file
    # stays plain ASCII.
    NBSP = [0xA0].pack('U')

    #Replaces <br>, <br/> and <br /> tags (any letter case) with "\r\n"
    #and no-break spaces with ordinary spaces.
    #
    #Returns a new string; +doc+ itself is not modified.
    #
    #NOTE(review): the no-break space is matched in its UTF-8 form, so
    #+doc+ is assumed to be UTF-8 (or plain ASCII) - confirm for pages
    #delivered in other encodings.
    def self.br_to_newline(doc)
      # String#gsub returns a new string, so its result has to be used
      without_breaks = doc.gsub(/<br[ \/]*>/i, "\r\n")
      without_breaks.gsub(NBSP, ' ')
    end #end of function br_to_newline
  end #end of class PreFilterDocument
end #end of module Scrubyt
@@ -0,0 +1,90 @@
1
module Scrubyt
  ##
  #=<tt>Selecting results based on indices</tt>
  #
  #If the result is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
  #probably with a variable count of results (like tags, authors etc.), you may need just
  #specific elements - like the last one, every third one, or those at specific indices.
  #In this case you should use the select_indices syntax.
  class ResultIndexer
    attr_reader :indices_to_extract

    #Accepts the same arguments as select_indices; only the first one is used.
    def initialize(*args)
      select_indices(*args)
    end

    ##
    #Keeps only the elements of +ary+ whose position was selected.
    #
    #+ary+ is filtered in place and returned. It is returned untouched when
    #no valid index specification was given. Elements are selected by
    #position, so equal values at different positions are handled
    #independently. Indices outside of +ary+ are ignored.
    def select_indices_to_extract(ary)
      return ary if @indices_to_extract.nil?

      positions = []
      @indices_to_extract.each do |spec|
        if spec.is_a?(Symbol)
          positions.concat(positions_for_keyword(spec, ary.size))
        else
          positions << spec
        end
      end

      kept = []
      ary.each_with_index { |element, index| kept << element if positions.include?(index) }
      ary.replace(kept)
    end

    private
    ##
    #Do not return the whole result set, just specified indices - like
    #first, last, every odd index, indices from [1..3] etc.
    #
    #The first argument can be:
    #- a range, like (2..3)
    #- an array of indices, like [1,2,3]; it may contain ranges, arrays
    #  (one level deep) and the keywords below
    #- one of the keywords :first, :last, :all_but_first, :all_but_last,
    #  :every_even, :every_odd, :every_second, :every_third, :every_fourth
    #
    #Anything else is reported on stdout and leaves the selection unset,
    #which makes select_indices_to_extract return everything.
    def select_indices(*args)
      indices_to_grab = args[0]
      case indices_to_grab
      when Range
        @indices_to_extract = indices_to_grab.to_a
      when Array
        @indices_to_extract = flatten_index_spec(indices_to_grab)
      when Symbol
        #parse this when we already have the results
        @indices_to_extract = [indices_to_grab]
      else
        puts "Invalid index specification"
      end
    end #end of function select_indices

    ##
    #Expands the ranges and arrays nested in +spec+ into a flat list without
    #modifying +spec+ itself. Plain elements come first, in their original
    #order; nested elements follow unless they are already present.
    def flatten_index_spec(spec)
      nested, flat = spec.partition { |e| e.is_a?(Array) || e.is_a?(Range) }
      nested.each do |group|
        group.to_a.each do |e|
          next if e.is_a?(Array) || e.is_a?(Range) #only one level of nesting
          flat << e unless flat.include?(e)
        end
      end
      flat
    end

    ##
    #Positions (0-based) selected by +keyword+ in a result list of +size+
    #elements. :every_even / :every_odd count the results from 1, so
    #:every_odd starts with the first result. Unknown keywords select nothing.
    def positions_for_keyword(keyword, size)
      all = (0...size)
      case keyword
      when :first         then [0]
      when :last          then [size - 1]
      when :all_but_last  then (0...(size - 1)).to_a
      when :all_but_first then (1...size).to_a
      when :every_even    then all.select { |i| i % 2 == 1 }
      when :every_odd, :every_second
        all.select { |i| i % 2 == 0 }
      when :every_third   then all.select { |i| i % 3 == 0 }
      when :every_fourth  then all.select { |i| i % 4 == 0 }
      else []
      end
    end
  end #end of class ResultIndexer
end #end of module Scrubyt
@@ -0,0 +1,167 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Driving the whole extraction process</tt>
4
+ #
5
+ #Extractor is a performer class - it gets an extractor definition and carries
6
+ #out the actions and evaluates the wrappers sequentially.
7
+ #
8
+ #Originally also the navigation actions were here, but since the class got too
9
+ #big, they were factored out to an own class, NavigationAction.
10
+ class Extractor
11
+ include FetchAction
12
+
13
+ attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url
14
+
15
#The definition of the extractor is passed through this method.
#
#+mode+ is handed on to the new extractor. If it is a hash with
#<tt>:agent => :firefox</tt>, the Firewatir navigation is mixed into
#FetchAction; in every other case the Mechanize navigation is.
#
#Returns the result of the extraction.
def self.define(mode=nil, &extractor_definition)
  use_firefox = mode.is_a?(Hash) && mode[:agent] == :firefox
  if use_firefox
    FetchAction.class_eval { include Navigation::Firewatir }
  else
    FetchAction.class_eval { include Navigation::Mechanize }
  end

  self.new(mode, extractor_definition).result
end
35
+
36
# Reads +filename+, evaluates its content as Ruby code and runs the
# resulting proc as an extractor definition (see define).
#
# SECURITY: the file content is passed to eval and therefore runs with
# the full privileges of the process - only load trusted files.
def self.load(filename)
  # File.read instead of IO.read: IO.read treats a name starting with "|"
  # as a command to run
  define(&eval(File.read(filename)))
end
39
+
40
# Evaluates +extractor_definition+ and runs the extraction right away.
#
# The definition is instance_eval'ed in a fresh context object that offers
# the navigation actions, +next_page+, and a method_missing turning every
# other call into a root pattern. The outcome is stored in @result.
# Exits the process if the definition did not create any root pattern.
def initialize(mode, extractor_definition)
  @mode = mode
  @root_patterns = []
  @next_page_pattern = nil
  # @hpricot_doc = nil
  # @hpricot_doc_url = nil
  @evaluating_extractor_definition = false
  @next_page_list = []
  @processed_pages = []

  # file part (text before the first ':') of the second backtrace entry
  source_file = SharedUtils.get_backtrace[1].split(':')[0]

  Scrubyt.log :MODE, (mode == :production ? 'Production' : 'Learning')

  @evaluating_extractor_definition = true
  definition_context = Object.new
  definition_context.extend NavigationActions

  def definition_context.extractor=(value)
    @extractor = value
  end

  def definition_context.next_page(*args)
    @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
  end

  # every unknown call inside the definition creates a root pattern
  def definition_context.method_missing(method_name, *args, &block)
    root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
    @extractor.root_patterns << root_pattern
    root_pattern
  end

  definition_context.extractor = self
  definition_context.instance_eval(&extractor_definition)
  @evaluating_extractor_definition = false

  if @root_patterns.empty?
    # TODO: this should be an exception
    Scrubyt.log :ERROR, 'No extractor defined, exiting...'
    exit
  end

  #Once all is set up, evaluate the extractor from the root pattern!
  root_results = evaluate_extractor

  @result = ScrubytResult.new('root')
  @result.push(*root_results)
  @result.root_patterns = @root_patterns
  @result.source_file = source_file
  @result.source_proc = extractor_definition

  Scrubyt.log :INFO, 'Extraction finished succesfully!'
end
96
+
97
# The document currently held by FetchAction.
def get_hpricot_doc
  FetchAction.get_hpricot_doc
end

# URL of the document currently held by FetchAction.
def get_current_doc_url
  FetchAction.get_current_doc_url
end

# Value of @detail_pattern_relations.
# NOTE(review): not assigned anywhere in this class - presumably set elsewhere, confirm.
def get_detail_pattern_relations
  @detail_pattern_relations
end

# The mode this extractor was created with.
def get_mode
  @mode
end

# Value of @original_host_name.
# NOTE(review): not assigned anywhere in this class - presumably set by the
# included navigation code, confirm.
def get_original_host_name
  @original_host_name
end
116
+
117
# Queues the page referenced by +result_node+ for crawling.
#
# For an Hpricot element the href of the nearest node having one is used
# (with "&amp;" unescaped); a String result is taken as the URL itself.
# Results that yield no URL are ignored, so nil never enters the queue.
def add_to_next_page_list(result_node)
  result = result_node.result
  href = nil
  if result.is_a? Hpricot::Elem
    node = XPathUtils.find_nearest_node_with_attribute(result, 'href')
    return if node.nil? || node.attributes['href'].nil?
    href = node.attributes['href'].gsub('&amp;') {'&'}
  elsif result.is_a? String
    href = result
  end
  return if href.nil?

  url = href #TODO need absolute address here 1/4
  @next_page_list << url
end
128
+
129
+ def evaluate_extractor
130
+ root_results = []
131
+ current_page_count = 1
132
+ catch :quit_next_page_loop do
133
+ loop do
134
+ url = get_current_doc_url #TODO need absolute address here 2/4
135
+ @processed_pages << url
136
+ @root_patterns.each do |root_pattern|
137
+ root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
138
+ end
139
+
140
+ while @processed_pages.include? url #TODO need absolute address here 3/4
141
+ if !@next_page_pattern.nil?
142
+ throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
143
+ throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
144
+ xpath = @next_page_pattern.filters[0].xpath
145
+ node = (get_hpricot_doc/xpath).map.last
146
+ node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
147
+ throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
148
+ href = node.attributes['href'].gsub('&amp;') {'&'}
149
+ throw :quit_next_page_loop if href == nil
150
+ url = href #TODO need absolute address here 4/4
151
+ else
152
+ throw :quit_next_page_loop if @next_page_list.empty?
153
+ url = @next_page_list.pop
154
+ end
155
+ end
156
+
157
+ restore_host_name
158
+ FetchAction.fetch(url)
159
+
160
+ current_page_count += 1
161
+ end
162
+ end
163
+ root_results
164
+ end
165
+
166
+ end
167
+ end