scrubyt 0.3.0 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,17 +14,25 @@ module Scrubyt
14
14
 
15
15
  private
16
16
  def download_file(source)
17
- host_name = @parent_pattern.evaluation_context.extractor.get_host_name
17
+ return '' if source.size < 4
18
+ host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
18
19
  outfile = nil
20
+ host_name += "/" if host_name[-1..-1] != "/"
19
21
  base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
20
- return '' if source.size < 4
21
22
  file_name = source.scan(/.+\/(.*)/)[0][0]
23
+ return nil if @parent_pattern.except.include? file_name
22
24
  Net::HTTP.start(base_url) { |http|
23
- puts "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
24
- resp = http.get(source.scan(/\s*(.+)/)[0][0])
25
- outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
26
- FileUtils.mkdir_p @example
27
- open(outfile, 'wb') {|f| f.write(resp.body) }
25
+ Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
26
+ begin
27
+ ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
28
+ path = host_name.scan(/http:\/\/#{base_url}(.+)\//)
29
+ resp = http.get(path, {'User-Agent'=> ua})
30
+ outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
31
+ FileUtils.mkdir_p @example
32
+ open(outfile, 'wb') {|f| f.write(resp.body) }
33
+ rescue Timeout::Error
34
+ outfile = "[FAILED]#{file_name}"
35
+ end
28
36
  }
29
37
  outfile.scan(/.+\/(.*)/)[0][0]
30
38
  end
@@ -34,11 +42,21 @@ private
34
42
  loop do
35
43
  if File.exists? file_name
36
44
  if already_found
37
- last_no = file_name.scan(/_(\d+)\./)[0][0]
38
- file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
45
+ if file_name.include?('.')
46
+ last_no = file_name.scan(/_(\d+)\./)[0][0]
47
+ file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
48
+ else
49
+ last_no = file_name.scan(/_(\d+)$/)[0][0]
50
+ file_name.sub!(/_#{last_no}$/) {"_#{(last_no.to_i+1).to_s}"}
51
+ end
39
52
  else
40
- file_name.sub!(/\./) {"_1\."}
41
- already_found = true
53
+ if file_name.include?('.')
54
+ file_name.sub!(/\./) {"_1\."}
55
+ already_found = true
56
+ else
57
+ file_name << '_1'
58
+ already_found = true
59
+ end
42
60
  end
43
61
  else
44
62
  break
@@ -1,17 +1,17 @@
1
1
  module Scrubyt
2
2
  class RegexpFilter < BaseFilter
3
-
3
+
4
4
  def evaluate(source)
5
5
  if source.is_a? String
6
6
  source.scan(@example).flatten
7
7
  else
8
- source.inner_text.scan(@example).flatten
9
- end
8
+ source.inner_html.gsub(/<.*?>/, '').scan(@example).flatten
9
+ end
10
10
  end
11
-
11
+
12
12
  def to_sexp
13
13
  [:lit, @example]
14
14
  end
15
-
15
+
16
16
  end #End of class TreeFilter
17
17
  end #End of module Scrubyt
@@ -0,0 +1,14 @@
module Scrubyt
  # Filter whose example is a callable object rather than a textual pattern.
  # The source handed to the filter is reduced to plain text and passed to
  # that callable; whatever the callable returns is the filter's result.
  class ScriptFilter < BaseFilter

    # Runs the stored callable against +source+.
    #
    # source - either a String, which is passed through unchanged, or an
    #          object responding to +inner_html+ (presumably a parsed HTML
    #          node - confirm against callers), whose markup tags are
    #          stripped with a non-greedy <...> regexp first.
    #
    # Returns the value produced by @example.call.
    def evaluate(source)
      text = source.is_a?(String) ? source : source.inner_html.gsub(/<.*?>/, "")
      @example.call(text)
    end

    # Serialises the filter as an s-expression. The callable itself is not
    # dumped; a placeholder string literal is emitted in its place.
    def to_sexp
      [:str, "FIXME!!! Can't dump Proc"]
    end
  end # End of class ScriptFilter
end # End of module Scrubyt
@@ -0,0 +1,38 @@
module Scrubyt
  # Filter driven by a textual example. Two example forms are recognised:
  #
  #   "find(foo|bar)"    - report the first of the listed strings that
  #                        occurs in the source (see #find_string)
  #   "name[some text]"  - optionally followed by ":<n>" or ":all"; look up
  #                        the text in the source and resolve each hit
  #                        through XPathUtils.traverse_up_until_name with
  #                        "name" as the target element name
  class TextFilter < BaseFilter

    # Evaluates the example against +source+.
    #
    # source - the object searched by SharedUtils.traverse_for_match
    #          (presumably a parsed HTML node - confirm against callers).
    #
    # Returns an Array of resolved elements (one element for an indexed
    # lookup, one per hit for ":all"), the result of #find_string for a
    # "find(...)" example, or the empty String "" when the requested hit
    # does not exist. The "" return value is kept as-is because callers may
    # depend on it.
    def evaluate(source)
      return find_string(source) if @example =~ /^find\(/

      final_element_name = @example.scan(/^(.+?)\[/)[0][0]
      # The bracketed text is matched literally, hence the escaping.
      text = Regexp.escape(@example.scan(/\[(.+?)\]/)[0][0])

      # Optional suffix after the closing bracket: nil (absent), "all",
      # or a number selecting one hit.
      selector = @example.scan(/\]:(.+)/).flatten[0]
      matches = SharedUtils.traverse_for_match(source, /#{text}/)

      if selector == "all"
        return "" unless matches
        matches.map { |node| XPathUtils.traverse_up_until_name(node, final_element_name) }
      else
        # A missing selector means the first hit; a non-numeric selector
        # also ends up as 0 because String#to_i yields 0 for it.
        index = selector ? selector.to_i : 0
        match = matches[index]
        return "" unless match
        [XPathUtils.traverse_up_until_name(match, final_element_name)]
      end
    end

    # Handles the "find(a|b|c)" example form.
    #
    # The alternatives are tried in the order given; each one is
    # interpolated into a case-insensitive regexp without escaping, so
    # regexp metacharacters in an alternative are interpreted as such.
    #
    # Returns a one-element Array holding the first alternative for which
    # SharedUtils.traverse_for_match reports a non-empty result, or [] when
    # none matches or the parentheses are empty.
    def find_string(source)
      str = @example.scan(/find\((.+)\)/).flatten[0]
      return [] if str.nil?

      # String#split already yields [str] when no '|' is present, so no
      # separate include? test is needed.
      str.split('|').each do |candidate|
        result = SharedUtils.traverse_for_match(source, /#{candidate}/i)
        return [candidate] unless result.empty?
      end
      []
    end

    # Serialises the filter as an s-expression carrying the raw example.
    def to_sexp
      [:str, @example]
    end
  end # End of class TextFilter
end # End of module Scrubyt
38
+
@@ -38,7 +38,7 @@ module Scrubyt
38
38
  return if @temp_sink.is_a? String
39
39
  return if @example =~ /.+\[.+\]$/
40
40
 
41
- text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_text)
41
+ text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_html.gsub(/<.*?>/, ''))
42
42
  match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
43
43
  return if match_range == (0..text.length)
44
44
 
@@ -64,7 +64,7 @@ module Scrubyt
64
64
  when EXAMPLE_TYPE_XPATH
65
65
  @xpath = @example
66
66
  when EXAMPLE_TYPE_STRING
67
- @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
67
+ @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
68
68
  @example,
69
69
  next_page_example)
70
70
  return if @temp_sink == nil
@@ -116,10 +116,10 @@ module Scrubyt
116
116
  current_example_index += 1
117
117
  end
118
118
  when EXAMPLE_TYPE_IMAGE
119
- @temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.extractor.get_hpricot_doc, @example)
119
+ @temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
120
120
  @xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
121
121
  when EXAMPLE_TYPE_COMPOUND
122
- @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
122
+ @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
123
123
  @example,
124
124
  next_page_example)
125
125
  @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
@@ -128,7 +128,7 @@ module Scrubyt
128
128
  end
129
129
 
130
130
  def generate_relative_XPath(parent_xpath)
131
- parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
131
+ parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
132
132
  parent_xpath,
133
133
  @parent_pattern.parent.generalize) if parent_xpath =~ /(\[@.+=.+\])$/
134
134
  @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
@@ -33,7 +33,7 @@ module Scrubyt
33
33
  # # write out the HTML subtree beginning at the matched element
34
34
  # PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
35
35
 
36
- VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree]
36
+ VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
37
37
 
38
38
  #The pattern can be either a model pattern (in this case it is
39
39
  #written to the output) or a temp pattern (in this case it is skipped)
@@ -46,27 +46,25 @@ module Scrubyt
46
46
  # #of the pattrern which was skipped
47
47
  # OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
48
48
 
49
- VALID_OUTPUT_TYPES = [:model, :temp]
49
+ VALID_OUTPUT_TYPES = [:model, :temp, :page_list]
50
50
 
51
51
  #These options can be set upon wrapper creation
52
- PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve]
52
+ PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve, :except, :example_type]
53
53
  VALID_OPTIONS = PATTERN_OPTIONS + Scrubyt::CompoundExample::DESCRIPTORS + Scrubyt::ResultNode::OUTPUT_OPTIONS
54
54
 
55
- attr_accessor(:name, :options, :children, :constraints, :filters, :parent,
56
- :last_result, :evaluation_context,
57
- :indices_to_extract, :referenced_extractor, :referenced_pattern,
58
- :source_file, :source_proc, :modifier_calls)
55
+ attr_accessor(:name, :options, :children, :constraints, :filters, :parent, :extractor,
56
+ :indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls)
59
57
 
60
58
  attr_reader(:next_page_url, :result_indexer)
61
59
 
62
60
  option_reader(:type => :tree, :output_type => :model, :generalize => false,
63
61
  :write_text => lambda { @children.size == 0 }, :limit => nil,
64
- :default => nil, :resolve => :full)
62
+ :default => nil, :resolve => :full, :except => nil, :example_type => nil)
65
63
 
66
- def initialize(name, args=[], evaluation_context=nil, parent=nil, &block)
64
+ def initialize(name, args=[], extractor=nil, parent=nil, &block)
67
65
  #init attributes
68
66
  @name = name
69
- @evaluation_context = evaluation_context
67
+ @extractor = extractor
70
68
  @parent = parent
71
69
  @options = {}
72
70
  @children = []
@@ -83,6 +81,7 @@ module Scrubyt
83
81
  #perform checks for special cases
84
82
  examples = check_if_shortcut_pattern() if examples == nil
85
83
  check_if_detail_page(block)
84
+ @options[:output_type] = :page_list if name == 'page_list'
86
85
 
87
86
  #create filters
88
87
  if examples == nil
@@ -97,7 +96,7 @@ module Scrubyt
97
96
  #@generalize was not set up explicitly
98
97
  if @options[:generalize].nil?
99
98
  @options[:generalize] = true if parent.nil?
100
- @options[:generalize] = false if filters[0].example =~ /.+\[[a-zA-Z].+\]$/
99
+ @options[:generalize] = false if ((filters[0].example.is_a? String) && (filters[0].example =~ /.+\[[a-zA-Z].+\]$/))
101
100
  end
102
101
 
103
102
  #parse child patterns if available
@@ -160,7 +159,6 @@ module Scrubyt
160
159
  if @name =~ /.+_detail/
161
160
  @options[:type] = :detail_page
162
161
  @referenced_extractor = block
163
- Scrubyt::Extractor.add_detail_extractor_to_pattern_name(block, self)
164
162
  end
165
163
  end
166
164
 
@@ -194,7 +192,7 @@ module Scrubyt
194
192
  end
195
193
  else
196
194
  #create child pattern
197
- child = Scrubyt::Pattern.new(method_name.to_s, args, @current.evaluation_context, @current, &block)
195
+ child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
198
196
  @current.children << child
199
197
  child
200
198
  end
@@ -213,7 +211,7 @@ module Scrubyt
213
211
  #
214
212
  # camera_data.item[1].item_name[0]
215
213
  def method_missing(method_name, *args, &block)
216
- if @evaluation_context.evaluating_extractor_definition
214
+ if @extractor.evaluating_extractor_definition
217
215
  @modifier_calls << [method_name, [:array, *args.collect { |arg| [:lit, arg] }]]
218
216
  end
219
217
 
@@ -294,9 +292,17 @@ module Scrubyt
294
292
  result_nodes << node
295
293
  end
296
294
  if result_nodes.empty?
297
- result_nodes << ResultNode.new(@name,@options[:default],@options) if @options[:default]
295
+ result_nodes << ResultNode.new(@name, @options[:default], @options) if @options[:default]
296
+ end
297
+ case output_type
298
+ when :model
299
+ return result_nodes
300
+ when :page_list
301
+ result_nodes.each do |result_node|
302
+ @extractor.add_to_next_page_list result_node
303
+ end
304
+ return []
298
305
  end
299
- result_nodes
300
306
  end
301
307
 
302
308
  def to_sexp
@@ -310,8 +316,7 @@ module Scrubyt
310
316
 
311
317
  if type == :detail_page
312
318
  #add detail page extractor
313
- detail_root = @evaluation_context.extractor.get_detail_extractor(self)
314
- sexp = [:iter, sexp, nil, [:block, detail_root.to_sexp]]
319
+ sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
315
320
  else
316
321
  #add child block if the pattern has children
317
322
  sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
@@ -363,6 +368,8 @@ module Scrubyt
363
368
  elsif (args[0].is_a? Hash)
364
369
  examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
365
370
  examples = nil if examples == []
371
+ elsif (args[0].is_a? Proc)
372
+ examples = [args[0]]
366
373
  end
367
374
 
368
375
  @has_examples = !examples.nil?
@@ -7,166 +7,147 @@ module Scrubyt
7
7
  #
8
8
  #Originally also the navigation actions were here, but since the class got too
9
9
  #big, they were factored out to an own class, NavigationAction.
10
- class Extractor
10
+ class Extractor
11
+ include FetchAction
12
+
13
+ attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url
14
+
11
15
  #The definition of the extractor is passed through this method
12
16
  def self.define(mode=nil, &extractor_definition)
17
+ extractor = self.new(mode, extractor_definition)
18
+ extractor.result
19
+ end
20
+
21
+ def self.load(filename)
22
+ define(&eval(IO.read(filename)))
23
+ end
24
+
25
+ def initialize(mode, extractor_definition)
26
+ @mode = mode
27
+ @root_patterns = []
28
+ @next_page_pattern = nil
29
+ # @hpricot_doc = nil
30
+ # @hpricot_doc_url = nil
31
+ @evaluating_extractor_definition = false
32
+ @next_page_list = []
33
+ @processed_pages = []
34
+
13
35
  backtrace = SharedUtils.get_backtrace
14
36
  parts = backtrace[1].split(':')
15
37
  source_file = parts[0]
16
38
 
17
- @@mode = mode
18
- #We are keeping the relations between the detail patterns and their root patterns
19
- @@detail_extractor_to_pattern_name = {}
20
- @@detail_pattern_relations = {}
21
- #root pattern -> URIBuilder mapping
22
- @@next_patterns = {}
23
- mode_name = (mode == :production ? 'Production' : 'Learning')
39
+ Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
24
40
 
25
- Scrubyt.log :MODE, mode_name
26
-
27
- @@evaluation_context = EvaluationContext.new
28
- #Hack up an artificial root pattern (i.e. do not return the pattern which
29
- #is the root one in the user's definition, but rather the real (invisible)
30
- #root pattern
31
- @@evaluation_context.evaluating_extractor_definition = true
32
- class_eval(&extractor_definition)
33
- @@evaluation_context.evaluating_extractor_definition = false
34
- root_pattern = @@evaluation_context.root_pattern
35
-
36
- if root_pattern.nil?
41
+ @evaluating_extractor_definition = true
42
+ context = Object.new
43
+ context.extend NavigationActions
44
+ context.instance_eval do
45
+ def extractor=(value)
46
+ @extractor = value
47
+ end
48
+
49
+ def next_page(*args)
50
+ @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
51
+ end
52
+
53
+ def method_missing(method_name, *args, &block)
54
+ root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
55
+ @extractor.root_patterns << root_pattern
56
+ root_pattern
57
+ end
58
+ end
59
+ context.extractor = self
60
+ context.instance_eval(&extractor_definition)
61
+ @evaluating_extractor_definition = false
62
+
63
+ if @root_patterns.empty?
37
64
  # TODO: this should be an exception
38
65
  Scrubyt.log :ERROR, 'No extractor defined, exiting...'
39
66
  exit
40
67
  end
41
-
42
- root_pattern.source_file = source_file
43
- root_pattern.source_proc = extractor_definition
68
+
44
69
  #Once all is set up, evaluate the extractor from the root pattern!
45
- root_results = evaluate_extractor(root_pattern)
46
-
47
- scrubyt_result = ScrubytResult.new('root')
48
- scrubyt_result.push(*root_results)
49
- scrubyt_result.root_pattern = root_pattern
50
-
70
+ root_results = evaluate_extractor
71
+
72
+ @result = ScrubytResult.new('root')
73
+ @result.push(*root_results)
74
+ @result.root_patterns = @root_patterns
75
+ @result.source_file = source_file
76
+ @result.source_proc = extractor_definition
77
+
51
78
  #Return the root pattern
52
79
  Scrubyt.log :INFO, 'Extraction finished succesfully!'
53
- scrubyt_result
54
- end
55
-
56
- #Evaluate a subexttractor (i.e. an extractor on a detail page).
57
- #The url passed to this function is automatically loaded.
58
- #The definition of the subextractor is passed as a block
59
- #
60
- #!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
61
- def self.evaluate_subextractor(url, parent_pattern, resolve)
62
- if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
63
- detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
64
- detail_root.last_result = nil
65
- FetchAction.store_page
66
- @@original_evaluation_context.push @@evaluation_context
67
- @@host_stack.push FetchAction.get_host_name
68
- @@evaluation_context = EvaluationContext.new
69
- FetchAction.restore_host_name
70
- fetch url, :resolve => resolve
71
- @@evaluation_context.extractor = self
72
- @@evaluation_context.root_pattern = detail_root
73
- root_results = evaluate_extractor detail_root
74
- @@evaluation_context = @@original_evaluation_context.pop
75
- FetchAction.restore_page
76
- FetchAction.store_host_name(@@host_stack.pop)
77
- root_results
78
- else
79
- @@original_evaluation_context ||= []
80
- @@host_stack ||= []
81
- FetchAction.store_page
82
- @@original_evaluation_context.push @@evaluation_context
83
- @@host_stack.push FetchAction.get_host_name
84
- @@evaluation_context = EvaluationContext.new
85
- FetchAction.restore_host_name
86
- fetch url, :resolve => resolve
87
- class_eval(&parent_pattern.referenced_extractor)
88
- root_pattern = @@evaluation_context.root_pattern
89
- @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern
90
- root_results = evaluate_extractor(root_pattern)
91
- @@evaluation_context = @@original_evaluation_context.pop
92
- FetchAction.restore_page
93
- FetchAction.store_host_name(@@host_stack.pop)
94
- root_results
95
- end
96
- end
97
-
98
- #build the current wrapper
99
- def self.method_missing(method_name, *args, &block)
100
- if NavigationActions::KEYWORDS.include? method_name.to_s
101
- NavigationActions.send(method_name, *args)
102
- return
103
- end
104
-
105
- if method_name.to_s == 'next_page'
106
- pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context)
107
- pattern.evaluation_context = @@evaluation_context
108
-
109
- @@evaluation_context.setup_uri_builder(pattern, args)
110
- @@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
111
- else
112
- raise "Only one root pattern allowed" if !@@evaluation_context.root_pattern.nil?
113
- #Create a root pattern
114
- @@evaluation_context.extractor = self
115
- root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context, root_pattern, &block)
116
- @@last_root_pattern = root_pattern
117
- @@evaluation_context.root_pattern = root_pattern
118
- root_pattern
119
- end
120
80
  end
121
81
 
122
- def self.add_detail_extractor_to_pattern_name(referenced_extractor, pattern)
123
- @@detail_extractor_to_pattern_name[referenced_extractor] ||= [] << pattern
124
- end
125
-
126
- def self.get_detail_extractor(parent_pattern)
127
- @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
128
- end
129
-
130
- def self.get_hpricot_doc
131
- NavigationActions.get_hpricot_doc
82
+ def get_hpricot_doc
83
+ FetchAction.get_hpricot_doc
132
84
  end
133
85
 
134
- def self.get_current_doc_url
135
- NavigationActions.get_current_doc_url
86
+ def get_current_doc_url
87
+ FetchAction.get_current_doc_url
136
88
  end
137
89
 
138
- def self.get_detail_pattern_relations
139
- @@detail_pattern_relations
90
+ def get_detail_pattern_relations
91
+ @detail_pattern_relations
140
92
  end
141
93
 
142
- def self.get_host_name
143
- NavigationActions.get_host_name
94
+ def get_mode
95
+ @mode
144
96
  end
145
97
 
146
- def self.get_mode
147
- @@mode
98
+ def get_original_host_name
99
+ @original_host_name
148
100
  end
149
101
 
150
- def self.get_original_host_name
151
- @@original_host_name
102
+ def add_to_next_page_list(result_node)
103
+ if result_node.result.is_a? Hpricot::Elem
104
+ node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
105
+ return if node == nil || node.attributes['href'] == nil
106
+ href = node.attributes['href'].gsub('&amp;') {'&'}
107
+ elsif result_node.result.is_a? String
108
+ href = result_node.result
109
+ end
110
+ url = href #TODO need absolute address here 1/4
111
+ @next_page_list << url
152
112
  end
153
113
 
154
- private
155
-
156
- def self.evaluate_extractor(root_pattern)
114
+ def evaluate_extractor
157
115
  root_results = []
158
- if @@next_patterns[root_pattern]
159
- current_page_count = 1
116
+ current_page_count = 1
117
+ catch :quit_next_page_loop do
160
118
  loop do
161
- root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
162
- break if (@@next_patterns[root_pattern].limit == current_page_count || !@@evaluation_context.crawl_to_new_page(@@next_patterns[root_pattern]))
163
- current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
119
+ url = get_current_doc_url #TODO need absolute address here 2/4
120
+ puts url
121
+ @processed_pages << url
122
+ @root_patterns.each do |root_pattern|
123
+ root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
124
+ end
125
+
126
+ while @processed_pages.include? url #TODO need absolute address here 3/4
127
+ if !@next_page_pattern.nil?
128
+ throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
129
+ throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
130
+ xpath = @next_page_pattern.filters[0].xpath
131
+ node = (get_hpricot_doc/xpath).map.last
132
+ node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
133
+ throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
134
+ href = node.attributes['href'].gsub('&amp;') {'&'}
135
+ throw :quit_next_page_loop if href == nil
136
+ url = href #TODO need absolute address here 4/4
137
+ else
138
+ throw :quit_next_page_loop if @next_page_list.empty?
139
+ url = @next_page_list.pop
140
+ end
141
+ end
142
+
143
+ restore_host_name
144
+ FetchAction.fetch(url)
145
+
146
+ current_page_count += 1
164
147
  end
165
- else
166
- root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
167
148
  end
168
149
  root_results
169
150
  end
170
151
 
171
- end #end of class Extractor
172
- end #end of module Scrubyt
152
+ end
153
+ end