scrubyt 0.3.0 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,17 +14,25 @@ module Scrubyt
14
14
 
15
15
  private
16
16
  def download_file(source)
17
- host_name = @parent_pattern.evaluation_context.extractor.get_host_name
17
+ return '' if source.size < 4
18
+ host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
18
19
  outfile = nil
20
+ host_name += "/" if host_name[-1..-1] != "/"
19
21
  base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
20
- return '' if source.size < 4
21
22
  file_name = source.scan(/.+\/(.*)/)[0][0]
23
+ return nil if @parent_pattern.except.include? file_name
22
24
  Net::HTTP.start(base_url) { |http|
23
- puts "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
24
- resp = http.get(source.scan(/\s*(.+)/)[0][0])
25
- outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
26
- FileUtils.mkdir_p @example
27
- open(outfile, 'wb') {|f| f.write(resp.body) }
25
+ Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
26
+ begin
27
+ ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
28
+ path = host_name.scan(/http:\/\/#{base_url}(.+)\//)
29
+ resp = http.get(path, {'User-Agent'=> ua})
30
+ outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
31
+ FileUtils.mkdir_p @example
32
+ open(outfile, 'wb') {|f| f.write(resp.body) }
33
+ rescue Timeout::Error
34
+ outfile = "[FAILED]#{file_name}"
35
+ end
28
36
  }
29
37
  outfile.scan(/.+\/(.*)/)[0][0]
30
38
  end
@@ -34,11 +42,21 @@ private
34
42
  loop do
35
43
  if File.exists? file_name
36
44
  if already_found
37
- last_no = file_name.scan(/_(\d+)\./)[0][0]
38
- file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
45
+ if file_name.include?('.')
46
+ last_no = file_name.scan(/_(\d+)\./)[0][0]
47
+ file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
48
+ else
49
+ last_no = file_name.scan(/_(\d+)$/)[0][0]
50
+ file_name.sub!(/_#{last_no}$/) {"_#{(last_no.to_i+1).to_s}"}
51
+ end
39
52
  else
40
- file_name.sub!(/\./) {"_1\."}
41
- already_found = true
53
+ if file_name.include?('.')
54
+ file_name.sub!(/\./) {"_1\."}
55
+ already_found = true
56
+ else
57
+ file_name << '_1'
58
+ already_found = true
59
+ end
42
60
  end
43
61
  else
44
62
  break
@@ -1,17 +1,17 @@
1
1
  module Scrubyt
2
2
  class RegexpFilter < BaseFilter
3
-
3
+
4
4
  def evaluate(source)
5
5
  if source.is_a? String
6
6
  source.scan(@example).flatten
7
7
  else
8
- source.inner_text.scan(@example).flatten
9
- end
8
+ source.inner_html.gsub(/<.*?>/, '').scan(@example).flatten
9
+ end
10
10
  end
11
-
11
+
12
12
  def to_sexp
13
13
  [:lit, @example]
14
14
  end
15
-
15
+
16
16
  end #End of class TreeFilter
17
17
  end #End of module Scrubyt
@@ -0,0 +1,14 @@
1
+ module Scrubyt
2
+ class ScriptFilter < BaseFilter
3
+
4
+ def evaluate(source)
5
+ param = source
6
+ param = source.inner_html.gsub(/<.*?>/, "") unless source.is_a? String
7
+ @example.call param
8
+ end
9
+
10
+ def to_sexp
11
+ [:str, "FIXME!!! Can't dump Proc"]
12
+ end #end of method to_sexp
13
+ end #End of class ConstantFilter
14
+ end #End of module Scrubyt
@@ -0,0 +1,38 @@
1
+ module Scrubyt
2
+ class TextFilter < BaseFilter
3
+
4
+ def evaluate(source)
5
+ return find_string(source) if @example =~ /^find\(/
6
+ final_element_name = @example.scan(/^(.+?)\[/)[0][0]
7
+ text = Regexp.escape(@example.scan(/\[(.+?)\]/)[0][0])
8
+
9
+ index = @example.scan(/\]:(.+)/).flatten
10
+ index = 0 if index.empty?
11
+ index = index[0].to_i unless index[0] == "all"
12
+
13
+ result = (index.is_a? Fixnum) ? (SharedUtils.traverse_for_match(source,/#{text}/)[index]) : (SharedUtils.traverse_for_match(source,/#{text}/))
14
+ return "" unless result
15
+
16
+ if index[0] == "all"
17
+ result.inject([]) {|a,r| a << XPathUtils.traverse_up_until_name(r,final_element_name); a}
18
+ else
19
+ [XPathUtils.traverse_up_until_name(result,final_element_name)]
20
+ end
21
+ end
22
+
23
+ def find_string(source)
24
+ str = @example.scan(/find\((.+)\)/).flatten[0]
25
+ strings_to_find = str.include? ('|') ? str.split('|') : [str]
26
+ strings_to_find.each do |s|
27
+ result = SharedUtils.traverse_for_match(source,/#{s}/i)
28
+ return [s] unless result.empty?
29
+ end
30
+ return []
31
+ end
32
+
33
+ def to_sexp
34
+ [:str, @example]
35
+ end #end of method to_sexp
36
+ end #End of class TextFilter
37
+ end #End of module Scrubyt
38
+
@@ -38,7 +38,7 @@ module Scrubyt
38
38
  return if @temp_sink.is_a? String
39
39
  return if @example =~ /.+\[.+\]$/
40
40
 
41
- text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_text)
41
+ text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_html.gsub(/<.*?>/, ''))
42
42
  match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
43
43
  return if match_range == (0..text.length)
44
44
 
@@ -64,7 +64,7 @@ module Scrubyt
64
64
  when EXAMPLE_TYPE_XPATH
65
65
  @xpath = @example
66
66
  when EXAMPLE_TYPE_STRING
67
- @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
67
+ @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
68
68
  @example,
69
69
  next_page_example)
70
70
  return if @temp_sink == nil
@@ -116,10 +116,10 @@ module Scrubyt
116
116
  current_example_index += 1
117
117
  end
118
118
  when EXAMPLE_TYPE_IMAGE
119
- @temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.extractor.get_hpricot_doc, @example)
119
+ @temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
120
120
  @xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
121
121
  when EXAMPLE_TYPE_COMPOUND
122
- @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
122
+ @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
123
123
  @example,
124
124
  next_page_example)
125
125
  @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
@@ -128,7 +128,7 @@ module Scrubyt
128
128
  end
129
129
 
130
130
  def generate_relative_XPath(parent_xpath)
131
- parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
131
+ parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
132
132
  parent_xpath,
133
133
  @parent_pattern.parent.generalize) if parent_xpath =~ /(\[@.+=.+\])$/
134
134
  @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
@@ -33,7 +33,7 @@ module Scrubyt
33
33
  # # write out the HTML subtree beginning at the matched element
34
34
  # PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
35
35
 
36
- VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree]
36
+ VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
37
37
 
38
38
  #The pattern can be either a model pattern (in this case it is
39
39
  #written to the output) or a temp pattern (in this case it is skipped)
@@ -46,27 +46,25 @@ module Scrubyt
46
46
  # #of the pattrern which was skipped
47
47
  # OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
48
48
 
49
- VALID_OUTPUT_TYPES = [:model, :temp]
49
+ VALID_OUTPUT_TYPES = [:model, :temp, :page_list]
50
50
 
51
51
  #These options can be set upon wrapper creation
52
- PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve]
52
+ PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve, :except, :example_type]
53
53
  VALID_OPTIONS = PATTERN_OPTIONS + Scrubyt::CompoundExample::DESCRIPTORS + Scrubyt::ResultNode::OUTPUT_OPTIONS
54
54
 
55
- attr_accessor(:name, :options, :children, :constraints, :filters, :parent,
56
- :last_result, :evaluation_context,
57
- :indices_to_extract, :referenced_extractor, :referenced_pattern,
58
- :source_file, :source_proc, :modifier_calls)
55
+ attr_accessor(:name, :options, :children, :constraints, :filters, :parent, :extractor,
56
+ :indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls)
59
57
 
60
58
  attr_reader(:next_page_url, :result_indexer)
61
59
 
62
60
  option_reader(:type => :tree, :output_type => :model, :generalize => false,
63
61
  :write_text => lambda { @children.size == 0 }, :limit => nil,
64
- :default => nil, :resolve => :full)
62
+ :default => nil, :resolve => :full, :except => nil, :example_type => nil)
65
63
 
66
- def initialize(name, args=[], evaluation_context=nil, parent=nil, &block)
64
+ def initialize(name, args=[], extractor=nil, parent=nil, &block)
67
65
  #init attributes
68
66
  @name = name
69
- @evaluation_context = evaluation_context
67
+ @extractor = extractor
70
68
  @parent = parent
71
69
  @options = {}
72
70
  @children = []
@@ -83,6 +81,7 @@ module Scrubyt
83
81
  #perform checks for special cases
84
82
  examples = check_if_shortcut_pattern() if examples == nil
85
83
  check_if_detail_page(block)
84
+ @options[:output_type] = :page_list if name == 'page_list'
86
85
 
87
86
  #create filters
88
87
  if examples == nil
@@ -97,7 +96,7 @@ module Scrubyt
97
96
  #@generalize was not set up explicitly
98
97
  if @options[:generalize].nil?
99
98
  @options[:generalize] = true if parent.nil?
100
- @options[:generalize] = false if filters[0].example =~ /.+\[[a-zA-Z].+\]$/
99
+ @options[:generalize] = false if ((filters[0].example.is_a? String) && (filters[0].example =~ /.+\[[a-zA-Z].+\]$/))
101
100
  end
102
101
 
103
102
  #parse child patterns if available
@@ -160,7 +159,6 @@ module Scrubyt
160
159
  if @name =~ /.+_detail/
161
160
  @options[:type] = :detail_page
162
161
  @referenced_extractor = block
163
- Scrubyt::Extractor.add_detail_extractor_to_pattern_name(block, self)
164
162
  end
165
163
  end
166
164
 
@@ -194,7 +192,7 @@ module Scrubyt
194
192
  end
195
193
  else
196
194
  #create child pattern
197
- child = Scrubyt::Pattern.new(method_name.to_s, args, @current.evaluation_context, @current, &block)
195
+ child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
198
196
  @current.children << child
199
197
  child
200
198
  end
@@ -213,7 +211,7 @@ module Scrubyt
213
211
  #
214
212
  # camera_data.item[1].item_name[0]
215
213
  def method_missing(method_name, *args, &block)
216
- if @evaluation_context.evaluating_extractor_definition
214
+ if @extractor.evaluating_extractor_definition
217
215
  @modifier_calls << [method_name, [:array, *args.collect { |arg| [:lit, arg] }]]
218
216
  end
219
217
 
@@ -294,9 +292,17 @@ module Scrubyt
294
292
  result_nodes << node
295
293
  end
296
294
  if result_nodes.empty?
297
- result_nodes << ResultNode.new(@name,@options[:default],@options) if @options[:default]
295
+ result_nodes << ResultNode.new(@name, @options[:default], @options) if @options[:default]
296
+ end
297
+ case output_type
298
+ when :model
299
+ return result_nodes
300
+ when :page_list
301
+ result_nodes.each do |result_node|
302
+ @extractor.add_to_next_page_list result_node
303
+ end
304
+ return []
298
305
  end
299
- result_nodes
300
306
  end
301
307
 
302
308
  def to_sexp
@@ -310,8 +316,7 @@ module Scrubyt
310
316
 
311
317
  if type == :detail_page
312
318
  #add detail page extractor
313
- detail_root = @evaluation_context.extractor.get_detail_extractor(self)
314
- sexp = [:iter, sexp, nil, [:block, detail_root.to_sexp]]
319
+ sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
315
320
  else
316
321
  #add child block if the pattern has children
317
322
  sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
@@ -363,6 +368,8 @@ module Scrubyt
363
368
  elsif (args[0].is_a? Hash)
364
369
  examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
365
370
  examples = nil if examples == []
371
+ elsif (args[0].is_a? Proc)
372
+ examples = [args[0]]
366
373
  end
367
374
 
368
375
  @has_examples = !examples.nil?
@@ -7,166 +7,147 @@ module Scrubyt
7
7
  #
8
8
  #Originally also the navigation actions were here, but since the class got too
9
9
  #big, they were factored out to an own class, NavigationAction.
10
- class Extractor
10
+ class Extractor
11
+ include FetchAction
12
+
13
+ attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url
14
+
11
15
  #The definition of the extractor is passed through this method
12
16
  def self.define(mode=nil, &extractor_definition)
17
+ extractor = self.new(mode, extractor_definition)
18
+ extractor.result
19
+ end
20
+
21
+ def self.load(filename)
22
+ define(&eval(IO.read(filename)))
23
+ end
24
+
25
+ def initialize(mode, extractor_definition)
26
+ @mode = mode
27
+ @root_patterns = []
28
+ @next_page_pattern = nil
29
+ # @hpricot_doc = nil
30
+ # @hpricot_doc_url = nil
31
+ @evaluating_extractor_definition = false
32
+ @next_page_list = []
33
+ @processed_pages = []
34
+
13
35
  backtrace = SharedUtils.get_backtrace
14
36
  parts = backtrace[1].split(':')
15
37
  source_file = parts[0]
16
38
 
17
- @@mode = mode
18
- #We are keeping the relations between the detail patterns and their root patterns
19
- @@detail_extractor_to_pattern_name = {}
20
- @@detail_pattern_relations = {}
21
- #root pattern -> URIBuilder mapping
22
- @@next_patterns = {}
23
- mode_name = (mode == :production ? 'Production' : 'Learning')
39
+ Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
24
40
 
25
- Scrubyt.log :MODE, mode_name
26
-
27
- @@evaluation_context = EvaluationContext.new
28
- #Hack up an artificial root pattern (i.e. do not return the pattern which
29
- #is the root one in the user's definition, but rather the real (invisible)
30
- #root pattern
31
- @@evaluation_context.evaluating_extractor_definition = true
32
- class_eval(&extractor_definition)
33
- @@evaluation_context.evaluating_extractor_definition = false
34
- root_pattern = @@evaluation_context.root_pattern
35
-
36
- if root_pattern.nil?
41
+ @evaluating_extractor_definition = true
42
+ context = Object.new
43
+ context.extend NavigationActions
44
+ context.instance_eval do
45
+ def extractor=(value)
46
+ @extractor = value
47
+ end
48
+
49
+ def next_page(*args)
50
+ @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
51
+ end
52
+
53
+ def method_missing(method_name, *args, &block)
54
+ root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
55
+ @extractor.root_patterns << root_pattern
56
+ root_pattern
57
+ end
58
+ end
59
+ context.extractor = self
60
+ context.instance_eval(&extractor_definition)
61
+ @evaluating_extractor_definition = false
62
+
63
+ if @root_patterns.empty?
37
64
  # TODO: this should be an exception
38
65
  Scrubyt.log :ERROR, 'No extractor defined, exiting...'
39
66
  exit
40
67
  end
41
-
42
- root_pattern.source_file = source_file
43
- root_pattern.source_proc = extractor_definition
68
+
44
69
  #Once all is set up, evaluate the extractor from the root pattern!
45
- root_results = evaluate_extractor(root_pattern)
46
-
47
- scrubyt_result = ScrubytResult.new('root')
48
- scrubyt_result.push(*root_results)
49
- scrubyt_result.root_pattern = root_pattern
50
-
70
+ root_results = evaluate_extractor
71
+
72
+ @result = ScrubytResult.new('root')
73
+ @result.push(*root_results)
74
+ @result.root_patterns = @root_patterns
75
+ @result.source_file = source_file
76
+ @result.source_proc = extractor_definition
77
+
51
78
  #Return the root pattern
52
79
  Scrubyt.log :INFO, 'Extraction finished succesfully!'
53
- scrubyt_result
54
- end
55
-
56
- #Evaluate a subexttractor (i.e. an extractor on a detail page).
57
- #The url passed to this function is automatically loaded.
58
- #The definition of the subextractor is passed as a block
59
- #
60
- #!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
61
- def self.evaluate_subextractor(url, parent_pattern, resolve)
62
- if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
63
- detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
64
- detail_root.last_result = nil
65
- FetchAction.store_page
66
- @@original_evaluation_context.push @@evaluation_context
67
- @@host_stack.push FetchAction.get_host_name
68
- @@evaluation_context = EvaluationContext.new
69
- FetchAction.restore_host_name
70
- fetch url, :resolve => resolve
71
- @@evaluation_context.extractor = self
72
- @@evaluation_context.root_pattern = detail_root
73
- root_results = evaluate_extractor detail_root
74
- @@evaluation_context = @@original_evaluation_context.pop
75
- FetchAction.restore_page
76
- FetchAction.store_host_name(@@host_stack.pop)
77
- root_results
78
- else
79
- @@original_evaluation_context ||= []
80
- @@host_stack ||= []
81
- FetchAction.store_page
82
- @@original_evaluation_context.push @@evaluation_context
83
- @@host_stack.push FetchAction.get_host_name
84
- @@evaluation_context = EvaluationContext.new
85
- FetchAction.restore_host_name
86
- fetch url, :resolve => resolve
87
- class_eval(&parent_pattern.referenced_extractor)
88
- root_pattern = @@evaluation_context.root_pattern
89
- @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern
90
- root_results = evaluate_extractor(root_pattern)
91
- @@evaluation_context = @@original_evaluation_context.pop
92
- FetchAction.restore_page
93
- FetchAction.store_host_name(@@host_stack.pop)
94
- root_results
95
- end
96
- end
97
-
98
- #build the current wrapper
99
- def self.method_missing(method_name, *args, &block)
100
- if NavigationActions::KEYWORDS.include? method_name.to_s
101
- NavigationActions.send(method_name, *args)
102
- return
103
- end
104
-
105
- if method_name.to_s == 'next_page'
106
- pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context)
107
- pattern.evaluation_context = @@evaluation_context
108
-
109
- @@evaluation_context.setup_uri_builder(pattern, args)
110
- @@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
111
- else
112
- raise "Only one root pattern allowed" if !@@evaluation_context.root_pattern.nil?
113
- #Create a root pattern
114
- @@evaluation_context.extractor = self
115
- root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context, root_pattern, &block)
116
- @@last_root_pattern = root_pattern
117
- @@evaluation_context.root_pattern = root_pattern
118
- root_pattern
119
- end
120
80
  end
121
81
 
122
- def self.add_detail_extractor_to_pattern_name(referenced_extractor, pattern)
123
- @@detail_extractor_to_pattern_name[referenced_extractor] ||= [] << pattern
124
- end
125
-
126
- def self.get_detail_extractor(parent_pattern)
127
- @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
128
- end
129
-
130
- def self.get_hpricot_doc
131
- NavigationActions.get_hpricot_doc
82
+ def get_hpricot_doc
83
+ FetchAction.get_hpricot_doc
132
84
  end
133
85
 
134
- def self.get_current_doc_url
135
- NavigationActions.get_current_doc_url
86
+ def get_current_doc_url
87
+ FetchAction.get_current_doc_url
136
88
  end
137
89
 
138
- def self.get_detail_pattern_relations
139
- @@detail_pattern_relations
90
+ def get_detail_pattern_relations
91
+ @detail_pattern_relations
140
92
  end
141
93
 
142
- def self.get_host_name
143
- NavigationActions.get_host_name
94
+ def get_mode
95
+ @mode
144
96
  end
145
97
 
146
- def self.get_mode
147
- @@mode
98
+ def get_original_host_name
99
+ @original_host_name
148
100
  end
149
101
 
150
- def self.get_original_host_name
151
- @@original_host_name
102
+ def add_to_next_page_list(result_node)
103
+ if result_node.result.is_a? Hpricot::Elem
104
+ node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
105
+ return if node == nil || node.attributes['href'] == nil
106
+ href = node.attributes['href'].gsub('&amp;') {'&'}
107
+ elsif result_node.result.is_a? String
108
+ href = result_node.result
109
+ end
110
+ url = href #TODO need absolute address here 1/4
111
+ @next_page_list << url
152
112
  end
153
113
 
154
- private
155
-
156
- def self.evaluate_extractor(root_pattern)
114
+ def evaluate_extractor
157
115
  root_results = []
158
- if @@next_patterns[root_pattern]
159
- current_page_count = 1
116
+ current_page_count = 1
117
+ catch :quit_next_page_loop do
160
118
  loop do
161
- root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
162
- break if (@@next_patterns[root_pattern].limit == current_page_count || !@@evaluation_context.crawl_to_new_page(@@next_patterns[root_pattern]))
163
- current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
119
+ url = get_current_doc_url #TODO need absolute address here 2/4
120
+ puts url
121
+ @processed_pages << url
122
+ @root_patterns.each do |root_pattern|
123
+ root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
124
+ end
125
+
126
+ while @processed_pages.include? url #TODO need absolute address here 3/4
127
+ if !@next_page_pattern.nil?
128
+ throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
129
+ throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
130
+ xpath = @next_page_pattern.filters[0].xpath
131
+ node = (get_hpricot_doc/xpath).map.last
132
+ node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
133
+ throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
134
+ href = node.attributes['href'].gsub('&amp;') {'&'}
135
+ throw :quit_next_page_loop if href == nil
136
+ url = href #TODO need absolute address here 4/4
137
+ else
138
+ throw :quit_next_page_loop if @next_page_list.empty?
139
+ url = @next_page_list.pop
140
+ end
141
+ end
142
+
143
+ restore_host_name
144
+ FetchAction.fetch(url)
145
+
146
+ current_page_count += 1
164
147
  end
165
- else
166
- root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
167
148
  end
168
149
  root_results
169
150
  end
170
151
 
171
- end #end of class Extractor
172
- end #end of module Scrubyt
152
+ end
153
+ end