scrubyt 0.2.6 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +59 -12
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +24 -6
- data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
- data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
- data/lib/scrubyt/core/scraping/constraint.rb +53 -57
- data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
- data/lib/scrubyt/core/scraping/pattern.rb +292 -157
- data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
- data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
- data/lib/scrubyt/core/shared/extractor.rb +122 -163
- data/lib/scrubyt/output/export.rb +59 -174
- data/lib/scrubyt/output/post_processor.rb +4 -3
- data/lib/scrubyt/output/result.rb +8 -9
- data/lib/scrubyt/output/result_dumper.rb +81 -42
- data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
- data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
- data/lib/scrubyt/utils/shared_utils.rb +39 -26
- data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
- data/lib/scrubyt/utils/xpathutils.rb +31 -30
- data/test/unittests/constraint_test.rb +11 -7
- data/test/unittests/extractor_test.rb +6 -6
- data/test/unittests/filter_test.rb +66 -66
- metadata +22 -15
- data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -1,18 +1,18 @@
|
|
1
1
|
module Scrubyt
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
2
|
+
##
|
3
|
+
#=<tt>Selecting results based on indices</tt>
|
4
|
+
#
|
5
|
+
#If the result is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
|
6
|
+
#probably with a variable count of results (like tags, authors etc.), you may need just
|
7
|
+
#specific elements - like the last one, every third one, or at specific indices.
|
8
|
+
#In this case you should use the select_indices syntax.
|
9
9
|
class ResultIndexer
|
10
10
|
attr_reader :indices_to_extract
|
11
|
-
|
11
|
+
|
12
12
|
def initialize(*args)
|
13
13
|
select_indices(*args)
|
14
14
|
end
|
15
|
-
|
15
|
+
|
16
16
|
##
|
17
17
|
#Perform selection of the desired result instances, based on their indices
|
18
18
|
def select_indices_to_extract(ary)
|
@@ -21,24 +21,24 @@ module Scrubyt
|
|
21
21
|
@indices_to_extract.each {|e|
|
22
22
|
if e.is_a? Symbol
|
23
23
|
case e
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
24
|
+
when :first
|
25
|
+
to_keep << 0
|
26
|
+
when :last
|
27
|
+
to_keep << ary.size-1
|
28
|
+
when :all_but_last
|
29
|
+
(0..ary.size-2).each {|i| to_keep << i}
|
30
|
+
when :all_but_first
|
31
|
+
(1..ary.size-1).each {|i| to_keep << i}
|
32
|
+
when :every_even
|
33
|
+
(0..ary.size).each {|i| to_keep << i if (i % 2 == 1)}
|
34
|
+
when :every_odd
|
35
|
+
(0..ary.size).each {|i| to_keep << i if (i % 2 == 0)}
|
36
|
+
when :every_second
|
37
|
+
(0..ary.size).each {|i| to_keep << i if (i % 2 == 0)}
|
38
|
+
when :every_third
|
39
|
+
(0..ary.size).each {|i| to_keep << i if (i % 3 == 0)}
|
40
|
+
when :every_fourth
|
41
|
+
(0..ary.size).each {|i| to_keep << i if (i % 4 == 0)}
|
42
42
|
end
|
43
43
|
end
|
44
44
|
}
|
@@ -48,7 +48,11 @@ module Scrubyt
|
|
48
48
|
ary
|
49
49
|
end
|
50
50
|
|
51
|
-
|
51
|
+
# def to_sexp
|
52
|
+
# [:array, *@indices_to_extract.collect { |index| [:lit, index] }]
|
53
|
+
# end
|
54
|
+
|
55
|
+
private
|
52
56
|
##
|
53
57
|
#Do not return the whole result set, just specified indices - like
|
54
58
|
#first, last, every odd index, indices from [1..3] etc.
|
@@ -65,25 +69,25 @@ private
|
|
65
69
|
def select_indices(*args)
|
66
70
|
indices_to_grab = args[0]
|
67
71
|
case indices_to_grab.class.to_s
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
72
|
+
when "Range"
|
73
|
+
@indices_to_extract = indices_to_grab.to_a
|
74
|
+
when "Array"
|
75
|
+
nested_arrays = []
|
76
|
+
indices_to_grab.each {|e|
|
77
|
+
if e.is_a? Array
|
78
|
+
nested_arrays << e
|
79
|
+
elsif e.is_a? Range
|
80
|
+
nested_arrays << e.to_a
|
81
|
+
end
|
82
|
+
}
|
83
|
+
@indices_to_extract = indices_to_grab
|
84
|
+
nested_arrays.each {|a| a.each {|e| @indices_to_extract << e if !@indices_to_extract.include? e }}
|
85
|
+
@indices_to_extract.reject! {|e| ((e.is_a? Range) || (e.is_a? Array)) }
|
86
|
+
when "Symbol"
|
87
|
+
#parse this when we already have the results
|
88
|
+
@indices_to_extract = [indices_to_grab]
|
89
|
+
else
|
90
|
+
puts "Invalid index specification"
|
87
91
|
end
|
88
92
|
end #end of function select_indices
|
89
93
|
end #end of class ResultIndexer
|
@@ -13,15 +13,14 @@ module Scrubyt
|
|
13
13
|
#two classes need to communicate frequently as well as share different information
|
14
14
|
#and this is accomplished through EvaluationContext.
|
15
15
|
class EvaluationContext
|
16
|
-
attr_accessor :root_pattern, :document_index, :
|
17
|
-
:extractor, :uri_builder
|
16
|
+
attr_accessor :root_pattern, :document_index, :extractor, :uri_builder, :evaluating_extractor_definition
|
18
17
|
|
19
18
|
def initialize
|
20
19
|
@root_pattern = nil
|
21
20
|
@next_page = nil
|
22
|
-
@block_count = 0
|
23
21
|
@document_index = 0
|
24
22
|
@extractor = nil
|
23
|
+
@evaluating_extractor_definition = false
|
25
24
|
end
|
26
25
|
|
27
26
|
##
|
@@ -33,6 +32,7 @@ module Scrubyt
|
|
33
32
|
uri_builder.generate_next_uri
|
34
33
|
return nil if temp_document == nil
|
35
34
|
clear_sources_and_sinks(@root_pattern)
|
35
|
+
FetchAction.restore_host_name
|
36
36
|
@extractor.fetch(temp_document)
|
37
37
|
attach_current_document
|
38
38
|
end
|
@@ -49,15 +49,6 @@ module Scrubyt
|
|
49
49
|
@root_pattern.result.add_result(@root_pattern.filters[0].source,
|
50
50
|
@root_pattern.filters[0].sink)
|
51
51
|
end
|
52
|
-
|
53
|
-
##
|
54
|
-
#Based on the given examples, calculate the XPaths for the tree patterns
|
55
|
-
def setup_examples
|
56
|
-
get_root_pattern(nil)
|
57
|
-
mark_leaf_parents(@root_pattern)
|
58
|
-
generate_examples(@root_pattern)
|
59
|
-
check_for_multipe_examples(@root_pattern)
|
60
|
-
end
|
61
52
|
|
62
53
|
##
|
63
54
|
#After crawling to the new page, the sources and sinks need to be cleaned
|
@@ -85,35 +76,5 @@ module Scrubyt
|
|
85
76
|
end
|
86
77
|
@uri_builder = URIBuilder.new(pattern,args)
|
87
78
|
end
|
88
|
-
|
89
|
-
def get_root_pattern(pattern)
|
90
|
-
if @root_pattern == nil
|
91
|
-
while (pattern.parent != nil)
|
92
|
-
get_root_pattern(pattern.parent)
|
93
|
-
end
|
94
|
-
@root_pattern = pattern
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
private
|
99
|
-
def mark_leaf_parents(pattern)
|
100
|
-
pattern.children.each { |child|
|
101
|
-
pattern.parent_of_leaf = true if child.children.size == 0
|
102
|
-
}
|
103
|
-
pattern.children.each { |child| mark_leaf_parents(child) }
|
104
|
-
end
|
105
|
-
|
106
|
-
##
|
107
|
-
#Check the tree and turn all the XPaths for the examples (but the topmost one)
|
108
|
-
#into relative ones
|
109
|
-
def check_for_multipe_examples(pattern)
|
110
|
-
pattern.children.each {|child_pattern| check_for_multipe_examples(child_pattern) }
|
111
|
-
pattern.filters.each { |filter| filter.setup_relative_XPaths } if pattern.type == Pattern::PATTERN_TYPE_TREE
|
112
|
-
end
|
113
|
-
|
114
|
-
def generate_examples(pattern)
|
115
|
-
pattern.children.each {|child_pattern| generate_examples(child_pattern) }
|
116
|
-
pattern.filters.each { |filter| filter.generate_XPath_for_example(false) } if pattern.type == Pattern::PATTERN_TYPE_TREE
|
117
|
-
end #end of function generate_examples
|
118
79
|
end #end of class EvaluationContext
|
119
80
|
end #end of module Scrubyt
|
@@ -1,20 +1,19 @@
|
|
1
|
-
require 'open-uri'
|
2
|
-
require 'rubygems'
|
3
|
-
require 'mechanize'
|
4
|
-
require 'hpricot'
|
5
|
-
|
6
1
|
module Scrubyt
|
7
|
-
##
|
8
|
-
#=<tt>Driving the whole extraction process</tt>
|
9
|
-
#
|
10
|
-
#Extractor is a performer class - it gets an extractor definition and carries
|
11
|
-
#out the actions and evaluates the wrappers sequentially.
|
12
|
-
#
|
13
|
-
#Originally also the navigation actions were here, but since the class got too
|
14
|
-
#big, they were factored out to an own class, NavigationAction.
|
15
|
-
class Extractor
|
2
|
+
##
|
3
|
+
#=<tt>Driving the whole extraction process</tt>
|
4
|
+
#
|
5
|
+
#Extractor is a performer class - it gets an extractor definition and carries
|
6
|
+
#out the actions and evaluates the wrappers sequentially.
|
7
|
+
#
|
8
|
+
#Originally also the navigation actions were here, but since the class got too
|
9
|
+
#big, they were factored out into their own class, NavigationActions.
|
10
|
+
class Extractor
|
16
11
|
#The definition of the extractor is passed through this method
|
17
12
|
def self.define(mode=nil, &extractor_definition)
|
13
|
+
backtrace = SharedUtils.get_backtrace
|
14
|
+
parts = backtrace[1].split(':')
|
15
|
+
source_file = parts[0]
|
16
|
+
|
18
17
|
@@mode = mode
|
19
18
|
#We are keeping the relations between the detail patterns and their root patterns
|
20
19
|
@@detail_extractor_to_pattern_name = {}
|
@@ -25,17 +24,19 @@ module Scrubyt
|
|
25
24
|
puts "[MODE] #{mode_name}"
|
26
25
|
NavigationActions.new
|
27
26
|
@@evaluation_context = EvaluationContext.new
|
28
|
-
#Hack up an artificial root pattern (i.e. do not return the pattern which
|
27
|
+
#Hack up an artificial root pattern (i.e. do not return the pattern which
|
29
28
|
#is the root one in the user's definition, but rather the real (invisible)
|
30
29
|
#root pattern)
|
31
|
-
|
32
|
-
|
30
|
+
@@evaluation_context.evaluating_extractor_definition = true
|
31
|
+
class_eval(&extractor_definition)
|
32
|
+
@@evaluation_context.evaluating_extractor_definition = false
|
33
|
+
root_pattern = @@evaluation_context.root_pattern
|
34
|
+
if root_pattern.nil?
|
33
35
|
puts "No extractor defined, exiting..."
|
34
36
|
exit
|
35
37
|
end
|
36
|
-
root_pattern =
|
37
|
-
|
38
|
-
@@evaluation_context.setup_examples
|
38
|
+
root_pattern.source_file = source_file
|
39
|
+
root_pattern.source_proc = extractor_definition
|
39
40
|
#Once all is set up, evaluate the extractor from the root pattern!
|
40
41
|
evaluate_extractor(root_pattern)
|
41
42
|
#Apply all postprocess steps
|
@@ -45,168 +46,126 @@ module Scrubyt
|
|
45
46
|
root_pattern
|
46
47
|
end
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
49
|
+
#Evaluate a subextractor (i.e. an extractor on a detail page).
|
50
|
+
#The url passed to this function is automatically loaded.
|
51
|
+
#The definition of the subextractor is passed as a block
|
52
|
+
#
|
53
|
+
#!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
|
54
|
+
def self.evaluate_subextractor(url, parent_pattern, resolve)
|
55
|
+
if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
|
56
|
+
detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]].parent
|
57
|
+
detail_root.result = Result.new
|
58
|
+
detail_root.last_result = nil
|
59
|
+
FetchAction.store_page
|
60
|
+
@@original_evaluation_context.push @@evaluation_context
|
61
|
+
@@host_stack.push FetchAction.get_host_name
|
62
|
+
@@evaluation_context = EvaluationContext.new
|
63
|
+
@@evaluation_context.clear_sources_and_sinks detail_root
|
64
|
+
FetchAction.restore_host_name
|
65
|
+
fetch url, :resolve => resolve
|
66
|
+
@@evaluation_context.extractor = self
|
67
|
+
@@evaluation_context.root_pattern = detail_root
|
68
|
+
@@evaluation_context.attach_current_document
|
69
|
+
evaluate_extractor detail_root
|
70
|
+
@@evaluation_context = @@original_evaluation_context.pop
|
71
|
+
FetchAction.restore_page
|
72
|
+
FetchAction.store_host_name(@@host_stack.pop)
|
73
|
+
detail_root.to_xml
|
74
|
+
else
|
75
|
+
@@original_evaluation_context ||= []
|
76
|
+
@@host_stack ||= []
|
77
|
+
FetchAction.store_page
|
78
|
+
@@original_evaluation_context.push @@evaluation_context
|
79
|
+
@@host_stack.push FetchAction.get_host_name
|
80
|
+
@@evaluation_context = EvaluationContext.new
|
81
|
+
FetchAction.restore_host_name
|
82
|
+
fetch url, :resolve => resolve
|
83
|
+
evaluated_extractor = (class_eval(&parent_pattern.referenced_extractor))
|
84
|
+
root_pattern = evaluated_extractor.parent
|
85
|
+
@@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern.children[0]
|
86
|
+
evaluate_extractor(root_pattern)
|
87
|
+
#Apply all postprocess steps
|
88
|
+
PostProcessor.apply_post_processing(root_pattern)
|
89
|
+
@@evaluation_context = @@original_evaluation_context.pop
|
90
|
+
FetchAction.restore_page
|
91
|
+
FetchAction.store_host_name(@@host_stack.pop)
|
92
|
+
root_pattern.to_xml
|
93
|
+
end
|
86
94
|
end
|
87
|
-
end
|
88
95
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
|
96
|
-
check_if_shortcut_pattern(pattern)
|
97
|
-
check_if_detail_page(pattern, args)
|
98
|
-
pattern.evaluation_context = @@evaluation_context
|
99
|
-
if @parent == nil
|
96
|
+
#build the current wrapper
|
97
|
+
def self.method_missing(method_name, *args, &block)
|
98
|
+
if NavigationActions::KEYWORDS.include? method_name.to_s
|
99
|
+
NavigationActions.send(method_name, *args)
|
100
|
+
return
|
101
|
+
end
|
100
102
|
if method_name.to_s == 'next_page'
|
103
|
+
pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context)
|
104
|
+
pattern.evaluation_context = @@evaluation_context
|
105
|
+
|
101
106
|
@@evaluation_context.setup_uri_builder(pattern, args)
|
102
107
|
@@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
|
103
|
-
p @@last_root_pattern.children[0].name
|
104
|
-
return @@last_pattern
|
105
108
|
else
|
109
|
+
raise "Only one root pattern allowed" if !@@evaluation_context.root_pattern.nil?
|
106
110
|
#Create a root pattern
|
107
|
-
root_pattern = Scrubyt::Pattern.new('root', :type => :root)
|
111
|
+
root_pattern = Scrubyt::Pattern.new('root', [:type => :root], @@evaluation_context)
|
108
112
|
@@last_root_pattern = root_pattern
|
109
|
-
root_pattern.evaluation_context = @@evaluation_context
|
110
113
|
@@evaluation_context.root_pattern = root_pattern
|
111
114
|
@@evaluation_context.extractor = self
|
112
|
-
#add the currently active document to the root pattern
|
115
|
+
#add the currently active document to the root pattern
|
113
116
|
@@evaluation_context.attach_current_document
|
114
|
-
@@evaluation_context
|
115
|
-
|
117
|
+
pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context, root_pattern, &block)
|
118
|
+
root_pattern.children << pattern
|
119
|
+
pattern
|
116
120
|
end
|
117
|
-
else
|
118
|
-
@parent.add_child_pattern(pattern) if @parent != nil
|
119
121
|
end
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
def self.check_if_shortcut_pattern(pattern)
|
152
|
-
case pattern.name
|
153
|
-
when /.+_url/
|
154
|
-
#make sure that we are not overriding the user's settings
|
155
|
-
if !pattern.examples
|
156
|
-
pattern.filters[0].example = 'href'
|
157
|
-
pattern.type = Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
|
158
|
-
end
|
122
|
+
|
123
|
+
def self.add_detail_extractor_to_pattern_name(referenced_extractor, pattern)
|
124
|
+
@@detail_extractor_to_pattern_name[referenced_extractor] ||= [] << pattern
|
125
|
+
end
|
126
|
+
|
127
|
+
def self.get_detail_extractor(parent_pattern)
|
128
|
+
@@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]].parent
|
129
|
+
end
|
130
|
+
|
131
|
+
def self.get_hpricot_doc
|
132
|
+
NavigationActions.get_hpricot_doc
|
133
|
+
end
|
134
|
+
|
135
|
+
def self.get_current_doc_url
|
136
|
+
NavigationActions.get_current_doc_url
|
137
|
+
end
|
138
|
+
|
139
|
+
def self.get_detail_pattern_relations
|
140
|
+
@@detail_pattern_relations
|
141
|
+
end
|
142
|
+
|
143
|
+
def self.get_host_name
|
144
|
+
NavigationActions.get_host_name
|
145
|
+
end
|
146
|
+
|
147
|
+
def self.get_mode
|
148
|
+
@@mode
|
149
|
+
end
|
150
|
+
|
151
|
+
def self.get_original_host_name
|
152
|
+
@@original_host_name
|
159
153
|
end
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
#a subextractor). Also check if the currently created pattern is
|
164
|
-
#an ancestor of a detail pattern , and store this in a hash if yes (to be able to
|
165
|
-
#traverse the pattern structure on detail pages as well).
|
166
|
-
def self.check_if_detail_page(pattern, args)
|
167
|
-
return if args.size == 0
|
168
|
-
return if !args[0].is_a? Hash
|
169
|
-
return if !args[0][:references]
|
170
|
-
referenced_extractor = args[0][:references]
|
171
|
-
pattern.type = Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
172
|
-
pattern.referenced_extractor = referenced_extractor
|
173
|
-
@@detail_extractor_to_pattern_name[referenced_extractor] ||= []
|
174
|
-
@@detail_extractor_to_pattern_name[referenced_extractor] = @@detail_extractor_to_pattern_name[referenced_extractor] << pattern
|
175
|
-
end
|
176
|
-
|
177
|
-
def self.get_hpricot_doc
|
178
|
-
NavigationActions.get_hpricot_doc
|
179
|
-
end
|
180
|
-
|
181
|
-
def self.get_current_doc_url
|
182
|
-
NavigationActions.get_current_doc_url
|
183
|
-
end
|
184
|
-
|
185
|
-
def self.get_detail_pattern_relations
|
186
|
-
@@detail_pattern_relations
|
187
|
-
end
|
188
|
-
|
189
|
-
def self.get_mode
|
190
|
-
@@mode
|
191
|
-
end
|
192
|
-
|
193
|
-
private
|
154
|
+
|
155
|
+
private
|
156
|
+
|
194
157
|
def self.evaluate_extractor(root_pattern)
|
195
158
|
if @@next_patterns[root_pattern]
|
196
159
|
current_page_count = 1
|
197
160
|
loop do
|
198
|
-
|
199
|
-
break if (@@next_patterns[root_pattern].limit == current_page_count ||
|
161
|
+
root_pattern.evaluate(nil)
|
162
|
+
break if (@@next_patterns[root_pattern].limit == current_page_count || !@@evaluation_context.crawl_to_new_page(root_pattern, @@next_patterns[root_pattern]))
|
200
163
|
current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
|
201
164
|
end
|
202
|
-
else
|
203
|
-
|
165
|
+
else
|
166
|
+
root_pattern.evaluate(nil)
|
204
167
|
end
|
205
|
-
end
|
206
|
-
|
207
|
-
def self.really_evaluate_extractor(pattern)
|
208
|
-
pattern.evaluate
|
209
|
-
pattern.children.each { |child| really_evaluate_extractor child }
|
210
|
-
end #end of method evaluate_wrapper
|
168
|
+
end
|
169
|
+
|
211
170
|
end #end of class Extractor
|
212
|
-
end #end of module Scrubyt
|
171
|
+
end #end of module Scrubyt
|