scrubyt 0.2.0 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +132 -1
- data/Rakefile +4 -2
- data/lib/scrubyt.rb +15 -10
- data/lib/scrubyt/core/navigation/fetch_action.rb +152 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +106 -0
- data/lib/scrubyt/{constraint.rb → core/scraping/constraint.rb} +0 -0
- data/lib/scrubyt/{constraint_adder.rb → core/scraping/constraint_adder.rb} +0 -0
- data/lib/scrubyt/{filter.rb → core/scraping/filter.rb} +22 -4
- data/lib/scrubyt/{pattern.rb → core/scraping/pattern.rb} +21 -98
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +13 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +88 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +97 -0
- data/lib/scrubyt/core/shared/extractor.rb +116 -0
- data/lib/scrubyt/{export.rb → output/export.rb} +14 -8
- data/lib/scrubyt/output/post_processor.rb +137 -0
- data/lib/scrubyt/{result.rb → output/result.rb} +0 -0
- data/lib/scrubyt/{result_dumper.rb → output/result_dumper.rb} +0 -7
- data/lib/scrubyt/{xpathutils.rb → utils/xpathutils.rb} +5 -2
- data/test/unittests/pattern_test.rb +27 -0
- metadata +40 -17
- data/lib/scrubyt/extractor.rb +0 -279
- data/lib/scrubyt/post_processor.rb +0 -73
File without changes
|
File without changes
|
@@ -69,12 +69,18 @@ module Scrubyt
|
|
69
69
|
#Evaluate this filter. This method should not be called directly - as the pattern hierarchy
|
70
70
|
#is evaluated, every pattern evaluates its filters and then they are calling this method
|
71
71
|
def evaluate(source)
|
72
|
-
case @parent_pattern.type
|
72
|
+
case @parent_pattern.type
|
73
73
|
when Scrubyt::Pattern::PATTERN_TYPE_TREE
|
74
74
|
result = source/@xpath
|
75
|
+
#puts "Evaluating #{@parent_pattern.name} with #{@xpath}"
|
75
76
|
result.class == Hpricot::Elements ? result.map : [result]
|
76
77
|
when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
|
77
|
-
[source.attributes[@example]]
|
78
|
+
attribute_value = [source.attributes[@example]]
|
79
|
+
return attribute_value if attribute_value[0]
|
80
|
+
@@attribute_in_parent = nil
|
81
|
+
Filter.traverse_up_until_attribute_found(source.parent, @example)
|
82
|
+
@@attribute_in_parent
|
83
|
+
|
78
84
|
when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
|
79
85
|
source.inner_text.scan(@example).flatten
|
80
86
|
end
|
@@ -88,7 +94,9 @@ module Scrubyt
|
|
88
94
|
when EXAMPLE_TYPE_XPATH
|
89
95
|
@xpath = @example
|
90
96
|
when EXAMPLE_TYPE_STRING
|
91
|
-
@temp_sink = XPathUtils.find_node_from_text( @parent_pattern.root_pattern.filters[0].source[0],
|
97
|
+
@temp_sink = XPathUtils.find_node_from_text( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
|
98
|
+
@example,
|
99
|
+
false )
|
92
100
|
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
93
101
|
XPathUtils.generate_XPath(@temp_sink, nil, true)
|
94
102
|
when EXAMPLE_TYPE_CHILDREN
|
@@ -127,7 +135,7 @@ module Scrubyt
|
|
127
135
|
current_example_index += 1
|
128
136
|
end
|
129
137
|
when EXAMPLE_TYPE_IMAGE
|
130
|
-
@temp_sink = XPathUtils.find_image(@parent_pattern.root_pattern.filters[0].source[0], @example)
|
138
|
+
@temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)
|
131
139
|
@xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
|
132
140
|
end
|
133
141
|
end
|
@@ -139,6 +147,16 @@ module Scrubyt
|
|
139
147
|
end
|
140
148
|
|
141
149
|
private
|
150
|
+
##
#Walk up the ancestor chain starting at +source+ and look for +attribute+.
#
#The value of the nearest node carrying the attribute is stored in the class
#variable @@attribute_in_parent, which is what the caller reads afterwards;
#the return value of this method itself is not meaningful.
#
#The previous version tested the (never assigned) @attribute_in_parent
#instead of @@attribute_in_parent, so the walk never stopped at the first
#match and the value of the nearest ancestor was overwritten by any outer
#ancestor that had the same attribute.
#
#source::    node to inspect first
#attribute:: name of the attribute to look up
def self.traverse_up_until_attribute_found(source, attribute)
  #Stop once the parent of the inspected node is the document itself
  return if source.parent.is_a? Hpricot::Doc
  found = source.attributes[attribute]
  if found
    #Nearest match wins - do not climb any further
    @@attribute_in_parent = found
  else
    traverse_up_until_attribute_found(source.parent, attribute)
  end
end
|
159
|
+
|
142
160
|
def self.determine_example_type(example)
|
143
161
|
if example.instance_of? Regexp
|
144
162
|
EXAMPLE_TYPE_REGEXP
|
@@ -41,23 +41,18 @@ module Scrubyt
|
|
41
41
|
SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'example']
|
42
42
|
|
43
43
|
attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
|
44
|
-
:last_result, :result, :
|
45
|
-
:
|
46
|
-
:
|
47
|
-
attr_reader :type, :generalize_set, :next_page_url
|
44
|
+
:last_result, :result, :example, :limit,
|
45
|
+
:examples, :parent_of_leaf, :evaluation_context,
|
46
|
+
:indices_to_extract, :evaluation_context
|
47
|
+
attr_reader :type, :generalize_set, :next_page_url, :result_indexer
|
48
48
|
|
49
49
|
def initialize (name, *args)
|
50
50
|
@name = name #name of the pattern
|
51
51
|
parse_args(args) #parse the rest of the arguments
|
52
|
-
@root_pattern = nil #root pattern of the wrapper
|
53
52
|
@children = [] #child patterns
|
54
53
|
@filters = [] #filters of the wrapper
|
55
54
|
@result = Result.new #hierarchical results of the pattern
|
56
|
-
|
57
|
-
@evaluated_examples = []
|
58
|
-
@next_page = nil
|
59
|
-
@document_index = 0
|
60
|
-
if @examples == nil
|
55
|
+
if @examples == nil
|
61
56
|
filters << Scrubyt::Filter.new(self) #create a default filter
|
62
57
|
else
|
63
58
|
@examples.each do |example|
|
@@ -112,6 +107,9 @@ module Scrubyt
|
|
112
107
|
# camera_data.item[1].item_name[0]
|
113
108
|
def method_missing(method_name, *args, &block)
|
114
109
|
case method_name.to_s
|
110
|
+
when 'select_indices'
|
111
|
+
@result_indexer = Scrubyt::ResultIndexer.new(*args)
|
112
|
+
self
|
115
113
|
when /^to_/
|
116
114
|
Scrubyt::ResultDumper.send(method_name.to_s, self)
|
117
115
|
when /^ensure_/
|
@@ -135,9 +133,9 @@ module Scrubyt
|
|
135
133
|
# camera_data[1].item[1].item_name[0]
|
136
134
|
def [](index)
|
137
135
|
if @name == 'root'
|
138
|
-
@
|
136
|
+
@evaluation_context.document_index = index
|
139
137
|
else
|
140
|
-
@parent.last_result = @parent.last_result[@
|
138
|
+
@parent.last_result = @parent.last_result[@evaluation_context.document_index] if @parent.last_result.is_a? Array
|
141
139
|
return nil if (@result.lookup(@parent.last_result)) == nil
|
142
140
|
@last_result = @result.lookup(@parent.last_result)[index]
|
143
141
|
end
|
@@ -150,7 +148,7 @@ module Scrubyt
|
|
150
148
|
def export(file, output_file_name=nil, extractor_result_file_name=nil)
|
151
149
|
Scrubyt::Export.export(file, self, output_file_name, extractor_result_file_name)
|
152
150
|
end
|
153
|
-
|
151
|
+
|
154
152
|
##
|
155
153
|
#Add a filter to this pattern
|
156
154
|
def add_filter(filter)
|
@@ -167,39 +165,7 @@ module Scrubyt
|
|
167
165
|
child.generalize = true if (!child.generalize_set && child.parent != nil && child.parent.parent == nil)
|
168
166
|
@children << child
|
169
167
|
end
|
170
|
-
|
171
|
-
##
|
172
|
-
#Crawl to a new page. This function should not be called from the outside - it is automatically called
|
173
|
-
#if the next_page is defined
|
174
|
-
def crawl_to_new_page
|
175
|
-
temp_document = generate_next_page_link(@next_page)
|
176
|
-
return nil if temp_document == nil
|
177
|
-
clear_sources_and_sinks(@root_pattern)
|
178
|
-
@root_pattern.extractor.fetch(temp_document)
|
179
|
-
attach_current_document
|
180
|
-
end
|
181
|
-
|
182
|
-
##
|
183
|
-
#Attach document to the root pattern; This is happening automatically as the root pattern is defined or
|
184
|
-
#crawling to a new page
|
185
|
-
def attach_current_document
|
186
|
-
doc = @root_pattern.extractor.get_hpricot_doc
|
187
|
-
filters[0].source << doc
|
188
|
-
filters[0].sink << doc
|
189
|
-
@last_result ||= []
|
190
|
-
@last_result << doc
|
191
|
-
@result.add_result(filters[0].source, filters[0].sink)
|
192
|
-
end
|
193
|
-
|
194
|
-
##
|
195
|
-
#Based on the given examples, calculate the XPaths for the tree patterns
|
196
|
-
def setup_examples
|
197
|
-
get_root_pattern(self)
|
198
|
-
mark_leaf_parents(self)
|
199
|
-
set_root_pattern_whole_wrapper(@root_pattern, @root_pattern)
|
200
|
-
generate_examples(@root_pattern)
|
201
|
-
end
|
202
|
-
|
168
|
+
|
203
169
|
##
|
204
170
|
#Evaluate the pattern. This means evaluating all the filters and adding
|
205
171
|
#their extracted instances to the array of results of this pattern
|
@@ -225,23 +191,21 @@ module Scrubyt
|
|
225
191
|
end
|
226
192
|
result = result_hash.reject {|k,v| k if !v}
|
227
193
|
sorted_result = r.reject {|e| !result.keys.include? e}
|
228
|
-
|
194
|
+
indexer = @result_indexer == nil ? sorted_result : @result_indexer.select_indices_to_extract(sorted_result)
|
195
|
+
add_result(filter, source, indexer)
|
229
196
|
else
|
230
|
-
|
197
|
+
indexer = @result_indexer == nil ? r : @result_indexer.select_indices_to_extract(r)
|
198
|
+
add_result(filter, source, indexer)
|
231
199
|
end#end of constraint check
|
232
200
|
end#end of source iteration
|
233
201
|
end#end of filter iteration
|
234
|
-
end
|
235
|
-
|
236
|
-
def get_instance_count
|
237
|
-
@@instance_count
|
238
|
-
end
|
202
|
+
end
|
239
203
|
|
240
204
|
def get_constraints
|
241
205
|
filters[0].constraints
|
242
|
-
end
|
243
|
-
|
244
|
-
private
|
206
|
+
end
|
207
|
+
|
208
|
+
private
|
245
209
|
def look_for_examples(args)
|
246
210
|
if (args[0].is_a? String)
|
247
211
|
@examples = args.select {|e| e.is_a? String}
|
@@ -269,48 +233,7 @@ private
|
|
269
233
|
results.each do |res|
|
270
234
|
filter.sink << res
|
271
235
|
@result.add_result(source, res)
|
272
|
-
@@instance_count[@name] += 1
|
273
236
|
end
|
274
|
-
end
|
275
|
-
|
276
|
-
def get_root_pattern(pattern)
|
277
|
-
if @root_pattern == nil
|
278
|
-
while (pattern.parent != nil)
|
279
|
-
get_root_pattern(pattern.parent)
|
280
|
-
end
|
281
|
-
@root_pattern = pattern
|
282
|
-
end
|
283
|
-
end
|
284
|
-
|
285
|
-
def mark_leaf_parents(pattern)
|
286
|
-
pattern.children.each { |child|
|
287
|
-
pattern.parent_of_leaf = true if child.children.size == 0
|
288
|
-
}
|
289
|
-
pattern.children.each { |child| mark_leaf_parents(child) }
|
290
|
-
end
|
291
|
-
|
292
|
-
def set_root_pattern_whole_wrapper(pattern, root_pattern)
|
293
|
-
pattern.children.each {|child| set_root_pattern_whole_wrapper(child, root_pattern)}
|
294
|
-
pattern.root_pattern = root_pattern
|
295
|
-
end
|
296
|
-
|
297
|
-
def generate_examples(pattern)
|
298
|
-
pattern.children.each {|child_pattern| generate_examples(child_pattern) }
|
299
|
-
pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == PATTERN_TYPE_TREE
|
300
|
-
end
|
301
|
-
|
302
|
-
def clear_sources_and_sinks(pattern)
|
303
|
-
pattern.filters.each do |filter|
|
304
|
-
filter.source = []
|
305
|
-
filter.sink = []
|
306
|
-
end
|
307
|
-
pattern.children.each {|child| clear_sources_and_sinks child}
|
308
|
-
end
|
309
|
-
|
310
|
-
def generate_next_page_link(example)
|
311
|
-
node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example, true)
|
312
|
-
return nil if node == nil
|
313
|
-
node.attributes['href'].gsub('&') {'&'}
|
314
|
-
end # end of method generate_next_page_link
|
237
|
+
end # end of method generate_examples
|
315
238
|
end #end of class Pattern
|
316
239
|
end #end of module Scrubyt
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Apply different functions on the input document</tt>
  #Before the document is passed to Hpricot for parsing, we may need
  #to do different things with it which are clumsy/not appropriate/impossible
  #to do once the document is loaded.
  class PreFilterDocument
    #Matches <br>, <br/>, <br /> in any letter case. Any kind of whitespace
    #(space, tab, newline) is accepted between the tag name and the closing
    #bracket; the previous pattern accepted literal spaces only.
    BR_TAG = /<br[\s\/]*>/i

    ##
    #Replace <br/> tags with newlines
    #
    #doc:: the raw document as a String
    #
    #Returns a new String in which every matched tag is replaced by "\r\n";
    #the argument itself is not modified.
    def self.br_to_newline(doc)
      doc.gsub(BR_TAG, "\r\n")
    end #end of function br_to_newline
  end #end of class PreFilterDocument
end #end of module Scrubyt
|
@@ -0,0 +1,88 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Selecting results based on indices</tt>
  #
  #If the result is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
  #probably with a variable count of results (like tags, authors etc.), you may need just
  #specific elements - like the last one, every third one, or at specific indices.
  #In this case you should use the select_indices syntax.
  class ResultIndexer
    #The parsed index specification: an array of integers and/or keyword
    #symbols, or nil if the specification passed to the constructor was invalid
    attr_reader :indices_to_extract

    #args:: the index specification; only the first argument is used
    #       (see select_indices for the accepted forms)
    def initialize(*args)
      select_indices(*args)
    end

    ##
    #Perform selection of the desired result instances, based on their indices
    #
    #ary:: the array of results; it is filtered *in place*
    #
    #Returns +ary+ itself, containing only the elements whose position was
    #selected. If no valid specification was given, +ary+ is returned untouched.
    #
    #Elements are chosen by their position. The previous version looked the
    #position up with ary.index(e), which returns the first *equal* element,
    #so repeated values were kept or dropped incorrectly.
    def select_indices_to_extract(ary)
      return ary if @indices_to_extract == nil
      to_keep = []
      @indices_to_extract.each do |spec|
        if spec.is_a? Symbol
          to_keep.concat(indices_for_keyword(spec, ary.size))
        else
          to_keep << spec
        end
      end
      kept = []
      ary.each_with_index { |element, position| kept << element if to_keep.include? position }
      #replace keeps the identity (and class) of the array the caller passed in
      ary.replace(kept)
    end

private
    ##
    #Do not return the whole result set, just specified indices - like
    #first,last, every odd index, indices from [1..3] etc.
    #
    #This method can accept:
    #- a range, like (2..3)
    #- an array of indices, like [1,2,3]; ranges and arrays nested one level
    #  deep are expanded, keywords may be mixed in
    #- specified set of keywords:
    #    - :first
    #    - :last
    #    - :all_but_first
    #    - :all_but_last
    #    - :every_even   (2nd, 4th, ... element)
    #    - :every_odd    (1st, 3rd, ... element)
    #    - :every_second (1st, 3rd, ... element)
    #    - :every_third  (1st, 4th, ... element)
    #
    #Anything else prints a message and leaves the indexer without a
    #specification, in which case no filtering takes place.
    def select_indices(*args)
      indices_to_grab = args[0]
      case indices_to_grab
      when Range
        @indices_to_extract = indices_to_grab.to_a
      when Array
        @indices_to_extract = expand_nested_indices(indices_to_grab)
      when Symbol
        #parse this when we already have the results
        @indices_to_extract = [indices_to_grab]
      else
        puts "Invalid index specification"
      end
    end #end of function select_indices

    ##
    #Build a flat index list from +indices+ without touching the array that
    #the caller handed in (the previous version appended to and deleted from it).
    #Plain entries keep their order; the members of nested ranges/arrays are
    #appended afterwards unless they are already present.
    def expand_nested_indices(indices)
      expanded = indices.dup
      indices.each do |entry|
        next unless (entry.is_a? Range) || (entry.is_a? Array)
        entry.to_a.each { |index| expanded << index if !expanded.include? index }
      end
      expanded.reject { |entry| (entry.is_a? Range) || (entry.is_a? Array) }
    end

    ##
    #Translate a keyword into the zero based positions it stands for in a
    #result list of +size+ elements. Unknown keywords select nothing.
    def indices_for_keyword(keyword, size)
      case keyword
      when :first
        [0]
      when :last
        [size - 1]
      when :all_but_last
        (0..size - 2).to_a
      when :all_but_first
        (1..size - 1).to_a
      when :every_even
        (0...size).select { |i| i % 2 == 1 }
      when :every_odd, :every_second
        (0...size).select { |i| i % 2 == 0 }
      when :every_third
        (0...size).select { |i| i % 3 == 0 }
      else
        []
      end
    end
  end #end of class ResultIndexer
end #end of module Scrubyt
|
@@ -0,0 +1,97 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Holding the evaluation context of the extraction process</tt>
  #
  #Every kind of data that is shared among patterns during the extraction process
  #is held in this class, so it can be looked up anytime.
  #
  #This class provides also some high-level basic functionality in navigation, like
  #crawling to new pages, attaching document to the root pattern once arrived at the
  #desired page etc.
  #
  #It can be viewed as a glue between Extractor and NavigationActions as well - these
  #two classes need to communicate frequently as well as share different information
  #and this is accomplished through EvaluationContext.
  class EvaluationContext
    attr_accessor :root_pattern, :next_page, :document_index, :block_count,
                  :extractor, :limit

    def initialize
      @root_pattern = nil
      @next_page = nil
      @block_count = 0
      @document_index = 0
      @extractor = nil
      @limit = nil
    end

    ##
    #Crawl to a new page. This function should not be called from the outside - it is automatically called
    #if the next_page is defined
    #
    #Returns nil if no link to the next page could be determined.
    def crawl_to_new_page
      temp_document = generate_next_page_link(@next_page)
      return nil if temp_document == nil
      clear_sources_and_sinks(@root_pattern)
      @extractor.fetch(temp_document)
      attach_current_document
    end

    ##
    #Attach document to the root pattern; This is happening automatically as the root pattern is defined or
    #crawling to a new page
    def attach_current_document
      doc = @extractor.get_hpricot_doc
      root_filter = @root_pattern.filters[0]
      root_filter.source << doc
      root_filter.sink << doc
      @root_pattern.last_result ||= []
      @root_pattern.last_result << doc
      @root_pattern.result.add_result(root_filter.source, root_filter.sink)
    end

    ##
    #Based on the given examples, calculate the XPaths for the tree patterns
    #
    #Does nothing when no root pattern has been registered yet (the previous
    #version called get_root_pattern(nil), which raised in that situation).
    def setup_examples
      return if @root_pattern.nil?
      mark_leaf_parents(@root_pattern)
      generate_examples(@root_pattern)
    end

    ##
    #After crawling to the new page, the sources and sinks need to be cleaned
    #since they are no more valid
    def clear_sources_and_sinks(pattern)
      pattern.filters.each do |filter|
        filter.source = []
        filter.sink = []
      end
      pattern.children.each {|child| clear_sources_and_sinks child}
    end

    ##
    #Look up the node whose text is +example+ in the first source of the root
    #pattern and return the target of its href attribute.
    #
    #Returns nil if no such node is found or if the node has no href attribute.
    def generate_next_page_link(example)
      node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example, true)
      return nil if node == nil
      href = node.attributes['href']
      return nil if href == nil
      #The attribute value may still carry the HTML entity form of the
      #ampersand; turn it back into a plain '&' to get a usable URL
      href.gsub('&amp;') {'&'}
    end

    ##
    #Set parent_of_leaf on every pattern below (and including) +pattern+ that
    #has at least one child without children of its own
    def mark_leaf_parents(pattern)
      pattern.children.each { |child|
        pattern.parent_of_leaf = true if child.children.size == 0
      }
      pattern.children.each { |child| mark_leaf_parents(child) }
    end

    ##
    #Generate the XPaths of the filters, children first; only the filters of
    #tree patterns are processed
    def generate_examples(pattern)
      pattern.children.each {|child_pattern| generate_examples(child_pattern) }
      pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == Pattern::PATTERN_TYPE_TREE
    end

    ##
    #Remember the topmost ancestor of +pattern+ as the root pattern, unless a
    #root pattern is already known.
    #
    #The previous version recursed from inside a while loop whose condition
    #never changed, so it did not terminate for a pattern that had a parent,
    #and it raised for nil. The ancestors are now walked iteratively and nil
    #is ignored.
    def get_root_pattern(pattern)
      return unless @root_pattern.nil?
      return if pattern.nil?
      pattern = pattern.parent until pattern.parent.nil?
      @root_pattern = pattern
    end #end of function
  end #end of class EvaluationContext
end #end of module Scrubyt
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'logger'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'mechanize'
|
5
|
+
require 'hpricot'
|
6
|
+
|
7
|
+
module Scrubyt
  ##
  #=<tt>Driving the whole extraction process</tt>
  #
  #Extractor is a performer class - it gets an extractor definition and carries
  #out the actions and evaluates the wrappers sequentially.
  #
  #Originally also the navigation actions were here, but since the class got too
  #big, they were factored out to an own class, NavigationActions.
  class Extractor
    ##
    #The definition of the extractor is passed through this method
    #
    #mode::                 :production selects production mode, anything else
    #                       is reported as learning mode
    #extractor_definition:: the block holding the extractor definition
    #
    #Returns the (artificial) root pattern. The process exits if the
    #definition block evaluates to nil.
    def self.define(mode=nil, &extractor_definition)
      @@mode = mode
      #Reset so that a definition starting with next_page does not read an
      #uninitialized (or stale) class variable
      @@last_pattern = nil
      mode_name = (mode == :production ? 'Production' : 'Learning')
      puts "[MODE] #{mode_name}"
      NavigationActions.new
      @@evaluation_context = EvaluationContext.new
      #Hack up an artificial root pattern (i.e. do not return the pattern which
      #is the root one in the user's definition, but rather the real (invisible)
      #root pattern
      evaluated_extractor = (class_eval(&extractor_definition))
      if evaluated_extractor == nil
        puts "No extractor defined, exiting..."
        exit
      end
      root_pattern = evaluated_extractor.parent
      #Recursively match data based on examples
      @@evaluation_context.setup_examples
      #Once all is set up, evaluate the extractor from the root pattern!
      evaluate_extractor(root_pattern)
      #Apply all postprocess steps
      PostProcessor.apply_post_processing(root_pattern)
      #Return the root pattern
      puts "Extraction finished succesfully!"
      root_pattern
    end

    ##
    #build the current wrapper
    #
    #Every unknown method called inside the definition block ends up here:
    #navigation keywords are forwarded to NavigationActions, next_page is
    #stored in the evaluation context, and everything else creates a pattern
    #named after the method. A block passed along defines the children of
    #that pattern.
    def self.method_missing(method_name, *args, &block)
      if NavigationActions::KEYWORDS.include? method_name.to_s
        NavigationActions.send(method_name, *args)
        return
      end
      pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
      pattern.evaluation_context = @@evaluation_context
      if @parent == nil
        if method_name.to_s == 'next_page'
          @@evaluation_context.next_page = args[0]
          @@evaluation_context.limit =
            args[1][:limit] if args.size > 1
          return @@last_pattern
        else
          #Create a root pattern
          root_pattern = Scrubyt::Pattern.new('root', :type => :root)
          root_pattern.evaluation_context = @@evaluation_context
          @@evaluation_context.root_pattern = root_pattern
          @@evaluation_context.extractor = self
          #add the currently active document to the root pattern
          @@evaluation_context.attach_current_document
          @@evaluation_context.root_pattern.add_child_pattern(pattern)
          @@evaluation_context.block_count = 0
        end
      else
        @parent.add_child_pattern(pattern) if @parent != nil
      end
      if block_given?
        @@evaluation_context.block_count = @@evaluation_context.block_count + 1
        #The stack tracks the enclosing patterns while nested blocks are evaluated
        @stack ||=[]
        @parent = pattern
        @stack.push @parent
        class_eval(&block)
        @stack.pop
        @parent = @stack.last
      end
      @@last_pattern = pattern
    end

    #Used in lord of the hacks vol 1. Check out export.rb if you are still interested
    #(You should not be :)
    #
    #The block count is kept in the evaluation context; the previous version
    #read it from @@root_pattern, a class variable that is never assigned here.
    def self.get_block_count
      @@evaluation_context.block_count
    end

    #The document currently held by NavigationActions
    def self.get_hpricot_doc
      NavigationActions.get_hpricot_doc
    end

    #The mode that was passed to define
    def self.get_mode
      @@mode
    end

    #NOTE: the two methods below are meant for internal use. A bare 'private'
    #has no effect on methods defined with 'def self.', so they were - and
    #still are - publicly callable.

    ##
    #Evaluate the wrapper; if next_page was given, keep crawling and
    #evaluating until the page limit is reached or no further page is found
    def self.evaluate_extractor(root_pattern)
      if @@evaluation_context.next_page
        current_page_count = 1
        loop do
          really_evaluate_extractor(root_pattern)
          break if (@@evaluation_context.limit == current_page_count || @@evaluation_context.crawl_to_new_page == nil)
          current_page_count += 1 if @@evaluation_context.limit != nil
        end
      else
        really_evaluate_extractor(root_pattern)
      end
    end

    ##
    #Evaluate +pattern+ and then, recursively, all of its children
    def self.really_evaluate_extractor(pattern)
      pattern.evaluate
      pattern.children.each { |child| really_evaluate_extractor child }
    end #end of method really_evaluate_extractor
  end #end of class Extractor
end #end of module Scrubyt
|