scrubyt 0.2.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +132 -1
- data/Rakefile +4 -2
- data/lib/scrubyt.rb +15 -10
- data/lib/scrubyt/core/navigation/fetch_action.rb +152 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +106 -0
- data/lib/scrubyt/{constraint.rb → core/scraping/constraint.rb} +0 -0
- data/lib/scrubyt/{constraint_adder.rb → core/scraping/constraint_adder.rb} +0 -0
- data/lib/scrubyt/{filter.rb → core/scraping/filter.rb} +22 -4
- data/lib/scrubyt/{pattern.rb → core/scraping/pattern.rb} +21 -98
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +13 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +88 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +97 -0
- data/lib/scrubyt/core/shared/extractor.rb +116 -0
- data/lib/scrubyt/{export.rb → output/export.rb} +14 -8
- data/lib/scrubyt/output/post_processor.rb +137 -0
- data/lib/scrubyt/{result.rb → output/result.rb} +0 -0
- data/lib/scrubyt/{result_dumper.rb → output/result_dumper.rb} +0 -7
- data/lib/scrubyt/{xpathutils.rb → utils/xpathutils.rb} +5 -2
- data/test/unittests/pattern_test.rb +27 -0
- metadata +40 -17
- data/lib/scrubyt/extractor.rb +0 -279
- data/lib/scrubyt/post_processor.rb +0 -73
File without changes
|
File without changes
|
@@ -69,12 +69,18 @@ module Scrubyt
|
|
69
69
|
#Evaluate this filter. This method should not be called directly - as the pattern hierarchy
|
70
70
|
#is evaluated, every pattern evaluates its filters by calling this method
|
71
71
|
def evaluate(source)
|
72
|
-
case @parent_pattern.type
|
72
|
+
case @parent_pattern.type
|
73
73
|
when Scrubyt::Pattern::PATTERN_TYPE_TREE
|
74
74
|
result = source/@xpath
|
75
|
+
#puts "Evaluating #{@parent_pattern.name} with #{@xpath}"
|
75
76
|
result.class == Hpricot::Elements ? result.map : [result]
|
76
77
|
when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
|
77
|
-
[source.attributes[@example]]
|
78
|
+
attribute_value = [source.attributes[@example]]
|
79
|
+
return attribute_value if attribute_value[0]
|
80
|
+
@@attribute_in_parent = nil
|
81
|
+
Filter.traverse_up_until_attribute_found(source.parent, @example)
|
82
|
+
@@attribute_in_parent
|
83
|
+
|
78
84
|
when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
|
79
85
|
source.inner_text.scan(@example).flatten
|
80
86
|
end
|
@@ -88,7 +94,9 @@ module Scrubyt
|
|
88
94
|
when EXAMPLE_TYPE_XPATH
|
89
95
|
@xpath = @example
|
90
96
|
when EXAMPLE_TYPE_STRING
|
91
|
-
@temp_sink = XPathUtils.find_node_from_text( @parent_pattern.root_pattern.filters[0].source[0],
|
97
|
+
@temp_sink = XPathUtils.find_node_from_text( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
|
98
|
+
@example,
|
99
|
+
false )
|
92
100
|
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
93
101
|
XPathUtils.generate_XPath(@temp_sink, nil, true)
|
94
102
|
when EXAMPLE_TYPE_CHILDREN
|
@@ -127,7 +135,7 @@ module Scrubyt
|
|
127
135
|
current_example_index += 1
|
128
136
|
end
|
129
137
|
when EXAMPLE_TYPE_IMAGE
|
130
|
-
@temp_sink = XPathUtils.find_image(@parent_pattern.root_pattern.filters[0].source[0], @example)
|
138
|
+
@temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)
|
131
139
|
@xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
|
132
140
|
end
|
133
141
|
end
|
@@ -139,6 +147,16 @@ module Scrubyt
|
|
139
147
|
end
|
140
148
|
|
141
149
|
private
|
150
|
+
def self.traverse_up_until_attribute_found(source, attribute)
|
151
|
+
if (!source.parent.is_a? Hpricot::Doc)
|
152
|
+
#p source.attributes
|
153
|
+
#p attribute
|
154
|
+
#p source.attributes[attribute]
|
155
|
+
@@attribute_in_parent = source.attributes[attribute] if source.attributes[attribute]
|
156
|
+
traverse_up_until_attribute_found(source.parent, attribute) if !@attribute_in_parent
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
142
160
|
def self.determine_example_type(example)
|
143
161
|
if example.instance_of? Regexp
|
144
162
|
EXAMPLE_TYPE_REGEXP
|
@@ -41,23 +41,18 @@ module Scrubyt
|
|
41
41
|
SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'example']
|
42
42
|
|
43
43
|
attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
|
44
|
-
:last_result, :result, :
|
45
|
-
:
|
46
|
-
:
|
47
|
-
attr_reader :type, :generalize_set, :next_page_url
|
44
|
+
:last_result, :result, :example, :limit,
|
45
|
+
:examples, :parent_of_leaf, :evaluation_context,
|
46
|
+
:indices_to_extract, :evaluation_context
|
47
|
+
attr_reader :type, :generalize_set, :next_page_url, :result_indexer
|
48
48
|
|
49
49
|
def initialize (name, *args)
|
50
50
|
@name = name #name of the pattern
|
51
51
|
parse_args(args) #parse the rest of the arguments
|
52
|
-
@root_pattern = nil #root pattern of the wrapper
|
53
52
|
@children = [] #child patterns
|
54
53
|
@filters = [] #filters of the wrapper
|
55
54
|
@result = Result.new #hierarchical results of the pattern
|
56
|
-
|
57
|
-
@evaluated_examples = []
|
58
|
-
@next_page = nil
|
59
|
-
@document_index = 0
|
60
|
-
if @examples == nil
|
55
|
+
if @examples == nil
|
61
56
|
filters << Scrubyt::Filter.new(self) #create a default filter
|
62
57
|
else
|
63
58
|
@examples.each do |example|
|
@@ -112,6 +107,9 @@ module Scrubyt
|
|
112
107
|
# camera_data.item[1].item_name[0]
|
113
108
|
def method_missing(method_name, *args, &block)
|
114
109
|
case method_name.to_s
|
110
|
+
when 'select_indices'
|
111
|
+
@result_indexer = Scrubyt::ResultIndexer.new(*args)
|
112
|
+
self
|
115
113
|
when /^to_/
|
116
114
|
Scrubyt::ResultDumper.send(method_name.to_s, self)
|
117
115
|
when /^ensure_/
|
@@ -135,9 +133,9 @@ module Scrubyt
|
|
135
133
|
# camera_data[1].item[1].item_name[0]
|
136
134
|
def [](index)
|
137
135
|
if @name == 'root'
|
138
|
-
@
|
136
|
+
@evaluation_context.document_index = index
|
139
137
|
else
|
140
|
-
@parent.last_result = @parent.last_result[@
|
138
|
+
@parent.last_result = @parent.last_result[@evaluation_context.document_index] if @parent.last_result.is_a? Array
|
141
139
|
return nil if (@result.lookup(@parent.last_result)) == nil
|
142
140
|
@last_result = @result.lookup(@parent.last_result)[index]
|
143
141
|
end
|
@@ -150,7 +148,7 @@ module Scrubyt
|
|
150
148
|
def export(file, output_file_name=nil, extractor_result_file_name=nil)
|
151
149
|
Scrubyt::Export.export(file, self, output_file_name, extractor_result_file_name)
|
152
150
|
end
|
153
|
-
|
151
|
+
|
154
152
|
##
|
155
153
|
#Add a filter to this pattern
|
156
154
|
def add_filter(filter)
|
@@ -167,39 +165,7 @@ module Scrubyt
|
|
167
165
|
child.generalize = true if (!child.generalize_set && child.parent != nil && child.parent.parent == nil)
|
168
166
|
@children << child
|
169
167
|
end
|
170
|
-
|
171
|
-
##
|
172
|
-
#Crawl to a new page. This function should not be called from the outside - it is automatically called
|
173
|
-
#if the next_page is defined
|
174
|
-
def crawl_to_new_page
|
175
|
-
temp_document = generate_next_page_link(@next_page)
|
176
|
-
return nil if temp_document == nil
|
177
|
-
clear_sources_and_sinks(@root_pattern)
|
178
|
-
@root_pattern.extractor.fetch(temp_document)
|
179
|
-
attach_current_document
|
180
|
-
end
|
181
|
-
|
182
|
-
##
|
183
|
-
#Attach document to the root pattern; This is happening automatically as the root pattern is defined or
|
184
|
-
#crawling to a new page
|
185
|
-
def attach_current_document
|
186
|
-
doc = @root_pattern.extractor.get_hpricot_doc
|
187
|
-
filters[0].source << doc
|
188
|
-
filters[0].sink << doc
|
189
|
-
@last_result ||= []
|
190
|
-
@last_result << doc
|
191
|
-
@result.add_result(filters[0].source, filters[0].sink)
|
192
|
-
end
|
193
|
-
|
194
|
-
##
|
195
|
-
#Based on the given examples, calculate the XPaths for the tree patterns
|
196
|
-
def setup_examples
|
197
|
-
get_root_pattern(self)
|
198
|
-
mark_leaf_parents(self)
|
199
|
-
set_root_pattern_whole_wrapper(@root_pattern, @root_pattern)
|
200
|
-
generate_examples(@root_pattern)
|
201
|
-
end
|
202
|
-
|
168
|
+
|
203
169
|
##
|
204
170
|
#Evaluate the pattern. This means evaluating all the filters and adding
|
205
171
|
#their extracted instances to the array of results of this pattern
|
@@ -225,23 +191,21 @@ module Scrubyt
|
|
225
191
|
end
|
226
192
|
result = result_hash.reject {|k,v| k if !v}
|
227
193
|
sorted_result = r.reject {|e| !result.keys.include? e}
|
228
|
-
|
194
|
+
indexer = @result_indexer == nil ? sorted_result : @result_indexer.select_indices_to_extract(sorted_result)
|
195
|
+
add_result(filter, source, indexer)
|
229
196
|
else
|
230
|
-
|
197
|
+
indexer = @result_indexer == nil ? r : @result_indexer.select_indices_to_extract(r)
|
198
|
+
add_result(filter, source, indexer)
|
231
199
|
end#end of constraint check
|
232
200
|
end#end of source iteration
|
233
201
|
end#end of filter iteration
|
234
|
-
end
|
235
|
-
|
236
|
-
def get_instance_count
|
237
|
-
@@instance_count
|
238
|
-
end
|
202
|
+
end
|
239
203
|
|
240
204
|
def get_constraints
|
241
205
|
filters[0].constraints
|
242
|
-
end
|
243
|
-
|
244
|
-
private
|
206
|
+
end
|
207
|
+
|
208
|
+
private
|
245
209
|
def look_for_examples(args)
|
246
210
|
if (args[0].is_a? String)
|
247
211
|
@examples = args.select {|e| e.is_a? String}
|
@@ -269,48 +233,7 @@ private
|
|
269
233
|
results.each do |res|
|
270
234
|
filter.sink << res
|
271
235
|
@result.add_result(source, res)
|
272
|
-
@@instance_count[@name] += 1
|
273
236
|
end
|
274
|
-
end
|
275
|
-
|
276
|
-
def get_root_pattern(pattern)
|
277
|
-
if @root_pattern == nil
|
278
|
-
while (pattern.parent != nil)
|
279
|
-
get_root_pattern(pattern.parent)
|
280
|
-
end
|
281
|
-
@root_pattern = pattern
|
282
|
-
end
|
283
|
-
end
|
284
|
-
|
285
|
-
def mark_leaf_parents(pattern)
|
286
|
-
pattern.children.each { |child|
|
287
|
-
pattern.parent_of_leaf = true if child.children.size == 0
|
288
|
-
}
|
289
|
-
pattern.children.each { |child| mark_leaf_parents(child) }
|
290
|
-
end
|
291
|
-
|
292
|
-
def set_root_pattern_whole_wrapper(pattern, root_pattern)
|
293
|
-
pattern.children.each {|child| set_root_pattern_whole_wrapper(child, root_pattern)}
|
294
|
-
pattern.root_pattern = root_pattern
|
295
|
-
end
|
296
|
-
|
297
|
-
def generate_examples(pattern)
|
298
|
-
pattern.children.each {|child_pattern| generate_examples(child_pattern) }
|
299
|
-
pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == PATTERN_TYPE_TREE
|
300
|
-
end
|
301
|
-
|
302
|
-
def clear_sources_and_sinks(pattern)
|
303
|
-
pattern.filters.each do |filter|
|
304
|
-
filter.source = []
|
305
|
-
filter.sink = []
|
306
|
-
end
|
307
|
-
pattern.children.each {|child| clear_sources_and_sinks child}
|
308
|
-
end
|
309
|
-
|
310
|
-
def generate_next_page_link(example)
|
311
|
-
node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example, true)
|
312
|
-
return nil if node == nil
|
313
|
-
node.attributes['href'].gsub('&amp;') {'&'}
|
314
|
-
end # end of method generate_next_page_link
|
237
|
+
end # end of method generate_examples
|
315
238
|
end #end of class Pattern
|
316
239
|
end #end of module Scrubyt
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Apply different functions on the input document</tt>
|
4
|
+
#Before the document is passed to Hpricot for parsing, we may need
|
5
|
+
#to do different stuff with it which is clumsy/not appropriate/impossible
|
6
|
+
#to do once the document is loaded.
|
7
|
+
class PreFilterDocument
|
8
|
+
#Replace <br/> tags with newlines
|
9
|
+
def self.br_to_newline(doc)
|
10
|
+
doc.gsub(/<br[ \/]*>/i, "\r\n")
|
11
|
+
end #end of function br_to_newline
|
12
|
+
end #end of class PreFilterDocument
|
13
|
+
end #end of module Scrubyt
|
@@ -0,0 +1,88 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Selecting results based on indices</tt>
|
4
|
+
#
|
5
|
+
#If the result is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
|
6
|
+
#probably with a variable count of results (like tags, authors etc.), you may need just
|
7
|
+
#specific elements - like the last one, every third one, or at specific indices.
|
8
|
+
#In this case you should use the select_indices syntax.
|
9
|
+
class ResultIndexer
|
10
|
+
attr_reader :indices_to_extract
|
11
|
+
|
12
|
+
def initialize(*args)
|
13
|
+
select_indices(*args)
|
14
|
+
end
|
15
|
+
|
16
|
+
##
|
17
|
+
#Perform selection of the desired result instances, based on their indices
|
18
|
+
def select_indices_to_extract(ary)
|
19
|
+
return ary if @indices_to_extract == nil
|
20
|
+
to_keep = []
|
21
|
+
@indices_to_extract.each {|e|
|
22
|
+
if e.is_a? Symbol
|
23
|
+
case e
|
24
|
+
when :first
|
25
|
+
to_keep << 0
|
26
|
+
when :last
|
27
|
+
to_keep << ary.size-1
|
28
|
+
when :all_but_last
|
29
|
+
(0..ary.size-2).each {|i| to_keep << i}
|
30
|
+
when :all_but_first
|
31
|
+
(1..ary.size-1).each {|i| to_keep << i}
|
32
|
+
when :every_even
|
33
|
+
(0..ary.size).each {|i| to_keep << i if (i % 2 == 1)}
|
34
|
+
when :every_odd
|
35
|
+
(0..ary.size).each {|i| to_keep << i if (i % 2 == 0)}
|
36
|
+
when :every_second
|
37
|
+
(0..ary.size).each {|i| to_keep << i if (i % 2 == 0)}
|
38
|
+
when :every_third
|
39
|
+
(0..ary.size).each {|i| to_keep << i if (i % 3 == 0)}
|
40
|
+
end
|
41
|
+
end
|
42
|
+
}
|
43
|
+
@indices_to_extract.each {|i| to_keep << i if !i.is_a? Symbol}
|
44
|
+
to_keep.sort!
|
45
|
+
ary.reject! {|e| !to_keep.include? ary.index(e)}
|
46
|
+
ary
|
47
|
+
end
|
48
|
+
|
49
|
+
private
|
50
|
+
##
|
51
|
+
#Do not return the whole result set, just specified indices - like
|
52
|
+
#first,last, every odd index, indices from [1..3] etc.
|
53
|
+
#
|
54
|
+
#This method can accept:
|
55
|
+
#- a range, like (2..3)
|
56
|
+
#- an array of indices, like [1,2,3]
|
57
|
+
#- specified set of keywords:
|
58
|
+
# - :first
|
59
|
+
# - :last
|
60
|
+
# - :every_even
|
61
|
+
# - :every_odd
|
62
|
+
# (several of these keywords can be combined by passing them in one array)
|
63
|
+
def select_indices(*args)
|
64
|
+
indices_to_grab = args[0]
|
65
|
+
case indices_to_grab.class.to_s
|
66
|
+
when "Range"
|
67
|
+
@indices_to_extract = indices_to_grab.to_a
|
68
|
+
when "Array"
|
69
|
+
nested_arrays = []
|
70
|
+
indices_to_grab.each {|e|
|
71
|
+
if e.is_a? Array
|
72
|
+
nested_arrays << e
|
73
|
+
elsif e.is_a? Range
|
74
|
+
nested_arrays << e.to_a
|
75
|
+
end
|
76
|
+
}
|
77
|
+
@indices_to_extract = indices_to_grab
|
78
|
+
nested_arrays.each {|a| a.each {|e| @indices_to_extract << e if !@indices_to_extract.include? e }}
|
79
|
+
@indices_to_extract.reject! {|e| ((e.is_a? Range) || (e.is_a? Array)) }
|
80
|
+
when "Symbol"
|
81
|
+
#parse this when we already have the results
|
82
|
+
@indices_to_extract = [indices_to_grab]
|
83
|
+
else
|
84
|
+
puts "Invalid index specification"
|
85
|
+
end
|
86
|
+
end #end of function select_indices
|
87
|
+
end #end of class ResultIndexer
|
88
|
+
end #end of module Scrubyt
|
@@ -0,0 +1,97 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Holding the evaluation context of the extraction process</tt>
|
4
|
+
#
|
5
|
+
#Every kind of data that is shared among patterns during the extraction process
|
6
|
+
#is held in this class, so it can be looked up anytime.
|
7
|
+
#
|
8
|
+
#This class also provides some high-level basic functionality in navigation, like
|
9
|
+
#crawling to new pages, attaching the document to the root pattern once arrived at the
|
10
|
+
#desired page etc.
|
11
|
+
#
|
12
|
+
#It can be viewed as a glue between Extractor and NavigationActions as well - these
|
13
|
+
#two classes need to communicate frequently as well as share different information
|
14
|
+
#and this is accomplished through EvaluationContext.
|
15
|
+
class EvaluationContext
|
16
|
+
attr_accessor :root_pattern, :next_page, :document_index, :block_count,
|
17
|
+
:extractor, :limit
|
18
|
+
|
19
|
+
def initialize
|
20
|
+
@root_pattern = nil
|
21
|
+
@next_page = nil
|
22
|
+
@block_count = 0
|
23
|
+
@document_index = 0
|
24
|
+
@extractor = nil
|
25
|
+
end
|
26
|
+
|
27
|
+
##
|
28
|
+
#Crawl to a new page. This function should not be called from the outside - it is automatically called
|
29
|
+
#if the next_page is defined
|
30
|
+
def crawl_to_new_page
|
31
|
+
temp_document = generate_next_page_link(@next_page)
|
32
|
+
return nil if temp_document == nil
|
33
|
+
clear_sources_and_sinks(@root_pattern)
|
34
|
+
@extractor.fetch(temp_document)
|
35
|
+
attach_current_document
|
36
|
+
end
|
37
|
+
|
38
|
+
##
|
39
|
+
#Attach document to the root pattern; This is happening automatically as the root pattern is defined or
|
40
|
+
#crawling to a new page
|
41
|
+
def attach_current_document
|
42
|
+
doc = @extractor.get_hpricot_doc
|
43
|
+
@root_pattern.filters[0].source << doc
|
44
|
+
@root_pattern.filters[0].sink << doc
|
45
|
+
@root_pattern.last_result ||= []
|
46
|
+
@root_pattern.last_result << doc
|
47
|
+
@root_pattern.result.add_result(@root_pattern.filters[0].source,
|
48
|
+
@root_pattern.filters[0].sink)
|
49
|
+
end
|
50
|
+
|
51
|
+
##
|
52
|
+
#Based on the given examples, calculate the XPaths for the tree patterns
|
53
|
+
def setup_examples
|
54
|
+
get_root_pattern(nil)
|
55
|
+
mark_leaf_parents(@root_pattern)
|
56
|
+
generate_examples(@root_pattern)
|
57
|
+
end
|
58
|
+
|
59
|
+
##
|
60
|
+
#After crawling to the new page, the sources and sinks need to be cleaned
|
61
|
+
#since they are no longer valid
|
62
|
+
def clear_sources_and_sinks(pattern)
|
63
|
+
pattern.filters.each do |filter|
|
64
|
+
filter.source = []
|
65
|
+
filter.sink = []
|
66
|
+
end
|
67
|
+
pattern.children.each {|child| clear_sources_and_sinks child}
|
68
|
+
end
|
69
|
+
|
70
|
+
def generate_next_page_link(example)
|
71
|
+
node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example, true)
|
72
|
+
return nil if node == nil
|
73
|
+
node.attributes['href'].gsub('&amp;') {'&'}
|
74
|
+
end
|
75
|
+
|
76
|
+
def mark_leaf_parents(pattern)
|
77
|
+
pattern.children.each { |child|
|
78
|
+
pattern.parent_of_leaf = true if child.children.size == 0
|
79
|
+
}
|
80
|
+
pattern.children.each { |child| mark_leaf_parents(child) }
|
81
|
+
end
|
82
|
+
|
83
|
+
def generate_examples(pattern)
|
84
|
+
pattern.children.each {|child_pattern| generate_examples(child_pattern) }
|
85
|
+
pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == Pattern::PATTERN_TYPE_TREE
|
86
|
+
end
|
87
|
+
|
88
|
+
def get_root_pattern(pattern)
|
89
|
+
if @root_pattern == nil
|
90
|
+
while (pattern.parent != nil)
|
91
|
+
get_root_pattern(pattern.parent)
|
92
|
+
end
|
93
|
+
@root_pattern = pattern
|
94
|
+
end
|
95
|
+
end #end of function
|
96
|
+
end #end of class EvaluationContext
|
97
|
+
end #end of module Scrubyt
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'logger'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'mechanize'
|
5
|
+
require 'hpricot'
|
6
|
+
|
7
|
+
module Scrubyt
|
8
|
+
##
|
9
|
+
#=<tt>Driving the whole extraction process</tt>
|
10
|
+
#
|
11
|
+
#Extractor is a performer class - it gets an extractor definition and carries
|
12
|
+
#out the actions and evaluates the wrappers sequentially.
|
13
|
+
#
|
14
|
+
#Originally also the navigation actions were here, but since the class got too
|
15
|
+
#big, they were factored out into their own class, NavigationActions.
|
16
|
+
class Extractor
|
17
|
+
#The definition of the extractor is passed through this method
|
18
|
+
def self.define(mode=nil, &extractor_definition)
|
19
|
+
@@mode = mode
|
20
|
+
mode_name = (mode == :production ? 'Production' : 'Learning')
|
21
|
+
puts "[MODE] #{mode_name}"
|
22
|
+
NavigationActions.new
|
23
|
+
@@evaluation_context = EvaluationContext.new
|
24
|
+
#Hack up an artificial root pattern (i.e. do not return the pattern which
|
25
|
+
#is the root one in the user's definition, but rather the real (invisible)
|
26
|
+
#root pattern)
|
27
|
+
evaluated_extractor = (class_eval(&extractor_definition))
|
28
|
+
if evaluated_extractor == nil
|
29
|
+
puts "No extractor defined, exiting..."
|
30
|
+
exit
|
31
|
+
end
|
32
|
+
root_pattern = evaluated_extractor.parent
|
33
|
+
#Recursively match data based on examples
|
34
|
+
@@evaluation_context.setup_examples
|
35
|
+
#Once all is set up, evaluate the extractor from the root pattern!
|
36
|
+
evaluate_extractor(root_pattern)
|
37
|
+
#Apply all postprocess steps
|
38
|
+
PostProcessor.apply_post_processing(root_pattern)
|
39
|
+
#Return the root pattern
|
40
|
+
puts "Extraction finished succesfully!"
|
41
|
+
root_pattern
|
42
|
+
end
|
43
|
+
|
44
|
+
#build the current wrapper
|
45
|
+
def self.method_missing(method_name, *args, &block)
|
46
|
+
if NavigationActions::KEYWORDS.include? method_name.to_s
|
47
|
+
NavigationActions.send(method_name, *args)
|
48
|
+
return
|
49
|
+
end
|
50
|
+
pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
|
51
|
+
pattern.evaluation_context = @@evaluation_context
|
52
|
+
if @parent == nil
|
53
|
+
if method_name.to_s == 'next_page'
|
54
|
+
@@evaluation_context.next_page = args[0]
|
55
|
+
@@evaluation_context.limit =
|
56
|
+
args[1][:limit] if args.size > 1
|
57
|
+
return @@last_pattern
|
58
|
+
else
|
59
|
+
#Create a root pattern
|
60
|
+
root_pattern = Scrubyt::Pattern.new('root', :type => :root)
|
61
|
+
root_pattern.evaluation_context = @@evaluation_context
|
62
|
+
@@evaluation_context.root_pattern = root_pattern
|
63
|
+
@@evaluation_context.extractor = self
|
64
|
+
#add the currently active document to the root pattern
|
65
|
+
@@evaluation_context.attach_current_document
|
66
|
+
@@evaluation_context.root_pattern.add_child_pattern(pattern)
|
67
|
+
@@evaluation_context.block_count = 0
|
68
|
+
end
|
69
|
+
else
|
70
|
+
@parent.add_child_pattern(pattern) if @parent != nil
|
71
|
+
end
|
72
|
+
if block_given?
|
73
|
+
@@evaluation_context.block_count = @@evaluation_context.block_count + 1
|
74
|
+
@stack ||=[]
|
75
|
+
@parent = pattern
|
76
|
+
@stack.push @parent
|
77
|
+
class_eval(&block)
|
78
|
+
@stack.pop
|
79
|
+
@parent = @stack.last
|
80
|
+
end
|
81
|
+
@@last_pattern = pattern
|
82
|
+
end
|
83
|
+
|
84
|
+
#Used in lord of the hacks vol 1. Check out export.rb if you are still interested
|
85
|
+
#(You should not be :)
|
86
|
+
def self.get_block_count
|
87
|
+
@@root_pattern.block_count
|
88
|
+
end
|
89
|
+
|
90
|
+
def self.get_hpricot_doc
|
91
|
+
NavigationActions.get_hpricot_doc
|
92
|
+
end
|
93
|
+
|
94
|
+
def self.get_mode
|
95
|
+
@@mode
|
96
|
+
end
|
97
|
+
private
|
98
|
+
def self.evaluate_extractor(root_pattern)
|
99
|
+
if @@evaluation_context.next_page
|
100
|
+
current_page_count = 1
|
101
|
+
loop do
|
102
|
+
really_evaluate_extractor(root_pattern)
|
103
|
+
break if (@@evaluation_context.limit == current_page_count || @@evaluation_context.crawl_to_new_page == nil)
|
104
|
+
current_page_count += 1 if @@evaluation_context.limit != nil
|
105
|
+
end
|
106
|
+
else
|
107
|
+
really_evaluate_extractor(root_pattern)
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
def self.really_evaluate_extractor(pattern)
|
112
|
+
pattern.evaluate
|
113
|
+
pattern.children.each { |child| really_evaluate_extractor child }
|
114
|
+
end #end of method really_evaluate_extractor
|
115
|
+
end #end of class Extractor
|
116
|
+
end #end of module Scrubyt
|