scrubyt 0.2.8 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +32 -2
- data/Rakefile +25 -20
- data/lib/scrubyt.rb +24 -5
- data/lib/scrubyt/core/navigation/fetch_action.rb +76 -42
- data/lib/scrubyt/core/navigation/navigation_actions.rb +24 -6
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +2 -2
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +2 -1
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -2
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +37 -12
- data/lib/scrubyt/core/scraping/pattern.rb +82 -90
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +2 -1
- data/lib/scrubyt/core/shared/evaluation_context.rb +14 -37
- data/lib/scrubyt/core/shared/extractor.rb +55 -54
- data/lib/scrubyt/logging.rb +16 -0
- data/lib/scrubyt/output/export.rb +1 -1
- data/lib/scrubyt/output/post_processor.rb +6 -5
- data/lib/scrubyt/output/result.rb +1 -0
- data/lib/scrubyt/output/result_dumper.rb +4 -3
- data/lib/scrubyt/output/result_node.rb +73 -0
- data/lib/scrubyt/output/scrubyt_result.rb +28 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +8 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +14 -1
- data/lib/scrubyt/utils/xpathutils.rb +11 -0
- metadata +7 -12
- data/test/unittests/constraint_test.rb +0 -107
- data/test/unittests/extractor_test.rb +0 -91
- data/test/unittests/filter_test.rb +0 -79
- data/test/unittests/input/constraint_test.html +0 -55
- data/test/unittests/input/test.html +0 -39
- data/test/unittests/pattern_test.rb +0 -27
- data/test/unittests/simple_example_lookup_test.rb +0 -68
- data/test/unittests/xpathutils_test.rb +0 -152
@@ -43,13 +43,14 @@ module Scrubyt
|
|
43
43
|
EXAMPLE_TYPE_IMAGE = 2
|
44
44
|
#No example - the actual XPath is determined from the children XPaths (their LCA)
|
45
45
|
EXAMPLE_TYPE_CHILDREN = 3
|
46
|
+
|
46
47
|
#Regexp example, like /\d+@*\d+[a-z]/
|
47
48
|
EXAMPLE_TYPE_REGEXP = 4
|
48
49
|
#Compound example, like :contains => 'goodies'
|
49
50
|
EXAMPLE_TYPE_COMPOUND = 5
|
50
51
|
|
51
52
|
attr_accessor(:example_type, :parent_pattern, :temp_sink,
|
52
|
-
:constraints, :xpath, :regexp, :example, :
|
53
|
+
:constraints, :xpath, :regexp, :example, :final_result)
|
53
54
|
|
54
55
|
def self.create(parent_pattern, example=nil)
|
55
56
|
|
@@ -63,6 +64,7 @@ module Scrubyt
|
|
63
64
|
|
64
65
|
#Dispatcher method to add constraints; of course, as with any method_missing, this method
|
65
66
|
#should not be called directly
|
67
|
+
|
66
68
|
#TODO still used?
|
67
69
|
def method_missing(method_name, *args, &block)
|
68
70
|
case method_name.to_s
|
@@ -82,8 +84,6 @@ module Scrubyt
|
|
82
84
|
def initialize(parent_pattern, example)
|
83
85
|
@example_type = BaseFilter.determine_example_type(example)
|
84
86
|
@parent_pattern = parent_pattern
|
85
|
-
@sink = [] #output of a filter
|
86
|
-
@source = [] #input of a filter
|
87
87
|
@example = example
|
88
88
|
@xpath = nil #The xpath to evaluate this filter
|
89
89
|
@constraints = [] #list of constraints
|
@@ -98,9 +98,9 @@ module Scrubyt
|
|
98
98
|
case example
|
99
99
|
when nil
|
100
100
|
EXAMPLE_TYPE_CHILDREN
|
101
|
-
when /\.(jpg|png|gif|jpeg)
|
101
|
+
when /\.(jpg|png|gif|jpeg)(\[\d+\])?$/
|
102
102
|
EXAMPLE_TYPE_IMAGE
|
103
|
-
when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)
|
103
|
+
when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*(\[@.+=.+\])?(\/@.+)?$/
|
104
104
|
(example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
|
105
105
|
else
|
106
106
|
EXAMPLE_TYPE_STRING
|
@@ -3,9 +3,9 @@ module Scrubyt
|
|
3
3
|
|
4
4
|
def evaluate(source)
|
5
5
|
if source.is_a? String
|
6
|
-
|
6
|
+
@parent_pattern.evaluation_context.extractor.evaluate_subextractor(source, @parent_pattern, @parent_pattern.resolve)
|
7
7
|
else
|
8
|
-
|
8
|
+
@parent_pattern.evaluation_context.extractor.evaluate_subextractor(
|
9
9
|
XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
|
10
10
|
@parent_pattern, @parent_pattern.resolve)
|
11
11
|
end
|
@@ -20,7 +20,8 @@ private
|
|
20
20
|
return '' if source.size < 4
|
21
21
|
file_name = source.scan(/.+\/(.*)/)[0][0]
|
22
22
|
Net::HTTP.start(base_url) { |http|
|
23
|
-
|
23
|
+
puts "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
|
24
|
+
resp = http.get(source.scan(/\s*(.+)/)[0][0])
|
24
25
|
outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
|
25
26
|
FileUtils.mkdir_p @example
|
26
27
|
open(outfile, 'wb') {|f| f.write(resp.body) }
|
@@ -2,9 +2,22 @@ module Scrubyt
|
|
2
2
|
class TreeFilter < BaseFilter
|
3
3
|
|
4
4
|
def evaluate(source)
|
5
|
+
return [@final_result] if @final_result
|
6
|
+
#Crude hack! Drop it once this is supported in Hpricot
|
7
|
+
if @xpath =~ /.+\/@.+$/
|
8
|
+
@example = @xpath
|
9
|
+
@xpath = @xpath.scan(/^(.+?)\/@/)[0][0]
|
10
|
+
end
|
5
11
|
result = source/@xpath
|
6
|
-
|
7
|
-
|
12
|
+
|
13
|
+
Scrubyt.log :ACTION, "Evaluating #{@parent_pattern.name} with #{@xpath}"
|
14
|
+
|
15
|
+
xpath_results = Hpricot::Elements === result ? result : [result]
|
16
|
+
|
17
|
+
if @example =~ /.+\/@.+$/
|
18
|
+
result_attribute = @example.scan(/.+\/@(.+?)$/)[0][0]
|
19
|
+
xpath_results.map! {|r| r.attributes[result_attribute] }
|
20
|
+
end
|
8
21
|
if @regexp == nil
|
9
22
|
xpath_results
|
10
23
|
else
|
@@ -22,6 +35,8 @@ module Scrubyt
|
|
22
35
|
def generate_regexp_for_example
|
23
36
|
return if @example_type != EXAMPLE_TYPE_STRING
|
24
37
|
return if @temp_sink.nil?
|
38
|
+
return if @temp_sink.is_a? String
|
39
|
+
return if @example =~ /.+\[.+\]$/
|
25
40
|
|
26
41
|
text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_text)
|
27
42
|
match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
|
@@ -49,10 +64,14 @@ module Scrubyt
|
|
49
64
|
when EXAMPLE_TYPE_XPATH
|
50
65
|
@xpath = @example
|
51
66
|
when EXAMPLE_TYPE_STRING
|
52
|
-
@temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.evaluation_context.
|
67
|
+
@temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
|
53
68
|
@example,
|
54
69
|
next_page_example)
|
55
70
|
return if @temp_sink == nil
|
71
|
+
if @temp_sink.is_a? String
|
72
|
+
@final_result = @temp_sink
|
73
|
+
return
|
74
|
+
end
|
56
75
|
|
57
76
|
mark_changing_ranges = lambda { |element, range|
|
58
77
|
element.instance_eval do
|
@@ -63,14 +82,14 @@ module Scrubyt
|
|
63
82
|
end
|
64
83
|
}
|
65
84
|
mark_changing_ranges.call(@temp_sink, @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0))
|
66
|
-
|
67
|
-
@xpath = XPathUtils.generate_XPath(@temp_sink, nil,
|
85
|
+
write_indices = next_page_example ? true : !@parent_pattern.generalize
|
86
|
+
@xpath = XPathUtils.generate_XPath(@temp_sink, nil, write_indices)
|
68
87
|
when EXAMPLE_TYPE_CHILDREN
|
69
88
|
current_example_index = 0
|
70
89
|
loop do
|
71
90
|
all_child_temp_sinks = []
|
72
91
|
@parent_pattern.children.each do |child_pattern|
|
73
|
-
all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink
|
92
|
+
all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink if child_pattern.filters[current_example_index].temp_sink
|
74
93
|
end
|
75
94
|
result = all_child_temp_sinks.pop
|
76
95
|
if all_child_temp_sinks.empty?
|
@@ -81,7 +100,7 @@ module Scrubyt
|
|
81
100
|
end
|
82
101
|
end
|
83
102
|
xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(result, nil, false) :
|
84
|
-
|
103
|
+
XPathUtils.generate_XPath(result, nil, true)
|
85
104
|
if @parent_pattern.filters.size < current_example_index + 1
|
86
105
|
@parent_pattern.filters << Scrubyt::BaseFilter.create(@parent_pattern)
|
87
106
|
end
|
@@ -97,24 +116,30 @@ module Scrubyt
|
|
97
116
|
current_example_index += 1
|
98
117
|
end
|
99
118
|
when EXAMPLE_TYPE_IMAGE
|
100
|
-
#@temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)\
|
101
119
|
@temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.extractor.get_hpricot_doc, @example)
|
102
|
-
@xpath = XPathUtils.generate_XPath(@temp_sink, nil,
|
120
|
+
@xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
|
103
121
|
when EXAMPLE_TYPE_COMPOUND
|
104
|
-
@temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.evaluation_context.
|
122
|
+
@temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
|
105
123
|
@example,
|
106
124
|
next_page_example)
|
107
125
|
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
108
|
-
|
126
|
+
XPathUtils.generate_XPath(@temp_sink, nil, true)
|
109
127
|
end
|
110
128
|
end
|
111
129
|
|
112
130
|
def generate_relative_XPath(parent_xpath)
|
131
|
+
parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.evaluation_context.extractor.get_hpricot_doc,
|
132
|
+
parent_xpath,
|
133
|
+
@parent_pattern.parent.generalize) if parent_xpath =~ /(\[@.+=.+\])$/
|
113
134
|
@xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
|
114
135
|
end
|
115
136
|
|
116
137
|
def to_sexp
|
117
|
-
|
138
|
+
if @example =~ /.+\[@.+\]$/
|
139
|
+
[:str, "#{@xpath}/@#{@example.scan(/\[@(.+?)\]/)[0][0]}"]
|
140
|
+
else
|
141
|
+
[:str, @xpath]
|
142
|
+
end
|
118
143
|
end
|
119
144
|
|
120
145
|
end #End of class TreeFilter
|
@@ -33,7 +33,7 @@ module Scrubyt
|
|
33
33
|
# # write out the HTML subtree beginning at the matched element
|
34
34
|
# PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
|
35
35
|
|
36
|
-
VALID_PATTERN_TYPES = [:
|
36
|
+
VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree]
|
37
37
|
|
38
38
|
#The pattern can be either a model pattern (in this case it is
|
39
39
|
#written to the output) or a temp pattern (in this case it is skipped)
|
@@ -49,10 +49,11 @@ module Scrubyt
|
|
49
49
|
VALID_OUTPUT_TYPES = [:model, :temp]
|
50
50
|
|
51
51
|
#These options can be set upon wrapper creation
|
52
|
-
|
52
|
+
PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve]
|
53
|
+
VALID_OPTIONS = PATTERN_OPTIONS + Scrubyt::CompoundExample::DESCRIPTORS + Scrubyt::ResultNode::OUTPUT_OPTIONS
|
53
54
|
|
54
55
|
attr_accessor(:name, :options, :children, :constraints, :filters, :parent,
|
55
|
-
:last_result, :
|
56
|
+
:last_result, :evaluation_context,
|
56
57
|
:indices_to_extract, :referenced_extractor, :referenced_pattern,
|
57
58
|
:source_file, :source_proc, :modifier_calls)
|
58
59
|
|
@@ -60,7 +61,7 @@ module Scrubyt
|
|
60
61
|
|
61
62
|
option_reader(:type => :tree, :output_type => :model, :generalize => false,
|
62
63
|
:write_text => lambda { @children.size == 0 }, :limit => nil,
|
63
|
-
|
64
|
+
:default => nil, :resolve => :full)
|
64
65
|
|
65
66
|
def initialize(name, args=[], evaluation_context=nil, parent=nil, &block)
|
66
67
|
#init attributes
|
@@ -71,7 +72,6 @@ module Scrubyt
|
|
71
72
|
@children = []
|
72
73
|
@filters = []
|
73
74
|
@constraints = []
|
74
|
-
@result = Result.new
|
75
75
|
@modifier_calls = []
|
76
76
|
|
77
77
|
#grab any examples that are defined
|
@@ -93,9 +93,12 @@ module Scrubyt
|
|
93
93
|
end
|
94
94
|
end
|
95
95
|
|
96
|
-
#by default, generalize
|
96
|
+
#by default, generalize the root pattern, but only in the case if
|
97
97
|
#@generalize was not set up explicitly
|
98
|
-
|
98
|
+
if @options[:generalize].nil?
|
99
|
+
@options[:generalize] = true if parent.nil?
|
100
|
+
@options[:generalize] = false if filters[0].example =~ /.+\[[a-zA-Z].+\]$/
|
101
|
+
end
|
99
102
|
|
100
103
|
#parse child patterns if available
|
101
104
|
parse_child_patterns(&block) if ( !block.nil? && type != :detail_page )
|
@@ -104,7 +107,7 @@ module Scrubyt
|
|
104
107
|
if type == :tree
|
105
108
|
#generate xpaths and regexps
|
106
109
|
@filters.each do |filter|
|
107
|
-
filter.generate_XPath_for_example(false)
|
110
|
+
filter.generate_XPath_for_example(false) unless @name == 'next_page'
|
108
111
|
filter.generate_regexp_for_example
|
109
112
|
end
|
110
113
|
#when the xpaths of this pattern have been created, its children can make their xpaths relative
|
@@ -154,9 +157,6 @@ module Scrubyt
|
|
154
157
|
#an ancestor of a detail pattern , and store this in a hash if yes (to be able to
|
155
158
|
#traverse the pattern structure on detail pages as well).
|
156
159
|
def check_if_detail_page(block)
|
157
|
-
#return if !@options[:references]
|
158
|
-
#@options[:type] = :detail_page
|
159
|
-
#@referenced_extractor = @options[:references]
|
160
160
|
if @name =~ /.+_detail/
|
161
161
|
@options[:type] = :detail_page
|
162
162
|
@referenced_extractor = block
|
@@ -168,6 +168,10 @@ module Scrubyt
|
|
168
168
|
@children.inject(false) { |is_parent_of_leaf, child| is_parent_of_leaf || child.children.empty? }
|
169
169
|
end
|
170
170
|
|
171
|
+
def filter_count
|
172
|
+
@filters.size
|
173
|
+
end
|
174
|
+
|
171
175
|
def parse_child_patterns(&block)
|
172
176
|
context = Object.new
|
173
177
|
context.instance_eval do
|
@@ -177,7 +181,8 @@ module Scrubyt
|
|
177
181
|
def method_missing(method_name, *args, &block)
|
178
182
|
if method_name.to_s[0..0] == '_'
|
179
183
|
#add hash option
|
180
|
-
key =
|
184
|
+
key = method_name.to_s[1..-1].to_sym
|
185
|
+
check_option(key)
|
181
186
|
args.each do |arg|
|
182
187
|
current_value = @current.options[key]
|
183
188
|
if current_value.nil?
|
@@ -216,8 +221,6 @@ module Scrubyt
|
|
216
221
|
when 'select_indices'
|
217
222
|
@result_indexer = Scrubyt::ResultIndexer.new(*args)
|
218
223
|
return self
|
219
|
-
when /^to_/
|
220
|
-
return Scrubyt::ResultDumper.send(method_name.to_s, self)
|
221
224
|
when /^ensure_/
|
222
225
|
@constraints << Scrubyt::ConstraintAdder.send(method_name, *args)
|
223
226
|
return self #To make chaining possible
|
@@ -228,80 +231,72 @@ module Scrubyt
|
|
228
231
|
raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
|
229
232
|
end
|
230
233
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
# camera_data.item[1].item_name[0]
|
235
|
-
#
|
236
|
-
#possible. The method Pattern::method missing handles the 'item', 'item_name' etc.
|
237
|
-
#parts, while the indexing ([1], [0]) is handled by this function.
|
238
|
-
#If you would like to select a different document than the first one (which is
|
239
|
-
#the default), you should use the form:
|
240
|
-
#
|
241
|
-
# camera_data[1].item[1].item_name[0]
|
242
|
-
def [](index)
|
243
|
-
if @name == 'root'
|
244
|
-
@evaluation_context.document_index = index
|
245
|
-
else
|
246
|
-
@parent.last_result = @parent.last_result[@evaluation_context.document_index] if @parent.last_result.is_a? Array
|
247
|
-
return nil if (@result.lookup(@parent.last_result)) == nil
|
248
|
-
@last_result = @result.lookup(@parent.last_result)[index]
|
234
|
+
def evaluate(source, filter_indices)
|
235
|
+
if type == :detail_page # DIRTY!
|
236
|
+
return @filters[0].evaluate(source)
|
249
237
|
end
|
250
|
-
self
|
251
|
-
end
|
252
238
|
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
#
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
239
|
+
#we apply all filters if filter_indices is nil
|
240
|
+
indices_to_evaluate = filter_indices.nil? ? 0...@filters.size : filter_indices
|
241
|
+
#stores the results of all filters
|
242
|
+
all_filter_results = []
|
243
|
+
#remembers which filters have returned a certain result
|
244
|
+
indices_mapping = {}
|
245
|
+
#evaluate filters and collect filter results
|
246
|
+
indices_to_evaluate.each do |filter_index|
|
247
|
+
filter = @filters[filter_index]
|
248
|
+
filter_results = filter.evaluate(source)
|
249
|
+
filter_results.each do |result|
|
250
|
+
#add result to list if not already there
|
251
|
+
all_filter_results << result if all_filter_results.index(result).nil?
|
252
|
+
#add the current filter's index to the mapping
|
253
|
+
(indices_mapping[result] ||= []) << filter_index
|
254
|
+
end
|
262
255
|
end
|
263
|
-
end
|
264
|
-
|
265
|
-
def old_export(input_file, output_file_name=nil, extractor_result_file_name=nil)
|
266
|
-
contents = open(input_file).read
|
267
|
-
wrapper_name = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)[0][0]
|
268
|
-
Scrubyt::Export.export(self, wrapper_name, output_file_name, extractor_result_file_name)
|
269
|
-
end
|
270
|
-
|
271
|
-
def new_export(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
|
272
|
-
Scrubyt::Export.export(self, wrapper_name, output_file_name, extractor_result_file_name)
|
273
|
-
end
|
274
256
|
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
#
|
295
|
-
|
296
|
-
|
257
|
+
#apply constraints
|
258
|
+
if @constraints.size > 0
|
259
|
+
all_filter_results = all_filter_results.select do |result|
|
260
|
+
@constraints.inject(true) { |accepted, constraint| accepted && constraint.check(result) }
|
261
|
+
end
|
262
|
+
end
|
263
|
+
#apply indexer
|
264
|
+
all_filter_results = @result_indexer.select_indices_to_extract(all_filter_results) if !@result_indexer.nil?
|
265
|
+
|
266
|
+
#create result nodes and evaluate children
|
267
|
+
result_nodes = []
|
268
|
+
all_filter_results.each do |result|
|
269
|
+
#create result node
|
270
|
+
node = ResultNode.new(@name, result, @options)
|
271
|
+
node.generated_by_leaf = (@children.size == 0)
|
272
|
+
#evaluate children
|
273
|
+
@children.each do |child|
|
274
|
+
raise if self.filter_count != 1 && child.filter_count != self.filter_count
|
275
|
+
if self.filter_count == 1
|
276
|
+
#evaluate all child filters
|
277
|
+
node.push(*child.evaluate(result, nil))
|
278
|
+
else
|
279
|
+
#evaluate appropriate child filters
|
280
|
+
node.push(*child.evaluate(result, indices_mapping[result]))
|
297
281
|
end
|
298
282
|
end
|
283
|
+
#apply child constraints (ensure_presence_of_pattern)
|
284
|
+
required_child_names = @constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN }.map {|c| c.target}
|
285
|
+
unless required_child_names.empty?
|
286
|
+
check = lambda { |node_to_check|
|
287
|
+
required_child_names.delete node_to_check.name
|
288
|
+
node_to_check.each { |child| check.call child }
|
289
|
+
}
|
290
|
+
check.call node
|
291
|
+
end
|
292
|
+
next unless required_child_names.empty?
|
293
|
+
#add the current result node to the list
|
294
|
+
result_nodes << node
|
299
295
|
end
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
#do postprocessing
|
296
|
+
if result_nodes.empty?
|
297
|
+
result_nodes << ResultNode.new(@name,@options[:default],@options) if @options[:default]
|
298
|
+
end
|
299
|
+
result_nodes
|
305
300
|
end
|
306
301
|
|
307
302
|
def to_sexp
|
@@ -316,7 +311,7 @@ module Scrubyt
|
|
316
311
|
if type == :detail_page
|
317
312
|
#add detail page extractor
|
318
313
|
detail_root = @evaluation_context.extractor.get_detail_extractor(self)
|
319
|
-
sexp = [:iter, sexp, nil, [:block,
|
314
|
+
sexp = [:iter, sexp, nil, [:block, detail_root.to_sexp]]
|
320
315
|
else
|
321
316
|
#add child block if the pattern has children
|
322
317
|
sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
|
@@ -336,11 +331,15 @@ module Scrubyt
|
|
336
331
|
#merge provided hash
|
337
332
|
@options.merge!(hash)
|
338
333
|
#check if valid
|
339
|
-
hash.each { |key, value|
|
334
|
+
hash.each { |key, value| check_option(key.to_sym) }
|
340
335
|
raise "Invalid pattern type: #{type.to_s}" if VALID_PATTERN_TYPES.index(type.to_sym).nil?
|
341
336
|
raise "Invalid output type: #{output_type.to_s}" if VALID_OUTPUT_TYPES.index(output_type.to_sym).nil?
|
342
337
|
end
|
343
338
|
|
339
|
+
def check_option(option)
|
340
|
+
raise "Unknown pattern option: #{option.to_s}" if VALID_OPTIONS.index(option).nil?
|
341
|
+
end
|
342
|
+
|
344
343
|
def look_for_examples(args)
|
345
344
|
if (args[0].is_a? String)
|
346
345
|
examples = args.select {|e| e.is_a? String}
|
@@ -370,12 +369,5 @@ module Scrubyt
|
|
370
369
|
examples
|
371
370
|
end
|
372
371
|
|
373
|
-
def add_result(filter, source, results)
|
374
|
-
results.each do |res|
|
375
|
-
filter.sink << res
|
376
|
-
@result.add_result(source, res)
|
377
|
-
end
|
378
|
-
end
|
379
|
-
|
380
372
|
end #end of class Pattern
|
381
373
|
end #end of module Scrubyt
|