sutch-scrubyt 0.4.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/CHANGELOG +350 -0
  2. data/COPYING +340 -0
  3. data/README +121 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt.rb +45 -0
  6. data/lib/scrubyt/core/navigation/agents/firewatir.rb +253 -0
  7. data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
  8. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  9. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  10. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  11. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  12. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  13. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  14. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  15. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  16. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  17. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  18. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  19. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  20. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  21. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  22. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  23. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  24. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  25. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  26. data/lib/scrubyt/core/shared/extractor.rb +168 -0
  27. data/lib/scrubyt/logging.rb +154 -0
  28. data/lib/scrubyt/output/post_processor.rb +139 -0
  29. data/lib/scrubyt/output/result.rb +44 -0
  30. data/lib/scrubyt/output/result_dumper.rb +154 -0
  31. data/lib/scrubyt/output/result_node.rb +140 -0
  32. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  33. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  34. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  35. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  36. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  37. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +117 -0
@@ -0,0 +1,9 @@
1
+ module Scrubyt
2
+ class HtmlSubtreeFilter < BaseFilter
3
+
4
#Return the inner HTML of +source+ as-is.
#
#source:: any object responding to +inner_html+
def evaluate(source)
  subtree_html = source.inner_html
  subtree_html
end
7
+
8
+ end #End of class HtmlSubtreeFilter
9
+ end #End of module Scrubyt
@@ -0,0 +1,13 @@
1
+ module Scrubyt
2
+ class RegexpFilter < BaseFilter
3
+
4
#Scan +source+ with the regexp stored in @example and return all matches
#(capture groups included) as one flat array.
#
#source:: either a String, which is scanned directly, or an object responding
#         to +inner_html+, whose markup is stripped of <...> tags first
def evaluate(source)
  haystack = source.is_a?(String) ? source : source.inner_html.gsub(/<.*?>/, '')
  haystack.scan(@example).flatten
end
11
+
12
+ end #End of class RegexpFilter
13
+ end #End of module Scrubyt
@@ -0,0 +1,11 @@
1
+ module Scrubyt
2
+ class ScriptFilter < BaseFilter
3
+
4
#Invoke the callable stored in @example and return whatever it returns.
#
#source:: a String is handed to the callable unchanged; any other object is
#         expected to respond to +inner_html+, and the callable receives that
#         markup with every <...> tag removed
def evaluate(source)
  argument = if source.is_a?(String)
    source
  else
    source.inner_html.gsub(/<.*?>/, "")
  end
  @example.call(argument)
end
9
+
10
+ end #End of class ScriptFilter
11
+ end #End of module Scrubyt
@@ -0,0 +1,34 @@
1
+ module Scrubyt
2
+ class TextFilter < BaseFilter
3
+
4
#Evaluate a text example against +source+.
#
#Two example formats are understood (both read from @example):
#
# find(a|b|c)        - delegated to find_string
# name[text]         - take the first node whose text matches +text+
# name[text]:N       - take the match with (zero based) index N
# name[text]:all     - take every match
#
#+text+ is matched literally (it is regexp-escaped). For every selected match
#the result of XPathUtils.traverse_up_until_name(match, name) is returned.
#
#Returns an array of those results. When there is no match at the requested
#index an empty String (not an array) is returned; that is the historical
#return value and is kept for backward compatibility.
def evaluate(source)
  return find_string(source) if @example =~ /^find\(/

  final_element_name = @example.scan(/^(.+?)\[/)[0][0]
  text = Regexp.escape(@example.scan(/\[(.+?)\]/)[0][0])

  #nil when no ":index" suffix was given, otherwise "all" or the index text
  index_spec = @example.scan(/\]:(.+)/).flatten[0]

  matches = SharedUtils.traverse_for_match(source, /#{text}/)
  return "" unless matches

  if index_spec == "all"
    return matches.map { |match| XPathUtils.traverse_up_until_name(match, final_element_name) }
  end

  #Integer is used instead of the removed Fixnum; a missing or non-numeric
  #index falls back to 0 (nil.to_i and "abc".to_i are both 0)
  match = matches[index_spec.to_i]
  return "" unless match

  [XPathUtils.traverse_up_until_name(match, final_element_name)]
end
21
+
22
#Handle examples of the form find(a|b|c).
#
#The alternatives between the parentheses are tried from left to right; each
#one is used as a case-insensitive regexp (it is not escaped) and handed to
#SharedUtils.traverse_for_match together with +source+.
#
#Returns a one-element array holding the first alternative that produced a
#non-empty match list, or an empty array if none did.
def find_string(source)
  alternatives = @example.scan(/find\((.+)\)/).flatten[0]
  candidates = alternatives.include?('|') ? alternatives.split('|') : [alternatives]
  hit = candidates.find do |candidate|
    !SharedUtils.traverse_for_match(source, /#{candidate}/i).empty?
  end
  hit ? [hit] : []
end
31
+
32
+ end #End of class TextFilter
33
+ end #End of module Scrubyt
34
+
@@ -0,0 +1,138 @@
1
+ module Scrubyt
2
+ class TreeFilter < BaseFilter
3
+
4
#Evaluate this filter's XPath against +source+ and return the list of results.
#
#* If @final_result is set, it is returned (wrapped in an array) right away.
#* An XPath ending in "/@attr" is split: the element part is evaluated and the
#  named attribute of every hit is returned instead of the hit itself.
#* If @regexp is set, every hit is matched against it individually and the
#  first capture group of each successful match is returned.
#
#source:: object supporting <tt>source / xpath</tt> (an Hpricot document/element)
def evaluate(source)
  return [@final_result] if @final_result

  #Crude hack! Drop it after it will be supported in Hpricot
  if @xpath =~ /.+\/@.+$/
    @example = @xpath
    @xpath = @xpath.scan(/^(.+?)\/@/)[0][0]
  end
  result = source / @xpath

  Scrubyt.log :ACTION, "Evaluating #{@parent_pattern.name} with #{@xpath}"

  xpath_results = Hpricot::Elements === result ? result : [result]

  #only String examples can describe an attribute; guarding the type keeps
  #non-String examples from being matched against a regexp
  if @example.is_a?(String) && @example =~ /.+\/@.+$/
    result_attribute = @example.scan(/.+\/@(.+?)$/)[0][0]
    xpath_results.map! { |r| r.attributes[result_attribute] }
  end
  return xpath_results if @regexp.nil?

  regexp_results = []
  xpath_results.each do |entry|
    #match the text of this entry (previously the whole result set was used
    #on every iteration, so every entry yielded the same capture)
    raw = entry.respond_to?(:inner_html) ? entry.inner_html : entry.to_s
    match = @regexp.match(SharedUtils.prepare_text_for_comparison(raw))
    regexp_results << match[1] if match
  end
  regexp_results
end
34
+
35
#Derive @regexp from a string example that covers only part of the text of
#the node stored in @temp_sink.
#
#Nothing happens (and nil is returned) unless the example type is
#EXAMPLE_TYPE_STRING, @temp_sink is a node (neither nil nor a String), the
#example does not end in "[...]" and the matched range differs from
#0..text.length.
#
#Otherwise every range in @temp_sink.changing_ranges is cut out of the tag-free
#node text: the range equal to the matched one becomes the capture group
#"(.*?)", every other range becomes ".*?". The rest of the text is
#regexp-escaped and the whole expression is anchored with ^ and $.
def generate_regexp_for_example
  return unless @example_type == EXAMPLE_TYPE_STRING
  return if @temp_sink.nil? || @temp_sink.is_a?(String)
  return if @example =~ /.+\[.+\]$/

  text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_html.gsub(/<.*?>/, ''))
  match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
  return if match_range == (0..text.length)

  @regexp = text
  #replace from the back so that earlier ranges keep their positions
  @temp_sink.changing_ranges.sort.reverse.each do |range|
    placeholder = range == match_range ? '<<<regexp_selection>>>' : '<<<regexp_changing>>>'
    @regexp[range] = placeholder
  end

  pattern_source = Regexp.escape(@regexp)
  pattern_source = pattern_source.gsub('<<<regexp_changing>>>', '.*?')
  pattern_source = pattern_source.gsub('<<<regexp_selection>>>', '(.*?)')
  @regexp = Regexp.new("^#{pattern_source}$")
end
55
+
56
+
57
+ #For all the tree patterns, generate an XPath based on the given example
58
+ #Also this method should not be called directly; It is automatically called for every tree
59
+ #pattern directly after wrapper definition
60
#Generate @xpath from the example of this filter. What happens depends on
#@example_type:
#
#EXAMPLE_TYPE_XPATH::    the example is used as the XPath verbatim
#EXAMPLE_TYPE_STRING::   the node containing the example text is looked up via
#                        SimpleExampleLookup; a String lookup result is stored
#                        as @final_result, a node gets a +changing_ranges+
#                        reader seeded with the matched range and is turned
#                        into an XPath
#EXAMPLE_TYPE_CHILDREN:: for every example index the lowest common ancestor of
#                        the children's nodes is computed (the parent of the
#                        node if there is only one); its XPath is stored on
#                        the parent pattern's filter and the children receive
#                        XPaths relative to it
#EXAMPLE_TYPE_IMAGE::    the node is looked up with XPathUtils.find_image
#EXAMPLE_TYPE_COMPOUND:: the node is looked up via CompoundExampleLookup
#
#Indices are written into the generated XPath unless the parent pattern
#generalizes (for string examples: always when +next_page_example+ is true).
#
#This method should not be called directly; it is called for every tree
#pattern after the wrapper definition.
def generate_XPath_for_example(next_page_example=false)
  case @example_type
  when EXAMPLE_TYPE_XPATH
    @xpath = @example
  when EXAMPLE_TYPE_STRING
    document = @parent_pattern.extractor.get_hpricot_doc
    @temp_sink = SimpleExampleLookup.find_node_from_text(document, @example, next_page_example)
    return if @temp_sink == nil
    if @temp_sink.is_a? String
      @final_result = @temp_sink
      return
    end

    matched_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
    #give the node a per-object list of changing ranges plus a reader for it
    @temp_sink.instance_eval do
      @changing_ranges ||= [matched_range]
      def changing_ranges
        @changing_ranges
      end
    end

    write_indices = true
    write_indices = !@parent_pattern.generalize unless next_page_example
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, write_indices)
  when EXAMPLE_TYPE_CHILDREN
    example_index = 0
    loop do
      #nodes found for the children's examples with the current index
      child_sinks = @parent_pattern.children.map { |child_pattern|
        child_pattern.filters[example_index].temp_sink
      }.select { |sink| sink }

      common_node = child_sinks.pop
      common_node = if child_sinks.empty?
        common_node.parent
      else
        child_sinks.inject(common_node) do |ancestor, sink|
          XPathUtils.lowest_common_ancestor(ancestor, sink)
        end
      end

      xpath = XPathUtils.generate_XPath(common_node, nil, !@parent_pattern.generalize)
      if @parent_pattern.filters.size <= example_index
        @parent_pattern.filters << Scrubyt::BaseFilter.create(@parent_pattern)
      end
      own_filter = @parent_pattern.filters[example_index]
      own_filter.xpath = xpath
      own_filter.temp_sink = common_node

      @parent_pattern.children.each do |child_pattern|
        next if child_pattern.type == :detail_page
        child_filter = child_pattern.filters[example_index]
        child_filter.xpath = if child_pattern.generalize
          XPathUtils.generate_generalized_relative_XPath(child_filter.temp_sink, common_node)
        else
          XPathUtils.generate_relative_XPath(child_filter.temp_sink, common_node)
        end
      end

      #the first child determines how many examples there are
      break if @parent_pattern.children[0].filters.size == example_index + 1
      example_index += 1
    end
  when EXAMPLE_TYPE_IMAGE
    @temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
  when EXAMPLE_TYPE_COMPOUND
    document = @parent_pattern.extractor.get_hpricot_doc
    @temp_sink = CompoundExampleLookup.find_node_from_compund_example(document, @example, next_page_example)
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, !@parent_pattern.generalize)
  end
end
129
+
130
#Make @xpath relative to +parent_xpath+.
#
#A parent XPath ending in an attribute condition ("[@name=value]") is first
#expanded with XPathUtils.to_full_XPath. @xpath is only rewritten when it
#starts with "/html"; otherwise it is left alone and nil is returned.
def generate_relative_XPath(parent_xpath)
  if parent_xpath =~ /(\[@.+=.+\])$/
    parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
                                            parent_xpath,
                                            @parent_pattern.parent.generalize)
  end
  #TODO: should not rely on <html> being the root node
  return unless @xpath =~ /^\/html/
  @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath)
end
136
+
137
+ end #End of class TreeFilter
138
+ end #End of module Scrubyt
@@ -0,0 +1,359 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+
4
+ module Scrubyt
5
+ ##
6
+ #=<tt>Group more filters into one</tt>
7
+ #
8
+ #Serves as an umbrella for filters which are conceptually extracting
9
+ #the same thing - for example a price or a title or ...
10
+ #
11
+ #Sometimes the same piece of information can not be extracted with one filter
12
+ #across more result instances (for example a price has an XPath in record n,
13
+ #but since in record n+1 has a discount price as well, the real price is pushed
14
+ #to a different XPath etc) - in this case the more filters which extract the same
15
+ #thing are held in the same pattern.
16
+ class Pattern
17
+ #Type of the pattern;
18
+
19
+ # TODO: Update documentation
20
+
21
+ # # a root pattern represents a (surprise!) root pattern
22
+ # PATTERN_TYPE_ROOT = :PATTERN_TYPE_ROOT
23
+ # # a tree pattern represents a HTML region
24
+ # PATTERN_TYPE_TREE = :PATTERN_TYPE_TREE
25
+ # # represents an attribute of the node extracted by the parent pattern
26
+ # PATTERN_TYPE_ATTRIBUTE = :PATTERN_TYPE_ATTRIBUTE
27
+ # # represents a pattern which filters its output with a regexp
28
+ # PATTERN_TYPE_REGEXP = :PATTERN_TYPE_REGEXP
29
+ # # represents a pattern which crawls to the detail page and extracts information from there
30
+ # PATTERN_TYPE_DETAIL_PAGE = :PATTERN_TYPE_DETAIL_PAGE
31
+ # # represents a download pattern
32
+ # PATTERN_TYPE_DOWNLOAD = :PATTERN_TYPE_DOWNLOAD
33
+ # # write out the HTML subtree beginning at the matched element
34
+ # PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
35
+
36
#Every value accepted for the :type option
VALID_PATTERN_TYPES = [
  :tree, :attribute, :regexp, :detail_page, :download,
  :html_subtree, :constant, :script, :text
]

#Every value accepted for the :example_type option
# :determine - default value; the type of the example still has to be determined
# :xpath - NOTE(review): the previous comment documented ":string" (a node with
#          example type EXAMPLE_TYPE_STRING) here, which is not in the list;
#          presumably :xpath marks the example as an XPath - confirm
VALID_PATTERN_EXAMPLE_TYPES = [:determine, :xpath]

#A pattern can be either a model pattern (it is written to the output) or a
#temp pattern (it is skipped).
#Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
#is considered to be a model pattern

#Model patterns are shown in the output
# OUTPUT_TYPE_MODEL = :OUTPUT_TYPE_MODEL
# #Temp patterns are skipped in the output (their ancestors are appended to the
# #parent of the pattern which was skipped)
# OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP

#Every value accepted for the :output_type option
VALID_OUTPUT_TYPES = [:model, :temp, :page_list]

#These options can be set upon wrapper creation
PATTERN_OPTIONS = [
  :generalize, :type, :output_type, :references, :limit,
  :default, :resolve, :except, :example_type
]
VALID_OPTIONS = PATTERN_OPTIONS +
                Scrubyt::CompoundExample::DESCRIPTORS +
                Scrubyt::ResultNode::OUTPUT_OPTIONS

attr_accessor :name, :options, :children, :constraints, :filters, :parent, :extractor,
              :indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls

attr_reader :next_page_url, :result_indexer

#Readers for the options, each with the value used when the option is not set
option_reader(:type => :tree, :output_type => :model, :generalize => false,
              :write_text => lambda { @children.size == 0 }, :limit => nil,
              :default => nil, :resolve => :full, :except => [], :example_type => :determine)
67
+
68
#Build a pattern.
#
#name::      name of the pattern; 'page_list' and 'next_page' get special
#            treatment here, as do the name suffixes handled by
#            check_if_shortcut_pattern and check_if_detail_page
#args::      the examples, optionally followed by an options hash
#extractor:: the extractor this pattern belongs to
#parent::    the parent pattern; nil for a root pattern
#block::     definition of the child patterns (for a detail page pattern the
#            block is stored by check_if_detail_page instead)
def initialize(name, args=[], extractor=nil, parent=nil, &block)
  #init attributes
  @name, @extractor, @parent = name, extractor, parent
  @options = {}
  @children, @filters, @constraints, @modifier_calls = [], [], [], []

  #grab any examples that are defined
  examples = look_for_examples(args)

  #parse the options hash if provided
  trailing_argument = args[-1]
  parse_options_hash(trailing_argument) if trailing_argument.is_a? Hash

  #perform checks for special cases
  examples = check_if_shortcut_pattern() if examples.nil?
  check_if_detail_page(block)
  @options[:output_type] = :page_list if name == 'page_list'

  #create one filter per example, or a single default filter
  if examples.nil?
    @filters << Scrubyt::BaseFilter.create(self)
  else
    examples.each { |example| @filters << Scrubyt::BaseFilter.create(self, example) }
  end

  #by default, generalize the root pattern, but only in the case if
  #:generalize was not set up explicitly
  if @options[:generalize].nil?
    @options[:generalize] = true if parent.nil?
    first_example = filters[0].example
    if first_example.is_a?(String) && first_example =~ /.+\[[a-zA-Z].+\]$/
      @options[:generalize] = false
    end
  end

  #parse child patterns if available
  parse_child_patterns(&block) unless block.nil? || type == :detail_page

  #tree pattern only (TODO: subclass?)
  if type == :tree
    #generate xpaths and regexps
    skip_xpath_generation = (@name == 'next_page')
    @filters.each do |filter|
      filter.generate_XPath_for_example(false) unless skip_xpath_generation
      filter.generate_regexp_for_example
    end
    #when the xpaths of this pattern have been created, its children can make their xpaths relative
    xpaths = @filters.map { |filter| filter.xpath }
    @children.each { |child| child.generate_relative_XPaths(xpaths) }
  end
end
123
+
124
#Ask every filter of this pattern to make its XPath relative to the XPath of
#the parent. Does nothing unless this is a :tree pattern.
#
#parent_xpaths:: either a single XPath, which is then shared by all filters,
#                or exactly one XPath per filter
#
#Raises ArgumentError for any other number of parent XPaths.
def generate_relative_XPaths(parent_xpaths)
  return unless type == :tree
  shared_parent = parent_xpaths.size == 1
  #TODO: should be checked earlier with proper error message
  raise ArgumentError.new unless shared_parent || parent_xpaths.size == @filters.size
  @filters.each_with_index do |filter, position|
    filter.generate_relative_XPath(shared_parent ? parent_xpaths[0] : parent_xpaths[position])
  end
end
131
+
132
+ #Shortcut patterns, as their name says, are a shortcut for creating patterns
133
+ #from predefined rules; for example:
134
+ #
135
+ # detail_url
136
+ #
137
+ # is equivalent to
138
+ #
139
+ # detail_url 'href', type => :attribute
140
+ #
141
+ #i.e. the system figures out on its own that because of the postfix, the
142
+ #example should be looked up (but it should never override the user input!)
143
+ #another example (will be available later):
144
+ #
145
+ # every_img
146
+ #
147
+ # is equivalent to
148
+ #
149
+ # every_img '//img'
150
+ #
151
#Recognize shortcut patterns by name. A name ending in (or containing)
#"_url" after at least one other character turns the pattern into an
#:attribute pattern.
#
#Returns the examples implied by the shortcut (['href']) or nil if the name is
#not a shortcut.
def check_if_shortcut_pattern()
  return nil unless @name =~ /.+_url/
  @options[:type] = :attribute
  ['href']
end
157
+
158
+ #Check whether the currently created pattern is a detail pattern (i.e. it references
159
+ #a subextractor). Also check if the currently created pattern is
160
+ #an ancestor of a detail pattern , and store this in a hash if yes (to be able to
161
+ #traverse the pattern structure on detail pages as well).
162
#Turn the pattern into a :detail_page pattern if its name contains "_detail"
#preceded by at least one character; in that case +block+ is remembered as
#the referenced extractor. Other names leave the pattern untouched.
def check_if_detail_page(block)
  return unless @name =~ /.+_detail/
  @options[:type] = :detail_page
  @referenced_extractor = block
end
168
+
169
#True if at least one child of this pattern has no children of its own.
def parent_of_leaf
  @children.any? { |child| child.children.empty? }
end
172
+
173
#Number of filters held by this pattern.
def filter_count
  @filters.length
end
176
+
177
#Evaluate the block of a pattern definition.
#
#The block is instance_eval'ed on a throw-away context object whose
#method_missing interprets every call made inside the block:
#
#* a call starting with an underscore (e.g. <tt>_limit 5</tt>) sets the option
#  named after the rest of the method name on this pattern; the option name is
#  validated with check_option, and passing several values (in one or in
#  repeated calls) collects them in an array
#* any other call creates a child pattern with that name, appends it to this
#  pattern's children and returns it
#
#Returns the value of the block.
def parse_child_patterns(&block)
  context = Object.new
  context.instance_eval do
    def current=(value)
      @current = value
    end
    def method_missing(method_name, *args, &block)
      if method_name.to_s[0..0] == '_'
        #add hash option
        key = method_name.to_s[1..-1].to_sym
        #check_option is a private method of the pattern, not of this context;
        #calling it unqualified would re-enter method_missing and create a
        #child pattern called "check_option"
        @current.send(:check_option, key)
        args.each do |arg|
          current_value = @current.options[key]
          if current_value.nil?
            @current.options[key] = arg
          else
            @current.options[key] = [current_value] unless current_value.is_a?(Array)
            @current.options[key] << arg
          end
        end
      else
        #create child pattern
        child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
        @current.children << child
        child
      end
    end
  end
  context.current = self
  context.instance_eval(&block)
end
208
+
209
+ #Dispatcher function; The class was already too big so I have decided to factor
210
+ #out some methods based on their functionality (like output, adding constraints)
211
+ #to utility classes.
212
+ #
213
+ #The second function besides dispatching is to lookup the results in an evaluated
214
+ #wrapper, for example
215
+ #
216
+ # camera_data.item[1].item_name[0]
217
#Dispatcher for calls the pattern does not implement itself:
#
#select_indices:: installs a Scrubyt::ResultIndexer built from the arguments
#                 and returns the pattern
#ensure_*::       forwards to Scrubyt::ConstraintAdder, stores the returned
#                 constraint and returns the pattern (to make chaining possible)
#anything else::  is looked up among the children by name, which allows
#                 navigating an evaluated wrapper, for example
#
#  camera_data.item[1].item_name[0]
#
#While the extractor is evaluating its definition every call is additionally
#recorded in @modifier_calls.
#
#Raises NoMethodError if no child carries the requested name.
def method_missing(method_name, *args, &block)
  requested = method_name.to_s

  if @extractor.evaluating_extractor_definition
    literals = args.map { |arg| [:lit, arg] }
    @modifier_calls << [method_name, [:array, *literals]]
  end

  if requested == 'select_indices'
    @result_indexer = Scrubyt::ResultIndexer.new(*args)
    return self
  elsif requested =~ /^ensure_/
    @constraints << Scrubyt::ConstraintAdder.send(method_name, *args)
    return self
  end

  matching_child = @children.find { |child| child.name == requested }
  return matching_child if matching_child

  raise NoMethodError.new(requested, requested, args)
end
235
+
236
#Evaluate the pattern (and, recursively, its children) on +source+.
#
#source::         the document or node the filters are applied to
#filter_indices:: indices of the filters to apply; nil applies all of them
#
#Steps: collect the distinct results of the selected filters (remembering
#which filter produced which result), drop results rejected by a constraint,
#apply the result indexer if there is one, then wrap every remaining result in
#a ResultNode and attach the results of the children. A node is dropped if a
#pattern required by an ensure-presence-of-pattern constraint does not occur
#in it. If nothing is left and a :default option exists, a single node holding
#the default is produced.
#
#Returns the result nodes for output type :model. For :page_list the nodes are
#handed to the extractor and an empty array is returned. For any other output
#type nil is returned. A :detail_page pattern simply returns the result of its
#first filter.
def evaluate(source, filter_indices)
  return @filters[0].evaluate(source) if type == :detail_page # DIRTY!

  #we apply all filters if filter_indices is nil
  selected_indices = filter_indices.nil? ? (0...@filters.size) : filter_indices

  #distinct results of all filters, in order of first appearance
  unique_results = []
  #remembers which filters have returned a certain result
  producers = {}
  selected_indices.each do |position|
    @filters[position].evaluate(source).each do |found|
      unique_results << found unless unique_results.include?(found)
      (producers[found] ||= []) << position
    end
  end

  #apply constraints
  unless @constraints.empty?
    unique_results = unique_results.select do |found|
      @constraints.all? { |constraint| constraint.check(found) }
    end
  end
  #apply indexer
  unique_results = @result_indexer.select_indices_to_extract(unique_results) unless @result_indexer.nil?

  #create result nodes and evaluate children
  result_nodes = []
  unique_results.each do |found|
    node = ResultNode.new(@name, found, @options)
    node.generated_by_leaf = @children.empty?

    @children.each do |child|
      raise unless filter_count == 1 || child.filter_count == filter_count
      #with a single filter all child filters are evaluated, otherwise only
      #those belonging to the filters which produced this result
      child_indices = filter_count == 1 ? nil : producers[found]
      node.push(*child.evaluate(found, child_indices))
    end

    #apply child constraints (ensure_presence_of_pattern)
    missing_names = @constraints.select { |constraint|
      constraint.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
    }.map { |constraint| constraint.target }
    unless missing_names.empty?
      #walk the node tree and tick off every required name that is found
      visit = lambda do |visited|
        missing_names.delete visited.name
        visited.each { |descendant| visit.call descendant }
      end
      visit.call node
    end

    #add the current result node to the list
    result_nodes << node if missing_names.empty?
  end

  if result_nodes.empty? && @options[:default]
    result_nodes << ResultNode.new(@name, @options[:default], @options)
  end

  case output_type
  when :model
    result_nodes
  when :page_list
    result_nodes.each { |result_node| @extractor.add_to_next_page_list result_node }
    []
  end
end
311
+
312
+ private
313
#Merge +hash+ into the options of the pattern and validate the outcome.
#
#Every key of +hash+ is passed (as a symbol) to check_option; afterwards the
#resulting pattern type, output type and example type have to be members of
#VALID_PATTERN_TYPES, VALID_OUTPUT_TYPES and VALID_PATTERN_EXAMPLE_TYPES.
#A RuntimeError naming the offending value is raised otherwise.
def parse_options_hash(hash)
  #merge provided hash
  @options.merge!(hash)
  #check if valid
  hash.each_key { |key| check_option(key.to_sym) }
  raise "Invalid pattern type: #{type.to_s}" unless VALID_PATTERN_TYPES.include?(type.to_sym)
  raise "Invalid output type: #{output_type.to_s}" unless VALID_OUTPUT_TYPES.include?(output_type.to_sym)
  raise "Invalid example type: #{example_type.to_s}" unless VALID_PATTERN_EXAMPLE_TYPES.include?(example_type.to_sym)
end
322
+
323
#Raise a RuntimeError unless +option+ is listed in VALID_OPTIONS.
def check_option(option)
  return if VALID_OPTIONS.include?(option)
  raise "Unknown pattern option: #{option.to_s}"
end
326
+
327
#Extract the examples from the arguments a pattern was defined with.
#
#The class of the first argument decides how the arguments are read:
#
#String:: every String argument is an example
#Regexp:: every Regexp argument is an example; the pattern type is set to :regexp
#Hash::   every Hash accepted by CompoundExample.compound_example? is an example
#Proc::   the first argument is the only example
#
#String and Regexp examples have to be the leading arguments; for every
#argument among the first examples.size ones that is of a different class a
#'FATAL' message is printed (no exception is raised).
#
#Returns the array of examples, or nil if there are none. Also records in
#@has_examples whether any example was found.
def look_for_examples(args)
  examples = nil
  first = args[0]

  if first.is_a?(String) || first.is_a?(Regexp)
    example_class = first.is_a?(String) ? String : Regexp
    examples = args.select { |e| e.is_a? example_class }
    #Check if all the examples are really the first parameters. The range is
    #exclusive: exactly examples.size leading arguments are inspected, so a
    #trailing options hash is not reported as a misplaced example
    args[0...examples.size].each do |example|
      puts 'FATAL: Problem with example specification' unless example.is_a? example_class
    end
    @options[:type] = :regexp if example_class == Regexp
  elsif first.is_a? Hash
    examples = args.select { |e| e.is_a?(Hash) && CompoundExample.compound_example?(e) }
    examples = nil if examples == []
  elsif first.is_a? Proc
    examples = [first]
  end

  @has_examples = !examples.nil?
  examples
end
357
+
358
+ end #end of class Pattern
359
+ end #end of module Scrubyt