scrubyt 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. data/CHANGELOG +59 -12
  2. data/Rakefile +2 -2
  3. data/lib/scrubyt.rb +24 -6
  4. data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
  5. data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
  6. data/lib/scrubyt/core/scraping/constraint.rb +53 -57
  7. data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
  8. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
  9. data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
  10. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
  11. data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
  13. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
  14. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
  15. data/lib/scrubyt/core/scraping/pattern.rb +292 -157
  16. data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
  17. data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
  18. data/lib/scrubyt/core/shared/extractor.rb +122 -163
  19. data/lib/scrubyt/output/export.rb +59 -174
  20. data/lib/scrubyt/output/post_processor.rb +4 -3
  21. data/lib/scrubyt/output/result.rb +8 -9
  22. data/lib/scrubyt/output/result_dumper.rb +81 -42
  23. data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
  24. data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
  25. data/lib/scrubyt/utils/shared_utils.rb +39 -26
  26. data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
  27. data/lib/scrubyt/utils/xpathutils.rb +31 -30
  28. data/test/unittests/constraint_test.rb +11 -7
  29. data/test/unittests/extractor_test.rb +6 -6
  30. data/test/unittests/filter_test.rb +66 -66
  31. metadata +22 -15
  32. data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -0,0 +1,7 @@
1
+ module Scrubyt
2
+ class HTMLSubTreeFilter < BaseFilter
3
+
4
+ def evaluate(source)
5
+ end #end of method
6
+ end #End of class HTMLSubTreeFilter
7
+ end #End of module Scrubyt
@@ -0,0 +1,17 @@
1
+ module Scrubyt
2
+ class RegexpFilter < BaseFilter
3
+
4
+ def evaluate(source)
5
+ if source.is_a? String
6
+ source.scan(@example).flatten
7
+ else
8
+ source.inner_text.scan(@example).flatten
9
+ end
10
+ end
11
+
12
+ def to_sexp
13
+ [:lit, @example]
14
+ end
15
+
16
+ end #End of class RegexpFilter
17
+ end #End of module Scrubyt
@@ -0,0 +1,121 @@
1
+ module Scrubyt
2
+ class TreeFilter < BaseFilter
3
+
4
+ def evaluate(source)
5
+ result = source/@xpath
6
+ #puts "Evaluating #{@parent_pattern.name} with #{@xpath}"
7
+ xpath_results = if result.class == Hpricot::Elements then result.map else [result] end
8
+ if @regexp == nil
9
+ xpath_results
10
+ else
11
+ regexp_results = []
12
+ xpath_results.each do |entry|
13
+ text = SharedUtils.prepare_text_for_comparison(result.inner_html)
14
+ if text =~ @regexp
15
+ regexp_results << $1
16
+ end
17
+ end
18
+ regexp_results
19
+ end
20
+ end
21
+
22
+ def generate_regexp_for_example
23
+ return if @example_type != EXAMPLE_TYPE_STRING
24
+ return if @temp_sink.nil?
25
+
26
+ text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_text)
27
+ match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
28
+ return if match_range == (0..text.length)
29
+
30
+ @regexp = text
31
+ @temp_sink.changing_ranges.sort.reverse.each do |range|
32
+ @regexp[range] = if range == match_range then '<<<regexp_selection>>>' else '<<<regexp_changing>>>' end
33
+ end
34
+ @regexp = Regexp.escape(@regexp)
35
+ @regexp = @regexp.gsub('<<<regexp_changing>>>', '.*?')
36
+ @regexp = @regexp.gsub('<<<regexp_selection>>>', '(.*?)')
37
+ @regexp = '^' + @regexp + '$'
38
+ @regexp = /#{@regexp}/
39
+ end
40
+
41
+
42
+ #For all the tree patterns, generate an XPath based on the given example
43
+ #Also this method should not be called directly; It is automatically called for every tree
44
+ #pattern directly after wrapper definition
45
+ def generate_XPath_for_example(next_page_example=false)
46
+ #puts "generating example for: #{@parent_pattern.name}"
47
+ #puts @example_type
48
+ case @example_type
49
+ when EXAMPLE_TYPE_XPATH
50
+ @xpath = @example
51
+ when EXAMPLE_TYPE_STRING
52
+ @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
53
+ @example,
54
+ next_page_example)
55
+ return if @temp_sink == nil
56
+
57
+ mark_changing_ranges = lambda { |element, range|
58
+ element.instance_eval do
59
+ @changing_ranges ||= [] << range
60
+ def changing_ranges
61
+ @changing_ranges
62
+ end
63
+ end
64
+ }
65
+ mark_changing_ranges.call(@temp_sink, @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0))
66
+
67
+ @xpath = XPathUtils.generate_XPath(@temp_sink, nil, !@parent_pattern.generalize)
68
+ when EXAMPLE_TYPE_CHILDREN
69
+ current_example_index = 0
70
+ loop do
71
+ all_child_temp_sinks = []
72
+ @parent_pattern.children.each do |child_pattern|
73
+ all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink
74
+ end
75
+ result = all_child_temp_sinks.pop
76
+ if all_child_temp_sinks.empty?
77
+ result = result.parent
78
+ else
79
+ all_child_temp_sinks.each do |child_sink|
80
+ result = XPathUtils.lowest_common_ancestor(result, child_sink)
81
+ end
82
+ end
83
+ xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(result, nil, false) :
84
+ XPathUtils.generate_XPath(result, nil, true)
85
+ if @parent_pattern.filters.size < current_example_index + 1
86
+ @parent_pattern.filters << Scrubyt::BaseFilter.create(@parent_pattern)
87
+ end
88
+ @parent_pattern.filters[current_example_index].xpath = xpath
89
+ @parent_pattern.filters[current_example_index].temp_sink = result
90
+ @parent_pattern.children.each do |child_pattern|
91
+ next if child_pattern.type == :detail_page
92
+ child_pattern.filters[current_example_index].xpath =
93
+ child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) :
94
+ XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result)
95
+ end
96
+ break if @parent_pattern.children[0].filters.size == current_example_index + 1
97
+ current_example_index += 1
98
+ end
99
+ when EXAMPLE_TYPE_IMAGE
100
+ #@temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)\
101
+ @temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.extractor.get_hpricot_doc, @example)
102
+ @xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
103
+ when EXAMPLE_TYPE_COMPOUND
104
+ @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
105
+ @example,
106
+ next_page_example)
107
+ @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
108
+ XPathUtils.generate_XPath(@temp_sink, nil, true)
109
+ end
110
+ end
111
+
112
+ def generate_relative_XPath(parent_xpath)
113
+ @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
114
+ end
115
+
116
+ def to_sexp
117
+ [:str, @xpath]
118
+ end
119
+
120
+ end #End of class TreeFilter
121
+ end #End of module Scrubyt
@@ -5,129 +5,234 @@ module Scrubyt
5
5
  ##
6
6
  #=<tt>Group more filters into one</tt>
7
7
  #
8
- #Server as an umbrella for filters which are conceptually extracting
8
+ #Server as an umbrella for filters which are conceptually extracting
9
9
  #the same thing - for example a price or a title or ...
10
10
  #
11
- #Sometimes the same piece of information can not be extracted with one filter
11
+ #Sometimes the same piece of information can not be extracted with one filter
12
12
  #across more result instances (for example a price has an XPath in record n,
13
13
  #but since in record n+1 has a discount price as well, the real price is pushed
14
14
  #to a different XPath etc) - in this case the more filters which extract the same
15
15
  #thing are held in the same pattern.
16
16
  class Pattern
17
- #Type of the pattern;
18
-
19
- # a root pattern represents a (surprise!) root pattern
20
- PATTERN_TYPE_ROOT = 0x00
21
- # a tree pattern represents a HTML region
22
- PATTERN_TYPE_TREE = 0x01
23
- # represents an attribute of the node extracted by the parent pattern
24
- PATTERN_TYPE_ATTRIBUTE = 0x02
25
- # represents a pattern which filters its output with a regexp
26
- PATTERN_TYPE_REGEXP = 0x03
27
- # represents a pattern which crawls to the detail page and extracts information from there
28
- PATTERN_TYPE_DETAIL = 0x04
17
+ #Type of the pattern;
18
+
19
+ # TODO: Update documentation
20
+
21
+ # # a root pattern represents a (surprise!) root pattern
22
+ # PATTERN_TYPE_ROOT = :PATTERN_TYPE_ROOT
23
+ # # a tree pattern represents a HTML region
24
+ # PATTERN_TYPE_TREE = :PATTERN_TYPE_TREE
25
+ # # represents an attribute of the node extracted by the parent pattern
26
+ # PATTERN_TYPE_ATTRIBUTE = :PATTERN_TYPE_ATTRIBUTE
27
+ # # represents a pattern which filters its output with a regexp
28
+ # PATTERN_TYPE_REGEXP = :PATTERN_TYPE_REGEXP
29
+ # # represents a pattern which crawls to the detail page and extracts information from there
30
+ # PATTERN_TYPE_DETAIL_PAGE = :PATTERN_TYPE_DETAIL_PAGE
31
+ # # represents a download pattern
32
+ # PATTERN_TYPE_DOWNLOAD = :PATTERN_TYPE_DOWNLOAD
33
+ # # write out the HTML subtree beginning at the matched element
34
+ # PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
35
+
36
+ VALID_PATTERN_TYPES = [:root, :tree, :attribute, :regexp, :detail_page, :download, :html_subtree]
29
37
 
30
38
  #The pattern can be either a model pattern (in this case it is
31
39
  #written to the output) or a temp pattern (in this case it is skipped)
32
40
  #Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
33
41
  #is considered to be a model pattern
34
-
42
+
35
43
  #Model pattern are shown in the output
36
- OUTPUT_TYPE_MODEL = 0x10
37
- #Temp patterns are skipped in the output (their ancestors are appended to the parent
38
- #of the pattrern which was skipped
39
- OUTPUT_TYPE_TEMP = 0x11
40
-
41
- #These fields can be set upon wrapper creation - i.e. a field which is public but not contained here can be accessed
42
- #from outside, but not set as a result of wrapper construction
43
- SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'write_text']
44
-
45
- attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
46
- :last_result, :result, :limit,
47
- :examples, :parent_of_leaf, :evaluation_context, :type,
48
- :indices_to_extract, :evaluation_context, :referenced_extractor,
49
- :referenced_pattern, :write_text
50
- attr_reader :generalize_set, :next_page_url, :result_indexer
51
-
52
- def initialize (name, *args)
53
- @name = name #name of the pattern
54
- parse_args(args) #parse the rest of the arguments
55
- @children = [] #child patterns
56
- @filters = [] #filters of the wrapper
57
- @result = Result.new #hierarchical results of the pattern
58
- if @examples == nil
59
- filters << Scrubyt::Filter.new(self) #create a default filter
44
+ # OUTPUT_TYPE_MODEL = :OUTPUT_TYPE_MODEL
45
+ # #Temp patterns are skipped in the output (their ancestors are appended to the parent
46
+ # #of the pattern which was skipped)
47
+ # OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
48
+
49
+ VALID_OUTPUT_TYPES = [:model, :temp]
50
+
51
+ #These options can be set upon wrapper creation
52
+ VALID_OPTIONS = [:generalize, :type, :output_type, :write_text, :references, :limit, :default, :resolve] + Scrubyt::CompoundExample::DESCRIPTORS
53
+
54
+ attr_accessor(:name, :options, :children, :constraints, :filters, :parent,
55
+ :last_result, :result, :evaluation_context,
56
+ :indices_to_extract, :referenced_extractor, :referenced_pattern,
57
+ :source_file, :source_proc, :modifier_calls)
58
+
59
+ attr_reader(:next_page_url, :result_indexer)
60
+
61
+ option_reader(:type => :tree, :output_type => :model, :generalize => false,
62
+ :write_text => lambda { @children.size == 0 }, :limit => nil,
63
+ :default => nil, :resolve => :full)
64
+
65
+ def initialize(name, args=[], evaluation_context=nil, parent=nil, &block)
66
+ #init attributes
67
+ @name = name
68
+ @evaluation_context = evaluation_context
69
+ @parent = parent
70
+ @options = {}
71
+ @children = []
72
+ @filters = []
73
+ @constraints = []
74
+ @result = Result.new
75
+ @modifier_calls = []
76
+
77
+ #grab any examples that are defined
78
+ examples = look_for_examples(args)
79
+
80
+ #parse the options hash if provided
81
+ parse_options_hash(args[-1]) if args[-1].is_a? Hash
82
+
83
+ #perform checks for special cases
84
+ examples = check_if_shortcut_pattern() if examples == nil
85
+ check_if_detail_page(block)
86
+
87
+ #create filters
88
+ if examples == nil
89
+ @filters << Scrubyt::BaseFilter.create(self) #create a default filter
60
90
  else
61
- @examples.each do |example|
62
- filters << Scrubyt::Filter.new(self,example) #create a filter
91
+ examples.each do |example|
92
+ @filters << Scrubyt::BaseFilter.create(self,example) #create a filter
93
+ end
94
+ end
95
+
96
+ #by default, generalize direct children of the root pattern, but only in the case if
97
+ #@generalize was not set up explicitly
98
+ @options[:generalize] = true if parent && parent.type == :root && @options[:generalize].nil?
99
+
100
+ #parse child patterns if available
101
+ parse_child_patterns(&block) if ( !block.nil? && type != :detail_page )
102
+
103
+ #tree pattern only (TODO: subclass?)
104
+ if type == :tree
105
+ #generate xpaths and regexps
106
+ @filters.each do |filter|
107
+ filter.generate_XPath_for_example(false)
108
+ filter.generate_regexp_for_example
109
+ end
110
+ #when the xpaths of this pattern have been created, its children can make their xpaths relative
111
+ xpaths = @filters.collect { |filter| filter.xpath }
112
+ @children.each do |child|
113
+ child.generate_relative_XPaths xpaths
63
114
  end
64
- end
115
+ end
116
+ end
117
+
118
+ def generate_relative_XPaths(parent_xpaths)
119
+ return if type != :tree
120
+ raise ArgumentError.new if parent_xpaths.size != 1 && parent_xpaths.size != @filters.size #TODO: should be checked earlier with proper error message
121
+ @filters.each_index do |index|
122
+ @filters[index].generate_relative_XPath parent_xpaths[parent_xpaths.size == 1 ? 0 : index]
123
+ end
65
124
  end
66
-
67
- #Parse the args passed as *args; There is only one compulsory parameter to pattern: it's name
68
- #All the other parameters can (but do not have to) be specified;
125
+
126
+ #Shortcut patterns, as their name says, are a shortcut for creating patterns
127
+ #from predefined rules; for example:
128
+ #
129
+ # detail_url
130
+ #
131
+ # is equivalent to
132
+ #
133
+ # detail_url 'href', :type => :attribute
134
+ #
135
+ #i.e. the system figures out on its own that because of the postfix, the
136
+ #example should be looked up (but it should never override the user input!)
137
+ #another example (will be available later):
138
+ #
139
+ # every_img
140
+ #
141
+ # is equivalent to
69
142
  #
70
- #If an example is specified, it *MUST* be the first parameter; the order of the other
71
- #parameters is irrelevant
72
- def parse_args(args)
73
- #Grab any examples that are defined!
74
- look_for_examples(args)
75
- args.each do |arg|
76
- next if !arg.is_a? Hash
77
- arg.each do |k,v|
78
- #Set only the setable fields
79
- if SETTABLE_FIELDS.include? k.to_s
80
- #If the user is specifying a pattern type, turn it into the corresponding constant
81
- v = "PATTERN_TYPE_#{v.to_s.upcase!}" if k.to_s == 'type'
82
- v = "OUTPUT_TYPE_#{v.to_s.upcase!}" if k.to_s == 'output_type'
83
- #Otherwise, if nothing special is happening, isntance_eval the hash pair
84
- instance_eval("@#{k.to_s} = #{v}")
143
+ # every_img '//img'
144
+ #
145
+ def check_if_shortcut_pattern()
146
+ if @name =~ /.+_url/
147
+ @options[:type] = :attribute
148
+ ['href']
149
+ end
150
+ end
151
+
152
+ #Check whether the currently created pattern is a detail pattern (i.e. it references
153
+ #a subextractor). Also check if the currently created pattern is
154
+ #an ancestor of a detail pattern , and store this in a hash if yes (to be able to
155
+ #traverse the pattern structure on detail pages as well).
156
+ def check_if_detail_page(block)
157
+ #return if !@options[:references]
158
+ #@options[:type] = :detail_page
159
+ #@referenced_extractor = @options[:references]
160
+ if @name =~ /.+_detail/
161
+ @options[:type] = :detail_page
162
+ @referenced_extractor = block
163
+ Scrubyt::Extractor.add_detail_extractor_to_pattern_name(block, self)
164
+ end
165
+ end
166
+
167
+ def parent_of_leaf
168
+ @children.inject(false) { |is_parent_of_leaf, child| is_parent_of_leaf || child.children.empty? }
169
+ end
170
+
171
+ def parse_child_patterns(&block)
172
+ context = Object.new
173
+ context.instance_eval do
174
+ def current=(value)
175
+ @current = value
176
+ end
177
+ def method_missing(method_name, *args, &block)
178
+ if method_name.to_s[0..0] == '_'
179
+ #add hash option
180
+ key = :"#{method_name.to_s[1..-1]}"
181
+ args.each do |arg|
182
+ current_value = @current.options[key]
183
+ if current_value.nil?
184
+ @current.options[key] = arg
185
+ else
186
+ @current.options[key] = [current_value] if !current_value.is_a Array
187
+ @current.options[key] << arg
188
+ end
189
+ end
190
+ else
191
+ #create child pattern
192
+ child = Scrubyt::Pattern.new(method_name.to_s, args, @current.evaluation_context, @current, &block)
193
+ @current.children << child
194
+ child
85
195
  end
86
- #This flags says that the user explicitly wants to set generalization on a pattern
87
- #In this case, of course, our heuristics do not apply - the users setting overrides
88
- #it
89
- @generalize_set = true if (k.to_s == 'generalize')
90
196
  end
91
197
  end
92
- #default settings - the user can override them, but if she did not do so,
93
- #we will setup some meaningful defaults
94
- @type ||= PATTERN_TYPE_TREE
95
- @output_type ||= OUTPUT_TYPE_MODEL
96
- #don't generalize by default
97
- @generalize ||= false
98
- #This flag indicates that the user set 'generalize' to some value;
99
- #This way we can ensure that the explicit setting will not be overridden
100
- @generalize_set ||= false
101
- end
102
-
198
+ context.current = self
199
+ context.instance_eval(&block)
200
+ end
201
+
103
202
  #Dispatcher function; The class was already too big so I have decided to factor
104
- #out some methods based on their functionality (like output, adding constraints)
203
+ #out some methods based on their functionality (like output, adding constraints)
105
204
  #to utility classes.
106
205
  #
107
- #The second function besides dispatching is to lookup the results in an evaluated
206
+ #The second function besides dispatching is to lookup the results in an evaluated
108
207
  #wrapper, for example
109
208
  #
110
209
  # camera_data.item[1].item_name[0]
111
210
  def method_missing(method_name, *args, &block)
211
+ if @evaluation_context.evaluating_extractor_definition
212
+ @modifier_calls << [method_name, [:array, *args.collect { |arg| [:lit, arg] }]]
213
+ end
214
+
112
215
  case method_name.to_s
113
- when 'select_indices'
114
- @result_indexer = Scrubyt::ResultIndexer.new(*args)
115
- self
116
- when /^to_/
117
- Scrubyt::ResultDumper.send(method_name.to_s, self)
118
- when /^ensure_/
119
- Scrubyt::ConstraintAdder.send(method_name, self, *args)
120
- else
121
- @children.each { |child| return child if child.name == method_name.to_s }
122
- nil
216
+ when 'select_indices'
217
+ @result_indexer = Scrubyt::ResultIndexer.new(*args)
218
+ return self
219
+ when /^to_/
220
+ return Scrubyt::ResultDumper.send(method_name.to_s, self)
221
+ when /^ensure_/
222
+ @constraints << Scrubyt::ConstraintAdder.send(method_name, *args)
223
+ return self #To make chaining possible
224
+ else
225
+ @children.each { |child| return child if child.name == method_name.to_s }
123
226
  end
227
+
228
+ raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
124
229
  end
125
230
 
126
231
  #Companion function to the previous one (Pattern::method_missing). It makes
127
232
  #inspecting results, like
128
233
  #
129
234
  # camera_data.item[1].item_name[0]
130
- #
235
+ #
131
236
  #possible. The method Pattern::method_missing handles the 'item', 'item_name' etc.
132
237
  #parts, while the indexing ([1], [0]) is handled by this function.
133
238
  #If you would like to select a different document than the first one (which is
@@ -142,105 +247,135 @@ module Scrubyt
142
247
  return nil if (@result.lookup(@parent.last_result)) == nil
143
248
  @last_result = @result.lookup(@parent.last_result)[index]
144
249
  end
145
- self
250
+ self
146
251
  end
147
-
252
+
148
253
  ##
149
254
  #If export is called on the root pattern, it exports the whole extractor where it is
150
255
  #defined; See export.rb for further details on the parameters
151
- def export(file, output_file_name=nil, extractor_result_file_name=nil)
152
- Scrubyt::Export.export(file, self, output_file_name, extractor_result_file_name)
256
+ def export(arg1, output_file_name=nil, extractor_result_file_name=nil)
257
+ # require 'scrubyt/output/export_old'; Scrubyt::ExportOld.export(arg1, self, output_file_name, extractor_result_file_name) ; return
258
+ if File.exists? arg1
259
+ old_export(arg1, output_file_name, extractor_result_file_name)
260
+ else
261
+ new_export(arg1, output_file_name, extractor_result_file_name)
262
+ end
153
263
  end
154
-
155
- ##
156
- #Add a filter to this pattern
157
- def add_filter(filter)
158
- @filters << filter
159
- return self
264
+
265
+ def old_export(input_file, output_file_name=nil, extractor_result_file_name=nil)
266
+ contents = open(input_file).read
267
+ wrapper_name = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)[0][0]
268
+ Scrubyt::Export.export(self, wrapper_name, output_file_name, extractor_result_file_name)
160
269
  end
161
270
 
162
- ##
163
- #Add a child pattern to this pattern
164
- def add_child_pattern(child)
165
- child.parent = self
166
- #by default, generalize direct children of the root pattern, but only in the case if
167
- #@generalize was not set up explicitly
168
- child.generalize = true if (!child.generalize_set && child.parent != nil && child.parent.parent == nil)
169
- @children << child
271
+ def new_export(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
272
+ Scrubyt::Export.export(self, wrapper_name, output_file_name, extractor_result_file_name)
170
273
  end
171
-
274
+
172
275
  ##
173
- #Evaluate the pattern. This means evaluating all the filters and adding
276
+ #Evaluate the pattern. This means evaluating all the filters and adding
174
277
  #their extracted instances to the array of results of this pattern
175
- def evaluate
176
- #No need to evaluate if there is no parent pattern
177
- return if @parent == nil
178
- all_filter_results = []
179
- @filters.each do |filter|
180
- filter_index = @filters.index(filter)
181
- filter_index = 0 if @parent.filters.size <= filter_index
182
- filter.source = @parent.filters[filter_index].sink
183
- filter.source.each do |source|
184
- r = filter.evaluate(source)
185
- next if r == nil
186
- if filter.constraints.size > 0
187
- #in the beginning, keys of result_hash are made up of all the results of the filter
188
- #with value = true; Later on, only those results will have 'true' value which are
189
- #accepted with all filters
190
- result_hash = {}
191
- r.each { |res| result_hash[res] = true }
192
- result_hash.keys.each do |res|
193
- filter.constraints.each { |constraint| result_hash[res] &&= constraint.check(res) }
278
+ def evaluate(parent_filters)
279
+ if type != :root #TODO: should be removed, but there is more refactoring of filter handling needed to do so
280
+ all_filter_results = []
281
+ @filters.each do |filter|
282
+ filter_index = @filters.index(filter)
283
+ filter_index = 0 if parent_filters.size <= filter_index
284
+ filter.source = parent_filters[filter_index].sink
285
+ filter.source.each do |source|
286
+ results = filter.evaluate(source)
287
+ next if results == nil
288
+ #apply constraints
289
+ if @constraints.size > 0
290
+ results = results.select do |result|
291
+ @constraints.inject(true) { |accepted, constraint| accepted && constraint.check(result) }
292
+ end
194
293
  end
195
- result = result_hash.reject {|k,v| k if !v}
196
- sorted_result = r.reject {|e| !result.keys.include? e}
197
- indexer = @result_indexer == nil ? sorted_result : @result_indexer.select_indices_to_extract(sorted_result)
198
- add_result(filter, source, indexer)
199
- else
200
- indexer = @result_indexer == nil ? r : @result_indexer.select_indices_to_extract(r)
201
- add_result(filter, source, indexer)
202
- end#end of constraint check
203
- end#end of source iteration
204
- end#end of filter iteration
205
- end
206
-
207
- def get_constraints
208
- filters[0].constraints
209
- end
210
-
211
- private
294
+ #apply indexer
295
+ results = @result_indexer.select_indices_to_extract(results) if !@result_indexer.nil?
296
+ add_result(filter, source, results)
297
+ end
298
+ end
299
+ end
300
+
301
+ #evaluate children
302
+ @children.each { |child| child.evaluate(@filters) }
303
+
304
+ #do postprocessing
305
+ end
306
+
307
+ def to_sexp
308
+ #collect arguments
309
+ args = []
310
+ args.push(*@filters.to_sexp_array) if type != :detail_page #TODO: this if shouldn't be there
311
+ args.push(@options.to_sexp) if !@options.empty?
312
+
313
+ #build main call
314
+ sexp = [:fcall, @name, [:array, *args]]
315
+
316
+ if type == :detail_page
317
+ #add detail page extractor
318
+ detail_root = @evaluation_context.extractor.get_detail_extractor(self)
319
+ sexp = [:iter, sexp, nil, [:block, *detail_root.children.to_sexp_array ]]
320
+ else
321
+ #add child block if the pattern has children
322
+ sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
323
+ end
324
+
325
+ #add modifier calls - TODO: remove when everything is exported to the options hash
326
+ @modifier_calls.each do |modifier_sexp|
327
+ sexp = [:call, sexp, *modifier_sexp]
328
+ end
329
+
330
+ #return complete sexp
331
+ sexp
332
+ end
333
+
334
+ private
335
+ def parse_options_hash(hash)
336
+ #merge provided hash
337
+ @options.merge!(hash)
338
+ #check if valid
339
+ hash.each { |key, value| raise "Unknown pattern option: #{key.to_s}" if VALID_OPTIONS.index(key.to_sym).nil? }
340
+ raise "Invalid pattern type: #{type.to_s}" if VALID_PATTERN_TYPES.index(type.to_sym).nil?
341
+ raise "Invalid output type: #{output_type.to_s}" if VALID_OUTPUT_TYPES.index(output_type.to_sym).nil?
342
+ end
343
+
212
344
  def look_for_examples(args)
213
345
  if (args[0].is_a? String)
214
- @examples = args.select {|e| e.is_a? String}
346
+ examples = args.select {|e| e.is_a? String}
215
347
  #Check if all the String parameters are really the first
216
- #parameters
217
- args[0..@examples.size-1].each do |example|
348
+ #parameters
349
+ args[0..examples.size-1].each do |example|
218
350
  if !example.is_a? String
219
351
  puts 'FATAL: Problem with example specification'
220
352
  end
221
353
  end
222
354
  elsif (args[0].is_a? Regexp)
223
- @examples = args.select {|e| e.is_a? Regexp}
355
+ examples = args.select {|e| e.is_a? Regexp}
224
356
  #Check if all the Regexp parameters are really the first
225
- #parameters
226
- args[0..@examples.size].each do |example|
357
+ #parameters
358
+ args[0..examples.size].each do |example|
227
359
  if !example.is_a? Regexp
228
360
  puts 'FATAL: Problem with example specification'
229
361
  end
230
362
  end
231
- @type = PATTERN_TYPE_REGEXP
363
+ @options[:type] = :regexp
232
364
  elsif (args[0].is_a? Hash)
233
- @examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
234
- @examples = nil if @examples == []
365
+ examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
366
+ examples = nil if examples == []
235
367
  end
236
-
368
+
369
+ @has_examples = !examples.nil?
370
+ examples
237
371
  end
238
-
372
+
239
373
  def add_result(filter, source, results)
240
374
  results.each do |res|
241
375
  filter.sink << res
242
376
  @result.add_result(source, res)
243
- end
244
- end # end of method generate_examples
377
+ end
378
+ end
379
+
245
380
  end #end of class Pattern
246
381
  end #end of module Scrubyt