scrubyt 0.2.6 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +59 -12
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +24 -6
- data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
- data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
- data/lib/scrubyt/core/scraping/constraint.rb +53 -57
- data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
- data/lib/scrubyt/core/scraping/pattern.rb +292 -157
- data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
- data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
- data/lib/scrubyt/core/shared/extractor.rb +122 -163
- data/lib/scrubyt/output/export.rb +59 -174
- data/lib/scrubyt/output/post_processor.rb +4 -3
- data/lib/scrubyt/output/result.rb +8 -9
- data/lib/scrubyt/output/result_dumper.rb +81 -42
- data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
- data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
- data/lib/scrubyt/utils/shared_utils.rb +39 -26
- data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
- data/lib/scrubyt/utils/xpathutils.rb +31 -30
- data/test/unittests/constraint_test.rb +11 -7
- data/test/unittests/extractor_test.rb +6 -6
- data/test/unittests/filter_test.rb +66 -66
- metadata +22 -15
- data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -0,0 +1,17 @@
|
|
1
|
+
module Scrubyt
  ##
  #Filter which extracts data by scanning text with the pattern stored in
  #@example (presumably a Regexp supplied by the user - confirm in BaseFilter).
  class RegexpFilter < BaseFilter

    #Scan +source+ with @example and return all matches as a flat array.
    #
    #+source+ is either a String, which is scanned directly, or any other
    #object, in which case its +inner_text+ is scanned instead.
    def evaluate(source)
      text = source.is_a?(String) ? source : source.inner_text
      text.scan(@example).flatten
    end

    #S-expression form of this filter: a literal node wrapping @example.
    def to_sexp
      [:lit, @example]
    end

  end #End of class RegexpFilter
end #End of module Scrubyt
|
@@ -0,0 +1,121 @@
|
|
1
|
+
module Scrubyt
  ##
  #Filter which selects nodes of a document by an XPath (@xpath) and, optionally,
  #narrows the text of every selected node with a generated regexp (@regexp).
  class TreeFilter < BaseFilter

    #Evaluate @xpath against +source+ (via the <tt>/</tt> operator of the source).
    #
    #Returns the matched nodes as an array if no regexp was generated for this
    #filter; otherwise returns the first capture group of @regexp for every
    #matched node whose prepared inner HTML matches it.
    def evaluate(source)
      result = source/@xpath
      #puts "Evaluating #{@parent_pattern.name} with #{@xpath}"
      #to_a instead of a block-less map: the latter yields an Enumerator
      #(not an array) on newer Ruby versions
      xpath_results = result.is_a?(Hpricot::Elements) ? result.to_a : [result]
      return xpath_results if @regexp.nil?

      regexp_results = []
      xpath_results.each do |entry|
        #match every node on its own (not the concatenated HTML of the whole result set)
        text = SharedUtils.prepare_text_for_comparison(entry.inner_html)
        regexp_results << $1 if text =~ @regexp
      end
      regexp_results
    end

    #Build @regexp from the string example, if the example covers only a part
    #of the text of the node found for it (@temp_sink).
    #
    #Does nothing for non-string examples, if no node was found, or if the
    #example matched the whole text of the node. The matched range is turned
    #into a capture group, every other changing range into a non-greedy
    #wildcard; the rest of the text is matched literally.
    def generate_regexp_for_example
      return if @example_type != EXAMPLE_TYPE_STRING
      return if @temp_sink.nil?

      text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_text)
      #NOTE(review): inclusive range built from MatchData#end, which is an
      #exclusive offset - kept as is, confirm that this is intended
      match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
      return if match_range == (0..text.length)

      @regexp = text
      #replace from the back so that the earlier ranges stay valid
      @temp_sink.changing_ranges.sort.reverse.each do |range|
        @regexp[range] = if range == match_range then '<<<regexp_selection>>>' else '<<<regexp_changing>>>' end
      end
      #escape the literal text first, then swap the placeholders for real regexp parts
      @regexp = Regexp.escape(@regexp)
      @regexp = @regexp.gsub('<<<regexp_changing>>>', '.*?')
      @regexp = @regexp.gsub('<<<regexp_selection>>>', '(.*?)')
      @regexp = '^' + @regexp + '$'
      @regexp = /#{@regexp}/
    end


    #For all the tree patterns, generate an XPath based on the given example
    #Also this method should not be called directly; It is automatically called for every tree
    #pattern directly after wrapper definition
    #
    #What happens depends on @example_type:
    # * XPath example:    the example is used as the XPath
    # * string example:   the node containing the text is looked up and an XPath is generated for it
    # * children example: the XPath is derived from the nodes found by the child patterns
    # * image example:    the image is looked up and an XPath is generated for it
    # * compound example: the node is looked up from the compound example and an XPath is generated
    def generate_XPath_for_example(next_page_example=false)
      #puts "generating example for: #{@parent_pattern.name}"
      #puts @example_type
      case @example_type
      when EXAMPLE_TYPE_XPATH
        @xpath = @example
      when EXAMPLE_TYPE_STRING
        @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
                                                             @example,
                                                             next_page_example)
        return if @temp_sink == nil

        #remember on the node itself which range of its text was matched by the example
        mark_changing_ranges = lambda { |element, range|
          element.instance_eval do
            #NOTE(review): parsed as @changing_ranges ||= ([] << range), i.e. a
            #range is recorded only if none was stored on this node before
            @changing_ranges ||= [] << range
            def changing_ranges
              @changing_ranges
            end
          end
        }
        mark_changing_ranges.call(@temp_sink, @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0))

        @xpath = XPathUtils.generate_XPath(@temp_sink, nil, !@parent_pattern.generalize)
      when EXAMPLE_TYPE_CHILDREN
        current_example_index = 0
        loop do
          #collect the nodes found by the current example of every child pattern
          all_child_temp_sinks = []
          @parent_pattern.children.each do |child_pattern|
            all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink
          end
          result = all_child_temp_sinks.pop
          if all_child_temp_sinks.empty?
            #single child: use its parent node
            result = result.parent
          else
            #more children: use the lowest common ancestor of all their nodes
            all_child_temp_sinks.each do |child_sink|
              result = XPathUtils.lowest_common_ancestor(result, child_sink)
            end
          end
          xpath = XPathUtils.generate_XPath(result, nil, !@parent_pattern.generalize)
          #make sure the parent pattern has a filter for this example index
          if @parent_pattern.filters.size < current_example_index + 1
            @parent_pattern.filters << Scrubyt::BaseFilter.create(@parent_pattern)
          end
          @parent_pattern.filters[current_example_index].xpath = xpath
          @parent_pattern.filters[current_example_index].temp_sink = result
          #the XPaths of the children become relative to the node found above
          @parent_pattern.children.each do |child_pattern|
            next if child_pattern.type == :detail_page
            child_filter = child_pattern.filters[current_example_index]
            child_filter.xpath =
              child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_filter.temp_sink, result) :
                                         XPathUtils.generate_relative_XPath(child_filter.temp_sink, result)
          end
          #stop after the last example of the first child was processed
          break if @parent_pattern.children[0].filters.size == current_example_index + 1
          current_example_index += 1
        end
      when EXAMPLE_TYPE_IMAGE
        #@temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)
        @temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.extractor.get_hpricot_doc, @example)
        @xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
      when EXAMPLE_TYPE_COMPOUND
        @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
                                                                          @example,
                                                                          next_page_example)
        @xpath = XPathUtils.generate_XPath(@temp_sink, nil, !@parent_pattern.generalize)
      end
    end

    #Make @xpath relative to +parent_xpath+ - only if @xpath is absolute
    #(i.e. it begins with /html).
    def generate_relative_XPath(parent_xpath)
      @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
    end

    #S-expression form of this filter: a string node wrapping @xpath.
    def to_sexp
      [:str, @xpath]
    end

  end #End of class TreeFilter
end #End of module Scrubyt
|
@@ -5,129 +5,234 @@ module Scrubyt
|
|
5
5
|
##
|
6
6
|
#=<tt>Group more filters into one</tt>
|
7
7
|
#
|
8
|
-
#Server as an umbrella for filters which are conceptually extracting
|
8
|
+
#Serves as an umbrella for filters which are conceptually extracting
|
9
9
|
#the same thing - for example a price or a title or ...
|
10
10
|
#
|
11
|
-
#Sometimes the same piece of information can not be extracted with one filter
|
11
|
+
#Sometimes the same piece of information can not be extracted with one filter
|
12
12
|
#across more result instances (for example a price has an XPath in record n,
|
13
13
|
#but since in record n+1 has a discount price as well, the real price is pushed
|
14
14
|
#to a different XPath etc) - in this case the more filters which extract the same
|
15
15
|
#thing are held in the same pattern.
|
16
16
|
class Pattern
|
17
|
-
#Type of the pattern;
|
18
|
-
|
19
|
-
#
|
20
|
-
|
21
|
-
# a
|
22
|
-
|
23
|
-
#
|
24
|
-
|
25
|
-
# represents
|
26
|
-
|
27
|
-
# represents a pattern which
|
28
|
-
|
17
|
+
#Type of the pattern;
|
18
|
+
|
19
|
+
# TODO: Update documentation
|
20
|
+
|
21
|
+
# # a root pattern represents a (surprise!) root pattern
|
22
|
+
# PATTERN_TYPE_ROOT = :PATTERN_TYPE_ROOT
|
23
|
+
# # a tree pattern represents a HTML region
|
24
|
+
# PATTERN_TYPE_TREE = :PATTERN_TYPE_TREE
|
25
|
+
# # represents an attribute of the node extracted by the parent pattern
|
26
|
+
# PATTERN_TYPE_ATTRIBUTE = :PATTERN_TYPE_ATTRIBUTE
|
27
|
+
# # represents a pattern which filters its output with a regexp
|
28
|
+
# PATTERN_TYPE_REGEXP = :PATTERN_TYPE_REGEXP
|
29
|
+
# # represents a pattern which crawls to the detail page and extracts information from there
|
30
|
+
# PATTERN_TYPE_DETAIL_PAGE = :PATTERN_TYPE_DETAIL_PAGE
|
31
|
+
# # represents a download pattern
|
32
|
+
# PATTERN_TYPE_DOWNLOAD = :PATTERN_TYPE_DOWNLOAD
|
33
|
+
# # write out the HTML subtree beginning at the matched element
|
34
|
+
# PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
|
35
|
+
|
36
|
+
VALID_PATTERN_TYPES = [:root, :tree, :attribute, :regexp, :detail_page, :download, :html_subtree]
|
29
37
|
|
30
38
|
#The pattern can be either a model pattern (in this case it is
|
31
39
|
#written to the output) or a temp pattern (in this case it is skipped)
|
32
40
|
#Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
|
33
41
|
#is considered to be a model pattern
|
34
|
-
|
42
|
+
|
35
43
|
#Model pattern are shown in the output
|
36
|
-
OUTPUT_TYPE_MODEL =
|
37
|
-
#Temp patterns are skipped in the output (their ancestors are appended to the parent
|
38
|
-
#of the pattrern which was skipped
|
39
|
-
OUTPUT_TYPE_TEMP =
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
:
|
48
|
-
:indices_to_extract, :
|
49
|
-
:
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
44
|
+
# OUTPUT_TYPE_MODEL = :OUTPUT_TYPE_MODEL
|
45
|
+
# #Temp patterns are skipped in the output (their ancestors are appended to the parent
|
46
|
+
# #of the pattern which was skipped
|
47
|
+
# OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
|
48
|
+
|
49
|
+
VALID_OUTPUT_TYPES = [:model, :temp]
|
50
|
+
|
51
|
+
#These options can be set upon wrapper creation
|
52
|
+
VALID_OPTIONS = [:generalize, :type, :output_type, :write_text, :references, :limit, :default, :resolve] + Scrubyt::CompoundExample::DESCRIPTORS
|
53
|
+
|
54
|
+
attr_accessor(:name, :options, :children, :constraints, :filters, :parent,
|
55
|
+
:last_result, :result, :evaluation_context,
|
56
|
+
:indices_to_extract, :referenced_extractor, :referenced_pattern,
|
57
|
+
:source_file, :source_proc, :modifier_calls)
|
58
|
+
|
59
|
+
attr_reader(:next_page_url, :result_indexer)
|
60
|
+
|
61
|
+
option_reader(:type => :tree, :output_type => :model, :generalize => false,
|
62
|
+
:write_text => lambda { @children.size == 0 }, :limit => nil,
|
63
|
+
:default => nil, :resolve => :full)
|
64
|
+
|
65
|
+
#Create a pattern called +name+.
#
#+args+ holds the examples (strings, regexps or compound example hashes),
#optionally followed by an options hash; +evaluation_context+ and +parent+
#are stored as they are; the optional block defines the child patterns
#(or, for a detail page pattern, the detail extractor).
def initialize(name, args=[], evaluation_context=nil, parent=nil, &block)
  #plain state
  @name               = name
  @evaluation_context = evaluation_context
  @parent             = parent
  @options            = {}
  @children           = []
  @filters            = []
  @constraints        = []
  @result             = Result.new
  @modifier_calls     = []

  #examples first, then the (optional) trailing options hash
  examples = look_for_examples(args)
  trailing_arg = args[-1]
  parse_options_hash(trailing_arg) if trailing_arg.is_a? Hash

  #special cases: shortcut patterns (only if no example was given) and detail pages
  examples = check_if_shortcut_pattern() if examples.nil?
  check_if_detail_page(block)

  #one filter per example, or a single default filter if there is no example
  if examples.nil?
    @filters << Scrubyt::BaseFilter.create(self)
  else
    examples.each { |example| @filters << Scrubyt::BaseFilter.create(self, example) }
  end

  #direct children of the root pattern are generalized by default - unless
  #the user has set :generalize explicitly
  if parent && parent.type == :root && @options[:generalize].nil?
    @options[:generalize] = true
  end

  #the block of a detail page pattern is not a list of child patterns
  parse_child_patterns(&block) unless block.nil? || type == :detail_page

  #tree pattern only (TODO: subclass?)
  if type == :tree
    @filters.each do |filter|
      filter.generate_XPath_for_example(false)
      filter.generate_regexp_for_example
    end
    #now that the XPaths of this pattern exist, the children can make theirs relative
    own_xpaths = @filters.collect { |filter| filter.xpath }
    @children.each { |child| child.generate_relative_XPaths(own_xpaths) }
  end
end
|
117
|
+
|
118
|
+
#Make the XPath of every filter of this pattern relative to the XPaths of
#the parent pattern. Does nothing unless this is a tree pattern.
#
#+parent_xpaths+ has to contain either a single XPath (then it is used for
#all the filters) or exactly one XPath per filter; ArgumentError is raised otherwise.
def generate_relative_XPaths(parent_xpaths)
  return unless type == :tree
  single_parent = parent_xpaths.size == 1
  raise ArgumentError.new unless single_parent || parent_xpaths.size == @filters.size #TODO: should be checked earlier with proper error message
  @filters.each_with_index do |filter, position|
    filter.generate_relative_XPath(single_parent ? parent_xpaths[0] : parent_xpaths[position])
  end
end
|
66
|
-
|
67
|
-
#
|
68
|
-
#
|
125
|
+
|
126
|
+
#Shortcut patterns, as their name says, are a shortcut for creating patterns
|
127
|
+
#from predefined rules; for example:
|
128
|
+
#
|
129
|
+
# detail_url
|
130
|
+
#
|
131
|
+
# is equivalent to
|
132
|
+
#
|
133
|
+
# detail_url 'href', type => :attribute
|
134
|
+
#
|
135
|
+
#i.e. the system figures out on its own that because of the postfix, the
|
136
|
+
#example should be looked up (but it should never override the user input!)
|
137
|
+
#another example (will be available later):
|
138
|
+
#
|
139
|
+
# every_img
|
140
|
+
#
|
141
|
+
# is equivalent to
|
69
142
|
#
|
70
|
-
#
|
71
|
-
#
|
72
|
-
def
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
143
|
+
# every_img '//img'
|
144
|
+
#
|
145
|
+
#If the name of the pattern contains '_url' (preceded by at least one
#character), turn the pattern into an attribute pattern and return the
#example list ['href']; return nil (and change nothing) otherwise.
def check_if_shortcut_pattern()
  return nil unless @name =~ /.+_url/
  @options[:type] = :attribute
  ['href']
end
|
151
|
+
|
152
|
+
#Check whether the currently created pattern is a detail pattern (i.e. it references
|
153
|
+
#a subextractor). Also check if the currently created pattern is
|
154
|
+
#an ancestor of a detail pattern , and store this in a hash if yes (to be able to
|
155
|
+
#traverse the pattern structure on detail pages as well).
|
156
|
+
#If the name of the pattern contains '_detail' (preceded by at least one
#character), mark the pattern as a detail page pattern: +block+ is stored
#as the referenced extractor and is registered with Scrubyt::Extractor
#together with this pattern. Does nothing for any other name.
def check_if_detail_page(block)
  #return if !@options[:references]
  #@options[:type] = :detail_page
  #@referenced_extractor = @options[:references]
  return nil unless @name =~ /.+_detail/
  @options[:type] = :detail_page
  @referenced_extractor = block
  Scrubyt::Extractor.add_detail_extractor_to_pattern_name(block, self)
end
|
166
|
+
|
167
|
+
#true if at least one child of this pattern has no children on its own
#(i.e. it is a leaf), false otherwise - also if there are no children at all.
def parent_of_leaf
  @children.any? { |child| child.children.empty? }
end
|
170
|
+
|
171
|
+
#Evaluate +block+ (the body of the pattern definition) in a blank helper
#object which turns every unknown method call into either
#
# * an option of this pattern, if the method name begins with an underscore
#   (_limit 3 sets options[:limit]; calling the same option again, or
#   passing more arguments, collects the values in an array), or
# * a child pattern named after the method, which is appended to the
#   children of this pattern.
def parse_child_patterns(&block)
  context = Object.new
  context.instance_eval do
    #the defs below become singleton methods of the helper object

    #the pattern which receives the options and the children
    def current=(value)
      @current = value
    end

    def method_missing(method_name, *args, &block)
      if method_name.to_s[0..0] == '_'
        #add hash option
        key = :"#{method_name.to_s[1..-1]}"
        args.each do |arg|
          current_value = @current.options[key]
          if current_value.nil?
            @current.options[key] = arg
          else
            #option set more than once: collect all the values in an array
            @current.options[key] = [current_value] if !current_value.is_a?(Array)
            @current.options[key] << arg
          end
        end
      else
        #create child pattern
        child = Scrubyt::Pattern.new(method_name.to_s, args, @current.evaluation_context, @current, &block)
        @current.children << child
        child
      end
    end
  end
  context.current = self
  context.instance_eval(&block)
end
|
201
|
+
|
103
202
|
#Dispatcher function; The class was already too big so I have decided to factor
|
104
|
-
#out some methods based on their functionality (like output, adding constraints)
|
203
|
+
#out some methods based on their functionality (like output, adding constraints)
|
105
204
|
#to utility classes.
|
106
205
|
#
|
107
|
-
#The second function besides dispatching is to lookup the results in an evaluated
|
206
|
+
#The second function besides dispatching is to lookup the results in an evaluated
|
108
207
|
#wrapper, for example
|
109
208
|
#
|
110
209
|
# camera_data.item[1].item_name[0]
|
111
210
|
#Handles, in this order:
#
# * select_indices - sets up the result indexer from the arguments; returns self
# * to_*           - forwarded to Scrubyt::ResultDumper (with self as the argument)
# * ensure_*       - adds the constraint created by Scrubyt::ConstraintAdder; returns self
# * anything else  - returns the child pattern of that name
#
#Raises NoMethodError if none of the above applies. While the extractor
#definition is being evaluated, every call is recorded in @modifier_calls.
def method_missing(method_name, *args, &block)
  if @evaluation_context.evaluating_extractor_definition
    literal_args = args.collect { |arg| [:lit, arg] }
    @modifier_calls << [method_name, [:array, *literal_args]]
  end

  called = method_name.to_s
  case called
  when 'select_indices'
    @result_indexer = Scrubyt::ResultIndexer.new(*args)
    return self
  when /^to_/
    return Scrubyt::ResultDumper.send(called, self)
  when /^ensure_/
    @constraints << Scrubyt::ConstraintAdder.send(method_name, *args)
    return self #To make chaining possible
  else
    matching_child = @children.find { |child| child.name == called }
    return matching_child if matching_child
  end

  raise NoMethodError.new(called, called, args)
end
|
125
230
|
|
126
231
|
#Companion function to the previous one (Pattern::method_missing). It makes
|
127
232
|
#inspecting results, like
|
128
233
|
#
|
129
234
|
# camera_data.item[1].item_name[0]
|
130
|
-
#
|
235
|
+
#
|
131
236
|
#possible. The method Pattern::method missing handles the 'item', 'item_name' etc.
|
132
237
|
#parts, while the indexing ([1], [0]) is handled by this function.
|
133
238
|
#If you would like to select a different document than the first one (which is
|
@@ -142,105 +247,135 @@ module Scrubyt
|
|
142
247
|
return nil if (@result.lookup(@parent.last_result)) == nil
|
143
248
|
@last_result = @result.lookup(@parent.last_result)[index]
|
144
249
|
end
|
145
|
-
self
|
250
|
+
self
|
146
251
|
end
|
147
|
-
|
252
|
+
|
148
253
|
##
|
149
254
|
#If export is called on the root pattern, it exports the whole extractor where it is
|
150
255
|
#defined; See export.rb for further details on the parameters
|
151
|
-
def export(
|
152
|
-
Scrubyt::
|
256
|
+
#Export the extractor. +arg1+ is either the path of an existing file (the
#name of the wrapper is then read from that file, see old_export) or the
#name of the wrapper itself (see new_export). The other two parameters are
#passed on unchanged.
def export(arg1, output_file_name=nil, extractor_result_file_name=nil)
  # require 'scrubyt/output/export_old'; Scrubyt::ExportOld.export(arg1, self, output_file_name, extractor_result_file_name) ; return
  #File.exist? - the File.exists? alias is deprecated and gone in Ruby 3.2
  if File.exist? arg1
    old_export(arg1, output_file_name, extractor_result_file_name)
  else
    new_export(arg1, output_file_name, extractor_result_file_name)
  end
end
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
return self
|
264
|
+
|
265
|
+
#Export the extractor defined in the file +input_file+: the name of the
#wrapper is taken from the first 'name = ...Extractor.define' assignment
#found in the file, the export itself is done by Scrubyt::Export.
#
#Raises ArgumentError if no such assignment is found in the file.
def old_export(input_file, output_file_name=nil, extractor_result_file_name=nil)
  #File.read closes the file (open(...).read left the handle open)
  contents = File.read(input_file)
  first_match = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)[0]
  if first_match.nil?
    raise ArgumentError, "No extractor definition found in #{input_file}"
  end
  wrapper_name = first_match[0]
  Scrubyt::Export.export(self, wrapper_name, output_file_name, extractor_result_file_name)
end
|
161
270
|
|
162
|
-
|
163
|
-
|
164
|
-
def add_child_pattern(child)
|
165
|
-
child.parent = self
|
166
|
-
#by default, generalize direct children of the root pattern, but only in the case if
|
167
|
-
#@generalize was not set up explicitly
|
168
|
-
child.generalize = true if (!child.generalize_set && child.parent != nil && child.parent.parent == nil)
|
169
|
-
@children << child
|
271
|
+
#Export the extractor under the given +wrapper_name+; all the work is
#delegated to Scrubyt::Export.export, with this pattern as its first argument.
def new_export(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
  export_args = [self, wrapper_name, output_file_name, extractor_result_file_name]
  Scrubyt::Export.export(*export_args)
end
|
171
|
-
|
274
|
+
|
172
275
|
##
|
173
|
-
#Evaluate the pattern. This means evaluating all the filters and adding
|
276
|
+
#Evaluate the pattern. This means evaluating all the filters and adding
|
174
277
|
#their extracted instances to the array of results of this pattern
|
175
|
-
def evaluate
|
176
|
-
#
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
result_hash = {}
|
191
|
-
r.each { |res| result_hash[res] = true }
|
192
|
-
result_hash.keys.each do |res|
|
193
|
-
filter.constraints.each { |constraint| result_hash[res] &&= constraint.check(res) }
|
278
|
+
#Evaluate the pattern: every filter is run on each result (sink) of the
#matching parent filter - or of the first parent filter, if the parent has
#fewer filters than this pattern. Results for which a filter returns nil
#are skipped; the others are narrowed by the constraints and by the result
#indexer (if there is one) and then stored. Finally the children are
#evaluated with the filters of this pattern as their parent filters.
#
#The root pattern does not evaluate its own filters, only its children.
def evaluate(parent_filters)
  if type != :root #TODO: should be removed, but there is more refactoring of filter handling needed to do so
    @filters.each_with_index do |filter, filter_index|
      #fall back to the first parent filter if there is no matching one
      parent_index = parent_filters.size <= filter_index ? 0 : filter_index
      filter.source = parent_filters[parent_index].sink
      filter.source.each do |source|
        results = filter.evaluate(source)
        next if results.nil?
        #apply constraints: a result is kept only if all of them accept it
        unless @constraints.empty?
          results = results.select do |result|
            @constraints.all? { |constraint| constraint.check(result) }
          end
        end
        #apply indexer
        results = @result_indexer.select_indices_to_extract(results) unless @result_indexer.nil?
        add_result(filter, source, results)
      end
    end
  end

  #evaluate children
  @children.each { |child| child.evaluate(@filters) }

  #do postprocessing
end
|
306
|
+
|
307
|
+
#Build the S-expression of the call which defines this pattern: an :fcall
#of the name of the pattern with the filters (not for detail page patterns)
#and the non-empty options hash as its arguments, wrapped into an :iter
#if there is a block to export (the children of the detail extractor for
#detail page patterns, the own children otherwise) and finally into one
#:call per recorded modifier call.
def to_sexp
  #collect arguments
  call_args = []
  call_args.push(*@filters.to_sexp_array) unless type == :detail_page #TODO: this check shouldn't be there
  call_args.push(@options.to_sexp) unless @options.empty?

  #build main call
  sexp = [:fcall, @name, [:array, *call_args]]

  if type == :detail_page
    #add detail page extractor
    detail_root = @evaluation_context.extractor.get_detail_extractor(self)
    sexp = [:iter, sexp, nil, [:block, *detail_root.children.to_sexp_array ]]
  elsif !@children.empty?
    #add child block if the pattern has children
    sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]]
  end

  #add modifier calls - TODO: remove when everything is exported to the options hash
  #(the inject returns the complete sexp)
  @modifier_calls.inject(sexp) { |wrapped, modifier_sexp| [:call, wrapped, *modifier_sexp] }
end
|
333
|
+
|
334
|
+
private
|
335
|
+
#Merge +hash+ into the options of the pattern, then validate: a
#RuntimeError is raised for a key which is not in VALID_OPTIONS and for a
#resulting type / output type which is not in VALID_PATTERN_TYPES /
#VALID_OUTPUT_TYPES. The hash stays merged even if the validation fails.
def parse_options_hash(hash)
  #merge provided hash
  @options.merge!(hash)
  #check if valid
  hash.each_key do |key|
    raise "Unknown pattern option: #{key.to_s}" unless VALID_OPTIONS.include?(key.to_sym)
  end
  raise "Invalid pattern type: #{type.to_s}" unless VALID_PATTERN_TYPES.include?(type.to_sym)
  raise "Invalid output type: #{output_type.to_s}" unless VALID_OUTPUT_TYPES.include?(output_type.to_sym)
end
|
343
|
+
|
212
344
|
#Pick the examples out of the arguments of the pattern definition. The
#kind of the first argument decides what is looked for:
#
# * String: all the string arguments are the examples
# * Regexp: all the regexp arguments are the examples; the pattern
#   becomes a regexp pattern
# * Hash:   all the hashes accepted by CompoundExample.compound_example?
#   are the examples
#
#Returns the array of examples, or nil if there is none; @has_examples is
#set accordingly. A message is printed if the string (regexp) examples are
#not the leading arguments.
def look_for_examples(args)
  if args[0].is_a? String
    examples = args.select { |e| e.is_a? String }
    #Check if all the String parameters are really the first
    #parameters
    args.first(examples.size).each do |example|
      puts 'FATAL: Problem with example specification' unless example.is_a? String
    end
  elsif args[0].is_a? Regexp
    examples = args.select { |e| e.is_a? Regexp }
    #Check if all the Regexp parameters are really the first
    #parameters (exactly as many arguments as there are examples - looking
    #at one more would report the options hash following the examples)
    args.first(examples.size).each do |example|
      puts 'FATAL: Problem with example specification' unless example.is_a? Regexp
    end
    @options[:type] = :regexp
  elsif args[0].is_a? Hash
    examples = args.select { |e| e.is_a?(Hash) && CompoundExample.compound_example?(e) }
    examples = nil if examples.empty?
  end

  @has_examples = !examples.nil?
  examples
end
|
238
|
-
|
372
|
+
|
239
373
|
#Store every element of +results+: it is appended to the sink of +filter+
#and added to the result of this pattern, under the +source+ it was
#extracted from. Returns +results+.
def add_result(filter, source, results)
  results.each { |extracted|
    filter.sink << extracted
    @result.add_result(source, extracted)
  }
end
|
379
|
+
|
245
380
|
end #end of class Pattern
|
246
381
|
end #end of module Scrubyt
|