jspradlin-scrubyt 0.4.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +343 -0
- data/COPYING +340 -0
- data/README +120 -0
- data/Rakefile +101 -0
- data/lib/scrubyt.rb +45 -0
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/constraint.rb +169 -0
- data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
- data/lib/scrubyt/core/scraping/pattern.rb +359 -0
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
- data/lib/scrubyt/core/shared/extractor.rb +167 -0
- data/lib/scrubyt/logging.rb +154 -0
- data/lib/scrubyt/output/post_processor.rb +139 -0
- data/lib/scrubyt/output/result.rb +44 -0
- data/lib/scrubyt/output/result_dumper.rb +154 -0
- data/lib/scrubyt/output/result_node.rb +142 -0
- data/lib/scrubyt/output/scrubyt_result.rb +42 -0
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
- data/lib/scrubyt/utils/shared_utils.rb +58 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
- data/lib/scrubyt/utils/xpathutils.rb +202 -0
- data/test/blackbox_test.rb +60 -0
- data/test/blackbox_tests/basic/multi_root.rb +6 -0
- data/test/blackbox_tests/basic/simple.rb +5 -0
- data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
- data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
- data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
- data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
- metadata +117 -0
@@ -0,0 +1,13 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
class RegexpFilter < BaseFilter
|
3
|
+
|
4
|
+
# Runs the pattern's regular expression (+@example+) over +source+.
#
# +source+ is either a String, which is scanned as-is, or an object
# responding to +inner_html+, whose markup tags are stripped first.
# Returns the flattened array produced by String#scan.
def evaluate(source)
  haystack = source.is_a?(String) ? source : source.inner_html.gsub(/<.*?>/, '')
  haystack.scan(@example).flatten
end
|
11
|
+
|
12
|
+
end #End of class RegexpFilter
|
13
|
+
end #End of module Scrubyt
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
class ScriptFilter < BaseFilter
|
3
|
+
|
4
|
+
# Hands +source+ to the callable stored in +@example+ and returns whatever
# the callable returns.
#
# A String is passed through untouched; any other object is converted to
# text by taking its +inner_html+ and stripping the markup tags.
def evaluate(source)
  argument =
    if source.is_a?(String)
      source
    else
      source.inner_html.gsub(/<.*?>/, "")
    end
  @example.call(argument)
end
|
9
|
+
|
10
|
+
end #End of class ScriptFilter
|
11
|
+
end #End of module Scrubyt
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
class TextFilter < BaseFilter
|
3
|
+
|
4
|
+
# Looks up elements by the text they contain.
#
# +@example+ has one of these shapes:
#   find(a|b|c)      - delegated to find_string
#   name[text]       - first node matching +text+
#   name[text]:N     - N-th (zero based) node matching +text+
#   name[text]:all   - every node matching +text+
#
# For the +name[text]+ forms each matching node is handed to
# XPathUtils.traverse_up_until_name together with +name+, and the results
# are returned in an array. An empty array is returned when nothing matches
# (the previous implementation returned "" here, which cannot be iterated
# with +each+ the way Pattern#evaluate iterates filter results).
#
# Raises ArgumentError when +@example+ does not contain +name[text]+.
def evaluate(source)
  return find_string(source) if @example =~ /^find\(/

  final_element_name = @example[/^(.+?)\[/, 1]
  needle = @example[/\[(.+?)\]/, 1]
  if final_element_name.nil? || needle.nil?
    raise ArgumentError, "Malformed text example: #{@example.inspect}"
  end
  #nil when no ":..." suffix is present; "all" or an index string otherwise
  selector = @example[/\]:(.+)/, 1]

  #The text is matched literally, hence the escaping
  matches = SharedUtils.traverse_for_match(source, /#{Regexp.escape(needle)}/)
  return [] if matches.nil?

  if selector == "all"
    matches.map { |match| XPathUtils.traverse_up_until_name(match, final_element_name) }
  else
    #nil.to_i and non-numeric selectors both yield 0, i.e. the first match
    match = matches[selector.to_i]
    return [] unless match
    [XPathUtils.traverse_up_until_name(match, final_element_name)]
  end
end
|
21
|
+
|
22
|
+
# Handles examples of the form <tt>find(a|b|c)</tt>.
#
# The alternatives between the parentheses are tried in order, each one as a
# case-insensitive regular expression passed to
# SharedUtils.traverse_for_match. Returns a one element array holding the
# first alternative that produced a non-empty match list, or an empty array
# when none did.
def find_string(source)
  alternatives = @example[/find\((.+)\)/, 1].split('|')
  hit = alternatives.find do |candidate|
    !SharedUtils.traverse_for_match(source, /#{candidate}/i).empty?
  end
  hit ? [hit] : []
end
|
31
|
+
|
32
|
+
end #End of class TextFilter
|
33
|
+
end #End of module Scrubyt
|
34
|
+
|
@@ -0,0 +1,138 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
class TreeFilter < BaseFilter
|
3
|
+
|
4
|
+
# Evaluates this filter's XPath against +source+.
#
# Returns
# * <tt>[@final_result]</tt> when a final result was fixed beforehand,
# * the nodes selected by <tt>source / @xpath</tt> (or their attribute
#   values when the XPath ends in <tt>/@attribute</tt>) when no regexp is
#   set,
# * otherwise the first capture group of +@regexp+ for every selected entry
#   whose text matches.
def evaluate(source)
  return [@final_result] if @final_result

  #Crude hack! Drop it after it will be supported in Hpricot
  #An XPath ending in /@attr is split: the element part stays in @xpath and
  #the full expression is remembered in @example for the attribute lookup below
  if @xpath =~ /.+\/@.+$/
    @example = @xpath
    @xpath = @xpath.scan(/^(.+?)\/@/)[0][0]
  end
  result = source / @xpath

  Scrubyt.log :ACTION, "Evaluating #{@parent_pattern.name} with #{@xpath}"

  xpath_results = Hpricot::Elements === result ? result : [result]

  if @example =~ /.+\/@.+$/
    result_attribute = @example.scan(/.+\/@(.+?)$/)[0][0]
    xpath_results.map! { |r| r.attributes[result_attribute] }
  end
  return xpath_results if @regexp.nil?

  regexp_results = []
  xpath_results.each do |entry|
    #Match each entry on its own text; the previous code read the text of
    #the whole result set here, so every entry saw the same string.
    #Attribute values are plain strings without inner_html.
    raw_text = entry.respond_to?(:inner_html) ? entry.inner_html : entry.to_s
    text = SharedUtils.prepare_text_for_comparison(raw_text)
    regexp_results << $1 if text =~ @regexp
  end
  regexp_results
end
|
34
|
+
|
35
|
+
# Builds +@regexp+ for string examples that cover only part of the text of
# the node found for them (+@temp_sink+).
#
# Nothing happens unless the example type is EXAMPLE_TYPE_STRING, a node
# (not a String) was found for the example, the example does not end in
# <tt>[...]</tt>, and the matched range differs from
# <tt>0..text.length</tt>.
#
# Every range in <tt>@temp_sink.changing_ranges</tt> is replaced by a
# placeholder; the remaining text is regexp-escaped, the placeholders become
# <tt>.*?</tt> (or the capturing <tt>(.*?)</tt> for the range the example
# matched) and the whole expression is anchored with ^ and $.
def generate_regexp_for_example
  return unless @example_type == EXAMPLE_TYPE_STRING
  return if @temp_sink.nil? || @temp_sink.is_a?(String)
  return if @example =~ /.+\[.+\]$/

  tagless_html = @temp_sink.inner_html.gsub(/<.*?>/, '')
  text = SharedUtils.prepare_text_for_comparison(tagless_html)
  match = @temp_sink.match_data
  match_range = match.begin(0)..match.end(0)
  return if match_range == (0..text.length)

  #Substitute from the back so earlier offsets stay valid
  template = text
  @temp_sink.changing_ranges.sort.reverse.each do |range|
    template[range] = (range == match_range) ? '<<<regexp_selection>>>' : '<<<regexp_changing>>>'
  end

  pattern_source = Regexp.escape(template)
  pattern_source = pattern_source.gsub('<<<regexp_changing>>>', '.*?')
  pattern_source = pattern_source.gsub('<<<regexp_selection>>>', '(.*?)')
  @regexp = /#{'^' + pattern_source + '$'}/
end
|
55
|
+
|
56
|
+
|
57
|
+
#For all the tree patterns, generate an XPath based on the given example
|
58
|
+
#Also this method should not be called directly; It is automatically called for every tree
|
59
|
+
#pattern directly after wrapper definition
|
60
|
+
def generate_XPath_for_example(next_page_example=false)
  #Dispatch on the kind of example this filter was created with. Every
  #branch records its outcome in instance variables (@xpath, @temp_sink,
  #@final_result); the return value is not meaningful.
  case @example_type
  when EXAMPLE_TYPE_XPATH
    #The example is used as the XPath verbatim
    @xpath = @example
  when EXAMPLE_TYPE_STRING
    @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
                                                         @example,
                                                         next_page_example)
    return if @temp_sink.nil?
    if @temp_sink.is_a? String
      #The lookup produced a plain string: keep it as the final result
      @final_result = @temp_sink
      return
    end

    #Remember on the node itself which character ranges were matched.
    #The parentheses matter: `@changing_ranges ||= [] << range` only ever
    #stored the first range, dropping later ones for the same node.
    mark_changing_ranges = lambda { |element, range|
      element.instance_eval do
        (@changing_ranges ||= []) << range
        def changing_ranges
          @changing_ranges
        end
      end
    }
    mark_changing_ranges.call(@temp_sink, @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0))
    write_indices = next_page_example ? true : !@parent_pattern.generalize
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, write_indices)
  when EXAMPLE_TYPE_CHILDREN
    #Derive the XPath of the parent pattern from the nodes found for its
    #children, one round per example index
    current_example_index = 0
    loop do
      all_child_temp_sinks = []
      @parent_pattern.children.each do |child_pattern|
        child_sink = child_pattern.filters[current_example_index].temp_sink
        all_child_temp_sinks << child_sink if child_sink
      end
      #NOTE(review): if no child has a temp sink, result is nil here and
      #the next step raises NoMethodError - confirm whether that can happen
      result = all_child_temp_sinks.pop
      if all_child_temp_sinks.empty?
        #A single child: its direct parent is used
        result = result.parent
      else
        all_child_temp_sinks.each do |child_sink|
          result = XPathUtils.lowest_common_ancestor(result, child_sink)
        end
      end
      xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(result, nil, false) :
                                           XPathUtils.generate_XPath(result, nil, true)
      if @parent_pattern.filters.size < current_example_index + 1
        @parent_pattern.filters << Scrubyt::BaseFilter.create(@parent_pattern)
      end
      @parent_pattern.filters[current_example_index].xpath = xpath
      @parent_pattern.filters[current_example_index].temp_sink = result
      @parent_pattern.children.each do |child_pattern|
        next if child_pattern.type == :detail_page
        child_filter = child_pattern.filters[current_example_index]
        child_filter.xpath =
          child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_filter.temp_sink, result) :
                                     XPathUtils.generate_relative_XPath(child_filter.temp_sink, result)
      end
      break if @parent_pattern.children[0].filters.size == current_example_index + 1
      current_example_index += 1
    end
  when EXAMPLE_TYPE_IMAGE
    @temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
  when EXAMPLE_TYPE_COMPOUND
    @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
                                                                      @example,
                                                                      next_page_example)
    @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
                                          XPathUtils.generate_XPath(@temp_sink, nil, true)
  end
end
|
129
|
+
|
130
|
+
# Rewrites +@xpath+ so that it is relative to +parent_xpath+.
#
# A parent XPath ending in an attribute predicate (<tt>[@name=value]</tt>)
# is first expanded through XPathUtils.to_full_XPath. The rewrite itself
# only happens when +@xpath+ starts with /html; otherwise +@xpath+ is left
# alone and nil is returned.
def generate_relative_XPath(parent_xpath)
  if parent_xpath =~ /(\[@.+=.+\])$/
    parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
                                            parent_xpath,
                                            @parent_pattern.parent.generalize)
  end
  #TODO: should not rely on <html> being the root node
  return unless @xpath =~ /^\/html/
  @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath)
end
|
136
|
+
|
137
|
+
end #End of class TreeFilter
|
138
|
+
end #End of module Scrubyt
|
@@ -0,0 +1,359 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
|
4
|
+
module Scrubyt
|
5
|
+
##
|
6
|
+
#=<tt>Group more filters into one</tt>
|
7
|
+
#
|
8
|
+
#Serves as an umbrella for filters which are conceptually extracting
|
9
|
+
#the same thing - for example a price or a title or ...
|
10
|
+
#
|
11
|
+
#Sometimes the same piece of information can not be extracted with one filter
|
12
|
+
#across more result instances (for example a price has an XPath in record n,
|
13
|
+
#but since in record n+1 has a discount price as well, the real price is pushed
|
14
|
+
#to a different XPath etc) - in this case the more filters which extract the same
|
15
|
+
#thing are held in the same pattern.
|
16
|
+
class Pattern
|
17
|
+
#Type of the pattern;
|
18
|
+
|
19
|
+
# TODO: Update documentation
|
20
|
+
|
21
|
+
# # a root pattern represents a (surprise!) root pattern
|
22
|
+
# PATTERN_TYPE_ROOT = :PATTERN_TYPE_ROOT
|
23
|
+
# # a tree pattern represents a HTML region
|
24
|
+
# PATTERN_TYPE_TREE = :PATTERN_TYPE_TREE
|
25
|
+
# # represents an attribute of the node extracted by the parent pattern
|
26
|
+
# PATTERN_TYPE_ATTRIBUTE = :PATTERN_TYPE_ATTRIBUTE
|
27
|
+
# # represents a pattern which filters its output with a regexp
|
28
|
+
# PATTERN_TYPE_REGEXP = :PATTERN_TYPE_REGEXP
|
29
|
+
# # represents a pattern which crawls to the detail page and extracts information from there
|
30
|
+
# PATTERN_TYPE_DETAIL_PAGE = :PATTERN_TYPE_DETAIL_PAGE
|
31
|
+
# # represents a download pattern
|
32
|
+
# PATTERN_TYPE_DOWNLOAD = :PATTERN_TYPE_DOWNLOAD
|
33
|
+
# # write out the HTML subtree beginning at the matched element
|
34
|
+
# PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
|
35
|
+
|
36
|
+
#Values accepted for the :type option
VALID_PATTERN_TYPES = [
  :tree, :attribute, :regexp, :detail_page, :download,
  :html_subtree, :constant, :script, :text
]

#Values accepted for the :example_type option
# :determine - default value; the type of the example still has to be determined
# :xpath     - NOTE(review): presumably marks the example as an XPath; the
#              earlier comment described a :string value that is not in the list
VALID_PATTERN_EXAMPLE_TYPES = [:determine, :xpath]

#The pattern can be either a model pattern (in this case it is
#written to the output) or a temp pattern (in this case it is skipped)
#Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
#is considered to be a model pattern

#Model patterns are shown in the output
# OUTPUT_TYPE_MODEL = :OUTPUT_TYPE_MODEL
# #Temp patterns are skipped in the output (their ancestors are appended to the parent
# #of the pattern which was skipped
# OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP

#Values accepted for the :output_type option
VALID_OUTPUT_TYPES = [:model, :temp, :page_list]

#These options can be set upon wrapper creation
PATTERN_OPTIONS = [
  :generalize, :type, :output_type, :references, :limit,
  :default, :resolve, :except, :example_type
]
#Every option key a pattern accepts: its own options plus the compound
#example descriptors and the result node output options
VALID_OPTIONS = PATTERN_OPTIONS +
                Scrubyt::CompoundExample::DESCRIPTORS +
                Scrubyt::ResultNode::OUTPUT_OPTIONS

attr_accessor(:name, :options, :children, :constraints, :filters, :parent, :extractor,
              :indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls)

attr_reader(:next_page_url, :result_indexer)

#Option readers together with the value listed for each option
#(option_reader is defined outside this file)
option_reader(:type => :tree, :output_type => :model, :generalize => false,
              :write_text => lambda { @children.size == 0 }, :limit => nil,
              :default => nil, :resolve => :full, :except => [], :example_type => :determine)
|
67
|
+
|
68
|
+
# Creates a pattern called +name+.
#
# +args+ holds the examples (strings, regexps, compound example hashes or a
# proc), optionally followed by an options hash as the last element.
# +extractor+ and +parent+ are stored as given; +block+ either defines the
# child patterns or, for detail page patterns, is kept as the referenced
# extractor.
def initialize(name, args=[], extractor=nil, parent=nil, &block)
  #init attributes
  @name, @extractor, @parent = name, extractor, parent
  @options = {}
  @children = []
  @filters = []
  @constraints = []
  @modifier_calls = []

  #grab any examples that are defined
  examples = look_for_examples(args)

  #parse the options hash if provided
  trailing_argument = args[-1]
  parse_options_hash(trailing_argument) if trailing_argument.is_a?(Hash)

  #perform checks for special cases
  examples = check_if_shortcut_pattern() if examples.nil?
  check_if_detail_page(block)
  @options[:output_type] = :page_list if name == 'page_list'

  #create filters: a default one without examples, one per example otherwise
  if examples.nil?
    @filters << Scrubyt::BaseFilter.create(self)
  else
    examples.each { |example| @filters << Scrubyt::BaseFilter.create(self, example) }
  end

  #by default, generalize the root pattern, but only in the case if
  #@generalize was not set up explicitly
  if @options[:generalize].nil?
    @options[:generalize] = true if parent.nil?
    first_example = filters[0].example
    if first_example.is_a?(String) && first_example =~ /.+\[[a-zA-Z].+\]$/
      @options[:generalize] = false
    end
  end

  #parse child patterns if available
  parse_child_patterns(&block) if block && type != :detail_page

  #tree pattern only (TODO: subclass?)
  return unless type == :tree

  #generate xpaths and regexps
  @filters.each do |filter|
    filter.generate_XPath_for_example(false) unless @name == 'next_page'
    filter.generate_regexp_for_example
  end
  #when the xpaths of this pattern have been created, its children can make their xpaths relative
  xpaths = @filters.map { |filter| filter.xpath }
  @children.each { |child| child.generate_relative_XPaths(xpaths) }
end
|
123
|
+
|
124
|
+
# Makes the XPath of every filter relative to the parent's XPath(s).
#
# Only tree patterns are processed. +parent_xpaths+ must contain either a
# single XPath, shared by all filters, or exactly one XPath per filter;
# anything else raises ArgumentError.
def generate_relative_XPaths(parent_xpaths)
  return unless type == :tree

  shared_parent = parent_xpaths.size == 1
  #TODO: should be checked earlier with proper error message
  raise ArgumentError.new unless shared_parent || parent_xpaths.size == @filters.size

  @filters.each_with_index do |filter, position|
    filter.generate_relative_XPath(parent_xpaths[shared_parent ? 0 : position])
  end
end
|
131
|
+
|
132
|
+
#Shortcut patterns, as their name says, are a shortcut for creating patterns
|
133
|
+
#from predefined rules; for example:
|
134
|
+
#
|
135
|
+
# detail_url
|
136
|
+
#
|
137
|
+
# is equivalent to
|
138
|
+
#
|
139
|
+
# detail_url 'href', :type => :attribute
|
140
|
+
#
|
141
|
+
#i.e. the system figures out on its own that because of the postfix, the
|
142
|
+
#example should be looked up (but it should never override the user input!)
|
143
|
+
#another example (will be available later):
|
144
|
+
#
|
145
|
+
# every_img
|
146
|
+
#
|
147
|
+
# is equivalent to
|
148
|
+
#
|
149
|
+
# every_img '//img'
|
150
|
+
#
|
151
|
+
def check_if_shortcut_pattern()
  #A name containing "_url" (with at least one character in front of it)
  #turns the pattern into an attribute pattern reading 'href'; any other
  #name yields nil and leaves the options alone.
  return nil unless @name =~ /.+_url/

  @options[:type] = :attribute
  ['href']
end
|
157
|
+
|
158
|
+
#Check whether the currently created pattern is a detail pattern (i.e. it references
|
159
|
+
#a subextractor). Also check if the currently created pattern is
|
160
|
+
#an ancestor of a detail pattern , and store this in a hash if yes (to be able to
|
161
|
+
#traverse the pattern structure on detail pages as well).
|
162
|
+
def check_if_detail_page(block)
  #Only names containing "_detail" (preceded by at least one character)
  #mark a detail page pattern; +block+ is then kept as the referenced
  #extractor definition.
  return unless @name =~ /.+_detail/

  @options[:type] = :detail_page
  @referenced_extractor = block
end
|
168
|
+
|
169
|
+
# True when at least one direct child has no children of its own,
# false otherwise (including when there are no children at all).
def parent_of_leaf
  @children.any? { |child| child.children.empty? }
end
|
172
|
+
|
173
|
+
# Number of filters attached to this pattern.
def filter_count
  @filters.length
end
|
176
|
+
|
177
|
+
# Evaluates +block+ - the body of a pattern definition - inside a scratch
# object whose method_missing interprets every otherwise unknown call:
#
# * <tt>_option value, ...</tt> (leading underscore) sets an option on this
#   pattern; repeated values for the same option are collected in an array,
# * any other call creates a child pattern named after the method and
#   appends it to +children+.
#
# Returns the value of the block.
def parse_child_patterns(&block)
  context = Object.new
  context.instance_eval do
    def current=(value)
      @current = value
    end
    def method_missing(method_name, *args, &block)
      if method_name.to_s[0..0] == '_'
        #add hash option
        key = method_name.to_s[1..-1].to_sym
        #check_option is a private method of the pattern, not of this
        #context; calling it unqualified would re-enter method_missing and
        #create a child pattern called "check_option"
        @current.send(:check_option, key)
        args.each do |arg|
          current_value = @current.options[key]
          if current_value.nil?
            @current.options[key] = arg
          else
            @current.options[key] = [current_value] unless current_value.is_a? Array
            @current.options[key] << arg
          end
        end
      else
        #create child pattern
        child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
        @current.children << child
        child
      end
    end
  end
  context.current = self
  context.instance_eval(&block)
end
|
208
|
+
|
209
|
+
#Dispatcher function; The class was already too big so I have decided to factor
|
210
|
+
#out some methods based on their functionality (like output, adding constraints)
|
211
|
+
#to utility classes.
|
212
|
+
#
|
213
|
+
#The second function besides dispatching is to lookup the results in an evaluated
|
214
|
+
#wrapper, for example
|
215
|
+
#
|
216
|
+
# camera_data.item[1].item_name[0]
|
217
|
+
def method_missing(method_name, *args, &block)
  requested = method_name.to_s

  #While the extractor definition is being evaluated, every call is recorded
  #together with its arguments
  if @extractor.evaluating_extractor_definition
    literal_args = args.map { |arg| [:lit, arg] }
    @modifier_calls << [method_name, [:array, *literal_args]]
  end

  if requested == 'select_indices'
    @result_indexer = Scrubyt::ResultIndexer.new(*args)
    return self
  end

  if requested =~ /^ensure_/
    @constraints << Scrubyt::ConstraintAdder.send(method_name, *args)
    return self #To make chaining possible
  end

  #Result lookup: the first child carrying the requested name
  matching_child = @children.find { |child| child.name == requested }
  return matching_child if matching_child

  raise NoMethodError.new(requested, requested, args)
end
|
235
|
+
|
236
|
+
# Evaluates the pattern against +source+.
#
# +filter_indices+ selects which filters to run (nil runs all of them).
# The distinct filter results are narrowed down by the constraints and the
# result indexer, wrapped in ResultNodes and extended with the results of
# the child patterns. Returns the result nodes for :model patterns and an
# empty array for :page_list patterns (whose nodes are handed to the
# extractor instead); nothing is returned for other output types.
def evaluate(source, filter_indices)
  # DIRTY! detail pages are handled entirely by their single filter
  return @filters[0].evaluate(source) if type == :detail_page

  #we apply all filters if filter_indices is nil
  indices_to_evaluate = filter_indices.nil? ? 0...@filters.size : filter_indices
  #stores the results of all filters
  all_filter_results = []
  #remembers which filters have returned a certain result
  indices_mapping = {}
  #evaluate filters and collect filter results
  indices_to_evaluate.each do |filter_index|
    @filters[filter_index].evaluate(source).each do |result|
      #add result to list if not already there
      all_filter_results << result unless all_filter_results.include?(result)
      #add the current filter's index to the mapping
      (indices_mapping[result] ||= []) << filter_index
    end
  end

  #apply constraints: a result survives only if every constraint accepts it
  unless @constraints.empty?
    all_filter_results = all_filter_results.select do |result|
      @constraints.all? { |constraint| constraint.check(result) }
    end
  end
  #apply indexer
  unless @result_indexer.nil?
    all_filter_results = @result_indexer.select_indices_to_extract(all_filter_results)
  end

  #create result nodes and evaluate children
  result_nodes = []
  all_filter_results.each do |result|
    node = ResultNode.new(@name, result, @options)
    node.generated_by_leaf = (@children.size == 0)

    @children.each do |child|
      raise if self.filter_count != 1 && child.filter_count != self.filter_count
      #with a single filter all child filters run; otherwise only the
      #child filters paired with the filters that produced this result
      child_filter_indices = (self.filter_count == 1) ? nil : indices_mapping[result]
      node.push(*child.evaluate(result, child_filter_indices))
    end

    #apply child constraints (ensure_presence_of_pattern)
    required_child_names = @constraints.select { |c|
      c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
    }.map { |c| c.target }
    unless required_child_names.empty?
      #tick off every required name found anywhere below the node
      check = lambda { |node_to_check|
        required_child_names.delete node_to_check.name
        node_to_check.each { |child| check.call child }
      }
      check.call node
    end

    #add the current result node to the list unless a required child is missing
    result_nodes << node if required_child_names.empty?
  end

  if result_nodes.empty? && @options[:default]
    result_nodes << ResultNode.new(@name, @options[:default], @options)
  end

  requested_output = output_type
  if requested_output == :model
    result_nodes
  elsif requested_output == :page_list
    result_nodes.each { |result_node| @extractor.add_to_next_page_list result_node }
    []
  end
end
|
311
|
+
|
312
|
+
private
|
313
|
+
# Merges +hash+ into the pattern options and validates the outcome.
#
# Every key must be a known option (see check_option), and the resulting
# pattern type, output type and example type must be among the valid
# values; otherwise a RuntimeError with a descriptive message is raised.
# Note that the hash is merged before it is validated.
def parse_options_hash(hash)
  #merge provided hash
  @options.merge!(hash)
  #check if valid
  hash.each_key { |key| check_option(key.to_sym) }
  unless VALID_PATTERN_TYPES.include?(type.to_sym)
    raise "Invalid pattern type: #{type.to_s}"
  end
  unless VALID_OUTPUT_TYPES.include?(output_type.to_sym)
    raise "Invalid output type: #{output_type.to_s}"
  end
  unless VALID_PATTERN_EXAMPLE_TYPES.include?(example_type.to_sym)
    raise "Invalid example type: #{example_type.to_s}"
  end
end
|
322
|
+
|
323
|
+
# Raises a RuntimeError unless +option+ is listed in VALID_OPTIONS.
def check_option(option)
  return if VALID_OPTIONS.include?(option)
  raise "Unknown pattern option: #{option.to_s}"
end
|
326
|
+
|
327
|
+
# Extracts the examples from the arguments of a pattern definition.
#
# The class of the first argument decides what counts as an example:
# * String - every String argument,
# * Regexp - every Regexp argument; the pattern type is set to :regexp,
# * Hash   - every Hash that CompoundExample recognises as a compound example,
# * Proc   - the first argument alone.
#
# String and Regexp examples are expected to be the leading arguments; a
# message is printed for every leading argument of a different class.
# (The Regexp branch used to inspect one argument too many, so a trailing
# options hash was reported as a problem.)
#
# Records in @has_examples whether anything was found and returns the
# examples, or nil when there are none.
def look_for_examples(args)
  examples = nil
  if args[0].is_a?(String) || args[0].is_a?(Regexp)
    example_class = args[0].is_a?(String) ? String : Regexp
    examples = args.select { |arg| arg.is_a?(example_class) }
    #Check if all the examples are really the first parameters
    args[0, examples.size].each do |arg|
      puts 'FATAL: Problem with example specification' unless arg.is_a?(example_class)
    end
    @options[:type] = :regexp if example_class == Regexp
  elsif args[0].is_a?(Hash)
    examples = args.select { |arg| arg.is_a?(Hash) && CompoundExample.compound_example?(arg) }
    examples = nil if examples.empty?
  elsif args[0].is_a?(Proc)
    examples = [args[0]]
  end

  @has_examples = !examples.nil?
  examples
end
|
357
|
+
|
358
|
+
end #end of class Pattern
|
359
|
+
end #end of module Scrubyt
|