scrubyt 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +41 -0
- data/Rakefile +55 -0
- data/lib/scrubyt.rb +9 -0
- data/lib/scrubyt/constraint.rb +185 -0
- data/lib/scrubyt/constraint_adder.rb +86 -0
- data/lib/scrubyt/export.rb +187 -0
- data/lib/scrubyt/extractor.rb +187 -0
- data/lib/scrubyt/filter.rb +144 -0
- data/lib/scrubyt/pattern.rb +263 -0
- data/lib/scrubyt/result.rb +43 -0
- data/lib/scrubyt/result_dumper.rb +84 -0
- data/lib/scrubyt/xpathutils.rb +196 -0
- data/test/unittests/constraint_test.rb +106 -0
- data/test/unittests/extractor_test.rb +93 -0
- data/test/unittests/filter_test.rb +71 -0
- data/test/unittests/input/constraint_test.html +55 -0
- data/test/unittests/input/test.html +39 -0
- data/test/unittests/xpathutils_test.rb +165 -0
- metadata +63 -0
@@ -0,0 +1,187 @@
|
|
1
|
+
require 'logger'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'mechanize'
|
5
|
+
require 'hpricot'
|
6
|
+
require 'pp'
|
7
|
+
|
8
|
+
module Scrubyt
|
9
|
+
##
|
10
|
+
#=<tt>Driving the whole extraction process</tt>
|
11
|
+
#Extractor is a performer class - it gets an extractor definition and carries
|
12
|
+
#out the actions and evaluates the wrappers sequentially.
|
13
|
+
#
|
14
|
+
#It also defines the actions as class methods - check out the section
|
15
|
+
#commented with ############# Actions.
|
16
|
+
class Extractor
|
17
|
+
|
18
|
+
#The definition of the extractor is passed through this method
|
19
|
+
# Entry point of the DSL. Resets the class-level navigation state, evaluates
# the definition block in the context of this class (unknown calls are turned
# into patterns by method_missing), generates the XPaths from the examples
# and finally evaluates the wrapper - once, or page by page when a next_page
# link was given.
#
# Returns the invisible root pattern, i.e. the parent of the pattern that the
# definition block evaluated to.
def self.define(&extractor_definition)
  # Start from a clean navigation state and a fresh Mechanize agent
  @@current_doc_url = @@current_form = @@current_doc_protocol = nil
  @@base_dir = @@host_name = nil
  @@agent = WWW::Mechanize.new

  # The block evaluates to a user-defined pattern; its parent is the
  # artificial root pattern
  last_defined = class_eval(&extractor_definition)
  root_pattern = last_defined.parent

  # Recursively match data based on examples
  root_pattern.setup_examples

  # Single-page extraction: evaluate once and we are done
  unless root_pattern.next_page
    evaluate_wrapper(root_pattern)
    return root_pattern
  end

  # Multi-page extraction: evaluate, then crawl until the page limit is
  # reached or no further page can be found
  pages_done = 1
  loop do
    evaluate_wrapper(root_pattern)
    break if root_pattern.limit == pages_done
    break if root_pattern.crawl_to_new_page.nil?
    pages_done += 1 if root_pattern.limit != nil
  end
  root_pattern
end
|
49
|
+
|
50
|
+
#build the current wrapper
|
51
|
+
# DSL hook: an unknown class-level call made inside the extractor definition
# (e.g. item 'Canon EOS') is turned into a Scrubyt::Pattern named after the
# call. A block given to the call defines the children of that pattern.
#
# The top-level call next_page is special: it only stores the link text
# (args[0]) and the optional :limit (args[1]) on the root pattern.
#
# Returns the pattern built by this call; for next_page, the pattern built
# by the previous call.
# NOTE(review): there is no respond_to_missing? counterpart, so a misspelled
# class method silently becomes a pattern.
def self.method_missing(method_name, *args, &block)
  pattern_name = method_name.to_s
  pattern = Scrubyt::Pattern.new(pattern_name, *args)

  if @parent != nil
    # We are inside the block of another pattern: nest under it
    @parent.add_child_pattern(pattern)
  elsif pattern_name == 'next_page'
    @@root_pattern.next_page = args[0]
    @@root_pattern.limit = args[1][:limit] if args.size > 1
    return @@last_pattern
  else
    # Top-level pattern: hang it under a freshly created invisible root
    @@root_pattern = Scrubyt::Pattern.new('root', :type => :root)
    @@root_pattern.root_pattern = @@root_pattern
    # The extractor has to be known before the document can be attached
    @@root_pattern.extractor = self
    @@root_pattern.attach_current_document
    @@root_pattern.add_child_pattern(pattern)
    @@root_pattern.block_count = 0
  end

  if block_given?
    @@root_pattern.block_count += 1
    # The stack remembers the enclosing patterns while nested blocks run
    @stack ||= []
    @parent = pattern
    @stack.push(pattern)
    class_eval(&block)
    @stack.pop
    @parent = @stack.last
  end

  @@last_pattern = pattern
end
|
84
|
+
|
85
|
+
#Used in lord of the hacks vol 1. Check out export.rb if you are still interested
|
86
|
+
#(You should not be :)
|
87
|
+
# Number of blocks counted on the current root pattern while the extractor
# definition was evaluated (incremented in method_missing for every call
# that received a block).
def self.get_block_count
  counted_blocks = @@root_pattern.block_count
  counted_blocks
end
|
90
|
+
|
91
|
+
############# Actions
|
92
|
+
#
|
93
|
+
|
94
|
+
##
|
95
|
+
# At any given point, the current document can be queried with this method; Typically used
|
96
|
+
# when the navigation is over and the result document is passed to the wrapper
|
97
|
+
# Returns the URL (or file name) of the document the extractor currently
# points at, as recorded by fetch, submit and click_link.
def self.get_current_doc_url
  current_url = @@current_doc_url
  current_url
end
|
100
|
+
|
101
|
+
# Returns the Hpricot document stored by the most recent fetch.
def self.get_hpricot_doc
  parsed_document = @@hpricot_doc
  parsed_document
end
|
104
|
+
|
105
|
+
##
|
106
|
+
#Action to fetch a document (either a file or a http address)
|
107
|
+
#
|
108
|
+
#*parameters*
|
109
|
+
#
|
110
|
+
#_doc_url_ - the url or file name to fetch
|
111
|
+
# Fetches a document and makes it the current one.
#
# doc_url       - the url or file name to fetch; anything starting with
#                 'http' or 'www' is treated as an http resource, everything
#                 else as a local file
# mechanize_doc - an already loaded Mechanize page (passed by submit and
#                 click_link); when given, nothing is downloaded and doc_url
#                 is only recorded
#
# The first file fetched defines the base directory and the first http
# document defines the host name; later relative urls are resolved against
# them. Always returns nil.
def self.fetch(doc_url, mechanize_doc=nil)
  puts "fetching: #{doc_url}"
  if mechanize_doc.nil?
    @@current_doc_url = doc_url
    @@current_doc_protocol = (doc_url =~ /^http/ || doc_url =~ /^www/) ? :http : :file

    if @@base_dir.nil?
      @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == :file
    elsif !doc_url.include?(@@base_dir)
      # Only rewrite the url when it is relative. The previous code assigned
      # nil here whenever the url already contained the base directory, and
      # matched the directory as an unescaped regexp.
      @@current_doc_url = @@base_dir + doc_url
    end

    if @@host_name.nil?
      if @@current_doc_protocol == :http
        @@host_name = doc_url.scan(/http:\/\/.+?\//)[0]
        # No trailing slash after the host: the whole url is the host
        @@host_name = doc_url if @@host_name.nil?
      end
    elsif !doc_url.include?(@@host_name)
      @@current_doc_url = @@host_name + doc_url
    end

    @@mechanize_doc = @@agent.get(@@current_doc_url) if @@current_doc_protocol == :http
  else
    @@current_doc_url = doc_url
    @@mechanize_doc = mechanize_doc
  end

  @@hpricot_doc = mechanize_doc != nil ? Hpricot(@@mechanize_doc.body) : Hpricot(open(@@current_doc_url))

  # NOTE(review): this dump of every fetched document into the working
  # directory looks like a debugging leftover - confirm nothing reads
  # 'kamaty.html' before removing it. The block form closes the file even
  # if the write fails.
  File.open('kamaty.html', 'w') { |out| out.write @@hpricot_doc.to_s }
  nil
end
|
141
|
+
|
142
|
+
##
|
143
|
+
#Action to fill a textfield with a query string
|
144
|
+
#
|
145
|
+
##*parameters*
|
146
|
+
#
|
147
|
+
#_textfield_name_ - the name of the textfield (e.g. the name of the google search
|
148
|
+
#textfield is 'q'
|
149
|
+
#
|
150
|
+
#_query_string_ - the string that should be entered into the textfield
|
151
|
+
# Fills a textfield of the current document and remembers the form that
# contains it as the current form (used later by submit).
#
# textfield_name - value of the input's name attribute
# query_string   - the text to enter into the field
#
# Returns the entered text.
def self.fill_textfield(textfield_name, query_string)
  puts 'fill textfield'
  textfield = (@@hpricot_doc/"input[@name=#{textfield_name}]").first
  enclosing_form = Scrubyt::XPathUtils.traverse_up_until_name(textfield, 'form')
  formname = enclosing_form.attributes['name']
  @@current_form = @@mechanize_doc.forms.with.name(formname).first
  # Assign directly instead of eval'ing an interpolated string: the eval
  # broke on quotes/backslashes in the query and executed whatever Ruby
  # code was embedded in it.
  @@current_form[textfield_name.to_s] = query_string.to_s
end
|
158
|
+
|
159
|
+
#Submit the last form;
|
160
|
+
# Submits the current form (the one selected by the last fill_textfield)
# through the Mechanize agent and makes the resulting page the current
# document.
def self.submit
  puts 'submit'
  response_page = @@agent.submit(@@current_form)
  @@current_doc_url = response_page.uri.to_s
  fetch(@@current_doc_url, response_page)
end
|
166
|
+
|
167
|
+
# Follows the link of the current Mechanize page whose text is exactly
# link_text and makes the resulting page the current document.
#
# link_text - the complete, literal text of the link to click
def self.click_link(link_text)
  puts 'click link'
  # Built once; the text is escaped so that it is matched literally.
  # (The regexp used to be dumped twice to stdout - debugging residue.)
  exact_text = /^#{Regexp.escape(link_text)}$/
  link = @@mechanize_doc.links.text(exact_text)
  result_page = @@agent.click(link)
  @@current_doc_url = result_page.uri.to_s
  fetch(@@current_doc_url, result_page)
end
|
176
|
+
|
177
|
+
#
|
178
|
+
#############
|
179
|
+
|
180
|
+
private
|
181
|
+
# Evaluates the given pattern and then, depth first, all of its descendants
# (a parent is always evaluated before its children).
# Returns the children of the given pattern.
# NOTE(review): the bare 'private' preceding this method has no effect on
# methods defined with 'def self.'.
def self.evaluate_wrapper(pattern)
  pattern.evaluate
  descendants = pattern.children
  descendants.each do |child|
    evaluate_wrapper(child)
  end
end
|
185
|
+
|
186
|
+
end #end of class Extractor
|
187
|
+
end #end of module Scrubyt
|
@@ -0,0 +1,144 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Filter out relevant pieces from the parent pattern</tt>
|
4
|
+
#
|
5
|
+
#A Scrubyt wrapper is almost like a waterfall: water is pouring from the top until
|
6
|
+
#it reaches the bottom. The biggest difference is that instead of water, a HTML
|
7
|
+
#document travels through the space.
|
8
|
+
#
|
9
|
+
#Of course Scrubyt would not make much sense if the same document would arrive at
|
10
|
+
#the bottom that was poured in at the top - since in this case we might use an
|
11
|
+
#identity transformation (i.e. do nothing with the input) as well.
|
12
|
+
#
|
13
|
+
#This is where filters come in: as their name says, they filter the stuff that is
|
14
|
+
#pouring from above, to leave the interesting parts and discard the rest.
|
15
|
+
#The working of a filter will be explained most easily by the help of an example.
|
16
|
+
#Let's consider that we would like to extract information from a webshop; Concretely
|
17
|
+
#we are interested in the name of the items and the URL pointing to the image of the
|
18
|
+
#item
|
19
|
+
#
|
20
|
+
#To accomplish this. first we select the items with the pattern item (a pattern is
|
21
|
+
#a logical grouping of filters; see Pattern documentation). Then our new
|
22
|
+
#context is the result extracted by the item pattern; For every pattern, further
|
23
|
+
#extract the name and the image of the item; and finally, extract the href attribute
|
24
|
+
#of the image. Let's see an illustration:
|
25
|
+
#
|
26
|
+
# root --> This pattern is called a 'root pattern', It is invisible to you
|
27
|
+
# | and basically it represents the document; it has no filters
|
28
|
+
# +-- item --> Filter what's coming from above (the whole document) to get
|
29
|
+
# | relevant pieces of data (in this case webshop items)
|
30
|
+
# +-- name --> Again, filter what's coming from above (a webshop item) and
|
31
|
+
# | leave only item names after this operation
|
32
|
+
# +-- image --> This time filter the image of the item
|
33
|
+
# |
|
34
|
+
# +-- href --> And finally, from the image elements, get the attribute 'href'
|
35
|
+
class Filter
|
36
|
+
#Type of the example this filter is extracted with
|
37
|
+
|
38
|
+
#XPath example, like html/body/tr/td[1] etc.
|
39
|
+
EXAMPLE_TYPE_XPATH = 0
|
40
|
+
#String from the document, for example 'Canon EOS 300 D'.
|
41
|
+
EXAMPLE_TYPE_STRING = 1
|
42
|
+
#Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
|
43
|
+
EXAMPLE_TYPE_IMAGE = 2
|
44
|
+
#No example - the actual XPath is determined from the children XPaths (their LCA)
|
45
|
+
EXAMPLE_TYPE_CHILDREN = 3
|
46
|
+
#Regexp example, like /\d+@*\d+[a-z]/
|
47
|
+
EXAMPLE_TYPE_REGEXP = 4
|
48
|
+
|
49
|
+
attr_accessor :example_type, :parent_pattern, :temp_sink, :constraints, :xpath, :regexp
|
50
|
+
|
51
|
+
# Creates a filter belonging to parent_pattern.
#
# parent_pattern - the pattern this filter extracts data for
# args           - optionally a hash as first element; its :example_type
#                  entry overrides the automatic detection of the example
#                  type
def initialize(parent_pattern, *args)
  @parent_pattern = parent_pattern
  options = args[0]
  # Without explicit options the type is guessed from the pattern's example
  @example_type = if options == nil
                    Filter.determine_example_type(parent_pattern.example)
                  else
                    options[:example_type]
                  end
  # For regexp filters the example itself is the regexp to apply
  @regexp = parent_pattern.example if @example_type == EXAMPLE_TYPE_REGEXP
  # The xpath to evaluate this filter; generated later from the example
  @xpath = nil
  # Temp sinks are used for the initial run when determining the XPaths
  @temp_sink = nil
  # Constraints added to this filter
  @constraints = []
end
|
63
|
+
|
64
|
+
#Evaluate this filter. This method should not be called directly - as the pattern hierarchy
|
65
|
+
#is evaluated, every pattern evaluates its filters and then they are calling this method
|
66
|
+
# Applies this filter to one source element and returns the extracted
# results as a list:
# * tree patterns      - the nodes matched by the filter's XPath
# * attribute patterns - the value of the attribute named by the example
# * regexp patterns    - all matches of the regexp in the inner text
# Any other pattern type yields nil.
def evaluate(source)
  pattern_type = @parent_pattern.type
  if Scrubyt::Pattern::PATTERN_TYPE_TREE == pattern_type
    matched = source/@xpath
    # A single node is wrapped so that callers always get a list
    matched.class == Hpricot::Elements ? matched.map : [matched]
  elsif Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE == pattern_type
    attribute_name = @parent_pattern.example
    [source.attributes[attribute_name]]
  elsif Scrubyt::Pattern::PATTERN_TYPE_REGEXP == pattern_type
    source.inner_text.scan(@regexp).flatten
  end
end
|
77
|
+
|
78
|
+
#For all the tree patterns, generate an XPath based on the given example
|
79
|
+
#Also this method should not be called directly; It is automatically called for every tree
|
80
|
+
#pattern directly after wrapper definition
|
81
|
+
# Computes @xpath (and, where applicable, @temp_sink) of this filter from
# the example of the parent pattern, depending on the example type:
# * XPath    - the example is used as it is
# * string   - the node containing the text is looked up in the first
#              source of the root pattern and an XPath is generated for it
# * children - the node is derived from the temp sinks of the child
#              patterns' filters (the parent of a single sink, otherwise
#              their lowest common ancestor); the XPaths of the child
#              filters are then rewritten relative to that node
# * image    - the image node is looked up and an XPath is generated for it
# NOTE(review): a children-type filter whose pattern has no child filters
# fails with NoMethodError on nil here.
def generate_XPath_for_example
  case @example_type
  when EXAMPLE_TYPE_XPATH
    @xpath = @parent_pattern.example
  when EXAMPLE_TYPE_STRING
    document = @parent_pattern.root_pattern.source[0]
    @temp_sink = XPathUtils.find_node_from_text( document, @parent_pattern.example )
    # The third argument is the opposite of the generalize flag
    @xpath = if @parent_pattern.generalize
               XPathUtils.generate_XPath(@temp_sink, nil, false)
             else
               XPathUtils.generate_XPath(@temp_sink, nil, true)
             end
  when EXAMPLE_TYPE_CHILDREN
    # Collect the temp sinks of every filter of every child pattern
    child_sinks = []
    @parent_pattern.children.each do |child_pattern|
      child_pattern.filters.each { |child_filter| child_sinks << child_filter.temp_sink }
    end
    common_node = child_sinks.pop
    if child_sinks.empty?
      common_node = common_node.parent
    else
      child_sinks.each do |other_sink|
        common_node = XPathUtils.lowest_common_ancestor(common_node, other_sink)
      end
    end
    @temp_sink = common_node
    @xpath = if @parent_pattern.generalize
               XPathUtils.generate_XPath(@temp_sink, nil, false)
             else
               XPathUtils.generate_XPath(@temp_sink, nil, true)
             end
    # Make the child filters' XPaths relative to the node found above
    @parent_pattern.children.each do |child_pattern|
      child_pattern.filters.each do |child_filter|
        child_filter.xpath = if child_pattern.generalize
                               XPathUtils.generate_generalized_relative_XPath(child_filter.temp_sink, common_node)
                             else
                               XPathUtils.generate_relative_XPath(child_filter.temp_sink, common_node)
                             end
      end
    end
  when EXAMPLE_TYPE_IMAGE
    document = @parent_pattern.root_pattern.source[0]
    @temp_sink = XPathUtils.find_image(document, @parent_pattern.example)
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
  end
end
|
119
|
+
|
120
|
+
#Dispatcher method to add constraints; of course, as with any method_missing, this method
|
121
|
+
#should not be called directly
|
122
|
+
# Any unknown call on a filter, e.g. foo(*args), is forwarded to
# Constraint.add_foo(self, *args); the returned constraint is appended to
# this filter's constraints. Returns the list of constraints.
def method_missing(method_name, *args, &block)
  adder = "add_#{method_name.to_s}".to_sym
  new_constraint = Constraint.send(adder, self, *args)
  constraints << new_constraint
end
|
125
|
+
|
126
|
+
private
|
127
|
+
# Guesses the example type from the example itself:
# * a Regexp instance                         -> EXAMPLE_TYPE_REGEXP
# * nil                                       -> EXAMPLE_TYPE_CHILDREN
# * text ending in .jpg/.png/.gif/.jpeg       -> EXAMPLE_TYPE_IMAGE
# * text shaped like /html/body/tr/td[1]      -> EXAMPLE_TYPE_XPATH
# * anything else                             -> EXAMPLE_TYPE_STRING
#
# The XPath branch used to contain an extra
#   example.include? '/' || example.include?('[')
# test. Because of operator precedence this only ever checked for '/', which
# the XPath regexp already guarantees, so the test was always true and has
# been removed.
def self.determine_example_type(example)
  return EXAMPLE_TYPE_REGEXP if example.instance_of?(Regexp)

  case example
  when nil
    EXAMPLE_TYPE_CHILDREN
  when /\.(jpg|png|gif|jpeg)$/
    EXAMPLE_TYPE_IMAGE
  when /^\/{1,2}[a-z]+(\[\d+\])?(\/{1,2}[a-z]+(\[\d+\])?)*$/
    EXAMPLE_TYPE_XPATH
  else
    EXAMPLE_TYPE_STRING
  end
end
|
143
|
+
end #End of class
|
144
|
+
end #End of module
|
@@ -0,0 +1,263 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
module Scrubyt
|
6
|
+
##
|
7
|
+
#=<tt>Group more filters into one</tt>
|
8
|
+
#
|
9
|
+
#Serves as an umbrella for filters which are conceptually extracting
|
10
|
+
#the same thing - for example a price or a title or ...
|
11
|
+
#
|
12
|
+
#Sometimes the same piece of information can not be extracted with one filter
|
13
|
+
#across more result instances (for example a price has an XPath in record n,
|
14
|
+
#but since in record n+1 has a discount price as well, the real price is pushed
|
15
|
+
#to a different XPath etc) - in this case the more filters which extract the same
|
16
|
+
#thing are held in the same pattern.
|
17
|
+
class Pattern
|
18
|
+
#Type of the pattern;
|
19
|
+
|
20
|
+
# a root pattern represents a (surprise!) root pattern
|
21
|
+
PATTERN_TYPE_ROOT = 0
|
22
|
+
# a tree pattern represents a HTML region
|
23
|
+
PATTERN_TYPE_TREE = 1
|
24
|
+
# represents an attribute of the node extracted by the parent pattern
|
25
|
+
PATTERN_TYPE_ATTRIBUTE = 2
|
26
|
+
# represents a pattern which filters its output with a regexp
|
27
|
+
PATTERN_TYPE_REGEXP = 3
|
28
|
+
|
29
|
+
#The pattern can be either a model pattern (in this case it is
|
30
|
+
#written to the output) or a temp pattern (in this case it is skipped)
|
31
|
+
#Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
|
32
|
+
#is considered to be a model pattern
|
33
|
+
|
34
|
+
#Model pattern are shown in the output
|
35
|
+
OUTPUT_TYPE_MODEL = 0
|
36
|
+
#Temp patterns are skipped in the output (their ancestors are appended to the parent
|
37
|
+
#of the pattern which was skipped)
|
38
|
+
OUTPUT_TYPE_TEMP = 1
|
39
|
+
|
40
|
+
#These fields can be set upon wrapper creation - i.e. a field which is public but not contained here can be accessed
|
41
|
+
#from outside, but not set as a result of wrapper construction
|
42
|
+
SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'example']
|
43
|
+
|
44
|
+
attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
|
45
|
+
:last_result, :result, :root_pattern, :example, :block_count,
|
46
|
+
:next_page, :limit, :extractor, :extracted_docs, :source, :sink
|
47
|
+
attr_reader :type, :generalize_set, :next_page_url
|
48
|
+
|
49
|
+
# Creates a pattern.
#
# name - name of the pattern
# args - optional example (String or Regexp, must come first) followed by
#        option hashes; see parse_args
#
# Every pattern starts out with exactly one filter.
# NOTE(review): the class-wide @@instance_count hash is re-created on every
# construction, so counts collected so far are dropped whenever a new
# pattern is built - confirm this is intended.
def initialize(name, *args)
  @name = name
  # Sets @example, @type, @output_type, @generalize and @generalize_set
  parse_args(args)
  @root_pattern = nil   # root pattern of the wrapper
  @next_page = nil
  @children = []        # child patterns
  @filters = []         # filters of the wrapper
  @sink = []            # output of a pattern
  @source = []          # input of a pattern
  @result = Result.new  # hierarchical results of the pattern
  @@instance_count = Hash.new(0)
  # The filter reads the example, so it is created after parse_args
  filters << Scrubyt::Filter.new(self)
end
|
62
|
+
|
63
|
+
#Parse the args passed as *args; There is only one compulsory parameter to pattern: its name
|
64
|
+
#All the other parameters can (but do not have to) be specified;
|
65
|
+
#
|
66
|
+
#If an example is specified, it *MUST* be the first parameter; the order of the other
|
67
|
+
#parameters is irrelevant
|
68
|
+
# Interprets the arguments given to a pattern after its name.
#
# args - array; a leading String or Regexp is taken as the example and
#        removed from the array, every remaining element is expected to be
#        a hash of options. Only the options listed in SETTABLE_FIELDS are
#        stored; :type and :output_type are given as names (e.g. :attribute,
#        'temp', in any letter case) and translated into the corresponding
#        PATTERN_TYPE_* / OUTPUT_TYPE_* constant.
#
# Raises NameError for an unknown :type or :output_type name.
#
# Options are stored with instance_variable_set. The previous implementation
# built a Ruby source string from the option values and eval'ed it, which
# executed option values as code (an :example option could never hold
# plain text) and failed for names that were already upper case, because
# upcase! returns nil when nothing changes.
def parse_args(args)
  # If an example is defined, not only get it but also remove it so it
  # does not interfere with the option hashes
  @example = args.delete_at(0) if args[0].instance_of? String
  @example = args.delete_at(0) if args[0].instance_of? Regexp

  args.each do |arg|
    arg.each do |k, v|
      field = k.to_s
      # Set only the settable fields
      if SETTABLE_FIELDS.include? field
        value = case field
                when 'type'
                  self.class.const_get("PATTERN_TYPE_#{v.to_s.upcase}")
                when 'output_type'
                  self.class.const_get("OUTPUT_TYPE_#{v.to_s.upcase}")
                else
                  v
                end
        instance_variable_set("@#{field}", value)
      end
      # Remember that the user explicitly set generalization, so that the
      # heuristics applied later do not override it
      @generalize_set = true if field == 'generalize'
    end
  end

  # Defaults for everything the user did not set
  @type ||= PATTERN_TYPE_TREE
  @type = PATTERN_TYPE_REGEXP if @example.instance_of? Regexp
  @output_type ||= OUTPUT_TYPE_MODEL
  # don't generalize by default
  @generalize ||= false
  @generalize_set ||= false
end
|
100
|
+
|
101
|
+
#Dispatcher function; The class was already too big so I have decided to factor
|
102
|
+
#out some methods based on their functionality (like output, adding constraints)
|
103
|
+
#to utility classes.
|
104
|
+
#
|
105
|
+
#The second function besides dispatching is to lookup the results in an evaluated
|
106
|
+
#wrapper, for example
|
107
|
+
#
|
108
|
+
# camera_data.item[1].item_name[0]
|
109
|
+
# Dispatches unknown calls on a pattern:
# * to_...     - forwarded to Scrubyt::ResultDumper with this pattern
# * ensure_... - forwarded to Scrubyt::ConstraintAdder with this pattern and
#                the given arguments
# * otherwise  - the name is looked up among the child patterns, which makes
#                expressions like camera_data.item[1].item_name[0] work;
#                returns the first child with that name or nil
# NOTE(review): because of the last rule a misspelled method name returns
# nil instead of raising NoMethodError.
def method_missing(method_name, *args, &block)
  requested = method_name.to_s
  if requested =~ /^to_/
    Scrubyt::ResultDumper.send(requested, self)
  elsif requested =~ /^ensure_/
    Scrubyt::ConstraintAdder.send(method_name, self, *args)
  else
    @children.find { |child| child.name == requested }
  end
end
|
120
|
+
|
121
|
+
#Companion function to the previous one (Pattern::method_missing). It makes
|
122
|
+
#inspecting results, like
|
123
|
+
#
|
124
|
+
# camera_data.item[1].item_name[0]
|
125
|
+
#
|
126
|
+
#possible. The method Pattern::method missing handles the 'item', 'item_name' etc.
|
127
|
+
#parts, while the indexing ([1], [0]) is handled by this function
|
128
|
+
# Selects the index-th result of this pattern among the results that belong
# to the result currently selected on the parent pattern, and remembers it
# in @last_result.
#
# Returns self (so that lookups can be chained), or nil when the lookup for
# the parent's current result yields nil.
def [](index)
  # Look the results up once instead of once for the nil check and once
  # more for the indexing
  candidates = @result.lookup(@parent.last_result)
  return nil if candidates.nil?
  @last_result = candidates[index]
  self
end
|
133
|
+
|
134
|
+
##
|
135
|
+
#If export is called on the root pattern, it exports the whole extractor where it is
|
136
|
+
#defined; See export.rb for further details on the parameters
|
137
|
+
# Delegates to Scrubyt::Export.export, passing this pattern along with the
# given arguments unchanged.
#
# file                       - first argument of Scrubyt::Export.export
# output_file_name           - optional, defaults to nil
# extractor_result_file_name - optional, defaults to nil
def export(file, output_file_name=nil, extractor_result_file_name=nil)
  exported_pattern = self
  Scrubyt::Export.export(file,
                         exported_pattern,
                         output_file_name,
                         extractor_result_file_name)
end
|
140
|
+
|
141
|
+
##
|
142
|
+
#Add a filter to this pattern
|
143
|
+
# Appends the filter to the filters of this pattern.
# Returns the pattern itself, so calls can be chained.
def add_filter(filter)
  @filters.push(filter)
  self
end
|
147
|
+
|
148
|
+
##
|
149
|
+
#Add a child pattern to this pattern
|
150
|
+
# Makes child a child pattern of this pattern.
#
# Direct children of a pattern that has no parent itself (i.e. of the root)
# are generalized by default, unless generalization was set explicitly on
# the child. Returns the list of children.
def add_child_pattern(child)
  child.parent = self
  unless child.generalize_set
    owner = child.parent
    child.generalize = true if owner != nil && owner.parent == nil
  end
  @children << child
end
|
157
|
+
|
158
|
+
##
|
159
|
+
#Crawl to a new page. This function should not be called from the outside - it is automatically called
|
160
|
+
#if the next_page is defined
|
161
|
+
# Moves the wrapper on to the next page: resolves the link described by
# @next_page, empties the sources and sinks of the whole pattern tree, lets
# the extractor fetch the linked document and attaches it to this pattern.
#
# Returns nil when no next-page link can be found.
def crawl_to_new_page
  next_url = generate_next_page_link(@next_page)
  return nil if next_url.nil?
  clear_sources_and_sinks(@root_pattern)
  @root_pattern.extractor.fetch(next_url, nil)
  attach_current_document
end
|
168
|
+
|
169
|
+
##
|
170
|
+
#Attach document to the root pattern; This is happening automatically as the root pattern is defined or
|
171
|
+
#crawling to a new page
|
172
|
+
# Takes the extractor's current Hpricot document and appends it to the
# source, the sink and the last result of this pattern; the source and sink
# lists are then registered with the pattern's result.
# Returns whatever @result.add_result returns.
def attach_current_document
  current_doc = @root_pattern.extractor.get_hpricot_doc
  @last_result_placeholder = nil if false # (no-op; keeps ivar list explicit)
  @source << current_doc
  @sink << current_doc
  @last_result = [] unless @last_result
  @last_result << current_doc
  @result.add_result(@source, @sink)
end
|
180
|
+
|
181
|
+
##
|
182
|
+
#Based on the given examples, calculate the XPaths for the tree patterns
|
183
|
+
# Prepares the wrapper for evaluation: makes sure the root pattern is known,
# propagates it to every pattern of the tree and generates the XPaths of the
# tree patterns from their examples.
def setup_examples
  get_root_pattern(self)
  wrapper_root = @root_pattern
  set_root_pattern_whole_wrapper(wrapper_root, wrapper_root)
  generate_examples(@root_pattern)
end
|
188
|
+
|
189
|
+
##
|
190
|
+
#Evaluate the pattern. This means evaluating all the filters and adding
|
191
|
+
#their extracted instances to the array of results of this pattern
|
192
|
+
# Evaluates the pattern: every filter is applied to every element of the
# parent's sink, and the extracted instances are added to the results of
# this pattern. When a filter has constraints, only the instances accepted
# by all of them are kept (in their original order).
#
# Does nothing (and returns nil) for a pattern without a parent; otherwise
# returns the list of sources that was evaluated.
def evaluate
  # No need to evaluate if there is no parent pattern
  return if @parent == nil
  @source = @parent.sink
  @source.each do |source|
    @filters.each do |filter|
      extracted = filter.evaluate(source)
      unless filter.constraints.empty?
        # Keep what every constraint accepts. This replaces a bookkeeping
        # hash followed by a quadratic keys.include? pass.
        extracted = extracted.select do |res|
          filter.constraints.all? { |constraint| constraint.check(res) }
        end
      end
      add_result(source, extracted)
    end
  end
end
|
218
|
+
|
219
|
+
# Returns the class-wide hash mapping pattern names to the number of results
# added for them (maintained by add_result).
def get_instance_count
  counts_by_name = @@instance_count
  counts_by_name
end
|
222
|
+
|
223
|
+
private
|
224
|
+
# Records every element of results as output of this pattern: it is appended
# to the sink, registered with the pattern's result under the given source,
# and counted in the class-wide instance counter under the pattern's name.
def add_result(source, results)
  results.each do |extracted|
    @sink.push(extracted)
    @result.add_result(source, extracted)
    @@instance_count[@name] = @@instance_count[@name] + 1
  end
end
|
231
|
+
|
232
|
+
# Determines the root of the tree the given pattern belongs to and stores it
# in @root_pattern, unless a root pattern is already set.
#
# Returns the root pattern when it was determined by this call, nil when
# @root_pattern was already set.
#
# The previous implementation recursed inside a 'while pattern.parent != nil'
# loop whose condition never changed, so it never returned for any pattern
# that has a parent. The tree is now walked upwards iteratively.
def get_root_pattern(pattern)
  return nil unless @root_pattern.nil?
  topmost = pattern
  topmost = topmost.parent until topmost.parent.nil?
  @root_pattern = topmost
end
|
240
|
+
|
241
|
+
# Stores root_pattern as the root pattern of the given pattern and of all of
# its descendants (descendants first). Returns root_pattern.
def set_root_pattern_whole_wrapper(pattern, root_pattern)
  pattern.children.each do |descendant|
    set_root_pattern_whole_wrapper(descendant, root_pattern)
  end
  pattern.root_pattern = root_pattern
end
|
245
|
+
|
246
|
+
# Generates the XPaths for the given pattern and all of its descendants.
# Descendants are processed first; only the filters of tree patterns
# generate an XPath from their example.
# Returns the filters of the given pattern if it is a tree pattern, nil
# otherwise.
def generate_examples(pattern)
  pattern.children.each do |child_pattern|
    generate_examples(child_pattern)
  end
  return nil unless pattern.type == PATTERN_TYPE_TREE
  pattern.filters.each { |tree_filter| tree_filter.generate_XPath_for_example }
end
|
250
|
+
|
251
|
+
# Replaces the source and the sink of the given pattern and of all of its
# descendants with new empty lists. Returns the children of the given
# pattern.
def clear_sources_and_sinks(pattern)
  pattern.source = []
  pattern.sink = []
  pattern.children.each do |descendant|
    clear_sources_and_sinks(descendant)
  end
end
|
256
|
+
|
257
|
+
# Looks up the node containing the given text in the first source of the
# root pattern (via XPathUtils.find_node_from_text) and returns its href
# attribute; returns nil when no such node is found.
def generate_next_page_link(example)
  document = @root_pattern.source[0]
  link_node = XPathUtils.find_node_from_text(document, example)
  if link_node.nil?
    nil
  else
    link_node.attributes['href']
  end
end
|
262
|
+
end #end of class Pattern
|
263
|
+
end #end of module Scrubyt
|