scrubyt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,187 @@
1
+ require 'logger'
2
+ require 'open-uri'
3
+ require 'rubygems'
4
+ require 'mechanize'
5
+ require 'hpricot'
6
+ require 'pp'
7
+
8
+ module Scrubyt
9
+ ##
10
+ #=<tt>Driving the whole extraction process</tt>
11
+ #Extractor is a performer class - it gets an extractor definition and carries
12
+ #out the actions and evaluates the wrappers sequentially.
13
+ #
14
+ #It also defines the actions as class methods - check out the section
15
+ #commented with ############# Actions.
16
+ class Extractor
17
+
18
#The definition of the extractor is passed through this method
#
#*parameters*
#
#_extractor_definition_ - block holding the actions and the pattern definitions
#
#Returns the (artificial) root pattern after it has been evaluated
def self.define(&extractor_definition)
  #Start with a fresh agent and clean navigation state
  @@agent = WWW::Mechanize.new
  @@current_doc_url = nil
  @@current_doc_protocol = nil
  @@current_form = nil
  @@base_dir = nil
  @@host_name = nil
  #The block evaluates to the root pattern of the user's definition; what we
  #work with is its parent - the real (invisible) root pattern
  top = class_eval(&extractor_definition).parent
  #Recursively match data based on examples
  top.setup_examples
  #Without a next_page setting a single evaluation is all there is to do
  unless top.next_page
    evaluate_wrapper(top)
    return top
  end
  #Otherwise evaluate page by page until the limit is hit or no next page is found
  pages_seen = 1
  loop do
    evaluate_wrapper(top)
    break if (top.limit == pages_seen || top.crawl_to_new_page == nil)
    pages_seen += 1 if top.limit != nil
  end
  top
end
49
+
50
#Build the current wrapper: every unknown method called inside the extractor
#definition becomes a pattern named after the method. The only exception is a
#top-level 'next_page' call, which configures the root pattern instead.
def self.method_missing(method_name, *args, &block)
  pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
  if @parent
    #We are inside the block of another pattern
    @parent.add_child_pattern(pattern)
  elsif method_name.to_s == 'next_page'
    @@root_pattern.next_page = args[0]
    @@root_pattern.limit = args[1][:limit] if args.size > 1
    return @@last_pattern
  else
    #Create a root pattern
    root = Scrubyt::Pattern.new('root', :type => :root)
    root.root_pattern = root
    root.extractor = self
    root.block_count = 0
    @@root_pattern = root
    #add the currently active document to the root pattern
    root.attach_current_document
    root.add_child_pattern(pattern)
  end
  if block_given?
    @@root_pattern.block_count += 1
    #The stack keeps track of the pattern the nested definitions belong to
    @stack ||= []
    @parent = pattern
    @stack.push(pattern)
    class_eval(&block)
    @stack.pop
    @parent = @stack.last
  end
  @@last_pattern = pattern
end
84
+
85
#Returns the number of blocks counted on the root pattern during wrapper
#construction. Used in lord of the hacks vol 1 - check out export.rb if you
#are still interested (You should not be :)
def self.get_block_count
  @@root_pattern.block_count
end
90
+
91
+ ############# Actions
92
+ #
93
+
94
##
#Returns the url of the current document. It can be queried at any given
#point; typically used when the navigation is over and the result document
#is passed to the wrapper
def self.get_current_doc_url
  @@current_doc_url
end
100
+
101
#Returns the Hpricot document built by the last call to fetch
def self.get_hpricot_doc
  @@hpricot_doc
end
104
+
105
##
#Action to fetch a document (either a file or a http address)
#
#*parameters*
#
#_doc_url_ - the url or file name to fetch
#
#_mechanize_doc_ - optional, already retrieved mechanize page (submit and
#click_link pass their result page here); if given, nothing is downloaded and
#the body of this page is parsed
#
#Returns nil
def self.fetch(doc_url, mechanize_doc=nil)
  puts "fetching: #{doc_url}"
  @@current_doc_url = doc_url
  if mechanize_doc == nil
    @@current_doc_protocol = ((doc_url =~ /^http/ || doc_url =~ /^www/) ? :http : :file)
    #The directory of the first fetched file is remembered; later names which do
    #not contain it are prefixed with it. A name that already contains it is used
    #as it is (it used to be replaced with nil).
    if @@base_dir == nil
      @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == :file
    elsif !doc_url.include?(@@base_dir)
      @@current_doc_url = @@base_dir + doc_url
    end

    #The same is done with the host of the first fetched http(s) document
    if @@host_name == nil
      if @@current_doc_protocol == :http
        @@host_name = doc_url.scan(/https?:\/\/.+?\//)[0] || doc_url
      end
    elsif !doc_url.include?(@@host_name)
      @@current_doc_url = @@host_name + doc_url
    end

    @@mechanize_doc = @@agent.get(@@current_doc_url) if @@current_doc_protocol == :http
  else
    @@mechanize_doc = mechanize_doc
  end
  #Whenever mechanize holds the page, parse its body rather than downloading
  #the document a second time; local files are read and closed right away
  @@hpricot_doc = if mechanize_doc != nil || @@current_doc_protocol == :http
    Hpricot(@@mechanize_doc.body)
  else
    Hpricot(File.read(@@current_doc_url))
  end
  #NOTE(review): dumping the parsed document into 'kamaty.html' looks like a
  #debugging leftover - kept so the observable behaviour stays the same
  File.open('kamaty.html', 'w') { |out| out.write @@hpricot_doc.to_s }
  nil
end
141
+
142
##
#Action to fill a textfield with a query string
#
#*parameters*
#
#_textfield_name_ - the name of the textfield (e.g. the name of the google search
#textfield is 'q')
#
#_query_string_ - the string that should be entered into the textfield
#
#Raises ArgumentError if the current document has no input with the given name
def self.fill_textfield(textfield_name, query_string)
  puts 'fill textfield'
  textfield = (@@hpricot_doc/"input[@name=#{textfield_name}]").first
  if textfield == nil
    raise ArgumentError, "no textfield named '#{textfield_name}' in the current document"
  end
  #The form is looked up by the name of the closest enclosing form element
  formname = Scrubyt::XPathUtils.traverse_up_until_name(textfield, 'form').attributes['name']
  @@current_form = @@mechanize_doc.forms.with.name(formname).first
  #Plain assignment instead of eval-ing a built string: quotes in the query
  #string can neither break the statement nor inject code
  @@current_form[textfield_name] = query_string
end
158
+
159
#Submit the last form (the one stored by fill_textfield) and make the page
#that comes back the current document
def self.submit
  puts 'submit'
  answer = @@agent.submit(@@current_form)#, @@current_form.buttons.first)
  @@current_doc_url = answer.uri.to_s
  fetch(@@current_doc_url, answer)
end
166
+
167
#Action to click the link whose text is exactly _link_text_; the page the
#link leads to becomes the current document
def self.click_link(link_text)
  puts 'click link'
  #The link text is escaped, so it is matched literally
  exact_text = /^#{Regexp.escape(link_text)}$/
  puts exact_text
  p exact_text
  link = @@mechanize_doc.links.text(exact_text)
  target = @@agent.click(link)
  @@current_doc_url = target.uri.to_s
  fetch(@@current_doc_url, target)
end
176
+
177
+ #
178
+ #############
179
+
180
+ private
181
#Evaluate the given pattern first, then all of its descendants (depth first)
def self.evaluate_wrapper(pattern)
  pattern.evaluate
  pattern.children.each do |child|
    evaluate_wrapper(child)
  end
end
185
+
186
+ end #end of class Extractor
187
+ end #end of module Scrubyt
@@ -0,0 +1,144 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Filter out relevant pieces from the parent pattern</tt>
4
+ #
5
+ #A Scrubyt wrapper is almost like a waterfall: water is pouring from the top until
6
+ #it reaches the bottom. The biggest difference is that instead of water, a HTML
7
+ #document travels through the space.
8
+ #
9
+ #Of course Scrubyt would not make much sense if the same document would arrive at
10
+ #the bottom that was poured in at the top - since in this case we might use an
11
+ #identity transformation (i.e. do nothing with the input) as well.
12
+ #
13
+ #This is where filters come in: as their name says, they filter the stuff that is
14
+ #pouring from above, to leave the interesting parts and discard the rest.
15
+ #The working of a filter will be explained most easily by the help of an example.
16
+ #Let's consider that we would like to extract information from a webshop; Concretely
17
+ #we are interested in the name of the items and the URL pointing to the image of the
18
+ #item
19
+ #
20
+ #To accomplish this, first we select the items with the pattern item (a pattern is
21
+ #a logical grouping of filters; see Pattern documentation). Then our new
22
+ #context is the result extracted by the item pattern; For every pattern, further
23
+ #extract the name and the image of the item; and finally, extract the href attribute
24
+ #of the image. Let's see an illustration:
25
+ #
26
+ # root --> This pattern is called a 'root pattern', It is invisible to you
27
+ # | and basically it represents the document; it has no filters
28
+ # +-- item --> Filter what's coming from above (the whole document) to get
29
+ # | relevant pieces of data (in this case webshop items)
30
+ # +-- name --> Again, filter what's coming from above (a webshop item) and
31
+ # | leave only item names after this operation
32
+ # +-- image --> This time filter the image of the item
33
+ # |
34
+ # +-- href --> And finally, from the image elements, get the attribute 'href'
35
+ class Filter
36
+ #Type of the example this filter is extracted with
37
+
38
+ #XPath example, like html/body/tr/td[1] etc.
39
+ EXAMPLE_TYPE_XPATH = 0
40
+ #String from the document, for example 'Canon EOS 300 D'.
41
+ EXAMPLE_TYPE_STRING = 1
42
+ #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
43
+ EXAMPLE_TYPE_IMAGE = 2
44
+ #No example - the actual XPath is determined from the children XPaths (their LCA)
45
+ EXAMPLE_TYPE_CHILDREN = 3
46
+ #Regexp example, like /\d+@*\d+[a-z]/
47
+ EXAMPLE_TYPE_REGEXP = 4
48
+
49
+ attr_accessor :example_type, :parent_pattern, :temp_sink, :constraints, :xpath, :regexp
50
+
51
#Create a filter belonging to _parent_pattern_
#
#*parameters*
#
#_parent_pattern_ - the pattern this filter belongs to
#
#_args_ - optionally a hash whose :example_type entry sets the example type
def initialize(parent_pattern, *args)
  @parent_pattern = parent_pattern
  options = args[0]
  #If the example type is not explicitly defined in the pattern definition,
  #try to determine it automatically from the example
  @example_type = if options == nil
    Filter.determine_example_type(parent_pattern.example)
  else
    options[:example_type]
  end
  @regexp = parent_pattern.example if @example_type == EXAMPLE_TYPE_REGEXP
  #The xpath to evaluate this filter
  @xpath = nil
  #temp sinks are used for the initial run when determining the XPaths for examples;
  @temp_sink = nil
  #list of constraints
  @constraints = []
end
63
+
64
#Evaluate this filter. This method should not be called directly - as the pattern hierarchy
#is evaluated, every pattern evaluates its filters and then they are calling this method
#
#*parameters*
#
#_source_ - the node (extracted by the parent pattern) to filter
#
#Returns the array of extracted results; what is extracted depends on the type of
#the parent pattern (nodes for tree patterns, an attribute value for attribute
#patterns, matched strings for regexp patterns)
def evaluate(source)
  case @parent_pattern.type
  when Scrubyt::Pattern::PATTERN_TYPE_TREE
    result = source/@xpath
    #to_a gives a plain array on every ruby version; a block-less map returns
    #an Enumerator on ruby 1.9+
    result.class == Hpricot::Elements ? result.to_a : [result]
  when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
    [source.attributes[@parent_pattern.example]]
  when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
    source.inner_text.scan(@regexp).flatten
  end
end
77
+
78
#For all the tree patterns, generate an XPath based on the given example
#Also this method should not be called directly; It is automatically called for every tree
#pattern directly after wrapper definition
def generate_XPath_for_example
  case @example_type
  when EXAMPLE_TYPE_XPATH
    #The example is the XPath itself
    @xpath = @parent_pattern.example
  when EXAMPLE_TYPE_STRING
    document = @parent_pattern.root_pattern.source[0]
    @temp_sink = XPathUtils.find_node_from_text(document, @parent_pattern.example)
    #The last parameter is false if the pattern is generalized, true otherwise
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, !@parent_pattern.generalize)
  when EXAMPLE_TYPE_CHILDREN
    #Collect the temp sinks of all the filters of all the child patterns
    child_sinks = []
    @parent_pattern.children.each do |child_pattern|
      child_sinks.concat(child_pattern.filters.map { |filter| filter.temp_sink })
    end
    common = child_sinks.pop
    if child_sinks.empty?
      #A single child sink: take its parent node
      common = common.parent
    else
      #Otherwise fold all the sinks into their lowest common ancestor
      common = child_sinks.inject(common) do |ancestor, child_sink|
        XPathUtils.lowest_common_ancestor(ancestor, child_sink)
      end
    end
    @temp_sink = common
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, !@parent_pattern.generalize)
    #The XPaths of the child filters are relative to the node found above
    @parent_pattern.children.each do |child_pattern|
      child_pattern.filters.each do |filter|
        filter.xpath = if child_pattern.generalize
          XPathUtils.generate_generalized_relative_XPath(filter.temp_sink, common)
        else
          XPathUtils.generate_relative_XPath(filter.temp_sink, common)
        end
      end
    end
  when EXAMPLE_TYPE_IMAGE
    document = @parent_pattern.root_pattern.source[0]
    @temp_sink = XPathUtils.find_image(document, @parent_pattern.example)
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
  end
end
119
+
120
#Dispatcher method to add constraints: an unknown method 'foo' is forwarded to
#Constraint.add_foo and the result is appended to the constraints of this filter.
#Of course, as with any method_missing, this method should not be called directly
def method_missing(method_name, *args, &block)
  adder = "add_#{method_name}".to_sym
  constraints << Constraint.send(adder, self, *args)
end
125
+
126
+ private
127
#Determine the type of an example from its form
#
#*parameters*
#
#_example_ - the example of the pattern: a Regexp, a String or nil
#
#Returns one of the EXAMPLE_TYPE_* constants:
#a Regexp is a regexp example, nil means the XPath is determined from the
#children, a string ending with an image extension is an image example, a
#string looking like an XPath (e.g. /html/body/tr/td[1] or //div/h1) is an
#XPath example and everything else is a string example.
def self.determine_example_type(example)
  case example
  when Regexp
    EXAMPLE_TYPE_REGEXP
  when nil
    EXAMPLE_TYPE_CHILDREN
  when /\.(jpg|png|gif|jpeg)$/
    EXAMPLE_TYPE_IMAGE
  #Tag names may contain digits (h1, h2...). No further check is needed here:
  #everything matched by this expression starts with a '/'
  when /^\/{1,2}[a-z][a-z0-9]*(\[\d+\])?(\/{1,2}[a-z][a-z0-9]*(\[\d+\])?)*$/
    EXAMPLE_TYPE_XPATH
  else
    EXAMPLE_TYPE_STRING
  end
end #End of method determine_example_type
143
+ end #End of class
144
+ end #End of module
@@ -0,0 +1,263 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+ require 'open-uri'
4
+
5
+ module Scrubyt
6
+ ##
7
+ #=<tt>Group more filters into one</tt>
8
+ #
9
+ #Serves as an umbrella for filters which are conceptually extracting
10
+ #the same thing - for example a price or a title or ...
11
+ #
12
+ #Sometimes the same piece of information can not be extracted with one filter
13
+ #across more result instances (for example a price has an XPath in record n,
14
+ #but since in record n+1 has a discount price as well, the real price is pushed
15
+ #to a different XPath etc) - in this case the more filters which extract the same
16
+ #thing are held in the same pattern.
17
+ class Pattern
18
#Type of the pattern;

# a root pattern represents a (surprise!) root pattern
PATTERN_TYPE_ROOT = 0
# a tree pattern represents a HTML region
PATTERN_TYPE_TREE = 1
# represents an attribute of the node extracted by the parent pattern
PATTERN_TYPE_ATTRIBUTE = 2
# represents a pattern which filters its output with a regexp
PATTERN_TYPE_REGEXP = 3

#The pattern can be either a model pattern (in this case it is
#written to the output) or a temp pattern (in this case it is skipped)
#Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
#is considered to be a model pattern

#Model patterns are shown in the output
OUTPUT_TYPE_MODEL = 0
#Temp patterns are skipped in the output (their ancestors are appended to the parent
#of the pattern which was skipped)
OUTPUT_TYPE_TEMP = 1

#These fields can be set upon wrapper creation - i.e. a field which is public but not contained here can be accessed
#from outside, but not set as a result of wrapper construction
#(frozen, so the list can not be modified by accident at runtime)
SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'example'].freeze
43
+
44
+ attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
45
+ :last_result, :result, :root_pattern, :example, :block_count,
46
+ :next_page, :limit, :extractor, :extracted_docs, :source, :sink
47
+ attr_reader :type, :generalize_set, :next_page_url
48
+
49
#Create a pattern
#
#*parameters*
#
#_name_ - the name of the pattern
#
#_args_ - optional example and settings, see parse_args
def initialize(name, *args)
  #name of the pattern
  @name = name
  #parse the rest of the arguments
  parse_args(args)
  #root pattern of the wrapper
  @root_pattern = nil
  @next_page = nil
  #child patterns and filters of this pattern
  @children = []
  @filters = []
  #input and output of the pattern
  @source = []
  @sink = []
  #hierarchical results of the pattern
  @result = Result.new
  #note: the class-wide counter is started over whenever a pattern is created
  @@instance_count = Hash.new(0)
  #every pattern starts with one filter
  @filters << Scrubyt::Filter.new(self)
end
62
+
63
#Parse the args passed as *args; There is only one compulsory parameter to pattern: its name
#All the other parameters can (but do not have to) be specified;
#
#If an example is specified, it *MUST* be the first parameter; the order of the other
#parameters is irrelevant
#
#*parameters*
#
#_args_ - array holding an optional example (String or Regexp) followed by
#hashes of settings; only the keys listed in SETTABLE_FIELDS are used. The
#values of 'type' and 'output_type' name a PATTERN_TYPE_* / OUTPUT_TYPE_* constant
#in any letter case (e.g. :tree, 'ROOT'); an unknown name raises NameError
def parse_args(args)
  #If an example is defined, not only get it but also remove it so it
  #does not interfere with the other possible string parameters
  @example = args.delete_at(0) if args[0].instance_of? String
  @example = args.delete_at(0) if args[0].instance_of? Regexp
  args.each do |arg|
    arg.each do |k,v|
      field = k.to_s
      #This flag says that the user explicitly wants to set generalization on a pattern
      #In this case, of course, our heuristics do not apply - the user's setting overrides
      #it
      @generalize_set = true if field == 'generalize'
      #Set only the settable fields
      next unless SETTABLE_FIELDS.include? field
      #If the user is specifying a pattern or output type, turn it into the
      #corresponding constant. upcase (not upcase!) is used on purpose: upcase!
      #returns nil if the value is already in upper case
      value = case field
              when 'type' then self.class.const_get("PATTERN_TYPE_#{v.to_s.upcase}")
              when 'output_type' then self.class.const_get("OUTPUT_TYPE_#{v.to_s.upcase}")
              else v
              end
      #The value is stored as it is - it is never evaluated as ruby code
      instance_variable_set("@#{field}", value)
    end
  end
  #default settings - the user can override them, but if she did not do so,
  #we will setup some meaningful defaults
  @type ||= PATTERN_TYPE_TREE
  @type = PATTERN_TYPE_REGEXP if @example.instance_of? Regexp
  @output_type ||= OUTPUT_TYPE_MODEL
  #don't generalize by default
  @generalize ||= false
  #This flag indicates that the user set 'generalize' to some value;
  #This way we can ensure that the explicit setting will not be overridden
  @generalize_set ||= false
end
100
+
101
#Dispatcher function; The class was already too big so I have decided to factor
#out some methods based on their functionality (like output, adding constraints)
#to utility classes: methods starting with 'to_' are forwarded to ResultDumper,
#methods starting with 'ensure_' to ConstraintAdder.
#
#The second function besides dispatching is to lookup the results in an evaluated
#wrapper, for example
#
# camera_data.item[1].item_name[0]
#
#Here any other method name returns the child pattern of that name (nil if
#there is no such child)
def method_missing(method_name, *args, &block)
  requested = method_name.to_s
  return Scrubyt::ResultDumper.send(requested, self) if requested =~ /^to_/
  return Scrubyt::ConstraintAdder.send(method_name, self, *args) if requested =~ /^ensure_/
  @children.find { |child| child.name == requested }
end
120
+
121
#Companion function to Pattern::method_missing. It makes
#inspecting results, like
#
# camera_data.item[1].item_name[0]
#
#possible. The method Pattern::method_missing handles the 'item', 'item_name' etc.
#parts, while the indexing ([1], [0]) is handled by this function
#
#Returns the pattern itself (so the calls can be chained), or nil if there are
#no results for the last result of the parent pattern
def [](index)
  context = @parent.last_result
  return nil if @result.lookup(context) == nil
  @last_result = @result.lookup(context)[index]
  self
end
133
+
134
##
#If export is called on the root pattern, it exports the whole extractor where it is
#defined. The call is simply handed over to Scrubyt::Export.export together with
#this pattern; see export.rb for further details on the parameters
def export(file, output_file_name=nil, extractor_result_file_name=nil)
  Scrubyt::Export.export(file, self, output_file_name, extractor_result_file_name)
end
140
+
141
##
#Add a filter to this pattern; returns the pattern itself
def add_filter(filter)
  @filters.push(filter)
  self
end
147
+
148
##
#Add a child pattern to this pattern; returns the list of children
def add_child_pattern(child)
  child.parent = self
  #by default, generalize direct children of the root pattern, but only in the case if
  #@generalize was not set up explicitly
  directly_below_root = (child.parent != nil && child.parent.parent == nil)
  child.generalize = true if directly_below_root && !child.generalize_set
  @children << child
end
157
+
158
##
#Crawl to a new page. This function should not be called from the outside - it is automatically called
#if the next_page is defined
#
#Returns nil if no link to the next page was found
def crawl_to_new_page
  next_url = generate_next_page_link(@next_page)
  unless next_url == nil
    #Drop what was extracted from the previous page, then load the new one
    clear_sources_and_sinks(@root_pattern)
    @root_pattern.extractor.fetch(next_url, nil)
    attach_current_document
  end
end
168
+
169
##
#Attach document to the root pattern; This is happening automatically as the root pattern is defined or
#crawling to a new page
def attach_current_document
  doc = @root_pattern.extractor.get_hpricot_doc
  #The document is the input, the output and the latest result of this pattern
  [@source, @sink, (@last_result ||= [])].each { |list| list << doc }
  @result.add_result(@source, @sink)
end
180
+
181
##
#Based on the given examples, calculate the XPaths for the tree patterns
def setup_examples
  #Find the root pattern (if it is not known yet) and hand it to every pattern of the wrapper
  get_root_pattern(self)
  set_root_pattern_whole_wrapper(@root_pattern, @root_pattern)
  #Let the filters of the tree patterns generate their XPaths
  generate_examples(@root_pattern)
end
188
+
189
##
#Evaluate the pattern. This means evaluating all the filters and adding
#their extracted instances to the array of results of this pattern
#
#The input of the pattern is the output (sink) of its parent. If a filter has
#constraints, only those results are added which are accepted by all of them;
#the order of the results is kept.
def evaluate
  #No need to evaluate if there is no parent pattern
  return if @parent == nil
  @source = @parent.sink
  @source.each do |source|
    @filters.each do |filter|
      extracted = filter.evaluate(source)
      unless filter.constraints.empty?
        #A single pass over the results; a result (even nil) is dropped as soon
        #as one of the constraints refuses it
        extracted = extracted.select do |res|
          filter.constraints.all? { |constraint| constraint.check(res) }
        end
      end
      add_result(source, extracted)
    end
  end
end
218
+
219
#Returns the class-wide hash counting the added results by pattern name
def get_instance_count
  @@instance_count
end
222
+
223
+ private
224
#Add every result extracted from _source_ to the output of this pattern, to
#its hierarchical result and to the instance counter
def add_result(source, results)
  results.each do |res|
    @sink.push(res)
    @result.add_result(source, res)
    @@instance_count[@name] += 1
  end
end
231
+
232
#Find the root of the hierarchy _pattern_ belongs to and store it in
#@root_pattern - unless a root pattern is already known, in which case nothing
#happens and nil is returned
#
#*parameters*
#
#_pattern_ - the pattern to start climbing from
def get_root_pattern(pattern)
  return nil unless @root_pattern == nil
  #Climb up until a pattern without a parent is reached (the loop has to move
  #to the parent in every step, otherwise it would never terminate)
  pattern = pattern.parent until pattern.parent == nil
  @root_pattern = pattern
end
240
+
241
#Store _root_pattern_ in _pattern_ and in all of its descendants
def set_root_pattern_whole_wrapper(pattern, root_pattern)
  pattern.children.each do |child|
    set_root_pattern_whole_wrapper(child, root_pattern)
  end
  pattern.root_pattern = root_pattern
end
245
+
246
#Generate the XPaths of the filters for _pattern_ and all of its descendants;
#the children are processed before the pattern itself and only the filters of
#tree patterns are asked to generate an XPath
def generate_examples(pattern)
  pattern.children.each do |child_pattern|
    generate_examples(child_pattern)
  end
  return unless pattern.type == PATTERN_TYPE_TREE
  pattern.filters.each { |filter| filter.generate_XPath_for_example }
end
250
+
251
#Empty the input and the output of _pattern_ and of all its descendants
def clear_sources_and_sinks(pattern)
  pattern.source, pattern.sink = [], []
  pattern.children.each do |child|
    clear_sources_and_sinks(child)
  end
end
256
+
257
#Look up the node found for the text _example_ in the first source document
#of the root pattern and return its 'href' attribute (nil if there is no such node)
def generate_next_page_link(example)
  link = XPathUtils.find_node_from_text(@root_pattern.source[0], example)
  link == nil ? nil : link.attributes['href']
end # end of method generate_next_page_link
262
+ end #end of class Pattern
263
+ end #end of module Scrubyt