scrubyt 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,187 @@
1
+ require 'logger'
2
+ require 'open-uri'
3
+ require 'rubygems'
4
+ require 'mechanize'
5
+ require 'hpricot'
6
+ require 'pp'
7
+
8
+ module Scrubyt
9
+ ##
10
+ #=<tt>Driving the whole extraction process</tt>
11
+ #Extractor is a performer class - it gets an extractor definition and carries
12
+ #out the actions and evaluates the wrappers sequentially.
13
+ #
14
+ #It also defines the actions as class methods - check out the section
15
+ #commented with ############# Actions.
16
class Extractor

  #The definition of the extractor is passed through this method.
  #
  #The navigation state (current url, form, protocol, base dir, host name) is
  #reset and a fresh Mechanize agent is created. The block is then evaluated in
  #the context of this class: actions (fetch, submit, ...) are ordinary class
  #methods, everything else ends up in method_missing and builds the pattern tree.
  #
  #Returns the (invisible) root pattern of the evaluated wrapper.
  def self.define(&extractor_definition)
    @@current_doc_url = nil
    @@current_form = nil
    @@current_doc_protocol = nil
    @@base_dir = nil
    @@host_name = nil
    @@agent = WWW::Mechanize.new
    #Forget any nesting left over from an earlier definition that raised half way
    @parent = nil
    @stack = []
    #Hack up an artificial root pattern (i.e. do not return the pattern which
    #is the root one in the user's definition, but rather the real (invisible)
    #root pattern
    root_pattern = class_eval(&extractor_definition).parent
    #A little hack here: upon wrapper construction we are counting the number
    #of blocks, so we know the count of the 'end's/'}'s which end the extractor
    #definition
    #Recursively match data based on examples
    root_pattern.setup_examples
    #Once all is set up, evaluate the wrapper from the root pattern!
    if root_pattern.next_page
      current_page_count = 1
      loop do
        evaluate_wrapper(root_pattern)
        #Stop when the page limit is reached or there is no next page to crawl to
        break if root_pattern.limit == current_page_count || root_pattern.crawl_to_new_page.nil?
        current_page_count += 1 unless root_pattern.limit.nil?
      end
    else
      evaluate_wrapper(root_pattern)
    end
    #Return the root pattern
    root_pattern
  end

  #build the current wrapper
  #
  #Every call which is not an action creates a Scrubyt::Pattern named after the
  #called method. A call on the top level creates the artificial root pattern and
  #attaches the new pattern to it; a call inside a block attaches the new pattern
  #to the pattern owning that block. The special top level call 'next_page' does
  #not add a pattern: it stores the link example (and the optional :limit) on the
  #root pattern and returns the last defined pattern.
  def self.method_missing(method_name, *args, &block)
    pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
    if @parent.nil?
      if method_name.to_s == 'next_page'
        @@root_pattern.next_page = args[0]
        @@root_pattern.limit = args[1][:limit] if args.size > 1
        return @@last_pattern
      end
      #Create a root pattern
      root_pattern = Scrubyt::Pattern.new('root', :type => :root)
      @@root_pattern = root_pattern
      @@root_pattern.root_pattern = root_pattern
      @@root_pattern.extractor = self
      #add the currently active document to the root pattern
      @@root_pattern.attach_current_document
      @@root_pattern.add_child_pattern(pattern)
      @@root_pattern.block_count = 0
    else
      @parent.add_child_pattern(pattern)
    end
    if block_given?
      @@root_pattern.block_count += 1
      @stack ||= []
      @parent = pattern
      @stack.push(pattern)
      begin
        class_eval(&block)
      ensure
        #Restore the nesting level even if the nested definition raised
        @stack.pop
        @parent = @stack.last
      end
    end
    @@last_pattern = pattern
  end

  #Used in lord of the hacks vol 1. Check out export.rb if you are still interested
  #(You should not be :)
  def self.get_block_count
    @@root_pattern.block_count
  end

  ############# Actions
  #

  ##
  # At any given point, the current document can be queried with this method; Typically used
  # when the navigation is over and the result document is passed to the wrapper
  def self.get_current_doc_url
    @@current_doc_url
  end

  #The Hpricot document built by the most recent fetch
  def self.get_hpricot_doc
    @@hpricot_doc
  end

  ##
  #Action to fetch a document (either a file or a http address)
  #
  #*parameters*
  #
  #_doc_url_ - the url or file name to fetch
  #
  #_mechanize_doc_ - an already downloaded Mechanize page (as returned by submit or
  #click_link); if given, nothing is downloaded and this page becomes the current document
  #
  #Returns nil.
  def self.fetch(doc_url, mechanize_doc=nil)
    puts "fetching: #{doc_url}"
    if mechanize_doc.nil?
      @@current_doc_url = doc_url
      @@current_doc_protocol = (doc_url =~ /^http/ || doc_url =~ /^www/) ? :http : :file
      if @@base_dir.nil?
        #The directory of the first fetched file is the base of later relative file names
        @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == :file
      elsif !doc_url.include?(@@base_dir)
        #Only relative names are prefixed; a name already containing the base dir is kept
        @@current_doc_url = @@base_dir + doc_url
      end

      if @@host_name.nil?
        if @@current_doc_protocol == :http
          @@host_name = doc_url.scan(/http:\/\/.+?\//)[0]
          @@host_name = doc_url if @@host_name.nil?
        end
      elsif !doc_url.include?(@@host_name)
        @@current_doc_url = @@host_name + doc_url
      end

      @@mechanize_doc = @@agent.get(@@current_doc_url) if @@current_doc_protocol == :http
    else
      @@current_doc_url = doc_url
      @@mechanize_doc = mechanize_doc
    end
    @@hpricot_doc =
      if mechanize_doc || @@current_doc_protocol == :http
        #Parse the page Mechanize already holds, so that the Hpricot document and the
        #Mechanize document are the same page and nothing is downloaded twice
        Hpricot(@@mechanize_doc.body)
      else
        #The block form closes the handle once the document is parsed
        open(@@current_doc_url) { |io| Hpricot(io) }
      end
    nil
  end

  ##
  #Action to fill a textfield with a query string
  #
  ##*parameters*
  #
  #_textfield_name_ - the name of the textfield (e.g. the name of the google search
  #textfield is 'q'
  #
  #_query_string_ - the string that should be entered into the textfield
  #
  #The form enclosing the textfield becomes the current form (see submit).
  def self.fill_textfield(textfield_name, query_string)
    puts 'fill textfield'
    textfield = (@@hpricot_doc/"input[@name=#{textfield_name}]").first
    formname = Scrubyt::XPathUtils.traverse_up_until_name(textfield, 'form').attributes['name']
    @@current_form = @@mechanize_doc.forms.with.name(formname).first
    #Plain assignment instead of eval: quotes in the query can not break or inject code
    @@current_form[textfield_name.to_s] = query_string.to_s
  end

  #Submit the last form;
  #The page returned by the submission becomes the current document
  def self.submit
    puts 'submit'
    result_page = @@agent.submit(@@current_form)#, @@current_form.buttons.first)
    @@current_doc_url = result_page.uri.to_s
    fetch(@@current_doc_url, result_page)
  end

  #Click the link whose text is exactly _link_text_;
  #The page the link leads to becomes the current document
  def self.click_link(link_text)
    puts 'click link'
    link = @@mechanize_doc.links.text(/^#{Regexp.escape(link_text)}$/)
    result_page = @@agent.click(link)
    @@current_doc_url = result_page.uri.to_s
    fetch(@@current_doc_url, result_page)
  end

  #
  #############

  #Evaluate the given pattern and then, recursively, all of its children.
  #NOTE: this is a class method, so a bare 'private' would not hide it; it is
  #(and always was) publicly callable
  def self.evaluate_wrapper(pattern)
    pattern.evaluate
    pattern.children.each { |child| evaluate_wrapper child }
  end

end #end of class Extractor
187
+ end #end of module Scrubyt
@@ -0,0 +1,144 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Filter out relevant pieces from the parent pattern</tt>
4
+ #
5
+ #A Scrubyt wrapper is almost like a waterfall: water is pouring from the top until
6
+ #it reaches the bottom. The biggest difference is that instead of water, a HTML
7
+ #document travels through the space.
8
+ #
9
+ #Of course Scrubyt would not make much sense if the same document would arrive at
10
+ #the bottom that was poured in at the top - since in this case we might use an
11
+ #identity transformation (i.e. do nothing with the input) as well.
12
+ #
13
+ #This is where filters come in: as their name says, they filter the stuff that is
14
+ #pouring from above, to leave the interesting parts and discard the rest.
15
+ #The working of a filter will be explained most easily by the help of an example.
16
+ #Let's consider that we would like to extract information from a webshop; Concretely
17
+ #we are interested in the name of the items and the URL pointing to the image of the
18
+ #item
19
+ #
20
+ #To accomplish this, first we select the items with the pattern item (a pattern is
21
+ #a logical grouping of filters; see Pattern documentation) Then our new
22
+ #context is the result extracted by the item pattern; For every pattern, further
23
+ #extract the name and the image of the item; and finally, extract the href attribute
24
+ #of the image. Let's see an illustration:
25
+ #
26
+ # root --> This pattern is called a 'root pattern', It is invisible to you
27
+ # | and basically it represents the document; it has no filters
28
+ # +-- item --> Filter what's coming from above (the whole document) to get
29
+ # | relevant pieces of data (in this case webshop items)
30
+ # +-- name --> Again, filter what's coming from above (a webshop item) and
31
+ # | leave only item names after this operation
32
+ # +-- image --> This time filter the image of the item
33
+ # |
34
+ # +-- href --> And finally, from the image elements, get the attribute 'href'
35
class Filter
  #Type of the example this filter is extracted with

  #XPath example, like html/body/tr/td[1] etc.
  EXAMPLE_TYPE_XPATH = 0
  #String from the document, for example 'Canon EOS 300 D'.
  EXAMPLE_TYPE_STRING = 1
  #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
  EXAMPLE_TYPE_IMAGE = 2
  #No example - the actual XPath is determined from the children XPaths (their LCA)
  EXAMPLE_TYPE_CHILDREN = 3
  #Regexp example, like /\d+@*\d+[a-z]/
  EXAMPLE_TYPE_REGEXP = 4

  attr_accessor :example_type, :parent_pattern, :temp_sink, :constraints, :xpath, :regexp

  #Create a filter belonging to _parent_pattern_.
  #
  #The optional second argument is a hash; its :example_type entry sets the example
  #type explicitly. Without it the type is guessed from the example of the pattern.
  def initialize(parent_pattern, *args)
    @parent_pattern = parent_pattern
    options = args[0]
    #If the example type is not explicitly defined in the pattern definition,
    #try to determine it automatically from the example
    @example_type = if options.nil?
                      Filter.determine_example_type(parent_pattern.example)
                    else
                      options[:example_type]
                    end
    @regexp = parent_pattern.example if @example_type == EXAMPLE_TYPE_REGEXP
    @xpath = nil #The xpath to evaluate this filter
    #temp sinks are used for the initial run when determining the XPaths for examples;
    @temp_sink = nil
    @constraints = [] #list of constraints
  end

  #Evaluate this filter. This method should not be called directly - as the pattern hierarchy
  #is evaluated, every pattern evaluates its filters and then they are calling this method
  #
  #Returns the list of results extracted from _source_: the nodes matching the XPath
  #(tree pattern), the value of the attribute named by the example (attribute pattern)
  #or the regexp matches in the inner text (regexp pattern). For any other pattern
  #type nothing is extracted and nil is returned.
  def evaluate(source)
    case @parent_pattern.type
    when Scrubyt::Pattern::PATTERN_TYPE_TREE
      result = source/@xpath
      #Always hand back a list, even if the query produced a single object
      result.class == Hpricot::Elements ? result.to_a : [result]
    when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
      [source.attributes[@parent_pattern.example]]
    when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
      source.inner_text.scan(@regexp).flatten
    end
  end

  #For all the tree patterns, generate an XPath based on the given example
  #Also this method should not be called directly; It is automatically called for every tree
  #pattern directly after wrapper definition
  def generate_XPath_for_example
    case @example_type
    when EXAMPLE_TYPE_XPATH
      @xpath = @parent_pattern.example
    when EXAMPLE_TYPE_STRING
      @temp_sink = XPathUtils.find_node_from_text(@parent_pattern.root_pattern.source[0], @parent_pattern.example)
      @xpath = absolute_XPath_for(@temp_sink)
    when EXAMPLE_TYPE_CHILDREN
      generate_XPath_from_children
    when EXAMPLE_TYPE_IMAGE
      @temp_sink = XPathUtils.find_image(@parent_pattern.root_pattern.source[0], @parent_pattern.example)
      @xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
    end
  end

  #Dispatcher method to add constraints; of course, as with any method_missing, this method
  #should not be called directly
  #
  #A call of an unknown method 'foo' is forwarded to Constraint.add_foo(self, *args) and
  #the returned constraint is appended to the constraints of this filter
  def method_missing(method_name, *args, &block)
    constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
  end

  #Guess the example type from the example itself: a Regexp is a regexp example, nil
  #means 'use the children', a string ending in an image extension is an image, a string
  #looking like a /-separated path of lower case names with optional [n] indices is an
  #XPath and everything else is a string example.
  #
  #NOTE: this is a class method, so a bare 'private' would not hide it; it is
  #(and always was) publicly callable
  def self.determine_example_type(example)
    return EXAMPLE_TYPE_REGEXP if example.instance_of? Regexp
    case example
    when nil
      EXAMPLE_TYPE_CHILDREN
    when /\.(jpg|png|gif|jpeg)$/
      EXAMPLE_TYPE_IMAGE
    when /^\/{1,2}[a-z]+(\[\d+\])?(\/{1,2}[a-z]+(\[\d+\])?)*$/
      #The regexp already guarantees a leading '/', no further check is needed
      EXAMPLE_TYPE_XPATH
    else
      EXAMPLE_TYPE_STRING
    end
  end #End of method determine_example_type

  private
  #Absolute XPath of _sink_; generalized patterns are generated with the last flag
  #of XPathUtils.generate_XPath set to false, the others with true
  def absolute_XPath_for(sink)
    XPathUtils.generate_XPath(sink, nil, !@parent_pattern.generalize)
  end

  #No example was given for the pattern: its node is the lowest common ancestor of the
  #example nodes (temp sinks) of its children - or the parent node, if there is only
  #one such node. The XPaths of the child filters are then made relative to this node.
  #
  #Child filters without a temp sink (e.g. children defined with an XPath or a regexp)
  #are left out, and their own XPath is left untouched.
  def generate_XPath_from_children
    child_sinks = []
    @parent_pattern.children.each do |child_pattern|
      child_pattern.filters.each do |filter|
        child_sinks << filter.temp_sink unless filter.temp_sink.nil?
      end
    end
    if child_sinks.empty?
      raise "Can not generate an XPath: the pattern has no example and none of its children has an example node"
    end
    result = child_sinks.pop
    if child_sinks.empty?
      result = result.parent
    else
      child_sinks.each do |child_sink|
        result = XPathUtils.lowest_common_ancestor(result, child_sink)
      end
    end
    @temp_sink = result
    @xpath = absolute_XPath_for(@temp_sink)
    @parent_pattern.children.each do |child_pattern|
      child_pattern.filters.each do |filter|
        next if filter.temp_sink.nil?
        filter.xpath =
          if child_pattern.generalize
            XPathUtils.generate_generalized_relative_XPath(filter.temp_sink, result)
          else
            XPathUtils.generate_relative_XPath(filter.temp_sink, result)
          end
      end
    end
  end
end #End of class
144
+ end #End of module
@@ -0,0 +1,263 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+ require 'open-uri'
4
+
5
+ module Scrubyt
6
+ ##
7
+ #=<tt>Group more filters into one</tt>
8
+ #
9
+ #Server as an umbrella for filters which are conceptually extracting
10
+ #the same thing - for example a price or a title or ...
11
+ #
12
+ #Sometimes the same piece of information can not be extracted with one filter
13
+ #across more result instances (for example a price has an XPath in record n,
14
+ #but since in record n+1 has a discount price as well, the real price is pushed
15
+ #to a different XPath etc) - in this case the more filters which extract the same
16
+ #thing are held in the same pattern.
17
class Pattern
  #Type of the pattern;

  # a root pattern represents a (surprise!) root pattern
  PATTERN_TYPE_ROOT = 0
  # a tree pattern represents a HTML region
  PATTERN_TYPE_TREE = 1
  # represents an attribute of the node extracted by the parent pattern
  PATTERN_TYPE_ATTRIBUTE = 2
  # represents a pattern which filters its output with a regexp
  PATTERN_TYPE_REGEXP = 3

  #The pattern can be either a model pattern (in this case it is
  #written to the output) or a temp pattern (in this case it is skipped)
  #Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
  #is considered to be a model pattern

  #Model pattern are shown in the output
  OUTPUT_TYPE_MODEL = 0
  #Temp patterns are skipped in the output (their ancestors are appended to the parent
  #of the pattern which was skipped
  OUTPUT_TYPE_TEMP = 1

  #These fields can be set upon wrapper creation - i.e. a field which is public but not contained here can be accessed
  #from outside, but not set as a result of wrapper construction
  SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'example'].freeze

  attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
                :last_result, :result, :root_pattern, :example, :block_count,
                :next_page, :limit, :extractor, :extracted_docs, :source, :sink
  attr_reader :type, :generalize_set, :next_page_url

  #Create a pattern called _name_; the rest of the arguments is described at parse_args.
  #A filter for the example of the pattern is created right away.
  def initialize(name, *args)
    @name = name #name of the pattern
    parse_args(args) #parse the rest of the arguments
    @root_pattern = nil #root pattern of the wrapper
    @children = [] #child patterns
    @filters = [] #filters of the wrapper
    @sink = [] #output of a pattern
    @source = [] #input of a pattern
    @result = Result.new #hierarchical results of the pattern
    #NOTE: the counter is shared by all the patterns and it is reset whenever a
    #new pattern is created
    @@instance_count = Hash.new(0)
    @next_page = nil
    filters << Scrubyt::Filter.new(self) #create a filter
  end

  #Parse the args passed as *args; There is only one compulsory parameter to pattern: it's name
  #All the other parameters can (but do not have to) be specified;
  #
  #If an example is specified, it *MUST* be the first parameter; the order of the other
  #parameters is irrelevant
  #
  #The other parameters are hashes; only the keys listed in SETTABLE_FIELDS are used.
  #The values of :type and :output_type (e.g. :attribute, :temp) are turned into the
  #corresponding PATTERN_TYPE_ / OUTPUT_TYPE_ constant; an unknown value raises a NameError.
  def parse_args(args)
    #If an example is defined, not only get it but also remove it so it
    #does not interfere with the other possible string parameters
    @example = args.delete_at(0) if args[0].instance_of? String
    @example = args.delete_at(0) if args[0].instance_of? Regexp
    args.each do |arg|
      arg.each do |k, v|
        key = k.to_s
        #Set only the settable fields
        if SETTABLE_FIELDS.include? key
          value = case key
                  #If the user is specifying a pattern type, turn it into the corresponding constant
                  when 'type' then self.class.const_get("PATTERN_TYPE_#{v.to_s.upcase}")
                  when 'output_type' then self.class.const_get("OUTPUT_TYPE_#{v.to_s.upcase}")
                  #Otherwise, if nothing special is happening, store the value as it is
                  else v
                  end
          #Set the variable directly: the value is data and it is never evaluated as code
          instance_variable_set("@#{key}", value)
        end
        #This flags says that the user explicitly wants to set generalization on a pattern
        #In this case, of course, our heuristics do not apply - the users setting overrides
        #it
        @generalize_set = true if key == 'generalize'
      end
    end
    #default settings - the user can override them, but if she did not do so,
    #we will setup some meaningful defaults
    @type ||= PATTERN_TYPE_TREE
    @type = PATTERN_TYPE_REGEXP if @example.instance_of? Regexp
    @output_type ||= OUTPUT_TYPE_MODEL
    #don't generalize by default
    @generalize ||= false
    #This flag indicates that the user set 'generalize' to some value;
    #This way we can ensure that the explicit setting will not be overridden
    @generalize_set ||= false
  end

  #Dispatcher function; The class was already too big so I have decided to factor
  #out some methods based on their functionality (like output, adding constraints)
  #to utility classes.
  #
  #The second function besides dispatching is to lookup the results in an evaluated
  #wrapper, for example
  #
  # camera_data.item[1].item_name[0]
  #
  #Methods starting with to_ are sent to Scrubyt::ResultDumper, methods starting with
  #ensure_ to Scrubyt::ConstraintAdder; any other name returns the child pattern of
  #that name (or nil if there is no such child)
  def method_missing(method_name, *args, &block)
    case method_name.to_s
    when /^to_/
      Scrubyt::ResultDumper.send(method_name.to_s, self)
    when /^ensure_/
      Scrubyt::ConstraintAdder.send(method_name, self, *args)
    else
      @children.find { |child| child.name == method_name.to_s }
    end
  end

  #Companion function to the previous one (Pattern::method_missing). It makes
  #inspecting results, like
  #
  # camera_data.item[1].item_name[0]
  #
  #possible. The method Pattern::method missing handles the 'item', 'item_name' etc.
  #parts, while the indexing ([1], [0]) is handled by this function
  #
  #Returns the pattern itself (so the lookup can be continued), or nil if there
  #are no results for the last result of the parent pattern
  def [](index)
    results = @result.lookup(@parent.last_result)
    return nil if results.nil?
    @last_result = results[index]
    self
  end

  ##
  #If export is called on the root pattern, it exports the whole extractor where it is
  #defined; See export.rb for further details on the parameters
  def export(file, output_file_name=nil, extractor_result_file_name=nil)
    Scrubyt::Export.export(file, self, output_file_name, extractor_result_file_name)
  end

  ##
  #Add a filter to this pattern; returns the pattern itself
  def add_filter(filter)
    @filters << filter
    self
  end

  ##
  #Add a child pattern to this pattern
  def add_child_pattern(child)
    child.parent = self
    #by default, generalize direct children of the root pattern (the only pattern
    #without a parent), but only in the case if @generalize was not set up explicitly
    child.generalize = true if !child.generalize_set && @parent.nil?
    @children << child
  end

  ##
  #Crawl to a new page. This function should not be called from the outside - it is automatically called
  #if the next_page is defined
  #
  #Returns nil if no link to the next page was found in the current document
  def crawl_to_new_page
    temp_document = generate_next_page_link(@next_page)
    return nil if temp_document.nil?
    clear_sources_and_sinks(@root_pattern)
    @root_pattern.extractor.fetch(temp_document, nil)
    attach_current_document
  end

  ##
  #Attach document to the root pattern; This is happening automatically as the root pattern is defined or
  #crawling to a new page
  def attach_current_document
    doc = @root_pattern.extractor.get_hpricot_doc
    @source << doc
    @sink << doc
    @last_result ||= []
    @last_result << doc
    @result.add_result(@source, @sink)
  end

  ##
  #Based on the given examples, calculate the XPaths for the tree patterns
  def setup_examples
    get_root_pattern(self)
    set_root_pattern_whole_wrapper(@root_pattern, @root_pattern)
    generate_examples(@root_pattern)
  end

  ##
  #Evaluate the pattern. This means evaluating all the filters and adding
  #their extracted instances to the array of results of this pattern
  #
  #The input of the pattern is the output (sink) of its parent. If a filter has
  #constraints, only the instances accepted by all of them are kept.
  def evaluate
    #No need to evaluate if there is no parent pattern
    return if @parent.nil?
    @source = @parent.sink
    @source.each do |source|
      @filters.each do |filter|
        extracted = filter.evaluate(source)
        unless filter.constraints.empty?
          #select keeps the original order of the accepted instances
          extracted = extracted.select do |res|
            filter.constraints.all? { |constraint| constraint.check(res) }
          end
        end
        add_result(source, extracted)
      end
    end
  end

  #Number of the extracted instances, keyed by pattern name
  def get_instance_count
    @@instance_count
  end

  private
  #Append the results extracted from _source_ to the output of this pattern
  def add_result(source, results)
    results.each do |res|
      @sink << res
      @result.add_result(source, res)
      @@instance_count[@name] += 1
    end
  end

  #Find the root of the tree _pattern_ belongs to (the only pattern without a parent)
  #and remember it as the root pattern; does nothing if the root pattern is already known
  def get_root_pattern(pattern)
    return unless @root_pattern.nil?
    #Walk upwards until the top of the tree is reached
    pattern = pattern.parent until pattern.parent.nil?
    @root_pattern = pattern
  end

  #Set the root pattern of _pattern_ and all of its descendants
  def set_root_pattern_whole_wrapper(pattern, root_pattern)
    pattern.children.each { |child| set_root_pattern_whole_wrapper(child, root_pattern) }
    pattern.root_pattern = root_pattern
  end

  #Generate the XPaths of the tree patterns; the children come first, because a pattern
  #without an example is calculated from the example nodes of its children
  def generate_examples(pattern)
    pattern.children.each { |child_pattern| generate_examples(child_pattern) }
    pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == PATTERN_TYPE_TREE
  end

  #Empty the input and the output of _pattern_ and all of its descendants
  def clear_sources_and_sinks(pattern)
    pattern.source = []
    pattern.sink = []
    pattern.children.each { |child| clear_sources_and_sinks child }
  end

  #Find the node with the text _example_ in the current document and return
  #its href attribute; returns nil if there is no such node
  def generate_next_page_link(example)
    node = XPathUtils.find_node_from_text(@root_pattern.source[0], example)
    return nil if node.nil?
    node.attributes['href']
  end # end of method generate_next_page_link
end #end of class Pattern
263
+ end #end of module Scrubyt