scrubyt 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -109,7 +109,7 @@ private
109
109
  first_line = contents.scan(/.*Extractor\.define.*/)
110
110
  #During wrapper construction, we count the number of blocks; add one occurrence of
111
111
  #end (to close the block of the extractor definition)
112
- count = pattern.root_pattern.block_count + 1
112
+ count = pattern.evaluation_context.block_count + 1
113
113
  #Construct the extractor definition matching regexp based on the number of ends
114
114
  definition = contents.scan(/Extractor\.define(?:.*?(?:\}|end)){#{count.to_s}}/m)
115
115
  #Since the regexp matching the extractor definition was multiline, get the first
@@ -117,14 +117,13 @@ private
117
117
  rows = definition[0].split("\n")
118
118
  #Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
119
119
  #patterns could be matched very easily from the extractor definition (because they began
120
- #with 'P.'). Now that P has been removed, mimick it!
121
- keywords = ['fetch', 'fill_textfield', 'submit', 'end', 'click_link']
120
+ #with 'P.'). Now that P has been removed, mimic it!
122
121
  rows.each do |row|
123
122
  #Do not prepend P. to comments and empty lines
124
123
  next if (row.strip =~ /^#/ || row.strip == '')
125
124
  #Do not prepend P. to any of the reserved keywords
126
125
  jump_to_next = false
127
- keywords.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
126
+ NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
128
127
  next if jump_to_next
129
128
  #Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
130
129
  row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
@@ -180,8 +179,13 @@ private
180
179
 
181
180
  def self.replace_example_with_xpath(name, xpaths, left_delimiter, right_delimiter=left_delimiter)
182
181
  return if name=='root'
183
- full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
184
- examples = full_line.split(",")
182
+ parens = @full_definition.scan(/P.#{name}\s*\((.+?)\)/)
183
+ if parens.empty?
184
+ full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
185
+ else
186
+ full_line = parens[0][0]
187
+ end
188
+ examples = full_line.split(",")
185
189
  examples.reject! {|exa| exa.strip!; exa[0..0] != %q{"} && exa[0..0] != %q{'} }
186
190
  all_xpaths = ""
187
191
  examples.each do |e|
@@ -193,9 +197,11 @@ private
193
197
  end
194
198
  replacing_xpath = full_line.include?('{') ? "P.#{name}('#{all_xpaths}')" :
195
199
  "P.#{name} #{all_xpaths}"
196
- @full_definition.sub!(/P\.#{name}\s+#{left_delimiter}(.*)#{right_delimiter}/) do
200
+ optional_paren_escaped = parens.empty? ? '' : '\('
201
+ optional_paren = parens.empty? ? '' : '('
202
+ @full_definition.sub!(/P\.#{name}\s*#{optional_paren_escaped}#{left_delimiter}(.*)#{right_delimiter}/) do
197
203
  @name_to_xpath_map.delete("#{name}")
198
- replacing_xpath
204
+ optional_paren + replacing_xpath
199
205
  end
200
206
  end
201
207
 
@@ -0,0 +1,137 @@
1
+ module Scrubyt
2
+
3
+ require 'set'
4
+ ##
5
+ #=<tt>Post processing results after the extraction</tt>
6
+ #Some things can not be carried out during evaluation - for example
7
+ #the ensure_presence_of_pattern constraint (since the evaluation is top
8
+ #to bottom, at a given point we don't know yet whether the currently
9
+ #evaluated pattern will have a child pattern or not) or removing unneeded
10
+ #results caused by evaluating multiple filters.
11
+ #
12
+ #The sole purpose of this class is to execute these post-processing tasks.
13
+ class PostProcessor
14
+ ##
15
+ #This is just a convenience method to call all the postprocessing
16
+ #functionality and checks
17
+ def self.apply_post_processing(root_pattern)
18
+ ensure_presence_of_pattern_full(root_pattern)
19
+ remove_multiple_filter_duplicates(root_pattern) if root_pattern.children[0].filters.size > 1
20
+ report_if_no_results(root_pattern) if root_pattern.evaluation_context.extractor.get_mode != :production
21
+ end
22
+
23
+ ##
24
+ #Apply the ensure_presence_of_pattern constraint on
25
+ #the full extractor
26
+ def self.ensure_presence_of_pattern_full(pattern)
27
+ ensure_presence_of_pattern(pattern)
28
+ pattern.children.each {|child| ensure_presence_of_pattern_full(child)}
29
+ end
30
+
31
+ ##
32
+ #Remove unneeded results of a pattern (caused by evaluating multiple filters)
33
+ #See for example the B&N scenario - the book titles are extracted two times
34
+ #for every pattern (since both examples generate the same XPath for them)
35
+ #but since always only one of the results has a price, the other is discarded
36
+ def self.remove_multiple_filter_duplicates(pattern)
37
+ remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
38
+ pattern.children.each {|child| remove_multiple_filter_duplicates(child)}
39
+ end
40
+
41
+ ##
42
+ #Print a warning if the extractor did not extract anything.
43
+ #Probably this is because the structure of the page changed or
44
+ #because of some rather nasty bug - in any case, something wrong
45
+ #is going on, and we need to inform the user about this!
46
+ def self.report_if_no_results(root_pattern)
47
+ results_found = false
48
+ root_pattern.children.each {|child| return if (child.result.childmap.size > 0)}
49
+ puts
50
+ puts "!!!!!! WARNING: The extractor did not find any result instances"
51
+ puts "Most probably this is wrong. Check your extractor and if you are"
52
+ puts "sure it should work, report a bug!"
53
+ puts
54
+ end
55
+
56
+ private
57
+ def self.ensure_presence_of_pattern(pattern)
58
+ #holds the name of those child patterns which have to be present as children of the input parameter
59
+ epop_names = pattern.get_constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
60
+ return if epop_names.empty?
61
+ #all_parent_values holds instances extracted by pattern
62
+ all_parent_values = []
63
+ pattern.result.childmap.each { |h| all_parent_values << h.values }
64
+ all_parent_values.flatten!
65
+ #indices of result instances (of pattern) we are going to remove
66
+ results_to_remove = Set.new
67
+ pattern.children.each do |child_pattern|
68
+ #all_child_values holds instances extracted by child_pattern
69
+ all_child_values = []
70
+ child_pattern.result.childmap.each { |h| all_child_values << h.values }
71
+ all_child_values.flatten!
72
+
73
+ #populate results_to_remove
74
+ i = 0
75
+ all_parent_values.each do |parent_value|
76
+ #Hey! Not just the direct children but all the descendants
77
+ @found_ancestor = false
78
+ check_ancestors(parent_value, all_child_values)
79
+
80
+ results_to_remove << i if (!@found_ancestor && (epop_names.include? child_pattern.name))
81
+ i += 1
82
+ end
83
+ end
84
+ #based on results_to_remove, populate the array 'rejected' which holds the actual instances
85
+ #(and not indices, as in the case of results_to_remove!). In other words, we are mapping
86
+ #results_to_remove indices to their actual instances
87
+ rejected = []
88
+ i = -1
89
+ pattern.result.childmap.each do |h|
90
+ h.each { |k,v| rejected = v.reject {|e| i += 1; !results_to_remove.include? i } }
91
+ end
92
+
93
+ #Finally, do the actual delete!
94
+ pattern.result.childmap.each { |h| h.each { |k,v| rejected.each { |r| v.delete(r)} } }
95
+ end
96
+
97
+ def self.check_ancestors(parent_value, all_child_values)
98
+ parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
99
+ parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
100
+ end
101
+
102
+ def self.remove_multiple_filter_duplicates_intern(pattern)
103
+ possible_duplicates = {}
104
+ longest_result = 0
105
+ pattern.result.childmap.each { |r|
106
+ r.each do |k,v|
107
+ v.each do |x|
108
+ all_child_results = []
109
+ pattern.children.each { |child|
110
+ temp_res = child.result.lookup(x)
111
+ all_child_results << temp_res if temp_res != nil
112
+ }
113
+ next if all_child_results.size <= 1
114
+ longest_result = all_child_results.map {|e| e.size}.max
115
+ all_child_results.each { |r| (r.size+1).upto(longest_result) { r << nil } }
116
+ possible_duplicates[x] = all_child_results.transpose
117
+ end
118
+ end
119
+ }
120
+ #Determine the 'real' duplicates
121
+ real_duplicates = {}
122
+ possible_duplicates.each { |k,v|
123
+ next if v.size == 1
124
+ v.each { |r| real_duplicates[k] = r }
125
+ }
126
+
127
+ #Finally, remove them!
128
+ pattern.children.each { |child|
129
+ child.result.childmap.each { |r|
130
+ r.each { |k,v|
131
+ real_duplicates[k].each {|e| v.delete e} if real_duplicates.keys.include? k
132
+ }
133
+ }
134
+ }
135
+ end #end of function
136
+ end #end of class PostProcessor
137
+ end #end of module Scrubyt
@@ -81,13 +81,6 @@ private
81
81
  end
82
82
  end
83
83
 
84
- def self.print_old_sta(pattern, depth)
85
- puts((' ' * "#{depth}".to_i) + "#{pattern.name} extracted #{pattern.get_instance_count[pattern.name]} instances.") if pattern.name != 'root'
86
- pattern.children.each do |child|
87
- print_statistics_recursive(child, depth + 4)
88
- end
89
- end
90
-
91
84
  def self.print_statistics_recursive(pattern, depth)
92
85
  if pattern.name != 'root'
93
86
  count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
@@ -24,6 +24,8 @@ module Scrubyt
24
24
  def self.find_node_from_text(doc, text, next_link)
25
25
  @node = nil
26
26
  @found = false
27
+ #digg next page hack
28
+ text.gsub!('»', '&#187;')
27
29
  self.traverse_for_full_text(doc,text)
28
30
  self.lowest_possible_node_with_text(@node, text) if @node != nil
29
31
  if (@found == false)
@@ -138,7 +140,7 @@ module Scrubyt
138
140
  #most typically the user will need the 0th - but if this is not the
139
141
  #case, there is the possibility to override this
140
142
  def self.find_image(doc, example, index=0)
141
- (doc/"img[@src='#{example}']")[index]
143
+ (doc/"//img[@src='#{example}']")[index]
142
144
  end
143
145
 
144
146
  ##
@@ -208,7 +210,8 @@ private
208
210
  def self.traverse_for_full_text(node, text)
209
211
  return if @found
210
212
  if (node.instance_of? Hpricot::Elem)
211
- ft = unescape_entities(full_text(node)).strip
213
+ ft = unescape_entities(full_text(node)).strip
214
+ #puts ">>#{ft}<<, text:>>#{text}<<" if ((ft.size < 20) && ft =~ /Open Source/)
212
215
  if (ft == text)
213
216
  @found = true
214
217
  @node = node
@@ -0,0 +1,27 @@
1
+ require 'rubygems'
2
+ require 'scrubyt'
3
+ require 'test/unit'
4
+
5
+ class PatternTest < Test::Unit::TestCase
6
+
7
+ def test_select_indices
8
+ some_pattern = Scrubyt::Pattern.new('some_pattern')
9
+ some_pattern.select_indices(1..3)
10
+ assert_equal(some_pattern.result_indexer.indices_to_extract, [1,2,3])
11
+ some_pattern.select_indices([1])
12
+ assert_equal(some_pattern.result_indexer.indices_to_extract, [1])
13
+ some_pattern.select_indices([1,2,3])
14
+ assert_equal(some_pattern.result_indexer.indices_to_extract, [1,2,3])
15
+ some_pattern.select_indices(:first)
16
+ assert_equal(some_pattern.result_indexer.indices_to_extract, [:first])
17
+ some_pattern.select_indices([:first, :last])
18
+ assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,:last])
19
+ some_pattern.select_indices([:first, [5,6]])
20
+ assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,5,6])
21
+ some_pattern.select_indices([:first, 1..2])
22
+ assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,1,2])
23
+ some_pattern.select_indices([4..5, :first, [5,6]])
24
+ assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,4,5,6])
25
+ end
26
+
27
+ end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: scrubyt
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.0
7
- date: 2007-02-04 00:00:00 +01:00
6
+ version: 0.2.3
7
+ date: 2007-02-20 00:00:00 +01:00
8
8
  summary: A powerful Web-scraping framework
9
9
  require_paths:
10
10
  - lib
@@ -34,24 +34,30 @@ files:
34
34
  - CHANGELOG
35
35
  - Rakefile
36
36
  - lib/scrubyt.rb
37
- - lib/scrubyt/constraint.rb
38
- - lib/scrubyt/pattern.rb
39
- - lib/scrubyt/result.rb
40
- - lib/scrubyt/export.rb
41
- - lib/scrubyt/constraint_adder.rb
42
- - lib/scrubyt/post_processor.rb
43
- - lib/scrubyt/filter.rb
44
- - lib/scrubyt/xpathutils.rb
45
- - lib/scrubyt/result_dumper.rb
46
- - lib/scrubyt/extractor.rb
37
+ - lib/scrubyt/utils/xpathutils.rb
38
+ - lib/scrubyt/output/result_dumper.rb
39
+ - lib/scrubyt/output/export.rb
40
+ - lib/scrubyt/output/post_processor.rb
41
+ - lib/scrubyt/output/result.rb
42
+ - lib/scrubyt/core/navigation/fetch_action.rb
43
+ - lib/scrubyt/core/navigation/navigation_actions.rb
44
+ - lib/scrubyt/core/scraping/result_indexer.rb
45
+ - lib/scrubyt/core/scraping/constraint_adder.rb
46
+ - lib/scrubyt/core/scraping/constraint.rb
47
+ - lib/scrubyt/core/scraping/filter.rb
48
+ - lib/scrubyt/core/scraping/pattern.rb
49
+ - lib/scrubyt/core/scraping/pre_filter_document.rb
50
+ - lib/scrubyt/core/shared/evaluation_context.rb
51
+ - lib/scrubyt/core/shared/extractor.rb
47
52
  test_files:
48
53
  - test/unittests/input
49
- - test/unittests/constraint_test.rb
50
54
  - test/unittests/filter_test.rb
51
- - test/unittests/xpathutils_test.rb
55
+ - test/unittests/pattern_test.rb
52
56
  - test/unittests/extractor_test.rb
53
- - test/unittests/input/test.html
57
+ - test/unittests/xpathutils_test.rb
58
+ - test/unittests/constraint_test.rb
54
59
  - test/unittests/input/constraint_test.html
60
+ - test/unittests/input/test.html
55
61
  rdoc_options: []
56
62
 
57
63
  extra_rdoc_files: []
@@ -62,5 +68,22 @@ extensions: []
62
68
 
63
69
  requirements: []
64
70
 
65
- dependencies: []
66
-
71
+ dependencies:
72
+ - !ruby/object:Gem::Dependency
73
+ name: hpricot
74
+ version_requirement:
75
+ version_requirements: !ruby/object:Gem::Version::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0.5"
80
+ version:
81
+ - !ruby/object:Gem::Dependency
82
+ name: mechanize
83
+ version_requirement:
84
+ version_requirements: !ruby/object:Gem::Version::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: 0.6.3
89
+ version:
@@ -1,279 +0,0 @@
1
- require 'logger'
2
- require 'open-uri'
3
- require 'rubygems'
4
- require 'mechanize'
5
- require 'hpricot'
6
- require 'pp'
7
- require 'set'
8
-
9
- module Scrubyt
10
- ##
11
- #=<tt>Driving the whole extraction process</tt>
12
- #Extractor is a performer class - it gets an extractor definition and carries
13
- #out the actions and evaluates the wrappers sequentially.
14
- #
15
- #It also defines the actions as class methods - check out the section
16
- #commented with ############# Actions.
17
- class Extractor
18
-
19
- #The definition of the extractor is passed through this method
20
- def self.define(&extractor_definition)
21
- @@current_doc_url = nil
22
- @@current_form = nil
23
- @@current_doc_protocol = nil
24
- @@base_dir = nil
25
- @@host_name = nil
26
- @@agent = WWW::Mechanize.new
27
- #Hack up an artificial root pattern (i.e. do not return the pattern which
28
- #is the root one in the user's definition, but rather the real (invisible)
29
- #root pattern
30
- root_pattern = (class_eval(&extractor_definition)).parent
31
- #A little hack here: upon wrapper construction we are counting the number
32
- #of blocks, so we know the count of the 'end's/'}'s which end the extractor
33
- #definition
34
- #Recursively match data based on examples
35
- root_pattern.setup_examples
36
- #Once all is set up, evaluate the wrapper from the root pattern!
37
- if root_pattern.next_page
38
- current_page_count = 1
39
- loop do
40
- evaluate_wrapper(root_pattern)
41
- break if (root_pattern.limit == current_page_count || root_pattern.crawl_to_new_page == nil)
42
- current_page_count += 1 if root_pattern.limit != nil
43
- end
44
- else
45
- evaluate_wrapper(root_pattern)
46
- end
47
- ensure_all_postconditions(root_pattern)
48
- PostProcessor.remove_multiple_filter_duplicates(root_pattern)
49
- PostProcessor.report_if_no_results(root_pattern)
50
- #Return the root pattern
51
- root_pattern
52
- end
53
-
54
- #build the current wrapper
55
- def self.method_missing(method_name, *args, &block)
56
- pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
57
- if @parent == nil
58
- if method_name.to_s == 'next_page'
59
- @@root_pattern.next_page = args[0]
60
- @@root_pattern.limit = args[1][:limit] if args.size > 1
61
- return @@last_pattern
62
- else
63
- #Create a root pattern
64
- root_pattern = Scrubyt::Pattern.new('root', :type => :root)
65
- @@root_pattern = root_pattern
66
- @@root_pattern.root_pattern = root_pattern
67
- @@root_pattern.root_pattern.extractor = self
68
- #add the currently active document to the root pattern
69
- @@root_pattern.attach_current_document
70
- @@root_pattern.add_child_pattern(pattern)
71
- @@root_pattern.block_count = 0
72
- @@root_pattern.extractor = self
73
- end
74
- else
75
- @parent.add_child_pattern(pattern) if @parent != nil
76
- end
77
- if block_given?
78
- @@root_pattern.block_count = @@root_pattern.block_count + 1
79
- @stack ||=[]
80
- @parent = pattern
81
- @stack.push @parent
82
- class_eval(&block)
83
- @stack.pop
84
- @parent = @stack.last
85
- end
86
- @@last_pattern = pattern
87
- end
88
-
89
- #Used in lord of the hacks vol 1. Check out export.rb if you are still interested
90
- #(You should not be :)
91
- def self.get_block_count
92
- @@root_pattern.block_count
93
- end
94
-
95
- ############# Actions
96
- #
97
-
98
- ##
99
- # At any given point, the current document can be queried with this method; Typically used
100
- # when the navigation is over and the result document is passed to the wrapper
101
- def self.get_current_doc_url
102
- @@current_doc_url
103
- end
104
-
105
- def self.get_hpricot_doc
106
- @@hpricot_doc
107
- end
108
-
109
- ##
110
- #Action to fetch a document (either a file or a http address)
111
- #
112
- #*parameters*
113
- #
114
- #_doc_url_ - the url or file name to fetch
115
- def self.fetch(doc_url, mechanize_doc=nil)
116
- if (mechanize_doc == nil)
117
- @@current_doc_url = doc_url
118
- @@current_doc_protocol = ((doc_url =~ /^http/ || doc_url =~ /^www/) ? :http : :file)
119
- if @@base_dir == nil
120
- @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == :file
121
- else
122
- @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
123
- end
124
-
125
- if @@host_name != nil
126
- if doc_url !~ /#{@@host_name}/
127
- @@current_doc_url = (@@host_name + doc_url)
128
- #remove duplicate parts, like /blogs/en/blogs/en
129
- @@current_doc_url = @@current_doc_url.split('/').uniq.reject{|x| x == ""}.join('/')
130
- @@current_doc_url.sub!('http:/', 'http://')
131
- end
132
- end
133
- puts "[ACTION] fetching document: #{@@current_doc_url}"
134
- if @@current_doc_protocol == :http
135
-
136
- @@mechanize_doc = @@agent.get(@@current_doc_url)
137
- @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0]
138
- @@host_name = doc_url if @@host_name == nil
139
- end
140
- else
141
- @@current_doc_url = doc_url
142
- @@mechanize_doc = mechanize_doc
143
- @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0]
144
- @@host_name = doc_url if @@host_name == nil
145
- end
146
- @@hpricot_doc = Hpricot(open(@@current_doc_url))
147
- end
148
-
149
- ##
150
- #Action to fill a textfield with a query string
151
- #
152
- ##*parameters*
153
- #
154
- #_textfield_name_ - the name of the textfield (e.g. the name of the google search
155
- #textfield is 'q'
156
- #
157
- #_query_string_ - the string that should be entered into the textfield
158
- def self.fill_textfield(textfield_name, query_string)
159
- puts "[ACTION] typing #{query_string} into the textfield named '#{textfield_name}'"
160
- textfield = (@@hpricot_doc/"input[@name=#{textfield_name}]").map()[0]
161
- form_tag = Scrubyt::XPathUtils.traverse_up_until_name(textfield, 'form')
162
- #Refactor this code, it's a total mess
163
- formname = form_tag.attributes['name']
164
- if formname == nil
165
- id_string = form_tag.attributes['id']
166
- if id_string == nil
167
- action_string = form_tag.attributes['action']
168
- if action_string == nil
169
- #If even this fails, do it with a button
170
- else
171
- puts "Finding from action"
172
- puts action_string
173
- find_form_with_attribute('action', action_string)
174
- end
175
- else
176
- puts "Finding from id"
177
- find_form_with_attribute('id', id_string)
178
- end
179
- else
180
- puts "Finding from name"
181
- @@current_form = @@mechanize_doc.forms.with.name(formname).first
182
- end
183
-
184
- eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
185
- end
186
-
187
- def self.find_form_with_attribute(attr, expected_value)
188
- puts "attr: #{attr}"
189
- i = 0
190
- loop do
191
- @@current_form = @@mechanize_doc.forms[i]
192
- print "current a: "
193
- puts @@current_form.form_node.attributes[attr]
194
- return nil if @@current_form == nil
195
- break if @@current_form.form_node.attributes[attr] == expected_value
196
- i+= 1
197
- end
198
- end
199
-
200
- #Submit the last form;
201
- def self.submit
202
- puts '[ACTION] submitting form...'
203
- result_page = @@agent.submit(@@current_form)#, @@current_form.buttons.first)
204
- @@current_doc_url = result_page.uri.to_s
205
- puts "[ACTION] fetched #{@@current_doc_url}"
206
- fetch(@@current_doc_url, result_page)
207
- end
208
-
209
- def self.click_link(link_text)
210
- puts "[ACTION] clicking link: #{link_text}"
211
- link = @@mechanize_doc.links.text(/^#{Regexp.escape(link_text)}$/)
212
- result_page = @@agent.click(link)
213
- @@current_doc_url = result_page.uri.to_s
214
- fetch(@@current_doc_url, result_page)
215
- end
216
-
217
- #
218
- #############
219
-
220
- private
221
- def self.ensure_all_postconditions(pattern)
222
- ensure_postconditions(pattern)
223
- pattern.children.each {|child| ensure_all_postconditions(child)}
224
- end
225
-
226
- def self.ensure_postconditions(pattern)
227
- #holds the name of those child patterns which have to be present as children of the input parameter
228
- epop_names = pattern.get_constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
229
- return if epop_names.empty?
230
- #all_parent_values holds instances extracted by pattern
231
- all_parent_values = []
232
- pattern.result.childmap.each { |h| all_parent_values << h.values }
233
- all_parent_values.flatten!
234
- #indices of result instances (of pattern) we are going to remove
235
- results_to_remove = Set.new
236
- pattern.children.each do |child_pattern|
237
- #all_child_values holds instances extracted by child_pattern
238
- all_child_values = []
239
- child_pattern.result.childmap.each { |h| all_child_values << h.values }
240
- all_child_values.flatten!
241
-
242
- #populate results_to_remove
243
- i = 0
244
- all_parent_values.each do |parent_value|
245
- #Hey! Not just the direct children but all the ancestors
246
- @found_ancestor = false
247
- check_ancestors(parent_value, all_child_values)
248
-
249
- results_to_remove << i if (!@found_ancestor && (epop_names.include? child_pattern.name))
250
- i += 1
251
- end
252
- end
253
- #based on results_to_remove, populate the array 'rejected' which holds the actual instances
254
- #(and not indices, as in the case of results_to_remove!). In other words, we are mapping
255
- #results_to_remove indices to their actual instances
256
- rejected = []
257
- i = -1
258
- pattern.result.childmap.each do |h|
259
- h.each { |k,v| rejected = v.reject {|e| i += 1; !results_to_remove.include? i } }
260
- end
261
-
262
- #Correct the statistics
263
- pattern.get_instance_count[pattern.name] -= rejected.size
264
-
265
- #Finally, do the actual delete!
266
- pattern.result.childmap.each { |h| h.each { |k,v| rejected.each { |r| v.delete(r)} } }
267
- end
268
-
269
- def self.check_ancestors(parent_value, all_child_values)
270
- parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
271
- parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
272
- end
273
-
274
- def self.evaluate_wrapper(pattern)
275
- pattern.evaluate
276
- pattern.children.each { |child| evaluate_wrapper child }
277
- end #end of method evaluate_wrapper
278
- end #end of class Extractor
279
- end #end of module Scrubyt