scrubyt 0.2.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +132 -1
- data/Rakefile +4 -2
- data/lib/scrubyt.rb +15 -10
- data/lib/scrubyt/core/navigation/fetch_action.rb +152 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +106 -0
- data/lib/scrubyt/{constraint.rb → core/scraping/constraint.rb} +0 -0
- data/lib/scrubyt/{constraint_adder.rb → core/scraping/constraint_adder.rb} +0 -0
- data/lib/scrubyt/{filter.rb → core/scraping/filter.rb} +22 -4
- data/lib/scrubyt/{pattern.rb → core/scraping/pattern.rb} +21 -98
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +13 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +88 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +97 -0
- data/lib/scrubyt/core/shared/extractor.rb +116 -0
- data/lib/scrubyt/{export.rb → output/export.rb} +14 -8
- data/lib/scrubyt/output/post_processor.rb +137 -0
- data/lib/scrubyt/{result.rb → output/result.rb} +0 -0
- data/lib/scrubyt/{result_dumper.rb → output/result_dumper.rb} +0 -7
- data/lib/scrubyt/{xpathutils.rb → utils/xpathutils.rb} +5 -2
- data/test/unittests/pattern_test.rb +27 -0
- metadata +40 -17
- data/lib/scrubyt/extractor.rb +0 -279
- data/lib/scrubyt/post_processor.rb +0 -73
@@ -109,7 +109,7 @@ private
|
|
109
109
|
first_line = contents.scan(/.*Extractor\.define.*/)
|
110
110
|
#During wrapper construction, we count the number of blocks; add one occurrence of
|
111
111
|
#end (to close the block of the extractor definition)
|
112
|
-
count = pattern.
|
112
|
+
count = pattern.evaluation_context.block_count + 1
|
113
113
|
#Construct the extractor definition matching regexp based on the number of ends
|
114
114
|
definition = contents.scan(/Extractor\.define(?:.*?(?:\}|end)){#{count.to_s}}/m)
|
115
115
|
#Since the regexp matching the extractor definition was multiline, get the first
|
@@ -117,14 +117,13 @@ private
|
|
117
117
|
rows = definition[0].split("\n")
|
118
118
|
#Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
|
119
119
|
#patterns could be matched very easily from the extractor definition (because they begun
|
120
|
-
#with 'P.'). Now that P has been removed, mimick it!
|
121
|
-
keywords = ['fetch', 'fill_textfield', 'submit', 'end', 'click_link']
|
120
|
+
#with 'P.'). Now that P has been removed, mimick it!
|
122
121
|
rows.each do |row|
|
123
122
|
#Do not prepend P. to comments and empty lines
|
124
123
|
next if (row.strip =~ /^#/ || row.strip == '')
|
125
124
|
#Do not prepend P. to any of the reserved keywords
|
126
125
|
jump_to_next = false
|
127
|
-
|
126
|
+
NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
|
128
127
|
next if jump_to_next
|
129
128
|
#Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
|
130
129
|
row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
|
@@ -180,8 +179,13 @@ private
|
|
180
179
|
|
181
180
|
def self.replace_example_with_xpath(name, xpaths, left_delimiter, right_delimiter=left_delimiter)
|
182
181
|
return if name=='root'
|
183
|
-
|
184
|
-
|
182
|
+
parens = @full_definition.scan(/P.#{name}\s*\((.+?)\)/)
|
183
|
+
if parens.empty?
|
184
|
+
full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
|
185
|
+
else
|
186
|
+
full_line = parens[0][0]
|
187
|
+
end
|
188
|
+
examples = full_line.split(",")
|
185
189
|
examples.reject! {|exa| exa.strip!; exa[0..0] != %q{"} && exa[0..0] != %q{'} }
|
186
190
|
all_xpaths = ""
|
187
191
|
examples.each do |e|
|
@@ -193,9 +197,11 @@ private
|
|
193
197
|
end
|
194
198
|
replacing_xpath = full_line.include?('{') ? "P.#{name}('#{all_xpaths}')" :
|
195
199
|
"P.#{name} #{all_xpaths}"
|
196
|
-
|
200
|
+
optional_paren_escaped = parens.empty? ? '' : '\('
|
201
|
+
optional_paren = parens.empty? ? '' : '('
|
202
|
+
@full_definition.sub!(/P\.#{name}\s*#{optional_paren_escaped}#{left_delimiter}(.*)#{right_delimiter}/) do
|
197
203
|
@name_to_xpath_map.delete("#{name}")
|
198
|
-
replacing_xpath
|
204
|
+
optional_paren + replacing_xpath
|
199
205
|
end
|
200
206
|
end
|
201
207
|
|
@@ -0,0 +1,137 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
|
3
|
+
require 'set'
|
4
|
+
##
|
5
|
+
#=<tt>Post processing results after the extraction</tt>
|
6
|
+
#Some things can not be carried out during evaluation - for example
|
7
|
+
#the ensure_presence_of_pattern constraint (since the evaluation is top
|
8
|
+
#to bottom, at a given point we don't know yet whether the currently
|
9
|
+
#evaluated pattern will have a child pattern or not) or removing unneeded
|
10
|
+
#results caused by evaluating multiple filters.
|
11
|
+
#
|
12
|
+
#The sole purpose of this class is to execute these post-processing tasks.
|
13
|
+
class PostProcessor
|
14
|
+
##
|
15
|
+
#This is just a convenience method do call all the postprocessing
|
16
|
+
#functionality and checks
|
17
|
+
def self.apply_post_processing(root_pattern)
|
18
|
+
ensure_presence_of_pattern_full(root_pattern)
|
19
|
+
remove_multiple_filter_duplicates(root_pattern) if root_pattern.children[0].filters.size > 1
|
20
|
+
report_if_no_results(root_pattern) if root_pattern.evaluation_context.extractor.get_mode != :production
|
21
|
+
end
|
22
|
+
|
23
|
+
##
|
24
|
+
#Apply the ensure_presence_of_pattern constraint on
|
25
|
+
#the full extractor
|
26
|
+
def self.ensure_presence_of_pattern_full(pattern)
|
27
|
+
ensure_presence_of_pattern(pattern)
|
28
|
+
pattern.children.each {|child| ensure_presence_of_pattern_full(child)}
|
29
|
+
end
|
30
|
+
|
31
|
+
##
|
32
|
+
#Remove unneeded results of a pattern (caused by evaluating multiple filters)
|
33
|
+
#See for example the B&N scenario - the book titles are extracted two times
|
34
|
+
#for every pattern (since both examples generate the same XPath for them)
|
35
|
+
#but since always only one of the results has a price, the other is discarded
|
36
|
+
def self.remove_multiple_filter_duplicates(pattern)
|
37
|
+
remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
|
38
|
+
pattern.children.each {|child| remove_multiple_filter_duplicates(child)}
|
39
|
+
end
|
40
|
+
|
41
|
+
##
|
42
|
+
#Issue an error report if the document did not extract anything.
|
43
|
+
#Probably this is because the structure of the page changed or
|
44
|
+
#because of some rather nasty bug - in any case, something wrong
|
45
|
+
#is going on, and we need to inform the user about this!
|
46
|
+
def self.report_if_no_results(root_pattern)
|
47
|
+
results_found = false
|
48
|
+
root_pattern.children.each {|child| return if (child.result.childmap.size > 0)}
|
49
|
+
puts
|
50
|
+
puts "!!!!!! WARNING: The extractor did not find any result instances"
|
51
|
+
puts "Most probably this is wrong. Check your extractor and if you are"
|
52
|
+
puts "sure it should work, report a bug!"
|
53
|
+
puts
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
def self.ensure_presence_of_pattern(pattern)
|
58
|
+
#holds the name of those child patterns which have to be present as children of the input parameter
|
59
|
+
epop_names = pattern.get_constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
|
60
|
+
return if epop_names.empty?
|
61
|
+
#all_parent_values holds instances extracted by pattern
|
62
|
+
all_parent_values = []
|
63
|
+
pattern.result.childmap.each { |h| all_parent_values << h.values }
|
64
|
+
all_parent_values.flatten!
|
65
|
+
#indices of result instances (of pattern) we are going to remove
|
66
|
+
results_to_remove = Set.new
|
67
|
+
pattern.children.each do |child_pattern|
|
68
|
+
#all_child_values holds instances extracted by child_pattern
|
69
|
+
all_child_values = []
|
70
|
+
child_pattern.result.childmap.each { |h| all_child_values << h.values }
|
71
|
+
all_child_values.flatten!
|
72
|
+
|
73
|
+
#populate results_to_remove
|
74
|
+
i = 0
|
75
|
+
all_parent_values.each do |parent_value|
|
76
|
+
#Hey! Not just the direct children but all the ancestors
|
77
|
+
@found_ancestor = false
|
78
|
+
check_ancestors(parent_value, all_child_values)
|
79
|
+
|
80
|
+
results_to_remove << i if (!@found_ancestor && (epop_names.include? child_pattern.name))
|
81
|
+
i += 1
|
82
|
+
end
|
83
|
+
end
|
84
|
+
#based on results_to_remove, populate the array 'rejected' which holds the actual instances
|
85
|
+
#(and not indices, as in the case of results_to_remove!). In other words, we are mapping
|
86
|
+
#results_to_remove indices to their actual instances
|
87
|
+
rejected = []
|
88
|
+
i = -1
|
89
|
+
pattern.result.childmap.each do |h|
|
90
|
+
h.each { |k,v| rejected = v.reject {|e| i += 1; !results_to_remove.include? i } }
|
91
|
+
end
|
92
|
+
|
93
|
+
#Finally, do the actual delete!
|
94
|
+
pattern.result.childmap.each { |h| h.each { |k,v| rejected.each { |r| v.delete(r)} } }
|
95
|
+
end
|
96
|
+
|
97
|
+
def self.check_ancestors(parent_value, all_child_values)
|
98
|
+
parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
|
99
|
+
parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
|
100
|
+
end
|
101
|
+
|
102
|
+
def self.remove_multiple_filter_duplicates_intern(pattern)
|
103
|
+
possible_duplicates = {}
|
104
|
+
longest_result = 0
|
105
|
+
pattern.result.childmap.each { |r|
|
106
|
+
r.each do |k,v|
|
107
|
+
v.each do |x|
|
108
|
+
all_child_results = []
|
109
|
+
pattern.children.each { |child|
|
110
|
+
temp_res = child.result.lookup(x)
|
111
|
+
all_child_results << temp_res if temp_res != nil
|
112
|
+
}
|
113
|
+
next if all_child_results.size <= 1
|
114
|
+
longest_result = all_child_results.map {|e| e.size}.max
|
115
|
+
all_child_results.each { |r| (r.size+1).upto(longest_result) { r << nil } }
|
116
|
+
possible_duplicates[x] = all_child_results.transpose
|
117
|
+
end
|
118
|
+
end
|
119
|
+
}
|
120
|
+
#Determine the 'real' duplicates
|
121
|
+
real_duplicates = {}
|
122
|
+
possible_duplicates.each { |k,v|
|
123
|
+
next if v.size == 1
|
124
|
+
v.each { |r| real_duplicates[k] = r }
|
125
|
+
}
|
126
|
+
|
127
|
+
#Finally, remove them!
|
128
|
+
pattern.children.each { |child|
|
129
|
+
child.result.childmap.each { |r|
|
130
|
+
r.each { |k,v|
|
131
|
+
real_duplicates[k].each {|e| v.delete e} if real_duplicates.keys.include? k
|
132
|
+
}
|
133
|
+
}
|
134
|
+
}
|
135
|
+
end #end of function
|
136
|
+
end #end of class PostProcessor
|
137
|
+
end #end of module Scrubyt
|
File without changes
|
@@ -81,13 +81,6 @@ private
|
|
81
81
|
end
|
82
82
|
end
|
83
83
|
|
84
|
-
def self.print_old_sta(pattern, depth)
|
85
|
-
puts((' ' * "#{depth}".to_i) + "#{pattern.name} extracted #{pattern.get_instance_count[pattern.name]} instances.") if pattern.name != 'root'
|
86
|
-
pattern.children.each do |child|
|
87
|
-
print_statistics_recursive(child, depth + 4)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
84
|
def self.print_statistics_recursive(pattern, depth)
|
92
85
|
if pattern.name != 'root'
|
93
86
|
count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
|
@@ -24,6 +24,8 @@ module Scrubyt
|
|
24
24
|
def self.find_node_from_text(doc, text, next_link)
|
25
25
|
@node = nil
|
26
26
|
@found = false
|
27
|
+
#digg next page hack
|
28
|
+
text.gsub!('»', '&#187;')
|
27
29
|
self.traverse_for_full_text(doc,text)
|
28
30
|
self.lowest_possible_node_with_text(@node, text) if @node != nil
|
29
31
|
if (@found == false)
|
@@ -138,7 +140,7 @@ module Scrubyt
|
|
138
140
|
#most typically the user will need the 0th - but if this is not the
|
139
141
|
#case, there is the possibility to override this
|
140
142
|
def self.find_image(doc, example, index=0)
|
141
|
-
(doc/"img[@src='#{example}']")[index]
|
143
|
+
(doc/"//img[@src='#{example}']")[index]
|
142
144
|
end
|
143
145
|
|
144
146
|
##
|
@@ -208,7 +210,8 @@ private
|
|
208
210
|
def self.traverse_for_full_text(node, text)
|
209
211
|
return if @found
|
210
212
|
if (node.instance_of? Hpricot::Elem)
|
211
|
-
ft = unescape_entities(full_text(node)).strip
|
213
|
+
ft = unescape_entities(full_text(node)).strip
|
214
|
+
#puts ">>#{ft}<<, text:>>#{text}<<" if ((ft.size < 20) && ft =~ /Open Source/)
|
212
215
|
if (ft == text)
|
213
216
|
@found = true
|
214
217
|
@node = node
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'scrubyt'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
class PatternTest < Test::Unit::TestCase
|
6
|
+
|
7
|
+
def test_select_indices
|
8
|
+
some_pattern = Scrubyt::Pattern.new('some_pattern')
|
9
|
+
some_pattern.select_indices(1..3)
|
10
|
+
assert_equal(some_pattern.result_indexer.indices_to_extract, [1,2,3])
|
11
|
+
some_pattern.select_indices([1])
|
12
|
+
assert_equal(some_pattern.result_indexer.indices_to_extract, [1])
|
13
|
+
some_pattern.select_indices([1,2,3])
|
14
|
+
assert_equal(some_pattern.result_indexer.indices_to_extract, [1,2,3])
|
15
|
+
some_pattern.select_indices(:first)
|
16
|
+
assert_equal(some_pattern.result_indexer.indices_to_extract, [:first])
|
17
|
+
some_pattern.select_indices([:first, :last])
|
18
|
+
assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,:last])
|
19
|
+
some_pattern.select_indices([:first, [5,6]])
|
20
|
+
assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,5,6])
|
21
|
+
some_pattern.select_indices([:first, 1..2])
|
22
|
+
assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,1,2])
|
23
|
+
some_pattern.select_indices([4..5, :first, [5,6]])
|
24
|
+
assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,4,5,6])
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: scrubyt
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.0
|
7
|
-
date: 2007-02-
|
6
|
+
version: 0.2.3
|
7
|
+
date: 2007-02-20 00:00:00 +01:00
|
8
8
|
summary: A powerful Web-scraping framework
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -34,24 +34,30 @@ files:
|
|
34
34
|
- CHANGELOG
|
35
35
|
- Rakefile
|
36
36
|
- lib/scrubyt.rb
|
37
|
-
- lib/scrubyt/
|
38
|
-
- lib/scrubyt/
|
39
|
-
- lib/scrubyt/
|
40
|
-
- lib/scrubyt/
|
41
|
-
- lib/scrubyt/
|
42
|
-
- lib/scrubyt/
|
43
|
-
- lib/scrubyt/
|
44
|
-
- lib/scrubyt/
|
45
|
-
- lib/scrubyt/
|
46
|
-
- lib/scrubyt/
|
37
|
+
- lib/scrubyt/utils/xpathutils.rb
|
38
|
+
- lib/scrubyt/output/result_dumper.rb
|
39
|
+
- lib/scrubyt/output/export.rb
|
40
|
+
- lib/scrubyt/output/post_processor.rb
|
41
|
+
- lib/scrubyt/output/result.rb
|
42
|
+
- lib/scrubyt/core/navigation/fetch_action.rb
|
43
|
+
- lib/scrubyt/core/navigation/navigation_actions.rb
|
44
|
+
- lib/scrubyt/core/scraping/result_indexer.rb
|
45
|
+
- lib/scrubyt/core/scraping/constraint_adder.rb
|
46
|
+
- lib/scrubyt/core/scraping/constraint.rb
|
47
|
+
- lib/scrubyt/core/scraping/filter.rb
|
48
|
+
- lib/scrubyt/core/scraping/pattern.rb
|
49
|
+
- lib/scrubyt/core/scraping/pre_filter_document.rb
|
50
|
+
- lib/scrubyt/core/shared/evaluation_context.rb
|
51
|
+
- lib/scrubyt/core/shared/extractor.rb
|
47
52
|
test_files:
|
48
53
|
- test/unittests/input
|
49
|
-
- test/unittests/constraint_test.rb
|
50
54
|
- test/unittests/filter_test.rb
|
51
|
-
- test/unittests/
|
55
|
+
- test/unittests/pattern_test.rb
|
52
56
|
- test/unittests/extractor_test.rb
|
53
|
-
- test/unittests/
|
57
|
+
- test/unittests/xpathutils_test.rb
|
58
|
+
- test/unittests/constraint_test.rb
|
54
59
|
- test/unittests/input/constraint_test.html
|
60
|
+
- test/unittests/input/test.html
|
55
61
|
rdoc_options: []
|
56
62
|
|
57
63
|
extra_rdoc_files: []
|
@@ -62,5 +68,22 @@ extensions: []
|
|
62
68
|
|
63
69
|
requirements: []
|
64
70
|
|
65
|
-
dependencies:
|
66
|
-
|
71
|
+
dependencies:
|
72
|
+
- !ruby/object:Gem::Dependency
|
73
|
+
name: hpricot
|
74
|
+
version_requirement:
|
75
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: "0.5"
|
80
|
+
version:
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: mechanize
|
83
|
+
version_requirement:
|
84
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: 0.6.3
|
89
|
+
version:
|
data/lib/scrubyt/extractor.rb
DELETED
@@ -1,279 +0,0 @@
|
|
1
|
-
require 'logger'
|
2
|
-
require 'open-uri'
|
3
|
-
require 'rubygems'
|
4
|
-
require 'mechanize'
|
5
|
-
require 'hpricot'
|
6
|
-
require 'pp'
|
7
|
-
require 'set'
|
8
|
-
|
9
|
-
module Scrubyt
|
10
|
-
##
|
11
|
-
#=<tt>Driving the whole extraction process</tt>
|
12
|
-
#Extractor is a performer class - it gets an extractor definition and carries
|
13
|
-
#out the actions and evaluates the wrappers sequentially.
|
14
|
-
#
|
15
|
-
#It also defines the actions as class methods - check out the section
|
16
|
-
#commented with ############# Actions.
|
17
|
-
class Extractor
|
18
|
-
|
19
|
-
#The definition of the extractor is passed through this method
|
20
|
-
def self.define(&extractor_definition)
|
21
|
-
@@current_doc_url = nil
|
22
|
-
@@current_form = nil
|
23
|
-
@@current_doc_protocol = nil
|
24
|
-
@@base_dir = nil
|
25
|
-
@@host_name = nil
|
26
|
-
@@agent = WWW::Mechanize.new
|
27
|
-
#Hack up an artificial root pattern (i.e. do not return the pattern which
|
28
|
-
#is the root one in the user's definition, but rather the real (invisible)
|
29
|
-
#root pattern
|
30
|
-
root_pattern = (class_eval(&extractor_definition)).parent
|
31
|
-
#A little hack here: upon wrapper construction we are counting the number
|
32
|
-
#of blocks, so we know the count of the 'end's/'}'s which end the extractor
|
33
|
-
#definition
|
34
|
-
#Recursively match data based on examples
|
35
|
-
root_pattern.setup_examples
|
36
|
-
#Once all is set up, evaluate the wrapper from the root pattern!
|
37
|
-
if root_pattern.next_page
|
38
|
-
current_page_count = 1
|
39
|
-
loop do
|
40
|
-
evaluate_wrapper(root_pattern)
|
41
|
-
break if (root_pattern.limit == current_page_count || root_pattern.crawl_to_new_page == nil)
|
42
|
-
current_page_count += 1 if root_pattern.limit != nil
|
43
|
-
end
|
44
|
-
else
|
45
|
-
evaluate_wrapper(root_pattern)
|
46
|
-
end
|
47
|
-
ensure_all_postconditions(root_pattern)
|
48
|
-
PostProcessor.remove_multiple_filter_duplicates(root_pattern)
|
49
|
-
PostProcessor.report_if_no_results(root_pattern)
|
50
|
-
#Return the root pattern
|
51
|
-
root_pattern
|
52
|
-
end
|
53
|
-
|
54
|
-
#build the current wrapper
|
55
|
-
def self.method_missing(method_name, *args, &block)
|
56
|
-
pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
|
57
|
-
if @parent == nil
|
58
|
-
if method_name.to_s == 'next_page'
|
59
|
-
@@root_pattern.next_page = args[0]
|
60
|
-
@@root_pattern.limit = args[1][:limit] if args.size > 1
|
61
|
-
return @@last_pattern
|
62
|
-
else
|
63
|
-
#Create a root pattern
|
64
|
-
root_pattern = Scrubyt::Pattern.new('root', :type => :root)
|
65
|
-
@@root_pattern = root_pattern
|
66
|
-
@@root_pattern.root_pattern = root_pattern
|
67
|
-
@@root_pattern.root_pattern.extractor = self
|
68
|
-
#add the currently active document to the root pattern
|
69
|
-
@@root_pattern.attach_current_document
|
70
|
-
@@root_pattern.add_child_pattern(pattern)
|
71
|
-
@@root_pattern.block_count = 0
|
72
|
-
@@root_pattern.extractor = self
|
73
|
-
end
|
74
|
-
else
|
75
|
-
@parent.add_child_pattern(pattern) if @parent != nil
|
76
|
-
end
|
77
|
-
if block_given?
|
78
|
-
@@root_pattern.block_count = @@root_pattern.block_count + 1
|
79
|
-
@stack ||=[]
|
80
|
-
@parent = pattern
|
81
|
-
@stack.push @parent
|
82
|
-
class_eval(&block)
|
83
|
-
@stack.pop
|
84
|
-
@parent = @stack.last
|
85
|
-
end
|
86
|
-
@@last_pattern = pattern
|
87
|
-
end
|
88
|
-
|
89
|
-
#Used in lord of the hacks vol 1. Check out export.rb if you are still interested
|
90
|
-
#(You should not be :)
|
91
|
-
def self.get_block_count
|
92
|
-
@@root_pattern.block_count
|
93
|
-
end
|
94
|
-
|
95
|
-
############# Actions
|
96
|
-
#
|
97
|
-
|
98
|
-
##
|
99
|
-
# At any given point, the current document can be queried with this method; Typically used
|
100
|
-
# when the navigation is over and the result document is passed to the wrapper
|
101
|
-
def self.get_current_doc_url
|
102
|
-
@@current_doc_url
|
103
|
-
end
|
104
|
-
|
105
|
-
def self.get_hpricot_doc
|
106
|
-
@@hpricot_doc
|
107
|
-
end
|
108
|
-
|
109
|
-
##
|
110
|
-
#Action to fetch a document (either a file or a http address)
|
111
|
-
#
|
112
|
-
#*parameters*
|
113
|
-
#
|
114
|
-
#_doc_url_ - the url or file name to fetch
|
115
|
-
def self.fetch(doc_url, mechanize_doc=nil)
|
116
|
-
if (mechanize_doc == nil)
|
117
|
-
@@current_doc_url = doc_url
|
118
|
-
@@current_doc_protocol = ((doc_url =~ /^http/ || doc_url =~ /^www/) ? :http : :file)
|
119
|
-
if @@base_dir == nil
|
120
|
-
@@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == :file
|
121
|
-
else
|
122
|
-
@@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
|
123
|
-
end
|
124
|
-
|
125
|
-
if @@host_name != nil
|
126
|
-
if doc_url !~ /#{@@host_name}/
|
127
|
-
@@current_doc_url = (@@host_name + doc_url)
|
128
|
-
#remove duplicate parts, like /blogs/en/blogs/en
|
129
|
-
@@current_doc_url = @@current_doc_url.split('/').uniq.reject{|x| x == ""}.join('/')
|
130
|
-
@@current_doc_url.sub!('http:/', 'http://')
|
131
|
-
end
|
132
|
-
end
|
133
|
-
puts "[ACTION] fetching document: #{@@current_doc_url}"
|
134
|
-
if @@current_doc_protocol == :http
|
135
|
-
|
136
|
-
@@mechanize_doc = @@agent.get(@@current_doc_url)
|
137
|
-
@@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0]
|
138
|
-
@@host_name = doc_url if @@host_name == nil
|
139
|
-
end
|
140
|
-
else
|
141
|
-
@@current_doc_url = doc_url
|
142
|
-
@@mechanize_doc = mechanize_doc
|
143
|
-
@@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0]
|
144
|
-
@@host_name = doc_url if @@host_name == nil
|
145
|
-
end
|
146
|
-
@@hpricot_doc = Hpricot(open(@@current_doc_url))
|
147
|
-
end
|
148
|
-
|
149
|
-
##
|
150
|
-
#Action to fill a textfield with a query string
|
151
|
-
#
|
152
|
-
##*parameters*
|
153
|
-
#
|
154
|
-
#_textfield_name_ - the name of the textfield (e.g. the name of the google search
|
155
|
-
#textfield is 'q'
|
156
|
-
#
|
157
|
-
#_query_string_ - the string that should be entered into the textfield
|
158
|
-
def self.fill_textfield(textfield_name, query_string)
|
159
|
-
puts "[ACTION] typing #{query_string} into the textfield named '#{textfield_name}'"
|
160
|
-
textfield = (@@hpricot_doc/"input[@name=#{textfield_name}]").map()[0]
|
161
|
-
form_tag = Scrubyt::XPathUtils.traverse_up_until_name(textfield, 'form')
|
162
|
-
#Refactor this code, it's a total mess
|
163
|
-
formname = form_tag.attributes['name']
|
164
|
-
if formname == nil
|
165
|
-
id_string = form_tag.attributes['id']
|
166
|
-
if id_string == nil
|
167
|
-
action_string = form_tag.attributes['action']
|
168
|
-
if action_string == nil
|
169
|
-
#If even this fails, do it with a button
|
170
|
-
else
|
171
|
-
puts "Finding from action"
|
172
|
-
puts action_string
|
173
|
-
find_form_with_attribute('action', action_string)
|
174
|
-
end
|
175
|
-
else
|
176
|
-
puts "Finding from id"
|
177
|
-
find_form_with_attribute('id', id_string)
|
178
|
-
end
|
179
|
-
else
|
180
|
-
puts "Finding from name"
|
181
|
-
@@current_form = @@mechanize_doc.forms.with.name(formname).first
|
182
|
-
end
|
183
|
-
|
184
|
-
eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
|
185
|
-
end
|
186
|
-
|
187
|
-
def self.find_form_with_attribute(attr, expected_value)
|
188
|
-
puts "attr: #{attr}"
|
189
|
-
i = 0
|
190
|
-
loop do
|
191
|
-
@@current_form = @@mechanize_doc.forms[i]
|
192
|
-
print "current a: "
|
193
|
-
puts @@current_form.form_node.attributes[attr]
|
194
|
-
return nil if @@current_form == nil
|
195
|
-
break if @@current_form.form_node.attributes[attr] == expected_value
|
196
|
-
i+= 1
|
197
|
-
end
|
198
|
-
end
|
199
|
-
|
200
|
-
#Submit the last form;
|
201
|
-
def self.submit
|
202
|
-
puts '[ACTION] submitting form...'
|
203
|
-
result_page = @@agent.submit(@@current_form)#, @@current_form.buttons.first)
|
204
|
-
@@current_doc_url = result_page.uri.to_s
|
205
|
-
puts "[ACTION] fetched #{@@current_doc_url}"
|
206
|
-
fetch(@@current_doc_url, result_page)
|
207
|
-
end
|
208
|
-
|
209
|
-
def self.click_link(link_text)
|
210
|
-
puts "[ACTION] clicking link: #{link_text}"
|
211
|
-
link = @@mechanize_doc.links.text(/^#{Regexp.escape(link_text)}$/)
|
212
|
-
result_page = @@agent.click(link)
|
213
|
-
@@current_doc_url = result_page.uri.to_s
|
214
|
-
fetch(@@current_doc_url, result_page)
|
215
|
-
end
|
216
|
-
|
217
|
-
#
|
218
|
-
#############
|
219
|
-
|
220
|
-
private
|
221
|
-
def self.ensure_all_postconditions(pattern)
|
222
|
-
ensure_postconditions(pattern)
|
223
|
-
pattern.children.each {|child| ensure_all_postconditions(child)}
|
224
|
-
end
|
225
|
-
|
226
|
-
def self.ensure_postconditions(pattern)
|
227
|
-
#holds the name of those child patterns which have to be present as children of the input parameter
|
228
|
-
epop_names = pattern.get_constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
|
229
|
-
return if epop_names.empty?
|
230
|
-
#all_parent_values holds instances extracted by pattern
|
231
|
-
all_parent_values = []
|
232
|
-
pattern.result.childmap.each { |h| all_parent_values << h.values }
|
233
|
-
all_parent_values.flatten!
|
234
|
-
#indices of result instances (of pattern) we are going to remove
|
235
|
-
results_to_remove = Set.new
|
236
|
-
pattern.children.each do |child_pattern|
|
237
|
-
#all_child_values holds instances extracted by child_pattern
|
238
|
-
all_child_values = []
|
239
|
-
child_pattern.result.childmap.each { |h| all_child_values << h.values }
|
240
|
-
all_child_values.flatten!
|
241
|
-
|
242
|
-
#populate results_to_remove
|
243
|
-
i = 0
|
244
|
-
all_parent_values.each do |parent_value|
|
245
|
-
#Hey! Not just the direct children but all the ancestors
|
246
|
-
@found_ancestor = false
|
247
|
-
check_ancestors(parent_value, all_child_values)
|
248
|
-
|
249
|
-
results_to_remove << i if (!@found_ancestor && (epop_names.include? child_pattern.name))
|
250
|
-
i += 1
|
251
|
-
end
|
252
|
-
end
|
253
|
-
#based on results_to_remove, populate the array 'rejected' which holds the actual instances
|
254
|
-
#(and not indices, as in the case of results_to_remove!). In other words, we are mapping
|
255
|
-
#results_to_remove indices to their actual instances
|
256
|
-
rejected = []
|
257
|
-
i = -1
|
258
|
-
pattern.result.childmap.each do |h|
|
259
|
-
h.each { |k,v| rejected = v.reject {|e| i += 1; !results_to_remove.include? i } }
|
260
|
-
end
|
261
|
-
|
262
|
-
#Correct the statistics
|
263
|
-
pattern.get_instance_count[pattern.name] -= rejected.size
|
264
|
-
|
265
|
-
#Finally, do the actual delete!
|
266
|
-
pattern.result.childmap.each { |h| h.each { |k,v| rejected.each { |r| v.delete(r)} } }
|
267
|
-
end
|
268
|
-
|
269
|
-
def self.check_ancestors(parent_value, all_child_values)
|
270
|
-
parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
|
271
|
-
parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
|
272
|
-
end
|
273
|
-
|
274
|
-
def self.evaluate_wrapper(pattern)
|
275
|
-
pattern.evaluate
|
276
|
-
pattern.children.each { |child| evaluate_wrapper child }
|
277
|
-
end #end of method evaluate_wrapper
|
278
|
-
end #end of class Extractor
|
279
|
-
end #end of module Scrubyt
|