scrubyt 0.2.3 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +30 -0
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +5 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +13 -2
- data/lib/scrubyt/core/navigation/navigation_actions.rb +4 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/filter.rb +35 -11
- data/lib/scrubyt/core/scraping/pattern.rb +29 -22
- data/lib/scrubyt/core/scraping/result_indexer.rb +2 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +44 -22
- data/lib/scrubyt/core/shared/extractor.rb +111 -15
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +67 -0
- data/lib/scrubyt/output/export.rb +69 -22
- data/lib/scrubyt/output/result.rb +1 -0
- data/lib/scrubyt/output/result_dumper.rb +26 -7
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/shared_utils.rb +45 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +23 -0
- data/lib/scrubyt/utils/xpathutils.rb +43 -92
- data/test/unittests/simple_example_lookup_test.rb +68 -0
- data/test/unittests/xpathutils_test.rb +0 -13
- metadata +9 -3
data/CHANGELOG
CHANGED
@@ -1,5 +1,35 @@
|
|
1
1
|
= scRUBYt! Changelog
|
2
2
|
|
3
|
+
== 0.2.5
|
4
|
+
=== 22nd March, 2007
|
5
|
+
|
6
|
+
The mission of this release was to add even more powerful features, like crawling to detail pages or compound example specification, as well as fixing the most frequently popping-up bugs. Scraping of concrete sites is more and more frequently the cause for new features and bugfixes, which in my opinion means that the framework is beginning to make sense: from a shiny toy which looks cool and everybody wants to play with, it is moving towards a tool which you reach for if you seriously want to scrape a site.
|
7
|
+
The new stuff in this release is 99% scraping related - if you are looking for new features in the navigation part, probably the next version will be for you, where I will concentrate more on adding new widgets and possibilities to the navigation process. FireWatir integration is very close, too - perhaps already the next release will contain FireWatir, or in the worst case the one after that.
|
8
|
+
|
9
|
+
=<tt>changes:</tt>
|
10
|
+
* [NEW] Automatically crawling to and extracting from detail pages
|
11
|
+
* [NEW] Compound example specification: So far the example of a pattern had to be a string.
|
12
|
+
Now it can be a hash as well, like {:contains => /\d\d-\d/, :begins_with => 'Telephone'}
|
13
|
+
* [NEW] More sophisticated example specification: Possible to use regexp as well, and need not
|
14
|
+
(but still possible of course) to specify the whole content of the node - nodes that
|
15
|
+
contain the string/match the regexp will be returned, too
|
16
|
+
* [NEW] Possibility to force writing text in case of non-leaf nodes
|
17
|
+
* [NEW] Crawling to the next page now possible via image links as well
|
18
|
+
* [NEW] Possibility to define examples for any pattern (before it did not make sense for ancestors)
|
19
|
+
* [NEW] Implementation of crawling to the next page with different methods
|
20
|
+
* [NEW] Heuristics: if something ends with _url, it is a shortcut for:
|
21
|
+
some_url 'href', :type => :attribute
|
22
|
+
* [FIX] Crawling to the next page (the broken google example): if the next
|
23
|
+
link text is not an <a>, traverse down until the <a> is found; if it is
|
24
|
+
still not found, traverse up until it is found
|
25
|
+
* [FIX] Crawling to next pages does not break if the next link is greyed out
|
26
|
+
(or otherwise present but has no href attribute) (Credit: sorry, I could not find it in the comments :(
|
27
|
+
* [FIX] DRY-ed next link lookup - it should be much more robust now as it uses the 'standard' example lookup
|
28
|
+
* [NEW] Correct exporting of detail page extractors
|
29
|
+
* [NEW] Added more powerful XPath regexp (Credit: Karol Hosiawa)
|
30
|
+
* [NEW] New examples for the new features
|
31
|
+
* [FIX] Tons of bugfixes, new blackbox and unit tests, refactoring and stabilization
|
32
|
+
|
3
33
|
== 0.2.3
|
4
34
|
=== 20th February, 2007
|
5
35
|
|
data/Rakefile
CHANGED
@@ -18,7 +18,7 @@ task "cleanup_readme" => ["rdoc"]
|
|
18
18
|
|
19
19
|
gem_spec = Gem::Specification.new do |s|
|
20
20
|
s.name = 'scrubyt'
|
21
|
-
s.version = '0.2.
|
21
|
+
s.version = '0.2.6'
|
22
22
|
s.summary = 'A powerful Web-scraping framework'
|
23
23
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
24
24
|
# Files containing Test::Unit test cases.
|
@@ -82,7 +82,7 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
|
|
82
82
|
pkg.need_tar = false
|
83
83
|
end
|
84
84
|
|
85
|
-
Rake::PackageTask.new('scrubyt-examples', '0.2.
|
85
|
+
Rake::PackageTask.new('scrubyt-examples', '0.2.6') do |pkg|
|
86
86
|
pkg.need_zip = true
|
87
87
|
pkg.need_tar = true
|
88
88
|
pkg.package_files.include("examples/**/*")
|
data/lib/scrubyt.rb
CHANGED
@@ -2,6 +2,7 @@ require 'scrubyt/core/scraping/constraint_adder.rb'
|
|
2
2
|
require 'scrubyt/core/scraping/constraint.rb'
|
3
3
|
require 'scrubyt/core/scraping/result_indexer.rb'
|
4
4
|
require 'scrubyt/core/scraping/pre_filter_document.rb'
|
5
|
+
require 'scrubyt/core/scraping/compound_example.rb'
|
5
6
|
require 'scrubyt/output/export.rb'
|
6
7
|
require 'scrubyt/core/shared/extractor.rb'
|
7
8
|
require 'scrubyt/core/scraping/filter.rb'
|
@@ -13,3 +14,7 @@ require 'scrubyt/output/post_processor.rb'
|
|
13
14
|
require 'scrubyt/core/navigation/navigation_actions.rb'
|
14
15
|
require 'scrubyt/core/navigation/fetch_action.rb'
|
15
16
|
require 'scrubyt/core/shared/evaluation_context.rb'
|
17
|
+
require 'scrubyt/core/shared/u_r_i_builder.rb'
|
18
|
+
require 'scrubyt/utils/shared_utils.rb'
|
19
|
+
require 'scrubyt/utils/simple_example_lookup.rb'
|
20
|
+
require 'scrubyt/utils/compound_example_lookup.rb'
|
@@ -85,7 +85,15 @@ module Scrubyt
|
|
85
85
|
|
86
86
|
def self.get_hpricot_doc
|
87
87
|
@@hpricot_doc
|
88
|
-
end
|
88
|
+
end
|
89
|
+
|
90
|
+
def self.get_host_name
|
91
|
+
@@host_name
|
92
|
+
end
|
93
|
+
|
94
|
+
def self.restore_host_name
|
95
|
+
@@host_name = @@original_host_name
|
96
|
+
end
|
89
97
|
private
|
90
98
|
def self.determine_protocol
|
91
99
|
old_protocol = @@current_doc_protocol
|
@@ -134,6 +142,8 @@ private
|
|
134
142
|
def self.handle_relative_url(doc_url)
|
135
143
|
return if doc_url =~ /^http/
|
136
144
|
if @@host_name != nil
|
145
|
+
#p doc_url
|
146
|
+
#p @@host_name
|
137
147
|
if doc_url !~ /#{@@host_name}/
|
138
148
|
@@current_doc_url = (@@host_name + doc_url)
|
139
149
|
#remove duplicate parts, like /blogs/en/blogs/en
|
@@ -146,7 +156,8 @@ private
|
|
146
156
|
def self.store_host_name(doc_url)
|
147
157
|
@@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
|
148
158
|
@@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
|
149
|
-
@@host_name = doc_url if @@host_name == nil
|
159
|
+
@@host_name = doc_url if @@host_name == nil
|
160
|
+
@@original_host_name ||= @@host_name
|
150
161
|
end #end of function store_host_name
|
151
162
|
end #end of class FetchAction
|
152
163
|
end #end of module Scrubyt
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Represents a compound example</tt>
|
4
|
+
#
|
5
|
+
#There are two types of string examples in scRUBYt! right now:
|
6
|
+
#the simple example and the compound example. The simple example
|
7
|
+
#is specified by a string, and a compound example is specified with
|
8
|
+
#:contains, :begins_with and :ends_with descriptors - which can be
|
9
|
+
#both regexps or strings
|
10
|
+
class CompoundExample
|
11
|
+
|
12
|
+
DESCRIPTORS = [:contains, :begins_with, :ends_with]
|
13
|
+
|
14
|
+
attr_accessor :descriptor_hash
|
15
|
+
|
16
|
+
def initialize(descriptor_hash)
|
17
|
+
@descriptor_hash = descriptor_hash
|
18
|
+
end
|
19
|
+
|
20
|
+
##
|
21
|
+
#Is the hash passed to this function a compound example descriptor hash?
|
22
|
+
#Need to decide this when parsing pattern parameters
|
23
|
+
def self.compound_example?(hash)
|
24
|
+
hash.each do |k,v|
|
25
|
+
return false if !DESCRIPTORS.include? k
|
26
|
+
end
|
27
|
+
true
|
28
|
+
end# end of method
|
29
|
+
end# #end of class CompoundExample
|
30
|
+
end# end of module Scrubyt
|
@@ -45,6 +45,8 @@ module Scrubyt
|
|
45
45
|
EXAMPLE_TYPE_CHILDREN = 3
|
46
46
|
#Regexp example, like /\d+@*\d+[a-z]/
|
47
47
|
EXAMPLE_TYPE_REGEXP = 4
|
48
|
+
#Compound example, like :contains => 'goodies'
|
49
|
+
EXAMPLE_TYPE_COMPOUND = 5
|
48
50
|
|
49
51
|
attr_accessor :example_type, :parent_pattern, :temp_sink,
|
50
52
|
:constraints, :xpath, :regexp, :example, :source, :sink
|
@@ -62,7 +64,7 @@ module Scrubyt
|
|
62
64
|
@example = example
|
63
65
|
@xpath = nil #The xpath to evaluate this filter
|
64
66
|
#temp sinks are used for the initial run when determining the XPaths for examples;
|
65
|
-
|
67
|
+
#@temp_sink = nil
|
66
68
|
@constraints = [] #list of constraints
|
67
69
|
end
|
68
70
|
|
@@ -75,38 +77,44 @@ module Scrubyt
|
|
75
77
|
#puts "Evaluating #{@parent_pattern.name} with #{@xpath}"
|
76
78
|
result.class == Hpricot::Elements ? result.map : [result]
|
77
79
|
when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
|
80
|
+
puts "Evaluating: #{@parent_pattern.name}"
|
78
81
|
attribute_value = [source.attributes[@example]]
|
79
82
|
return attribute_value if attribute_value[0]
|
80
83
|
@@attribute_in_parent = nil
|
81
84
|
Filter.traverse_up_until_attribute_found(source.parent, @example)
|
82
|
-
@@attribute_in_parent
|
83
|
-
|
85
|
+
@@attribute_in_parent
|
84
86
|
when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
|
85
87
|
source.inner_text.scan(@example).flatten
|
88
|
+
when Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
89
|
+
#p @parent_pattern.name
|
90
|
+
result = @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
|
91
|
+
XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
|
92
|
+
@parent_pattern)
|
86
93
|
end
|
87
94
|
end
|
88
95
|
|
89
96
|
#For all the tree patterns, generate an XPath based on the given example
|
90
97
|
#Also this method should not be called directly; It is automatically called for every tree
|
91
98
|
#pattern directly after wrapper definition
|
92
|
-
def generate_XPath_for_example
|
99
|
+
def generate_XPath_for_example(next_page_example=false)
|
100
|
+
#puts "generating example for: #{@parent_pattern.name}"
|
101
|
+
#puts @example_type
|
93
102
|
case @example_type
|
94
103
|
when EXAMPLE_TYPE_XPATH
|
95
104
|
@xpath = @example
|
96
105
|
when EXAMPLE_TYPE_STRING
|
97
|
-
@temp_sink =
|
98
|
-
|
99
|
-
|
106
|
+
@temp_sink = SimpleExampleLookup.find_node_from_text( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
|
107
|
+
@example,
|
108
|
+
next_page_example )
|
100
109
|
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
101
110
|
XPathUtils.generate_XPath(@temp_sink, nil, true)
|
102
|
-
when EXAMPLE_TYPE_CHILDREN
|
111
|
+
when EXAMPLE_TYPE_CHILDREN
|
103
112
|
current_example_index = 0
|
104
113
|
loop do
|
105
114
|
all_child_temp_sinks = []
|
106
115
|
@parent_pattern.children.each do |child_pattern|
|
107
116
|
all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink
|
108
117
|
end
|
109
|
-
|
110
118
|
result = all_child_temp_sinks.pop
|
111
119
|
if all_child_temp_sinks.empty?
|
112
120
|
result = result.parent
|
@@ -122,7 +130,8 @@ module Scrubyt
|
|
122
130
|
end
|
123
131
|
@parent_pattern.filters[current_example_index].xpath = xpath
|
124
132
|
@parent_pattern.filters[current_example_index].temp_sink = result
|
125
|
-
@parent_pattern.children.each do |child_pattern|
|
133
|
+
@parent_pattern.children.each do |child_pattern|
|
134
|
+
next if child_pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
126
135
|
child_pattern.filters[current_example_index].xpath =
|
127
136
|
child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) :
|
128
137
|
XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result)
|
@@ -137,8 +146,20 @@ module Scrubyt
|
|
137
146
|
when EXAMPLE_TYPE_IMAGE
|
138
147
|
@temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)
|
139
148
|
@xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
|
149
|
+
when EXAMPLE_TYPE_COMPOUND
|
150
|
+
@temp_sink = CompoundExampleLookup.find_node_from_compund_example( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
|
151
|
+
@example,
|
152
|
+
next_page_example )
|
153
|
+
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
154
|
+
XPathUtils.generate_XPath(@temp_sink, nil, true)
|
140
155
|
end
|
141
156
|
end
|
157
|
+
|
158
|
+
def setup_relative_XPaths
|
159
|
+
return if !@parent_pattern.parent.parent
|
160
|
+
parent_filter = @parent_pattern.parent.filters[@parent_pattern.filters.index(self)]
|
161
|
+
@xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_filter.xpath, @xpath) if (@xpath =~ /^\/html/)
|
162
|
+
end
|
142
163
|
|
143
164
|
#Dispatcher method to add constraints; of course, as with any method_missing, this method
|
144
165
|
#should not be called directly
|
@@ -160,13 +181,16 @@ private
|
|
160
181
|
def self.determine_example_type(example)
|
161
182
|
if example.instance_of? Regexp
|
162
183
|
EXAMPLE_TYPE_REGEXP
|
184
|
+
elsif example.instance_of? Hash
|
185
|
+
EXAMPLE_TYPE_COMPOUND
|
163
186
|
else
|
164
187
|
case example
|
165
188
|
when nil
|
166
189
|
EXAMPLE_TYPE_CHILDREN
|
167
190
|
when /\.(jpg|png|gif|jpeg)$/
|
168
191
|
EXAMPLE_TYPE_IMAGE
|
169
|
-
when
|
192
|
+
when
|
193
|
+
/^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*$/
|
170
194
|
(example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
|
171
195
|
else
|
172
196
|
EXAMPLE_TYPE_STRING
|
@@ -17,13 +17,15 @@ module Scrubyt
|
|
17
17
|
#Type of the pattern;
|
18
18
|
|
19
19
|
# a root pattern represents a (surprise!) root pattern
|
20
|
-
PATTERN_TYPE_ROOT =
|
20
|
+
PATTERN_TYPE_ROOT = 0x00
|
21
21
|
# a tree pattern represents a HTML region
|
22
|
-
PATTERN_TYPE_TREE =
|
22
|
+
PATTERN_TYPE_TREE = 0x01
|
23
23
|
# represents an attribute of the node extracted by the parent pattern
|
24
|
-
PATTERN_TYPE_ATTRIBUTE =
|
24
|
+
PATTERN_TYPE_ATTRIBUTE = 0x02
|
25
25
|
# represents a pattern which filters its output with a regexp
|
26
|
-
PATTERN_TYPE_REGEXP =
|
26
|
+
PATTERN_TYPE_REGEXP = 0x03
|
27
|
+
# represents a pattern which crawls to the detail page and extracts information from there
|
28
|
+
PATTERN_TYPE_DETAIL = 0x04
|
27
29
|
|
28
30
|
#The pattern can be either a model pattern (in this case it is
|
29
31
|
#written to the output) or a temp pattern (in this case it is skipped)
|
@@ -31,20 +33,21 @@ module Scrubyt
|
|
31
33
|
#is considered to be a model pattern
|
32
34
|
|
33
35
|
#Model pattern are shown in the output
|
34
|
-
OUTPUT_TYPE_MODEL =
|
36
|
+
OUTPUT_TYPE_MODEL = 0x10
|
35
37
|
#Temp patterns are skipped in the output (their ancestors are appended to the parent
|
36
38
|
#of the pattern which was skipped)
|
37
|
-
OUTPUT_TYPE_TEMP =
|
39
|
+
OUTPUT_TYPE_TEMP = 0x11
|
38
40
|
|
39
41
|
#These fields can be set upon wrapper creation - i.e. a field which is public but not contained here can be accessed
|
40
42
|
#from outside, but not set as a result of wrapper construction
|
41
|
-
SETTABLE_FIELDS = ['generalize', 'type', 'output_type', '
|
43
|
+
SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'write_text']
|
42
44
|
|
43
45
|
attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
|
44
|
-
:last_result, :result, :
|
45
|
-
:examples, :parent_of_leaf, :evaluation_context,
|
46
|
-
:indices_to_extract, :evaluation_context
|
47
|
-
|
46
|
+
:last_result, :result, :limit,
|
47
|
+
:examples, :parent_of_leaf, :evaluation_context, :type,
|
48
|
+
:indices_to_extract, :evaluation_context, :referenced_extractor,
|
49
|
+
:referenced_pattern, :write_text
|
50
|
+
attr_reader :generalize_set, :next_page_url, :result_indexer
|
48
51
|
|
49
52
|
def initialize (name, *args)
|
50
53
|
@name = name #name of the pattern
|
@@ -70,7 +73,7 @@ module Scrubyt
|
|
70
73
|
#Grab any examples that are defined!
|
71
74
|
look_for_examples(args)
|
72
75
|
args.each do |arg|
|
73
|
-
next if !arg.is_a? Hash
|
76
|
+
next if !arg.is_a? Hash
|
74
77
|
arg.each do |k,v|
|
75
78
|
#Set only the settable fields
|
76
79
|
if SETTABLE_FIELDS.include? k.to_s
|
@@ -107,16 +110,16 @@ module Scrubyt
|
|
107
110
|
# camera_data.item[1].item_name[0]
|
108
111
|
def method_missing(method_name, *args, &block)
|
109
112
|
case method_name.to_s
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
113
|
+
when 'select_indices'
|
114
|
+
@result_indexer = Scrubyt::ResultIndexer.new(*args)
|
115
|
+
self
|
116
|
+
when /^to_/
|
117
|
+
Scrubyt::ResultDumper.send(method_name.to_s, self)
|
118
|
+
when /^ensure_/
|
119
|
+
Scrubyt::ConstraintAdder.send(method_name, self, *args)
|
120
|
+
else
|
121
|
+
@children.each { |child| return child if child.name == method_name.to_s }
|
122
|
+
nil
|
120
123
|
end
|
121
124
|
end
|
122
125
|
|
@@ -226,7 +229,11 @@ private
|
|
226
229
|
end
|
227
230
|
end
|
228
231
|
@type = PATTERN_TYPE_REGEXP
|
232
|
+
elsif (args[0].is_a? Hash)
|
233
|
+
@examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
|
234
|
+
@examples = nil if @examples == []
|
229
235
|
end
|
236
|
+
|
230
237
|
end
|
231
238
|
|
232
239
|
def add_result(filter, source, results)
|
@@ -13,8 +13,8 @@ module Scrubyt
|
|
13
13
|
#two classes need to communicate frequently as well as share different information
|
14
14
|
#and this is accomplished through EvaluationContext.
|
15
15
|
class EvaluationContext
|
16
|
-
attr_accessor :root_pattern, :
|
17
|
-
:extractor, :
|
16
|
+
attr_accessor :root_pattern, :document_index, :block_count,
|
17
|
+
:extractor, :uri_builder
|
18
18
|
|
19
19
|
def initialize
|
20
20
|
@root_pattern = nil
|
@@ -26,9 +26,11 @@ module Scrubyt
|
|
26
26
|
|
27
27
|
##
|
28
28
|
#Crawl to a new page. This function should not be called from the outside - it is automatically called
|
29
|
-
#if the next_page is defined
|
30
|
-
def crawl_to_new_page
|
31
|
-
temp_document =
|
29
|
+
#if the next_page pattern is defined
|
30
|
+
def crawl_to_new_page(root_pattern, uri_builder)
|
31
|
+
temp_document = uri_builder.next_page_example ?
|
32
|
+
generate_next_page_link(uri_builder) :
|
33
|
+
uri_builder.generate_next_uri
|
32
34
|
return nil if temp_document == nil
|
33
35
|
clear_sources_and_sinks(@root_pattern)
|
34
36
|
@extractor.fetch(temp_document)
|
@@ -41,9 +43,9 @@ module Scrubyt
|
|
41
43
|
def attach_current_document
|
42
44
|
doc = @extractor.get_hpricot_doc
|
43
45
|
@root_pattern.filters[0].source << doc
|
44
|
-
@root_pattern.filters[0].sink << doc
|
46
|
+
@root_pattern.filters[0].sink << doc
|
45
47
|
@root_pattern.last_result ||= []
|
46
|
-
@root_pattern.last_result << doc
|
48
|
+
@root_pattern.last_result << doc
|
47
49
|
@root_pattern.result.add_result(@root_pattern.filters[0].source,
|
48
50
|
@root_pattern.filters[0].sink)
|
49
51
|
end
|
@@ -54,6 +56,7 @@ module Scrubyt
|
|
54
56
|
get_root_pattern(nil)
|
55
57
|
mark_leaf_parents(@root_pattern)
|
56
58
|
generate_examples(@root_pattern)
|
59
|
+
check_for_multipe_examples(@root_pattern)
|
57
60
|
end
|
58
61
|
|
59
62
|
##
|
@@ -67,24 +70,22 @@ module Scrubyt
|
|
67
70
|
pattern.children.each {|child| clear_sources_and_sinks child}
|
68
71
|
end
|
69
72
|
|
70
|
-
def generate_next_page_link(
|
71
|
-
|
72
|
-
|
73
|
+
def generate_next_page_link(uri_builder)
|
74
|
+
uri_builder.next_page_pattern.filters[0].generate_XPath_for_example(true)
|
75
|
+
xpath = uri_builder.next_page_pattern.filters[0].xpath
|
76
|
+
node = (@extractor.get_hpricot_doc/xpath).map.last
|
77
|
+
node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
|
78
|
+
return nil if node == nil || node.attributes['href'] == nil
|
73
79
|
node.attributes['href'].gsub('&') {'&'}
|
74
80
|
end
|
75
|
-
|
76
|
-
def
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
+
|
82
|
+
def setup_uri_builder(pattern,args)
|
83
|
+
if args[0] =~ /^http.+/
|
84
|
+
args.insert(0, @extractor.get_current_doc_url) if args[1] !~ /^http.+/
|
85
|
+
end
|
86
|
+
@uri_builder = URIBuilder.new(pattern,args)
|
81
87
|
end
|
82
88
|
|
83
|
-
def generate_examples(pattern)
|
84
|
-
pattern.children.each {|child_pattern| generate_examples(child_pattern) }
|
85
|
-
pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == Pattern::PATTERN_TYPE_TREE
|
86
|
-
end
|
87
|
-
|
88
89
|
def get_root_pattern(pattern)
|
89
90
|
if @root_pattern == nil
|
90
91
|
while (pattern.parent != nil)
|
@@ -92,6 +93,27 @@ module Scrubyt
|
|
92
93
|
end
|
93
94
|
@root_pattern = pattern
|
94
95
|
end
|
95
|
-
end
|
96
|
+
end
|
97
|
+
|
98
|
+
private
|
99
|
+
def mark_leaf_parents(pattern)
|
100
|
+
pattern.children.each { |child|
|
101
|
+
pattern.parent_of_leaf = true if child.children.size == 0
|
102
|
+
}
|
103
|
+
pattern.children.each { |child| mark_leaf_parents(child) }
|
104
|
+
end
|
105
|
+
|
106
|
+
##
|
107
|
+
#Check the tree and turn all the XPaths for the examples (but the topmost one)
|
108
|
+
#into relative ones
|
109
|
+
def check_for_multipe_examples(pattern)
|
110
|
+
pattern.children.each {|child_pattern| check_for_multipe_examples(child_pattern) }
|
111
|
+
pattern.filters.each { |filter| filter.setup_relative_XPaths } if pattern.type == Pattern::PATTERN_TYPE_TREE
|
112
|
+
end
|
113
|
+
|
114
|
+
def generate_examples(pattern)
|
115
|
+
pattern.children.each {|child_pattern| generate_examples(child_pattern) }
|
116
|
+
pattern.filters.each { |filter| filter.generate_XPath_for_example(false) } if pattern.type == Pattern::PATTERN_TYPE_TREE
|
117
|
+
end #end of function generate_examples
|
96
118
|
end #end of class EvaluationContext
|
97
119
|
end #end of module Scrubyt
|