scrubyt 0.2.3 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +30 -0
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +5 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +13 -2
- data/lib/scrubyt/core/navigation/navigation_actions.rb +4 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/filter.rb +35 -11
- data/lib/scrubyt/core/scraping/pattern.rb +29 -22
- data/lib/scrubyt/core/scraping/result_indexer.rb +2 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +44 -22
- data/lib/scrubyt/core/shared/extractor.rb +111 -15
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +67 -0
- data/lib/scrubyt/output/export.rb +69 -22
- data/lib/scrubyt/output/result.rb +1 -0
- data/lib/scrubyt/output/result_dumper.rb +26 -7
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/shared_utils.rb +45 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +23 -0
- data/lib/scrubyt/utils/xpathutils.rb +43 -92
- data/test/unittests/simple_example_lookup_test.rb +68 -0
- data/test/unittests/xpathutils_test.rb +0 -13
- metadata +9 -3
data/CHANGELOG
CHANGED
@@ -1,5 +1,35 @@
|
|
1
1
|
= scRUBYt! Changelog
|
2
2
|
|
3
|
+
== 0.2.5
|
4
|
+
=== 22nd March, 2007
|
5
|
+
|
6
|
+
The mission of this release was to add even more powerful features, like crawling to detail pages or compound example specification, as well as fixing the most frequently popping-up bugs. Scraping of concrete sites is more and more frequently the cause for new features and bugfixes, which in my opinion means that the framework is beginning to make sense: from a shiny toy which looks cool and everybody wants to play with, it is moving towards a tool which you reach for if you seriously want to scrape a site.
|
7
|
+
The new stuff in this release is 99% scraping related - if you are looking for new features in the navigation part, probably the next version will be for you, where I will concentrate more on adding new widgets and possibilities to the navigation process. FireWatir integration is very close, too - perhaps the next release will already contain FireWatir, or in the worst case the one after that.
|
8
|
+
|
9
|
+
=<tt>changes:</tt>
|
10
|
+
* [NEW] Automatically crawling to and extracting from detail pages
|
11
|
+
* [NEW] Compound example specification: So far the example of a pattern had to be a string.
|
12
|
+
Now it can be a hash as well, like {:contains => /\d\d-\d/, :begins_with => 'Telephone'}
|
13
|
+
* [NEW] More sophisticated example specification: Possible to use regexp as well, and need not
|
14
|
+
(but still possible of course) to specify the whole content of the node - nodes that
|
15
|
+
contain the string/match the regexp will be returned, too
|
16
|
+
* [NEW] Possibility to force writing text in case of non-leaf nodes
|
17
|
+
* [NEW] Crawling to the next page now possible via image links as well
|
18
|
+
* [NEW] Possibility to define examples for any pattern (before it did not make sense for ancestors)
|
19
|
+
* [NEW] Implementation of crawling to the next page with different methods
|
20
|
+
* [NEW] Heuristics: if something ends with _url, it is a shortcut for:
|
21
|
+
some_url 'href', :type => :attribute
|
22
|
+
* [FIX] Crawling to the next page (the broken google example): if the next
|
23
|
+
link text is not an <a>, traverse down until the <a> is found; if it is
|
24
|
+
still not found, traverse up until it is found
|
25
|
+
* [FIX] Crawling to next pages does not break if the next link is greyed out
|
26
|
+
(or otherwise present but has no href attribute) (Credit: sorry, I could not find it in the comments :(
|
27
|
+
* [FIX] DRY-ed next link lookup - it should be much more robust now as it uses the 'standard' example lookup
|
28
|
+
* [NEW] Correct exporting of detail page extractors
|
29
|
+
* [NEW] Added more powerful XPath regexp (Credit: Karol Hosiawa)
|
30
|
+
* [NEW] New examples for the new features
|
31
|
+
* [FIX] Tons of bugfixes, new blackbox and unit tests, refactoring and stabilization
|
32
|
+
|
3
33
|
== 0.2.3
|
4
34
|
=== 20th February, 2007
|
5
35
|
|
data/Rakefile
CHANGED
@@ -18,7 +18,7 @@ task "cleanup_readme" => ["rdoc"]
|
|
18
18
|
|
19
19
|
gem_spec = Gem::Specification.new do |s|
|
20
20
|
s.name = 'scrubyt'
|
21
|
-
s.version = '0.2.
|
21
|
+
s.version = '0.2.6'
|
22
22
|
s.summary = 'A powerful Web-scraping framework'
|
23
23
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
24
24
|
# Files containing Test::Unit test cases.
|
@@ -82,7 +82,7 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
|
|
82
82
|
pkg.need_tar = false
|
83
83
|
end
|
84
84
|
|
85
|
-
Rake::PackageTask.new('scrubyt-examples', '0.2.
|
85
|
+
Rake::PackageTask.new('scrubyt-examples', '0.2.6') do |pkg|
|
86
86
|
pkg.need_zip = true
|
87
87
|
pkg.need_tar = true
|
88
88
|
pkg.package_files.include("examples/**/*")
|
data/lib/scrubyt.rb
CHANGED
@@ -2,6 +2,7 @@ require 'scrubyt/core/scraping/constraint_adder.rb'
|
|
2
2
|
require 'scrubyt/core/scraping/constraint.rb'
|
3
3
|
require 'scrubyt/core/scraping/result_indexer.rb'
|
4
4
|
require 'scrubyt/core/scraping/pre_filter_document.rb'
|
5
|
+
require 'scrubyt/core/scraping/compound_example.rb'
|
5
6
|
require 'scrubyt/output/export.rb'
|
6
7
|
require 'scrubyt/core/shared/extractor.rb'
|
7
8
|
require 'scrubyt/core/scraping/filter.rb'
|
@@ -13,3 +14,7 @@ require 'scrubyt/output/post_processor.rb'
|
|
13
14
|
require 'scrubyt/core/navigation/navigation_actions.rb'
|
14
15
|
require 'scrubyt/core/navigation/fetch_action.rb'
|
15
16
|
require 'scrubyt/core/shared/evaluation_context.rb'
|
17
|
+
require 'scrubyt/core/shared/u_r_i_builder.rb'
|
18
|
+
require 'scrubyt/utils/shared_utils.rb'
|
19
|
+
require 'scrubyt/utils/simple_example_lookup.rb'
|
20
|
+
require 'scrubyt/utils/compound_example_lookup.rb'
|
@@ -85,7 +85,15 @@ module Scrubyt
|
|
85
85
|
|
86
86
|
def self.get_hpricot_doc
|
87
87
|
@@hpricot_doc
|
88
|
-
end
|
88
|
+
end
|
89
|
+
|
90
|
+
def self.get_host_name
|
91
|
+
@@host_name
|
92
|
+
end
|
93
|
+
|
94
|
+
def self.restore_host_name
|
95
|
+
@@host_name = @@original_host_name
|
96
|
+
end
|
89
97
|
private
|
90
98
|
def self.determine_protocol
|
91
99
|
old_protocol = @@current_doc_protocol
|
@@ -134,6 +142,8 @@ private
|
|
134
142
|
def self.handle_relative_url(doc_url)
|
135
143
|
return if doc_url =~ /^http/
|
136
144
|
if @@host_name != nil
|
145
|
+
#p doc_url
|
146
|
+
#p @@host_name
|
137
147
|
if doc_url !~ /#{@@host_name}/
|
138
148
|
@@current_doc_url = (@@host_name + doc_url)
|
139
149
|
#remove duplicate parts, like /blogs/en/blogs/en
|
@@ -146,7 +156,8 @@ private
|
|
146
156
|
def self.store_host_name(doc_url)
|
147
157
|
@@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
|
148
158
|
@@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
|
149
|
-
@@host_name = doc_url if @@host_name == nil
|
159
|
+
@@host_name = doc_url if @@host_name == nil
|
160
|
+
@@original_host_name ||= @@host_name
|
150
161
|
end #end of function store_host_name
|
151
162
|
end #end of class FetchAction
|
152
163
|
end #end of module Scrubyt
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Represents a compound example</tt>
|
4
|
+
#
|
5
|
+
#There are two types of string examples in scRUBYt! right now:
|
6
|
+
#the simple example and the compound example. The simple example
|
7
|
+
#is specified by a string, and a compound example is specified with
|
8
|
+
#:contains, :begins_with and :ends_with descriptors - which can be
|
9
|
+
#both regexps or strings
|
10
|
+
class CompoundExample
|
11
|
+
|
12
|
+
DESCRIPTORS = [:contains, :begins_with, :ends_with]
|
13
|
+
|
14
|
+
attr_accessor :descriptor_hash
|
15
|
+
|
16
|
+
def initialize(descriptor_hash)
|
17
|
+
@descriptor_hash = descriptor_hash
|
18
|
+
end
|
19
|
+
|
20
|
+
##
|
21
|
+
#Is the hash passed to this function a compound example descriptor hash?
|
22
|
+
#Need to decide this when parsing pattern parameters
|
23
|
+
def self.compound_example?(hash)
|
24
|
+
hash.each do |k,v|
|
25
|
+
return false if !DESCRIPTORS.include? k
|
26
|
+
end
|
27
|
+
true
|
28
|
+
end# end of method
|
29
|
+
end# #end of class CompoundExample
|
30
|
+
end# end of module Scrubyt
|
@@ -45,6 +45,8 @@ module Scrubyt
|
|
45
45
|
EXAMPLE_TYPE_CHILDREN = 3
|
46
46
|
#Regexp example, like /\d+@*\d+[a-z]/
|
47
47
|
EXAMPLE_TYPE_REGEXP = 4
|
48
|
+
#Compound example, like :contains => 'goodies'
|
49
|
+
EXAMPLE_TYPE_COMPOUND = 5
|
48
50
|
|
49
51
|
attr_accessor :example_type, :parent_pattern, :temp_sink,
|
50
52
|
:constraints, :xpath, :regexp, :example, :source, :sink
|
@@ -62,7 +64,7 @@ module Scrubyt
|
|
62
64
|
@example = example
|
63
65
|
@xpath = nil #The xpath to evaluate this filter
|
64
66
|
#temp sinks are used for the initial run when determining the XPaths for examples;
|
65
|
-
|
67
|
+
#@temp_sink = nil
|
66
68
|
@constraints = [] #list of constraints
|
67
69
|
end
|
68
70
|
|
@@ -75,38 +77,44 @@ module Scrubyt
|
|
75
77
|
#puts "Evaluating #{@parent_pattern.name} with #{@xpath}"
|
76
78
|
result.class == Hpricot::Elements ? result.map : [result]
|
77
79
|
when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
|
80
|
+
puts "Evaluating: #{@parent_pattern.name}"
|
78
81
|
attribute_value = [source.attributes[@example]]
|
79
82
|
return attribute_value if attribute_value[0]
|
80
83
|
@@attribute_in_parent = nil
|
81
84
|
Filter.traverse_up_until_attribute_found(source.parent, @example)
|
82
|
-
@@attribute_in_parent
|
83
|
-
|
85
|
+
@@attribute_in_parent
|
84
86
|
when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
|
85
87
|
source.inner_text.scan(@example).flatten
|
88
|
+
when Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
89
|
+
#p @parent_pattern.name
|
90
|
+
result = @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
|
91
|
+
XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
|
92
|
+
@parent_pattern)
|
86
93
|
end
|
87
94
|
end
|
88
95
|
|
89
96
|
#For all the tree patterns, generate an XPath based on the given example
|
90
97
|
#Also this method should not be called directly; It is automatically called for every tree
|
91
98
|
#pattern directly after wrapper definition
|
92
|
-
def generate_XPath_for_example
|
99
|
+
def generate_XPath_for_example(next_page_example=false)
|
100
|
+
#puts "generating example for: #{@parent_pattern.name}"
|
101
|
+
#puts @example_type
|
93
102
|
case @example_type
|
94
103
|
when EXAMPLE_TYPE_XPATH
|
95
104
|
@xpath = @example
|
96
105
|
when EXAMPLE_TYPE_STRING
|
97
|
-
@temp_sink =
|
98
|
-
|
99
|
-
|
106
|
+
@temp_sink = SimpleExampleLookup.find_node_from_text( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
|
107
|
+
@example,
|
108
|
+
next_page_example )
|
100
109
|
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
101
110
|
XPathUtils.generate_XPath(@temp_sink, nil, true)
|
102
|
-
when EXAMPLE_TYPE_CHILDREN
|
111
|
+
when EXAMPLE_TYPE_CHILDREN
|
103
112
|
current_example_index = 0
|
104
113
|
loop do
|
105
114
|
all_child_temp_sinks = []
|
106
115
|
@parent_pattern.children.each do |child_pattern|
|
107
116
|
all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink
|
108
117
|
end
|
109
|
-
|
110
118
|
result = all_child_temp_sinks.pop
|
111
119
|
if all_child_temp_sinks.empty?
|
112
120
|
result = result.parent
|
@@ -122,7 +130,8 @@ module Scrubyt
|
|
122
130
|
end
|
123
131
|
@parent_pattern.filters[current_example_index].xpath = xpath
|
124
132
|
@parent_pattern.filters[current_example_index].temp_sink = result
|
125
|
-
@parent_pattern.children.each do |child_pattern|
|
133
|
+
@parent_pattern.children.each do |child_pattern|
|
134
|
+
next if child_pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
126
135
|
child_pattern.filters[current_example_index].xpath =
|
127
136
|
child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) :
|
128
137
|
XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result)
|
@@ -137,8 +146,20 @@ module Scrubyt
|
|
137
146
|
when EXAMPLE_TYPE_IMAGE
|
138
147
|
@temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)
|
139
148
|
@xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
|
149
|
+
when EXAMPLE_TYPE_COMPOUND
|
150
|
+
@temp_sink = CompoundExampleLookup.find_node_from_compund_example( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
|
151
|
+
@example,
|
152
|
+
next_page_example )
|
153
|
+
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
154
|
+
XPathUtils.generate_XPath(@temp_sink, nil, true)
|
140
155
|
end
|
141
156
|
end
|
157
|
+
|
158
|
+
def setup_relative_XPaths
|
159
|
+
return if !@parent_pattern.parent.parent
|
160
|
+
parent_filter = @parent_pattern.parent.filters[@parent_pattern.filters.index(self)]
|
161
|
+
@xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_filter.xpath, @xpath) if (@xpath =~ /^\/html/)
|
162
|
+
end
|
142
163
|
|
143
164
|
#Dispatcher method to add constraints; of course, as with any method_missing, this method
|
144
165
|
#should not be called directly
|
@@ -160,13 +181,16 @@ private
|
|
160
181
|
def self.determine_example_type(example)
|
161
182
|
if example.instance_of? Regexp
|
162
183
|
EXAMPLE_TYPE_REGEXP
|
184
|
+
elsif example.instance_of? Hash
|
185
|
+
EXAMPLE_TYPE_COMPOUND
|
163
186
|
else
|
164
187
|
case example
|
165
188
|
when nil
|
166
189
|
EXAMPLE_TYPE_CHILDREN
|
167
190
|
when /\.(jpg|png|gif|jpeg)$/
|
168
191
|
EXAMPLE_TYPE_IMAGE
|
169
|
-
when
|
192
|
+
when
|
193
|
+
/^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*$/
|
170
194
|
(example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
|
171
195
|
else
|
172
196
|
EXAMPLE_TYPE_STRING
|
@@ -17,13 +17,15 @@ module Scrubyt
|
|
17
17
|
#Type of the pattern;
|
18
18
|
|
19
19
|
# a root pattern represents a (surprise!) root pattern
|
20
|
-
PATTERN_TYPE_ROOT =
|
20
|
+
PATTERN_TYPE_ROOT = 0x00
|
21
21
|
# a tree pattern represents a HTML region
|
22
|
-
PATTERN_TYPE_TREE =
|
22
|
+
PATTERN_TYPE_TREE = 0x01
|
23
23
|
# represents an attribute of the node extracted by the parent pattern
|
24
|
-
PATTERN_TYPE_ATTRIBUTE =
|
24
|
+
PATTERN_TYPE_ATTRIBUTE = 0x02
|
25
25
|
# represents a pattern which filters its output with a regexp
|
26
|
-
PATTERN_TYPE_REGEXP =
|
26
|
+
PATTERN_TYPE_REGEXP = 0x03
|
27
|
+
# represents a pattern which crawls to the detail page and extracts information from there
|
28
|
+
PATTERN_TYPE_DETAIL = 0x04
|
27
29
|
|
28
30
|
#The pattern can be either a model pattern (in this case it is
|
29
31
|
#written to the output) or a temp pattern (in this case it is skipped)
|
@@ -31,20 +33,21 @@ module Scrubyt
|
|
31
33
|
#is considered to be a model pattern
|
32
34
|
|
33
35
|
#Model pattern are shown in the output
|
34
|
-
OUTPUT_TYPE_MODEL =
|
36
|
+
OUTPUT_TYPE_MODEL = 0x10
|
35
37
|
#Temp patterns are skipped in the output (their ancestors are appended to the parent
|
36
38
|
#of the pattern which was skipped)
|
37
|
-
OUTPUT_TYPE_TEMP =
|
39
|
+
OUTPUT_TYPE_TEMP = 0x11
|
38
40
|
|
39
41
|
#These fields can be set upon wrapper creation - i.e. a field which is public but not contained here can be accessed
|
40
42
|
#from outside, but not set as a result of wrapper construction
|
41
|
-
SETTABLE_FIELDS = ['generalize', 'type', 'output_type', '
|
43
|
+
SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'write_text']
|
42
44
|
|
43
45
|
attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
|
44
|
-
:last_result, :result, :
|
45
|
-
:examples, :parent_of_leaf, :evaluation_context,
|
46
|
-
:indices_to_extract, :evaluation_context
|
47
|
-
|
46
|
+
:last_result, :result, :limit,
|
47
|
+
:examples, :parent_of_leaf, :evaluation_context, :type,
|
48
|
+
:indices_to_extract, :evaluation_context, :referenced_extractor,
|
49
|
+
:referenced_pattern, :write_text
|
50
|
+
attr_reader :generalize_set, :next_page_url, :result_indexer
|
48
51
|
|
49
52
|
def initialize (name, *args)
|
50
53
|
@name = name #name of the pattern
|
@@ -70,7 +73,7 @@ module Scrubyt
|
|
70
73
|
#Grab any examples that are defined!
|
71
74
|
look_for_examples(args)
|
72
75
|
args.each do |arg|
|
73
|
-
next if !arg.is_a? Hash
|
76
|
+
next if !arg.is_a? Hash
|
74
77
|
arg.each do |k,v|
|
75
78
|
#Set only the settable fields
|
76
79
|
if SETTABLE_FIELDS.include? k.to_s
|
@@ -107,16 +110,16 @@ module Scrubyt
|
|
107
110
|
# camera_data.item[1].item_name[0]
|
108
111
|
def method_missing(method_name, *args, &block)
|
109
112
|
case method_name.to_s
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
113
|
+
when 'select_indices'
|
114
|
+
@result_indexer = Scrubyt::ResultIndexer.new(*args)
|
115
|
+
self
|
116
|
+
when /^to_/
|
117
|
+
Scrubyt::ResultDumper.send(method_name.to_s, self)
|
118
|
+
when /^ensure_/
|
119
|
+
Scrubyt::ConstraintAdder.send(method_name, self, *args)
|
120
|
+
else
|
121
|
+
@children.each { |child| return child if child.name == method_name.to_s }
|
122
|
+
nil
|
120
123
|
end
|
121
124
|
end
|
122
125
|
|
@@ -226,7 +229,11 @@ private
|
|
226
229
|
end
|
227
230
|
end
|
228
231
|
@type = PATTERN_TYPE_REGEXP
|
232
|
+
elsif (args[0].is_a? Hash)
|
233
|
+
@examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
|
234
|
+
@examples = nil if @examples == []
|
229
235
|
end
|
236
|
+
|
230
237
|
end
|
231
238
|
|
232
239
|
def add_result(filter, source, results)
|
@@ -13,8 +13,8 @@ module Scrubyt
|
|
13
13
|
#two classes need to communicate frequently as well as share different information
|
14
14
|
#and this is accomplished through EvaluationContext.
|
15
15
|
class EvaluationContext
|
16
|
-
attr_accessor :root_pattern, :
|
17
|
-
:extractor, :
|
16
|
+
attr_accessor :root_pattern, :document_index, :block_count,
|
17
|
+
:extractor, :uri_builder
|
18
18
|
|
19
19
|
def initialize
|
20
20
|
@root_pattern = nil
|
@@ -26,9 +26,11 @@ module Scrubyt
|
|
26
26
|
|
27
27
|
##
|
28
28
|
#Crawl to a new page. This function should not be called from the outside - it is automatically called
|
29
|
-
#if the next_page is defined
|
30
|
-
def crawl_to_new_page
|
31
|
-
temp_document =
|
29
|
+
#if the next_page pattern is defined
|
30
|
+
def crawl_to_new_page(root_pattern, uri_builder)
|
31
|
+
temp_document = uri_builder.next_page_example ?
|
32
|
+
generate_next_page_link(uri_builder) :
|
33
|
+
uri_builder.generate_next_uri
|
32
34
|
return nil if temp_document == nil
|
33
35
|
clear_sources_and_sinks(@root_pattern)
|
34
36
|
@extractor.fetch(temp_document)
|
@@ -41,9 +43,9 @@ module Scrubyt
|
|
41
43
|
def attach_current_document
|
42
44
|
doc = @extractor.get_hpricot_doc
|
43
45
|
@root_pattern.filters[0].source << doc
|
44
|
-
@root_pattern.filters[0].sink << doc
|
46
|
+
@root_pattern.filters[0].sink << doc
|
45
47
|
@root_pattern.last_result ||= []
|
46
|
-
@root_pattern.last_result << doc
|
48
|
+
@root_pattern.last_result << doc
|
47
49
|
@root_pattern.result.add_result(@root_pattern.filters[0].source,
|
48
50
|
@root_pattern.filters[0].sink)
|
49
51
|
end
|
@@ -54,6 +56,7 @@ module Scrubyt
|
|
54
56
|
get_root_pattern(nil)
|
55
57
|
mark_leaf_parents(@root_pattern)
|
56
58
|
generate_examples(@root_pattern)
|
59
|
+
check_for_multipe_examples(@root_pattern)
|
57
60
|
end
|
58
61
|
|
59
62
|
##
|
@@ -67,24 +70,22 @@ module Scrubyt
|
|
67
70
|
pattern.children.each {|child| clear_sources_and_sinks child}
|
68
71
|
end
|
69
72
|
|
70
|
-
def generate_next_page_link(
|
71
|
-
|
72
|
-
|
73
|
+
def generate_next_page_link(uri_builder)
|
74
|
+
uri_builder.next_page_pattern.filters[0].generate_XPath_for_example(true)
|
75
|
+
xpath = uri_builder.next_page_pattern.filters[0].xpath
|
76
|
+
node = (@extractor.get_hpricot_doc/xpath).map.last
|
77
|
+
node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
|
78
|
+
return nil if node == nil || node.attributes['href'] == nil
|
73
79
|
node.attributes['href'].gsub('&') {'&'}
|
74
80
|
end
|
75
|
-
|
76
|
-
def
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
+
|
82
|
+
def setup_uri_builder(pattern,args)
|
83
|
+
if args[0] =~ /^http.+/
|
84
|
+
args.insert(0, @extractor.get_current_doc_url) if args[1] !~ /^http.+/
|
85
|
+
end
|
86
|
+
@uri_builder = URIBuilder.new(pattern,args)
|
81
87
|
end
|
82
88
|
|
83
|
-
def generate_examples(pattern)
|
84
|
-
pattern.children.each {|child_pattern| generate_examples(child_pattern) }
|
85
|
-
pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == Pattern::PATTERN_TYPE_TREE
|
86
|
-
end
|
87
|
-
|
88
89
|
def get_root_pattern(pattern)
|
89
90
|
if @root_pattern == nil
|
90
91
|
while (pattern.parent != nil)
|
@@ -92,6 +93,27 @@ module Scrubyt
|
|
92
93
|
end
|
93
94
|
@root_pattern = pattern
|
94
95
|
end
|
95
|
-
end
|
96
|
+
end
|
97
|
+
|
98
|
+
private
|
99
|
+
def mark_leaf_parents(pattern)
|
100
|
+
pattern.children.each { |child|
|
101
|
+
pattern.parent_of_leaf = true if child.children.size == 0
|
102
|
+
}
|
103
|
+
pattern.children.each { |child| mark_leaf_parents(child) }
|
104
|
+
end
|
105
|
+
|
106
|
+
##
|
107
|
+
#Check the tree and turn all the XPaths for the examples (but the topmost one)
|
108
|
+
#into relative ones
|
109
|
+
def check_for_multipe_examples(pattern)
|
110
|
+
pattern.children.each {|child_pattern| check_for_multipe_examples(child_pattern) }
|
111
|
+
pattern.filters.each { |filter| filter.setup_relative_XPaths } if pattern.type == Pattern::PATTERN_TYPE_TREE
|
112
|
+
end
|
113
|
+
|
114
|
+
def generate_examples(pattern)
|
115
|
+
pattern.children.each {|child_pattern| generate_examples(child_pattern) }
|
116
|
+
pattern.filters.each { |filter| filter.generate_XPath_for_example(false) } if pattern.type == Pattern::PATTERN_TYPE_TREE
|
117
|
+
end #end of function generate_examples
|
96
118
|
end #end of class EvaluationContext
|
97
119
|
end #end of module Scrubyt
|