scrubyt 0.3.0 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -6
- data/Rakefile +22 -10
- data/lib/scrubyt.rb +9 -4
- data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
- data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/pattern.rb +25 -18
- data/lib/scrubyt/core/shared/extractor.rb +109 -128
- data/lib/scrubyt/logging.rb +146 -8
- data/lib/scrubyt/output/export.rb +60 -44
- data/lib/scrubyt/output/result_node.rb +34 -3
- data/lib/scrubyt/output/scrubyt_result.rb +18 -9
- data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
- metadata +52 -6
- data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67
@@ -14,17 +14,25 @@ module Scrubyt
|
|
14
14
|
|
15
15
|
private
|
16
16
|
# Downloads the file referenced by +source+ into the directory named by
# @example and returns the base name of the file that was written.
#
# source - String URL (or path) of the file to download.
#
# Returns:
#   ''                  when +source+ is shorter than four characters,
#   nil                 when the file name is listed in the pattern's :except option,
#   "[FAILED]<name>"    when the request timed out,
#   the stored file's base name otherwise.
def download_file(source)
  return '' if source.size < 4

  host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
  host_name += "/" if host_name[-1..-1] != "/"
  base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
  file_name = source.scan(/.+\/(.*)/)[0][0]

  # The :except option is optional; guard against it being unset instead of
  # calling include? on nil.
  excluded = @parent_pattern.except
  return nil if excluded && excluded.include?(file_name)

  outfile = nil
  begin
    # The rescue wraps Net::HTTP.start as well, so timeouts raised while the
    # connection is being opened are handled like read timeouts.
    Net::HTTP.start(base_url) do |http|
      Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
      ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
      # Take the captured path as a String; String#scan would hand back a
      # nested Array, which is not a valid request path.
      # NOTE(review): when host_name carries no path component this is nil and
      # http.get raises ArgumentError - confirm what get_host_name returns.
      path = host_name[/http:\/\/#{Regexp.escape(base_url)}(.+)\//, 1]
      resp = http.get(path, {'User-Agent' => ua})
      outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
      FileUtils.mkdir_p @example
      File.open(outfile, 'wb') { |f| f.write(resp.body) }
    end
  rescue Timeout::Error
    outfile = "[FAILED]#{file_name}"
  end

  # File.basename also copes with the "[FAILED]..." marker, which contains no
  # directory separator.
  File.basename(outfile)
end
|
@@ -34,11 +42,21 @@ private
|
|
34
42
|
loop do
|
35
43
|
if File.exists? file_name
|
36
44
|
if already_found
|
37
|
-
|
38
|
-
|
45
|
+
if file_name.include?('.')
|
46
|
+
last_no = file_name.scan(/_(\d+)\./)[0][0]
|
47
|
+
file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
|
48
|
+
else
|
49
|
+
last_no = file_name.scan(/_(\d+)$/)[0][0]
|
50
|
+
file_name.sub!(/_#{last_no}$/) {"_#{(last_no.to_i+1).to_s}"}
|
51
|
+
end
|
39
52
|
else
|
40
|
-
file_name.
|
41
|
-
|
53
|
+
if file_name.include?('.')
|
54
|
+
file_name.sub!(/\./) {"_1\."}
|
55
|
+
already_found = true
|
56
|
+
else
|
57
|
+
file_name << '_1'
|
58
|
+
already_found = true
|
59
|
+
end
|
42
60
|
end
|
43
61
|
else
|
44
62
|
break
|
@@ -1,17 +1,17 @@
|
|
1
1
|
module Scrubyt
  # Filter whose example (@example) is a regular expression. The expression is
  # scanned over the source text and every match/capture is returned.
  class RegexpFilter < BaseFilter

    # source - either a String, or an object responding to inner_html.
    #
    # Returns a flat Array with the results of scanning @example over the
    # text. For non-String sources the markup tags are stripped from
    # inner_html before scanning.
    def evaluate(source)
      text = source.is_a?(String) ? source : source.inner_html.gsub(/<.*?>/, '')
      text.scan(@example).flatten
    end

    # Returns the s-expression form of this filter: the example as a literal.
    def to_sexp
      [:lit, @example]
    end

  end #End of class RegexpFilter
end #End of module Scrubyt
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Scrubyt
  # Filter whose example (@example) is a callable; the result of the filter is
  # whatever the callable returns for the source text.
  class ScriptFilter < BaseFilter

    # source - either a String, or an object responding to inner_html.
    #
    # Returns the value of calling @example with the source text. For
    # non-String sources the markup tags are stripped from inner_html first.
    def evaluate(source)
      param = source.is_a?(String) ? source : source.inner_html.gsub(/<.*?>/, "")
      @example.call param
    end

    # Returns a placeholder s-expression: the callable itself is not dumped.
    def to_sexp
      [:str, "FIXME!!! Can't dump Proc"]
    end #end of method to_sexp
  end #End of class ScriptFilter
end #End of module Scrubyt
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Scrubyt
  # Filter that locates elements through the text they contain.
  #
  # Two example forms are handled:
  #   "name[text]", "name[text]:N", "name[text]:all"
  #       look up nodes matching +text+ and walk up from them to the
  #       enclosing element called +name+;
  #   "find(a|b|c)"
  #       return the first of the alternatives that occurs in the source.
  class TextFilter < BaseFilter

    # source - the document (sub)tree handed to SharedUtils.traverse_for_match.
    #
    # Returns an Array of elements (one entry for an indexed lookup, one per
    # match for ":all"), or "" when nothing matched. For "find(...)" examples
    # the result of find_string is returned.
    def evaluate(source)
      return find_string(source) if @example =~ /^find\(/

      final_element_name = @example.scan(/^(.+?)\[/)[0][0]
      text = Regexp.escape(@example.scan(/\[(.+?)\]/)[0][0])

      # Optional suffix after the closing bracket: "all", a number, or absent.
      selector = @example[/\]:(.+)/, 1]
      matches = SharedUtils.traverse_for_match(source, /#{text}/)

      if selector == "all"
        return "" unless matches
        matches.map { |match| XPathUtils.traverse_up_until_name(match, final_element_name) }
      else
        # A missing or non-numeric selector converts to 0, i.e. the first
        # match. The Integer is computed directly rather than tested with
        # is_a?(Fixnum): Fixnum no longer exists on current Ruby versions.
        match = matches[selector.to_i]
        return "" unless match
        [XPathUtils.traverse_up_until_name(match, final_element_name)]
      end
    end

    # Handles "find(a|b|c)" examples.
    #
    # Returns a one-element Array holding the first alternative found in
    # +source+ (case-insensitively), or [] when none of them occurs.
    def find_string(source)
      str = @example.scan(/find\((.+)\)/).flatten[0]
      # The argument parentheses must touch the method name: with a space,
      # Ruby passes the whole conditional expression to include? as its
      # argument, which raises a TypeError.
      strings_to_find = str.include?('|') ? str.split('|') : [str]
      strings_to_find.each do |s|
        # Each alternative is interpolated as a regular expression, unescaped.
        result = SharedUtils.traverse_for_match(source, /#{s}/i)
        return [s] unless result.empty?
      end
      return []
    end

    # Returns the s-expression form of this filter: the example as a string.
    def to_sexp
      [:str, @example]
    end #end of method to_sexp
  end #End of class TextFilter
end #End of module Scrubyt
|
38
|
+
|
@@ -38,7 +38,7 @@ module Scrubyt
|
|
38
38
|
return if @temp_sink.is_a? String
|
39
39
|
return if @example =~ /.+\[.+\]$/
|
40
40
|
|
41
|
-
text = SharedUtils.prepare_text_for_comparison(@temp_sink.
|
41
|
+
text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_html.gsub(/<.*?>/, ''))
|
42
42
|
match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
|
43
43
|
return if match_range == (0..text.length)
|
44
44
|
|
@@ -64,7 +64,7 @@ module Scrubyt
|
|
64
64
|
when EXAMPLE_TYPE_XPATH
|
65
65
|
@xpath = @example
|
66
66
|
when EXAMPLE_TYPE_STRING
|
67
|
-
@temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.
|
67
|
+
@temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
|
68
68
|
@example,
|
69
69
|
next_page_example)
|
70
70
|
return if @temp_sink == nil
|
@@ -116,10 +116,10 @@ module Scrubyt
|
|
116
116
|
current_example_index += 1
|
117
117
|
end
|
118
118
|
when EXAMPLE_TYPE_IMAGE
|
119
|
-
@temp_sink = XPathUtils.find_image(@parent_pattern.
|
119
|
+
@temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
|
120
120
|
@xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
|
121
121
|
when EXAMPLE_TYPE_COMPOUND
|
122
|
-
@temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.
|
122
|
+
@temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
|
123
123
|
@example,
|
124
124
|
next_page_example)
|
125
125
|
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
@@ -128,7 +128,7 @@ module Scrubyt
|
|
128
128
|
end
|
129
129
|
|
130
130
|
def generate_relative_XPath(parent_xpath)
|
131
|
-
parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.
|
131
|
+
parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
|
132
132
|
parent_xpath,
|
133
133
|
@parent_pattern.parent.generalize) if parent_xpath =~ /(\[@.+=.+\])$/
|
134
134
|
@xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
|
@@ -33,7 +33,7 @@ module Scrubyt
|
|
33
33
|
# # write out the HTML subtree beginning at the matched element
|
34
34
|
# PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
|
35
35
|
|
36
|
-
VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree]
|
36
|
+
VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
|
37
37
|
|
38
38
|
#The pattern can be either a model pattern (in this case it is
|
39
39
|
#written to the output) or a temp pattern (in this case it is skipped)
|
@@ -46,27 +46,25 @@ module Scrubyt
|
|
46
46
|
# #of the pattern which was skipped
|
47
47
|
# OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
|
48
48
|
|
49
|
-
VALID_OUTPUT_TYPES = [:model, :temp]
|
49
|
+
VALID_OUTPUT_TYPES = [:model, :temp, :page_list]
|
50
50
|
|
51
51
|
#These options can be set upon wrapper creation
|
52
|
-
PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve]
|
52
|
+
PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve, :except, :example_type]
|
53
53
|
VALID_OPTIONS = PATTERN_OPTIONS + Scrubyt::CompoundExample::DESCRIPTORS + Scrubyt::ResultNode::OUTPUT_OPTIONS
|
54
54
|
|
55
|
-
attr_accessor(:name, :options, :children, :constraints, :filters, :parent,
|
56
|
-
:
|
57
|
-
:indices_to_extract, :referenced_extractor, :referenced_pattern,
|
58
|
-
:source_file, :source_proc, :modifier_calls)
|
55
|
+
attr_accessor(:name, :options, :children, :constraints, :filters, :parent, :extractor,
|
56
|
+
:indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls)
|
59
57
|
|
60
58
|
attr_reader(:next_page_url, :result_indexer)
|
61
59
|
|
62
60
|
option_reader(:type => :tree, :output_type => :model, :generalize => false,
|
63
61
|
:write_text => lambda { @children.size == 0 }, :limit => nil,
|
64
|
-
|
62
|
+
:default => nil, :resolve => :full, :except => nil, :example_type => nil)
|
65
63
|
|
66
|
-
def initialize(name, args=[],
|
64
|
+
def initialize(name, args=[], extractor=nil, parent=nil, &block)
|
67
65
|
#init attributes
|
68
66
|
@name = name
|
69
|
-
@
|
67
|
+
@extractor = extractor
|
70
68
|
@parent = parent
|
71
69
|
@options = {}
|
72
70
|
@children = []
|
@@ -83,6 +81,7 @@ module Scrubyt
|
|
83
81
|
#perform checks for special cases
|
84
82
|
examples = check_if_shortcut_pattern() if examples == nil
|
85
83
|
check_if_detail_page(block)
|
84
|
+
@options[:output_type] = :page_list if name == 'page_list'
|
86
85
|
|
87
86
|
#create filters
|
88
87
|
if examples == nil
|
@@ -97,7 +96,7 @@ module Scrubyt
|
|
97
96
|
#@generalize was not set up explicitly
|
98
97
|
if @options[:generalize].nil?
|
99
98
|
@options[:generalize] = true if parent.nil?
|
100
|
-
@options[:generalize] = false if filters[0].example =~ /.+\[[a-zA-Z].+\]$/
|
99
|
+
@options[:generalize] = false if ((filters[0].example.is_a? String) && (filters[0].example =~ /.+\[[a-zA-Z].+\]$/))
|
101
100
|
end
|
102
101
|
|
103
102
|
#parse child patterns if available
|
@@ -160,7 +159,6 @@ module Scrubyt
|
|
160
159
|
if @name =~ /.+_detail/
|
161
160
|
@options[:type] = :detail_page
|
162
161
|
@referenced_extractor = block
|
163
|
-
Scrubyt::Extractor.add_detail_extractor_to_pattern_name(block, self)
|
164
162
|
end
|
165
163
|
end
|
166
164
|
|
@@ -194,7 +192,7 @@ module Scrubyt
|
|
194
192
|
end
|
195
193
|
else
|
196
194
|
#create child pattern
|
197
|
-
child = Scrubyt::Pattern.new(method_name.to_s, args, @current.
|
195
|
+
child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
|
198
196
|
@current.children << child
|
199
197
|
child
|
200
198
|
end
|
@@ -213,7 +211,7 @@ module Scrubyt
|
|
213
211
|
#
|
214
212
|
# camera_data.item[1].item_name[0]
|
215
213
|
def method_missing(method_name, *args, &block)
|
216
|
-
if @
|
214
|
+
if @extractor.evaluating_extractor_definition
|
217
215
|
@modifier_calls << [method_name, [:array, *args.collect { |arg| [:lit, arg] }]]
|
218
216
|
end
|
219
217
|
|
@@ -294,9 +292,17 @@ module Scrubyt
|
|
294
292
|
result_nodes << node
|
295
293
|
end
|
296
294
|
if result_nodes.empty?
|
297
|
-
result_nodes << ResultNode.new(@name
|
295
|
+
result_nodes << ResultNode.new(@name, @options[:default], @options) if @options[:default]
|
296
|
+
end
|
297
|
+
case output_type
|
298
|
+
when :model
|
299
|
+
return result_nodes
|
300
|
+
when :page_list
|
301
|
+
result_nodes.each do |result_node|
|
302
|
+
@extractor.add_to_next_page_list result_node
|
303
|
+
end
|
304
|
+
return []
|
298
305
|
end
|
299
|
-
result_nodes
|
300
306
|
end
|
301
307
|
|
302
308
|
def to_sexp
|
@@ -310,8 +316,7 @@ module Scrubyt
|
|
310
316
|
|
311
317
|
if type == :detail_page
|
312
318
|
#add detail page extractor
|
313
|
-
|
314
|
-
sexp = [:iter, sexp, nil, [:block, detail_root.to_sexp]]
|
319
|
+
sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
|
315
320
|
else
|
316
321
|
#add child block if the pattern has children
|
317
322
|
sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
|
@@ -363,6 +368,8 @@ module Scrubyt
|
|
363
368
|
elsif (args[0].is_a? Hash)
|
364
369
|
examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
|
365
370
|
examples = nil if examples == []
|
371
|
+
elsif (args[0].is_a? Proc)
|
372
|
+
examples = [args[0]]
|
366
373
|
end
|
367
374
|
|
368
375
|
@has_examples = !examples.nil?
|
@@ -7,166 +7,147 @@ module Scrubyt
|
|
7
7
|
#
|
8
8
|
#Originally also the navigation actions were here, but since the class got too
|
9
9
|
#big, they were factored out into their own module, NavigationActions.
|
10
|
-
class Extractor
|
10
|
+
class Extractor
  include FetchAction

  attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url

  # The definition of the extractor is passed through this method.
  #
  # mode                 - extraction mode; :production is logged as
  #                        'Production', anything else as 'Learning'.
  # extractor_definition - block containing the navigation steps and patterns.
  #
  # Returns the result (a ScrubytResult) built by the new extractor.
  def self.define(mode=nil, &extractor_definition)
    extractor = self.new(mode, extractor_definition)
    extractor.result
  end

  # Reads +filename+, evaluates its content and uses the resulting value as
  # the extractor definition.
  #
  # NOTE(review): the file content is passed to eval, i.e. it is executed as
  # Ruby code - only load files from trusted locations.
  def self.load(filename)
    define(&eval(IO.read(filename)))
  end

  # Evaluates +extractor_definition+ and runs the extraction; the outcome is
  # stored in @result.
  #
  # The definition is instance_eval'ed on a throw-away context object that is
  # extended with NavigationActions. On that object +next_page+ registers the
  # next-page pattern and every other unknown method call creates a root
  # pattern named after the method.
  def initialize(mode, extractor_definition)
    @mode = mode
    @root_patterns = []
    @next_page_pattern = nil
    # @hpricot_doc = nil
    # @hpricot_doc_url = nil
    @evaluating_extractor_definition = false
    @next_page_list = []
    @processed_pages = []

    backtrace = SharedUtils.get_backtrace
    parts = backtrace[1].split(':')
    source_file = parts[0]

    Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'

    @evaluating_extractor_definition = true
    context = Object.new
    context.extend NavigationActions
    context.instance_eval do
      def extractor=(value)
        @extractor = value
      end

      def next_page(*args)
        @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
      end

      def method_missing(method_name, *args, &block)
        root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
        @extractor.root_patterns << root_pattern
        root_pattern
      end
    end
    context.extractor = self
    context.instance_eval(&extractor_definition)
    @evaluating_extractor_definition = false

    if @root_patterns.empty?
      # TODO: this should be an exception
      Scrubyt.log :ERROR, 'No extractor defined, exiting...'
      exit
    end

    # Once all is set up, evaluate the extractor from the root patterns!
    root_results = evaluate_extractor

    @result = ScrubytResult.new('root')
    @result.push(*root_results)
    @result.root_patterns = @root_patterns
    @result.source_file = source_file
    @result.source_proc = extractor_definition

    # The result is exposed through the +result+ accessor.
    Scrubyt.log :INFO, 'Extraction finished succesfully!'
  end

  # Returns the document as reported by FetchAction.
  def get_hpricot_doc
    FetchAction.get_hpricot_doc
  end

  # Returns the URL of the current document as reported by FetchAction.
  def get_current_doc_url
    FetchAction.get_current_doc_url
  end

  # Returns @detail_pattern_relations.
  # NOTE(review): nothing in this class assigns that variable - confirm it is
  # still set somewhere, otherwise this always returns nil.
  def get_detail_pattern_relations
    @detail_pattern_relations
  end

  # Returns the mode this extractor was created with.
  def get_mode
    @mode
  end

  # Returns @original_host_name.
  # NOTE(review): not assigned in this class; presumably maintained by the
  # included FetchAction - confirm.
  def get_original_host_name
    @original_host_name
  end

  # Queues the link carried by +result_node+ as a further page to process.
  #
  # For an Hpricot element the href of the nearest node having one is used;
  # for a String the string itself. Nodes without a usable link, and results
  # of any other type, are ignored.
  def add_to_next_page_list(result_node)
    if result_node.result.is_a? Hpricot::Elem
      node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
      return if node == nil || node.attributes['href'] == nil
      # Turn the escaped ampersands of the attribute value into plain ones.
      href = node.attributes['href'].gsub('&amp;') {'&'}
    elsif result_node.result.is_a? String
      href = result_node.result
    end
    # Never queue nil: it would later be handed to fetch as a URL.
    return if href.nil?
    url = href #TODO need absolute address here 1/4
    @next_page_list << url
  end

  # Evaluates every root pattern on the current document, then keeps moving
  # to the next page and evaluating again until there is nowhere left to go.
  #
  # The next page comes from the next-page pattern when one was defined
  # (bounded by its :limit option), otherwise from the list filled through
  # add_to_next_page_list.
  #
  # Returns the Array of results collected from all processed pages.
  def evaluate_extractor
    root_results = []
    current_page_count = 1
    catch :quit_next_page_loop do
      loop do
        url = get_current_doc_url #TODO need absolute address here 2/4
        puts url
        @processed_pages << url
        @root_patterns.each do |root_pattern|
          root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
        end

        # Look for a page that has not been processed yet.
        while @processed_pages.include? url #TODO need absolute address here 3/4
          if !@next_page_pattern.nil?
            throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
            throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
            xpath = @next_page_pattern.filters[0].xpath
            # to_a instead of a block-less map: the latter returns an
            # Enumerator on current Ruby versions, which has no #last.
            node = (get_hpricot_doc/xpath).to_a.last
            node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
            throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
            href = node.attributes['href'].gsub('&amp;') {'&'}
            # The document does not change inside this loop, so a link to an
            # already processed page would be found again and again. Stop
            # instead of spinning forever.
            throw :quit_next_page_loop if @processed_pages.include? href
            url = href #TODO need absolute address here 4/4
          else
            throw :quit_next_page_loop if @next_page_list.empty?
            url = @next_page_list.pop
          end
        end

        restore_host_name
        FetchAction.fetch(url)

        current_page_count += 1
      end
    end
    root_results
  end

end
|
153
|
+
end
|