scrubyt 0.3.0 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +13 -6
- data/Rakefile +22 -10
- data/lib/scrubyt.rb +9 -4
- data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
- data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/pattern.rb +25 -18
- data/lib/scrubyt/core/shared/extractor.rb +109 -128
- data/lib/scrubyt/logging.rb +146 -8
- data/lib/scrubyt/output/export.rb +60 -44
- data/lib/scrubyt/output/result_node.rb +34 -3
- data/lib/scrubyt/output/scrubyt_result.rb +18 -9
- data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
- metadata +52 -6
- data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67
@@ -14,17 +14,25 @@ module Scrubyt
|
|
14
14
|
|
15
15
|
private
|
16
16
|
def download_file(source)
|
17
|
-
|
17
|
+
return '' if source.size < 4
|
18
|
+
host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
|
18
19
|
outfile = nil
|
20
|
+
host_name += "/" if host_name[-1..-1] != "/"
|
19
21
|
base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
|
20
|
-
return '' if source.size < 4
|
21
22
|
file_name = source.scan(/.+\/(.*)/)[0][0]
|
23
|
+
return nil if @parent_pattern.except.include? file_name
|
22
24
|
Net::HTTP.start(base_url) { |http|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
25
|
+
Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
|
26
|
+
begin
|
27
|
+
ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
|
28
|
+
path = host_name.scan(/http:\/\/#{base_url}(.+)\//)
|
29
|
+
resp = http.get(path, {'User-Agent'=> ua})
|
30
|
+
outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
|
31
|
+
FileUtils.mkdir_p @example
|
32
|
+
open(outfile, 'wb') {|f| f.write(resp.body) }
|
33
|
+
rescue Timeout::Error
|
34
|
+
outfile = "[FAILED]#{file_name}"
|
35
|
+
end
|
28
36
|
}
|
29
37
|
outfile.scan(/.+\/(.*)/)[0][0]
|
30
38
|
end
|
@@ -34,11 +42,21 @@ private
|
|
34
42
|
loop do
|
35
43
|
if File.exists? file_name
|
36
44
|
if already_found
|
37
|
-
|
38
|
-
|
45
|
+
if file_name.include?('.')
|
46
|
+
last_no = file_name.scan(/_(\d+)\./)[0][0]
|
47
|
+
file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
|
48
|
+
else
|
49
|
+
last_no = file_name.scan(/_(\d+)$/)[0][0]
|
50
|
+
file_name.sub!(/_#{last_no}$/) {"_#{(last_no.to_i+1).to_s}"}
|
51
|
+
end
|
39
52
|
else
|
40
|
-
file_name.
|
41
|
-
|
53
|
+
if file_name.include?('.')
|
54
|
+
file_name.sub!(/\./) {"_1\."}
|
55
|
+
already_found = true
|
56
|
+
else
|
57
|
+
file_name << '_1'
|
58
|
+
already_found = true
|
59
|
+
end
|
42
60
|
end
|
43
61
|
else
|
44
62
|
break
|
@@ -1,17 +1,17 @@
|
|
1
1
|
module Scrubyt
|
2
2
|
class RegexpFilter < BaseFilter
|
3
|
-
|
3
|
+
|
4
4
|
def evaluate(source)
|
5
5
|
if source.is_a? String
|
6
6
|
source.scan(@example).flatten
|
7
7
|
else
|
8
|
-
source.
|
9
|
-
end
|
8
|
+
source.inner_html.gsub(/<.*?>/, '').scan(@example).flatten
|
9
|
+
end
|
10
10
|
end
|
11
|
-
|
11
|
+
|
12
12
|
def to_sexp
|
13
13
|
[:lit, @example]
|
14
14
|
end
|
15
|
-
|
15
|
+
|
16
16
|
end #End of class TreeFilter
|
17
17
|
end #End of module Scrubyt
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Scrubyt
  # Filter whose example is a callable object: the filter result is whatever
  # that callable returns for the (textual) source.
  class ScriptFilter < BaseFilter

    # Invokes @example with +source+ itself when it is a String; otherwise
    # with source.inner_html from which everything matching /<.*?>/ has been
    # removed. Returns the value produced by @example.call.
    def evaluate(source)
      text = source.is_a?(String) ? source : source.inner_html.gsub(/<.*?>/, "")
      @example.call text
    end

    # The example cannot be serialized, so a placeholder string node is
    # returned instead.
    def to_sexp
      [:str, "FIXME!!! Can't dump Proc"]
    end #end of method to_sexp
  end #End of class ScriptFilter
end #End of module Scrubyt
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Scrubyt
  # Filter that selects elements by the text they contain.
  #
  # Two example formats are recognized (see the regexps in the methods):
  #
  #   "name[text]"    optionally followed by ":N" (a number) or ":all"
  #   "find(a|b|c)"   handled by find_string
  class TextFilter < BaseFilter

    # Evaluates the example against +source+.
    #
    # A "find(...)" example is delegated to find_string. Otherwise the example
    # is split into the part before the first "[" (final_element_name), the
    # bracketed text (regexp-escaped before matching) and an optional suffix
    # after "]:". Without a suffix the match at position 0 is used; a suffix
    # other than "all" is converted with to_i and used as the position.
    #
    # Returns "" when the selected match is nil (or, for ":all", when
    # SharedUtils.traverse_for_match returns nil/false); otherwise an Array
    # with the value of XPathUtils.traverse_up_until_name for each selected
    # match.
    # NOTE(review): traverse_for_match is assumed to return an indexable
    # collection of matches - confirm against SharedUtils.
    def evaluate(source)
      return find_string(source) if @example =~ /^find\(/
      final_element_name = @example.scan(/^(.+?)\[/)[0][0]
      text = Regexp.escape(@example.scan(/\[(.+?)\]/)[0][0])

      # nil when the example has no "]:..." suffix
      index_spec = @example.scan(/\]:(.+)/).flatten[0]
      matches = SharedUtils.traverse_for_match(source, /#{text}/)

      # Branch on the suffix itself rather than on the class of a re-assigned
      # variable: the former check used Fixnum (removed in Ruby 3.2) and then
      # relied on Integer#[] to tell a number from ["all"].
      if index_spec == "all"
        return "" unless matches
        matches.inject([]) {|a,r| a << XPathUtils.traverse_up_until_name(r,final_element_name); a}
      else
        # nil.to_i == 0, so a missing suffix selects the first match
        result = matches[index_spec.to_i]
        return "" unless result
        [XPathUtils.traverse_up_until_name(result,final_element_name)]
      end
    end

    # Handles "find(a|b|...)" examples: each alternative is interpolated
    # (unescaped) into a case-insensitive regexp and looked up in +source+.
    #
    # Returns a one-element Array holding the first alternative for which
    # SharedUtils.traverse_for_match gives a non-empty result, or [] when no
    # alternative matches.
    def find_string(source)
      str = @example.scan(/find\((.+)\)/).flatten[0]
      # No space before the parenthesis: with "include? ('|') ? ..." Ruby 1.9+
      # parses the whole ternary as the argument of include?.
      strings_to_find = str.include?('|') ? str.split('|') : [str]
      strings_to_find.each do |s|
        result = SharedUtils.traverse_for_match(source,/#{s}/i)
        return [s] unless result.empty?
      end
      return []
    end

    # Serializes the filter as a string node carrying the example.
    def to_sexp
      [:str, @example]
    end #end of method to_sexp
  end #End of class TextFilter
end #End of module Scrubyt
|
38
|
+
|
@@ -38,7 +38,7 @@ module Scrubyt
|
|
38
38
|
return if @temp_sink.is_a? String
|
39
39
|
return if @example =~ /.+\[.+\]$/
|
40
40
|
|
41
|
-
text = SharedUtils.prepare_text_for_comparison(@temp_sink.
|
41
|
+
text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_html.gsub(/<.*?>/, ''))
|
42
42
|
match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
|
43
43
|
return if match_range == (0..text.length)
|
44
44
|
|
@@ -64,7 +64,7 @@ module Scrubyt
|
|
64
64
|
when EXAMPLE_TYPE_XPATH
|
65
65
|
@xpath = @example
|
66
66
|
when EXAMPLE_TYPE_STRING
|
67
|
-
@temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.
|
67
|
+
@temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
|
68
68
|
@example,
|
69
69
|
next_page_example)
|
70
70
|
return if @temp_sink == nil
|
@@ -116,10 +116,10 @@ module Scrubyt
|
|
116
116
|
current_example_index += 1
|
117
117
|
end
|
118
118
|
when EXAMPLE_TYPE_IMAGE
|
119
|
-
@temp_sink = XPathUtils.find_image(@parent_pattern.
|
119
|
+
@temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
|
120
120
|
@xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
|
121
121
|
when EXAMPLE_TYPE_COMPOUND
|
122
|
-
@temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.
|
122
|
+
@temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
|
123
123
|
@example,
|
124
124
|
next_page_example)
|
125
125
|
@xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
|
@@ -128,7 +128,7 @@ module Scrubyt
|
|
128
128
|
end
|
129
129
|
|
130
130
|
def generate_relative_XPath(parent_xpath)
|
131
|
-
parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.
|
131
|
+
parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
|
132
132
|
parent_xpath,
|
133
133
|
@parent_pattern.parent.generalize) if parent_xpath =~ /(\[@.+=.+\])$/
|
134
134
|
@xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
|
@@ -33,7 +33,7 @@ module Scrubyt
|
|
33
33
|
# # write out the HTML subtree beginning at the matched element
|
34
34
|
# PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
|
35
35
|
|
36
|
-
VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree]
|
36
|
+
VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
|
37
37
|
|
38
38
|
#The pattern can be either a model pattern (in this case it is
|
39
39
|
#written to the output) or a temp pattern (in this case it is skipped)
|
@@ -46,27 +46,25 @@ module Scrubyt
|
|
46
46
|
# #of the pattrern which was skipped
|
47
47
|
# OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
|
48
48
|
|
49
|
-
VALID_OUTPUT_TYPES = [:model, :temp]
|
49
|
+
VALID_OUTPUT_TYPES = [:model, :temp, :page_list]
|
50
50
|
|
51
51
|
#These options can be set upon wrapper creation
|
52
|
-
PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve]
|
52
|
+
PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve, :except, :example_type]
|
53
53
|
VALID_OPTIONS = PATTERN_OPTIONS + Scrubyt::CompoundExample::DESCRIPTORS + Scrubyt::ResultNode::OUTPUT_OPTIONS
|
54
54
|
|
55
|
-
attr_accessor(:name, :options, :children, :constraints, :filters, :parent,
|
56
|
-
:
|
57
|
-
:indices_to_extract, :referenced_extractor, :referenced_pattern,
|
58
|
-
:source_file, :source_proc, :modifier_calls)
|
55
|
+
attr_accessor(:name, :options, :children, :constraints, :filters, :parent, :extractor,
|
56
|
+
:indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls)
|
59
57
|
|
60
58
|
attr_reader(:next_page_url, :result_indexer)
|
61
59
|
|
62
60
|
option_reader(:type => :tree, :output_type => :model, :generalize => false,
|
63
61
|
:write_text => lambda { @children.size == 0 }, :limit => nil,
|
64
|
-
|
62
|
+
:default => nil, :resolve => :full, :except => nil, :example_type => nil)
|
65
63
|
|
66
|
-
def initialize(name, args=[],
|
64
|
+
def initialize(name, args=[], extractor=nil, parent=nil, &block)
|
67
65
|
#init attributes
|
68
66
|
@name = name
|
69
|
-
@
|
67
|
+
@extractor = extractor
|
70
68
|
@parent = parent
|
71
69
|
@options = {}
|
72
70
|
@children = []
|
@@ -83,6 +81,7 @@ module Scrubyt
|
|
83
81
|
#perform checks for special cases
|
84
82
|
examples = check_if_shortcut_pattern() if examples == nil
|
85
83
|
check_if_detail_page(block)
|
84
|
+
@options[:output_type] = :page_list if name == 'page_list'
|
86
85
|
|
87
86
|
#create filters
|
88
87
|
if examples == nil
|
@@ -97,7 +96,7 @@ module Scrubyt
|
|
97
96
|
#@generalize was not set up explicitly
|
98
97
|
if @options[:generalize].nil?
|
99
98
|
@options[:generalize] = true if parent.nil?
|
100
|
-
@options[:generalize] = false if filters[0].example =~ /.+\[[a-zA-Z].+\]$/
|
99
|
+
@options[:generalize] = false if ((filters[0].example.is_a? String) && (filters[0].example =~ /.+\[[a-zA-Z].+\]$/))
|
101
100
|
end
|
102
101
|
|
103
102
|
#parse child patterns if available
|
@@ -160,7 +159,6 @@ module Scrubyt
|
|
160
159
|
if @name =~ /.+_detail/
|
161
160
|
@options[:type] = :detail_page
|
162
161
|
@referenced_extractor = block
|
163
|
-
Scrubyt::Extractor.add_detail_extractor_to_pattern_name(block, self)
|
164
162
|
end
|
165
163
|
end
|
166
164
|
|
@@ -194,7 +192,7 @@ module Scrubyt
|
|
194
192
|
end
|
195
193
|
else
|
196
194
|
#create child pattern
|
197
|
-
child = Scrubyt::Pattern.new(method_name.to_s, args, @current.
|
195
|
+
child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
|
198
196
|
@current.children << child
|
199
197
|
child
|
200
198
|
end
|
@@ -213,7 +211,7 @@ module Scrubyt
|
|
213
211
|
#
|
214
212
|
# camera_data.item[1].item_name[0]
|
215
213
|
def method_missing(method_name, *args, &block)
|
216
|
-
if @
|
214
|
+
if @extractor.evaluating_extractor_definition
|
217
215
|
@modifier_calls << [method_name, [:array, *args.collect { |arg| [:lit, arg] }]]
|
218
216
|
end
|
219
217
|
|
@@ -294,9 +292,17 @@ module Scrubyt
|
|
294
292
|
result_nodes << node
|
295
293
|
end
|
296
294
|
if result_nodes.empty?
|
297
|
-
result_nodes << ResultNode.new(@name
|
295
|
+
result_nodes << ResultNode.new(@name, @options[:default], @options) if @options[:default]
|
296
|
+
end
|
297
|
+
case output_type
|
298
|
+
when :model
|
299
|
+
return result_nodes
|
300
|
+
when :page_list
|
301
|
+
result_nodes.each do |result_node|
|
302
|
+
@extractor.add_to_next_page_list result_node
|
303
|
+
end
|
304
|
+
return []
|
298
305
|
end
|
299
|
-
result_nodes
|
300
306
|
end
|
301
307
|
|
302
308
|
def to_sexp
|
@@ -310,8 +316,7 @@ module Scrubyt
|
|
310
316
|
|
311
317
|
if type == :detail_page
|
312
318
|
#add detail page extractor
|
313
|
-
|
314
|
-
sexp = [:iter, sexp, nil, [:block, detail_root.to_sexp]]
|
319
|
+
sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
|
315
320
|
else
|
316
321
|
#add child block if the pattern has children
|
317
322
|
sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
|
@@ -363,6 +368,8 @@ module Scrubyt
|
|
363
368
|
elsif (args[0].is_a? Hash)
|
364
369
|
examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
|
365
370
|
examples = nil if examples == []
|
371
|
+
elsif (args[0].is_a? Proc)
|
372
|
+
examples = [args[0]]
|
366
373
|
end
|
367
374
|
|
368
375
|
@has_examples = !examples.nil?
|
@@ -7,166 +7,147 @@ module Scrubyt
|
|
7
7
|
#
|
8
8
|
#Originally also the navigation actions were here, but since the class got too
|
9
9
|
#big, they were factored out to an own class, NavigationAction.
|
10
|
-
class Extractor
|
10
|
+
class Extractor
|
11
|
+
include FetchAction
|
12
|
+
|
13
|
+
attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url
|
14
|
+
|
11
15
|
#The definition of the extractor is passed through this method
|
12
16
|
def self.define(mode=nil, &extractor_definition)
|
17
|
+
extractor = self.new(mode, extractor_definition)
|
18
|
+
extractor.result
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.load(filename)
|
22
|
+
define(&eval(IO.read(filename)))
|
23
|
+
end
|
24
|
+
|
25
|
+
def initialize(mode, extractor_definition)
|
26
|
+
@mode = mode
|
27
|
+
@root_patterns = []
|
28
|
+
@next_page_pattern = nil
|
29
|
+
# @hpricot_doc = nil
|
30
|
+
# @hpricot_doc_url = nil
|
31
|
+
@evaluating_extractor_definition = false
|
32
|
+
@next_page_list = []
|
33
|
+
@processed_pages = []
|
34
|
+
|
13
35
|
backtrace = SharedUtils.get_backtrace
|
14
36
|
parts = backtrace[1].split(':')
|
15
37
|
source_file = parts[0]
|
16
38
|
|
17
|
-
|
18
|
-
#We are keeping the relations between the detail patterns and their root patterns
|
19
|
-
@@detail_extractor_to_pattern_name = {}
|
20
|
-
@@detail_pattern_relations = {}
|
21
|
-
#root pattern -> URIBuilder mapping
|
22
|
-
@@next_patterns = {}
|
23
|
-
mode_name = (mode == :production ? 'Production' : 'Learning')
|
39
|
+
Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
|
24
40
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
41
|
+
@evaluating_extractor_definition = true
|
42
|
+
context = Object.new
|
43
|
+
context.extend NavigationActions
|
44
|
+
context.instance_eval do
|
45
|
+
def extractor=(value)
|
46
|
+
@extractor = value
|
47
|
+
end
|
48
|
+
|
49
|
+
def next_page(*args)
|
50
|
+
@extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
|
51
|
+
end
|
52
|
+
|
53
|
+
def method_missing(method_name, *args, &block)
|
54
|
+
root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
|
55
|
+
@extractor.root_patterns << root_pattern
|
56
|
+
root_pattern
|
57
|
+
end
|
58
|
+
end
|
59
|
+
context.extractor = self
|
60
|
+
context.instance_eval(&extractor_definition)
|
61
|
+
@evaluating_extractor_definition = false
|
62
|
+
|
63
|
+
if @root_patterns.empty?
|
37
64
|
# TODO: this should be an exception
|
38
65
|
Scrubyt.log :ERROR, 'No extractor defined, exiting...'
|
39
66
|
exit
|
40
67
|
end
|
41
|
-
|
42
|
-
root_pattern.source_file = source_file
|
43
|
-
root_pattern.source_proc = extractor_definition
|
68
|
+
|
44
69
|
#Once all is set up, evaluate the extractor from the root pattern!
|
45
|
-
root_results = evaluate_extractor
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
70
|
+
root_results = evaluate_extractor
|
71
|
+
|
72
|
+
@result = ScrubytResult.new('root')
|
73
|
+
@result.push(*root_results)
|
74
|
+
@result.root_patterns = @root_patterns
|
75
|
+
@result.source_file = source_file
|
76
|
+
@result.source_proc = extractor_definition
|
77
|
+
|
51
78
|
#Return the root pattern
|
52
79
|
Scrubyt.log :INFO, 'Extraction finished succesfully!'
|
53
|
-
scrubyt_result
|
54
|
-
end
|
55
|
-
|
56
|
-
#Evaluate a subexttractor (i.e. an extractor on a detail page).
|
57
|
-
#The url passed to this function is automatically loaded.
|
58
|
-
#The definition of the subextractor is passed as a block
|
59
|
-
#
|
60
|
-
#!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
|
61
|
-
def self.evaluate_subextractor(url, parent_pattern, resolve)
|
62
|
-
if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
|
63
|
-
detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
|
64
|
-
detail_root.last_result = nil
|
65
|
-
FetchAction.store_page
|
66
|
-
@@original_evaluation_context.push @@evaluation_context
|
67
|
-
@@host_stack.push FetchAction.get_host_name
|
68
|
-
@@evaluation_context = EvaluationContext.new
|
69
|
-
FetchAction.restore_host_name
|
70
|
-
fetch url, :resolve => resolve
|
71
|
-
@@evaluation_context.extractor = self
|
72
|
-
@@evaluation_context.root_pattern = detail_root
|
73
|
-
root_results = evaluate_extractor detail_root
|
74
|
-
@@evaluation_context = @@original_evaluation_context.pop
|
75
|
-
FetchAction.restore_page
|
76
|
-
FetchAction.store_host_name(@@host_stack.pop)
|
77
|
-
root_results
|
78
|
-
else
|
79
|
-
@@original_evaluation_context ||= []
|
80
|
-
@@host_stack ||= []
|
81
|
-
FetchAction.store_page
|
82
|
-
@@original_evaluation_context.push @@evaluation_context
|
83
|
-
@@host_stack.push FetchAction.get_host_name
|
84
|
-
@@evaluation_context = EvaluationContext.new
|
85
|
-
FetchAction.restore_host_name
|
86
|
-
fetch url, :resolve => resolve
|
87
|
-
class_eval(&parent_pattern.referenced_extractor)
|
88
|
-
root_pattern = @@evaluation_context.root_pattern
|
89
|
-
@@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern
|
90
|
-
root_results = evaluate_extractor(root_pattern)
|
91
|
-
@@evaluation_context = @@original_evaluation_context.pop
|
92
|
-
FetchAction.restore_page
|
93
|
-
FetchAction.store_host_name(@@host_stack.pop)
|
94
|
-
root_results
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
#build the current wrapper
|
99
|
-
def self.method_missing(method_name, *args, &block)
|
100
|
-
if NavigationActions::KEYWORDS.include? method_name.to_s
|
101
|
-
NavigationActions.send(method_name, *args)
|
102
|
-
return
|
103
|
-
end
|
104
|
-
|
105
|
-
if method_name.to_s == 'next_page'
|
106
|
-
pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context)
|
107
|
-
pattern.evaluation_context = @@evaluation_context
|
108
|
-
|
109
|
-
@@evaluation_context.setup_uri_builder(pattern, args)
|
110
|
-
@@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
|
111
|
-
else
|
112
|
-
raise "Only one root pattern allowed" if !@@evaluation_context.root_pattern.nil?
|
113
|
-
#Create a root pattern
|
114
|
-
@@evaluation_context.extractor = self
|
115
|
-
root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context, root_pattern, &block)
|
116
|
-
@@last_root_pattern = root_pattern
|
117
|
-
@@evaluation_context.root_pattern = root_pattern
|
118
|
-
root_pattern
|
119
|
-
end
|
120
80
|
end
|
121
81
|
|
122
|
-
def
|
123
|
-
|
124
|
-
end
|
125
|
-
|
126
|
-
def self.get_detail_extractor(parent_pattern)
|
127
|
-
@@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
|
128
|
-
end
|
129
|
-
|
130
|
-
def self.get_hpricot_doc
|
131
|
-
NavigationActions.get_hpricot_doc
|
82
|
+
def get_hpricot_doc
|
83
|
+
FetchAction.get_hpricot_doc
|
132
84
|
end
|
133
85
|
|
134
|
-
def
|
135
|
-
|
86
|
+
def get_current_doc_url
|
87
|
+
FetchAction.get_current_doc_url
|
136
88
|
end
|
137
89
|
|
138
|
-
def
|
139
|
-
|
90
|
+
def get_detail_pattern_relations
|
91
|
+
@detail_pattern_relations
|
140
92
|
end
|
141
93
|
|
142
|
-
def
|
143
|
-
|
94
|
+
def get_mode
|
95
|
+
@mode
|
144
96
|
end
|
145
97
|
|
146
|
-
def
|
147
|
-
|
98
|
+
def get_original_host_name
|
99
|
+
@original_host_name
|
148
100
|
end
|
149
101
|
|
150
|
-
def
|
151
|
-
|
102
|
+
def add_to_next_page_list(result_node)
|
103
|
+
if result_node.result.is_a? Hpricot::Elem
|
104
|
+
node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
|
105
|
+
return if node == nil || node.attributes['href'] == nil
|
106
|
+
href = node.attributes['href'].gsub('&') {'&'}
|
107
|
+
elsif result_node.result.is_a? String
|
108
|
+
href = result_node.result
|
109
|
+
end
|
110
|
+
url = href #TODO need absolute address here 1/4
|
111
|
+
@next_page_list << url
|
152
112
|
end
|
153
113
|
|
154
|
-
|
155
|
-
|
156
|
-
def self.evaluate_extractor(root_pattern)
|
114
|
+
def evaluate_extractor
|
157
115
|
root_results = []
|
158
|
-
|
159
|
-
|
116
|
+
current_page_count = 1
|
117
|
+
catch :quit_next_page_loop do
|
160
118
|
loop do
|
161
|
-
|
162
|
-
|
163
|
-
|
119
|
+
url = get_current_doc_url #TODO need absolute address here 2/4
|
120
|
+
puts url
|
121
|
+
@processed_pages << url
|
122
|
+
@root_patterns.each do |root_pattern|
|
123
|
+
root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
|
124
|
+
end
|
125
|
+
|
126
|
+
while @processed_pages.include? url #TODO need absolute address here 3/4
|
127
|
+
if !@next_page_pattern.nil?
|
128
|
+
throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
|
129
|
+
throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
|
130
|
+
xpath = @next_page_pattern.filters[0].xpath
|
131
|
+
node = (get_hpricot_doc/xpath).map.last
|
132
|
+
node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
|
133
|
+
throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
|
134
|
+
href = node.attributes['href'].gsub('&') {'&'}
|
135
|
+
throw :quit_next_page_loop if href == nil
|
136
|
+
url = href #TODO need absolute address here 4/4
|
137
|
+
else
|
138
|
+
throw :quit_next_page_loop if @next_page_list.empty?
|
139
|
+
url = @next_page_list.pop
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
restore_host_name
|
144
|
+
FetchAction.fetch(url)
|
145
|
+
|
146
|
+
current_page_count += 1
|
164
147
|
end
|
165
|
-
else
|
166
|
-
root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
|
167
148
|
end
|
168
149
|
root_results
|
169
150
|
end
|
170
151
|
|
171
|
-
end
|
172
|
-
end
|
152
|
+
end
|
153
|
+
end
|