scrubyt 0.3.0 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +13 -6
- data/Rakefile +22 -10
- data/lib/scrubyt.rb +9 -4
- data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
- data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/pattern.rb +25 -18
- data/lib/scrubyt/core/shared/extractor.rb +109 -128
- data/lib/scrubyt/logging.rb +146 -8
- data/lib/scrubyt/output/export.rb +60 -44
- data/lib/scrubyt/output/result_node.rb +34 -3
- data/lib/scrubyt/output/scrubyt_result.rb +18 -9
- data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
- metadata +52 -6
- data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67
data/lib/scrubyt/logging.rb
CHANGED
@@ -1,16 +1,154 @@
|
|
1
|
+
#
|
2
|
+
# TODO: if multiline messages aren't needed, then remove them.
|
3
|
+
#
|
4
|
+
# TODO: switch to the conventional Ruby logger interface,
|
5
|
+
# or create an adapter to it. If the former, then decide what to
|
6
|
+
# do with the unit tests.
|
7
|
+
#
|
8
|
+
|
1
9
|
module Scrubyt
|
2
|
-
|
10
|
+
# Logging is disabled by default. It can be enabled as follows:
|
11
|
+
#
|
12
|
+
# Scrubyt.logger = Scrubyt::Logger.new # logs *all* messages to STDERR
|
13
|
+
#
|
14
|
+
def self.logger=(logger)
|
15
|
+
@logger = logger
|
16
|
+
end
|
17
|
+
|
18
|
+
# Simple logger implementation, based on Scrubyt's original logging style.
|
19
|
+
# Messages will be sent to STDERR. Logging can be limited to certain message
|
20
|
+
# levels by specifying them on initialization, e.g.
|
21
|
+
#
|
22
|
+
# Scrubyt::Logger.new(:ACTION, :ERROR) # will only log action/error messages
|
23
|
+
#
|
24
|
+
class Logger
|
25
|
+
class Message
|
26
|
+
def initialize(level, text)
|
27
|
+
@level, @text = level.to_s, text.to_s
|
28
|
+
end
|
29
|
+
|
30
|
+
def to_s
|
31
|
+
prefix + @text
|
32
|
+
end
|
33
|
+
|
34
|
+
protected
|
35
|
+
|
36
|
+
def prefix
|
37
|
+
@prefix ||= "[#{@level}] "
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
class MultiLineMessage < Message
|
42
|
+
def initialize(level, lines)
|
43
|
+
super level, lines.shift
|
44
|
+
|
45
|
+
@lines = lines
|
46
|
+
end
|
47
|
+
|
48
|
+
def to_s
|
49
|
+
[ super, indented_lines ] * "\n"
|
50
|
+
end
|
51
|
+
|
52
|
+
private
|
53
|
+
|
54
|
+
def indented_lines
|
55
|
+
@lines.inject([]) { |lines, line| lines << indented(line) } * "\n"
|
56
|
+
end
|
57
|
+
|
58
|
+
def indented(line)
|
59
|
+
' ' * prefix.length + line
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def initialize(*levels)
|
64
|
+
@levels = levels
|
65
|
+
end
|
66
|
+
|
67
|
+
def log(level, message)
|
68
|
+
return unless logging?(level)
|
69
|
+
|
70
|
+
message_class = message.is_a?(Array) ? MultiLineMessage : Message
|
71
|
+
|
72
|
+
output_stream.puts message_class.new(level, message)
|
73
|
+
end
|
74
|
+
|
75
|
+
def output_stream
|
76
|
+
@output_stream || STDERR
|
77
|
+
end
|
3
78
|
|
4
|
-
|
79
|
+
attr_writer :output_stream
|
5
80
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
81
|
+
private
|
82
|
+
|
83
|
+
def logging?(level)
|
84
|
+
@levels.empty? || @levels.include?(level)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def self.log(level, message)
|
89
|
+
return if logger.nil?
|
90
|
+
|
91
|
+
logger.log(level, message)
|
92
|
+
end
|
93
|
+
|
94
|
+
private
|
95
|
+
|
96
|
+
def self.logger
|
97
|
+
@logger
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
|
102
|
+
if __FILE__ == $0 then
|
103
|
+
|
104
|
+
require 'test/unit'
|
105
|
+
|
106
|
+
class ScrubytLoggingTestCase < Test::Unit::TestCase
|
107
|
+
class FauxOutputStream < Array
|
108
|
+
def puts(object)
|
109
|
+
self << object.to_s
|
10
110
|
end
|
11
|
-
else
|
12
|
-
puts pre + message.to_s
|
13
111
|
end
|
14
112
|
|
113
|
+
def setup_logger_with_faux_output_stream!(*logger_args)
|
114
|
+
@stream = FauxOutputStream.new
|
115
|
+
logger = Scrubyt::Logger.new(*logger_args)
|
116
|
+
logger.output_stream = @stream
|
117
|
+
Scrubyt.logger = logger
|
118
|
+
end
|
119
|
+
|
120
|
+
def test_that_logging_works_with_nil_logger
|
121
|
+
Scrubyt.logger = nil
|
122
|
+
assert_nothing_raised { Scrubyt.log(:ERROR, 'message') }
|
123
|
+
end
|
124
|
+
|
125
|
+
def test_simple_messages_are_output_correctly
|
126
|
+
setup_logger_with_faux_output_stream!
|
127
|
+
|
128
|
+
Scrubyt.log :ACTION, 'i just did something'
|
129
|
+
|
130
|
+
assert_equal 1, @stream.size
|
131
|
+
assert_equal '[ACTION] i just did something', @stream.first
|
132
|
+
end
|
133
|
+
|
134
|
+
def test_that_multiline_messages_are_output_correctly
|
135
|
+
setup_logger_with_faux_output_stream!
|
136
|
+
|
137
|
+
Scrubyt.log :ERROR, ['something bad happened', 'dear oh dear']
|
138
|
+
|
139
|
+
assert_equal 1, @stream.size
|
140
|
+
assert_equal "[ERROR] something bad happened\n dear oh dear", @stream.first
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_that_loggers_can_be_limited_to_specfied_message_levels
|
144
|
+
setup_logger_with_faux_output_stream! :ERROR
|
145
|
+
|
146
|
+
Scrubyt.log :ACTION, 'i just did something'
|
147
|
+
Scrubyt.log :ERROR, 'something bad happened'
|
148
|
+
|
149
|
+
assert_equal 1, @stream.size
|
150
|
+
assert_equal '[ERROR] something bad happened', @stream.first
|
151
|
+
end
|
15
152
|
end
|
153
|
+
|
16
154
|
end
|
@@ -61,47 +61,70 @@ module Scrubyt
|
|
61
61
|
#This snippet will export the extractor to a file named 'my_super_camera_extractor.rb'.
|
62
62
|
#After running 'my_super_camera_extractor.rb', the result will be dumped to the file
|
63
63
|
#'/home/peter/stuff/result.xml'.
|
64
|
-
def self.export(
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
64
|
+
def self.export(root_patterns, data)
|
65
|
+
wrapper_name = data[:wrapper_name]
|
66
|
+
template = data[:template] || 'default'
|
67
|
+
output_file_name = data[:output_file_name]
|
68
|
+
extractor_result_file_name = data[:extractor_result_file_name]
|
69
69
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
70
|
+
case template
|
71
|
+
when 'default'
|
72
|
+
header = "require 'rubygems'\n"
|
73
|
+
header += "require 'scrubyt'\n\n"
|
74
|
+
header += "#{wrapper_name} = Scrubyt::Extractor.define do\n"
|
75
|
+
if extractor_result_file_name
|
76
|
+
footer = "\nend\n\n#{wrapper_name}.to_xml.write(open('#{extractor_result_file_name}', 'w'), 1)"
|
77
|
+
else
|
78
|
+
footer = "\nend\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
|
79
|
+
end
|
80
|
+
indent = 1
|
81
|
+
when 'lambda'
|
82
|
+
header = "lambda do\n"
|
83
|
+
footer = "\nend"
|
84
|
+
indent = 1
|
85
|
+
when 'none'
|
86
|
+
header = ''
|
87
|
+
footer = ''
|
88
|
+
indent = 0
|
89
|
+
else
|
90
|
+
puts "Unknown template type: #{template}"
|
91
|
+
return
|
92
|
+
end
|
93
|
+
|
94
|
+
header = data[:header] if data[:header]
|
95
|
+
footer = data[:footer] if data[:footer]
|
96
|
+
indent = data[:indent] if data[:indent]
|
79
97
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
98
|
+
sexp = [:block]
|
99
|
+
sexp << export_extractor(root_patterns, data[:source_proc], wrapper_name)
|
100
|
+
extractor = RubyToRuby.new.process(sexp).gsub('"' + data[:source_file] + '"', '__FILE__')
|
101
|
+
extractor = extractor.strip
|
102
|
+
extractor = extractor.split("\n").map{ |l| (' ' * indent) + l }.join("\n")
|
84
103
|
|
85
|
-
|
86
|
-
|
104
|
+
result = header + extractor + footer
|
105
|
+
|
106
|
+
if output_file_name
|
107
|
+
open(output_file_name, 'w') do |file|
|
108
|
+
file.write(result)
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
result
|
87
113
|
end
|
88
114
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
else
|
93
|
-
create_sexp "#{wrapper_name}.to_xml.write($stdout, 1)"
|
94
|
-
end
|
115
|
+
private
|
116
|
+
def self.create_sexp(code)
|
117
|
+
(ParseTree.new.parse_tree_for_string(code))[0]
|
95
118
|
end
|
96
119
|
|
97
|
-
def self.export_extractor(
|
120
|
+
def self.export_extractor(root_patterns, source_proc, wrapper_name)
|
98
121
|
# filter actions before and after pattern
|
99
122
|
pre_pattern_sexp = []
|
100
123
|
post_pattern_sexp = []
|
101
|
-
|
102
|
-
actions = ['next_page', *NavigationActions
|
124
|
+
patterns_passed = false
|
125
|
+
actions = ['next_page', *NavigationActions.instance_methods]
|
103
126
|
|
104
|
-
|
127
|
+
source_proc.to_sexp[3][1..-1].each do |sexp|
|
105
128
|
get_call = lambda { |sexp|
|
106
129
|
if sexp[0] == :fcall
|
107
130
|
return sexp[1].to_s
|
@@ -113,29 +136,22 @@ private
|
|
113
136
|
}
|
114
137
|
call = get_call.call(sexp)
|
115
138
|
if(call.nil? || actions.index(call) != nil)
|
116
|
-
if !
|
139
|
+
if !patterns_passed
|
117
140
|
pre_pattern_sexp.push(sexp)
|
118
141
|
else
|
119
142
|
post_pattern_sexp.push(sexp)
|
120
143
|
end
|
121
144
|
else
|
122
|
-
|
123
|
-
pattern_skipped = true
|
145
|
+
patterns_passed = true
|
124
146
|
end
|
125
147
|
end
|
126
|
-
|
127
|
-
# build extractor content
|
128
|
-
inner_block = [:block]
|
129
|
-
inner_block.push([:block, *pre_pattern_sexp])
|
130
|
-
inner_block.push([:block, export_pattern(root_pattern)])
|
131
|
-
inner_block.push([:block, *post_pattern_sexp])
|
132
148
|
|
133
149
|
# build extractor
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
150
|
+
sexp = [:block]
|
151
|
+
sexp.push([:block, *pre_pattern_sexp])
|
152
|
+
sexp.push([:block, *root_patterns.to_sexp_array])
|
153
|
+
sexp.push([:block, *post_pattern_sexp])
|
154
|
+
sexp
|
139
155
|
end
|
140
156
|
end
|
141
157
|
end
|
@@ -20,7 +20,7 @@ module Scrubyt
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def to_s
|
23
|
-
text = (@result.is_a? String) ? @result : @result.
|
23
|
+
text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
|
24
24
|
text = SharedUtils.unescape_entities(text)
|
25
25
|
text.strip!
|
26
26
|
text
|
@@ -38,10 +38,10 @@ module Scrubyt
|
|
38
38
|
to_xml_lines.join("\n")
|
39
39
|
end
|
40
40
|
|
41
|
-
def to_hash
|
41
|
+
def to_hash(delimiter=',')
|
42
42
|
result = []
|
43
43
|
flat_hash_inner = lambda {|e, hash|
|
44
|
-
hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] +
|
44
|
+
hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + delimiter + e.to_s : e.to_s if e.write_text && !e.to_s.empty?
|
45
45
|
e.each {|c| flat_hash_inner.call(c, hash) }
|
46
46
|
hash
|
47
47
|
}
|
@@ -49,6 +49,37 @@ module Scrubyt
|
|
49
49
|
result
|
50
50
|
end
|
51
51
|
|
52
|
+
def to_flat_xml(delimiter=nil)
|
53
|
+
lines = []
|
54
|
+
hash_result = delimiter ? self.to_hash(delimiter) : self.to_hash
|
55
|
+
|
56
|
+
if delimiter
|
57
|
+
result_sets = hash_result[0].values.map!{|x| x.split(delimiter)}.transpose
|
58
|
+
final_result = []
|
59
|
+
|
60
|
+
result_sets.each do |rs|
|
61
|
+
temp_result = {}
|
62
|
+
hash_result[0].keys.each do |k|
|
63
|
+
temp_result[k] = rs[hash_result[0].keys.index(k)]
|
64
|
+
end
|
65
|
+
final_result << temp_result
|
66
|
+
end
|
67
|
+
hash_result = final_result
|
68
|
+
end
|
69
|
+
|
70
|
+
hash_result.each do |hash|
|
71
|
+
lines << "<item>"
|
72
|
+
hash.each do |key, value|
|
73
|
+
xml_tag = key.to_s
|
74
|
+
value = '' if value == '#empty#'
|
75
|
+
lines << " <#{xml_tag}>#{REXML::Text.normalize(value)}</#{xml_tag}>"
|
76
|
+
end
|
77
|
+
lines << "</item>"
|
78
|
+
end
|
79
|
+
return lines.join("\n")
|
80
|
+
|
81
|
+
end
|
82
|
+
|
52
83
|
def to_xml_lines
|
53
84
|
lines = []
|
54
85
|
children = self.select{ |child| child.has_content? }
|
@@ -1,13 +1,16 @@
|
|
1
1
|
module Scrubyt
|
2
2
|
class ScrubytResult < ResultNode
|
3
|
-
attr_accessor :
|
3
|
+
attr_accessor :root_patterns, :source_file, :source_proc
|
4
4
|
|
5
5
|
def export(arg1, output_file_name=nil, extractor_result_file_name=nil)
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
if arg1.is_a? String
|
7
|
+
if File.exists? arg1
|
8
|
+
export_old1(arg1, output_file_name, extractor_result_file_name)
|
9
|
+
else
|
10
|
+
export_old2(arg1, output_file_name, extractor_result_file_name)
|
11
|
+
end
|
9
12
|
else
|
10
|
-
|
13
|
+
export_new(arg1)
|
11
14
|
end
|
12
15
|
end
|
13
16
|
|
@@ -15,14 +18,20 @@ module Scrubyt
|
|
15
18
|
#Implement me...
|
16
19
|
end
|
17
20
|
|
18
|
-
def
|
21
|
+
def export_old1(input_file, output_file_name=nil, extractor_result_file_name=nil)
|
19
22
|
contents = open(input_file).read
|
20
23
|
wrapper_name = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)[0][0]
|
21
|
-
|
24
|
+
export_old2(wrapper_name, output_file_name, extractor_result_file_name)
|
22
25
|
end
|
23
26
|
|
24
|
-
def
|
25
|
-
|
27
|
+
def export_old2(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
|
28
|
+
export_new({ :wrapper_name => wrapper_name, :output_file_name => output_file_name || "#{wrapper_name}_extractor_export.rb", :extractor_result_file_name => extractor_result_file_name })
|
29
|
+
end
|
30
|
+
|
31
|
+
def export_new(data)
|
32
|
+
data[:source_file] = @source_file
|
33
|
+
data[:source_proc] = @source_proc
|
34
|
+
Scrubyt::Export.export(@root_patterns, data)
|
26
35
|
end
|
27
36
|
end
|
28
37
|
end
|
@@ -43,7 +43,7 @@ private
|
|
43
43
|
end
|
44
44
|
|
45
45
|
def self.refine_partial_results(regexp)
|
46
|
-
@partial_results = @partial_results.select {|pr| pr.
|
46
|
+
@partial_results = @partial_results.select {|pr| pr.inner_html.gsub(/<.*?>/, '') =~ regexp}
|
47
47
|
end
|
48
48
|
|
49
49
|
end #End of class CompoundExampleLookup
|
@@ -28,7 +28,7 @@ module Scrubyt
|
|
28
28
|
def self.traverse_for_match(node, regexp)
|
29
29
|
results = []
|
30
30
|
traverse_for_match_inner = lambda { |node, regexp|
|
31
|
-
ft = prepare_text_for_comparison(node.
|
31
|
+
ft = prepare_text_for_comparison(node.inner_html.gsub(/<.*?>/, ''))
|
32
32
|
if ft =~ regexp
|
33
33
|
node.instance_eval do
|
34
34
|
@match_data = $~
|
@@ -16,7 +16,7 @@ module Scrubyt
|
|
16
16
|
#In this case, <a>'s text is considered to be "Bon nuit, monsieur"
|
17
17
|
def self.find_node_from_text(doc, text, next_link=false, index = 0)
|
18
18
|
text.gsub!('»', '&#187;')
|
19
|
-
#Process immediate attribute extraction (like "go to google.com
|
19
|
+
#Process immediate attribute extraction (like "go to google.com/@href")
|
20
20
|
if text =~ /.+\/@.+$/
|
21
21
|
text = text.scan(/^(.+?)\/@.+$/)[0][0]
|
22
22
|
elsif text =~ /.+\[\d+\]$/
|
@@ -27,10 +27,14 @@ module Scrubyt
|
|
27
27
|
final_element_name = text.scan(/^(.+?)\[/)[0][0]
|
28
28
|
text = text.scan(/\[(.+?)\]/)[0][0]
|
29
29
|
end
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
30
|
+
if final_element_name
|
31
|
+
text = Regexp.escape(text) if text.is_a? String
|
32
|
+
result = SharedUtils.traverse_for_match(doc,/#{text}/)[index]
|
33
|
+
result = XPathUtils.traverse_up_until_name(result,final_element_name)
|
34
|
+
else
|
35
|
+
text = Regexp.escape(text) if text.is_a? String
|
36
|
+
result = SharedUtils.traverse_for_match(doc,/^#{text}$/)[index]
|
37
|
+
end
|
34
38
|
end
|
35
39
|
end #End of class SimpleExampleLookup
|
36
40
|
end #End of module Scrubyt
|