scrubyt 0.3.0 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -6
- data/Rakefile +22 -10
- data/lib/scrubyt.rb +9 -4
- data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
- data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/pattern.rb +25 -18
- data/lib/scrubyt/core/shared/extractor.rb +109 -128
- data/lib/scrubyt/logging.rb +146 -8
- data/lib/scrubyt/output/export.rb +60 -44
- data/lib/scrubyt/output/result_node.rb +34 -3
- data/lib/scrubyt/output/scrubyt_result.rb +18 -9
- data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
- metadata +52 -6
- data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67
data/lib/scrubyt/logging.rb
CHANGED
@@ -1,16 +1,154 @@
|
|
1
|
+
#
|
2
|
+
# TODO: if multiline messages aren't needed, then remove them.
|
3
|
+
#
|
4
|
+
# TODO: switch to the conventional Ruby logger interface,
|
5
|
+
# or create an adapter to it. If the former, then decide what to
|
6
|
+
# do with the unit tests.
|
7
|
+
#
|
8
|
+
|
1
9
|
module Scrubyt
  # Logging is disabled by default. It can be enabled as follows:
  #
  #   Scrubyt.logger = Scrubyt::Logger.new # logs *all* messages to STDERR
  #
  # Assigning nil makes Scrubyt.log a no-op again.
  def self.logger=(logger)
    @logger = logger
  end

  # Simple logger implementation, based on Scrubyt's original logging style.
  # Messages will be sent to STDERR unless another object responding to
  # +puts+ is assigned through +output_stream=+. Logging can be limited to
  # certain message levels by specifying them on initialization, e.g.
  #
  #   Scrubyt::Logger.new(:ACTION, :ERROR) # will only log action/error messages
  #
  class Logger
    # A single-line log entry, rendered as "[LEVEL] text".
    class Message
      # Both arguments are converted with +to_s+.
      def initialize(level, text)
        @level, @text = level.to_s, text.to_s
      end

      def to_s
        prefix + @text
      end

      protected

      # The "[LEVEL] " marker placed in front of the first line; memoized.
      def prefix
        @prefix ||= "[#{@level}] "
      end
    end

    # A log entry built from an array of lines. The first line carries the
    # level prefix; the remaining lines are indented by the prefix width so
    # that the text lines up.
    class MultiLineMessage < Message
      def initialize(level, lines)
        # Destructure instead of calling lines.shift so that the caller's
        # array is left untouched.
        first, *rest = lines
        super level, first

        @lines = rest
      end

      def to_s
        first_line = super
        # Joining one flat list avoids a dangling "\n" when there are no
        # continuation lines.
        ([first_line] + indented_lines).join("\n")
      end

      private

      # Continuation lines, each padded to the width of the prefix.
      def indented_lines
        @lines.map { |line| indented(line) }
      end

      def indented(line)
        ' ' * prefix.length + line.to_s
      end
    end

    # levels:: the message levels to log; when none are given every level
    #          is logged.
    def initialize(*levels)
      @levels = levels
    end

    # Writes +message+ to the output stream if +level+ is enabled. An Array
    # message is rendered as a multi-line entry, anything else as a single
    # line.
    def log(level, message)
      return unless logging?(level)

      message_class = message.is_a?(Array) ? MultiLineMessage : Message

      output_stream.puts message_class.new(level, message)
    end

    # The stream messages are written to; STDERR unless one was assigned.
    def output_stream
      @output_stream || STDERR
    end

    attr_writer :output_stream

    private

    def logging?(level)
      @levels.empty? || @levels.include?(level)
    end
  end

  # Forwards +message+ to the configured logger; does nothing when no logger
  # has been set.
  def self.log(level, message)
    return if logger.nil?

    logger.log(level, message)
  end

  # The currently configured logger, or nil when logging is disabled.
  #
  # NOTE: a bare `private` does not apply to methods defined with
  # `def self.`, so this reader has always been publicly callable. It is
  # deliberately left public here so existing callers keep working.
  def self.logger
    @logger
  end
end
|
100
|
+
|
101
|
+
|
102
|
+
if __FILE__ == $0 then
|
103
|
+
|
104
|
+
require 'test/unit'
|
105
|
+
|
106
|
+
class ScrubytLoggingTestCase < Test::Unit::TestCase
|
107
|
+
class FauxOutputStream < Array
|
108
|
+
def puts(object)
|
109
|
+
self << object.to_s
|
10
110
|
end
|
11
|
-
else
|
12
|
-
puts pre + message.to_s
|
13
111
|
end
|
14
112
|
|
113
|
+
def setup_logger_with_faux_output_stream!(*logger_args)
|
114
|
+
@stream = FauxOutputStream.new
|
115
|
+
logger = Scrubyt::Logger.new(*logger_args)
|
116
|
+
logger.output_stream = @stream
|
117
|
+
Scrubyt.logger = logger
|
118
|
+
end
|
119
|
+
|
120
|
+
def test_that_logging_works_with_nil_logger
|
121
|
+
Scrubyt.logger = nil
|
122
|
+
assert_nothing_raised { Scrubyt.log(:ERROR, 'message') }
|
123
|
+
end
|
124
|
+
|
125
|
+
def test_simple_messages_are_output_correctly
|
126
|
+
setup_logger_with_faux_output_stream!
|
127
|
+
|
128
|
+
Scrubyt.log :ACTION, 'i just did something'
|
129
|
+
|
130
|
+
assert_equal 1, @stream.size
|
131
|
+
assert_equal '[ACTION] i just did something', @stream.first
|
132
|
+
end
|
133
|
+
|
134
|
+
def test_that_multiline_messages_are_output_correctly
|
135
|
+
setup_logger_with_faux_output_stream!
|
136
|
+
|
137
|
+
Scrubyt.log :ERROR, ['something bad happened', 'dear oh dear']
|
138
|
+
|
139
|
+
assert_equal 1, @stream.size
|
140
|
+
assert_equal "[ERROR] something bad happened\n dear oh dear", @stream.first
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_that_loggers_can_be_limited_to_specfied_message_levels
|
144
|
+
setup_logger_with_faux_output_stream! :ERROR
|
145
|
+
|
146
|
+
Scrubyt.log :ACTION, 'i just did something'
|
147
|
+
Scrubyt.log :ERROR, 'something bad happened'
|
148
|
+
|
149
|
+
assert_equal 1, @stream.size
|
150
|
+
assert_equal '[ERROR] something bad happened', @stream.first
|
151
|
+
end
|
15
152
|
end
|
153
|
+
|
16
154
|
end
|
@@ -61,47 +61,70 @@ module Scrubyt
|
|
61
61
|
#This snippet will export the extractor to a file named 'my_super_camera_extractor.rb'.
|
62
62
|
#After running 'my_super_camera_extractor.rb', the result will be dumped to the file
|
63
63
|
#'/home/peter/stuff/result.xml'.
|
64
|
-
def self.export(
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
64
|
+
def self.export(root_patterns, data)
|
65
|
+
wrapper_name = data[:wrapper_name]
|
66
|
+
template = data[:template] || 'default'
|
67
|
+
output_file_name = data[:output_file_name]
|
68
|
+
extractor_result_file_name = data[:extractor_result_file_name]
|
69
69
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
70
|
+
case template
|
71
|
+
when 'default'
|
72
|
+
header = "require 'rubygems'\n"
|
73
|
+
header += "require 'scrubyt'\n\n"
|
74
|
+
header += "#{wrapper_name} = Scrubyt::Extractor.define do\n"
|
75
|
+
if extractor_result_file_name
|
76
|
+
footer = "\nend\n\n#{wrapper_name}.to_xml.write(open('#{extractor_result_file_name}', 'w'), 1)"
|
77
|
+
else
|
78
|
+
footer = "\nend\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
|
79
|
+
end
|
80
|
+
indent = 1
|
81
|
+
when 'lambda'
|
82
|
+
header = "lambda do\n"
|
83
|
+
footer = "\nend"
|
84
|
+
indent = 1
|
85
|
+
when 'none'
|
86
|
+
header = ''
|
87
|
+
footer = ''
|
88
|
+
indent = 0
|
89
|
+
else
|
90
|
+
puts "Unknown template type: #{template}"
|
91
|
+
return
|
92
|
+
end
|
93
|
+
|
94
|
+
header = data[:header] if data[:header]
|
95
|
+
footer = data[:footer] if data[:footer]
|
96
|
+
indent = data[:indent] if data[:indent]
|
79
97
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
98
|
+
sexp = [:block]
|
99
|
+
sexp << export_extractor(root_patterns, data[:source_proc], wrapper_name)
|
100
|
+
extractor = RubyToRuby.new.process(sexp).gsub('"' + data[:source_file] + '"', '__FILE__')
|
101
|
+
extractor = extractor.strip
|
102
|
+
extractor = extractor.split("\n").map{ |l| (' ' * indent) + l }.join("\n")
|
84
103
|
|
85
|
-
|
86
|
-
|
104
|
+
result = header + extractor + footer
|
105
|
+
|
106
|
+
if output_file_name
|
107
|
+
open(output_file_name, 'w') do |file|
|
108
|
+
file.write(result)
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
result
|
87
113
|
end
|
88
114
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
else
|
93
|
-
create_sexp "#{wrapper_name}.to_xml.write($stdout, 1)"
|
94
|
-
end
|
115
|
+
private
|
116
|
+
def self.create_sexp(code)
|
117
|
+
(ParseTree.new.parse_tree_for_string(code))[0]
|
95
118
|
end
|
96
119
|
|
97
|
-
def self.export_extractor(
|
120
|
+
def self.export_extractor(root_patterns, source_proc, wrapper_name)
|
98
121
|
# filter actions before and after pattern
|
99
122
|
pre_pattern_sexp = []
|
100
123
|
post_pattern_sexp = []
|
101
|
-
|
102
|
-
actions = ['next_page', *NavigationActions
|
124
|
+
patterns_passed = false
|
125
|
+
actions = ['next_page', *NavigationActions.instance_methods]
|
103
126
|
|
104
|
-
|
127
|
+
source_proc.to_sexp[3][1..-1].each do |sexp|
|
105
128
|
get_call = lambda { |sexp|
|
106
129
|
if sexp[0] == :fcall
|
107
130
|
return sexp[1].to_s
|
@@ -113,29 +136,22 @@ private
|
|
113
136
|
}
|
114
137
|
call = get_call.call(sexp)
|
115
138
|
if(call.nil? || actions.index(call) != nil)
|
116
|
-
if !
|
139
|
+
if !patterns_passed
|
117
140
|
pre_pattern_sexp.push(sexp)
|
118
141
|
else
|
119
142
|
post_pattern_sexp.push(sexp)
|
120
143
|
end
|
121
144
|
else
|
122
|
-
|
123
|
-
pattern_skipped = true
|
145
|
+
patterns_passed = true
|
124
146
|
end
|
125
147
|
end
|
126
|
-
|
127
|
-
# build extractor content
|
128
|
-
inner_block = [:block]
|
129
|
-
inner_block.push([:block, *pre_pattern_sexp])
|
130
|
-
inner_block.push([:block, export_pattern(root_pattern)])
|
131
|
-
inner_block.push([:block, *post_pattern_sexp])
|
132
148
|
|
133
149
|
# build extractor
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
150
|
+
sexp = [:block]
|
151
|
+
sexp.push([:block, *pre_pattern_sexp])
|
152
|
+
sexp.push([:block, *root_patterns.to_sexp_array])
|
153
|
+
sexp.push([:block, *post_pattern_sexp])
|
154
|
+
sexp
|
139
155
|
end
|
140
156
|
end
|
141
157
|
end
|
@@ -20,7 +20,7 @@ module Scrubyt
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def to_s
|
23
|
-
text = (@result.is_a? String) ? @result : @result.
|
23
|
+
text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
|
24
24
|
text = SharedUtils.unescape_entities(text)
|
25
25
|
text.strip!
|
26
26
|
text
|
@@ -38,10 +38,10 @@ module Scrubyt
|
|
38
38
|
to_xml_lines.join("\n")
|
39
39
|
end
|
40
40
|
|
41
|
-
def to_hash
|
41
|
+
def to_hash(delimiter=',')
|
42
42
|
result = []
|
43
43
|
flat_hash_inner = lambda {|e, hash|
|
44
|
-
hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] +
|
44
|
+
hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + delimiter + e.to_s : e.to_s if e.write_text && !e.to_s.empty?
|
45
45
|
e.each {|c| flat_hash_inner.call(c, hash) }
|
46
46
|
hash
|
47
47
|
}
|
@@ -49,6 +49,37 @@ module Scrubyt
|
|
49
49
|
result
|
50
50
|
end
|
51
51
|
|
52
|
+
def to_flat_xml(delimiter=nil)
|
53
|
+
lines = []
|
54
|
+
hash_result = delimiter ? self.to_hash(delimiter) : self.to_hash
|
55
|
+
|
56
|
+
if delimiter
|
57
|
+
result_sets = hash_result[0].values.map!{|x| x.split(delimiter)}.transpose
|
58
|
+
final_result = []
|
59
|
+
|
60
|
+
result_sets.each do |rs|
|
61
|
+
temp_result = {}
|
62
|
+
hash_result[0].keys.each do |k|
|
63
|
+
temp_result[k] = rs[hash_result[0].keys.index(k)]
|
64
|
+
end
|
65
|
+
final_result << temp_result
|
66
|
+
end
|
67
|
+
hash_result = final_result
|
68
|
+
end
|
69
|
+
|
70
|
+
hash_result.each do |hash|
|
71
|
+
lines << "<item>"
|
72
|
+
hash.each do |key, value|
|
73
|
+
xml_tag = key.to_s
|
74
|
+
value = '' if value == '#empty#'
|
75
|
+
lines << " <#{xml_tag}>#{REXML::Text.normalize(value)}</#{xml_tag}>"
|
76
|
+
end
|
77
|
+
lines << "</item>"
|
78
|
+
end
|
79
|
+
return lines.join("\n")
|
80
|
+
|
81
|
+
end
|
82
|
+
|
52
83
|
def to_xml_lines
|
53
84
|
lines = []
|
54
85
|
children = self.select{ |child| child.has_content? }
|
@@ -1,13 +1,16 @@
|
|
1
1
|
module Scrubyt
|
2
2
|
class ScrubytResult < ResultNode
|
3
|
-
attr_accessor :
|
3
|
+
attr_accessor :root_patterns, :source_file, :source_proc
|
4
4
|
|
5
5
|
# Exports the extractor behind this result, dispatching on the type of +arg1+:
#
# * a String naming an existing file is handed to +export_old1+,
# * any other String is handed to +export_old2+,
# * anything else is handed unchanged to +export_new+ (the two optional
#   arguments are ignored in that case).
#
# Returns whatever the selected method returns.
def export(arg1, output_file_name=nil, extractor_result_file_name=nil)
  return export_new(arg1) unless arg1.is_a? String

  # File.exist? is used because the plural alias File.exists? was deprecated
  # and has been removed from current Ruby versions.
  if File.exist? arg1
    export_old1(arg1, output_file_name, extractor_result_file_name)
  else
    export_old2(arg1, output_file_name, extractor_result_file_name)
  end
end
|
13
16
|
|
@@ -15,14 +18,20 @@ module Scrubyt
|
|
15
18
|
#Implement me...
|
16
19
|
end
|
17
20
|
|
18
|
-
def
|
21
|
+
# Legacy export entry point that takes the path of an extractor source file.
#
# Reads +input_file+, takes the name assigned from the first
# "<name> = ...Extractor.define" statement found in it, and delegates to
# +export_old2+ with that name and the two optional file names.
#
# Raises ArgumentError when the file contains no such statement.
def export_old1(input_file, output_file_name=nil, extractor_result_file_name=nil)
  # File.read closes the handle itself; the previous open(...).read left it
  # for the garbage collector.
  contents = File.read(input_file)
  match = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)[0]
  if match.nil?
    raise ArgumentError, "no 'Extractor.define' assignment found in #{input_file}"
  end

  export_old2(match[0], output_file_name, extractor_result_file_name)
end
|
23
26
|
|
24
|
-
def
|
25
|
-
|
27
|
+
def export_old2(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
|
28
|
+
export_new({ :wrapper_name => wrapper_name, :output_file_name => output_file_name || "#{wrapper_name}_extractor_export.rb", :extractor_result_file_name => extractor_result_file_name })
|
29
|
+
end
|
30
|
+
|
31
|
+
# Hash-based export entry point.
#
# Adds this result's source file and source proc to the options in +data+
# (under :source_file and :source_proc) and passes them, together with the
# root patterns, to Scrubyt::Export.export. Returns that call's result.
def export_new(data)
  # Merge into a copy so the caller's hash is not modified as a side effect.
  options = data.merge(:source_file => @source_file, :source_proc => @source_proc)
  Scrubyt::Export.export(@root_patterns, options)
end
|
27
36
|
end
|
28
37
|
end
|
@@ -43,7 +43,7 @@ private
|
|
43
43
|
end
|
44
44
|
|
45
45
|
def self.refine_partial_results(regexp)
|
46
|
-
@partial_results = @partial_results.select {|pr| pr.
|
46
|
+
@partial_results = @partial_results.select {|pr| pr.inner_html.gsub(/<.*?>/, '') =~ regexp}
|
47
47
|
end
|
48
48
|
|
49
49
|
end #End of class CompoundExampleLookup
|
@@ -28,7 +28,7 @@ module Scrubyt
|
|
28
28
|
def self.traverse_for_match(node, regexp)
|
29
29
|
results = []
|
30
30
|
traverse_for_match_inner = lambda { |node, regexp|
|
31
|
-
ft = prepare_text_for_comparison(node.
|
31
|
+
ft = prepare_text_for_comparison(node.inner_html.gsub(/<.*?>/, ''))
|
32
32
|
if ft =~ regexp
|
33
33
|
node.instance_eval do
|
34
34
|
@match_data = $~
|
@@ -16,7 +16,7 @@ module Scrubyt
|
|
16
16
|
#In this case, <a>'s text is considered to be "Bon nuit, monsieur"
|
17
17
|
def self.find_node_from_text(doc, text, next_link=false, index = 0)
|
18
18
|
text.gsub!('»', '»')
|
19
|
-
#Process immediate attribute extraction (like "go to google.com
|
19
|
+
#Process immediate attribute extraction (like "go to google.com/@href")
|
20
20
|
if text =~ /.+\/@.+$/
|
21
21
|
text = text.scan(/^(.+?)\/@.+$/)[0][0]
|
22
22
|
elsif text =~ /.+\[\d+\]$/
|
@@ -27,10 +27,14 @@ module Scrubyt
|
|
27
27
|
final_element_name = text.scan(/^(.+?)\[/)[0][0]
|
28
28
|
text = text.scan(/\[(.+?)\]/)[0][0]
|
29
29
|
end
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
30
|
+
if final_element_name
|
31
|
+
text = Regexp.escape(text) if text.is_a? String
|
32
|
+
result = SharedUtils.traverse_for_match(doc,/#{text}/)[index]
|
33
|
+
result = XPathUtils.traverse_up_until_name(result,final_element_name)
|
34
|
+
else
|
35
|
+
text = Regexp.escape(text) if text.is_a? String
|
36
|
+
result = SharedUtils.traverse_for_match(doc,/^#{text}$/)[index]
|
37
|
+
end
|
34
38
|
end
|
35
39
|
end #End of class SimpleExampleLookup
|
36
40
|
end #End of module Scrubyt
|