scrubyt 0.3.0 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,154 @@
1
+ #
2
+ # TODO: if multiline messages aren't needed, then remove them.
3
+ #
4
+ # TODO: switch to the conventional Ruby logger interface,
5
+ # or create an adapter to it. If the former, then decide what to
6
+ # do with the unit tests.
7
+ #
8
+
1
9
  module Scrubyt
2
- def self.log(message_type, message)
10
+ # Logging is disabled by default. It can be enabled as follows:
11
+ #
12
+ # Scrubyt.logger = Scrubyt::Logger.new # logs *all* messages to STDERR
13
+ #
14
+ def self.logger=(logger)
15
+ @logger = logger
16
+ end
17
+
18
+ # Simple logger implementation, based on Scrubyt's original logging style.
19
+ # Messages will be sent to STDERR. Logging can be limited to certain message
20
+ # levels by specifying them on initialization, e.g.
21
+ #
22
+ # Scrubyt::Logger.new(:ACTION, :ERROR) # will only log action/error messages
23
+ #
24
+ class Logger
25
+ class Message
26
+ def initialize(level, text)
27
+ @level, @text = level.to_s, text.to_s
28
+ end
29
+
30
+ def to_s
31
+ prefix + @text
32
+ end
33
+
34
+ protected
35
+
36
+ def prefix
37
+ @prefix ||= "[#{@level}] "
38
+ end
39
+ end
40
+
41
+ class MultiLineMessage < Message
42
+ def initialize(level, lines)
43
+ super level, lines.shift
44
+
45
+ @lines = lines
46
+ end
47
+
48
+ def to_s
49
+ [ super, indented_lines ] * "\n"
50
+ end
51
+
52
+ private
53
+
54
+ def indented_lines
55
+ @lines.inject([]) { |lines, line| lines << indented(line) } * "\n"
56
+ end
57
+
58
+ def indented(line)
59
+ ' ' * prefix.length + line
60
+ end
61
+ end
62
+
63
+ def initialize(*levels)
64
+ @levels = levels
65
+ end
66
+
67
+ def log(level, message)
68
+ return unless logging?(level)
69
+
70
+ message_class = message.is_a?(Array) ? MultiLineMessage : Message
71
+
72
+ output_stream.puts message_class.new(level, message)
73
+ end
74
+
75
+ def output_stream
76
+ @output_stream || STDERR
77
+ end
3
78
 
4
- pre = "[#{message_type}] "
79
+ attr_writer :output_stream
5
80
 
6
- if message.is_a? Array
7
- puts pre + message.first
8
- message[1..-1].each do |line|
9
- puts ' ' * pre.length + line
81
+ private
82
+
83
+ def logging?(level)
84
+ @levels.empty? || @levels.include?(level)
85
+ end
86
+ end
87
+
88
+ def self.log(level, message)
89
+ return if logger.nil?
90
+
91
+ logger.log(level, message)
92
+ end
93
+
94
+ private
95
+
96
+ def self.logger
97
+ @logger
98
+ end
99
+ end
100
+
101
+
102
+ if __FILE__ == $0 then
103
+
104
+ require 'test/unit'
105
+
106
+ class ScrubytLoggingTestCase < Test::Unit::TestCase
107
+ class FauxOutputStream < Array
108
+ def puts(object)
109
+ self << object.to_s
10
110
  end
11
- else
12
- puts pre + message.to_s
13
111
  end
14
112
 
113
+ def setup_logger_with_faux_output_stream!(*logger_args)
114
+ @stream = FauxOutputStream.new
115
+ logger = Scrubyt::Logger.new(*logger_args)
116
+ logger.output_stream = @stream
117
+ Scrubyt.logger = logger
118
+ end
119
+
120
+ def test_that_logging_works_with_nil_logger
121
+ Scrubyt.logger = nil
122
+ assert_nothing_raised { Scrubyt.log(:ERROR, 'message') }
123
+ end
124
+
125
+ def test_simple_messages_are_output_correctly
126
+ setup_logger_with_faux_output_stream!
127
+
128
+ Scrubyt.log :ACTION, 'i just did something'
129
+
130
+ assert_equal 1, @stream.size
131
+ assert_equal '[ACTION] i just did something', @stream.first
132
+ end
133
+
134
+ def test_that_multiline_messages_are_output_correctly
135
+ setup_logger_with_faux_output_stream!
136
+
137
+ Scrubyt.log :ERROR, ['something bad happened', 'dear oh dear']
138
+
139
+ assert_equal 1, @stream.size
140
+ assert_equal "[ERROR] something bad happened\n dear oh dear", @stream.first
141
+ end
142
+
143
+ def test_that_loggers_can_be_limited_to_specfied_message_levels
144
+ setup_logger_with_faux_output_stream! :ERROR
145
+
146
+ Scrubyt.log :ACTION, 'i just did something'
147
+ Scrubyt.log :ERROR, 'something bad happened'
148
+
149
+ assert_equal 1, @stream.size
150
+ assert_equal '[ERROR] something bad happened', @stream.first
151
+ end
15
152
  end
153
+
16
154
  end
@@ -61,47 +61,70 @@ module Scrubyt
61
61
  #This snippet will export the extractor to a file named 'my_super_camera_extractor.rb'.
62
62
  #After running 'my_super_camera_extractor.rb', the result will be dumped to the file
63
63
  #'/home/peter/stuff/result.xml'.
64
- def self.export(root_pattern, wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
65
- sexp = [:block]
66
- sexp << export_header(wrapper_name)
67
- sexp << export_extractor(root_pattern, wrapper_name)
68
- sexp << export_footer(wrapper_name, extractor_result_file_name)
64
+ def self.export(root_patterns, data)
65
+ wrapper_name = data[:wrapper_name]
66
+ template = data[:template] || 'default'
67
+ output_file_name = data[:output_file_name]
68
+ extractor_result_file_name = data[:extractor_result_file_name]
69
69
 
70
- result = RubyToRuby.new.process(sexp)
71
- result.gsub! '"' + root_pattern.source_file + '"', '__FILE__'
72
-
73
- output_file_name ||= "#{wrapper_name}_extractor_export.rb"
74
- output_file = open(output_file_name, 'w')
75
- output_file.write(result)
76
- output_file.close
77
- result
78
- end
70
+ case template
71
+ when 'default'
72
+ header = "require 'rubygems'\n"
73
+ header += "require 'scrubyt'\n\n"
74
+ header += "#{wrapper_name} = Scrubyt::Extractor.define do\n"
75
+ if extractor_result_file_name
76
+ footer = "\nend\n\n#{wrapper_name}.to_xml.write(open('#{extractor_result_file_name}', 'w'), 1)"
77
+ else
78
+ footer = "\nend\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
79
+ end
80
+ indent = 1
81
+ when 'lambda'
82
+ header = "lambda do\n"
83
+ footer = "\nend"
84
+ indent = 1
85
+ when 'none'
86
+ header = ''
87
+ footer = ''
88
+ indent = 0
89
+ else
90
+ puts "Unknown template type: #{template}"
91
+ return
92
+ end
93
+
94
+ header = data[:header] if data[:header]
95
+ footer = data[:footer] if data[:footer]
96
+ indent = data[:indent] if data[:indent]
79
97
 
80
- private
81
- def self.create_sexp(code)
82
- (ParseTree.new.parse_tree_for_string(code))[0]
83
- end
98
+ sexp = [:block]
99
+ sexp << export_extractor(root_patterns, data[:source_proc], wrapper_name)
100
+ extractor = RubyToRuby.new.process(sexp).gsub('"' + data[:source_file] + '"', '__FILE__')
101
+ extractor = extractor.strip
102
+ extractor = extractor.split("\n").map{ |l| (' ' * indent) + l }.join("\n")
84
103
 
85
- def self.export_header(wrapper_name)
86
- create_sexp "require 'rubygems'; require 'scrubyt'"
104
+ result = header + extractor + footer
105
+
106
+ if output_file_name
107
+ open(output_file_name, 'w') do |file|
108
+ file.write(result)
109
+ end
110
+ end
111
+
112
+ result
87
113
  end
88
114
 
89
- def self.export_footer(wrapper_name, extractor_result_file_name)
90
- if extractor_result_file_name
91
- create_sexp "#{wrapper_name}.to_xml.write(open('result_of_exported_extractor.xml', 'w'), 1)"
92
- else
93
- create_sexp "#{wrapper_name}.to_xml.write($stdout, 1)"
94
- end
115
+ private
116
+ def self.create_sexp(code)
117
+ (ParseTree.new.parse_tree_for_string(code))[0]
95
118
  end
96
119
 
97
- def self.export_extractor(root_pattern, wrapper_name)
120
+ def self.export_extractor(root_patterns, source_proc, wrapper_name)
98
121
  # filter actions before and after pattern
99
122
  pre_pattern_sexp = []
100
123
  post_pattern_sexp = []
101
- pattern_skipped = false
102
- actions = ['next_page', *NavigationActions::KEYWORDS]
124
+ patterns_passed = false
125
+ actions = ['next_page', *NavigationActions.instance_methods]
103
126
 
104
- root_pattern.source_proc.to_sexp[3][1..-1].each do |sexp|
127
+ source_proc.to_sexp[3][1..-1].each do |sexp|
105
128
  get_call = lambda { |sexp|
106
129
  if sexp[0] == :fcall
107
130
  return sexp[1].to_s
@@ -113,29 +136,22 @@ private
113
136
  }
114
137
  call = get_call.call(sexp)
115
138
  if(call.nil? || actions.index(call) != nil)
116
- if !pattern_skipped
139
+ if !patterns_passed
117
140
  pre_pattern_sexp.push(sexp)
118
141
  else
119
142
  post_pattern_sexp.push(sexp)
120
143
  end
121
144
  else
122
- raise "Second pattern tree found while exporting." if pattern_skipped
123
- pattern_skipped = true
145
+ patterns_passed = true
124
146
  end
125
147
  end
126
-
127
- # build extractor content
128
- inner_block = [:block]
129
- inner_block.push([:block, *pre_pattern_sexp])
130
- inner_block.push([:block, export_pattern(root_pattern)])
131
- inner_block.push([:block, *post_pattern_sexp])
132
148
 
133
149
  # build extractor
134
- [:block, [:lasgn, wrapper_name, [:iter, [:call, [:colon2, [:const, :Scrubyt], :Extractor], :define], nil, inner_block]]]
135
- end
136
-
137
- def self.export_pattern(root_pattern)
138
- root_pattern.to_sexp
150
+ sexp = [:block]
151
+ sexp.push([:block, *pre_pattern_sexp])
152
+ sexp.push([:block, *root_patterns.to_sexp_array])
153
+ sexp.push([:block, *post_pattern_sexp])
154
+ sexp
139
155
  end
140
156
  end
141
157
  end
@@ -20,7 +20,7 @@ module Scrubyt
20
20
  end
21
21
 
22
22
  def to_s
23
- text = (@result.is_a? String) ? @result : @result.inner_text
23
+ text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
24
24
  text = SharedUtils.unescape_entities(text)
25
25
  text.strip!
26
26
  text
@@ -38,10 +38,10 @@ module Scrubyt
38
38
  to_xml_lines.join("\n")
39
39
  end
40
40
 
41
- def to_hash
41
+ def to_hash(delimiter=',')
42
42
  result = []
43
43
  flat_hash_inner = lambda {|e, hash|
44
- hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + "," + e.to_s : e.to_s if e.write_text && !e.to_s.empty?
44
+ hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + delimiter + e.to_s : e.to_s if e.write_text && !e.to_s.empty?
45
45
  e.each {|c| flat_hash_inner.call(c, hash) }
46
46
  hash
47
47
  }
@@ -49,6 +49,37 @@ module Scrubyt
49
49
  result
50
50
  end
51
51
 
52
+ def to_flat_xml(delimiter=nil)
53
+ lines = []
54
+ hash_result = delimiter ? self.to_hash(delimiter) : self.to_hash
55
+
56
+ if delimiter
57
+ result_sets = hash_result[0].values.map!{|x| x.split(delimiter)}.transpose
58
+ final_result = []
59
+
60
+ result_sets.each do |rs|
61
+ temp_result = {}
62
+ hash_result[0].keys.each do |k|
63
+ temp_result[k] = rs[hash_result[0].keys.index(k)]
64
+ end
65
+ final_result << temp_result
66
+ end
67
+ hash_result = final_result
68
+ end
69
+
70
+ hash_result.each do |hash|
71
+ lines << "<item>"
72
+ hash.each do |key, value|
73
+ xml_tag = key.to_s
74
+ value = '' if value == '#empty#'
75
+ lines << " <#{xml_tag}>#{REXML::Text.normalize(value)}</#{xml_tag}>"
76
+ end
77
+ lines << "</item>"
78
+ end
79
+ return lines.join("\n")
80
+
81
+ end
82
+
52
83
  def to_xml_lines
53
84
  lines = []
54
85
  children = self.select{ |child| child.has_content? }
@@ -1,13 +1,16 @@
1
1
  module Scrubyt
2
2
  class ScrubytResult < ResultNode
3
- attr_accessor :root_pattern
3
+ attr_accessor :root_patterns, :source_file, :source_proc
4
4
 
5
5
  def export(arg1, output_file_name=nil, extractor_result_file_name=nil)
6
- # require 'scrubyt/output/export_old'; Scrubyt::ExportOld.export(arg1, self, output_file_name, extractor_result_file_name) ; return
7
- if File.exists? arg1
8
- old_export(arg1, output_file_name, extractor_result_file_name)
6
+ if arg1.is_a? String
7
+ if File.exists? arg1
8
+ export_old1(arg1, output_file_name, extractor_result_file_name)
9
+ else
10
+ export_old2(arg1, output_file_name, extractor_result_file_name)
11
+ end
9
12
  else
10
- new_export(arg1, output_file_name, extractor_result_file_name)
13
+ export_new(arg1)
11
14
  end
12
15
  end
13
16
 
@@ -15,14 +18,20 @@ module Scrubyt
15
18
  #Implement me...
16
19
  end
17
20
 
18
- def old_export(input_file, output_file_name=nil, extractor_result_file_name=nil)
21
+ def export_old1(input_file, output_file_name=nil, extractor_result_file_name=nil)
19
22
  contents = open(input_file).read
20
23
  wrapper_name = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)[0][0]
21
- Scrubyt::Export.export(@root_pattern, wrapper_name, output_file_name, extractor_result_file_name)
24
+ export_old2(wrapper_name, output_file_name, extractor_result_file_name)
22
25
  end
23
26
 
24
- def new_export(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
25
- Scrubyt::Export.export(@root_pattern, wrapper_name, output_file_name, extractor_result_file_name)
27
+ def export_old2(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
28
+ export_new({ :wrapper_name => wrapper_name, :output_file_name => output_file_name || "#{wrapper_name}_extractor_export.rb", :extractor_result_file_name => extractor_result_file_name })
29
+ end
30
+
31
+ def export_new(data)
32
+ data[:source_file] = @source_file
33
+ data[:source_proc] = @source_proc
34
+ Scrubyt::Export.export(@root_patterns, data)
26
35
  end
27
36
  end
28
37
  end
@@ -43,7 +43,7 @@ private
43
43
  end
44
44
 
45
45
  def self.refine_partial_results(regexp)
46
- @partial_results = @partial_results.select {|pr| pr.inner_text =~ regexp}
46
+ @partial_results = @partial_results.select {|pr| pr.inner_html.gsub(/<.*?>/, '') =~ regexp}
47
47
  end
48
48
 
49
49
  end #End of class CompoundExampleLookup
@@ -28,7 +28,7 @@ module Scrubyt
28
28
  def self.traverse_for_match(node, regexp)
29
29
  results = []
30
30
  traverse_for_match_inner = lambda { |node, regexp|
31
- ft = prepare_text_for_comparison(node.inner_text)
31
+ ft = prepare_text_for_comparison(node.inner_html.gsub(/<.*?>/, ''))
32
32
  if ft =~ regexp
33
33
  node.instance_eval do
34
34
  @match_data = $~
@@ -16,7 +16,7 @@ module Scrubyt
16
16
  #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
17
17
  def self.find_node_from_text(doc, text, next_link=false, index = 0)
18
18
  text.gsub!('»', '&#187;')
19
- #Process immediate attribute extraction (like "go to google.com[@href]")
19
+ #Process immediate attribute extraction (like "go to google.com/@href")
20
20
  if text =~ /.+\/@.+$/
21
21
  text = text.scan(/^(.+?)\/@.+$/)[0][0]
22
22
  elsif text =~ /.+\[\d+\]$/
@@ -27,10 +27,14 @@ module Scrubyt
27
27
  final_element_name = text.scan(/^(.+?)\[/)[0][0]
28
28
  text = text.scan(/\[(.+?)\]/)[0][0]
29
29
  end
30
- text = Regexp.escape(text) if text.is_a? String
31
- result = SharedUtils.traverse_for_match(doc,/^#{text}$/)[index]
32
- result = XPathUtils.traverse_up_until_name(result,final_element_name) if final_element_name
33
- result
30
+ if final_element_name
31
+ text = Regexp.escape(text) if text.is_a? String
32
+ result = SharedUtils.traverse_for_match(doc,/#{text}/)[index]
33
+ result = XPathUtils.traverse_up_until_name(result,final_element_name)
34
+ else
35
+ text = Regexp.escape(text) if text.is_a? String
36
+ result = SharedUtils.traverse_for_match(doc,/^#{text}$/)[index]
37
+ end
34
38
  end
35
39
  end #End of class SimpleExampleLookup
36
40
  end #End of module Scrubyt