scrubyt 0.3.0 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,16 +1,154 @@
1
+ #
2
+ # TODO: if multiline messages aren't needed, then remove them.
3
+ #
4
+ # TODO: switch to the conventional Ruby logger interface,
5
+ # or create an adapter to it. If the former, then decide what to
6
+ # do with the unit tests.
7
+ #
8
+
1
9
  module Scrubyt
2
- def self.log(message_type, message)
10
+ # Logging is disabled by default. It can be enabled as follows:
11
+ #
12
+ # Scrubyt.logger = Scrubyt::Logger.new # logs *all* messages to STDERR
13
+ #
14
+ def self.logger=(logger)
15
+ @logger = logger
16
+ end
17
+
18
+ # Simple logger implementation, based on Scrubyt's original logging style.
19
+ # Messages will be sent to STDERR. Logging can be limited to certain message
20
+ # levels by specifying them on initialization, e.g.
21
+ #
22
+ # Scrubyt::Logger.new(:ACTION, :ERROR) # will only log action/error messages
23
+ #
24
+ class Logger
25
+ class Message
26
+ def initialize(level, text)
27
+ @level, @text = level.to_s, text.to_s
28
+ end
29
+
30
+ def to_s
31
+ prefix + @text
32
+ end
33
+
34
+ protected
35
+
36
+ def prefix
37
+ @prefix ||= "[#{@level}] "
38
+ end
39
+ end
40
+
41
+ class MultiLineMessage < Message
42
+ def initialize(level, lines)
43
+ super level, lines.shift
44
+
45
+ @lines = lines
46
+ end
47
+
48
+ def to_s
49
+ [ super, indented_lines ] * "\n"
50
+ end
51
+
52
+ private
53
+
54
+ def indented_lines
55
+ @lines.inject([]) { |lines, line| lines << indented(line) } * "\n"
56
+ end
57
+
58
+ def indented(line)
59
+ ' ' * prefix.length + line
60
+ end
61
+ end
62
+
63
+ def initialize(*levels)
64
+ @levels = levels
65
+ end
66
+
67
+ def log(level, message)
68
+ return unless logging?(level)
69
+
70
+ message_class = message.is_a?(Array) ? MultiLineMessage : Message
71
+
72
+ output_stream.puts message_class.new(level, message)
73
+ end
74
+
75
+ def output_stream
76
+ @output_stream || STDERR
77
+ end
3
78
 
4
- pre = "[#{message_type}] "
79
+ attr_writer :output_stream
5
80
 
6
- if message.is_a? Array
7
- puts pre + message.first
8
- message[1..-1].each do |line|
9
- puts ' ' * pre.length + line
81
+ private
82
+
83
+ def logging?(level)
84
+ @levels.empty? || @levels.include?(level)
85
+ end
86
+ end
87
+
88
+ def self.log(level, message)
89
+ return if logger.nil?
90
+
91
+ logger.log(level, message)
92
+ end
93
+
94
+ private
95
+
96
+ def self.logger
97
+ @logger
98
+ end
99
+ end
100
+
101
+
102
+ if __FILE__ == $0 then
103
+
104
+ require 'test/unit'
105
+
106
+ class ScrubytLoggingTestCase < Test::Unit::TestCase
107
+ class FauxOutputStream < Array
108
+ def puts(object)
109
+ self << object.to_s
10
110
  end
11
- else
12
- puts pre + message.to_s
13
111
  end
14
112
 
113
+ def setup_logger_with_faux_output_stream!(*logger_args)
114
+ @stream = FauxOutputStream.new
115
+ logger = Scrubyt::Logger.new(*logger_args)
116
+ logger.output_stream = @stream
117
+ Scrubyt.logger = logger
118
+ end
119
+
120
+ def test_that_logging_works_with_nil_logger
121
+ Scrubyt.logger = nil
122
+ assert_nothing_raised { Scrubyt.log(:ERROR, 'message') }
123
+ end
124
+
125
+ def test_simple_messages_are_output_correctly
126
+ setup_logger_with_faux_output_stream!
127
+
128
+ Scrubyt.log :ACTION, 'i just did something'
129
+
130
+ assert_equal 1, @stream.size
131
+ assert_equal '[ACTION] i just did something', @stream.first
132
+ end
133
+
134
+ def test_that_multiline_messages_are_output_correctly
135
+ setup_logger_with_faux_output_stream!
136
+
137
+ Scrubyt.log :ERROR, ['something bad happened', 'dear oh dear']
138
+
139
+ assert_equal 1, @stream.size
140
+ assert_equal "[ERROR] something bad happened\n dear oh dear", @stream.first
141
+ end
142
+
143
+ def test_that_loggers_can_be_limited_to_specfied_message_levels
144
+ setup_logger_with_faux_output_stream! :ERROR
145
+
146
+ Scrubyt.log :ACTION, 'i just did something'
147
+ Scrubyt.log :ERROR, 'something bad happened'
148
+
149
+ assert_equal 1, @stream.size
150
+ assert_equal '[ERROR] something bad happened', @stream.first
151
+ end
15
152
  end
153
+
16
154
  end
@@ -61,47 +61,70 @@ module Scrubyt
61
61
  #This snippet will export the extractor to a file named 'my_super_camera_extractor.rb'.
62
62
  #After running 'my_super_camera_extractor.rb', the result will be dumped to the file
63
63
  #'/home/peter/stuff/result.xml'.
64
- def self.export(root_pattern, wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
65
- sexp = [:block]
66
- sexp << export_header(wrapper_name)
67
- sexp << export_extractor(root_pattern, wrapper_name)
68
- sexp << export_footer(wrapper_name, extractor_result_file_name)
64
+ def self.export(root_patterns, data)
65
+ wrapper_name = data[:wrapper_name]
66
+ template = data[:template] || 'default'
67
+ output_file_name = data[:output_file_name]
68
+ extractor_result_file_name = data[:extractor_result_file_name]
69
69
 
70
- result = RubyToRuby.new.process(sexp)
71
- result.gsub! '"' + root_pattern.source_file + '"', '__FILE__'
72
-
73
- output_file_name ||= "#{wrapper_name}_extractor_export.rb"
74
- output_file = open(output_file_name, 'w')
75
- output_file.write(result)
76
- output_file.close
77
- result
78
- end
70
+ case template
71
+ when 'default'
72
+ header = "require 'rubygems'\n"
73
+ header += "require 'scrubyt'\n\n"
74
+ header += "#{wrapper_name} = Scrubyt::Extractor.define do\n"
75
+ if extractor_result_file_name
76
+ footer = "\nend\n\n#{wrapper_name}.to_xml.write(open('#{extractor_result_file_name}', 'w'), 1)"
77
+ else
78
+ footer = "\nend\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
79
+ end
80
+ indent = 1
81
+ when 'lambda'
82
+ header = "lambda do\n"
83
+ footer = "\nend"
84
+ indent = 1
85
+ when 'none'
86
+ header = ''
87
+ footer = ''
88
+ indent = 0
89
+ else
90
+ puts "Unknown template type: #{template}"
91
+ return
92
+ end
93
+
94
+ header = data[:header] if data[:header]
95
+ footer = data[:footer] if data[:footer]
96
+ indent = data[:indent] if data[:indent]
79
97
 
80
- private
81
- def self.create_sexp(code)
82
- (ParseTree.new.parse_tree_for_string(code))[0]
83
- end
98
+ sexp = [:block]
99
+ sexp << export_extractor(root_patterns, data[:source_proc], wrapper_name)
100
+ extractor = RubyToRuby.new.process(sexp).gsub('"' + data[:source_file] + '"', '__FILE__')
101
+ extractor = extractor.strip
102
+ extractor = extractor.split("\n").map{ |l| (' ' * indent) + l }.join("\n")
84
103
 
85
- def self.export_header(wrapper_name)
86
- create_sexp "require 'rubygems'; require 'scrubyt'"
104
+ result = header + extractor + footer
105
+
106
+ if output_file_name
107
+ open(output_file_name, 'w') do |file|
108
+ file.write(result)
109
+ end
110
+ end
111
+
112
+ result
87
113
  end
88
114
 
89
- def self.export_footer(wrapper_name, extractor_result_file_name)
90
- if extractor_result_file_name
91
- create_sexp "#{wrapper_name}.to_xml.write(open('result_of_exported_extractor.xml', 'w'), 1)"
92
- else
93
- create_sexp "#{wrapper_name}.to_xml.write($stdout, 1)"
94
- end
115
+ private
116
+ def self.create_sexp(code)
117
+ (ParseTree.new.parse_tree_for_string(code))[0]
95
118
  end
96
119
 
97
- def self.export_extractor(root_pattern, wrapper_name)
120
+ def self.export_extractor(root_patterns, source_proc, wrapper_name)
98
121
  # filter actions before and after pattern
99
122
  pre_pattern_sexp = []
100
123
  post_pattern_sexp = []
101
- pattern_skipped = false
102
- actions = ['next_page', *NavigationActions::KEYWORDS]
124
+ patterns_passed = false
125
+ actions = ['next_page', *NavigationActions.instance_methods]
103
126
 
104
- root_pattern.source_proc.to_sexp[3][1..-1].each do |sexp|
127
+ source_proc.to_sexp[3][1..-1].each do |sexp|
105
128
  get_call = lambda { |sexp|
106
129
  if sexp[0] == :fcall
107
130
  return sexp[1].to_s
@@ -113,29 +136,22 @@ private
113
136
  }
114
137
  call = get_call.call(sexp)
115
138
  if(call.nil? || actions.index(call) != nil)
116
- if !pattern_skipped
139
+ if !patterns_passed
117
140
  pre_pattern_sexp.push(sexp)
118
141
  else
119
142
  post_pattern_sexp.push(sexp)
120
143
  end
121
144
  else
122
- raise "Second pattern tree found while exporting." if pattern_skipped
123
- pattern_skipped = true
145
+ patterns_passed = true
124
146
  end
125
147
  end
126
-
127
- # build extractor content
128
- inner_block = [:block]
129
- inner_block.push([:block, *pre_pattern_sexp])
130
- inner_block.push([:block, export_pattern(root_pattern)])
131
- inner_block.push([:block, *post_pattern_sexp])
132
148
 
133
149
  # build extractor
134
- [:block, [:lasgn, wrapper_name, [:iter, [:call, [:colon2, [:const, :Scrubyt], :Extractor], :define], nil, inner_block]]]
135
- end
136
-
137
- def self.export_pattern(root_pattern)
138
- root_pattern.to_sexp
150
+ sexp = [:block]
151
+ sexp.push([:block, *pre_pattern_sexp])
152
+ sexp.push([:block, *root_patterns.to_sexp_array])
153
+ sexp.push([:block, *post_pattern_sexp])
154
+ sexp
139
155
  end
140
156
  end
141
157
  end
@@ -20,7 +20,7 @@ module Scrubyt
20
20
  end
21
21
 
22
22
  def to_s
23
- text = (@result.is_a? String) ? @result : @result.inner_text
23
+ text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
24
24
  text = SharedUtils.unescape_entities(text)
25
25
  text.strip!
26
26
  text
@@ -38,10 +38,10 @@ module Scrubyt
38
38
  to_xml_lines.join("\n")
39
39
  end
40
40
 
41
- def to_hash
41
+ def to_hash(delimiter=',')
42
42
  result = []
43
43
  flat_hash_inner = lambda {|e, hash|
44
- hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + "," + e.to_s : e.to_s if e.write_text && !e.to_s.empty?
44
+ hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + delimiter + e.to_s : e.to_s if e.write_text && !e.to_s.empty?
45
45
  e.each {|c| flat_hash_inner.call(c, hash) }
46
46
  hash
47
47
  }
@@ -49,6 +49,37 @@ module Scrubyt
49
49
  result
50
50
  end
51
51
 
52
+ def to_flat_xml(delimiter=nil)
53
+ lines = []
54
+ hash_result = delimiter ? self.to_hash(delimiter) : self.to_hash
55
+
56
+ if delimiter
57
+ result_sets = hash_result[0].values.map!{|x| x.split(delimiter)}.transpose
58
+ final_result = []
59
+
60
+ result_sets.each do |rs|
61
+ temp_result = {}
62
+ hash_result[0].keys.each do |k|
63
+ temp_result[k] = rs[hash_result[0].keys.index(k)]
64
+ end
65
+ final_result << temp_result
66
+ end
67
+ hash_result = final_result
68
+ end
69
+
70
+ hash_result.each do |hash|
71
+ lines << "<item>"
72
+ hash.each do |key, value|
73
+ xml_tag = key.to_s
74
+ value = '' if value == '#empty#'
75
+ lines << " <#{xml_tag}>#{REXML::Text.normalize(value)}</#{xml_tag}>"
76
+ end
77
+ lines << "</item>"
78
+ end
79
+ return lines.join("\n")
80
+
81
+ end
82
+
52
83
  def to_xml_lines
53
84
  lines = []
54
85
  children = self.select{ |child| child.has_content? }
@@ -1,13 +1,16 @@
1
1
  module Scrubyt
2
2
  class ScrubytResult < ResultNode
3
- attr_accessor :root_pattern
3
+ attr_accessor :root_patterns, :source_file, :source_proc
4
4
 
5
5
  def export(arg1, output_file_name=nil, extractor_result_file_name=nil)
6
- # require 'scrubyt/output/export_old'; Scrubyt::ExportOld.export(arg1, self, output_file_name, extractor_result_file_name) ; return
7
- if File.exists? arg1
8
- old_export(arg1, output_file_name, extractor_result_file_name)
6
+ if arg1.is_a? String
7
+ if File.exists? arg1
8
+ export_old1(arg1, output_file_name, extractor_result_file_name)
9
+ else
10
+ export_old2(arg1, output_file_name, extractor_result_file_name)
11
+ end
9
12
  else
10
- new_export(arg1, output_file_name, extractor_result_file_name)
13
+ export_new(arg1)
11
14
  end
12
15
  end
13
16
 
@@ -15,14 +18,20 @@ module Scrubyt
15
18
  #Implement me...
16
19
  end
17
20
 
18
- def old_export(input_file, output_file_name=nil, extractor_result_file_name=nil)
21
+ def export_old1(input_file, output_file_name=nil, extractor_result_file_name=nil)
19
22
  contents = open(input_file).read
20
23
  wrapper_name = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)[0][0]
21
- Scrubyt::Export.export(@root_pattern, wrapper_name, output_file_name, extractor_result_file_name)
24
+ export_old2(wrapper_name, output_file_name, extractor_result_file_name)
22
25
  end
23
26
 
24
- def new_export(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
25
- Scrubyt::Export.export(@root_pattern, wrapper_name, output_file_name, extractor_result_file_name)
27
+ def export_old2(wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
28
+ export_new({ :wrapper_name => wrapper_name, :output_file_name => output_file_name || "#{wrapper_name}_extractor_export.rb", :extractor_result_file_name => extractor_result_file_name })
29
+ end
30
+
31
+ def export_new(data)
32
+ data[:source_file] = @source_file
33
+ data[:source_proc] = @source_proc
34
+ Scrubyt::Export.export(@root_patterns, data)
26
35
  end
27
36
  end
28
37
  end
@@ -43,7 +43,7 @@ private
43
43
  end
44
44
 
45
45
  def self.refine_partial_results(regexp)
46
- @partial_results = @partial_results.select {|pr| pr.inner_text =~ regexp}
46
+ @partial_results = @partial_results.select {|pr| pr.inner_html.gsub(/<.*?>/, '') =~ regexp}
47
47
  end
48
48
 
49
49
  end #End of class CompoundExampleLookup
@@ -28,7 +28,7 @@ module Scrubyt
28
28
  def self.traverse_for_match(node, regexp)
29
29
  results = []
30
30
  traverse_for_match_inner = lambda { |node, regexp|
31
- ft = prepare_text_for_comparison(node.inner_text)
31
+ ft = prepare_text_for_comparison(node.inner_html.gsub(/<.*?>/, ''))
32
32
  if ft =~ regexp
33
33
  node.instance_eval do
34
34
  @match_data = $~
@@ -16,7 +16,7 @@ module Scrubyt
16
16
  #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
17
17
  def self.find_node_from_text(doc, text, next_link=false, index = 0)
18
18
  text.gsub!('»', '&#187;')
19
- #Process immediate attribute extraction (like "go to google.com[@href]")
19
+ #Process immediate attribute extraction (like "go to google.com/@href")
20
20
  if text =~ /.+\/@.+$/
21
21
  text = text.scan(/^(.+?)\/@.+$/)[0][0]
22
22
  elsif text =~ /.+\[\d+\]$/
@@ -27,10 +27,14 @@ module Scrubyt
27
27
  final_element_name = text.scan(/^(.+?)\[/)[0][0]
28
28
  text = text.scan(/\[(.+?)\]/)[0][0]
29
29
  end
30
- text = Regexp.escape(text) if text.is_a? String
31
- result = SharedUtils.traverse_for_match(doc,/^#{text}$/)[index]
32
- result = XPathUtils.traverse_up_until_name(result,final_element_name) if final_element_name
33
- result
30
+ if final_element_name
31
+ text = Regexp.escape(text) if text.is_a? String
32
+ result = SharedUtils.traverse_for_match(doc,/#{text}/)[index]
33
+ result = XPathUtils.traverse_up_until_name(result,final_element_name)
34
+ else
35
+ text = Regexp.escape(text) if text.is_a? String
36
+ result = SharedUtils.traverse_for_match(doc,/^#{text}$/)[index]
37
+ end
34
38
  end
35
39
  end #End of class SimpleExampleLookup
36
40
  end #End of module Scrubyt