scrubber-scrubyt 0.4.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/CHANGELOG +343 -0
  2. data/COPYING +340 -0
  3. data/README +99 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
  6. data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
  7. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  8. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  9. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  10. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  11. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  13. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  14. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  15. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  16. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  17. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  18. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  19. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  20. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  21. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  22. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  23. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  24. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  25. data/lib/scrubyt/core/shared/extractor.rb +167 -0
  26. data/lib/scrubyt/logging.rb +154 -0
  27. data/lib/scrubyt/output/post_processor.rb +139 -0
  28. data/lib/scrubyt/output/result.rb +44 -0
  29. data/lib/scrubyt/output/result_dumper.rb +154 -0
  30. data/lib/scrubyt/output/result_node.rb +140 -0
  31. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  32. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  33. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  34. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  35. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  36. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  37. data/lib/scrubyt.rb +43 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +115 -0
@@ -0,0 +1,154 @@
1
+ #
2
+ # TODO: if multiline messages aren't needed, then remove them.
3
+ #
4
+ # TODO: switch to the conventional Ruby logger interface,
5
+ # or create an adapter to it. If the former, then decide what to
6
+ # do with the unit tests.
7
+ #
8
+
9
module Scrubyt
  # Logging is disabled by default. It can be enabled as follows:
  #
  #   Scrubyt.logger = Scrubyt::Logger.new # logs *all* messages to STDERR
  #
  # Assigning nil disables logging again (Scrubyt.log returns early when
  # no logger is set).
  def self.logger=(logger)
    @logger = logger
  end

  # Simple logger implementation, based on Scrubyt's original logging style.
  # Messages will be sent to STDERR unless another stream is assigned through
  # #output_stream=. Logging can be limited to certain message levels by
  # specifying them on initialization, e.g.
  #
  #   Scrubyt::Logger.new(:ACTION, :ERROR) # will only log action/error messages
  #
  class Logger
    # A single-line log entry, rendered as "[LEVEL] text".
    class Message
      # Both arguments are converted with to_s, so symbols and nil are accepted.
      def initialize(level, text)
        @level, @text = level.to_s, text.to_s
      end

      def to_s
        prefix + @text
      end

      protected

      # "[LEVEL] " - memoized because MultiLineMessage also needs its length.
      def prefix
        @prefix ||= "[#{@level}] "
      end
    end

    # A log entry built from an array of lines: the first line carries the
    # level prefix, the remaining lines are indented to line up with it.
    class MultiLineMessage < Message
      def initialize(level, lines)
        # Destructure instead of calling lines.shift so that the array passed
        # in by the caller is left untouched.
        first_line, *remaining = lines
        super(level, first_line)

        @lines = remaining
      end

      def to_s
        first = super
        # Without continuation lines there is nothing to join, so no trailing
        # newline is emitted.
        return first if @lines.empty?

        [first, indented_lines] * "\n"
      end

      private

      def indented_lines
        @lines.map { |line| indented(line) } * "\n"
      end

      # Pads the line with as many spaces as the prefix is wide.
      def indented(line)
        ' ' * prefix.length + line.to_s
      end
    end

    # levels - the message levels to log; when none are given every level
    # is logged.
    def initialize(*levels)
      @levels = levels
    end

    # Writes message to the output stream if level is enabled. An Array
    # message is rendered as a multi-line entry, anything else as one line.
    def log(level, message)
      return unless logging?(level)

      message_class = message.is_a?(Array) ? MultiLineMessage : Message

      output_stream.puts message_class.new(level, message)
    end

    # The stream messages are written to; STDERR unless one was assigned.
    def output_stream
      @output_stream || STDERR
    end

    attr_writer :output_stream

    private

    def logging?(level)
      @levels.empty? || @levels.include?(level)
    end
  end

  # Forwards message to the configured logger; does nothing when no logger
  # has been set.
  def self.log(level, message)
    return if logger.nil?

    logger.log(level, message)
  end

  # Reader for the configured logger. The original source had a bare
  # `private` above this definition; that keyword does not affect methods
  # defined with `def self.`, so the reader has always been callable from
  # outside. It is kept public here so existing callers keep working.
  def self.logger
    @logger
  end
end
100
+
101
+
102
# Self-test: runs only when this file is executed directly.
if __FILE__ == $0 then

  require 'test/unit'

  class ScrubytLoggingTestCase < Test::Unit::TestCase
    # Stand-in for an IO object: records the string form of everything
    # passed to puts so the tests can inspect it.
    class FauxOutputStream < Array
      def puts(object)
        self << object.to_s
      end
    end

    # Installs a logger writing to a fresh FauxOutputStream (kept in @stream).
    # logger_args are passed straight to Scrubyt::Logger.new.
    def setup_logger_with_faux_output_stream!(*logger_args)
      @stream = FauxOutputStream.new
      logger = Scrubyt::Logger.new(*logger_args)
      logger.output_stream = @stream
      Scrubyt.logger = logger
    end

    # Scrubyt.logger is module-wide state; reset it so one test's logger
    # cannot leak into the next.
    def teardown
      Scrubyt.logger = nil
    end

    def test_that_logging_works_with_nil_logger
      Scrubyt.logger = nil
      assert_nothing_raised { Scrubyt.log(:ERROR, 'message') }
    end

    def test_simple_messages_are_output_correctly
      setup_logger_with_faux_output_stream!

      Scrubyt.log :ACTION, 'i just did something'

      assert_equal 1, @stream.size
      assert_equal '[ACTION] i just did something', @stream.first
    end

    def test_that_multiline_messages_are_output_correctly
      setup_logger_with_faux_output_stream!

      Scrubyt.log :ERROR, ['something bad happened', 'dear oh dear']

      # Continuation lines are indented by the width of the "[ERROR] " prefix.
      expected = "[ERROR] something bad happened\n" + (' ' * '[ERROR] '.length) + 'dear oh dear'

      assert_equal 1, @stream.size
      assert_equal expected, @stream.first
    end

    def test_that_loggers_can_be_limited_to_specified_message_levels
      setup_logger_with_faux_output_stream! :ERROR

      Scrubyt.log :ACTION, 'i just did something'
      Scrubyt.log :ERROR, 'something bad happened'

      assert_equal 1, @stream.size
      assert_equal '[ERROR] something bad happened', @stream.first
    end
  end

end
@@ -0,0 +1,139 @@
1
module Scrubyt

  ########################################## NOT USED ANY MORE ##########################################
  require 'set'
  ##
  #=<tt>Post processing results after the extraction</tt>
  #Some things can not be carried out during evaluation - for example
  #the ensure_presence_of_pattern constraint (since the evaluation is top
  #to bottom, at a given point we don't know yet whether the currently
  #evaluated pattern will have a child pattern or not) or removing unneeded
  #results caused by evaluating multiple filters.
  #
  #The sole purpose of this class is to execute these post-processing tasks.
  class PostProcessor
    ##
    #This is just a convenience method to call all the postprocessing
    #functionality and checks
    def self.apply_post_processing(root_pattern)
      ensure_presence_of_pattern_full(root_pattern)
      #A root without children has no first child to inspect; skip the
      #duplicate removal instead of failing on nil
      first_child = root_pattern.children[0]
      remove_multiple_filter_duplicates(root_pattern) if first_child && first_child.filters.size > 1
      report_if_no_results(root_pattern) if root_pattern.evaluation_context.extractor.get_mode != :production
    end

    ##
    #Apply the ensure_presence_of_pattern constraint on
    #the full extractor
    def self.ensure_presence_of_pattern_full(pattern)
      ensure_presence_of_pattern(pattern)
      pattern.children.each { |child| ensure_presence_of_pattern_full(child) }
    end

    ##
    #Remove unneeded results of a pattern (caused by evaluating multiple filters)
    #See for example the B&N scenario - the book titles are extracted two times
    #for every pattern (since both examples generate the same XPath for them)
    #but since always only one of the results has a price, the other is discarded
    def self.remove_multiple_filter_duplicates(pattern)
      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
      pattern.children.each { |child| remove_multiple_filter_duplicates(child) }
    end

    ##
    #Issue an error report if the document did not extract anything.
    #Probably this is because the structure of the page changed or
    #because of some rather nasty bug - in any case, something wrong
    #is going on, and we need to inform the user about this!
    def self.report_if_no_results(root_pattern)
      return if root_pattern.children.any? { |child| child.result.childmap.size > 0 }

      Scrubyt.log :WARNING, [
        "The extractor did not find any result instances. Most probably this is wrong.",
        "Check your extractor and if you are sure it should work, report a bug!"
      ]
    end

    #NOTE: the original source had a bare `private` here. It has no effect on
    #methods defined with `def self.`, so the methods below have always been
    #publicly callable; they are left that way.

    ##
    #Removes those result instances of pattern which contain no result of a
    #child pattern named by an ensure_presence_of_pattern constraint
    def self.ensure_presence_of_pattern(pattern)
      #holds the name of those child patterns which have to be present as children of the input parameter
      epop_names = pattern.constraints.select { |c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN }.map { |c| c.target }
      return if epop_names.empty?
      #all_parent_values holds instances extracted by pattern
      all_parent_values = pattern.result.childmap.map { |h| h.values }.flatten
      #indices of result instances (of pattern) we are going to remove
      results_to_remove = Set.new
      pattern.children.each do |child_pattern|
        #only the children named in a constraint can cause a removal
        next unless epop_names.include? child_pattern.name
        #all_child_values holds instances extracted by child_pattern
        all_child_values = child_pattern.result.childmap.map { |h| h.values }.flatten

        #populate results_to_remove
        all_parent_values.each_with_index do |parent_value, i|
          #Hey! Not just the direct children but all the ancestors
          results_to_remove << i unless check_ancestors(parent_value, all_child_values)
        end
      end

      #Finally, do the actual delete! The running index walks the instances in
      #the same order all_parent_values was built in, so exactly the marked
      #instances are dropped from every childmap entry (the original kept only
      #the rejects of the last entry it visited)
      index = -1
      pattern.result.childmap.each do |h|
        h.each do |_source, instances|
          instances.reject! { index += 1; results_to_remove.include? index }
        end
      end
    end

    ##
    #Returns true if any descendant of parent_value is contained in
    #all_child_values. For backward compatibility @found_ancestor is still set
    #to true in that case (it is never reset here).
    def self.check_ancestors(parent_value, all_child_values)
      return false unless parent_value.is_a? Hpricot::Elem

      found = (parent_value.children || []).any? do |child|
        all_child_values.include?(child) || check_ancestors(child, all_child_values)
      end
      @found_ancestor = true if found
      found
    end

    ##
    #For every result instance of pattern which has results in more than one
    #child pattern, deletes the values of the last transposed row of those
    #child results from the children's result lists
    def self.remove_multiple_filter_duplicates_intern(pattern)
      possible_duplicates = {}
      pattern.result.childmap.each do |result_hash|
        result_hash.each do |_source, instances|
          instances.each do |instance|
            all_child_results = []
            pattern.children.each do |child|
              child_result = child.result.lookup(instance)
              all_child_results << child_result if child_result != nil
            end
            next if all_child_results.size <= 1
            longest_result = all_child_results.map { |e| e.size }.max
            #Pad to equal length so that transpose works. NOTE: lookup hands
            #back the stored arrays, so the padding is done in place
            all_child_results.each { |res| (res.size + 1).upto(longest_result) { res << nil } }
            possible_duplicates[instance] = all_child_results.transpose
          end
        end
      end
      #Determine the 'real' duplicates: the last row is the one recorded
      real_duplicates = {}
      possible_duplicates.each do |instance, rows|
        next if rows.size == 1 || rows.empty?
        real_duplicates[instance] = rows.last
      end

      #Finally, remove them!
      pattern.children.each do |child|
        child.result.childmap.each do |result_hash|
          result_hash.each do |source, values|
            real_duplicates[source].each { |e| values.delete e } if real_duplicates.key? source
          end
        end
      end
    end #end of function
  end #end of class PostProcessor
end #end of module Scrubyt
@@ -0,0 +1,44 @@
1
+ ########################################## NOT USED ANY MORE ##########################################
2
module Scrubyt
  ##
  #=<tt>Represents the results of a pattern</tt>
  #
  #The results are kept in childmap: an array of single-key hashes, each
  #mapping a source to the list of results recorded for that source.
  class Result
    attr_reader :childmap, :instances

    def initialize
      @childmap = []
      #Exposed through attr_reader but never assigned in this class
      @instances = nil
    end

    ##
    #Record result under source. A result already listed for that source
    #is not added a second time.
    def add_result(source, result)
      existing = @childmap.find { |hash| hash.keys[0] == source }
      return @childmap << { source => [result] } unless existing

      existing[source] << result unless existing[source].include? result
      nil
    end

    ##
    #Return the list of results recorded for last_result, or nil if there
    #is none.
    def lookup(last_result)
      @childmap.each do |hash|
        hash.each { |key, value| return value if key == last_result }
      end
      nil
    end #end of method lookup
  end #end of class Result
end #end of module Scrubyt
30
+
31
+ #It roughly works like this:
32
+ #
33
+ # root
34
+ # source: nil
35
+ # childmap: [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
36
+
37
+ #table
38
+ # source: doc1
39
+ # childmap [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
40
+
41
+ #row
42
+ # source: table1s1, table2s1, table3s1
43
+ # childmap: [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row4s1, row5s1]},
44
+ # {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row4s2, row5s2]}]
@@ -0,0 +1,154 @@
1
+ require 'rexml/document'
2
+ require 'rexml/xpath'
3
+
4
+ ########################################## NOT USED ANY MORE ##########################################
5
module Scrubyt
  ##
  #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
  class ResultDumper
    #Document built by the most recent to_xml call; read by
    #print_statistics_recursive. Kept on the class itself rather than in a
    #class variable, and nil until to_xml has run.
    @last_doc = nil

    ##
    #Output the results as XML. Returns the REXML::Document and remembers it
    #for the statistics methods.
    def self.to_xml(pattern)
      doc = REXML::Document.new
      root = REXML::Element.new('root')
      doc.add_element(root)
      all_extracted_docs = pattern.last_result
      [all_extracted_docs].flatten.each do |lr|
        pattern.last_result = lr
        to_xml_recursive(pattern, root)
      end
      remove_empty_leaves(doc)
      @last_doc = doc
    end

    ##
    #Detach node if it has neither child elements nor text, then visit its
    #child elements. The check happens before the children are visited.
    def self.remove_empty_leaves(node)
      node.remove if node.elements.empty? && node.text == nil
      node.elements.each { |child| remove_empty_leaves child }
    end

    ##
    #Output the text of the pattern; If this pattern is a tree, collect the text from its
    #result instance node; otherwise rely on the last_result
    #TODO: throw this away!!!
    def self.to_text(pattern)
      last_result = pattern.last_result
      return last_result unless pattern.type == :tree

      text = ""
      last_result.traverse_text { |t| text += t.to_s }
      text
    end

    ##
    #One line per top-level result node, holding the non-empty element texts
    #found in it joined with commas. NOTE(review): fields are not quoted or
    #escaped, so a text containing a comma or newline is not distinguishable
    #from a field break.
    def self.to_csv(pattern)
      collect_text = lambda { |e, parts|
        content = e.text || ''
        parts << content if ((e.is_a? REXML::Element) && content != '')
        e.children.each { |c| collect_text.call(c, parts) if c.is_a? REXML::Element }
        parts
      }
      rows = top_level_nodes(to_xml(pattern)).map { |e| collect_text.call(e, []) }
      rows.map { |fields| fields.join(',') }.join("\n")
    end

    ##
    #One hash per top-level result node, mapping element names to their
    #text; texts of same-named elements are joined with a comma.
    def self.to_hash(pattern)
      collect_pairs = lambda { |e, parts|
        content = e.text ? REXML::Text.unnormalize(e.text) : ''
        if ((e.is_a? REXML::Element) && content != '')
          if parts[e.local_name]
            parts[e.local_name] = parts[e.local_name] + "," + content
          else
            parts[e.local_name] = content
          end
        end
        e.children.each { |c| collect_pairs.call(c, parts) if c.is_a? REXML::Element }
        parts
      }
      top_level_nodes(to_xml(pattern)).map { |e| collect_pairs.call(e, {}) }
    end

    ##
    #Print some simple statistics on the extracted results, like the count of extracted
    #instances by each pattern
    def self.print_statistics(pattern)
      puts "\n" * 2
      print_statistics_recursive(pattern, 0)
      puts
    end

    #NOTE: the original source had a bare `private` here. It has no effect on
    #methods defined with `def self.`, so the methods below have always been
    #publicly callable; they are left that way.

    ##
    #Children of the <root> element of doc, or an empty array when
    #remove_empty_leaves pruned the root because nothing was extracted.
    def self.top_level_nodes(doc)
      root = doc.root
      return [] if root.nil?
      root.elements['/root'].to_a
    end

    def self.to_xml_recursive(pattern, element)
      pattern.children.each do |child|
        childresults = child.result.lookup(child.parent.last_result)
        #Output text for leaf nodes only; Maybe add possibility to customize this later
        if (childresults == nil)
          ##TODO: is this needed for anything? I guess not! Drop it!!!!!!
          #Update: it seems the blackbox tests are not passing because of this (?) so temporarily adding it back
          if (child.parent.respond_to?(:size) && child.parent.size == 0) #TODO: respond_to should not be used here, it's just a quick workaround
            #The text is only needed in this branch, so it is collected here
            res = ""
            if child.parent.last_result.is_a? String
              res = child.parent.last_result
            else
              child.parent.last_result.traverse_text { |t| res += t.to_s }
            end
            element.text = SharedUtils.unescape_entities(res).strip unless element.parent.is_a? REXML::Document
          end
          next
        end

        generate_children(child, childresults, element)
      end
    end

    ##
    #Add one element named after child to element for every entry of
    #childresults and recurse into the child's own children. With nil
    #childresults a single element holding child.default is added instead.
    def self.generate_children(child, childresults, element)
      if childresults == nil
        child_node = REXML::Element.new(child.name)
        child_node.text = child.default
        element.add_element(child_node)
        return
      end

      childresults.each do |child_result|
        child.last_result = child_result
        res = ""
        if child.last_result.instance_of? String
          res = child.last_result
        elsif child.last_result.respond_to? 'traverse_text'
          child.last_result.traverse_text { |t| res += t.to_s }
        else
          child.last_result.children.each { |c| element.add_element c }
        end
        child_node = REXML::Element.new(child.name)
        child_node.text = SharedUtils.unescape_entities(res).strip if child.write_text
        element.add_element(child_node) if (child.type != :detail_page && child_node.text != '')
        to_xml_recursive(child, child_node)
      end
    end

    def self.print_statistics_recursive(pattern, depth)
      if pattern.name != 'root'
        if pattern.type == :detail_page
          pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
            print_statistics_recursive(child, depth)
          end
        else
          #No document yet (to_xml was never called) counts as zero instances
          count = @last_doc ? REXML::XPath.match(@last_doc, "//#{pattern.name}").size : 0
          Scrubyt.log :INFO, (' ' * depth.to_i) + "#{pattern.name} extracted #{count} instances."
        end
      end

      pattern.children.each do |child|
        print_statistics_recursive(child, depth + 4)
      end
    end #end of method print_statistics_recursive
  end #end of class ResultDumper
end #end of module Scrubyt
@@ -0,0 +1,140 @@
1
module Scrubyt
  # A node of the extraction result tree. The node is itself the Array of its
  # child ResultNodes; it additionally carries the pattern name, the extracted
  # result (a String, or an object answering inner_html) and output options.
  class ResultNode < Array
    OUTPUT_OPTIONS = [:write_text]

    attr_accessor :name, :result, :options, :generated_by_leaf

    def initialize(name, result=nil, options={})
      @name = name
      @result = result
      @options = options
      @generated_by_leaf = nil
    end

    # Whether the node's own text is written: the :write_text option when it
    # is set, otherwise the generated_by_leaf flag.
    def write_text
      @options[:write_text].nil? ? @generated_by_leaf : @options[:write_text]
    end

    # True for String results, for nodes writing their text, or when any
    # child has content.
    def has_content?
      return true if result.is_a? String
      write_text || any? { |child| child.has_content? }
    end

    # Text of the result with tags removed, entities unescaped and surrounding
    # whitespace stripped; the :default option is returned for empty text.
    def to_s
      text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
      # strip (not strip!) so that the stored result is never modified, should
      # unescape_entities hand back the very string it was given
      text = SharedUtils.unescape_entities(text).strip
      if (@options[:default] && ((text == '') || (text == @options[:default])))
        @options[:default]
      else
        text
      end
    end

    def to_libxml
      libxml_node = XML::Node.new(name)
      self.each { |child| libxml_node << child.to_libxml if child.has_content? }
      libxml_node << to_s if write_text
      libxml_node
    end

    #note: see ruby_extensions.rb for String#write
    def to_xml
      to_xml_lines.join("\n")
    end

    # One hash per direct child, mapping the names (as symbols) of the nodes
    # below it to their text. Texts of same-named nodes are joined with
    # delimiter. A node contributes if it writes non-empty text or has a
    # :default option.
    def to_hash(delimiter=',')
      collect = lambda { |node, hash|
        if (node.write_text && !node.to_s.empty?) || node.options[:default]
          key = node.name.to_sym
          hash[key] = hash[key] ? hash[key] + delimiter + node.to_s : node.to_s
        end
        node.each { |child| collect.call(child, hash) }
        hash
      }
      map { |node| collect.call(node, {}) }
    end

    # Like to_hash, but values which were joined with the delimiter are split
    # up again into one hash per value. Returns [] for a node without children.
    def to_flat_hash()
      split_merged_records(self.to_hash('@@@@@@'), '@@@@@@')
    end

    # Renders the records as <item> elements. With a delimiter the records
    # are flattened as in to_flat_hash; without one, every hash of to_hash
    # becomes an item (the original lost the first one in that case).
    def to_flat_xml(delimiter=nil)
      records = delimiter ? split_merged_records(self.to_hash(delimiter), delimiter) : self.to_hash

      lines = []
      records.each do |hash|
        lines << "<item>"
        hash.each do |key, value|
          xml_tag = key.to_s
          value = '' if value == '#empty#'
          lines << " <#{xml_tag}>#{REXML::Text.normalize(value)}</#{xml_tag}>"
        end
        lines << "</item>"
      end
      lines.join("\n")
    end

    # The XML rendering of this node and its content-bearing children as an
    # array of lines, each nesting level indented by one more space.
    def to_xml_lines
      lines = []
      children = self.select { |child| child.has_content? }
      if children.empty?
        if result.is_a? String
          lines << "<#{name}>#{result}</#{name}>"
        elsif write_text && !to_s.empty?
          lines << "<#{name}>#{ERB::Util.html_escape(to_s)}</#{name}>"
        elsif @options[:default]
          lines << "<#{name}>#{@options[:default]}</#{name}>"
        else
          lines << "<#{name}/>"
        end
      else
        lines << "<#{name}>"
        lines << " #{ERB::Util.html_escape(to_s)}" if write_text && !to_s.empty?
        children.each do |child|
          lines.push(*child.to_xml_lines.map { |line| " #{line}" })
        end
        lines << "</#{name}>"
      end
      lines
    end

    private

    # Merges the given hashes key by key (using the keys of the first hash),
    # joining the values with delimiter, then splits every merged value again
    # and returns one hash per resulting position.
    def split_merged_records(hashes, delimiter)
      return [] if hashes.empty?

      merged = hashes.first.dup
      hashes[1..-1].each do |hash|
        merged.keys.each { |key| merged[key] += "#{delimiter}#{hash[key]}" }
      end

      keys = merged.keys
      # limit -1 keeps trailing empty fields, so all columns have the same
      # length and transpose does not fail when a later record lacks a value
      columns = keys.map { |key| merged[key].split(delimiter, -1) }
      columns.transpose.map do |row|
        record = {}
        keys.each_with_index { |key, idx| record[key] = row[idx] }
        record
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
module Scrubyt
  # The result of running an extractor: the root ResultNode, plus the root
  # patterns and the source it was produced from.
  class ScrubytResult < ResultNode
    attr_accessor :root_patterns, :source_file, :source_proc

    # Returns an explanatory text followed by a tree of the pattern names
    # below the first root pattern; patterns of type :tree are listed with
    # the XPath of their first filter. Without a root pattern only the
    # explanation is returned.
    def export
      #Temporary solution; the real one will be back later - or not
      # NOTE(review): the leading whitespace of the heredoc lines and of the
      # indent strings below is reproduced as found in the listing this was
      # taken from - confirm against the released gem.
      output = <<-EXPLANATION

=== Extractor tree ===

export() is not working at the moment, due to the removal or ParseTree, ruby2ruby and RubyInline.
For now, in case you are using examples, you can replace them by hand based on the output below.
So if your pattern in the learning extractor looks like

book "Ruby Cookbook"

and you see the following below:

[book] /table[1]/tr/td[2]

then replace "Ruby Cookbook" with "/table[1]/tr/td[2]" (and all the other XPaths) and you are ready!

      EXPLANATION

      tree_builder = lambda do |node, level|
        # Every level below the root gets a connector line and a branch marker
        unless level == 0
          indent = " " * (level - 1)
          output += indent + "|\n" + indent + "+-- "
        end
        output += "[#{node.name}]"
        output += " #{node.filters[0].xpath}" if node.type == :tree
        output += "\n"

        node.children.each { |c| tree_builder[c, level + 1] }
      end

      # Nothing to draw when no root pattern has been assigned
      tree_builder[root_patterns[0], 0] if root_patterns && root_patterns[0]

      output += "\n"
    end
  end
end
+ end