scrubber-scrubyt 0.4.11
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +343 -0
- data/COPYING +340 -0
- data/README +99 -0
- data/Rakefile +101 -0
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/constraint.rb +169 -0
- data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
- data/lib/scrubyt/core/scraping/pattern.rb +359 -0
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
- data/lib/scrubyt/core/shared/extractor.rb +167 -0
- data/lib/scrubyt/logging.rb +154 -0
- data/lib/scrubyt/output/post_processor.rb +139 -0
- data/lib/scrubyt/output/result.rb +44 -0
- data/lib/scrubyt/output/result_dumper.rb +154 -0
- data/lib/scrubyt/output/result_node.rb +140 -0
- data/lib/scrubyt/output/scrubyt_result.rb +42 -0
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
- data/lib/scrubyt/utils/shared_utils.rb +58 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
- data/lib/scrubyt/utils/xpathutils.rb +202 -0
- data/lib/scrubyt.rb +43 -0
- data/test/blackbox_test.rb +60 -0
- data/test/blackbox_tests/basic/multi_root.rb +6 -0
- data/test/blackbox_tests/basic/simple.rb +5 -0
- data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
- data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
- data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
- data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
- metadata +115 -0
@@ -0,0 +1,154 @@
|
|
1
|
+
#
|
2
|
+
# TODO: if multiline messages aren't needed, then remove them.
|
3
|
+
#
|
4
|
+
# TODO: switch to the conventional Ruby logger interface,
|
5
|
+
# or create an adapter to it. If the former, then decide what to
|
6
|
+
# do with the unit tests.
|
7
|
+
#
|
8
|
+
|
9
|
+
module Scrubyt
  # Logging is disabled by default. It can be enabled as follows:
  #
  #   Scrubyt.logger = Scrubyt::Logger.new # logs *all* messages to STDERR
  #
  # Assigning nil disables logging again.
  def self.logger=(logger)
    @logger = logger
  end

  # Returns the logger installed via Scrubyt.logger=, or nil when none is set.
  #
  # This reader is public: it used to sit below a bare `private`, which does
  # not apply to methods defined with `def self.`, so it was always callable
  # from outside. The misleading keyword has been dropped.
  def self.logger
    @logger
  end

  # Forwards a message to the installed logger; a no-op when none is installed.
  #
  # level   - message level, e.g. :ACTION or :ERROR
  # message - a String, or an Array of Strings for a multi-line message
  def self.log(level, message)
    return if logger.nil?

    logger.log(level, message)
  end

  # Simple logger implementation, based on Scrubyt's original logging style.
  # Messages will be sent to STDERR unless another stream is assigned through
  # #output_stream=. Logging can be limited to certain message levels by
  # specifying them on initialization, e.g.
  #
  #   Scrubyt::Logger.new(:ACTION, :ERROR) # will only log action/error messages
  #
  class Logger
    # A single-line message, rendered as "[LEVEL] text".
    class Message
      def initialize(level, text)
        @level, @text = level.to_s, text.to_s
      end

      def to_s
        prefix + @text
      end

      protected

      # The "[LEVEL] " lead-in, memoized because subclasses also use its width.
      def prefix
        @prefix ||= "[#{@level}] "
      end
    end

    # A message made of several lines. The first line carries the level
    # prefix; the remaining lines are indented by the width of that prefix.
    class MultiLineMessage < Message
      def initialize(level, lines)
        # Destructure instead of calling lines.shift, so the array handed in
        # by the caller is left untouched.
        first_line, *remaining = lines
        super(level, first_line)

        @lines = remaining
      end

      def to_s
        first_line = super()
        # Joining a flat list means a message without continuation lines does
        # not end in a stray newline.
        ([first_line] + indented_lines) * "\n"
      end

      private

      def indented_lines
        @lines.map { |line| indented(line) }
      end

      def indented(line)
        ' ' * prefix.length + line.to_s
      end
    end

    attr_writer :output_stream

    # levels - the message levels to log; with no levels given, everything
    #          is logged.
    def initialize(*levels)
      @levels = levels
    end

    # Writes the message to the output stream if its level is enabled.
    # An Array message is rendered as a MultiLineMessage.
    def log(level, message)
      return unless logging?(level)

      message_class = message.is_a?(Array) ? MultiLineMessage : Message

      output_stream.puts message_class.new(level, message)
    end

    # The stream messages are written to; STDERR unless one was assigned.
    def output_stream
      @output_stream || STDERR
    end

    private

    def logging?(level)
      @levels.empty? || @levels.include?(level)
    end
  end
end
|
100
|
+
|
101
|
+
|
102
|
+
# Self-test: runs only when this file is executed directly.
if __FILE__ == $0

  require 'test/unit'

  class ScrubytLoggingTestCase < Test::Unit::TestCase
    # Array-backed stand-in for an output stream: records the string form of
    # everything passed to #puts so the tests can inspect what was logged.
    class FauxOutputStream < Array
      def puts(object)
        self << object.to_s
      end
    end

    # Installs a fresh Scrubyt::Logger (built from logger_args) that writes
    # to @stream.
    def setup_logger_with_faux_output_stream!(*logger_args)
      @stream = FauxOutputStream.new
      logger = Scrubyt::Logger.new(*logger_args)
      logger.output_stream = @stream
      Scrubyt.logger = logger
    end

    def test_that_logging_works_with_nil_logger
      Scrubyt.logger = nil
      assert_nothing_raised { Scrubyt.log(:ERROR, 'message') }
    end

    def test_simple_messages_are_output_correctly
      setup_logger_with_faux_output_stream!

      Scrubyt.log :ACTION, 'i just did something'

      assert_equal 1, @stream.size
      assert_equal '[ACTION] i just did something', @stream.first
    end

    def test_that_multiline_messages_are_output_correctly
      setup_logger_with_faux_output_stream!

      Scrubyt.log :ERROR, ['something bad happened', 'dear oh dear']

      assert_equal 1, @stream.size
      # Continuation lines are indented by the width of the "[ERROR] " prefix,
      # i.e. eight columns.
      assert_equal "[ERROR] something bad happened\n        dear oh dear", @stream.first
    end

    def test_that_loggers_can_be_limited_to_specified_message_levels
      setup_logger_with_faux_output_stream! :ERROR

      Scrubyt.log :ACTION, 'i just did something'
      Scrubyt.log :ERROR, 'something bad happened'

      assert_equal 1, @stream.size
      assert_equal '[ERROR] something bad happened', @stream.first
    end
  end

end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
module Scrubyt

  ########################################## NOT USED ANY MORE ##########################################
  require 'set'
  ##
  #=<tt>Post processing results after the extraction</tt>
  #Some things can not be carried out during evaluation - for example
  #the ensure_presence_of_pattern constraint (since the evaluation is top
  #to bottom, at a given point we don't know yet whether the currently
  #evaluated pattern will have a child pattern or not) or removing unneeded
  #results caused by evaluating multiple filters.
  #
  #The sole purpose of this class is to execute these post-processing tasks.
  #
  #NOTE: every method here is defined with `def self.` and is therefore
  #public. The original source had a bare `private` in front of the helper
  #methods, which has no effect on singleton methods; the helpers stay public
  #so that existing callers keep working.
  class PostProcessor
    ##
    #This is just a convenience method to call all the postprocessing
    #functionality and checks.
    #
    #Duplicate removal only runs when the first child of the root pattern has
    #more than one filter; a root pattern without children skips that step
    #instead of raising.
    def self.apply_post_processing(root_pattern)
      ensure_presence_of_pattern_full(root_pattern)
      first_child = root_pattern.children[0]
      remove_multiple_filter_duplicates(root_pattern) if first_child && first_child.filters.size > 1
      report_if_no_results(root_pattern) if root_pattern.evaluation_context.extractor.get_mode != :production
    end

    ##
    #Apply the ensure_presence_of_pattern constraint on
    #the full extractor (the given pattern and, recursively, all its children)
    def self.ensure_presence_of_pattern_full(pattern)
      ensure_presence_of_pattern(pattern)
      pattern.children.each { |child| ensure_presence_of_pattern_full(child) }
    end

    ##
    #Remove unneeded results of a pattern (caused by evaluating multiple filters)
    #See for example the B&N scenario - the book titles are extracted two times
    #for every pattern (since both examples generate the same XPath for them)
    #but since always only one of the results has a price, the other is discarded
    def self.remove_multiple_filter_duplicates(pattern)
      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
      pattern.children.each { |child| remove_multiple_filter_duplicates(child) }
    end

    ##
    #Issue a warning if the document did not extract anything.
    #Probably this is because the structure of the page changed or
    #because of some rather nasty bug - in any case, something wrong
    #is going on, and we need to inform the user about this!
    #
    #Nothing is logged as soon as one child of the root pattern has a
    #non-empty childmap.
    def self.report_if_no_results(root_pattern)
      return if root_pattern.children.any? { |child| child.result.childmap.size > 0 }

      Scrubyt.log :WARNING, [
        "The extractor did not find any result instances. Most probably this is wrong.",
        "Check your extractor and if you are sure it should work, report a bug!"
      ]
    end

    ##
    #Enforce the ensure_presence_of_pattern constraints of a single pattern:
    #every result instance of +pattern+ that has no descendant among the
    #results of a required child pattern is deleted from pattern's childmap.
    def self.ensure_presence_of_pattern(pattern)
      #holds the name of those child patterns which have to be present as children of the input parameter
      epop_names = pattern.constraints.select { |c|
        c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
      }.map { |c| c.target }
      return if epop_names.empty?

      #all_parent_values holds instances extracted by pattern
      all_parent_values = []
      pattern.result.childmap.each { |h| all_parent_values << h.values }
      all_parent_values.flatten!

      #indices of result instances (of pattern) we are going to remove
      results_to_remove = Set.new
      pattern.children.each do |child_pattern|
        #only children named by a constraint can cause a removal
        next unless epop_names.include? child_pattern.name

        #all_child_values holds instances extracted by child_pattern
        all_child_values = []
        child_pattern.result.childmap.each { |h| all_child_values << h.values }
        all_child_values.flatten!

        #populate results_to_remove
        all_parent_values.each_with_index do |parent_value, i|
          #Hey! Not just the direct children but all the descendants
          @found_ancestor = false
          check_ancestors(parent_value, all_child_values)
          results_to_remove << i unless @found_ancestor
        end
      end

      #based on results_to_remove, populate the array 'rejected' which holds the actual instances
      #(and not indices, as in the case of results_to_remove!). The index runs across ALL childmap
      #entries, so the instances have to be accumulated - assigning per entry would keep only the
      #rejects of the last entry.
      rejected = []
      i = -1
      pattern.result.childmap.each do |h|
        h.each_value do |instances|
          instances.each do |instance|
            i += 1
            rejected << instance if results_to_remove.include? i
          end
        end
      end

      #Finally, do the actual delete!
      pattern.result.childmap.each { |h| h.each_value { |v| rejected.each { |r| v.delete(r) } } }
    end

    ##
    #Set @found_ancestor to true if any descendant of +parent_value+ (reached
    #through Hpricot::Elem nodes) is contained in +all_child_values+. The flag
    #is never reset here; the caller clears it before each check.
    def self.check_ancestors(parent_value, all_child_values)
      return unless parent_value.is_a? Hpricot::Elem

      parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
      parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
    end

    ##
    #Worker of remove_multiple_filter_duplicates for a single pattern.
    #
    #NOTE(review): the arrays returned by child.result.lookup are padded with
    #nil in place, exactly as the original code did - confirm whether callers
    #rely on that before changing it.
    def self.remove_multiple_filter_duplicates_intern(pattern)
      possible_duplicates = {}
      pattern.result.childmap.each do |source_map|
        source_map.each_value do |instances|
          instances.each do |instance|
            all_child_results = []
            pattern.children.each do |child|
              child_results = child.result.lookup(instance)
              all_child_results << child_results unless child_results.nil?
            end
            next if all_child_results.size <= 1

            #pad every result list to the same length so that transpose works
            longest_result = all_child_results.map { |e| e.size }.max
            all_child_results.each { |res| (res.size + 1).upto(longest_result) { res << nil } }
            possible_duplicates[instance] = all_child_results.transpose
          end
        end
      end

      #Determine the 'real' duplicates (the last row wins, as before)
      real_duplicates = {}
      possible_duplicates.each do |instance, rows|
        next if rows.size == 1
        rows.each { |row| real_duplicates[instance] = row }
      end

      #Finally, remove them!
      pattern.children.each do |child|
        child.result.childmap.each do |source_map|
          source_map.each do |source, results|
            real_duplicates[source].each { |e| results.delete e } if real_duplicates.key? source
          end
        end
      end
    end #end of function
  end #end of class PostProcessor
end #end of module Scrubyt
|
@@ -0,0 +1,44 @@
|
|
1
|
+
########################################## NOT USED ANY MORE ##########################################
|
2
|
+
module Scrubyt
  ##
  #=<tt>Represents the results of a pattern</tt>
  #
  #The results are kept in +childmap+, an array of single-entry hashes of the
  #form {source => [result, ...]}, in the order the sources were first added.
  class Result
    #NOTE(review): @instances is never assigned in this class, so the
    #+instances+ reader returns nil unless it is set from elsewhere.
    attr_reader :childmap, :instances

    def initialize
      @childmap ||= []
    end

    ##
    #Record +result+ as extracted from +source+. A result that is already
    #stored for that source is not added a second time.
    def add_result(source, result)
      entry = @childmap.find { |hash| hash.keys[0] == source }
      if entry
        entry[source] << result unless entry[source].include? result
        return
      end
      @childmap << { source => [result] }
    end

    ##
    #Return the array of results stored for the source +last_result+,
    #or nil if nothing was recorded for it.
    def lookup(last_result)
      @childmap.each do |entry|
        entry.each { |source, results| return results if source == last_result }
      end
      nil
    end#end of method lookup
  end#end of class Result
end#end of module Scrubyt
|
30
|
+
|
31
|
+
#It roughly works like this:
|
32
|
+
#
|
33
|
+
# root
|
34
|
+
# source: nil
|
35
|
+
# childmap: [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
|
36
|
+
|
37
|
+
#table
|
38
|
+
# source: doc1
|
39
|
+
# childmap [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
|
40
|
+
|
41
|
+
#row
|
42
|
+
# source: table1s1, table2s1, table3s1
|
43
|
+
# childmap: [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row3s1, row5s1]},
|
44
|
+
# {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row3s2, row5s2]}]
|
@@ -0,0 +1,154 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
require 'rexml/xpath'
|
3
|
+
|
4
|
+
########################################## NOT USED ANY MORE ##########################################
|
5
|
+
module Scrubyt
  ##
  #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
  #
  #NOTE: every method here is defined with `def self.` and is therefore
  #public; the bare `private` the original source placed before the helper
  #methods had no effect on them and has been dropped.
  class ResultDumper
    ##
    #Output the results as XML
    #
    #Returns the REXML::Document that was built; it is also remembered in
    #@@last_doc for print_statistics.
    def self.to_xml(pattern)
      doc = REXML::Document.new
      root = REXML::Element.new('root')
      doc.add_element(root)
      all_extracted_docs = pattern.last_result
      begin
        [all_extracted_docs].flatten.each do |lr|
          pattern.last_result = lr
          to_xml_recursive(pattern, root)
        end
      ensure
        #Put the complete list back: otherwise a second export (to_csv and
        #to_hash both call to_xml) would only see the last document.
        pattern.last_result = all_extracted_docs
      end
      remove_empty_leaves(doc)
      @@last_doc = doc
    end

    ##
    #Remove +node+ if it has neither child elements nor text, then apply the
    #same check to each of its child elements.
    def self.remove_empty_leaves(node)
      node.remove if node.elements.empty? && node.text.nil?
      node.elements.each { |child| remove_empty_leaves child }
    end

    ##
    #Output the text of the pattern; If this pattern is a tree, collect the text from its
    #result instance node; otherwise rely on the last_result
    #TODO: throw this away!!!
    def self.to_text(pattern)
      last_result = pattern.last_result
      return last_result unless pattern.type == :tree

      result = ""
      last_result.traverse_text { |t| result += t.to_s }
      result
    end

    ##
    #Output the results as CSV: one line per child element of the XML root,
    #holding the non-empty texts found below it, joined with ','.
    #Values are not quoted or escaped.
    def self.to_csv(pattern)
      result = []
      flat_csv_inner = lambda { |e, parts|
        content = e.text || ''
        parts << content if e.is_a?(REXML::Element) && content != ''
        e.children.each { |c| flat_csv_inner.call(c, parts) if c.is_a? REXML::Element }
        parts
      }
      to_xml(pattern).root.elements['/root'].each { |e| result << flat_csv_inner.call(e, []) }
      result.map { |a| a.join(',') }.join("\n")
    end

    ##
    #Output the results as an array of hashes: one hash per child element of
    #the XML root, mapping element names to their text. Texts of elements
    #sharing a name are joined with ','.
    def self.to_hash(pattern)
      result = []
      flat_hash_inner = lambda { |e, parts|
        content = e.text ? REXML::Text.unnormalize(e.text) : ''
        if e.is_a?(REXML::Element) && content != ''
          if parts[e.local_name]
            parts[e.local_name] = parts[e.local_name] + "," + content
          else
            parts[e.local_name] = content
          end
        end
        e.children.each { |c| flat_hash_inner.call(c, parts) if c.is_a? REXML::Element }
        parts
      }
      to_xml(pattern).root.elements['/root'].each { |e| result << flat_hash_inner.call(e, {}) }
      result
    end

    ##
    #Print some simple statistics on the extracted results, like the count of extracted
    #instances by each pattern
    #
    #The counts are taken from the document built by the most recent to_xml call.
    def self.print_statistics(pattern)
      puts "\n" * 2
      print_statistics_recursive(pattern, 0)
      puts
    end

    ##
    #Add the results of all children of +pattern+ below +element+.
    def self.to_xml_recursive(pattern, element)
      pattern.children.each do |child|
        childresults = child.result.lookup(child.parent.last_result)
        #Output text for leaf nodes only; Maybe add possibility to customize this later
        if childresults.nil?
          #TODO: is this needed for anything? I guess not! Drop it!
          #Update: it seems the blackbox tests are not passing because of this (?) so temporarily adding it back
          res = ""
          if child.parent.last_result.is_a? String
            res = child.parent.last_result
          else
            child.parent.last_result.traverse_text { |t| res += t.to_s }
          end
          if child.parent.respond_to?(:size) && child.parent.size == 0 #TODO: respond_to should not be used here, it's just a quick workaround
            element.text = SharedUtils.unescape_entities(res).strip unless element.parent.is_a? REXML::Document
          end
          next
        end

        generate_children(child, childresults, element)
      end
    end

    ##
    #Create one XML node per entry of +childresults+ below +element+ and
    #descend into the children of +child+. With nil +childresults+ a single
    #node holding the default of +child+ is added instead.
    def self.generate_children(child, childresults, element)
      if childresults.nil?
        child_node = REXML::Element.new(child.name)
        child_node.text = child.default
        element.add_element(child_node)
      else
        childresults.size.times do |num|
          child.last_result = childresults[num]
          res = ""
          if child.last_result.instance_of? String
            res = child.last_result
          elsif child.last_result.respond_to? 'traverse_text'
            child.last_result.traverse_text { |t| res += t.to_s }
          else
            child.last_result.children.each { |c| element.add_element c }
          end
          child_node = REXML::Element.new(child.name)
          child_node.text = SharedUtils.unescape_entities(res).strip if child.write_text
          element.add_element(child_node) if child.type != :detail_page && child_node.text != ''
          to_xml_recursive(child, child_node)
        end
      end
    end

    ##
    #Log, indented by +depth+ spaces, how many instances each pattern
    #contributed to the last exported document. Detail page patterns are
    #replaced by the children of the parent of their related pattern.
    def self.print_statistics_recursive(pattern, depth)
      if pattern.name != 'root'
        if pattern.type == :detail_page
          pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
            print_statistics_recursive(child, depth)
          end
        else
          count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
          Scrubyt.log :INFO, (' ' * depth.to_i) + "#{pattern.name} extracted #{count} instances."
        end
      end

      pattern.children.each do |child|
        print_statistics_recursive(child, depth + 4)
      end
    end#end of method print_statistics_recursive
  end #end of class ResultDumper
end #end of module Scrubyt
|
@@ -0,0 +1,140 @@
|
|
1
|
+
module Scrubyt
  ##
  #A node of the result tree. The node itself is the Array of its child
  #nodes; +name+ is used as the tag name / hash key and +result+ holds the
  #extracted value (a String, or an object answering +inner_html+).
  class ResultNode < Array
    OUTPUT_OPTIONS = [:write_text]

    attr_accessor :name, :result, :options, :generated_by_leaf

    def initialize(name, result=nil, options={})
      @name = name
      @result = result
      @options = options
    end

    ##
    #Whether the text of this node should be written to the output: the
    #:write_text option when it was given, otherwise +generated_by_leaf+.
    def write_text
      @options[:write_text].nil? ? @generated_by_leaf : @options[:write_text]
    end

    ##
    #True if this node has a String result, writes its own text, or has at
    #least one child with content.
    def has_content?
      return true if result.is_a? String
      write_text || any? { |child| child.has_content? }
    end

    ##
    #The text of this node: the String result itself, or the inner_html of
    #the result with tags removed; entities unescaped and whitespace
    #stripped. The :default option is returned instead of an empty text.
    def to_s
      text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
      text = SharedUtils.unescape_entities(text)
      text.strip!
      if @options[:default] && (text == '' || text == @options[:default])
        @options[:default]
      else
        text
      end
    end

    ##
    #Build an XML::Node for this node, holding the nodes of all children
    #with content and, if write_text is set, the text of this node.
    def to_libxml
      libxml_node = XML::Node.new(name)
      self.each { |child| libxml_node << child.to_libxml if child.has_content? }
      libxml_node << to_s if write_text
      libxml_node
    end

    #note: see ruby_extensions.rb for String#write
    def to_xml
      to_xml_lines.join("\n")
    end

    ##
    #One hash per direct child, mapping the names (as symbols) of all nodes
    #below it that write text or have a default to their text. Texts of
    #nodes sharing a name are joined with +delimiter+.
    def to_hash(delimiter=',')
      result = []
      flat_hash_inner = lambda { |e, hash|
        if (e.write_text && !e.to_s.empty?) || e.options[:default]
          key = e.name.to_sym
          hash[key] = hash[key] ? hash[key] + delimiter + e.to_s : e.to_s
        end
        e.each { |c| flat_hash_inner.call(c, hash) }
        hash
      }
      self.each { |e| result << flat_hash_inner.call(e, {}) }
      result
    end

    ##
    #Like to_hash, but values that were joined because several nodes share a
    #name are split again, giving one hash per value position. Returns []
    #for a node without children.
    def to_flat_hash()
      split_merged_records(self.to_hash('@@@@@@'), '@@@@@@')
    end

    ##
    #Render the records as a flat list of <item> elements.
    #
    #Without +delimiter+ every record of to_hash becomes one <item> (the
    #first record used to be dropped by mistake). With +delimiter+ the
    #joined values are split into separate <item>s, as in to_flat_hash.
    def to_flat_xml(delimiter=nil)
      hash_result = delimiter ? split_merged_records(self.to_hash(delimiter), delimiter) : self.to_hash

      lines = []
      hash_result.each do |hash|
        lines << "<item>"
        hash.each do |key, value|
          xml_tag = key.to_s
          value = '' if value == '#empty#'
          lines << " <#{xml_tag}>#{REXML::Text.normalize(value)}</#{xml_tag}>"
        end
        lines << "</item>"
      end
      lines.join("\n")
    end

    ##
    #The XML of this node as an array of lines; children without content are
    #left out and each nesting level is indented by one more space.
    def to_xml_lines
      lines = []
      children = self.select { |child| child.has_content? }
      if children.empty?
        if result.is_a? String
          lines << "<#{name}>#{result}</#{name}>"
        elsif write_text && !to_s.empty?
          lines << "<#{name}>#{ERB::Util.html_escape(to_s)}</#{name}>"
        elsif @options[:default]
          lines << "<#{name}>#{@options[:default]}</#{name}>"
        else
          lines << "<#{name}/>"
        end
      else
        lines << "<#{name}>"
        lines << " #{ERB::Util.html_escape(to_s)}" if write_text && !to_s.empty?
        children.each do |child|
          lines.push(*child.to_xml_lines.map { |line| " #{line}" })
        end
        lines << "</#{name}>"
      end
      lines
    end

    private

    ##
    #Shared worker of to_flat_hash and to_flat_xml: append the values of all
    #records, key by key (using the keys of the first record), then split
    #every value on +delimiter+ and turn each value position into a hash.
    def split_merged_records(records, delimiter)
      first_record = records.first
      return [] if first_record.nil?

      keys = first_record.keys
      columns = keys.map do |key|
        joined = records[1..-1].inject(first_record[key]) { |acc, record| acc + "#{delimiter}#{record[key]}" }
        joined.split(delimiter)
      end

      columns.transpose.map do |row|
        flat = {}
        keys.each_with_index { |key, index| flat[key] = row[index] }
        flat
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Scrubyt
  ##
  #The root node of an extraction result; additionally remembers the root
  #patterns and the source the extractor came from.
  class ScrubytResult < ResultNode
    attr_accessor :root_patterns, :source_file, :source_proc

    ##
    #Return an explanation text followed by an ASCII tree of the first root
    #pattern, one "[name]" per pattern, with the XPath of its first filter
    #appended for patterns of type :tree. Without a root pattern only the
    #explanation is returned.
    def export
      #Temporary solution; the real one will be back later - or not
      result = <<-EXPLANATION

=== Extractor tree ===

export() is not working at the moment, due to the removal or ParseTree, ruby2ruby and RubyInline.
For now, in case you are using examples, you can replace them by hand based on the output below.
So if your pattern in the learning extractor looks like

book "Ruby Cookbook"

and you see the following below:

[book] /table[1]/tr/td[2]

then replace "Ruby Cookbook" with "/table[1]/tr/td[2]" (and all the other XPaths) and you are ready!

      EXPLANATION

      tree_builder = lambda do |node, level|
        #every node below the root is preceded by a connector line and a branch marker
        unless level == 0
          indent = " " * (level - 1)
          result += "#{indent}|\n#{indent}+-- "
        end
        result += "[#{node.name}]"
        result += " #{node.filters[0].xpath}" if node.type == :tree
        result += "\n"

        node.children.each { |c| tree_builder[c, level + 1] }
      end

      first_root = root_patterns && root_patterns[0]
      tree_builder[first_root, 0] if first_root

      result + "\n"
    end
  end
end
|