scrubyt 0.2.6 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +59 -12
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +24 -6
- data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
- data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
- data/lib/scrubyt/core/scraping/constraint.rb +53 -57
- data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
- data/lib/scrubyt/core/scraping/pattern.rb +292 -157
- data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
- data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
- data/lib/scrubyt/core/shared/extractor.rb +122 -163
- data/lib/scrubyt/output/export.rb +59 -174
- data/lib/scrubyt/output/post_processor.rb +4 -3
- data/lib/scrubyt/output/result.rb +8 -9
- data/lib/scrubyt/output/result_dumper.rb +81 -42
- data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
- data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
- data/lib/scrubyt/utils/shared_utils.rb +39 -26
- data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
- data/lib/scrubyt/utils/xpathutils.rb +31 -30
- data/test/unittests/constraint_test.rb +11 -7
- data/test/unittests/extractor_test.rb +6 -6
- data/test/unittests/filter_test.rb +66 -66
- metadata +22 -15
- data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -1,5 +1,3 @@
|
|
1
|
-
#require File.join(File.dirname(__FILE__), 'pattern.rb')
|
2
|
-
|
3
1
|
module Scrubyt
|
4
2
|
# =<tt>exporting previously defined extractors</tt>
|
5
3
|
class Export
|
@@ -15,7 +13,7 @@ module Scrubyt
|
|
15
13
|
#
|
16
14
|
#*parameters*
|
17
15
|
#
|
18
|
-
#
|
16
|
+
#_root_pattern_ - the root pattern of the extractor. This is the variable 'something' in
|
19
17
|
#such a call:
|
20
18
|
#
|
21
19
|
# something = Scrubyt::Extractor.define ...
|
@@ -63,194 +61,81 @@ module Scrubyt
|
|
63
61
|
#This snippet will export the extractor to a file named 'my_super_camera_extractor.rb'.
|
64
62
|
#After running 'my_super_camera_extractor.rb', the result will be dumped to the file
|
65
63
|
#'/home/peter/stuff/result.xml'.
|
66
|
-
def self.export(
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
output_file.write(
|
64
|
+
def self.export(root_pattern, wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
|
65
|
+
sexp = [:block]
|
66
|
+
sexp << export_header(wrapper_name)
|
67
|
+
sexp << export_extractor(root_pattern, wrapper_name)
|
68
|
+
sexp << export_footer(wrapper_name, extractor_result_file_name)
|
69
|
+
|
70
|
+
result = RubyToRuby.new.process(sexp)
|
71
|
+
result.gsub! '"' + root_pattern.source_file + '"', '__FILE__'
|
72
|
+
|
73
|
+
output_file_name ||= "#{wrapper_name}_extractor_export.rb"
|
74
|
+
output_file = open(output_file_name, 'w')
|
75
|
+
output_file.write(result)
|
78
76
|
output_file.close
|
79
|
-
|
77
|
+
result
|
80
78
|
end
|
81
79
|
|
82
80
|
private
|
83
|
-
def self.
|
84
|
-
|
85
|
-
end
|
86
|
-
|
87
|
-
def self.cleanup_result
|
88
|
-
@result.gsub!('P.') {}
|
89
|
-
CompoundExample::DESCRIPTORS.each {|d|
|
90
|
-
@result.gsub!(/,\s*:#{d.to_s}.+?'.+?'/) {}
|
91
|
-
}
|
92
|
-
end
|
93
|
-
|
94
|
-
|
95
|
-
def self.export_subextractors(contents, pattern, output_file)
|
96
|
-
all_subextractor_code = contents.scan(/.+=\s+lambda.+Extractor\.define/m)
|
97
|
-
return if all_subextractor_code.empty?
|
98
|
-
all_subextractor_code = all_subextractor_code[0].split("\n")
|
99
|
-
pure_subextractor_code = []
|
100
|
-
meaningful_code = false
|
101
|
-
all_subextractor_code.each do |sec|
|
102
|
-
meaningful_code = true if sec =~ /lambda/
|
103
|
-
meaningful_code = false if sec =~ /Extractor.define/
|
104
|
-
pure_subextractor_code << sec if meaningful_code
|
105
|
-
end
|
106
|
-
add_P pure_subextractor_code
|
107
|
-
substitute_examples_with_XPaths(pattern,pure_subextractor_code)
|
81
|
+
def self.create_sexp(code)
|
82
|
+
(ParseTree.new.parse_tree_for_string(code))[0]
|
108
83
|
end
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
#
|
113
|
-
#The only reason I can expect it to work is that it passes all the tests of the extractors
|
114
|
-
#I have created so far. However at the same time I know how to create one easily which
|
115
|
-
#would break the exporting, so don't experiment with this too much...
|
116
|
-
#
|
117
|
-
#The other solutions include:
|
118
|
-
#- serialization (yaml, pstore etc) but that would mess the code terribly up - so
|
119
|
-
#therefore I did not choose this solution.
|
120
|
-
#- defining the block as string - however, this introduces ugly %q{}s etc - all in all,
|
121
|
-
#this is still a more viable solution than serialization IMHO
|
122
|
-
#- a lot of other tricks - however, all of these introduce a lot of noise which I don't
|
123
|
-
#like.
|
124
|
-
#
|
125
|
-
#Conclusion: If there will be no terrible, unrepairable, uncontrollable etc. problems
|
126
|
-
#with this approach, it will be replaced (probably with constructing the extractor as
|
127
|
-
#a string). However, until that point, it will stay.
|
128
|
-
def self.export_extractor(contents, pattern, output_file)
|
129
|
-
first_line = contents.scan(/.*Extractor\.define.*/)
|
130
|
-
#During wrapper construction, we count the number of blocks; add one occurrence of
|
131
|
-
#end (to close the block of the extractor definition)
|
132
|
-
count = pattern.evaluation_context.block_count + 1
|
133
|
-
#Construct the extractor definition matching regexp based on the number of ends
|
134
|
-
definition = contents.scan(/Extractor\.define(?:.*?(?:\}|\s+end)){#{count.to_s}}/m)
|
135
|
-
#Since the regexp matching the extractor definition was multiline, get the first
|
136
|
-
#line separately and patch it in!
|
137
|
-
rows = definition[0].split("\n")
|
138
|
-
add_P(rows)
|
139
|
-
rows[0] = first_line
|
140
|
-
substitute_examples_with_XPaths(pattern,rows)
|
141
|
-
end
|
142
|
-
|
143
|
-
def self.substitute_examples_with_XPaths(pattern,rows)
|
144
|
-
#@full_definition holds the original definition (at this point, later on it will be
|
145
|
-
#gsub!bed and all)
|
146
|
-
@full_definition = rows.join("\n")
|
147
|
-
#This hash contains all the examples that need to be replaced with their XPath
|
148
|
-
#counterparts;"P.#{name}"
|
149
|
-
#We are relying on the convention that if an example is defined, it is always
|
150
|
-
#the first parameter and it is always a string
|
151
|
-
@name_to_xpath_map = {}
|
152
|
-
create_name_to_xpath_map(pattern)
|
153
|
-
#Replace the examples which are quoted with " and '
|
154
|
-
@name_to_xpath_map.each do |name, xpaths|
|
155
|
-
replace_example_with_xpath(name, xpaths, %q{"})
|
156
|
-
replace_example_with_xpath(name, xpaths, %q{'})
|
157
|
-
end
|
158
|
-
#Finally, add XPaths to pattern which had no example at the beginning (the XPath was
|
159
|
-
#generated from the child patterns)
|
160
|
-
@name_to_xpath_map.each do |name, xpaths|
|
161
|
-
xpaths.reverse.each do |xpath|
|
162
|
-
next if !@full_definition.include? "P.#{name}"
|
163
|
-
comma = @full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0].sub('do'){}.strip == '' ? '' : ','
|
164
|
-
if (@full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]).include?('{')
|
165
|
-
@full_definition.sub!("P.#{name}") {"P.#{name}('#{xpath}')"}
|
166
|
-
else
|
167
|
-
@full_definition.sub!("P.#{name}") {"P.#{name} \"#{xpath}\"#{comma}"}
|
168
|
-
end
|
169
|
-
end
|
170
|
-
end
|
171
|
-
@result += @full_definition
|
84
|
+
|
85
|
+
def self.export_header(wrapper_name)
|
86
|
+
create_sexp "require 'rubygems'; require 'scrubyt'"
|
172
87
|
end
|
173
88
|
|
174
|
-
def self.export_footer(
|
89
|
+
def self.export_footer(wrapper_name, extractor_result_file_name)
|
175
90
|
if extractor_result_file_name
|
176
|
-
|
91
|
+
create_sexp "#{wrapper_name}.to_xml.write(open('result_of_exported_extractor.xml', 'w'), 1)"
|
177
92
|
else
|
178
|
-
|
93
|
+
create_sexp "#{wrapper_name}.to_xml.write($stdout, 1)"
|
179
94
|
end
|
180
95
|
end
|
181
96
|
|
182
|
-
def self.
|
183
|
-
#
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
row.gsub!(/\{\s+/) {"{P."}
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
|
202
|
-
def self.create_name_to_xpath_map(pattern)
|
203
|
-
puts " Cereating mapping for: #{pattern.name}"
|
204
|
-
@name_to_xpath_map[pattern.name] = []
|
205
|
-
pattern.filters.each do |filter|
|
206
|
-
@name_to_xpath_map[pattern.name] << filter.xpath if pattern.filters[0].xpath != nil
|
207
|
-
end
|
208
|
-
pattern.children.each {|child| create_name_to_xpath_map child}
|
209
|
-
if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
210
|
-
puts pattern.name
|
211
|
-
puts "-------"
|
212
|
-
puts pattern.evaluation_context.extractor.get_detail_pattern_relations.each {|k,v|
|
213
|
-
if k.include? pattern
|
214
|
-
v.parent.children.each do |child|
|
215
|
-
create_name_to_xpath_map child
|
216
|
-
end
|
97
|
+
def self.export_extractor(root_pattern, wrapper_name)
|
98
|
+
# filter actions before and after pattern
|
99
|
+
pre_pattern_sexp = []
|
100
|
+
post_pattern_sexp = []
|
101
|
+
pattern_skipped = false
|
102
|
+
actions = ['next_page', *NavigationActions::KEYWORDS]
|
103
|
+
|
104
|
+
root_pattern.source_proc.to_sexp[3][1..-1].each do |sexp|
|
105
|
+
get_call = lambda { |sexp|
|
106
|
+
if sexp[0] == :fcall
|
107
|
+
return sexp[1].to_s
|
108
|
+
elsif sexp[0] == :iter || sexp[0] == :call
|
109
|
+
return get_call.call(sexp[1])
|
110
|
+
else
|
111
|
+
return nil
|
217
112
|
end
|
218
113
|
}
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
114
|
+
call = get_call.call(sexp)
|
115
|
+
if(call.nil? || actions.index(call) != nil)
|
116
|
+
if !pattern_skipped
|
117
|
+
pre_pattern_sexp.push(sexp)
|
118
|
+
else
|
119
|
+
post_pattern_sexp.push(sexp)
|
120
|
+
end
|
121
|
+
else
|
122
|
+
raise "Second pattern tree found while exporting." if pattern_skipped
|
123
|
+
pattern_skipped = true
|
124
|
+
end
|
223
125
|
end
|
126
|
+
|
127
|
+
# build extractor content
|
128
|
+
inner_block = [:block]
|
129
|
+
inner_block.push([:block, *pre_pattern_sexp])
|
130
|
+
inner_block.push([:block, export_pattern(root_pattern)])
|
131
|
+
inner_block.push([:block, *post_pattern_sexp])
|
132
|
+
|
133
|
+
# build extractor
|
134
|
+
[:block, [:lasgn, wrapper_name, [:iter, [:call, [:colon2, [:const, :Scrubyt], :Extractor], :define], nil, inner_block]]]
|
224
135
|
end
|
225
136
|
|
226
|
-
def self.
|
227
|
-
|
228
|
-
return if !@full_definition.include? "P.#{name}"
|
229
|
-
parens = @full_definition.scan(/P.#{name}\s*\((.+?)\)/)
|
230
|
-
if parens.empty?
|
231
|
-
full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
|
232
|
-
else
|
233
|
-
full_line = parens[0][0]
|
234
|
-
end
|
235
|
-
examples = full_line.split(",")
|
236
|
-
examples.reject! {|exa| exa.strip!; exa[0..0] != %q{"} && exa[0..0] != %q{'} }
|
237
|
-
all_xpaths = ""
|
238
|
-
examples.each do |e|
|
239
|
-
index = examples.index(e)
|
240
|
-
xpath = xpaths[index]
|
241
|
-
return if xpath == nil
|
242
|
-
all_xpaths += ", " if index > 0
|
243
|
-
all_xpaths += '"' + xpath + '"'
|
244
|
-
end
|
245
|
-
replacing_xpath = full_line.include?('{') ? "P.#{name}('#{all_xpaths}')" :
|
246
|
-
"P.#{name} #{all_xpaths}"
|
247
|
-
optional_paren_escaped = parens.empty? ? '' : '\('
|
248
|
-
optional_paren = parens.empty? ? '' : '('
|
249
|
-
@full_definition.sub!(/P\.#{name}\s*#{optional_paren_escaped}#{left_delimiter}(.*)#{right_delimiter}/) do
|
250
|
-
@name_to_xpath_map.delete("#{name}")
|
251
|
-
optional_paren + replacing_xpath
|
252
|
-
end
|
137
|
+
def self.export_pattern(root_pattern)
|
138
|
+
root_pattern.children[0].to_sexp
|
253
139
|
end
|
254
|
-
|
255
140
|
end
|
256
141
|
end
|
@@ -56,7 +56,7 @@ require 'set'
|
|
56
56
|
private
|
57
57
|
def self.ensure_presence_of_pattern(pattern)
|
58
58
|
#holds the name of those child patterns which have to be present as children of the input parameter
|
59
|
-
epop_names = pattern.
|
59
|
+
epop_names = pattern.constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
|
60
60
|
return if epop_names.empty?
|
61
61
|
#all_parent_values holds instances extracted by pattern
|
62
62
|
all_parent_values = []
|
@@ -95,8 +95,9 @@ private
|
|
95
95
|
end
|
96
96
|
|
97
97
|
def self.check_ancestors(parent_value, all_child_values)
|
98
|
-
parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
|
99
|
-
|
98
|
+
parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child } if
|
99
|
+
parent_value.is_a? Hpricot::Elem
|
100
|
+
parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem } if parent_value.is_a? Hpricot::Elem
|
100
101
|
end
|
101
102
|
|
102
103
|
def self.remove_multiple_filter_duplicates_intern(pattern)
|
@@ -1,24 +1,23 @@
|
|
1
|
-
module Scrubyt
|
1
|
+
module Scrubyt
|
2
2
|
##
|
3
3
|
#=<tt>Represents the results of a pattern</tt>
|
4
4
|
class Result
|
5
5
|
attr_reader :childmap, :instances
|
6
|
-
|
6
|
+
|
7
7
|
def initialize
|
8
8
|
@childmap ||= []
|
9
9
|
end
|
10
|
-
|
10
|
+
|
11
11
|
def add_result(source, result)
|
12
12
|
@childmap.each do |hash|
|
13
13
|
if hash.keys[0] == source
|
14
|
-
return if hash[source] == nil
|
15
14
|
hash[source] << result if !hash[source].include? result
|
16
15
|
return
|
17
16
|
end
|
18
17
|
end
|
19
|
-
@childmap << {source => [result]}
|
18
|
+
@childmap << {source => [result]}
|
20
19
|
end
|
21
|
-
|
20
|
+
|
22
21
|
def lookup(last_result)
|
23
22
|
@childmap.each do |hashes|
|
24
23
|
hashes.each { |key, value| return value if (key == last_result) }
|
@@ -33,12 +32,12 @@ end#end of module Scrubyt
|
|
33
32
|
# root
|
34
33
|
# source: nil
|
35
34
|
# childmap: [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
|
36
|
-
|
35
|
+
|
37
36
|
#table
|
38
37
|
# source: doc1
|
39
38
|
# childmap [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
|
40
|
-
|
39
|
+
|
41
40
|
#row
|
42
41
|
# source: table1s1, table2s1, table3s1
|
43
42
|
# childmap: [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row3s1, row5s1]},
|
44
|
-
# {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row3s2, row5s2]}]
|
43
|
+
# {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row3s2, row5s2]}]
|
@@ -12,36 +12,69 @@ module Scrubyt
|
|
12
12
|
root = REXML::Element.new('root')
|
13
13
|
doc.add_element(root)
|
14
14
|
all_extracted_docs = pattern.last_result
|
15
|
-
all_extracted_docs.each do |lr|
|
15
|
+
[all_extracted_docs].flatten.each do |lr|
|
16
16
|
pattern.last_result = lr
|
17
|
-
to_xml_recursive(pattern, root)
|
17
|
+
to_xml_recursive(pattern, root)
|
18
18
|
end
|
19
19
|
remove_empty_leaves(doc)
|
20
20
|
@@last_doc = doc
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
def self.remove_empty_leaves(node)
|
24
24
|
node.remove if node.elements.empty? && node.text == nil
|
25
25
|
node.elements.each {|child| remove_empty_leaves child }
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
##
|
29
29
|
#Output the text of the pattern; If this pattern is a tree, collect the text from its
|
30
30
|
#result instance node; otherwise rely on the last_result
|
31
|
+
#TODO: throw this away!!!
|
31
32
|
def self.to_text(pattern)
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
33
|
+
last_result = pattern.last_result
|
34
|
+
result = ""
|
35
|
+
if pattern.type == :tree
|
36
|
+
last_result.traverse_text { |t| result += t.to_s }
|
37
|
+
else
|
38
|
+
result = last_result
|
39
|
+
end
|
40
|
+
result
|
41
|
+
end
|
42
|
+
|
43
|
+
def self.to_csv(pattern)
|
44
|
+
result = []
|
45
|
+
flat_csv_inner = lambda {|e, parts|
|
46
|
+
content = e.text || ''
|
47
|
+
parts << content if ((e.is_a? REXML::Element) && content != '')
|
48
|
+
e.children.each {|c| flat_hash_inner.call(c, parts) if c.is_a? REXML::Element }
|
49
|
+
parts
|
50
|
+
}
|
51
|
+
to_xml(pattern).root.elements['/root'].each {|e| result << flat_csv_inner.call(e, []) }
|
52
|
+
(result.map! {|a| a.join(',')}).join("\n")
|
53
|
+
end
|
54
|
+
|
55
|
+
def self.to_hash(pattern)
|
56
|
+
result = []
|
57
|
+
flat_hash_inner = lambda {|e, parts|
|
58
|
+
content = e.text || ''
|
59
|
+
if ((e.is_a? REXML::Element) && content != '')
|
60
|
+
if parts[e.local_name]
|
61
|
+
parts[e.local_name] = parts[e.local_name] + "," + content
|
62
|
+
else
|
63
|
+
parts[e.local_name] = content
|
64
|
+
end
|
65
|
+
end
|
66
|
+
e.children.each {|c| flat_hash_inner.call(c, parts) if c.is_a? REXML::Element }
|
67
|
+
parts
|
68
|
+
}
|
69
|
+
to_xml(pattern).root.elements['/root'].each {|e| result << flat_hash_inner.call(e, {}) }
|
70
|
+
result
|
40
71
|
end
|
41
|
-
|
72
|
+
|
73
|
+
|
74
|
+
|
42
75
|
##
|
43
|
-
#Print some simple statistics on the extracted results, like the count of extracted
|
44
|
-
#instances by each pattern
|
76
|
+
#Print some simple statistics on the extracted results, like the count of extracted
|
77
|
+
#instances by each pattern
|
45
78
|
def self.print_statistics(pattern)
|
46
79
|
puts "\n" * 2
|
47
80
|
print_statistics_recursive(pattern,0)
|
@@ -54,20 +87,34 @@ private
|
|
54
87
|
childresults = child.result.lookup(child.parent.last_result)
|
55
88
|
#Output text for leaf nodes only; Maybe add possibility to customize this later
|
56
89
|
if (childresults == nil)
|
90
|
+
##TODO: is this needed for anything? I guess not! Drop it!!!!!!
|
91
|
+
#Update: it seems the blackbox tests are not passing because of this (?) so temporarily adding it back
|
92
|
+
##=begin
|
57
93
|
res = ""
|
58
|
-
child.parent.last_result.
|
59
|
-
|
60
|
-
|
94
|
+
if child.parent.last_result.is_a? String
|
95
|
+
res = child.parent.last_result
|
96
|
+
else
|
97
|
+
child.parent.last_result.traverse_text { |t| res += t.to_s }
|
98
|
+
end
|
99
|
+
if (child.parent.respond_to?(:size) && child.parent.size == 0) #TODO: respond_to should not be used here, it's just a quick workaround
|
100
|
+
element.text = SharedUtils.unescape_entities(res).strip unless element.parent.is_a? REXML::Document
|
61
101
|
end
|
62
102
|
next
|
103
|
+
##=end
|
63
104
|
end
|
105
|
+
|
64
106
|
generate_children(child, childresults, element)
|
65
107
|
end
|
66
108
|
end
|
67
|
-
|
109
|
+
|
68
110
|
def self.generate_children(child, childresults, element)
|
111
|
+
if childresults == nil
|
112
|
+
child_node = REXML::Element.new(child.name)
|
113
|
+
child_node.text = child.default
|
114
|
+
element.add_element(child_node)
|
115
|
+
else
|
69
116
|
childresults.size.times do |num|
|
70
|
-
child.last_result = childresults[num]
|
117
|
+
child.last_result = childresults[num]
|
71
118
|
res = ""
|
72
119
|
if child.last_result.instance_of? String
|
73
120
|
res = child.last_result
|
@@ -78,37 +125,29 @@ private
|
|
78
125
|
child.last_result.children.each { |c| element.add_element c }
|
79
126
|
end
|
80
127
|
end
|
81
|
-
child_node = REXML::Element.new(child.name)
|
82
|
-
child_node.text = (res
|
83
|
-
|
84
|
-
element.add_element(child_node) if child.type != Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
128
|
+
child_node = REXML::Element.new(child.name)
|
129
|
+
child_node.text = SharedUtils.unescape_entities(res).strip if child.write_text
|
130
|
+
element.add_element(child_node) if (child.type != :detail_page && child_node.text != '')
|
85
131
|
to_xml_recursive(child, child_node)
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
def self.write_text_criteria_met(pattern)
|
90
|
-
if (pattern.write_text == nil)
|
91
|
-
return pattern.children.size == 0
|
92
|
-
else
|
93
|
-
pattern.write_text
|
132
|
+
end
|
94
133
|
end
|
95
134
|
end
|
96
|
-
|
135
|
+
|
97
136
|
def self.print_statistics_recursive(pattern, depth)
|
98
|
-
if pattern.name != 'root'
|
99
|
-
if pattern.type ==
|
137
|
+
if pattern.name != 'root'
|
138
|
+
if pattern.type == :detail_page
|
100
139
|
pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
|
101
140
|
print_statistics_recursive(child, depth)
|
102
|
-
|
141
|
+
end
|
103
142
|
else
|
104
|
-
count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
|
143
|
+
count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
|
105
144
|
puts((' ' * "#{depth}".to_i) + "#{pattern.name} extracted #{count} instances.")
|
106
145
|
end
|
107
146
|
end
|
108
|
-
|
147
|
+
|
109
148
|
pattern.children.each do |child|
|
110
149
|
print_statistics_recursive(child, depth + 4)
|
111
|
-
end
|
112
|
-
|
113
|
-
|
114
|
-
end #end of module Scrubyt
|
150
|
+
end
|
151
|
+
end#end of method print_statistics_recursive
|
152
|
+
end #end of class ResultDumper
|
153
|
+
end #end of module Scrubyt
|