scrubyt 0.2.3 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +30 -0
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +5 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +13 -2
- data/lib/scrubyt/core/navigation/navigation_actions.rb +4 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/filter.rb +35 -11
- data/lib/scrubyt/core/scraping/pattern.rb +29 -22
- data/lib/scrubyt/core/scraping/result_indexer.rb +2 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +44 -22
- data/lib/scrubyt/core/shared/extractor.rb +111 -15
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +67 -0
- data/lib/scrubyt/output/export.rb +69 -22
- data/lib/scrubyt/output/result.rb +1 -0
- data/lib/scrubyt/output/result_dumper.rb +26 -7
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/shared_utils.rb +45 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +23 -0
- data/lib/scrubyt/utils/xpathutils.rb +43 -92
- data/test/unittests/simple_example_lookup_test.rb +68 -0
- data/test/unittests/xpathutils_test.rb +0 -13
- metadata +9 -3
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'logger'
|
2
1
|
require 'open-uri'
|
3
2
|
require 'rubygems'
|
4
3
|
require 'mechanize'
|
@@ -17,6 +16,11 @@ module Scrubyt
|
|
17
16
|
#The definition of the extractor is passed through this method
|
18
17
|
def self.define(mode=nil, &extractor_definition)
|
19
18
|
@@mode = mode
|
19
|
+
#We are keeping the relations between the detail patterns and their root patterns
|
20
|
+
@@detail_extractor_to_pattern_name = {}
|
21
|
+
@@detail_pattern_relations = {}
|
22
|
+
#root pattern -> URIBuilder mapping
|
23
|
+
@@next_patterns = {}
|
20
24
|
mode_name = (mode == :production ? 'Production' : 'Learning')
|
21
25
|
puts "[MODE] #{mode_name}"
|
22
26
|
NavigationActions.new
|
@@ -41,6 +45,47 @@ module Scrubyt
|
|
41
45
|
root_pattern
|
42
46
|
end
|
43
47
|
|
48
|
+
#Evaluate a subextractor (i.e. an extractor on a detail page).
|
49
|
+
#The url passed to this function is automatically loaded.
|
50
|
+
#The definition of the subextractor is passed as a block
|
51
|
+
#
|
52
|
+
#!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
|
53
|
+
def self.evaluate_subextractor(url, parent_pattern)
|
54
|
+
if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
|
55
|
+
detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]].parent
|
56
|
+
detail_root.result = Result.new
|
57
|
+
detail_root.last_result = nil
|
58
|
+
@@original_evaluation_context.push @@evaluation_context
|
59
|
+
@@evaluation_context = EvaluationContext.new
|
60
|
+
@@evaluation_context.clear_sources_and_sinks detail_root
|
61
|
+
FetchAction.restore_host_name
|
62
|
+
fetch url
|
63
|
+
@@evaluation_context.extractor = self
|
64
|
+
@@evaluation_context.root_pattern = detail_root
|
65
|
+
@@evaluation_context.attach_current_document
|
66
|
+
evaluate_extractor detail_root
|
67
|
+
@@evaluation_context = @@original_evaluation_context.pop
|
68
|
+
detail_root.to_xml
|
69
|
+
else
|
70
|
+
@@original_evaluation_context ||= []
|
71
|
+
FetchAction.restore_host_name
|
72
|
+
@@original_evaluation_context.push @@evaluation_context
|
73
|
+
@@evaluation_context = EvaluationContext.new
|
74
|
+
fetch url
|
75
|
+
evaluated_extractor = (class_eval(&parent_pattern.referenced_extractor))
|
76
|
+
root_pattern = evaluated_extractor.parent
|
77
|
+
@@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern.children[0]
|
78
|
+
@@evaluation_context.setup_examples
|
79
|
+
evaluate_extractor(root_pattern)
|
80
|
+
#Apply all postprocess steps
|
81
|
+
PostProcessor.apply_post_processing(root_pattern)
|
82
|
+
#Return the root pattern
|
83
|
+
#puts "Extracted detail page"
|
84
|
+
@@evaluation_context = @@original_evaluation_context.pop
|
85
|
+
root_pattern.to_xml
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
44
89
|
#build the current wrapper
|
45
90
|
def self.method_missing(method_name, *args, &block)
|
46
91
|
if NavigationActions::KEYWORDS.include? method_name.to_s
|
@@ -48,22 +93,25 @@ module Scrubyt
|
|
48
93
|
return
|
49
94
|
end
|
50
95
|
pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
|
96
|
+
check_if_shortcut_pattern(pattern)
|
97
|
+
check_if_detail_page(pattern, args)
|
51
98
|
pattern.evaluation_context = @@evaluation_context
|
52
99
|
if @parent == nil
|
53
100
|
if method_name.to_s == 'next_page'
|
54
|
-
@@evaluation_context.
|
55
|
-
@@
|
56
|
-
|
101
|
+
@@evaluation_context.setup_uri_builder(pattern, args)
|
102
|
+
@@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
|
103
|
+
p @@last_root_pattern.children[0].name
|
57
104
|
return @@last_pattern
|
58
105
|
else
|
59
106
|
#Create a root pattern
|
60
107
|
root_pattern = Scrubyt::Pattern.new('root', :type => :root)
|
108
|
+
@@last_root_pattern = root_pattern
|
61
109
|
root_pattern.evaluation_context = @@evaluation_context
|
62
110
|
@@evaluation_context.root_pattern = root_pattern
|
63
111
|
@@evaluation_context.extractor = self
|
64
112
|
#add the currently active document to the root pattern
|
65
113
|
@@evaluation_context.attach_current_document
|
66
|
-
@@evaluation_context.root_pattern.add_child_pattern(pattern)
|
114
|
+
@@evaluation_context.root_pattern.add_child_pattern(pattern)
|
67
115
|
@@evaluation_context.block_count = 0
|
68
116
|
end
|
69
117
|
else
|
@@ -80,28 +128,76 @@ module Scrubyt
|
|
80
128
|
end
|
81
129
|
@@last_pattern = pattern
|
82
130
|
end
|
83
|
-
|
84
|
-
#
|
85
|
-
#
|
86
|
-
|
87
|
-
|
131
|
+
|
132
|
+
#Shortcut patterns, as their name says, are a shortcut for creating patterns
|
133
|
+
#from predefined rules; for example:
|
134
|
+
#
|
135
|
+
# detail_url
|
136
|
+
#
|
137
|
+
# is equivalent to
|
138
|
+
#
|
139
|
+
# detail_url 'href', type => :attribute
|
140
|
+
#
|
141
|
+
#i.e. the system figures out on its own that because of the postfix, the
|
142
|
+
#example should be looked up (but it should never override the user input!)
|
143
|
+
#another example (will be available later):
|
144
|
+
#
|
145
|
+
# every_img
|
146
|
+
#
|
147
|
+
# is equivalent to
|
148
|
+
#
|
149
|
+
# every_img '//img'
|
150
|
+
#
|
151
|
+
def self.check_if_shortcut_pattern(pattern)
|
152
|
+
case pattern.name
|
153
|
+
when /.+_url/
|
154
|
+
#make sure that we are not overriding the user's settings
|
155
|
+
if !pattern.examples
|
156
|
+
pattern.filters[0].example = 'href'
|
157
|
+
pattern.type = Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
|
158
|
+
end
|
159
|
+
end
|
88
160
|
end
|
89
161
|
|
162
|
+
#Check whether the currently created pattern is a detail pattern (i.e. it references
|
163
|
+
#a subextractor). Also check if the currently created pattern is
|
164
|
+
#an ancestor of a detail pattern , and store this in a hash if yes (to be able to
|
165
|
+
#traverse the pattern structure on detail pages as well).
|
166
|
+
def self.check_if_detail_page(pattern, args)
|
167
|
+
return if args.size == 0
|
168
|
+
return if !args[0].is_a? Hash
|
169
|
+
return if !args[0][:references]
|
170
|
+
referenced_extractor = args[0][:references]
|
171
|
+
pattern.type = Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
172
|
+
pattern.referenced_extractor = referenced_extractor
|
173
|
+
@@detail_extractor_to_pattern_name[referenced_extractor] ||= []
|
174
|
+
@@detail_extractor_to_pattern_name[referenced_extractor] = @@detail_extractor_to_pattern_name[referenced_extractor] << pattern
|
175
|
+
end
|
176
|
+
|
90
177
|
def self.get_hpricot_doc
|
91
178
|
NavigationActions.get_hpricot_doc
|
92
179
|
end
|
93
180
|
|
181
|
+
def self.get_current_doc_url
|
182
|
+
NavigationActions.get_current_doc_url
|
183
|
+
end
|
184
|
+
|
185
|
+
def self.get_detail_pattern_relations
|
186
|
+
@@detail_pattern_relations
|
187
|
+
end
|
188
|
+
|
94
189
|
def self.get_mode
|
95
190
|
@@mode
|
96
|
-
end
|
191
|
+
end
|
192
|
+
|
97
193
|
private
|
98
|
-
def self.evaluate_extractor(root_pattern)
|
99
|
-
if @@
|
194
|
+
def self.evaluate_extractor(root_pattern)
|
195
|
+
if @@next_patterns[root_pattern]
|
100
196
|
current_page_count = 1
|
101
197
|
loop do
|
102
198
|
really_evaluate_extractor(root_pattern)
|
103
|
-
break if (@@
|
104
|
-
current_page_count += 1 if @@
|
199
|
+
break if (@@next_patterns[root_pattern].limit == current_page_count || @@evaluation_context.crawl_to_new_page(root_pattern, @@next_patterns[root_pattern]) == nil)
|
200
|
+
current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
|
105
201
|
end
|
106
202
|
else
|
107
203
|
really_evaluate_extractor(root_pattern)
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Build URIs from different parameters</tt>
  #
  #When crawling to further pages which are machine-generated
  #(most typically "next" pages) we need to detect the pattern
  #and generate the next URI based on the detected rule. This
  #class provides methods to build URIs based on different criteria.
  #
  #The other possibility is to use constant objects ('Next' links,
  #or image links (like right arrow) pointing to the next page).
  #URIBuilder supports both possibilities.
  class URIBuilder
    attr_reader :next_page_example, :next_page_pattern, :limit, :next_param, :next_increment, :increment, :current_uri

    #Set up the builder in one of two modes, selected by args[0]:
    #
    #* URL mode - args[0] is a String starting with 'http'. args[0] and args[1]
    #  are two consecutive page URLs; the text by which args[1] differs from
    #  args[0] is split at '=' into the paging parameter (next_param) and its
    #  numeric value (next_increment). An optional args[2] hash may carry :limit.
    #* Link mode - anything else. pattern is stored as next_page_pattern,
    #  args[0] as next_page_example, and an optional args[1] hash may carry :limit.
    #
    #Raises ArgumentError in URL mode when no difference between the two URLs
    #can be located.
    def initialize(pattern,args)
      first = args[0]
      #The String check keeps non-String arguments (hashes etc.) out of the
      #regexp match, which is not defined for arbitrary objects on newer Rubies
      if first.is_a?(String) && first =~ /^http.+/
        #Figure out how are the URLs generated based on the next URL
        get_next_param(string_diff(first, args[1]))
        @increment = 0
        @current_uri = args[1]
        @limit = args[2][:limit] if args.size > 2
      else
        #Otherwise, do this in the 'classic' way (by clicking on the "next" link)
        @next_page_pattern = pattern
        @next_page_example = first
        @limit = args[1][:limit] if args.size > 1
      end
    end

    #Used when generating the next URI (as opposed to 'clicking' the next link).
    #Only meaningful for a builder created in URL mode.
    #
    #The first call returns the example "next" URI unchanged; every later call
    #replaces the current value of the paging parameter with the next one and
    #returns the updated URI.
    def generate_next_uri
      if @increment == 0
        #First call: the example URI already points to the next page
        @increment = @next_increment
        #A starting value of 2 is taken to be a page number (2, 3, 4, ...)
        #rather than a step width, so continue in steps of 1. This has to happen
        #before the next advance, otherwise the counter jumps from 2 to 4 and
        #the value to replace is never found in the URI
        @next_increment = 1 if @next_increment == 2
        return @current_uri
      end
      previous = @increment
      @increment += @next_increment
      #Plain string matching is used on purpose: the parameter may contain
      #characters such as '?' which are not valid as a regexp on their own
      if !@current_uri.include?(@next_param)
        @current_uri += (@next_param + '=' + @next_increment.to_s)
      else
        @current_uri = @current_uri.sub("#{@next_param}=#{previous}") do
          "#{@next_param}=#{@increment}"
        end
      end
    end

    private
    #Split a 'param=value' fragment into @next_param and the integer @next_increment
    def get_next_param(pair)
      param_and_value = pair.split('=')
      @next_param = param_and_value[0]
      @next_increment = param_and_value[1].to_i
    end

    #Index of the first character of s2 which differs from the character at the
    #same position in s1 (a missing character in s1 counts as different);
    #nil if s2 has no such character
    def find_difference_index(s1,s2)
      s1_chars = s1.scan(/./)
      s2.scan(/./).each_with_index do |char, i|
        return i if char != s1_chars[i]
      end
      nil
    end

    #The part of s2 which remains after cutting off the prefix and the suffix
    #it shares with s1. Raises ArgumentError if no difference can be located
    #from either end.
    def string_diff(s1,s2)
      head = find_difference_index(s1, s2)
      tail = find_difference_index(s1.reverse, s2.reverse)
      if head.nil? || tail.nil?
        raise ArgumentError, "no usable difference between #{s1.inspect} and #{s2.inspect} to derive the paging parameter from"
      end
      s2[head..s2.size-tail-1]
    end #end of method string_diff
  end #end of class URIBuilder
end #end of module Scrubyt
|
66
|
+
|
67
|
+
|
@@ -70,6 +70,7 @@ module Scrubyt
|
|
70
70
|
output_file = output_file_name == nil ? open("#{wrapper_name}_extractor_export.rb", 'w') :
|
71
71
|
open(output_file_name, 'w')
|
72
72
|
export_header(output_file)
|
73
|
+
export_subextractors(contents, pattern, output_file)
|
73
74
|
export_extractor(contents, pattern, output_file)
|
74
75
|
export_footer(output_file, wrapper_name, extractor_result_file_name)
|
75
76
|
cleanup_result
|
@@ -85,8 +86,27 @@ private
|
|
85
86
|
|
86
87
|
def self.cleanup_result
|
87
88
|
@result.gsub!('P.') {}
|
89
|
+
CompoundExample::DESCRIPTORS.each {|d|
|
90
|
+
@result.gsub!(/,\s*:#{d.to_s}.+?'.+?'/) {}
|
91
|
+
}
|
88
92
|
end
|
89
93
|
|
94
|
+
|
95
|
+
def self.export_subextractors(contents, pattern, output_file)
|
96
|
+
all_subextractor_code = contents.scan(/.+=\s+lambda.+Extractor\.define/m)
|
97
|
+
return if all_subextractor_code.empty?
|
98
|
+
all_subextractor_code = all_subextractor_code[0].split("\n")
|
99
|
+
pure_subextractor_code = []
|
100
|
+
meaningful_code = false
|
101
|
+
all_subextractor_code.each do |sec|
|
102
|
+
meaningful_code = true if sec =~ /lambda/
|
103
|
+
meaningful_code = false if sec =~ /Extractor.define/
|
104
|
+
pure_subextractor_code << sec if meaningful_code
|
105
|
+
end
|
106
|
+
add_P pure_subextractor_code
|
107
|
+
substitute_examples_with_XPaths(pattern,pure_subextractor_code)
|
108
|
+
end
|
109
|
+
|
90
110
|
#OK, I have to admit: this function is powered by voodoo magic. Lots of voodoo magic.
|
91
111
|
#Piles of tons of heaps of voodoo magic :-)
|
92
112
|
#
|
@@ -111,26 +131,16 @@ private
|
|
111
131
|
#end (to close the block of the extractor definition)
|
112
132
|
count = pattern.evaluation_context.block_count + 1
|
113
133
|
#Construct the extractor definition matching regexp based on the number of ends
|
114
|
-
definition = contents.scan(/Extractor\.define(?:.*?(?:\}
|
134
|
+
definition = contents.scan(/Extractor\.define(?:.*?(?:\}|\s+end)){#{count.to_s}}/m)
|
115
135
|
#Since the regexp matching the extractor definition was multiline, get the first
|
116
136
|
#line separately and patch it in!
|
117
137
|
rows = definition[0].split("\n")
|
118
|
-
|
119
|
-
#patterns could be matched very easily from the extractor definition (because they begun
|
120
|
-
#with 'P.'). Now that P has been removed, mimick it!
|
121
|
-
rows.each do |row|
|
122
|
-
#Do not prepend P. to comments and empty lines
|
123
|
-
next if (row.strip =~ /^#/ || row.strip == '')
|
124
|
-
#Do not prepend P. to any of the reserved keywords
|
125
|
-
jump_to_next = false
|
126
|
-
NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
|
127
|
-
next if jump_to_next
|
128
|
-
#Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
|
129
|
-
row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
|
130
|
-
#Don't forget also the stuff in parentheses!
|
131
|
-
row.gsub!(/\{\s+/) {"{P."}
|
132
|
-
end
|
138
|
+
add_P(rows)
|
133
139
|
rows[0] = first_line
|
140
|
+
substitute_examples_with_XPaths(pattern,rows)
|
141
|
+
end
|
142
|
+
|
143
|
+
def self.substitute_examples_with_XPaths(pattern,rows)
|
134
144
|
#@full_definition holds the original definition (at this point, later on it will be
|
135
145
|
#gsub!bed and all)
|
136
146
|
@full_definition = rows.join("\n")
|
@@ -146,9 +156,10 @@ private
|
|
146
156
|
replace_example_with_xpath(name, xpaths, %q{'})
|
147
157
|
end
|
148
158
|
#Finally, add XPaths to pattern which had no example at the beginning (the XPath was
|
149
|
-
#generated from the child patterns
|
159
|
+
#generated from the child patterns)
|
150
160
|
@name_to_xpath_map.each do |name, xpaths|
|
151
161
|
xpaths.reverse.each do |xpath|
|
162
|
+
next if !@full_definition.include? "P.#{name}"
|
152
163
|
comma = @full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0].sub('do'){}.strip == '' ? '' : ','
|
153
164
|
if (@full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]).include?('{')
|
154
165
|
@full_definition.sub!("P.#{name}") {"P.#{name}('#{xpath}')"}
|
@@ -157,8 +168,8 @@ private
|
|
157
168
|
end
|
158
169
|
end
|
159
170
|
end
|
160
|
-
@result += @full_definition
|
161
|
-
end
|
171
|
+
@result += @full_definition
|
172
|
+
end
|
162
173
|
|
163
174
|
def self.export_footer(output_file, wrapper_name, extractor_result_file_name)
|
164
175
|
if extractor_result_file_name
|
@@ -167,20 +178,56 @@ private
|
|
167
178
|
@result += "\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
|
168
179
|
end
|
169
180
|
end
|
181
|
+
|
182
|
+
def self.add_P(rows)
|
183
|
+
#Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
|
184
|
+
#patterns could be matched very easily from the extractor definition (because they begun
|
185
|
+
#with 'P.'). Now that P has been removed, mimick it!
|
186
|
+
rows.each do |row|
|
187
|
+
#Do not prepend P. to comments and empty lines
|
188
|
+
next if (row.strip =~ /^#/ || row.strip == '')
|
189
|
+
#Do not prepend P. to any of the reserved keywords
|
190
|
+
jump_to_next = false
|
191
|
+
NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
|
192
|
+
jump_to_next = true if row =~ /lambda/
|
193
|
+
next if jump_to_next
|
194
|
+
#Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
|
195
|
+
row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
|
196
|
+
#Don't forget also the stuff in parentheses!
|
197
|
+
row.gsub!(/\{\s+/) {"{P."}
|
198
|
+
end
|
199
|
+
end
|
170
200
|
|
171
201
|
|
172
202
|
def self.create_name_to_xpath_map(pattern)
|
203
|
+
puts " Cereating mapping for: #{pattern.name}"
|
173
204
|
@name_to_xpath_map[pattern.name] = []
|
174
205
|
pattern.filters.each do |filter|
|
175
206
|
@name_to_xpath_map[pattern.name] << filter.xpath if pattern.filters[0].xpath != nil
|
207
|
+
end
|
208
|
+
pattern.children.each {|child| create_name_to_xpath_map child}
|
209
|
+
if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
210
|
+
puts pattern.name
|
211
|
+
puts "-------"
|
212
|
+
puts pattern.evaluation_context.extractor.get_detail_pattern_relations.each {|k,v|
|
213
|
+
if k.include? pattern
|
214
|
+
v.parent.children.each do |child|
|
215
|
+
create_name_to_xpath_map child
|
216
|
+
end
|
217
|
+
end
|
218
|
+
}
|
219
|
+
|
220
|
+
#pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
|
221
|
+
# create_name_to_xpath_map child
|
222
|
+
#end
|
176
223
|
end
|
177
|
-
|
178
|
-
end
|
224
|
+
end
|
179
225
|
|
180
226
|
def self.replace_example_with_xpath(name, xpaths, left_delimiter, right_delimiter=left_delimiter)
|
181
227
|
return if name=='root'
|
228
|
+
return if !@full_definition.include? "P.#{name}"
|
182
229
|
parens = @full_definition.scan(/P.#{name}\s*\((.+?)\)/)
|
183
|
-
if parens.empty?
|
230
|
+
if parens.empty?
|
184
231
|
full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
|
185
232
|
else
|
186
233
|
full_line = parens[0][0]
|
@@ -50,7 +50,7 @@ module Scrubyt
|
|
50
50
|
|
51
51
|
private
|
52
52
|
def self.to_xml_recursive(pattern, element)
|
53
|
-
pattern.children.each do |child|
|
53
|
+
pattern.children.each do |child|
|
54
54
|
childresults = child.result.lookup(child.parent.last_result)
|
55
55
|
#Output text for leaf nodes only; Maybe add possibility to customize this later
|
56
56
|
if (childresults == nil)
|
@@ -72,19 +72,38 @@ private
|
|
72
72
|
if child.last_result.instance_of? String
|
73
73
|
res = child.last_result
|
74
74
|
else
|
75
|
-
|
75
|
+
if child.last_result.respond_to? 'traverse_text'
|
76
|
+
child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
|
77
|
+
else
|
78
|
+
child.last_result.children.each { |c| element.add_element c }
|
79
|
+
end
|
76
80
|
end
|
77
81
|
child_node = REXML::Element.new(child.name)
|
78
|
-
child_node.text = (res.gsub(' '){' '}).strip if (child
|
79
|
-
|
82
|
+
child_node.text = (res.gsub(' '){' '}).strip if write_text_criteria_met(child)
|
83
|
+
|
84
|
+
element.add_element(child_node) if child.type != Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
80
85
|
to_xml_recursive(child, child_node)
|
81
86
|
end
|
82
87
|
end
|
83
88
|
|
89
|
+
def self.write_text_criteria_met(pattern)
|
90
|
+
if (pattern.write_text == nil)
|
91
|
+
return pattern.children.size == 0
|
92
|
+
else
|
93
|
+
pattern.write_text
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
84
97
|
def self.print_statistics_recursive(pattern, depth)
|
85
|
-
if pattern.name != 'root'
|
86
|
-
|
87
|
-
|
98
|
+
if pattern.name != 'root'
|
99
|
+
if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
100
|
+
pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
|
101
|
+
print_statistics_recursive(child, depth)
|
102
|
+
end
|
103
|
+
else
|
104
|
+
count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
|
105
|
+
puts((' ' * "#{depth}".to_i) + "#{pattern.name} extracted #{count} instances.")
|
106
|
+
end
|
88
107
|
end
|
89
108
|
|
90
109
|
pattern.children.each do |child|
|