scrubyt 0.2.3 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +30 -0
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +5 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +13 -2
- data/lib/scrubyt/core/navigation/navigation_actions.rb +4 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/filter.rb +35 -11
- data/lib/scrubyt/core/scraping/pattern.rb +29 -22
- data/lib/scrubyt/core/scraping/result_indexer.rb +2 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +44 -22
- data/lib/scrubyt/core/shared/extractor.rb +111 -15
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +67 -0
- data/lib/scrubyt/output/export.rb +69 -22
- data/lib/scrubyt/output/result.rb +1 -0
- data/lib/scrubyt/output/result_dumper.rb +26 -7
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/shared_utils.rb +45 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +23 -0
- data/lib/scrubyt/utils/xpathutils.rb +43 -92
- data/test/unittests/simple_example_lookup_test.rb +68 -0
- data/test/unittests/xpathutils_test.rb +0 -13
- metadata +9 -3
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'logger'
|
2
1
|
require 'open-uri'
|
3
2
|
require 'rubygems'
|
4
3
|
require 'mechanize'
|
@@ -17,6 +16,11 @@ module Scrubyt
|
|
17
16
|
#The definition of the extractor is passed through this method
|
18
17
|
def self.define(mode=nil, &extractor_definition)
|
19
18
|
@@mode = mode
|
19
|
+
#We are keeping the relations between the detail patterns and their root patterns
|
20
|
+
@@detail_extractor_to_pattern_name = {}
|
21
|
+
@@detail_pattern_relations = {}
|
22
|
+
#root pattern -> URIBuilder mapping
|
23
|
+
@@next_patterns = {}
|
20
24
|
mode_name = (mode == :production ? 'Production' : 'Learning')
|
21
25
|
puts "[MODE] #{mode_name}"
|
22
26
|
NavigationActions.new
|
@@ -41,6 +45,47 @@ module Scrubyt
|
|
41
45
|
root_pattern
|
42
46
|
end
|
43
47
|
|
48
|
+
#Evaluate a subextractor (i.e. an extractor on a detail page).
|
49
|
+
#The url passed to this function is automatically loaded.
|
50
|
+
#The definition of the subextractor is passed as a block
|
51
|
+
#
|
52
|
+
#!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
|
53
|
+
#Evaluate the subextractor referenced by +parent_pattern+ against +url+.
#
#The url is fetched, the detail extractor is evaluated on the fetched
#document and the XML of the resulting root pattern is returned.
#
#url:: address of the detail page to load
#parent_pattern:: the detail pattern; its +referenced_extractor+ is the
#                 block holding the subextractor definition
#
#While the detail page is processed a fresh EvaluationContext is installed;
#the previous one is kept on a stack and restored afterwards, even if
#fetching or evaluating raises.
def self.evaluate_subextractor(url, parent_pattern)
  subextractor = parent_pattern.referenced_extractor
  relation_key = @@detail_extractor_to_pattern_name[subextractor]
  #Stack of suspended contexts (detail pages may be nested)
  @@original_evaluation_context ||= []

  if @@detail_pattern_relations.keys.include? relation_key
    #This subextractor was already built once: reuse its pattern tree
    detail_root = @@detail_pattern_relations[relation_key].parent
    detail_root.result = Result.new
    detail_root.last_result = nil
    @@original_evaluation_context.push @@evaluation_context
    begin
      @@evaluation_context = EvaluationContext.new
      @@evaluation_context.clear_sources_and_sinks detail_root
      FetchAction.restore_host_name
      fetch url
      @@evaluation_context.extractor = self
      @@evaluation_context.root_pattern = detail_root
      @@evaluation_context.attach_current_document
      evaluate_extractor detail_root
    ensure
      #Always hand the outer context back, otherwise the caller would
      #continue with the detail page's context after an error
      @@evaluation_context = @@original_evaluation_context.pop
    end
    detail_root.to_xml
  else
    #First encounter: build the pattern tree by evaluating the definition block
    FetchAction.restore_host_name
    @@original_evaluation_context.push @@evaluation_context
    begin
      @@evaluation_context = EvaluationContext.new
      fetch url
      root_pattern = class_eval(&subextractor).parent
      #Remember the tree so that later detail pages take the branch above
      @@detail_pattern_relations[@@detail_extractor_to_pattern_name[subextractor]] = root_pattern.children[0]
      @@evaluation_context.setup_examples
      evaluate_extractor(root_pattern)
      #Apply all postprocess steps
      PostProcessor.apply_post_processing(root_pattern)
    ensure
      @@evaluation_context = @@original_evaluation_context.pop
    end
    root_pattern.to_xml
  end
end
|
88
|
+
|
44
89
|
#build the current wrapper
|
45
90
|
def self.method_missing(method_name, *args, &block)
|
46
91
|
if NavigationActions::KEYWORDS.include? method_name.to_s
|
@@ -48,22 +93,25 @@ module Scrubyt
|
|
48
93
|
return
|
49
94
|
end
|
50
95
|
pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
|
96
|
+
check_if_shortcut_pattern(pattern)
|
97
|
+
check_if_detail_page(pattern, args)
|
51
98
|
pattern.evaluation_context = @@evaluation_context
|
52
99
|
if @parent == nil
|
53
100
|
if method_name.to_s == 'next_page'
|
54
|
-
@@evaluation_context.
|
55
|
-
@@
|
56
|
-
|
101
|
+
@@evaluation_context.setup_uri_builder(pattern, args)
|
102
|
+
@@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
|
103
|
+
p @@last_root_pattern.children[0].name
|
57
104
|
return @@last_pattern
|
58
105
|
else
|
59
106
|
#Create a root pattern
|
60
107
|
root_pattern = Scrubyt::Pattern.new('root', :type => :root)
|
108
|
+
@@last_root_pattern = root_pattern
|
61
109
|
root_pattern.evaluation_context = @@evaluation_context
|
62
110
|
@@evaluation_context.root_pattern = root_pattern
|
63
111
|
@@evaluation_context.extractor = self
|
64
112
|
#add the currently active document to the root pattern
|
65
113
|
@@evaluation_context.attach_current_document
|
66
|
-
@@evaluation_context.root_pattern.add_child_pattern(pattern)
|
114
|
+
@@evaluation_context.root_pattern.add_child_pattern(pattern)
|
67
115
|
@@evaluation_context.block_count = 0
|
68
116
|
end
|
69
117
|
else
|
@@ -80,28 +128,76 @@ module Scrubyt
|
|
80
128
|
end
|
81
129
|
@@last_pattern = pattern
|
82
130
|
end
|
83
|
-
|
84
|
-
#
|
85
|
-
#
|
86
|
-
|
87
|
-
|
131
|
+
|
132
|
+
#Shortcut patterns, as their name says, are a shortcut for creating patterns
|
133
|
+
#from predefined rules; for example:
|
134
|
+
#
|
135
|
+
# detail_url
|
136
|
+
#
|
137
|
+
# is equivalent to
|
138
|
+
#
|
139
|
+
# detail_url 'href', type => :attribute
|
140
|
+
#
|
141
|
+
#i.e. the system figures out on its own that because of the postfix, the
|
142
|
+
#example should be looked up (but it should never override the user input!)
|
143
|
+
#another example (will be available later):
|
144
|
+
#
|
145
|
+
# every_img
|
146
|
+
#
|
147
|
+
# is equivalent to
|
148
|
+
#
|
149
|
+
# every_img '//img'
|
150
|
+
#
|
151
|
+
#Turn +pattern+ into an attribute pattern looking up 'href' when its name
#ends with the '_url' postfix and the user supplied no example.
#
#pattern:: the freshly created pattern; modified in place
#
#The name test is anchored to the end of the name, so only a real '_url'
#postfix triggers the shortcut (a name such as 'detail_url_list' does not).
#User supplied examples are never overridden.
def self.check_if_shortcut_pattern(pattern)
  return unless pattern.name =~ /.+_url\z/
  #make sure that we are not overriding the user's settings
  return if pattern.examples
  pattern.filters[0].example = 'href'
  pattern.type = Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
end
|
89
161
|
|
162
|
+
#Check whether the currently created pattern is a detail pattern (i.e. it references
|
163
|
+
#a subextractor). Also check if the currently created pattern is
|
164
|
+
#an ancestor of a detail pattern , and store this in a hash if yes (to be able to
|
165
|
+
#traverse the pattern structure on detail pages as well).
|
166
|
+
#Mark +pattern+ as a detail pattern when its first argument is an options
#hash carrying a :references entry, and register the pattern under the
#referenced extractor.
#
#pattern:: the pattern currently being created
#args:: the arguments the pattern was defined with
def self.check_if_detail_page(pattern, args)
  options = args[0]
  return unless options.is_a?(Hash) && options[:references]

  subextractor = options[:references]
  pattern.type = Scrubyt::Pattern::PATTERN_TYPE_DETAIL
  pattern.referenced_extractor = subextractor
  #Several patterns may reference the same subextractor, so collect them in a list
  (@@detail_extractor_to_pattern_name[subextractor] ||= []) << pattern
end
|
176
|
+
|
90
177
|
#Return whatever NavigationActions.get_hpricot_doc returns (plain delegation)
def self.get_hpricot_doc
  NavigationActions.get_hpricot_doc
end
|
93
180
|
|
181
|
+
#Return whatever NavigationActions.get_current_doc_url returns (plain delegation)
def self.get_current_doc_url
  NavigationActions.get_current_doc_url
end
|
184
|
+
|
185
|
+
#Reader for the hash that keeps the relations between the detail patterns
#and their root patterns
def self.get_detail_pattern_relations
  @@detail_pattern_relations
end
|
188
|
+
|
94
189
|
#Reader for the mode stored in the @@mode class variable
def self.get_mode
  @@mode
end
|
192
|
+
|
97
193
|
private
|
98
|
-
def self.evaluate_extractor(root_pattern)
|
99
|
-
if @@
|
194
|
+
def self.evaluate_extractor(root_pattern)
|
195
|
+
if @@next_patterns[root_pattern]
|
100
196
|
current_page_count = 1
|
101
197
|
loop do
|
102
198
|
really_evaluate_extractor(root_pattern)
|
103
|
-
break if (@@
|
104
|
-
current_page_count += 1 if @@
|
199
|
+
break if (@@next_patterns[root_pattern].limit == current_page_count || @@evaluation_context.crawl_to_new_page(root_pattern, @@next_patterns[root_pattern]) == nil)
|
200
|
+
current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
|
105
201
|
end
|
106
202
|
else
|
107
203
|
really_evaluate_extractor(root_pattern)
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Build URIs from different parameters</tt>
  #
  #When crawling to further pages which are machine-generated
  #(most typically "next" pages) we need to detect the pattern
  #and generate the next URI based on the detected rule. This
  #class provides methods to build URIs based on different criteria.
  #
  #The other possibility is to use constant objects ('Next' links,
  #or image links (like right arrow) pointing to the next page).
  #URIBuilder supports both possibilities.
  class URIBuilder
    attr_reader :next_page_example, :next_page_pattern, :limit, :next_param, :next_increment, :increment, :current_uri

    #pattern:: the 'next page' pattern (only stored in the link-clicking mode)
    #args:: either [first_uri, second_uri, {:limit => n}] - the rule is then
    #       derived from the difference of the two URIs - or
    #       [next_link_example, {:limit => n}]; the options hash is optional
    def initialize(pattern,args)
      if args[0].is_a?(String) && args[0] =~ /^http.+/
        #Figure out how are the URLs generated based on the next URL
        get_next_param(string_diff(args[0], args[1]))
        @increment = 0
        @current_uri = args[1]
        @limit = args[2][:limit] if args.size > 2
      else
        #Otherwise, do this in the 'classic' way (by clicking on the "next" link)
        @next_page_pattern = pattern
        @next_page_example = args[0]
        @limit = args[1][:limit] if args.size > 1
      end
    end

    #Used when generating the next URI (as opposed to 'clicking' the next link)
    #
    #The first call returns the URI the builder was created with; every
    #further call replaces the value of the detected parameter with the
    #next value and returns the updated URI.
    def generate_next_uri
      if @increment == 0
        #First call: the current URI already carries the detected value
        @increment = @next_increment
        return @current_uri
      end
      #A detected value of 2 is treated as a page number, i.e. a step of 1
      @next_increment = 1 if @next_increment == 2
      #The value present in the URI right now; it has to be remembered before
      #stepping, otherwise the substitution below looks for the wrong number
      value_in_uri = @increment
      @increment += @next_increment
      #The parameter comes from the URI verbatim and may start with '?'
      param = Regexp.escape(@next_param)
      if @current_uri !~ /#{param}/
        @current_uri += (@next_param + '=' + @next_increment.to_s)
      else
        @current_uri = @current_uri.sub(/#{param}=#{value_in_uri}/) do
          "#{@next_param}=#{@increment}"
        end
      end
    end

    private
    #Split a 'param=value' pair into the parameter name and its integer value
    def get_next_param(pair)
      param, value = pair.split('=')
      @next_param = param
      @next_increment = value.to_i
    end

    #Index of the first character of s2 that differs from s1; the length
    #of s2 if no character of s2 differs
    def find_difference_index(s1,s2)
      (0...s2.size).each do |i|
        return i if s1[i, 1] != s2[i, 1]
      end
      s2.size
    end

    #The part of s2 left after dropping the prefix and the suffix it shares with s1
    def string_diff(s1,s2)
      s2[find_difference_index(s1, s2)..s2.size-find_difference_index(s1.reverse, s2.reverse)-1]
    end #end of method string_diff
  end #end of class URIBuilder
end #end of module Scrubyt
|
66
|
+
|
67
|
+
|
@@ -70,6 +70,7 @@ module Scrubyt
|
|
70
70
|
output_file = output_file_name == nil ? open("#{wrapper_name}_extractor_export.rb", 'w') :
|
71
71
|
open(output_file_name, 'w')
|
72
72
|
export_header(output_file)
|
73
|
+
export_subextractors(contents, pattern, output_file)
|
73
74
|
export_extractor(contents, pattern, output_file)
|
74
75
|
export_footer(output_file, wrapper_name, extractor_result_file_name)
|
75
76
|
cleanup_result
|
@@ -85,8 +86,27 @@ private
|
|
85
86
|
|
86
87
|
#Strip the helper markup from @result: every 'P.' prefix, and every
#", :<descriptor> ... '...'" option whose name is listed in
#CompoundExample::DESCRIPTORS
def self.cleanup_result
  @result.gsub!('P.', '')
  CompoundExample::DESCRIPTORS.each do |descriptor|
    @result.gsub!(/,\s*:#{descriptor.to_s}.+?'.+?'/, '')
  end
end
|
89
93
|
|
94
|
+
|
95
|
+
#Export the subextractor definitions found in +contents+ (the text from the
#first 'name = lambda' assignment up to 'Extractor.define').
#
#contents:: source text of the extractor file
#pattern:: passed on to substitute_examples_with_XPaths
#output_file:: not used by this method
#
#Returns nil without doing anything if +contents+ holds no subextractor.
def self.export_subextractors(contents, pattern, output_file)
  matches = contents.scan(/.+=\s+lambda.+Extractor\.define/m)
  return if matches.empty?

  inside_lambda = false
  #Keep the rows from the first lambda on, and stop at the extractor definition
  lambda_rows = matches[0].split("\n").select do |line|
    inside_lambda = true if line =~ /lambda/
    inside_lambda = false if line =~ /Extractor.define/
    inside_lambda
  end
  add_P lambda_rows
  substitute_examples_with_XPaths(pattern, lambda_rows)
end
|
109
|
+
|
90
110
|
#OK, I have to admit: this function is powered by voodoo magic. Lots of voodoo magic.
|
91
111
|
#Piles of tons of heaps of voodoo magic :-)
|
92
112
|
#
|
@@ -111,26 +131,16 @@ private
|
|
111
131
|
#end (to close the block of the extractor definition)
|
112
132
|
count = pattern.evaluation_context.block_count + 1
|
113
133
|
#Construct the extractor definition matching regexp based on the number of ends
|
114
|
-
definition = contents.scan(/Extractor\.define(?:.*?(?:\}
|
134
|
+
definition = contents.scan(/Extractor\.define(?:.*?(?:\}|\s+end)){#{count.to_s}}/m)
|
115
135
|
#Since the regexp matching the extractor definition was multiline, get the first
|
116
136
|
#line separately and patch it in!
|
117
137
|
rows = definition[0].split("\n")
|
118
|
-
|
119
|
-
#patterns could be matched very easily from the extractor definition (because they begun
|
120
|
-
#with 'P.'). Now that P has been removed, mimick it!
|
121
|
-
rows.each do |row|
|
122
|
-
#Do not prepend P. to comments and empty lines
|
123
|
-
next if (row.strip =~ /^#/ || row.strip == '')
|
124
|
-
#Do not prepend P. to any of the reserved keywords
|
125
|
-
jump_to_next = false
|
126
|
-
NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
|
127
|
-
next if jump_to_next
|
128
|
-
#Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
|
129
|
-
row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
|
130
|
-
#Don't forget also the stuff in parentheses!
|
131
|
-
row.gsub!(/\{\s+/) {"{P."}
|
132
|
-
end
|
138
|
+
add_P(rows)
|
133
139
|
rows[0] = first_line
|
140
|
+
substitute_examples_with_XPaths(pattern,rows)
|
141
|
+
end
|
142
|
+
|
143
|
+
def self.substitute_examples_with_XPaths(pattern,rows)
|
134
144
|
#@full_definition holds the original definition (at this point, later on it will be
|
135
145
|
#gsub!bed and all)
|
136
146
|
@full_definition = rows.join("\n")
|
@@ -146,9 +156,10 @@ private
|
|
146
156
|
replace_example_with_xpath(name, xpaths, %q{'})
|
147
157
|
end
|
148
158
|
#Finally, add XPaths to pattern which had no example at the beginning (the XPath was
|
149
|
-
#generated from the child patterns
|
159
|
+
#generated from the child patterns)
|
150
160
|
@name_to_xpath_map.each do |name, xpaths|
|
151
161
|
xpaths.reverse.each do |xpath|
|
162
|
+
next if !@full_definition.include? "P.#{name}"
|
152
163
|
comma = @full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0].sub('do'){}.strip == '' ? '' : ','
|
153
164
|
if (@full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]).include?('{')
|
154
165
|
@full_definition.sub!("P.#{name}") {"P.#{name}('#{xpath}')"}
|
@@ -157,8 +168,8 @@ private
|
|
157
168
|
end
|
158
169
|
end
|
159
170
|
end
|
160
|
-
@result += @full_definition
|
161
|
-
end
|
171
|
+
@result += @full_definition
|
172
|
+
end
|
162
173
|
|
163
174
|
def self.export_footer(output_file, wrapper_name, extractor_result_file_name)
|
164
175
|
if extractor_result_file_name
|
@@ -167,20 +178,56 @@ private
|
|
167
178
|
@result += "\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
|
168
179
|
end
|
169
180
|
end
|
181
|
+
|
182
|
+
#Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
#patterns could be matched very easily from the extractor definition (because they begun
#with 'P.'). Now that P has been removed, mimic it!
#
#rows:: the lines of an extractor definition; they are modified in place
#
#Comments, empty lines, lines starting with one of the
#NavigationActions::KEYWORDS and lines containing 'lambda' are left alone.
#Returns +rows+.
def self.add_P(rows)
  rows.each do |row|
    stripped = row.strip
    #Do not prepend P. to comments and empty lines
    next if stripped =~ /^#/ || stripped == ''
    #Do not prepend P. to any of the reserved keywords
    next if NavigationActions::KEYWORDS.any? { |keyword| stripped =~ /^#{keyword}/ }
    next if row =~ /lambda/
    #Prepend P.s - the indent is computed up front to preserve the whitespace
    indent = ' ' * (row.size - row.lstrip.size)
    row.sub!(/\s+(.*)/) { "#{indent} P.#{$1}" }
    #Don't forget also the stuff in parentheses!
    row.gsub!(/\{\s+/) { "{P." }
  end
end
|
170
200
|
|
171
201
|
|
172
202
|
#Record, in @name_to_xpath_map, the XPaths of the filters of +pattern+ under
#the pattern's name, then do the same recursively for its children.
#
#For a detail pattern the patterns of the referenced detail extractor are
#mapped as well (they are found through the extractor's detail pattern
#relations). Progress information is written to standard output.
def self.create_name_to_xpath_map(pattern)
  puts " Cereating mapping for: #{pattern.name}"
  collected = (@name_to_xpath_map[pattern.name] = [])
  pattern.filters.each do |filter|
    #The guard looks at the first filter, whichever filter is being added
    collected << filter.xpath unless pattern.filters[0].xpath == nil
  end
  pattern.children.each { |child| create_name_to_xpath_map child }
  return unless pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL

  puts pattern.name
  puts "-------"
  relations = pattern.evaluation_context.extractor.get_detail_pattern_relations
  puts(relations.each { |referencing_patterns, detail_pattern|
    next unless referencing_patterns.include? pattern
    detail_pattern.parent.children.each { |child| create_name_to_xpath_map child }
  })
end
|
179
225
|
|
180
226
|
def self.replace_example_with_xpath(name, xpaths, left_delimiter, right_delimiter=left_delimiter)
|
181
227
|
return if name=='root'
|
228
|
+
return if !@full_definition.include? "P.#{name}"
|
182
229
|
parens = @full_definition.scan(/P.#{name}\s*\((.+?)\)/)
|
183
|
-
if parens.empty?
|
230
|
+
if parens.empty?
|
184
231
|
full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
|
185
232
|
else
|
186
233
|
full_line = parens[0][0]
|
@@ -50,7 +50,7 @@ module Scrubyt
|
|
50
50
|
|
51
51
|
private
|
52
52
|
def self.to_xml_recursive(pattern, element)
|
53
|
-
pattern.children.each do |child|
|
53
|
+
pattern.children.each do |child|
|
54
54
|
childresults = child.result.lookup(child.parent.last_result)
|
55
55
|
#Output text for leaf nodes only; Maybe add possibility to customize this later
|
56
56
|
if (childresults == nil)
|
@@ -72,19 +72,38 @@ private
|
|
72
72
|
if child.last_result.instance_of? String
|
73
73
|
res = child.last_result
|
74
74
|
else
|
75
|
-
|
75
|
+
if child.last_result.respond_to? 'traverse_text'
|
76
|
+
child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
|
77
|
+
else
|
78
|
+
child.last_result.children.each { |c| element.add_element c }
|
79
|
+
end
|
76
80
|
end
|
77
81
|
child_node = REXML::Element.new(child.name)
|
78
|
-
child_node.text = (res.gsub(' '){' '}).strip if (child
|
79
|
-
|
82
|
+
child_node.text = (res.gsub(' '){' '}).strip if write_text_criteria_met(child)
|
83
|
+
|
84
|
+
element.add_element(child_node) if child.type != Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
80
85
|
to_xml_recursive(child, child_node)
|
81
86
|
end
|
82
87
|
end
|
83
88
|
|
89
|
+
#Decide whether the text of +pattern+ should be written to the output.
#
#If the pattern has no write_text setting (nil), text is written for
#patterns without children only; otherwise the setting itself is returned.
def self.write_text_criteria_met(pattern)
  return pattern.children.size == 0 if pattern.write_text.nil?
  pattern.write_text
end
|
96
|
+
|
84
97
|
def self.print_statistics_recursive(pattern, depth)
|
85
|
-
if pattern.name != 'root'
|
86
|
-
|
87
|
-
|
98
|
+
if pattern.name != 'root'
|
99
|
+
if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
|
100
|
+
pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
|
101
|
+
print_statistics_recursive(child, depth)
|
102
|
+
end
|
103
|
+
else
|
104
|
+
count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
|
105
|
+
puts((' ' * "#{depth}".to_i) + "#{pattern.name} extracted #{count} instances.")
|
106
|
+
end
|
88
107
|
end
|
89
108
|
|
90
109
|
pattern.children.each do |child|
|