scrubyt 0.2.3 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,3 @@
1
- require 'logger'
2
1
  require 'open-uri'
3
2
  require 'rubygems'
4
3
  require 'mechanize'
@@ -17,6 +16,11 @@ module Scrubyt
17
16
  #The definition of the extractor is passed through this method
18
17
  def self.define(mode=nil, &extractor_definition)
19
18
  @@mode = mode
19
+ #We are keeping the relations between the detail patterns and their root patterns
20
+ @@detail_extractor_to_pattern_name = {}
21
+ @@detail_pattern_relations = {}
22
+ #root pattern -> URIBuilder mapping
23
+ @@next_patterns = {}
20
24
  mode_name = (mode == :production ? 'Production' : 'Learning')
21
25
  puts "[MODE] #{mode_name}"
22
26
  NavigationActions.new
@@ -41,6 +45,47 @@ module Scrubyt
41
45
  root_pattern
42
46
  end
43
47
 
48
+ #Evaluate a subextractor (i.e. an extractor on a detail page).
49
+ #The url passed to this function is automatically loaded.
50
+ #The definition of the subextractor is passed as a block
51
+ #
52
+ #!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
53
+ def self.evaluate_subextractor(url, parent_pattern)
54
+ if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
55
+ detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]].parent
56
+ detail_root.result = Result.new
57
+ detail_root.last_result = nil
58
+ @@original_evaluation_context.push @@evaluation_context
59
+ @@evaluation_context = EvaluationContext.new
60
+ @@evaluation_context.clear_sources_and_sinks detail_root
61
+ FetchAction.restore_host_name
62
+ fetch url
63
+ @@evaluation_context.extractor = self
64
+ @@evaluation_context.root_pattern = detail_root
65
+ @@evaluation_context.attach_current_document
66
+ evaluate_extractor detail_root
67
+ @@evaluation_context = @@original_evaluation_context.pop
68
+ detail_root.to_xml
69
+ else
70
+ @@original_evaluation_context ||= []
71
+ FetchAction.restore_host_name
72
+ @@original_evaluation_context.push @@evaluation_context
73
+ @@evaluation_context = EvaluationContext.new
74
+ fetch url
75
+ evaluated_extractor = (class_eval(&parent_pattern.referenced_extractor))
76
+ root_pattern = evaluated_extractor.parent
77
+ @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern.children[0]
78
+ @@evaluation_context.setup_examples
79
+ evaluate_extractor(root_pattern)
80
+ #Apply all postprocess steps
81
+ PostProcessor.apply_post_processing(root_pattern)
82
+ #Return the root pattern
83
+ #puts "Extracted detail page"
84
+ @@evaluation_context = @@original_evaluation_context.pop
85
+ root_pattern.to_xml
86
+ end
87
+ end
88
+
44
89
  #build the current wrapper
45
90
  def self.method_missing(method_name, *args, &block)
46
91
  if NavigationActions::KEYWORDS.include? method_name.to_s
@@ -48,22 +93,25 @@ module Scrubyt
48
93
  return
49
94
  end
50
95
  pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
96
+ check_if_shortcut_pattern(pattern)
97
+ check_if_detail_page(pattern, args)
51
98
  pattern.evaluation_context = @@evaluation_context
52
99
  if @parent == nil
53
100
  if method_name.to_s == 'next_page'
54
- @@evaluation_context.next_page = args[0]
55
- @@evaluation_context.limit =
56
- args[1][:limit] if args.size > 1
101
+ @@evaluation_context.setup_uri_builder(pattern, args)
102
+ @@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
103
+ p @@last_root_pattern.children[0].name
57
104
  return @@last_pattern
58
105
  else
59
106
  #Create a root pattern
60
107
  root_pattern = Scrubyt::Pattern.new('root', :type => :root)
108
+ @@last_root_pattern = root_pattern
61
109
  root_pattern.evaluation_context = @@evaluation_context
62
110
  @@evaluation_context.root_pattern = root_pattern
63
111
  @@evaluation_context.extractor = self
64
112
  #add the currently active document to the root pattern
65
113
  @@evaluation_context.attach_current_document
66
- @@evaluation_context.root_pattern.add_child_pattern(pattern)
114
+ @@evaluation_context.root_pattern.add_child_pattern(pattern)
67
115
  @@evaluation_context.block_count = 0
68
116
  end
69
117
  else
@@ -80,28 +128,76 @@ module Scrubyt
80
128
  end
81
129
  @@last_pattern = pattern
82
130
  end
83
-
84
- #Used in lord of the hacks vol 1. Check out export.rb if you are still interested
85
- #(You should not be :)
86
- def self.get_block_count
87
- @@root_pattern.block_count
131
+
132
+ #Shortcut patterns, as their name says, are a shortcut for creating patterns
133
+ #from predefined rules; for example:
134
+ #
135
+ # detail_url
136
+ #
137
+ # is equivalent to
138
+ #
139
+ # detail_url 'href', type => :attribute
140
+ #
141
+ #i.e. the system figures out on its own that because of the postfix, the
142
+ #example should be looked up (but it should never override the user input!)
143
+ #another example (will be available later):
144
+ #
145
+ # every_img
146
+ #
147
+ # is equivalent to
148
+ #
149
+ # every_img '//img'
150
+ #
151
+ def self.check_if_shortcut_pattern(pattern)
152
+ case pattern.name
153
+ when /.+_url/
154
+ #make sure that we are not overriding the user's settings
155
+ if !pattern.examples
156
+ pattern.filters[0].example = 'href'
157
+ pattern.type = Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
158
+ end
159
+ end
88
160
  end
89
161
 
162
+ #Check whether the currently created pattern is a detail pattern (i.e. it references
163
+ #a subextractor). Also check if the currently created pattern is
164
+ #an ancestor of a detail pattern , and store this in a hash if yes (to be able to
165
+ #traverse the pattern structure on detail pages as well).
166
+ def self.check_if_detail_page(pattern, args)
167
+ return if args.size == 0
168
+ return if !args[0].is_a? Hash
169
+ return if !args[0][:references]
170
+ referenced_extractor = args[0][:references]
171
+ pattern.type = Scrubyt::Pattern::PATTERN_TYPE_DETAIL
172
+ pattern.referenced_extractor = referenced_extractor
173
+ @@detail_extractor_to_pattern_name[referenced_extractor] ||= []
174
+ @@detail_extractor_to_pattern_name[referenced_extractor] = @@detail_extractor_to_pattern_name[referenced_extractor] << pattern
175
+ end
176
+
90
177
  def self.get_hpricot_doc
91
178
  NavigationActions.get_hpricot_doc
92
179
  end
93
180
 
181
+ def self.get_current_doc_url
182
+ NavigationActions.get_current_doc_url
183
+ end
184
+
185
+ def self.get_detail_pattern_relations
186
+ @@detail_pattern_relations
187
+ end
188
+
94
189
  def self.get_mode
95
190
  @@mode
96
- end
191
+ end
192
+
97
193
  private
98
- def self.evaluate_extractor(root_pattern)
99
- if @@evaluation_context.next_page
194
+ def self.evaluate_extractor(root_pattern)
195
+ if @@next_patterns[root_pattern]
100
196
  current_page_count = 1
101
197
  loop do
102
198
  really_evaluate_extractor(root_pattern)
103
- break if (@@evaluation_context.limit == current_page_count || @@evaluation_context.crawl_to_new_page == nil)
104
- current_page_count += 1 if @@evaluation_context.limit != nil
199
+ break if (@@next_patterns[root_pattern].limit == current_page_count || @@evaluation_context.crawl_to_new_page(root_pattern, @@next_patterns[root_pattern]) == nil)
200
+ current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
105
201
  end
106
202
  else
107
203
  really_evaluate_extractor(root_pattern)
@@ -0,0 +1,67 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Build URIs from different parameters</tt>
4
+ #
5
+ #When crawling to further pages which are machine-generated
6
+ #(most typically "next" pages) we need to detect the pattern
7
+ #and generate the next URI based on the detected rule. This
8
+ #class provides methods to build URIs based on different criteria.
9
+ #
10
+ #The other possibility is to use constant objects ('Next' links,
11
+ #or image links (like right arrow) pointing to the next page).
12
+ #URIBuilder supports both possibilities.
13
+ class URIBuilder
14
+ attr_reader :next_page_example, :next_page_pattern, :limit, :next_param, :next_increment, :increment, :current_uri
15
+
16
+ def initialize(pattern,args)
17
+ if args[0] =~ /^http.+/
18
+ #Figure out how are the URLs generated based on the next URL
19
+ get_next_param(string_diff(args[0], args[1]))
20
+ @increment = 0
21
+ @current_uri = args[1]
22
+ @limit = args[2][:limit] if args.size > 2
23
+ else
24
+ #Otherwise, do this in the 'classic' way (by clicking on the "next" link)
25
+ @next_page_pattern = pattern
26
+ @next_page_example = args[0]
27
+ @limit = args[1][:limit] if args.size > 1
28
+ end
29
+ end
30
+
31
+ #Used when generating the next URI (as opposed to 'clicking' the next link)
32
+ def generate_next_uri
33
+ @increment += @next_increment
34
+ return @current_uri if @increment == @next_increment
35
+ @next_increment = 1 if @next_increment == 2
36
+ if @current_uri !~ /#{@next_param}/
37
+ @current_uri += (@next_param + '=' + @next_increment.to_s)
38
+ else
39
+ @current_uri = @current_uri.sub(/#{@next_param}=#{@increment-@next_increment}/) do
40
+ "#{@next_param}=#{@increment}"
41
+ end
42
+ end
43
+ end
44
+
45
+ private
46
+ def get_next_param(pair)
47
+ param_and_value = pair.split('=')
48
+ @next_param = param_and_value[0]
49
+ @next_increment = param_and_value[1].to_i
50
+ end
51
+
52
+ def find_difference_index(s1,s2)
53
+ cmp = s2.scan(/./).zip(s1.scan(/./))
54
+ i = 0
55
+ loop do
56
+ return i if cmp[i][0] != cmp[i][1]
57
+ i+=1
58
+ end
59
+ end
60
+
61
+ def string_diff(s1,s2)
62
+ s2[find_difference_index(s1, s2)..s2.size-find_difference_index(s1.reverse, s2.reverse)-1]
63
+ end #end of method string_diff
64
+ end #end of class URIBuilder
65
+ end #end of module Scrubyt
66
+
67
+
@@ -70,6 +70,7 @@ module Scrubyt
70
70
  output_file = output_file_name == nil ? open("#{wrapper_name}_extractor_export.rb", 'w') :
71
71
  open(output_file_name, 'w')
72
72
  export_header(output_file)
73
+ export_subextractors(contents, pattern, output_file)
73
74
  export_extractor(contents, pattern, output_file)
74
75
  export_footer(output_file, wrapper_name, extractor_result_file_name)
75
76
  cleanup_result
@@ -85,8 +86,27 @@ private
85
86
 
86
87
  def self.cleanup_result
87
88
  @result.gsub!('P.') {}
89
+ CompoundExample::DESCRIPTORS.each {|d|
90
+ @result.gsub!(/,\s*:#{d.to_s}.+?'.+?'/) {}
91
+ }
88
92
  end
89
93
 
94
+
95
+ def self.export_subextractors(contents, pattern, output_file)
96
+ all_subextractor_code = contents.scan(/.+=\s+lambda.+Extractor\.define/m)
97
+ return if all_subextractor_code.empty?
98
+ all_subextractor_code = all_subextractor_code[0].split("\n")
99
+ pure_subextractor_code = []
100
+ meaningful_code = false
101
+ all_subextractor_code.each do |sec|
102
+ meaningful_code = true if sec =~ /lambda/
103
+ meaningful_code = false if sec =~ /Extractor.define/
104
+ pure_subextractor_code << sec if meaningful_code
105
+ end
106
+ add_P pure_subextractor_code
107
+ substitute_examples_with_XPaths(pattern,pure_subextractor_code)
108
+ end
109
+
90
110
  #OK, I have to admit: this function is powered by woodo magic. A lots of woodoo magic.
91
111
  #Piles of tons of heaps of woodoo magic :-)
92
112
  #
@@ -111,26 +131,16 @@ private
111
131
  #end (to close the block of the extractor definition)
112
132
  count = pattern.evaluation_context.block_count + 1
113
133
  #Construct the extractor definition matching regexp based on the number of ends
114
- definition = contents.scan(/Extractor\.define(?:.*?(?:\}|end)){#{count.to_s}}/m)
134
+ definition = contents.scan(/Extractor\.define(?:.*?(?:\}|\s+end)){#{count.to_s}}/m)
115
135
  #Since the regexp matching the extractor definition was multiline, get the first
116
136
  #line separately and patch it in!
117
137
  rows = definition[0].split("\n")
118
- #Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
119
- #patterns could be matched very easily from the extractor definition (because they begun
120
- #with 'P.'). Now that P has been removed, mimick it!
121
- rows.each do |row|
122
- #Do not prepend P. to comments and empty lines
123
- next if (row.strip =~ /^#/ || row.strip == '')
124
- #Do not prepend P. to any of the reserved keywords
125
- jump_to_next = false
126
- NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
127
- next if jump_to_next
128
- #Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
129
- row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
130
- #Don't forget also the stuff in parentheses!
131
- row.gsub!(/\{\s+/) {"{P."}
132
- end
138
+ add_P(rows)
133
139
  rows[0] = first_line
140
+ substitute_examples_with_XPaths(pattern,rows)
141
+ end
142
+
143
+ def self.substitute_examples_with_XPaths(pattern,rows)
134
144
  #@full_definition holds the original definition (at this point, later on it will be
135
145
  #gsub!bed and all)
136
146
  @full_definition = rows.join("\n")
@@ -146,9 +156,10 @@ private
146
156
  replace_example_with_xpath(name, xpaths, %q{'})
147
157
  end
148
158
  #Finally, add XPaths to pattern which had no example at the beginning (the XPath was
149
- #generated from the child patterns
159
+ #generated from the child patterns)
150
160
  @name_to_xpath_map.each do |name, xpaths|
151
161
  xpaths.reverse.each do |xpath|
162
+ next if !@full_definition.include? "P.#{name}"
152
163
  comma = @full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0].sub('do'){}.strip == '' ? '' : ','
153
164
  if (@full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]).include?('{')
154
165
  @full_definition.sub!("P.#{name}") {"P.#{name}('#{xpath}')"}
@@ -157,8 +168,8 @@ private
157
168
  end
158
169
  end
159
170
  end
160
- @result += @full_definition
161
- end
171
+ @result += @full_definition
172
+ end
162
173
 
163
174
  def self.export_footer(output_file, wrapper_name, extractor_result_file_name)
164
175
  if extractor_result_file_name
@@ -167,20 +178,56 @@ private
167
178
  @result += "\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
168
179
  end
169
180
  end
181
+
182
+ def self.add_P(rows)
183
+ #Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
184
+ #patterns could be matched very easily from the extractor definition (because they begun
185
+ #with 'P.'). Now that P has been removed, mimick it!
186
+ rows.each do |row|
187
+ #Do not prepend P. to comments and empty lines
188
+ next if (row.strip =~ /^#/ || row.strip == '')
189
+ #Do not prepend P. to any of the reserved keywords
190
+ jump_to_next = false
191
+ NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
192
+ jump_to_next = true if row =~ /lambda/
193
+ next if jump_to_next
194
+ #Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
195
+ row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
196
+ #Don't forget also the stuff in parentheses!
197
+ row.gsub!(/\{\s+/) {"{P."}
198
+ end
199
+ end
170
200
 
171
201
 
172
202
  def self.create_name_to_xpath_map(pattern)
203
+ puts " Cereating mapping for: #{pattern.name}"
173
204
  @name_to_xpath_map[pattern.name] = []
174
205
  pattern.filters.each do |filter|
175
206
  @name_to_xpath_map[pattern.name] << filter.xpath if pattern.filters[0].xpath != nil
207
+ end
208
+ pattern.children.each {|child| create_name_to_xpath_map child}
209
+ if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
210
+ puts pattern.name
211
+ puts "-------"
212
+ puts pattern.evaluation_context.extractor.get_detail_pattern_relations.each {|k,v|
213
+ if k.include? pattern
214
+ v.parent.children.each do |child|
215
+ create_name_to_xpath_map child
216
+ end
217
+ end
218
+ }
219
+
220
+ #pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
221
+ # create_name_to_xpath_map child
222
+ #end
176
223
  end
177
- pattern.children.each {|child| create_name_to_xpath_map child}
178
- end
224
+ end
179
225
 
180
226
  def self.replace_example_with_xpath(name, xpaths, left_delimiter, right_delimiter=left_delimiter)
181
227
  return if name=='root'
228
+ return if !@full_definition.include? "P.#{name}"
182
229
  parens = @full_definition.scan(/P.#{name}\s*\((.+?)\)/)
183
- if parens.empty?
230
+ if parens.empty?
184
231
  full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
185
232
  else
186
233
  full_line = parens[0][0]
@@ -11,6 +11,7 @@ module Scrubyt
11
11
  def add_result(source, result)
12
12
  @childmap.each do |hash|
13
13
  if hash.keys[0] == source
14
+ return if hash[source] == nil
14
15
  hash[source] << result if !hash[source].include? result
15
16
  return
16
17
  end
@@ -50,7 +50,7 @@ module Scrubyt
50
50
 
51
51
  private
52
52
  def self.to_xml_recursive(pattern, element)
53
- pattern.children.each do |child|
53
+ pattern.children.each do |child|
54
54
  childresults = child.result.lookup(child.parent.last_result)
55
55
  #Output text for leaf nodes only; Maybe add possibility to customize this later
56
56
  if (childresults == nil)
@@ -72,19 +72,38 @@ private
72
72
  if child.last_result.instance_of? String
73
73
  res = child.last_result
74
74
  else
75
- child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
75
+ if child.last_result.respond_to? 'traverse_text'
76
+ child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
77
+ else
78
+ child.last_result.children.each { |c| element.add_element c }
79
+ end
76
80
  end
77
81
  child_node = REXML::Element.new(child.name)
78
- child_node.text = (res.gsub('&nbsp;'){' '}).strip if (child.children.size == 0)
79
- element.add_element(child_node)
82
+ child_node.text = (res.gsub('&nbsp;'){' '}).strip if write_text_criteria_met(child)
83
+
84
+ element.add_element(child_node) if child.type != Scrubyt::Pattern::PATTERN_TYPE_DETAIL
80
85
  to_xml_recursive(child, child_node)
81
86
  end
82
87
  end
83
88
 
89
+ def self.write_text_criteria_met(pattern)
90
+ if (pattern.write_text == nil)
91
+ return pattern.children.size == 0
92
+ else
93
+ pattern.write_text
94
+ end
95
+ end
96
+
84
97
  def self.print_statistics_recursive(pattern, depth)
85
- if pattern.name != 'root'
86
- count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
87
- puts((' ' * "#{depth}".to_i) + "#{pattern.name} extracted #{count} instances.")
98
+ if pattern.name != 'root'
99
+ if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
100
+ pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
101
+ print_statistics_recursive(child, depth)
102
+ end
103
+ else
104
+ count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
105
+ puts((' ' * "#{depth}".to_i) + "#{pattern.name} extracted #{count} instances.")
106
+ end
88
107
  end
89
108
 
90
109
  pattern.children.each do |child|