scrubyt 0.2.3 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,3 @@
1
- require 'logger'
2
1
  require 'open-uri'
3
2
  require 'rubygems'
4
3
  require 'mechanize'
@@ -17,6 +16,11 @@ module Scrubyt
17
16
  #The definition of the extractor is passed through this method
18
17
  def self.define(mode=nil, &extractor_definition)
19
18
  @@mode = mode
19
+ #We are keeping the relations between the detail patterns and their root patterns
20
+ @@detail_extractor_to_pattern_name = {}
21
+ @@detail_pattern_relations = {}
22
+ #root pattern -> URIBuilder mapping
23
+ @@next_patterns = {}
20
24
  mode_name = (mode == :production ? 'Production' : 'Learning')
21
25
  puts "[MODE] #{mode_name}"
22
26
  NavigationActions.new
@@ -41,6 +45,47 @@ module Scrubyt
41
45
  root_pattern
42
46
  end
43
47
 
48
+ #Evaluate a subexttractor (i.e. an extractor on a detail page).
49
+ #The url passed to this function is automatically loaded.
50
+ #The definition of the subextractor is passed as a block
51
+ #
52
+ #!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
53
+ def self.evaluate_subextractor(url, parent_pattern)
54
+ if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
55
+ detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]].parent
56
+ detail_root.result = Result.new
57
+ detail_root.last_result = nil
58
+ @@original_evaluation_context.push @@evaluation_context
59
+ @@evaluation_context = EvaluationContext.new
60
+ @@evaluation_context.clear_sources_and_sinks detail_root
61
+ FetchAction.restore_host_name
62
+ fetch url
63
+ @@evaluation_context.extractor = self
64
+ @@evaluation_context.root_pattern = detail_root
65
+ @@evaluation_context.attach_current_document
66
+ evaluate_extractor detail_root
67
+ @@evaluation_context = @@original_evaluation_context.pop
68
+ detail_root.to_xml
69
+ else
70
+ @@original_evaluation_context ||= []
71
+ FetchAction.restore_host_name
72
+ @@original_evaluation_context.push @@evaluation_context
73
+ @@evaluation_context = EvaluationContext.new
74
+ fetch url
75
+ evaluated_extractor = (class_eval(&parent_pattern.referenced_extractor))
76
+ root_pattern = evaluated_extractor.parent
77
+ @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern.children[0]
78
+ @@evaluation_context.setup_examples
79
+ evaluate_extractor(root_pattern)
80
+ #Apply all postprocess steps
81
+ PostProcessor.apply_post_processing(root_pattern)
82
+ #Return the root pattern
83
+ #puts "Extracted detail page"
84
+ @@evaluation_context = @@original_evaluation_context.pop
85
+ root_pattern.to_xml
86
+ end
87
+ end
88
+
44
89
  #build the current wrapper
45
90
  def self.method_missing(method_name, *args, &block)
46
91
  if NavigationActions::KEYWORDS.include? method_name.to_s
@@ -48,22 +93,25 @@ module Scrubyt
48
93
  return
49
94
  end
50
95
  pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
96
+ check_if_shortcut_pattern(pattern)
97
+ check_if_detail_page(pattern, args)
51
98
  pattern.evaluation_context = @@evaluation_context
52
99
  if @parent == nil
53
100
  if method_name.to_s == 'next_page'
54
- @@evaluation_context.next_page = args[0]
55
- @@evaluation_context.limit =
56
- args[1][:limit] if args.size > 1
101
+ @@evaluation_context.setup_uri_builder(pattern, args)
102
+ @@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
103
+ p @@last_root_pattern.children[0].name
57
104
  return @@last_pattern
58
105
  else
59
106
  #Create a root pattern
60
107
  root_pattern = Scrubyt::Pattern.new('root', :type => :root)
108
+ @@last_root_pattern = root_pattern
61
109
  root_pattern.evaluation_context = @@evaluation_context
62
110
  @@evaluation_context.root_pattern = root_pattern
63
111
  @@evaluation_context.extractor = self
64
112
  #add the currently active document to the root pattern
65
113
  @@evaluation_context.attach_current_document
66
- @@evaluation_context.root_pattern.add_child_pattern(pattern)
114
+ @@evaluation_context.root_pattern.add_child_pattern(pattern)
67
115
  @@evaluation_context.block_count = 0
68
116
  end
69
117
  else
@@ -80,28 +128,76 @@ module Scrubyt
80
128
  end
81
129
  @@last_pattern = pattern
82
130
  end
83
-
84
- #Used in lord of the hacks vol 1. Check out export.rb if you are still interested
85
- #(You should not be :)
86
- def self.get_block_count
87
- @@root_pattern.block_count
131
+
132
+ #Shortcut patterns, as their name says, are a shortcut for creating patterns
133
+ #from predefined rules; for example:
134
+ #
135
+ # detail_url
136
+ #
137
+ # is equivalent to
138
+ #
139
+ # detail_url 'href', type => :attribute
140
+ #
141
+ #i.e. the system figures out on it's own that because of the postfix, the
142
+ #example should be looked up (but it should never override the user input!)
143
+ #another example (will be available later):
144
+ #
145
+ # every_img
146
+ #
147
+ # is equivivalent to
148
+ #
149
+ # every_img '//img'
150
+ #
151
+ def self.check_if_shortcut_pattern(pattern)
152
+ case pattern.name
153
+ when /.+_url/
154
+ #make sure that we are not overriding the user's settings
155
+ if !pattern.examples
156
+ pattern.filters[0].example = 'href'
157
+ pattern.type = Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
158
+ end
159
+ end
88
160
  end
89
161
 
162
+ #Check whether the currently created pattern is a detail pattern (i.e. it refrences
163
+ #a subextractor). Also check if the currently created pattern is
164
+ #an ancestor of a detail pattern , and store this in a hash if yes (to be able to
165
+ #traverse the pattern structure on detail pages as well).
166
+ def self.check_if_detail_page(pattern, args)
167
+ return if args.size == 0
168
+ return if !args[0].is_a? Hash
169
+ return if !args[0][:references]
170
+ referenced_extractor = args[0][:references]
171
+ pattern.type = Scrubyt::Pattern::PATTERN_TYPE_DETAIL
172
+ pattern.referenced_extractor = referenced_extractor
173
+ @@detail_extractor_to_pattern_name[referenced_extractor] ||= []
174
+ @@detail_extractor_to_pattern_name[referenced_extractor] = @@detail_extractor_to_pattern_name[referenced_extractor] << pattern
175
+ end
176
+
90
177
  def self.get_hpricot_doc
91
178
  NavigationActions.get_hpricot_doc
92
179
  end
93
180
 
181
+ def self.get_current_doc_url
182
+ NavigationActions.get_current_doc_url
183
+ end
184
+
185
+ def self.get_detail_pattern_relations
186
+ @@detail_pattern_relations
187
+ end
188
+
94
189
  def self.get_mode
95
190
  @@mode
96
- end
191
+ end
192
+
97
193
  private
98
- def self.evaluate_extractor(root_pattern)
99
- if @@evaluation_context.next_page
194
+ def self.evaluate_extractor(root_pattern)
195
+ if @@next_patterns[root_pattern]
100
196
  current_page_count = 1
101
197
  loop do
102
198
  really_evaluate_extractor(root_pattern)
103
- break if (@@evaluation_context.limit == current_page_count || @@evaluation_context.crawl_to_new_page == nil)
104
- current_page_count += 1 if @@evaluation_context.limit != nil
199
+ break if (@@next_patterns[root_pattern].limit == current_page_count || @@evaluation_context.crawl_to_new_page(root_pattern, @@next_patterns[root_pattern]) == nil)
200
+ current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
105
201
  end
106
202
  else
107
203
  really_evaluate_extractor(root_pattern)
@@ -0,0 +1,67 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Build URIs from different parameters</tt>
4
+ #
5
+ #When crawling to further pages which are machine-generated
6
+ #(most typically "next" pages) we need to detect the pattern
7
+ #and generate the next URI based on the edetected rule. This
8
+ #class provides methods to build URIs based on different criteria.
9
+ #
10
+ #The other possibility is to use constant objects ('Next' links,
11
+ #or image links (like right arrow) pointing to the next page).
12
+ #URIBUilder supports both possibilities.
13
+ class URIBuilder
14
+ attr_reader :next_page_example, :next_page_pattern, :limit, :next_param, :next_increment, :increment, :current_uri
15
+
16
+ def initialize(pattern,args)
17
+ if args[0] =~ /^http.+/
18
+ #Figure out how are the URLs generated based on the next URL
19
+ get_next_param(string_diff(args[0], args[1]))
20
+ @increment = 0
21
+ @current_uri = args[1]
22
+ @limit = args[2][:limit] if args.size > 2
23
+ else
24
+ #Otherwise, do this in the 'classic' way (by clicking on the "next" link)
25
+ @next_page_pattern = pattern
26
+ @next_page_example = args[0]
27
+ @limit = args[1][:limit] if args.size > 1
28
+ end
29
+ end
30
+
31
+ #Used when generating the next URI (as opposed to 'clicking' the next link)
32
+ def generate_next_uri
33
+ @increment += @next_increment
34
+ return @current_uri if @increment == @next_increment
35
+ @next_increment = 1 if @next_increment == 2
36
+ if @current_uri !~ /#{@next_param}/
37
+ @current_uri += (@next_param + '=' + @next_increment.to_s)
38
+ else
39
+ @current_uri = @current_uri.sub(/#{@next_param}=#{@increment-@next_increment}/) do
40
+ "#{@next_param}=#{@increment}"
41
+ end
42
+ end
43
+ end
44
+
45
+ private
46
+ def get_next_param(pair)
47
+ param_and_value = pair.split('=')
48
+ @next_param = param_and_value[0]
49
+ @next_increment = param_and_value[1].to_i
50
+ end
51
+
52
+ def find_difference_index(s1,s2)
53
+ cmp = s2.scan(/./).zip(s1.scan(/./))
54
+ i = 0
55
+ loop do
56
+ return i if cmp[i][0] != cmp[i][1]
57
+ i+=1
58
+ end
59
+ end
60
+
61
+ def string_diff(s1,s2)
62
+ s2[find_difference_index(s1, s2)..s2.size-find_difference_index(s1.reverse, s2.reverse)-1]
63
+ end #end of method string_diff
64
+ end #end of class URIBuilder
65
+ end #end of module Scrubyt
66
+
67
+
@@ -70,6 +70,7 @@ module Scrubyt
70
70
  output_file = output_file_name == nil ? open("#{wrapper_name}_extractor_export.rb", 'w') :
71
71
  open(output_file_name, 'w')
72
72
  export_header(output_file)
73
+ export_subextractors(contents, pattern, output_file)
73
74
  export_extractor(contents, pattern, output_file)
74
75
  export_footer(output_file, wrapper_name, extractor_result_file_name)
75
76
  cleanup_result
@@ -85,8 +86,27 @@ private
85
86
 
86
87
  def self.cleanup_result
87
88
  @result.gsub!('P.') {}
89
+ CompoundExample::DESCRIPTORS.each {|d|
90
+ @result.gsub!(/,\s*:#{d.to_s}.+?'.+?'/) {}
91
+ }
88
92
  end
89
93
 
94
+
95
+ def self.export_subextractors(contents, pattern, output_file)
96
+ all_subextractor_code = contents.scan(/.+=\s+lambda.+Extractor\.define/m)
97
+ return if all_subextractor_code.empty?
98
+ all_subextractor_code = all_subextractor_code[0].split("\n")
99
+ pure_subextractor_code = []
100
+ meaningful_code = false
101
+ all_subextractor_code.each do |sec|
102
+ meaningful_code = true if sec =~ /lambda/
103
+ meaningful_code = false if sec =~ /Extractor.define/
104
+ pure_subextractor_code << sec if meaningful_code
105
+ end
106
+ add_P pure_subextractor_code
107
+ substitute_examples_with_XPaths(pattern,pure_subextractor_code)
108
+ end
109
+
90
110
  #OK, I have to admit: this function is powered by voodoo magic. Lots of voodoo magic.
91
111
  #Piles of tons of heaps of voodoo magic :-)
92
112
  #
@@ -111,26 +131,16 @@ private
111
131
  #end (to close the block of the extractor definition)
112
132
  count = pattern.evaluation_context.block_count + 1
113
133
  #Construct the extractor definition matching regexp based on the number of ends
114
- definition = contents.scan(/Extractor\.define(?:.*?(?:\}|end)){#{count.to_s}}/m)
134
+ definition = contents.scan(/Extractor\.define(?:.*?(?:\}|\s+end)){#{count.to_s}}/m)
115
135
  #Since the regexp matching the extractor definition was multiline, get the first
116
136
  #line separately and patch it in!
117
137
  rows = definition[0].split("\n")
118
- #Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
119
- #patterns could be matched very easily from the extractor definition (because they begun
120
- #with 'P.'). Now that P has been removed, mimick it!
121
- rows.each do |row|
122
- #Do not prepend P. to comments and empty lines
123
- next if (row.strip =~ /^#/ || row.strip == '')
124
- #Do not prepend P. to any of the reserved keywords
125
- jump_to_next = false
126
- NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
127
- next if jump_to_next
128
- #Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
129
- row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
130
- #Don't forget also the stuff in parentheses!
131
- row.gsub!(/\{\s+/) {"{P."}
132
- end
138
+ add_P(rows)
133
139
  rows[0] = first_line
140
+ substitute_examples_with_XPaths(pattern,rows)
141
+ end
142
+
143
+ def self.substitute_examples_with_XPaths(pattern,rows)
134
144
  #@full_definition holds the original definition (at this point, later on it will be
135
145
  #gsub!bed and all)
136
146
  @full_definition = rows.join("\n")
@@ -146,9 +156,10 @@ private
146
156
  replace_example_with_xpath(name, xpaths, %q{'})
147
157
  end
148
158
  #Finally, add XPaths to pattern which had no example at the beginning (the XPath was
149
- #generated from the child patterns
159
+ #generated from the child patterns)
150
160
  @name_to_xpath_map.each do |name, xpaths|
151
161
  xpaths.reverse.each do |xpath|
162
+ next if !@full_definition.include? "P.#{name}"
152
163
  comma = @full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0].sub('do'){}.strip == '' ? '' : ','
153
164
  if (@full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]).include?('{')
154
165
  @full_definition.sub!("P.#{name}") {"P.#{name}('#{xpath}')"}
@@ -157,8 +168,8 @@ private
157
168
  end
158
169
  end
159
170
  end
160
- @result += @full_definition
161
- end
171
+ @result += @full_definition
172
+ end
162
173
 
163
174
  def self.export_footer(output_file, wrapper_name, extractor_result_file_name)
164
175
  if extractor_result_file_name
@@ -167,20 +178,56 @@ private
167
178
  @result += "\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
168
179
  end
169
180
  end
181
+
182
+ def self.add_P(rows)
183
+ #Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
184
+ #patterns could be matched very easily from the extractor definition (because they begun
185
+ #with 'P.'). Now that P has been removed, mimick it!
186
+ rows.each do |row|
187
+ #Do not prepend P. to comments and empty lines
188
+ next if (row.strip =~ /^#/ || row.strip == '')
189
+ #Do not prepend P. to any of the reserved keywords
190
+ jump_to_next = false
191
+ NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
192
+ jump_to_next = true if row =~ /lambda/
193
+ next if jump_to_next
194
+ #Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
195
+ row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
196
+ #Don't forget also the stuff in parentheses!
197
+ row.gsub!(/\{\s+/) {"{P."}
198
+ end
199
+ end
170
200
 
171
201
 
172
202
  def self.create_name_to_xpath_map(pattern)
203
+ puts " Cereating mapping for: #{pattern.name}"
173
204
  @name_to_xpath_map[pattern.name] = []
174
205
  pattern.filters.each do |filter|
175
206
  @name_to_xpath_map[pattern.name] << filter.xpath if pattern.filters[0].xpath != nil
207
+ end
208
+ pattern.children.each {|child| create_name_to_xpath_map child}
209
+ if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
210
+ puts pattern.name
211
+ puts "-------"
212
+ puts pattern.evaluation_context.extractor.get_detail_pattern_relations.each {|k,v|
213
+ if k.include? pattern
214
+ v.parent.children.each do |child|
215
+ create_name_to_xpath_map child
216
+ end
217
+ end
218
+ }
219
+
220
+ #pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
221
+ # create_name_to_xpath_map child
222
+ #end
176
223
  end
177
- pattern.children.each {|child| create_name_to_xpath_map child}
178
- end
224
+ end
179
225
 
180
226
  def self.replace_example_with_xpath(name, xpaths, left_delimiter, right_delimiter=left_delimiter)
181
227
  return if name=='root'
228
+ return if !@full_definition.include? "P.#{name}"
182
229
  parens = @full_definition.scan(/P.#{name}\s*\((.+?)\)/)
183
- if parens.empty?
230
+ if parens.empty?
184
231
  full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
185
232
  else
186
233
  full_line = parens[0][0]
@@ -11,6 +11,7 @@ module Scrubyt
11
11
  def add_result(source, result)
12
12
  @childmap.each do |hash|
13
13
  if hash.keys[0] == source
14
+ return if hash[source] == nil
14
15
  hash[source] << result if !hash[source].include? result
15
16
  return
16
17
  end
@@ -50,7 +50,7 @@ module Scrubyt
50
50
 
51
51
  private
52
52
  def self.to_xml_recursive(pattern, element)
53
- pattern.children.each do |child|
53
+ pattern.children.each do |child|
54
54
  childresults = child.result.lookup(child.parent.last_result)
55
55
  #Output text for leaf nodes only; Maybe add possibility to customize this later
56
56
  if (childresults == nil)
@@ -72,19 +72,38 @@ private
72
72
  if child.last_result.instance_of? String
73
73
  res = child.last_result
74
74
  else
75
- child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
75
+ if child.last_result.respond_to? 'traverse_text'
76
+ child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
77
+ else
78
+ child.last_result.children.each { |c| element.add_element c }
79
+ end
76
80
  end
77
81
  child_node = REXML::Element.new(child.name)
78
- child_node.text = (res.gsub('&nbsp;'){' '}).strip if (child.children.size == 0)
79
- element.add_element(child_node)
82
+ child_node.text = (res.gsub('&nbsp;'){' '}).strip if write_text_criteria_met(child)
83
+
84
+ element.add_element(child_node) if child.type != Scrubyt::Pattern::PATTERN_TYPE_DETAIL
80
85
  to_xml_recursive(child, child_node)
81
86
  end
82
87
  end
83
88
 
89
+ def self.write_text_criteria_met(pattern)
90
+ if (pattern.write_text == nil)
91
+ return pattern.children.size == 0
92
+ else
93
+ pattern.write_text
94
+ end
95
+ end
96
+
84
97
  def self.print_statistics_recursive(pattern, depth)
85
- if pattern.name != 'root'
86
- count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
87
- puts((' ' * "#{depth}".to_i) + "#{pattern.name} extracted #{count} instances.")
98
+ if pattern.name != 'root'
99
+ if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
100
+ pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
101
+ print_statistics_recursive(child, depth)
102
+ end
103
+ else
104
+ count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
105
+ puts((' ' * "#{depth}".to_i) + "#{pattern.name} extracted #{count} instances.")
106
+ end
88
107
  end
89
108
 
90
109
  pattern.children.each do |child|