scrubyt 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -69,12 +69,18 @@ module Scrubyt
69
69
  #Evaluate this filter. This method should not be called directly - as the pattern hierarchy
70
70
  #is evaluated, every pattern evaluates its filters and then they are calling this method
71
71
  def evaluate(source)
72
- case @parent_pattern.type
72
+ case @parent_pattern.type
73
73
  when Scrubyt::Pattern::PATTERN_TYPE_TREE
74
74
  result = source/@xpath
75
+ #puts "Evaluating #{@parent_pattern.name} with #{@xpath}"
75
76
  result.class == Hpricot::Elements ? result.map : [result]
76
77
  when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
77
- [source.attributes[@example]]
78
+ attribute_value = [source.attributes[@example]]
79
+ return attribute_value if attribute_value[0]
80
+ @@attribute_in_parent = nil
81
+ Filter.traverse_up_until_attribute_found(source.parent, @example)
82
+ @@attribute_in_parent
83
+
78
84
  when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
79
85
  source.inner_text.scan(@example).flatten
80
86
  end
@@ -88,7 +94,9 @@ module Scrubyt
88
94
  when EXAMPLE_TYPE_XPATH
89
95
  @xpath = @example
90
96
  when EXAMPLE_TYPE_STRING
91
- @temp_sink = XPathUtils.find_node_from_text( @parent_pattern.root_pattern.filters[0].source[0], @example, false )
97
+ @temp_sink = XPathUtils.find_node_from_text( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
98
+ @example,
99
+ false )
92
100
  @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
93
101
  XPathUtils.generate_XPath(@temp_sink, nil, true)
94
102
  when EXAMPLE_TYPE_CHILDREN
@@ -127,7 +135,7 @@ module Scrubyt
127
135
  current_example_index += 1
128
136
  end
129
137
  when EXAMPLE_TYPE_IMAGE
130
- @temp_sink = XPathUtils.find_image(@parent_pattern.root_pattern.filters[0].source[0], @example)
138
+ @temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)
131
139
  @xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
132
140
  end
133
141
  end
@@ -139,6 +147,16 @@ module Scrubyt
139
147
  end
140
148
 
141
149
  private
150
##
#Walk up the ancestor chain starting at +source+ and remember the value of
#+attribute+ from the nearest node that carries it.
#
#The value is stored in the class variable @@attribute_in_parent; nothing is
#stored if no visited node has the attribute. The walk stops as soon as a
#value is found, or when the node's parent is the Hpricot::Doc itself.
#
#source::    node to start from (it is inspected as well)
#attribute:: name of the attribute to look up in +source.attributes+
def self.traverse_up_until_attribute_found(source, attribute)
  #The node directly below the document is not inspected
  return if source.parent.is_a? Hpricot::Doc
  value = source.attributes[attribute]
  if value
    #Nearest match wins - do not let a farther ancestor overwrite it
    @@attribute_in_parent = value
  else
    traverse_up_until_attribute_found(source.parent, attribute)
  end
end
159
+
142
160
  def self.determine_example_type(example)
143
161
  if example.instance_of? Regexp
144
162
  EXAMPLE_TYPE_REGEXP
@@ -41,23 +41,18 @@ module Scrubyt
41
41
  SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'example']
42
42
 
43
43
  attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
44
- :last_result, :result, :root_pattern, :example, :block_count,
45
- :next_page, :limit, :extractor, :extracted_docs,
46
- :examples, :parent_of_leaf, :document_index
47
- attr_reader :type, :generalize_set, :next_page_url
44
+ :last_result, :result, :example, :limit,
45
+ :examples, :parent_of_leaf, :evaluation_context,
46
+ :indices_to_extract, :evaluation_context
47
+ attr_reader :type, :generalize_set, :next_page_url, :result_indexer
48
48
 
49
49
  def initialize (name, *args)
50
50
  @name = name #name of the pattern
51
51
  parse_args(args) #parse the rest of the arguments
52
- @root_pattern = nil #root pattern of the wrapper
53
52
  @children = [] #child patterns
54
53
  @filters = [] #filters of the wrapper
55
54
  @result = Result.new #hierarchical results of the pattern
56
- @@instance_count = Hash.new(0)
57
- @evaluated_examples = []
58
- @next_page = nil
59
- @document_index = 0
60
- if @examples == nil
55
+ if @examples == nil
61
56
  filters << Scrubyt::Filter.new(self) #create a default filter
62
57
  else
63
58
  @examples.each do |example|
@@ -112,6 +107,9 @@ module Scrubyt
112
107
  # camera_data.item[1].item_name[0]
113
108
  def method_missing(method_name, *args, &block)
114
109
  case method_name.to_s
110
+ when 'select_indices'
111
+ @result_indexer = Scrubyt::ResultIndexer.new(*args)
112
+ self
115
113
  when /^to_/
116
114
  Scrubyt::ResultDumper.send(method_name.to_s, self)
117
115
  when /^ensure_/
@@ -135,9 +133,9 @@ module Scrubyt
135
133
  # camera_data[1].item[1].item_name[0]
136
134
  def [](index)
137
135
  if @name == 'root'
138
- @root_pattern.document_index = index
136
+ @evaluation_context.document_index = index
139
137
  else
140
- @parent.last_result = @parent.last_result[@root_pattern.document_index] if @parent.last_result.is_a? Array
138
+ @parent.last_result = @parent.last_result[@evaluation_context.document_index] if @parent.last_result.is_a? Array
141
139
  return nil if (@result.lookup(@parent.last_result)) == nil
142
140
  @last_result = @result.lookup(@parent.last_result)[index]
143
141
  end
@@ -150,7 +148,7 @@ module Scrubyt
150
148
  def export(file, output_file_name=nil, extractor_result_file_name=nil)
151
149
  Scrubyt::Export.export(file, self, output_file_name, extractor_result_file_name)
152
150
  end
153
-
151
+
154
152
  ##
155
153
  #Add a filter to this pattern
156
154
  def add_filter(filter)
@@ -167,39 +165,7 @@ module Scrubyt
167
165
  child.generalize = true if (!child.generalize_set && child.parent != nil && child.parent.parent == nil)
168
166
  @children << child
169
167
  end
170
-
171
- ##
172
- #Crawl to a new page. This function should not be called from the outside - it is automatically called
173
- #if the next_page is defined
174
- def crawl_to_new_page
175
- temp_document = generate_next_page_link(@next_page)
176
- return nil if temp_document == nil
177
- clear_sources_and_sinks(@root_pattern)
178
- @root_pattern.extractor.fetch(temp_document)
179
- attach_current_document
180
- end
181
-
182
- ##
183
- #Attach document to the root pattern; This is happening automatically as the root pattern is defined or
184
- #crawling to a new page
185
- def attach_current_document
186
- doc = @root_pattern.extractor.get_hpricot_doc
187
- filters[0].source << doc
188
- filters[0].sink << doc
189
- @last_result ||= []
190
- @last_result << doc
191
- @result.add_result(filters[0].source, filters[0].sink)
192
- end
193
-
194
- ##
195
- #Based on the given examples, calculate the XPaths for the tree patterns
196
- def setup_examples
197
- get_root_pattern(self)
198
- mark_leaf_parents(self)
199
- set_root_pattern_whole_wrapper(@root_pattern, @root_pattern)
200
- generate_examples(@root_pattern)
201
- end
202
-
168
+
203
169
  ##
204
170
  #Evaluate the pattern. This means evaluating all the filters and adding
205
171
  #their extracted instances to the array of results of this pattern
@@ -225,23 +191,21 @@ module Scrubyt
225
191
  end
226
192
  result = result_hash.reject {|k,v| k if !v}
227
193
  sorted_result = r.reject {|e| !result.keys.include? e}
228
- add_result(filter, source, sorted_result)
194
+ indexer = @result_indexer == nil ? sorted_result : @result_indexer.select_indices_to_extract(sorted_result)
195
+ add_result(filter, source, indexer)
229
196
  else
230
- add_result(filter, source, r)
197
+ indexer = @result_indexer == nil ? r : @result_indexer.select_indices_to_extract(r)
198
+ add_result(filter, source, indexer)
231
199
  end#end of constraint check
232
200
  end#end of source iteration
233
201
  end#end of filter iteration
234
- end
235
-
236
- def get_instance_count
237
- @@instance_count
238
- end
202
+ end
239
203
 
240
204
  def get_constraints
241
205
  filters[0].constraints
242
- end
243
-
244
- private
206
+ end
207
+
208
+ private
245
209
  def look_for_examples(args)
246
210
  if (args[0].is_a? String)
247
211
  @examples = args.select {|e| e.is_a? String}
@@ -269,48 +233,7 @@ private
269
233
  results.each do |res|
270
234
  filter.sink << res
271
235
  @result.add_result(source, res)
272
- @@instance_count[@name] += 1
273
236
  end
274
- end
275
-
276
- def get_root_pattern(pattern)
277
- if @root_pattern == nil
278
- while (pattern.parent != nil)
279
- get_root_pattern(pattern.parent)
280
- end
281
- @root_pattern = pattern
282
- end
283
- end
284
-
285
- def mark_leaf_parents(pattern)
286
- pattern.children.each { |child|
287
- pattern.parent_of_leaf = true if child.children.size == 0
288
- }
289
- pattern.children.each { |child| mark_leaf_parents(child) }
290
- end
291
-
292
- def set_root_pattern_whole_wrapper(pattern, root_pattern)
293
- pattern.children.each {|child| set_root_pattern_whole_wrapper(child, root_pattern)}
294
- pattern.root_pattern = root_pattern
295
- end
296
-
297
- def generate_examples(pattern)
298
- pattern.children.each {|child_pattern| generate_examples(child_pattern) }
299
- pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == PATTERN_TYPE_TREE
300
- end
301
-
302
- def clear_sources_and_sinks(pattern)
303
- pattern.filters.each do |filter|
304
- filter.source = []
305
- filter.sink = []
306
- end
307
- pattern.children.each {|child| clear_sources_and_sinks child}
308
- end
309
-
310
- def generate_next_page_link(example)
311
- node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example, true)
312
- return nil if node == nil
313
- node.attributes['href'].gsub('&amp;') {'&'}
314
- end # end of method generate_next_page_link
237
+ end # end of method generate_examples
315
238
  end #end of class Pattern
316
239
  end #end of module Scrubyt
@@ -0,0 +1,13 @@
1
module Scrubyt
  ##
  #=<tt>Apply different functions on the input document</tt>
  #Before the document is passed to Hpricot for parsing, we may need
  #to do different stuff with it which are clumsy/not appropriate/impossible
  #to do once the document is loaded.
  class PreFilterDocument
    ##
    #Replace <br> tags with newlines.
    #
    #Matches <br>, <br/>, <br /> in any letter case, and also <br> tags that
    #carry attributes (e.g. <br clear="all">). Other tags whose name merely
    #starts with "br" are left alone.
    #
    #doc:: the raw document as a String
    #
    #Returns a new String where every matched tag is replaced by "\r\n".
    def self.br_to_newline(doc)
      doc.gsub(/<br\b[^>]*>/i, "\r\n")
    end #end of function br_to_newline
  end #end of class PreFilterDocument
end #end of module Scrubyt
@@ -0,0 +1,88 @@
1
module Scrubyt
  ##
  #=<tt>Selecting results based on indices</tt>
  #
  #If the result is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
  #probably with a variable count of results (like tags, authors etc.), you may need just
  #specific elements - like the last one, every third one, or at specific indices.
  #In this case you should use the select_indices syntax.
  class ResultIndexer
    #Flat list of the requested indices (Integers and/or keyword Symbols);
    #nil if the specification passed to the constructor was invalid
    attr_reader :indices_to_extract

    ##
    #args:: the index specification; only the first argument is used
    #       (see select_indices for the accepted forms)
    def initialize(*args)
      select_indices(*args)
    end

    ##
    #Perform selection of the desired result instances, based on their indices.
    #
    #Selection is done purely by position, so equal (duplicate) values in
    #+ary+ are handled independently of each other.
    #
    #ary:: the array of results; it is modified in place
    #
    #Returns +ary+ itself, containing only the elements at the selected
    #positions (in their original order). If no valid index specification
    #was given, +ary+ is returned untouched.
    def select_indices_to_extract(ary)
      return ary if @indices_to_extract.nil?
      wanted = positions_to_keep(ary.size)
      kept = []
      ary.each_with_index { |element, position| kept << element if wanted.include?(position) }
      #Keep the in-place contract: callers receive the very same array object
      ary.replace(kept)
      ary
    end

    private

    ##
    #Translate the stored specification into concrete 0-based positions for
    #an array of the given size. Keywords are resolved as follows:
    #- :first         -> position 0
    #- :last          -> position size-1
    #- :all_but_last  -> positions 0..size-2
    #- :all_but_first -> positions 1..size-1
    #- :every_even    -> positions 1, 3, 5, ... (2nd, 4th, ... element)
    #- :every_odd     -> positions 0, 2, 4, ... (1st, 3rd, ... element)
    #- :every_second  -> positions 0, 2, 4, ...
    #- :every_third   -> positions 0, 3, 6, ...
    #Unknown keywords select nothing; every non-Symbol entry is taken as a position.
    def positions_to_keep(size)
      positions = []
      @indices_to_extract.each do |spec|
        case spec
        when :first         then positions << 0
        when :last          then positions << (size - 1)
        when :all_but_last  then positions.concat((0...(size - 1)).to_a)
        when :all_but_first then positions.concat((1...size).to_a)
        when :every_even    then positions.concat(every_nth(size, 2, 1))
        when :every_odd, :every_second
          positions.concat(every_nth(size, 2, 0))
        when :every_third   then positions.concat(every_nth(size, 3, 0))
        when Symbol         then nil #unknown keyword - contributes no position
        else positions << spec
        end
      end
      positions
    end

    #Positions below +size+ whose remainder modulo +step+ equals +offset+
    def every_nth(size, step, offset)
      (0...size).select { |i| i % step == offset }
    end

    #True for the containers that get flattened in an Array specification
    def container?(entry)
      entry.is_a?(Range) || entry.is_a?(Array)
    end

    ##
    #Do not return the whole result set, just specified indices - like
    #first,last, every odd index, indices from [1..3] etc.
    #
    #This method can accept:
    #- a range, like (2..3)
    #- an array of indices, like [1,2,3]; ranges and arrays nested one level
    #  deep are flattened, and keywords may be mixed in, like [:first, :last]
    #- specified set of keywords:
    #  - :first
    #  - :last
    #  - :all_but_first
    #  - :all_but_last
    #  - :every_even
    #  - :every_odd
    #  - :every_second
    #  - :every_third
    #
    #The caller's array is never modified. For any other kind of argument a
    #message is printed and no selection will take place.
    def select_indices(*args)
      indices_to_grab = args[0]
      case indices_to_grab
      when Range
        @indices_to_extract = indices_to_grab.to_a
      when Array
        scalars, containers = indices_to_grab.partition { |e| !container?(e) }
        nested = containers.inject([]) { |acc, c| acc.concat(c.to_a) }
        #Only one level of nesting is supported - deeper containers are dropped
        nested = nested.reject { |e| container?(e) }
        @indices_to_extract = (scalars + nested).uniq
      when Symbol
        #parse this when we already have the results
        @indices_to_extract = [indices_to_grab]
      else
        puts "Invalid index specification"
      end
    end #end of function select_indices
  end #end of class ResultIndexer
end #end of module Scrubyt
@@ -0,0 +1,97 @@
1
module Scrubyt
  ##
  #=<tt>Holding the evaluation context of the extraction process</tt>
  #
  #Every kind of data that is shared among patterns during the extraction process
  #is held in this class, so it can be looked up anytime.
  #
  #This class also provides some high-level basic functionality in navigation, like
  #crawling to new pages, attaching the document to the root pattern once arrived at the
  #desired page etc.
  #
  #It can be viewed as a glue between Extractor and NavigationActions as well - these
  #two classes need to communicate frequently as well as share different information
  #and this is accomplished through EvaluationContext.
  class EvaluationContext
    attr_accessor :root_pattern, :next_page, :document_index, :block_count,
                  :extractor, :limit

    def initialize
      @root_pattern = nil
      @next_page = nil
      @block_count = 0
      @document_index = 0
      @extractor = nil
      @limit = nil
    end

    ##
    #Crawl to a new page. This function should not be called from the outside - it is automatically called
    #if the next_page is defined
    #
    #Returns nil if no link to the next page could be determined; otherwise
    #the sources/sinks of all patterns are reset, the page is fetched through
    #the extractor and the new document is attached to the root pattern.
    def crawl_to_new_page
      temp_document = generate_next_page_link(@next_page)
      return nil if temp_document.nil?
      clear_sources_and_sinks(@root_pattern)
      @extractor.fetch(temp_document)
      attach_current_document
    end

    ##
    #Attach document to the root pattern; This is happening automatically as the root pattern is defined or
    #crawling to a new page
    def attach_current_document
      doc = @extractor.get_hpricot_doc
      root_filter = @root_pattern.filters[0]
      root_filter.source << doc
      root_filter.sink << doc
      @root_pattern.last_result ||= []
      @root_pattern.last_result << doc
      @root_pattern.result.add_result(root_filter.source, root_filter.sink)
    end

    ##
    #Based on the given examples, calculate the XPaths for the tree patterns
    #
    #Does nothing when no root pattern has been registered yet.
    def setup_examples
      get_root_pattern(nil)
      return if @root_pattern.nil?
      mark_leaf_parents(@root_pattern)
      generate_examples(@root_pattern)
    end

    ##
    #After crawling to the new page, the sources and sinks need to be cleaned
    #since they are no more valid
    #
    #Works recursively on +pattern+ and all of its descendants.
    def clear_sources_and_sinks(pattern)
      pattern.filters.each do |filter|
        filter.source = []
        filter.sink = []
      end
      pattern.children.each { |child| clear_sources_and_sinks(child) }
    end

    ##
    #Look up the node matching +example+ in the first source of the root
    #pattern and return the value of its href attribute, with '&amp;'
    #replaced by '&'.
    #
    #Returns nil if no such node is found or the node has no href.
    def generate_next_page_link(example)
      node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example, true)
      return nil if node.nil?
      href = node.attributes['href']
      #A matching node without href can not be followed
      return nil if href.nil?
      href.gsub('&amp;') { '&' }
    end

    ##
    #Set parent_of_leaf to true on every pattern (starting at +pattern+)
    #that has at least one child without children of its own.
    def mark_leaf_parents(pattern)
      pattern.children.each do |child|
        pattern.parent_of_leaf = true if child.children.size == 0
      end
      pattern.children.each { |child| mark_leaf_parents(child) }
    end

    ##
    #Let every filter of every tree-type pattern generate its XPath,
    #children first.
    def generate_examples(pattern)
      pattern.children.each { |child_pattern| generate_examples(child_pattern) }
      return unless pattern.type == Pattern::PATTERN_TYPE_TREE
      pattern.filters.each { |filter| filter.generate_XPath_for_example }
    end

    ##
    #Determine the root pattern by following the parent links upwards from
    #+pattern+ and store it in @root_pattern.
    #
    #Nothing happens if the root pattern is already known or +pattern+ is nil.
    def get_root_pattern(pattern)
      return unless @root_pattern.nil?
      return if pattern.nil?
      #Climb until a pattern without parent is reached
      pattern = pattern.parent until pattern.parent.nil?
      @root_pattern = pattern
    end #end of function
  end #end of class EvaluationContext
end #end of module Scrubyt
@@ -0,0 +1,116 @@
1
+ require 'logger'
2
+ require 'open-uri'
3
+ require 'rubygems'
4
+ require 'mechanize'
5
+ require 'hpricot'
6
+
7
module Scrubyt
  ##
  #=<tt>Driving the whole extraction process</tt>
  #
  #Extractor is a performer class - it gets an extractor definition and carries
  #out the actions and evaluates the wrappers sequentially.
  #
  #Originally also the navigation actions were here, but since the class got too
  #big, they were factored out to an own class, NavigationAction.
  class Extractor
    ##
    #The definition of the extractor is passed through this method
    #
    #mode::                 :production selects production mode, anything else learning mode
    #extractor_definition:: block describing the navigation steps and patterns;
    #                       it is evaluated in the context of this class
    #
    #Returns the (artificial) root pattern after evaluation and post-processing.
    #If the block does not yield a pattern, a message is printed and the process exits.
    def self.define(mode=nil, &extractor_definition)
      @@mode = mode
      mode_name = (mode == :production ? 'Production' : 'Learning')
      puts "[MODE] #{mode_name}"
      NavigationActions.new
      @@evaluation_context = EvaluationContext.new
      #Make sure a leading next_page call can safely read the last pattern
      @@last_pattern = nil
      #Hack up an artificial root pattern (i.e. do not return the pattern which
      #is the root one in the user's definition, but rather the real (invisible)
      #root pattern
      evaluated_extractor = class_eval(&extractor_definition)
      if evaluated_extractor.nil?
        puts "No extractor defined, exiting..."
        exit
      end
      root_pattern = evaluated_extractor.parent
      #Recursively match data based on examples
      @@evaluation_context.setup_examples
      #Once all is set up, evaluate the extractor from the root pattern!
      evaluate_extractor(root_pattern)
      #Apply all postprocess steps
      PostProcessor.apply_post_processing(root_pattern)
      #Return the root pattern
      puts "Extraction finished succesfully!"
      root_pattern
    end

    ##
    #build the current wrapper
    #
    #Every unknown method called inside the definition block ends up here:
    #- navigation keywords are forwarded to NavigationActions (returns nil)
    #- a top level 'next_page' stores the link example and the optional
    #  :limit in the evaluation context and returns the last defined pattern
    #- anything else creates a pattern named after the method; at top level
    #  it is attached to a freshly created root pattern, otherwise to the
    #  pattern whose block is currently being evaluated
    def self.method_missing(method_name, *args, &block)
      if NavigationActions::KEYWORDS.include? method_name.to_s
        NavigationActions.send(method_name, *args)
        return
      end
      pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
      pattern.evaluation_context = @@evaluation_context
      if @parent.nil?
        if method_name.to_s == 'next_page'
          @@evaluation_context.next_page = args[0]
          @@evaluation_context.limit = args[1][:limit] if args.size > 1
          return @@last_pattern
        else
          #Create a root pattern
          root_pattern = Scrubyt::Pattern.new('root', :type => :root)
          root_pattern.evaluation_context = @@evaluation_context
          @@evaluation_context.root_pattern = root_pattern
          @@evaluation_context.extractor = self
          #add the currently active document to the root pattern
          @@evaluation_context.attach_current_document
          @@evaluation_context.root_pattern.add_child_pattern(pattern)
          @@evaluation_context.block_count = 0
        end
      else
        @parent.add_child_pattern(pattern)
      end
      if block_given?
        @@evaluation_context.block_count = @@evaluation_context.block_count + 1
        #Patterns defined inside the block become children of this pattern;
        #the stack restores the previous parent once the block is done
        @stack ||= []
        @parent = pattern
        @stack.push @parent
        class_eval(&block)
        @stack.pop
        @parent = @stack.last
      end
      @@last_pattern = pattern
    end

    #Used in lord of the hacks vol 1. Check out export.rb if you are still interested
    #(You should not be :)
    #
    #Returns the block count kept in the evaluation context.
    def self.get_block_count
      @@evaluation_context.block_count
    end

    #Returns the document currently held by NavigationActions
    def self.get_hpricot_doc
      NavigationActions.get_hpricot_doc
    end

    #Returns the mode that was passed to the last call of define
    def self.get_mode
      @@mode
    end

    #NOTE: 'private' has no effect on methods defined with 'def self.';
    #the methods below are internal helpers nevertheless
    private

    ##
    #Evaluate the whole pattern tree. If a next_page link was defined, keep
    #evaluating and crawling until the page limit is reached or no further
    #page can be found.
    def self.evaluate_extractor(root_pattern)
      if @@evaluation_context.next_page
        current_page_count = 1
        loop do
          really_evaluate_extractor(root_pattern)
          break if (@@evaluation_context.limit == current_page_count || @@evaluation_context.crawl_to_new_page == nil)
          #Pages are only counted when there is a limit to compare against
          current_page_count += 1 if @@evaluation_context.limit != nil
        end
      else
        really_evaluate_extractor(root_pattern)
      end
    end

    #Evaluate +pattern+ and then, recursively, all of its children
    def self.really_evaluate_extractor(pattern)
      pattern.evaluate
      pattern.children.each { |child| really_evaluate_extractor child }
    end #end of method evaluate_wrapper
  end #end of class Extractor
end #end of module Scrubyt