scrubyt 0.2.0 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -69,12 +69,18 @@ module Scrubyt
69
69
  #Evaluate this filter. This method should not be called directly - as the pattern hierarchy
70
70
  #is evaluated, every pattern evaluates its filters and then they are calling this method
71
71
  def evaluate(source)
72
- case @parent_pattern.type
72
+ case @parent_pattern.type
73
73
  when Scrubyt::Pattern::PATTERN_TYPE_TREE
74
74
  result = source/@xpath
75
+ #puts "Evaluating #{@parent_pattern.name} with #{@xpath}"
75
76
  result.class == Hpricot::Elements ? result.map : [result]
76
77
  when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
77
- [source.attributes[@example]]
78
+ attribute_value = [source.attributes[@example]]
79
+ return attribute_value if attribute_value[0]
80
+ @@attribute_in_parent = nil
81
+ Filter.traverse_up_until_attribute_found(source.parent, @example)
82
+ @@attribute_in_parent
83
+
78
84
  when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
79
85
  source.inner_text.scan(@example).flatten
80
86
  end
@@ -88,7 +94,9 @@ module Scrubyt
88
94
  when EXAMPLE_TYPE_XPATH
89
95
  @xpath = @example
90
96
  when EXAMPLE_TYPE_STRING
91
- @temp_sink = XPathUtils.find_node_from_text( @parent_pattern.root_pattern.filters[0].source[0], @example, false )
97
+ @temp_sink = XPathUtils.find_node_from_text( @parent_pattern.evaluation_context.root_pattern.filters[0].source[0],
98
+ @example,
99
+ false )
92
100
  @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
93
101
  XPathUtils.generate_XPath(@temp_sink, nil, true)
94
102
  when EXAMPLE_TYPE_CHILDREN
@@ -127,7 +135,7 @@ module Scrubyt
127
135
  current_example_index += 1
128
136
  end
129
137
  when EXAMPLE_TYPE_IMAGE
130
- @temp_sink = XPathUtils.find_image(@parent_pattern.root_pattern.filters[0].source[0], @example)
138
+ @temp_sink = XPathUtils.find_image(@parent_pattern.evaluation_context.root_pattern.filters[0].source[0], @example)
131
139
  @xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
132
140
  end
133
141
  end
@@ -139,6 +147,16 @@ module Scrubyt
139
147
  end
140
148
 
141
149
  private
150
##
#Walk up the tree from +source+, looking for the nearest node that carries
#+attribute+.
#
#The value found is stored in @@attribute_in_parent (the caller resets that
#variable beforehand and reads it afterwards) and is returned as well. If no
#examined node has the attribute, @@attribute_in_parent is left untouched and
#nil is returned.
#
#A node whose parent is the Hpricot::Doc is never examined, so the walk ends
#just below the element attached directly to the document.
#
#The previous version tested the instance variable @attribute_in_parent
#(always nil here) instead of the class variable, so it never stopped at the
#first hit: every further ancestor overwrote the result and the outermost
#match won.
def self.traverse_up_until_attribute_found(source, attribute)
  node = source
  until node.parent.is_a? Hpricot::Doc
    value = node.attributes[attribute]
    #Stop at the closest ancestor that has the attribute
    return @@attribute_in_parent = value if value
    node = node.parent
  end
  nil
end
159
+
142
160
  def self.determine_example_type(example)
143
161
  if example.instance_of? Regexp
144
162
  EXAMPLE_TYPE_REGEXP
@@ -41,23 +41,18 @@ module Scrubyt
41
41
  SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'example']
42
42
 
43
43
  attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
44
- :last_result, :result, :root_pattern, :example, :block_count,
45
- :next_page, :limit, :extractor, :extracted_docs,
46
- :examples, :parent_of_leaf, :document_index
47
- attr_reader :type, :generalize_set, :next_page_url
44
+ :last_result, :result, :example, :limit,
45
+ :examples, :parent_of_leaf, :evaluation_context,
46
+ :indices_to_extract, :evaluation_context
47
+ attr_reader :type, :generalize_set, :next_page_url, :result_indexer
48
48
 
49
49
  def initialize (name, *args)
50
50
  @name = name #name of the pattern
51
51
  parse_args(args) #parse the rest of the arguments
52
- @root_pattern = nil #root pattern of the wrapper
53
52
  @children = [] #child patterns
54
53
  @filters = [] #filters of the wrapper
55
54
  @result = Result.new #hierarchical results of the pattern
56
- @@instance_count = Hash.new(0)
57
- @evaluated_examples = []
58
- @next_page = nil
59
- @document_index = 0
60
- if @examples == nil
55
+ if @examples == nil
61
56
  filters << Scrubyt::Filter.new(self) #create a default filter
62
57
  else
63
58
  @examples.each do |example|
@@ -112,6 +107,9 @@ module Scrubyt
112
107
  # camera_data.item[1].item_name[0]
113
108
  def method_missing(method_name, *args, &block)
114
109
  case method_name.to_s
110
+ when 'select_indices'
111
+ @result_indexer = Scrubyt::ResultIndexer.new(*args)
112
+ self
115
113
  when /^to_/
116
114
  Scrubyt::ResultDumper.send(method_name.to_s, self)
117
115
  when /^ensure_/
@@ -135,9 +133,9 @@ module Scrubyt
135
133
  # camera_data[1].item[1].item_name[0]
136
134
  def [](index)
137
135
  if @name == 'root'
138
- @root_pattern.document_index = index
136
+ @evaluation_context.document_index = index
139
137
  else
140
- @parent.last_result = @parent.last_result[@root_pattern.document_index] if @parent.last_result.is_a? Array
138
+ @parent.last_result = @parent.last_result[@evaluation_context.document_index] if @parent.last_result.is_a? Array
141
139
  return nil if (@result.lookup(@parent.last_result)) == nil
142
140
  @last_result = @result.lookup(@parent.last_result)[index]
143
141
  end
@@ -150,7 +148,7 @@ module Scrubyt
150
148
  def export(file, output_file_name=nil, extractor_result_file_name=nil)
151
149
  Scrubyt::Export.export(file, self, output_file_name, extractor_result_file_name)
152
150
  end
153
-
151
+
154
152
  ##
155
153
  #Add a filter to this pattern
156
154
  def add_filter(filter)
@@ -167,39 +165,7 @@ module Scrubyt
167
165
  child.generalize = true if (!child.generalize_set && child.parent != nil && child.parent.parent == nil)
168
166
  @children << child
169
167
  end
170
-
171
- ##
172
- #Crawl to a new page. This function should not be called from the outside - it is automatically called
173
- #if the next_page is defined
174
- def crawl_to_new_page
175
- temp_document = generate_next_page_link(@next_page)
176
- return nil if temp_document == nil
177
- clear_sources_and_sinks(@root_pattern)
178
- @root_pattern.extractor.fetch(temp_document)
179
- attach_current_document
180
- end
181
-
182
- ##
183
- #Attach document to the root pattern; This is happening automatically as the root pattern is defined or
184
- #crawling to a new page
185
- def attach_current_document
186
- doc = @root_pattern.extractor.get_hpricot_doc
187
- filters[0].source << doc
188
- filters[0].sink << doc
189
- @last_result ||= []
190
- @last_result << doc
191
- @result.add_result(filters[0].source, filters[0].sink)
192
- end
193
-
194
- ##
195
- #Based on the given examples, calculate the XPaths for the tree patterns
196
- def setup_examples
197
- get_root_pattern(self)
198
- mark_leaf_parents(self)
199
- set_root_pattern_whole_wrapper(@root_pattern, @root_pattern)
200
- generate_examples(@root_pattern)
201
- end
202
-
168
+
203
169
  ##
204
170
  #Evaluate the pattern. This means evaluating all the filters and adding
205
171
  #their extracted instances to the array of results of this pattern
@@ -225,23 +191,21 @@ module Scrubyt
225
191
  end
226
192
  result = result_hash.reject {|k,v| k if !v}
227
193
  sorted_result = r.reject {|e| !result.keys.include? e}
228
- add_result(filter, source, sorted_result)
194
+ indexer = @result_indexer == nil ? sorted_result : @result_indexer.select_indices_to_extract(sorted_result)
195
+ add_result(filter, source, indexer)
229
196
  else
230
- add_result(filter, source, r)
197
+ indexer = @result_indexer == nil ? r : @result_indexer.select_indices_to_extract(r)
198
+ add_result(filter, source, indexer)
231
199
  end#end of constraint check
232
200
  end#end of source iteration
233
201
  end#end of filter iteration
234
- end
235
-
236
- def get_instance_count
237
- @@instance_count
238
- end
202
+ end
239
203
 
240
204
  def get_constraints
241
205
  filters[0].constraints
242
- end
243
-
244
- private
206
+ end
207
+
208
+ private
245
209
  def look_for_examples(args)
246
210
  if (args[0].is_a? String)
247
211
  @examples = args.select {|e| e.is_a? String}
@@ -269,48 +233,7 @@ private
269
233
  results.each do |res|
270
234
  filter.sink << res
271
235
  @result.add_result(source, res)
272
- @@instance_count[@name] += 1
273
236
  end
274
- end
275
-
276
- def get_root_pattern(pattern)
277
- if @root_pattern == nil
278
- while (pattern.parent != nil)
279
- get_root_pattern(pattern.parent)
280
- end
281
- @root_pattern = pattern
282
- end
283
- end
284
-
285
- def mark_leaf_parents(pattern)
286
- pattern.children.each { |child|
287
- pattern.parent_of_leaf = true if child.children.size == 0
288
- }
289
- pattern.children.each { |child| mark_leaf_parents(child) }
290
- end
291
-
292
- def set_root_pattern_whole_wrapper(pattern, root_pattern)
293
- pattern.children.each {|child| set_root_pattern_whole_wrapper(child, root_pattern)}
294
- pattern.root_pattern = root_pattern
295
- end
296
-
297
- def generate_examples(pattern)
298
- pattern.children.each {|child_pattern| generate_examples(child_pattern) }
299
- pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == PATTERN_TYPE_TREE
300
- end
301
-
302
- def clear_sources_and_sinks(pattern)
303
- pattern.filters.each do |filter|
304
- filter.source = []
305
- filter.sink = []
306
- end
307
- pattern.children.each {|child| clear_sources_and_sinks child}
308
- end
309
-
310
- def generate_next_page_link(example)
311
- node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example, true)
312
- return nil if node == nil
313
- node.attributes['href'].gsub('&amp;') {'&'}
314
- end # end of method generate_next_page_link
237
+ end # end of method generate_examples
315
238
  end #end of class Pattern
316
239
  end #end of module Scrubyt
@@ -0,0 +1,13 @@
1
module Scrubyt
  ##
  #=<tt>Apply different functions on the input document</tt>
  #Before the document is passed to Hpricot for parsing, we may need
  #to apply transformations which are clumsy, inappropriate or impossible
  #to do once the document is loaded.
  class PreFilterDocument
    #Matches <br>, <br/>, <br /> (case insensitive) as well as br tags that
    #carry attributes, e.g. <br clear="all">. Tags that merely start with
    #"br" (like <break>) are not matched.
    BR_TAG = /<br(?:[\s\/][^>]*)?>/i

    ##
    #Replace <br/> tags with newlines
    #
    #doc:: the raw document as a String
    #
    #Returns a new String in which every br tag is replaced by "\r\n";
    #+doc+ itself is not modified.
    def self.br_to_newline(doc)
      doc.gsub(BR_TAG, "\r\n")
    end #end of function br_to_newline
  end #end of class PreFilterDocument
end #end of module Scrubyt
@@ -0,0 +1,88 @@
1
module Scrubyt
  ##
  #=<tt>Selecting results based on indices</tt>
  #
  #If the result is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
  #probably with a variable count of results (like tags, authors etc.), you may need just
  #specific elements - like the last one, every third one, or at specific indices.
  #In this case you should use the select_indices syntax.
  class ResultIndexer
    #The normalized index specification: a flat array of Integers and/or
    #keyword Symbols, or nil if the specification was invalid
    attr_reader :indices_to_extract

    #Accepts the same arguments as select_indices (see below)
    def initialize(*args)
      select_indices(*args)
    end

    ##
    #Perform selection of the desired result instances, based on their indices
    #
    #ary:: the array of result instances
    #
    #The array is filtered *in place* and returned. If no valid specification
    #was given, it is returned untouched.
    #
    #Keywords are resolved against the actual size of +ary+:
    #- :first, :last
    #- :all_but_first, :all_but_last
    #- :every_even  - positions 1, 3, 5... (i.e. the 2nd, 4th... element)
    #- :every_odd, :every_second  - positions 0, 2, 4...
    #- :every_third  - positions 0, 3, 6...
    #Unknown keywords and indices that are out of range select nothing.
    #
    #Elements are chosen by their position. (Looking the position up with
    #ary.index(element), as it was done before, picks the first occurrence
    #of a value and therefore breaks if the results contain duplicates.)
    def select_indices_to_extract(ary)
      return ary if @indices_to_extract == nil
      last_index = ary.size - 1
      to_keep = []
      @indices_to_extract.each do |spec|
        case spec
        when :first
          to_keep << 0
        when :last
          to_keep << last_index
        when :all_but_last
          to_keep.concat((0..last_index - 1).to_a)
        when :all_but_first
          to_keep.concat((1..last_index).to_a)
        when :every_even
          (0..last_index).each {|i| to_keep << i if (i % 2 == 1)}
        when :every_odd, :every_second
          (0..last_index).each {|i| to_keep << i if (i % 2 == 0)}
        when :every_third
          (0..last_index).each {|i| to_keep << i if (i % 3 == 0)}
        when Symbol
          #unknown keyword - selects nothing
        else
          to_keep << spec
        end
      end
      kept = []
      ary.each_with_index {|element, i| kept << element if to_keep.include? i}
      ary.replace(kept)
    end

  private
    ##
    #Do not return the whole result set, just specified indices - like
    #first,last, every odd index, indices from [1..3] etc.
    #
    #This method can accept:
    #- a range, like (2..3)
    #- an array of indices, like [1,2,3]; it may contain ranges and
    #  (one level of) nested arrays, as well as keywords
    #- one of the keywords understood by select_indices_to_extract,
    #  like :first, :last, :every_even, :every_odd
    #
    #Only the first argument is evaluated. The array passed in is not
    #modified. For any other kind of argument a message is printed and
    #no selection is set up.
    def select_indices(*args)
      indices_to_grab = args[0]
      case indices_to_grab
      when Range
        @indices_to_extract = indices_to_grab.to_a
      when Array
        @indices_to_extract = flatten_index_list(indices_to_grab)
      when Symbol
        #parse this when we already have the results
        @indices_to_extract = [indices_to_grab]
      else
        puts "Invalid index specification"
      end
    end #end of function select_indices

    ##
    #Build a new flat list out of +list+: the plain entries come first, in their
    #original order, followed by the members of the nested ranges/arrays
    #which are not in the list yet.
    def flatten_index_list(list)
      flat = list.reject {|e| compound?(e)}
      list.each do |e|
        next unless compound?(e)
        e.to_a.each {|i| flat << i unless compound?(i) || flat.include?(i)}
      end
      flat
    end

    #Is this entry a container of indices rather than a single index?
    def compound?(entry)
      (entry.is_a? Range) || (entry.is_a? Array)
    end
  end #end of class ResultIndexer
end #end of module Scrubyt
@@ -0,0 +1,97 @@
1
module Scrubyt
  ##
  #=<tt>Holding the evaluation context of the extraction process</tt>
  #
  #Every kind of data that is shared among patterns during the extraction process
  #is held in this class, so it can be looked up anytime.
  #
  #This class provides also some high-level basic functionality in navigation, like
  #crawling to new pages, attaching document to the root pattern once arrived at the
  #desired page etc.
  #
  #It can be viewed as a glue between Extractor and NavigationActions as well - these
  #two classes need to communicate frequently as well as share different information
  #and this is accomplished through EvaluationContext.
  class EvaluationContext
    attr_accessor :root_pattern, :next_page, :document_index, :block_count,
                  :extractor, :limit

    def initialize
      @root_pattern = nil
      @next_page = nil
      @block_count = 0
      @document_index = 0
      @extractor = nil
      @limit = nil
    end

    ##
    #Crawl to a new page. This function should not be called from the outside - it is automatically called
    #if the next_page is defined
    #
    #Returns nil (without touching the patterns) if no link to the next page could be found.
    def crawl_to_new_page
      temp_document = generate_next_page_link(@next_page)
      return nil if temp_document == nil
      clear_sources_and_sinks(@root_pattern)
      @extractor.fetch(temp_document)
      attach_current_document
    end

    ##
    #Attach document to the root pattern; This is happening automatically as the root pattern is defined or
    #crawling to a new page
    def attach_current_document
      doc = @extractor.get_hpricot_doc
      root_filter = @root_pattern.filters[0]
      root_filter.source << doc
      root_filter.sink << doc
      @root_pattern.last_result ||= []
      @root_pattern.last_result << doc
      @root_pattern.result.add_result(root_filter.source, root_filter.sink)
    end

    ##
    #Based on the given examples, calculate the XPaths for the tree patterns
    def setup_examples
      #No pattern is known here; this is a no-op, the root pattern is expected
      #to be set already
      get_root_pattern(nil)
      mark_leaf_parents(@root_pattern)
      generate_examples(@root_pattern)
    end

    ##
    #After crawling to the new page, the sources and sinks need to be cleaned
    #since they are no more valid
    #
    #Every filter of +pattern+ and of all its descendants gets a fresh, empty
    #source and sink array.
    def clear_sources_and_sinks(pattern)
      pattern.filters.each do |filter|
        filter.source = []
        filter.sink = []
      end
      pattern.children.each {|child| clear_sources_and_sinks child}
    end

    ##
    #Find the link to the next page in the first source document of the root
    #pattern, based on the text +example+.
    #
    #Returns the href of the node found, with '&amp;' replaced by '&', or nil if
    #there is no such node or the node has no href attribute.
    def generate_next_page_link(example)
      node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example, true)
      return nil if node == nil
      href = node.attributes['href']
      #A matching node without href can not be followed
      return nil if href == nil
      href.gsub('&amp;') {'&'}
    end

    ##
    #Set parent_of_leaf to true on every pattern (in the subtree of +pattern+)
    #which has at least one child without children.
    def mark_leaf_parents(pattern)
      pattern.children.each { |child|
        pattern.parent_of_leaf = true if child.children.size == 0
      }
      pattern.children.each { |child| mark_leaf_parents(child) }
    end

    ##
    #Generate the XPaths for the examples of the filters in the whole
    #subtree of +pattern+ - children first; filters are processed for
    #tree patterns only.
    def generate_examples(pattern)
      pattern.children.each {|child_pattern| generate_examples(child_pattern) }
      pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == Pattern::PATTERN_TYPE_TREE
    end

    ##
    #Look up the root pattern by following the parent links of +pattern+ and
    #remember it - unless a root pattern is set already, or +pattern+ is nil.
    #
    #(The former implementation re-checked the same pattern's parent in a while loop
    #around a recursive call, which could never terminate for a non-root pattern.)
    def get_root_pattern(pattern)
      return @root_pattern if @root_pattern != nil || pattern == nil
      pattern = pattern.parent while pattern.parent != nil
      @root_pattern = pattern
    end #end of function
  end #end of class EvaluationContext
end #end of module Scrubyt
@@ -0,0 +1,116 @@
1
+ require 'logger'
2
+ require 'open-uri'
3
+ require 'rubygems'
4
+ require 'mechanize'
5
+ require 'hpricot'
6
+
7
module Scrubyt
  ##
  #=<tt>Driving the whole extraction process</tt>
  #
  #Extractor is a performer class - it gets an extractor definition and carries
  #out the actions and evaluates the wrappers sequentially.
  #
  #Originally also the navigation actions were here, but since the class got too
  #big, they were factored out to an own class, NavigationAction.
  class Extractor
    ##
    #The definition of the extractor is passed through this method
    #
    #mode:: :production, or anything else for learning mode
    #extractor_definition:: the block holding the navigation steps and the patterns
    #
    #The block is evaluated in the context of this class, so the unknown
    #method calls in it end up in method_missing below. After that the examples
    #are set up, the extractor is evaluated and postprocessed.
    #
    #Returns the root pattern. If the block evaluates to nil, the process exits.
    def self.define(mode=nil, &extractor_definition)
      @@mode = mode
      mode_name = (mode == :production ? 'Production' : 'Learning')
      puts "[MODE] #{mode_name}"
      NavigationActions.new
      @@evaluation_context = EvaluationContext.new
      #Hack up an artificial root pattern (i.e. do not return the pattern which
      #is the root one in the user's definition, but rather the real (invisible)
      #root pattern
      evaluated_extractor = (class_eval(&extractor_definition))
      if evaluated_extractor == nil
        puts "No extractor defined, exiting..."
        exit
      end
      root_pattern = evaluated_extractor.parent
      #Recursively match data based on examples
      @@evaluation_context.setup_examples
      #Once all is set up, evaluate the extractor from the root pattern!
      evaluate_extractor(root_pattern)
      #Apply all postprocess steps
      PostProcessor.apply_post_processing(root_pattern)
      #Return the root pattern
      puts "Extraction finished succesfully!"
      root_pattern
    end

    ##
    #build the current wrapper
    #
    #Every unknown method called inside the extractor definition arrives here:
    #- navigation keywords are forwarded to NavigationActions
    #- a top level 'next_page' stores the next page example (and the optional
    #  :limit) in the evaluation context and returns the last pattern built
    #- anything else creates a pattern named after the method. A top level pattern
    #  is attached to a freshly created (invisible) root pattern, a nested one
    #  to the pattern of the enclosing block.
    #If a block is given, it is evaluated with the new pattern as the parent.
    def self.method_missing(method_name, *args, &block)
      if NavigationActions::KEYWORDS.include? method_name.to_s
        NavigationActions.send(method_name, *args)
        return
      end
      pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
      pattern.evaluation_context = @@evaluation_context
      if @parent == nil
        if method_name.to_s == 'next_page'
          @@evaluation_context.next_page = args[0]
          @@evaluation_context.limit =
            args[1][:limit] if args.size > 1
          return @@last_pattern
        else
          #Create a root pattern
          root_pattern = Scrubyt::Pattern.new('root', :type => :root)
          root_pattern.evaluation_context = @@evaluation_context
          @@evaluation_context.root_pattern = root_pattern
          @@evaluation_context.extractor = self
          #add the currently active document to the root pattern
          @@evaluation_context.attach_current_document
          @@evaluation_context.root_pattern.add_child_pattern(pattern)
          @@evaluation_context.block_count = 0
        end
      else
        @parent.add_child_pattern(pattern)
      end
      if block_given?
        @@evaluation_context.block_count = @@evaluation_context.block_count + 1
        #The stack keeps track of the enclosing patterns while the nested
        #blocks are evaluated
        @stack ||=[]
        @parent = pattern
        @stack.push @parent
        class_eval(&block)
        @stack.pop
        @parent = @stack.last
      end
      @@last_pattern = pattern
    end

    ##
    #Used in lord of the hacks vol 1. Check out export.rb if you are still interested
    #(You should not be :)
    #
    #Returns the number of pattern blocks counted in the evaluation context.
    #(The counter is maintained there; the class variable @@root_pattern,
    #which was read here before, is never assigned in this class.)
    def self.get_block_count
      @@evaluation_context.block_count
    end

    #The document currently held by NavigationActions
    def self.get_hpricot_doc
      NavigationActions.get_hpricot_doc
    end

    #The mode which was passed to define
    def self.get_mode
      @@mode
    end

  #NOTE: 'private' does not affect methods defined with 'def self.', so the
  #methods below remain callable from the outside
  private
    ##
    #Evaluate the extractor starting from +root_pattern+.
    #
    #If a next page is defined, the evaluation is repeated page after page until
    #the limit of pages is reached (if a limit is set) or crawl_to_new_page
    #returns nil.
    def self.evaluate_extractor(root_pattern)
      if @@evaluation_context.next_page
        current_page_count = 1
        loop do
          really_evaluate_extractor(root_pattern)
          break if (@@evaluation_context.limit == current_page_count || @@evaluation_context.crawl_to_new_page == nil)
          current_page_count += 1 if @@evaluation_context.limit != nil
        end
      else
        really_evaluate_extractor(root_pattern)
      end
    end

    #Evaluate +pattern+ and then, recursively, all of its children
    def self.really_evaluate_extractor(pattern)
      pattern.evaluate
      pattern.children.each { |child| really_evaluate_extractor child }
    end #end of method really_evaluate_extractor
  end #end of class Extractor
end #end of module Scrubyt