scrubyt 0.1.0 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,5 @@
1
1
  require 'rubygems'
2
2
  require 'hpricot'
3
- require 'open-uri'
4
3
 
5
4
  module Scrubyt
6
5
  ##
@@ -43,7 +42,8 @@ module Scrubyt
43
42
 
44
43
  attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
45
44
  :last_result, :result, :root_pattern, :example, :block_count,
46
- :next_page, :limit, :extractor, :extracted_docs, :source, :sink
45
+ :next_page, :limit, :extractor, :extracted_docs,
46
+ :examples, :parent_of_leaf
47
47
  attr_reader :type, :generalize_set, :next_page_url
48
48
 
49
49
  def initialize (name, *args)
@@ -52,12 +52,17 @@ module Scrubyt
52
52
  @root_pattern = nil #root pattern of the wrapper
53
53
  @children = [] #child patterns
54
54
  @filters = [] #filters of the wrapper
55
- @sink = [] #output of a pattern
56
- @source = [] #input of a pattern
57
55
  @result = Result.new #hierarchical results of the pattern
58
56
  @@instance_count = Hash.new(0)
57
+ @evaluated_examples = []
59
58
  @next_page = nil
60
- filters << Scrubyt::Filter.new(self) #create a filter
59
+ if @examples == nil
60
+ filters << Scrubyt::Filter.new(self) #create a default filter
61
+ else
62
+ @examples.each do |example|
63
+ filters << Scrubyt::Filter.new(self,example) #create a filter
64
+ end
65
+ end
61
66
  end
62
67
 
63
68
  #Parse the args passed as *args; There is only one compulsory parameter to pattern: it's name
@@ -66,10 +71,8 @@ module Scrubyt
66
71
  #If an example is specified, it *MUST* be the first parameter; the order of the other
67
72
  #parameters is irrelevant
68
73
  def parse_args(args)
69
- #If an example id defined, not only get it but also remove it so it
70
- #does not interfere with the other possible string parameters
71
- @example = args.delete_at(0) if args[0].instance_of? String
72
- @example = args.delete_at(0) if args[0].instance_of? Regexp
74
+ #Grab any examples that are defined!
75
+ look_for_examples(args)
73
76
  args.each do |arg|
74
77
  arg.each do |k,v|
75
78
  #Set only the setable fields
@@ -96,7 +99,7 @@ module Scrubyt
96
99
  #This flag indicates that the user set 'generalize' to some value;
97
100
  #This way we can ensure that the explicit setting will not be overridden
98
101
  @generalize_set ||= false
99
- end
102
+ end
100
103
 
101
104
  #Dispatcher function; The class was already too big so I have decided to factor
102
105
  #out some methods based on their functionality (like output, adding constraints)
@@ -162,7 +165,7 @@ module Scrubyt
162
165
  temp_document = generate_next_page_link(@next_page)
163
166
  return nil if temp_document == nil
164
167
  clear_sources_and_sinks(@root_pattern)
165
- @root_pattern.extractor.fetch(temp_document, nil)
168
+ @root_pattern.extractor.fetch(temp_document)
166
169
  attach_current_document
167
170
  end
168
171
 
@@ -171,17 +174,18 @@ module Scrubyt
171
174
  #crawling to a new page
172
175
  def attach_current_document
173
176
  doc = @root_pattern.extractor.get_hpricot_doc
174
- @source << doc
175
- @sink << doc
177
+ filters[0].source << doc
178
+ filters[0].sink << doc
176
179
  @last_result ||= []
177
180
  @last_result << doc
178
- @result.add_result(@source, @sink)
181
+ @result.add_result(filters[0].source, filters[0].sink)
179
182
  end
180
183
 
181
184
  ##
182
185
  #Based on the given examples, calculate the XPaths for the tree patterns
183
186
  def setup_examples
184
187
  get_root_pattern(self)
188
+ mark_leaf_parents(self)
185
189
  set_root_pattern_whole_wrapper(@root_pattern, @root_pattern)
186
190
  generate_examples(@root_pattern)
187
191
  end
@@ -192,10 +196,14 @@ module Scrubyt
192
196
  def evaluate
193
197
  #No need to evaluate if there is no parent pattern
194
198
  return if @parent == nil
195
- @source = @parent.sink
196
- @source.each do |source|
197
- @filters.each do |filter|
199
+ all_filter_results = []
200
+ @filters.each do |filter|
201
+ filter_index = @filters.index(filter)
202
+ filter_index = 0 if @parent.filters.size <= filter_index
203
+ filter.source = @parent.filters[filter_index].sink
204
+ filter.source.each do |source|
198
205
  r = filter.evaluate(source)
206
+ next if r == nil
199
207
  if filter.constraints.size > 0
200
208
  #in the beginning, keys of result_hash are made up of all the results of the filter
201
209
  #with value = true; Later on, only those results will have 'true' value which are
@@ -206,24 +214,51 @@ module Scrubyt
206
214
  filter.constraints.each { |constraint| result_hash[res] &&= constraint.check(res) }
207
215
  end
208
216
  result = result_hash.reject {|k,v| k if !v}
209
- sorted_result = r.reject {|e| !result.keys.include? e}
210
- add_result(source, sorted_result)
217
+ sorted_result = r.reject {|e| !result.keys.include? e}
218
+ add_result(filter, source, sorted_result)
211
219
  else
212
- add_result(source, r)
213
- end
214
- end
215
- end
216
-
220
+ if ( (xe = @result.lookup(source)) != nil )
221
+ #puts "ha"; p xe
222
+ end
223
+ add_result(filter, source, r)
224
+ end#end of constraint check
225
+ end#end of source iteration
226
+ end#end of filter iteration
217
227
  end
218
228
 
219
229
  def get_instance_count
220
230
  @@instance_count
221
231
  end
232
+
233
+ def get_constraints
234
+ filters[0].constraints
235
+ end
222
236
 
223
237
  private
224
- def add_result(source, results)
238
+ def look_for_examples(args)
239
+ if (args[0].is_a? String)
240
+ @examples = args.select {|e| e.is_a? String}
241
+ #Check if all the String parameters are really the first
242
+ #parameters
243
+ args[0..@examples.size-1].each do |example|
244
+ if !example.is_a? String
245
+ puts 'FATAL: Problem with example specification'
246
+ end
247
+ end
248
+ elsif (args[0].is_a? Regexp)
249
+ #Check if all the String parameters are really the first
250
+ #parameters
251
+ args[0..@examples.size].each do |example|
252
+ if !example.is_a? Regexp
253
+ puts 'FATAL: Problem with example specification'
254
+ end
255
+ end
256
+ end
257
+ end
258
+
259
+ def add_result(filter, source, results)
225
260
  results.each do |res|
226
- @sink << res
261
+ filter.sink << res
227
262
  @result.add_result(source, res)
228
263
  @@instance_count[@name] += 1
229
264
  end
@@ -238,6 +273,13 @@ private
238
273
  end
239
274
  end
240
275
 
276
+ def mark_leaf_parents(pattern)
277
+ pattern.children.each { |child|
278
+ pattern.parent_of_leaf = true if child.children.size == 0
279
+ }
280
+ pattern.children.each { |child| mark_leaf_parents(child) }
281
+ end
282
+
241
283
  def set_root_pattern_whole_wrapper(pattern, root_pattern)
242
284
  pattern.children.each {|child| set_root_pattern_whole_wrapper(child, root_pattern)}
243
285
  pattern.root_pattern = root_pattern
@@ -249,15 +291,17 @@ private
249
291
  end
250
292
 
251
293
  def clear_sources_and_sinks(pattern)
252
- pattern.source = []
253
- pattern.sink = []
294
+ pattern.filters.each do |filter|
295
+ filter.source = []
296
+ filter.sink = []
297
+ end
254
298
  pattern.children.each {|child| clear_sources_and_sinks child}
255
299
  end
256
300
 
257
301
  def generate_next_page_link(example)
258
- node = XPathUtils.find_node_from_text(@root_pattern.source[0], example)
302
+ node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example)
259
303
  return nil if node == nil
260
- node.attributes['href']
304
+ node.attributes['href'].gsub('&amp;') {'&'}
261
305
  end # end of method generate_next_page_link
262
306
  end #end of class Pattern
263
307
  end #end of module Scrubyt
@@ -0,0 +1,58 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Post processing results after the extraction</tt>
4
+ #Some things can not be carried out during evaluation - for example
5
+ #the ensure_presence_of_pattern constraint (since the evaluation is top
6
+ #to bottom, at a given point we don't know yet whether the currently
7
+ #evaluated pattern will have a child pattern or not) or removing unneeded
8
+ #results caused by evaluating multiple filters.
9
+ #
10
+ #The sole purpose of this class is to execute these post-processing tasks.
11
+ class PostProcessor
12
+ ##
13
+ #Remove unneeded results of a pattern (caused by evaluating multiple filters)
14
+ #See for example the B&N scenario - the book titles are extracted two times
15
+ #for every pattern (since both examples generate the same XPath for them)
16
+ #but since always only one of the results has a price, the other is discarded
17
+ def self.remove_multiple_filter_duplicates(pattern)
18
+ remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
19
+ pattern.children.each {|child| remove_multiple_filter_duplicates(child)}
20
+ end
21
+
22
+ private
23
+ def self.remove_multiple_filter_duplicates_intern(pattern)
24
+ possible_duplicates = {}
25
+ longest_result = 0
26
+ pattern.result.childmap.each { |r|
27
+ r.each do |k,v|
28
+ v.each do |x|
29
+ all_child_results = []
30
+ pattern.children.each { |child|
31
+ temp_res = child.result.lookup(x)
32
+ all_child_results << temp_res if temp_res != nil
33
+ }
34
+ next if all_child_results.size <= 1
35
+ longest_result = all_child_results.map {|e| e.size}.max
36
+ all_child_results.each { |r| (r.size+1).upto(longest_result) { r << nil } }
37
+ possible_duplicates[x] = all_child_results.transpose
38
+ end
39
+ end
40
+ }
41
+ #Determine the 'real' duplicates
42
+ real_duplicates = {}
43
+ possible_duplicates.each { |k,v|
44
+ next if v.size == 1
45
+ v.each { |r| real_duplicates[k] = r }
46
+ }
47
+
48
+ #Finally, remove them!
49
+ pattern.children.each { |child|
50
+ child.result.childmap.each { |r|
51
+ r.each { |k,v|
52
+ real_duplicates[k].each {|e| v.delete e} if real_duplicates.keys.include? k
53
+ }
54
+ }
55
+ }
56
+ end
57
+ end
58
+ end
@@ -11,7 +11,7 @@ module Scrubyt
11
11
  def add_result(source, result)
12
12
  @childmap.each do |hash|
13
13
  if hash.keys[0] == source
14
- hash[source] << result
14
+ hash[source] << result if !hash[source].include? result
15
15
  return
16
16
  end
17
17
  end
@@ -35,7 +35,7 @@ end#end of module Scrubyt
35
35
 
36
36
  #table
37
37
  # source: doc1
38
- # childmap [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, doc2 => [table[1]s2, table[2]s2, table[3]s2] ]
38
+ # childmap [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
39
39
 
40
40
  #row
41
41
  # source: table1s1, table2s1, table3s1
@@ -15,9 +15,15 @@ module Scrubyt
15
15
  pattern.last_result = lr
16
16
  to_xml_recursive(pattern, root)
17
17
  end
18
+ remove_empty_leaves(doc)
18
19
  doc
19
20
  end
20
21
 
22
+ def self.remove_empty_leaves(node)
23
+ node.remove if node.elements.empty? && node.text == nil
24
+ node.elements.each {|child| remove_empty_leaves child }
25
+ end
26
+
21
27
  ##
22
28
  #Output the text of the pattern; If this pattern is a tree, collect the text from its
23
29
  #result instance node; otherwise rely on the last_result
@@ -7,6 +7,13 @@ module Scrubyt
7
7
  class XPathUtils
8
8
  #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
9
9
  NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
10
+ ENTITIES = {
11
+ 'quot' => '"',
12
+ 'apos' => "'",
13
+ 'amp' => '&',
14
+ 'lt' => '<',
15
+ 'gt' => '>',
16
+ 'nbsp' => ' '}
10
17
 
11
18
  #From the example text defined by the user, find the lowest possible node with the text 'text'.
12
19
  #The text can be also a mixed content text, e.g.
@@ -17,14 +24,23 @@ module Scrubyt
17
24
  def self.find_node_from_text(doc, text)
18
25
  @node = nil
19
26
  @found = false
20
- self.traverse_for_text(doc,text)
21
- self.lowest_possible_node_with_text(@node, text)
27
+ self.traverse_for_full_text(doc,text)
28
+ self.lowest_possible_node_with_text(@node, text) if @node != nil
22
29
  #$Logger.warn("Node for example #{text} Not found!") if (@found == false)
23
- puts "Node for example #{text} Not found!" if (@found == false)
30
+ if (@found == false)
31
+ #Fallback to per node text lookup
32
+ self.traverse_for_node_text(doc,text)
33
+ if (@found == false)
34
+ puts "FATAL: Node for example #{text} Not found!"
35
+ puts "Please make sure your specified the example properly"
36
+ end
37
+ end
38
+ p @node
24
39
  @node
25
40
  end
26
41
 
27
- #Full text of the node; this is equivalent to Hpricot's inner_text. Will be
42
+ #Full text of the node; this is equivalent to Hpricot's inner_text
43
+ #(? be sure to check). Will be
28
44
  #replaced if Hpricot 0.5 will be released
29
45
  def self.full_text(node)
30
46
  result = ""
@@ -119,7 +135,7 @@ module Scrubyt
119
135
  #_index_ - there might be more images with the same src on the page -
120
136
  #most typically the user will need the 0th - but if this is not the
121
137
  #case, there is the possibility to override this
122
- def self.find_image(doc, example, index=0)
138
+ def self.find_image(doc, example, index=1)
123
139
  (doc/"img[@src='#{example}']")[index]
124
140
  end
125
141
 
@@ -150,7 +166,7 @@ private
150
166
  #Note that in classic XPath, the indices start with 1 (rather
151
167
  #than 0).
152
168
  def self.find_index(node)
153
- c = -1
169
+ c = 0
154
170
  node.parent.children.each do |child|
155
171
  if child.class == Hpricot::Elem
156
172
  c += 1 if (child.name == node.name)
@@ -170,27 +186,48 @@ private
170
186
  path
171
187
  end
172
188
 
173
- def self.traverse_for_text(node, text)
189
+ def self.traverse_for_node_text(node, text)
174
190
  return if @found
175
191
  if (node.instance_of? Hpricot::Elem)
176
- @node = node
177
- ft = full_text(node)
178
- @found = true if (ft.gsub('&nbsp;'){' '} == text)
192
+ node.traverse_text do |t|
193
+ if (t.to_s == text)
194
+ @found = true
195
+ @node = t.parent
196
+ end
197
+ end
198
+ end
199
+ node.children.each do |child|
200
+ if child.instance_of? Hpricot::Elem
201
+ traverse_for_node_text(child, text) unless NON_CONTENT_TAGS.include? child.name
202
+ end
203
+ end
204
+ end
205
+
206
+ def self.traverse_for_full_text(node, text)
207
+ return if @found
208
+ if (node.instance_of? Hpricot::Elem)
209
+ ft = unescape_entities(full_text(node)).strip
210
+ if (ft == text)
211
+ @found = true
212
+ @node = node
213
+ end
179
214
  end
180
215
  node.children.each do |child|
181
- traverse_nodes child if child.instance_of? Hpricot::Doc
182
216
  if child.instance_of? Hpricot::Elem
183
- traverse_for_text(child, text) unless NON_CONTENT_TAGS.include? child.name
217
+ traverse_for_full_text(child, text) unless NON_CONTENT_TAGS.include? child.name
184
218
  end
185
219
  end
186
220
  end
221
+
222
+ def self.unescape_entities(text)
223
+ ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
224
+ text
225
+ end
187
226
 
188
227
  def self.lowest_possible_node_with_text(node, text)
189
228
  return if node.instance_of? Hpricot::Text
190
229
  @node = node if full_text(node) == text
191
- node.children.each do |child|
192
- lowest_possible_node_with_text(child, text)
193
- end
230
+ node.children.each { |child| lowest_possible_node_with_text(child, text) }
194
231
  end #End of method lowest_possible_node_with_text
195
232
  end #End of class XPathUtils
196
233
  end #End of module Scrubyt
@@ -1,6 +1,3 @@
1
- #require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint')
2
- #require File.join(File.dirname(__FILE__), '../..', 'lib', 'extractor')
3
- #require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint_adder')
4
1
  require 'scrubyt'
5
2
  require 'test/unit'
6
3
 
@@ -1,5 +1,3 @@
1
- #require File.join(File.dirname(__FILE__), '../../lib', 'extractor.rb')
2
- #require File.join(File.dirname(__FILE__), '../../lib', 'pattern.rb')
3
1
  require 'scrubyt'
4
2
  require 'test/unit'
5
3
 
@@ -7,7 +5,7 @@ class ExtractorTest < Test::Unit::TestCase
7
5
  def test_create_one_pattern
8
6
  pattern = Scrubyt::Extractor.define do
9
7
  fetch File.join(File.dirname(__FILE__), "input/test.html")
10
- pattern "x"
8
+ pattern "1"
11
9
  end
12
10
  assert_instance_of(Scrubyt::Pattern, pattern)
13
11
 
@@ -23,7 +21,7 @@ class ExtractorTest < Test::Unit::TestCase
23
21
  def test_create_child_pattern
24
22
  pattern = Scrubyt::Extractor.define do
25
23
  fetch File.join(File.dirname(__FILE__), "input/test.html")
26
- parent { child "x" }
24
+ parent { child "2" }
27
25
  end
28
26
 
29
27
  assert_equal(pattern.name, "root")
@@ -39,10 +37,10 @@ class ExtractorTest < Test::Unit::TestCase
39
37
  pattern = Scrubyt::Extractor.define do
40
38
  fetch File.join(File.dirname(__FILE__), "input/test.html")
41
39
  parent do
42
- child1 'x'
43
- child2 'y'
44
- child3 'z'
45
- child4 'a'
40
+ child1 '1'
41
+ child2 '2'
42
+ child3 '3'
43
+ child4 '4'
46
44
  end
47
45
  end
48
46
 
@@ -61,7 +59,7 @@ class ExtractorTest < Test::Unit::TestCase
61
59
  def test_create_hierarchy
62
60
  tree = Scrubyt::Extractor.define do
63
61
  fetch File.join(File.dirname(__FILE__), "input/test.html")
64
- a { b { c { d { e "x" } } } }
62
+ a { b { c { d { e "1" } } } }
65
63
  end
66
64
 
67
65
  assert_equal(tree.name,"root")
@@ -76,8 +74,8 @@ class ExtractorTest < Test::Unit::TestCase
76
74
  tree = Scrubyt::Extractor.define do
77
75
  fetch File.join(File.dirname(__FILE__), "input/test.html")
78
76
  a do
79
- b 'x'
80
- c 'y'
77
+ b '1'
78
+ c '2'
81
79
  end
82
80
  end
83
81
 
@@ -86,8 +84,8 @@ class ExtractorTest < Test::Unit::TestCase
86
84
  assert_not_nil(tree.children[0].filters[0])
87
85
  assert_nil(tree.children[0].example)
88
86
  assert_not_nil(tree.children[0].children[0].filters[0])
89
- assert_equal(tree.children[0].children[0].example,'x')
87
+ assert_equal(tree.children[0].children[0].filters[0].example,'1')
90
88
  assert_not_nil(tree.children[0].children[1].filters[0])
91
- assert_equal(tree.children[0].children[1].example,'y')
89
+ assert_equal(tree.children[0].children[1].filters[0].example,'2')
92
90
  end
93
91
  end