scrubyt 0.1.0 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,5 @@
1
1
  require 'rubygems'
2
2
  require 'hpricot'
3
- require 'open-uri'
4
3
 
5
4
  module Scrubyt
6
5
  ##
@@ -43,7 +42,8 @@ module Scrubyt
43
42
 
44
43
  attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
45
44
  :last_result, :result, :root_pattern, :example, :block_count,
46
- :next_page, :limit, :extractor, :extracted_docs, :source, :sink
45
+ :next_page, :limit, :extractor, :extracted_docs,
46
+ :examples, :parent_of_leaf
47
47
  attr_reader :type, :generalize_set, :next_page_url
48
48
 
49
49
  def initialize (name, *args)
@@ -52,12 +52,17 @@ module Scrubyt
52
52
  @root_pattern = nil #root pattern of the wrapper
53
53
  @children = [] #child patterns
54
54
  @filters = [] #filters of the wrapper
55
- @sink = [] #output of a pattern
56
- @source = [] #input of a pattern
57
55
  @result = Result.new #hierarchical results of the pattern
58
56
  @@instance_count = Hash.new(0)
57
+ @evaluated_examples = []
59
58
  @next_page = nil
60
- filters << Scrubyt::Filter.new(self) #create a filter
59
+ if @examples == nil
60
+ filters << Scrubyt::Filter.new(self) #create a default filter
61
+ else
62
+ @examples.each do |example|
63
+ filters << Scrubyt::Filter.new(self,example) #create a filter
64
+ end
65
+ end
61
66
  end
62
67
 
63
68
  #Parse the args passed as *args; There is only one compulsory parameter to pattern: its name
@@ -66,10 +71,8 @@ module Scrubyt
66
71
  #If an example is specified, it *MUST* be the first parameter; the order of the other
67
72
  #parameters is irrelevant
68
73
  def parse_args(args)
69
- #If an example id defined, not only get it but also remove it so it
70
- #does not interfere with the other possible string parameters
71
- @example = args.delete_at(0) if args[0].instance_of? String
72
- @example = args.delete_at(0) if args[0].instance_of? Regexp
74
+ #Grab any examples that are defined!
75
+ look_for_examples(args)
73
76
  args.each do |arg|
74
77
  arg.each do |k,v|
75
78
  #Set only the setable fields
@@ -96,7 +99,7 @@ module Scrubyt
96
99
  #This flag indicates that the user set 'generalize' to some value;
97
100
  #This way we can ensure that the explicit setting will not be overridden
98
101
  @generalize_set ||= false
99
- end
102
+ end
100
103
 
101
104
  #Dispatcher function; The class was already too big so I have decided to factor
102
105
  #out some methods based on their functionality (like output, adding constraints)
@@ -162,7 +165,7 @@ module Scrubyt
162
165
  temp_document = generate_next_page_link(@next_page)
163
166
  return nil if temp_document == nil
164
167
  clear_sources_and_sinks(@root_pattern)
165
- @root_pattern.extractor.fetch(temp_document, nil)
168
+ @root_pattern.extractor.fetch(temp_document)
166
169
  attach_current_document
167
170
  end
168
171
 
@@ -171,17 +174,18 @@ module Scrubyt
171
174
  #crawling to a new page
172
175
  def attach_current_document
173
176
  doc = @root_pattern.extractor.get_hpricot_doc
174
- @source << doc
175
- @sink << doc
177
+ filters[0].source << doc
178
+ filters[0].sink << doc
176
179
  @last_result ||= []
177
180
  @last_result << doc
178
- @result.add_result(@source, @sink)
181
+ @result.add_result(filters[0].source, filters[0].sink)
179
182
  end
180
183
 
181
184
  ##
182
185
  #Based on the given examples, calculate the XPaths for the tree patterns
183
186
  def setup_examples
184
187
  get_root_pattern(self)
188
+ mark_leaf_parents(self)
185
189
  set_root_pattern_whole_wrapper(@root_pattern, @root_pattern)
186
190
  generate_examples(@root_pattern)
187
191
  end
@@ -192,10 +196,14 @@ module Scrubyt
192
196
  def evaluate
193
197
  #No need to evaluate if there is no parent pattern
194
198
  return if @parent == nil
195
- @source = @parent.sink
196
- @source.each do |source|
197
- @filters.each do |filter|
199
+ all_filter_results = []
200
+ @filters.each do |filter|
201
+ filter_index = @filters.index(filter)
202
+ filter_index = 0 if @parent.filters.size <= filter_index
203
+ filter.source = @parent.filters[filter_index].sink
204
+ filter.source.each do |source|
198
205
  r = filter.evaluate(source)
206
+ next if r == nil
199
207
  if filter.constraints.size > 0
200
208
  #in the beginning, keys of result_hash are made up of all the results of the filter
201
209
  #with value = true; Later on, only those results will have 'true' value which are
@@ -206,24 +214,51 @@ module Scrubyt
206
214
  filter.constraints.each { |constraint| result_hash[res] &&= constraint.check(res) }
207
215
  end
208
216
  result = result_hash.reject {|k,v| k if !v}
209
- sorted_result = r.reject {|e| !result.keys.include? e}
210
- add_result(source, sorted_result)
217
+ sorted_result = r.reject {|e| !result.keys.include? e}
218
+ add_result(filter, source, sorted_result)
211
219
  else
212
- add_result(source, r)
213
- end
214
- end
215
- end
216
-
220
+ if ( (xe = @result.lookup(source)) != nil )
221
+ #puts "ha"; p xe
222
+ end
223
+ add_result(filter, source, r)
224
+ end#end of constraint check
225
+ end#end of source iteration
226
+ end#end of filter iteration
217
227
  end
218
228
 
219
229
  def get_instance_count
220
230
  @@instance_count
221
231
  end
232
+
233
+ def get_constraints
234
+ filters[0].constraints
235
+ end
222
236
 
223
237
  private
224
- def add_result(source, results)
238
+ def look_for_examples(args)
239
+ if (args[0].is_a? String)
240
+ @examples = args.select {|e| e.is_a? String}
241
+ #Check if all the String parameters are really the first
242
+ #parameters
243
+ args[0..@examples.size-1].each do |example|
244
+ if !example.is_a? String
245
+ puts 'FATAL: Problem with example specification'
246
+ end
247
+ end
248
+ elsif (args[0].is_a? Regexp)
249
+ #Check if all the Regexp parameters are really the first
250
+ #parameters
251
+ args[0..@examples.size].each do |example|
252
+ if !example.is_a? Regexp
253
+ puts 'FATAL: Problem with example specification'
254
+ end
255
+ end
256
+ end
257
+ end
258
+
259
+ def add_result(filter, source, results)
225
260
  results.each do |res|
226
- @sink << res
261
+ filter.sink << res
227
262
  @result.add_result(source, res)
228
263
  @@instance_count[@name] += 1
229
264
  end
@@ -238,6 +273,13 @@ private
238
273
  end
239
274
  end
240
275
 
276
+ def mark_leaf_parents(pattern)
277
+ pattern.children.each { |child|
278
+ pattern.parent_of_leaf = true if child.children.size == 0
279
+ }
280
+ pattern.children.each { |child| mark_leaf_parents(child) }
281
+ end
282
+
241
283
  def set_root_pattern_whole_wrapper(pattern, root_pattern)
242
284
  pattern.children.each {|child| set_root_pattern_whole_wrapper(child, root_pattern)}
243
285
  pattern.root_pattern = root_pattern
@@ -249,15 +291,17 @@ private
249
291
  end
250
292
 
251
293
  def clear_sources_and_sinks(pattern)
252
- pattern.source = []
253
- pattern.sink = []
294
+ pattern.filters.each do |filter|
295
+ filter.source = []
296
+ filter.sink = []
297
+ end
254
298
  pattern.children.each {|child| clear_sources_and_sinks child}
255
299
  end
256
300
 
257
301
  def generate_next_page_link(example)
258
- node = XPathUtils.find_node_from_text(@root_pattern.source[0], example)
302
+ node = XPathUtils.find_node_from_text(@root_pattern.filters[0].source[0], example)
259
303
  return nil if node == nil
260
- node.attributes['href']
304
+ node.attributes['href'].gsub('&amp;') {'&'}
261
305
  end # end of method generate_next_page_link
262
306
  end #end of class Pattern
263
307
  end #end of module Scrubyt
@@ -0,0 +1,58 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Post processing results after the extraction</tt>
4
+ #Some things can not be carried out during evaluation - for example
5
+ #the ensure_presence_of_pattern constraint (since the evaluation is top
6
+ #to bottom, at a given point we don't know yet whether the currently
7
+ #evaluated pattern will have a child pattern or not) or removing unneeded
8
+ #results caused by evaluating multiple filters.
9
+ #
10
+ #The sole purpose of this class is to execute these post-processing tasks.
11
+ class PostProcessor
12
+ ##
13
+ #Remove unneeded results of a pattern (caused by evaluating multiple filters)
14
+ #See for example the B&N scenario - the book titles are extracted two times
15
+ #for every pattern (since both examples generate the same XPath for them)
16
+ #but since always only one of the results has a price, the other is discarded
17
+ def self.remove_multiple_filter_duplicates(pattern)
18
+ remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
19
+ pattern.children.each {|child| remove_multiple_filter_duplicates(child)}
20
+ end
21
+
22
+ private
23
+ def self.remove_multiple_filter_duplicates_intern(pattern)
24
+ possible_duplicates = {}
25
+ longest_result = 0
26
+ pattern.result.childmap.each { |r|
27
+ r.each do |k,v|
28
+ v.each do |x|
29
+ all_child_results = []
30
+ pattern.children.each { |child|
31
+ temp_res = child.result.lookup(x)
32
+ all_child_results << temp_res if temp_res != nil
33
+ }
34
+ next if all_child_results.size <= 1
35
+ longest_result = all_child_results.map {|e| e.size}.max
36
+ all_child_results.each { |r| (r.size+1).upto(longest_result) { r << nil } }
37
+ possible_duplicates[x] = all_child_results.transpose
38
+ end
39
+ end
40
+ }
41
+ #Determine the 'real' duplicates
42
+ real_duplicates = {}
43
+ possible_duplicates.each { |k,v|
44
+ next if v.size == 1
45
+ v.each { |r| real_duplicates[k] = r }
46
+ }
47
+
48
+ #Finally, remove them!
49
+ pattern.children.each { |child|
50
+ child.result.childmap.each { |r|
51
+ r.each { |k,v|
52
+ real_duplicates[k].each {|e| v.delete e} if real_duplicates.keys.include? k
53
+ }
54
+ }
55
+ }
56
+ end
57
+ end
58
+ end
@@ -11,7 +11,7 @@ module Scrubyt
11
11
  def add_result(source, result)
12
12
  @childmap.each do |hash|
13
13
  if hash.keys[0] == source
14
- hash[source] << result
14
+ hash[source] << result if !hash[source].include? result
15
15
  return
16
16
  end
17
17
  end
@@ -35,7 +35,7 @@ end#end of module Scrubyt
35
35
 
36
36
  #table
37
37
  # source: doc1
38
- # childmap [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, doc2 => [table[1]s2, table[2]s2, table[3]s2] ]
38
+ # childmap [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
39
39
 
40
40
  #row
41
41
  # source: table1s1, table2s1, table3s1
@@ -15,9 +15,15 @@ module Scrubyt
15
15
  pattern.last_result = lr
16
16
  to_xml_recursive(pattern, root)
17
17
  end
18
+ remove_empty_leaves(doc)
18
19
  doc
19
20
  end
20
21
 
22
+ def self.remove_empty_leaves(node)
23
+ node.remove if node.elements.empty? && node.text == nil
24
+ node.elements.each {|child| remove_empty_leaves child }
25
+ end
26
+
21
27
  ##
22
28
  #Output the text of the pattern; If this pattern is a tree, collect the text from its
23
29
  #result instance node; otherwise rely on the last_result
@@ -7,6 +7,13 @@ module Scrubyt
7
7
  class XPathUtils
8
8
  #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
9
9
  NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
10
+ ENTITIES = {
11
+ 'quot' => '"',
12
+ 'apos' => "'",
13
+ 'amp' => '&',
14
+ 'lt' => '<',
15
+ 'gt' => '>',
16
+ 'nbsp' => ' '}
10
17
 
11
18
  #From the example text defined by the user, find the lowest possible node with the text 'text'.
12
19
  #The text can be also a mixed content text, e.g.
@@ -17,14 +24,23 @@ module Scrubyt
17
24
  def self.find_node_from_text(doc, text)
18
25
  @node = nil
19
26
  @found = false
20
- self.traverse_for_text(doc,text)
21
- self.lowest_possible_node_with_text(@node, text)
27
+ self.traverse_for_full_text(doc,text)
28
+ self.lowest_possible_node_with_text(@node, text) if @node != nil
22
29
  #$Logger.warn("Node for example #{text} Not found!") if (@found == false)
23
- puts "Node for example #{text} Not found!" if (@found == false)
30
+ if (@found == false)
31
+ #Fallback to per node text lookup
32
+ self.traverse_for_node_text(doc,text)
33
+ if (@found == false)
34
+ puts "FATAL: Node for example #{text} Not found!"
35
+ puts "Please make sure your specified the example properly"
36
+ end
37
+ end
38
+ p @node
24
39
  @node
25
40
  end
26
41
 
27
- #Full text of the node; this is equivalent to Hpricot's inner_text. Will be
42
+ #Full text of the node; this is equivalent to Hpricot's inner_text
43
+ #(? be sure to check). Will be
28
44
  #replaced once Hpricot 0.5 is released
29
45
  def self.full_text(node)
30
46
  result = ""
@@ -119,7 +135,7 @@ module Scrubyt
119
135
  #_index_ - there might be more images with the same src on the page -
120
136
  #most typically the user will need the 0th - but if this is not the
121
137
  #case, there is the possibility to override this
122
- def self.find_image(doc, example, index=0)
138
+ def self.find_image(doc, example, index=1)
123
139
  (doc/"img[@src='#{example}']")[index]
124
140
  end
125
141
 
@@ -150,7 +166,7 @@ private
150
166
  #Note that in classic XPath, the indices start with 1 (rather
151
167
  #than 0).
152
168
  def self.find_index(node)
153
- c = -1
169
+ c = 0
154
170
  node.parent.children.each do |child|
155
171
  if child.class == Hpricot::Elem
156
172
  c += 1 if (child.name == node.name)
@@ -170,27 +186,48 @@ private
170
186
  path
171
187
  end
172
188
 
173
- def self.traverse_for_text(node, text)
189
+ def self.traverse_for_node_text(node, text)
174
190
  return if @found
175
191
  if (node.instance_of? Hpricot::Elem)
176
- @node = node
177
- ft = full_text(node)
178
- @found = true if (ft.gsub('&nbsp;'){' '} == text)
192
+ node.traverse_text do |t|
193
+ if (t.to_s == text)
194
+ @found = true
195
+ @node = t.parent
196
+ end
197
+ end
198
+ end
199
+ node.children.each do |child|
200
+ if child.instance_of? Hpricot::Elem
201
+ traverse_for_node_text(child, text) unless NON_CONTENT_TAGS.include? child.name
202
+ end
203
+ end
204
+ end
205
+
206
+ def self.traverse_for_full_text(node, text)
207
+ return if @found
208
+ if (node.instance_of? Hpricot::Elem)
209
+ ft = unescape_entities(full_text(node)).strip
210
+ if (ft == text)
211
+ @found = true
212
+ @node = node
213
+ end
179
214
  end
180
215
  node.children.each do |child|
181
- traverse_nodes child if child.instance_of? Hpricot::Doc
182
216
  if child.instance_of? Hpricot::Elem
183
- traverse_for_text(child, text) unless NON_CONTENT_TAGS.include? child.name
217
+ traverse_for_full_text(child, text) unless NON_CONTENT_TAGS.include? child.name
184
218
  end
185
219
  end
186
220
  end
221
+
222
+ def self.unescape_entities(text)
223
+ ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
224
+ text
225
+ end
187
226
 
188
227
  def self.lowest_possible_node_with_text(node, text)
189
228
  return if node.instance_of? Hpricot::Text
190
229
  @node = node if full_text(node) == text
191
- node.children.each do |child|
192
- lowest_possible_node_with_text(child, text)
193
- end
230
+ node.children.each { |child| lowest_possible_node_with_text(child, text) }
194
231
  end #End of method lowest_possible_node_with_text
195
232
  end #End of class XPathUtils
196
233
  end #End of module Scrubyt
@@ -1,6 +1,3 @@
1
- #require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint')
2
- #require File.join(File.dirname(__FILE__), '../..', 'lib', 'extractor')
3
- #require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint_adder')
4
1
  require 'scrubyt'
5
2
  require 'test/unit'
6
3
 
@@ -1,5 +1,3 @@
1
- #require File.join(File.dirname(__FILE__), '../../lib', 'extractor.rb')
2
- #require File.join(File.dirname(__FILE__), '../../lib', 'pattern.rb')
3
1
  require 'scrubyt'
4
2
  require 'test/unit'
5
3
 
@@ -7,7 +5,7 @@ class ExtractorTest < Test::Unit::TestCase
7
5
  def test_create_one_pattern
8
6
  pattern = Scrubyt::Extractor.define do
9
7
  fetch File.join(File.dirname(__FILE__), "input/test.html")
10
- pattern "x"
8
+ pattern "1"
11
9
  end
12
10
  assert_instance_of(Scrubyt::Pattern, pattern)
13
11
 
@@ -23,7 +21,7 @@ class ExtractorTest < Test::Unit::TestCase
23
21
  def test_create_child_pattern
24
22
  pattern = Scrubyt::Extractor.define do
25
23
  fetch File.join(File.dirname(__FILE__), "input/test.html")
26
- parent { child "x" }
24
+ parent { child "2" }
27
25
  end
28
26
 
29
27
  assert_equal(pattern.name, "root")
@@ -39,10 +37,10 @@ class ExtractorTest < Test::Unit::TestCase
39
37
  pattern = Scrubyt::Extractor.define do
40
38
  fetch File.join(File.dirname(__FILE__), "input/test.html")
41
39
  parent do
42
- child1 'x'
43
- child2 'y'
44
- child3 'z'
45
- child4 'a'
40
+ child1 '1'
41
+ child2 '2'
42
+ child3 '3'
43
+ child4 '4'
46
44
  end
47
45
  end
48
46
 
@@ -61,7 +59,7 @@ class ExtractorTest < Test::Unit::TestCase
61
59
  def test_create_hierarchy
62
60
  tree = Scrubyt::Extractor.define do
63
61
  fetch File.join(File.dirname(__FILE__), "input/test.html")
64
- a { b { c { d { e "x" } } } }
62
+ a { b { c { d { e "1" } } } }
65
63
  end
66
64
 
67
65
  assert_equal(tree.name,"root")
@@ -76,8 +74,8 @@ class ExtractorTest < Test::Unit::TestCase
76
74
  tree = Scrubyt::Extractor.define do
77
75
  fetch File.join(File.dirname(__FILE__), "input/test.html")
78
76
  a do
79
- b 'x'
80
- c 'y'
77
+ b '1'
78
+ c '2'
81
79
  end
82
80
  end
83
81
 
@@ -86,8 +84,8 @@ class ExtractorTest < Test::Unit::TestCase
86
84
  assert_not_nil(tree.children[0].filters[0])
87
85
  assert_nil(tree.children[0].example)
88
86
  assert_not_nil(tree.children[0].children[0].filters[0])
89
- assert_equal(tree.children[0].children[0].example,'x')
87
+ assert_equal(tree.children[0].children[0].filters[0].example,'1')
90
88
  assert_not_nil(tree.children[0].children[1].filters[0])
91
- assert_equal(tree.children[0].children[1].example,'y')
89
+ assert_equal(tree.children[0].children[1].filters[0].example,'2')
92
90
  end
93
91
  end