scrubyt 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,73 +0,0 @@
module Scrubyt
  ##
  #=<tt>Post processing results after the extraction</tt>
  #Some things can not be carried out during evaluation - for example
  #the ensure_presence_of_pattern constraint (since the evaluation is top
  #to bottom, at a given point we don't know yet whether the currently
  #evaluated pattern will have a child pattern or not) or removing unneeded
  #results caused by evaluating multiple filters.
  #
  #The sole purpose of this class is to execute these post-processing tasks.
  class PostProcessor
    ##
    #Remove unneeded results of a pattern (caused by evaluating multiple filters).
    #See for example the B&N scenario - the book titles are extracted two times
    #for every pattern (since both examples generate the same XPath for them)
    #but since always only one of the results has a price, the other is discarded.
    #
    #Walks the pattern tree depth-first: every pattern whose +parent_of_leaf+
    #is truthy has its children cleaned, then the walk descends into each child.
    #
    #pattern:: the pattern to start from; must respond to +parent_of_leaf+,
    #          +children+ and +result+
    #
    #Returns <tt>pattern.children</tt>.
    def self.remove_multiple_filter_duplicates(pattern)
      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
      pattern.children.each { |child| remove_multiple_filter_duplicates(child) }
    end

    ##
    #Issue an error report if the document did not extract anything.
    #Probably this is because the structure of the page changed or
    #because of some rather nasty bug - in any case, something wrong
    #is going on, and we need to inform the user about this!
    #
    #Nothing is printed as soon as one direct child of +root_pattern+ has a
    #non-empty <tt>result.childmap</tt>; otherwise a warning goes to standard
    #output.
    #
    #Returns nil.
    def self.report_if_no_results(root_pattern)
      return if root_pattern.children.any? { |child| child.result.childmap.size > 0 }

      puts
      puts "!!!!!! WARNING: The extractor did not find any result instances"
      puts "Most probably this is wrong. Check your extractor and if you are"
      puts "sure it should work, report a bug!"
      puts
    end

    ##
    #Does the actual duplicate removal for the children of one pattern.
    #
    #Works in three steps: collect, per parent result instance, the results
    #each child holds for it; pick the row to discard; delete the members of
    #that row from every child's results.
    #
    #Returns <tt>pattern.children</tt>.
    def self.remove_multiple_filter_duplicates_intern(pattern)
      #Step 1: for every parent result instance, line up what each child found.
      possible_duplicates = {}
      pattern.result.childmap.each do |parent_map|
        parent_map.each do |_key, instances|
          instances.each do |instance|
            #lookup presumably returns the child's results for this instance,
            #or nil when the child has none - nil answers are dropped here.
            child_results = pattern.children.map { |child| child.result.lookup(instance) }.compact
            next if child_results.size <= 1

            #transpose needs equally long rows, so shorter lists are padded
            #with nil. NOTE(review): this appends to the arrays lookup handed
            #back; if those are the children's stored arrays, the nil fillers
            #are only removed again by step 3 - confirm against Result#lookup.
            longest = child_results.map { |found| found.size }.max
            child_results.each { |found| (found.size + 1).upto(longest) { found << nil } }
            possible_duplicates[instance] = child_results.transpose
          end
        end
      end

      #Step 2: determine the 'real' duplicates. An instance with at most one
      #row has nothing to discard; otherwise the last row is the one dropped.
      real_duplicates = {}
      possible_duplicates.each do |instance, rows|
        next if rows.size <= 1

        real_duplicates[instance] = rows.last
      end

      #Step 3: finally, remove them from every child's results.
      pattern.children.each do |child|
        child.result.childmap.each do |child_map|
          child_map.each do |key, values|
            next unless real_duplicates.key?(key)

            real_duplicates[key].each { |duplicate| values.delete(duplicate) }
          end
        end
      end
    end
    #A bare +private+ does not affect methods defined with <tt>def self.</tt>,
    #so the helper is hidden explicitly.
    private_class_method :remove_multiple_filter_duplicates_intern
  end
end