scrubyt 0.2.0 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,73 +0,0 @@
1
module Scrubyt
  ##
  #=<tt>Post processing results after the extraction</tt>
  #Some things can not be carried out during evaluation - for example
  #the ensure_presence_of_pattern constraint (since the evaluation is top
  #to bottom, at a given point we don't know yet whether the currently
  #evaluated pattern will have a child pattern or not) or removing unneeded
  #results caused by evaluating multiple filters.
  #
  #The sole purpose of this class is to execute these post-processing tasks.
  class PostProcessor
    ##
    #Remove unneeded results of a pattern (caused by evaluating multiple filters)
    #See for example the B&N scenario - the book titles are extracted two times
    #for every pattern (since both examples generate the same XPath for them)
    #but since always only one of the results has a price, the other is discarded
    #
    #Walks the whole pattern tree: the clean-up itself runs on every pattern
    #whose +parent_of_leaf+ is truthy, then the method recurses into each child.
    #
    #pattern:: object responding to +parent_of_leaf+, +children+ and +result+
    #
    #Returns <tt>pattern.children</tt> (the receiver of the final +each+).
    def self.remove_multiple_filter_duplicates(pattern)
      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
      pattern.children.each { |child| remove_multiple_filter_duplicates(child) }
    end

    ##
    #Issue an error report if the document did not extract anything.
    #Probably this is because the structure of the page changed or
    #because of some rather nasty bug - in any case, something wrong
    #is going on, and we need to inform the user about this!
    #
    #The warning goes to standard output. Nothing is printed as soon as one
    #direct child of +root_pattern+ has a non-empty <tt>result.childmap</tt>.
    #
    #Returns nil in both cases.
    def self.report_if_no_results(root_pattern)
      return if root_pattern.children.any? { |child| child.result.childmap.size > 0 }

      puts
      puts "!!!!!! WARNING: The extractor did not find any result instances"
      puts "Most probably this is wrong. Check your extractor and if you are"
      puts "sure it should work, report a bug!"
      puts
    end

    ##
    #Internal helper of remove_multiple_filter_duplicates; works on a single
    #pattern and does not recurse.
    #
    #NOTE: this method used to sit below a bare +private+, which has no effect
    #on <tt>def self.</tt> methods. It has therefore always been callable from
    #outside the class, and its visibility is deliberately left unchanged.
    #
    #Steps:
    #1. For every result +x+ of +pattern+, collect what each child's
    #   <tt>result.lookup(x)</tt> returns (nil answers are skipped). If at
    #   least two children answered, the answers are padded with nil to a
    #   common length and transposed into rows.
    #2. A result with more than one row is considered duplicated; its last
    #   row is the one scheduled for removal.
    #3. Every element of that row (including padding nils) is deleted from
    #   the matching entries of each child's <tt>result.childmap</tt>.
    #
    #The padding in step 1 appends to the very arrays returned by +lookup+.
    #
    #Returns <tt>pattern.children</tt>.
    def self.remove_multiple_filter_duplicates_intern(pattern)
      possible_duplicates = {}
      pattern.result.childmap.each do |mapping|
        mapping.each do |_source, results|
          results.each do |x|
            child_results = pattern.children.map { |child| child.result.lookup(x) }.compact
            next if child_results.size <= 1

            # Array#transpose needs rows of equal length, so pad the shorter ones.
            longest = child_results.map { |found| found.size }.max
            child_results.each do |found|
              (longest - found.size).times { found << nil }
            end
            possible_duplicates[x] = child_results.transpose
          end
        end
      end

      #Determine the 'real' duplicates
      real_duplicates = {}
      possible_duplicates.each do |key, rows|
        # A single row means nothing was extracted twice; no rows at all
        # (every child answered with an empty list) leaves nothing to remove.
        next if rows.size <= 1

        real_duplicates[key] = rows.last
      end

      #Finally, remove them!
      pattern.children.each do |child|
        child.result.childmap.each do |mapping|
          mapping.each do |key, values|
            # key? uses the same eql?-based lookup as the [] below, so the
            # guard and the fetch can never disagree.
            next unless real_duplicates.key?(key)

            real_duplicates[key].each { |duplicate| values.delete(duplicate) }
          end
        end
      end
    end
  end
end