scrubyt 0.2.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +132 -1
- data/Rakefile +4 -2
- data/lib/scrubyt.rb +15 -10
- data/lib/scrubyt/core/navigation/fetch_action.rb +152 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +106 -0
- data/lib/scrubyt/{constraint.rb → core/scraping/constraint.rb} +0 -0
- data/lib/scrubyt/{constraint_adder.rb → core/scraping/constraint_adder.rb} +0 -0
- data/lib/scrubyt/{filter.rb → core/scraping/filter.rb} +22 -4
- data/lib/scrubyt/{pattern.rb → core/scraping/pattern.rb} +21 -98
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +13 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +88 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +97 -0
- data/lib/scrubyt/core/shared/extractor.rb +116 -0
- data/lib/scrubyt/{export.rb → output/export.rb} +14 -8
- data/lib/scrubyt/output/post_processor.rb +137 -0
- data/lib/scrubyt/{result.rb → output/result.rb} +0 -0
- data/lib/scrubyt/{result_dumper.rb → output/result_dumper.rb} +0 -7
- data/lib/scrubyt/{xpathutils.rb → utils/xpathutils.rb} +5 -2
- data/test/unittests/pattern_test.rb +27 -0
- metadata +40 -17
- data/lib/scrubyt/extractor.rb +0 -279
- data/lib/scrubyt/post_processor.rb +0 -73
@@ -1,73 +0,0 @@
|
|
1
|
-
module Scrubyt
  ##
  #=<tt>Post processing results after the extraction</tt>
  #
  #Some things cannot be carried out during evaluation - for example
  #the ensure_presence_of_pattern constraint (since the evaluation is top
  #to bottom, at a given point we don't know yet whether the currently
  #evaluated pattern will have a child pattern or not) or removing unneeded
  #results caused by evaluating multiple filters.
  #
  #The sole purpose of this class is to execute these post-processing tasks.
  #All methods are class-level; the class holds no state of its own.
  class PostProcessor
    ##
    #Remove unneeded results of a pattern (caused by evaluating multiple filters)
    #See for example the B&N scenario - the book titles are extracted two times
    #for every pattern (since both examples generate the same XPath for them)
    #but since always only one of the results has a price, the other is discarded
    #
    #Walks the pattern tree depth-first: the clean-up itself runs only on
    #patterns whose +parent_of_leaf+ is truthy, then the method recurses
    #into every child of +pattern+.
    def self.remove_multiple_filter_duplicates(pattern)
      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
      pattern.children.each { |child| remove_multiple_filter_duplicates(child) }
    end

    ##
    #Issue an error report if the document did not extract anything.
    #Probably this is because the structure of the page changed or
    #because of some rather nasty bug - in any case, something wrong
    #is going on, and we need to inform the user about this!
    #
    #Prints nothing as soon as one direct child of +root_pattern+ has a
    #non-empty result childmap; otherwise writes the warning to standard
    #output. Always returns nil.
    def self.report_if_no_results(root_pattern)
      return if root_pattern.children.any? { |child| child.result.childmap.size > 0 }

      puts
      puts "!!!!!! WARNING: The extractor did not find any result instances"
      puts "Most probably this is wrong. Check your extractor and if you are"
      puts "sure it should work, report a bug!"
      puts
    end

    ##
    #Worker behind remove_multiple_filter_duplicates for a single pattern.
    #
    #1. For every value stored in the pattern's result childmap, collect the
    #   non-nil lookup results of each child pattern. When more than one
    #   child has results, they are padded with nil to a common length and
    #   transposed, so that each row holds one result per child.
    #2. For every value that produced more than one row, the last row is
    #   taken as the set of duplicates.
    #3. Those duplicates are deleted from the children's childmap entries
    #   stored under the same key.
    def self.remove_multiple_filter_duplicates_intern(pattern)
      possible_duplicates = {}
      pattern.result.childmap.each do |result_hash|
        result_hash.each do |_key, values|
          values.each do |value|
            all_child_results = []
            pattern.children.each do |child|
              child_results = child.result.lookup(value)
              all_child_results << child_results unless child_results.nil?
            end
            next if all_child_results.size <= 1

            longest_result = all_child_results.map { |e| e.size }.max
            #The padding is done in place on the arrays handed back by lookup.
            #NOTE(review): if lookup returns the arrays stored in the child's
            #childmap, the nils end up there too until step 3 deletes them -
            #confirm against Result#lookup before changing this.
            all_child_results.each do |child_results|
              (child_results.size + 1).upto(longest_result) { child_results << nil }
            end
            possible_duplicates[value] = all_child_results.transpose
          end
        end
      end

      #Determine the 'real' duplicates: only values with more than one row
      #qualify, and the last row is the one that gets removed
      real_duplicates = {}
      possible_duplicates.each do |value, rows|
        next if rows.size <= 1
        real_duplicates[value] = rows.last
      end

      #Finally, remove them!
      pattern.children.each do |child|
        child.result.childmap.each do |result_hash|
          result_hash.each do |key, values|
            next unless real_duplicates.key?(key)
            real_duplicates[key].each { |duplicate| values.delete(duplicate) }
          end
        end
      end
    end
    #A bare 'private' has no effect on 'def self.' methods, so the helper
    #has to be hidden explicitly
    private_class_method :remove_multiple_filter_duplicates_intern
  end
end
|