scrubyt 0.2.0 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +132 -1
- data/Rakefile +4 -2
- data/lib/scrubyt.rb +15 -10
- data/lib/scrubyt/core/navigation/fetch_action.rb +152 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +106 -0
- data/lib/scrubyt/{constraint.rb → core/scraping/constraint.rb} +0 -0
- data/lib/scrubyt/{constraint_adder.rb → core/scraping/constraint_adder.rb} +0 -0
- data/lib/scrubyt/{filter.rb → core/scraping/filter.rb} +22 -4
- data/lib/scrubyt/{pattern.rb → core/scraping/pattern.rb} +21 -98
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +13 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +88 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +97 -0
- data/lib/scrubyt/core/shared/extractor.rb +116 -0
- data/lib/scrubyt/{export.rb → output/export.rb} +14 -8
- data/lib/scrubyt/output/post_processor.rb +137 -0
- data/lib/scrubyt/{result.rb → output/result.rb} +0 -0
- data/lib/scrubyt/{result_dumper.rb → output/result_dumper.rb} +0 -7
- data/lib/scrubyt/{xpathutils.rb → utils/xpathutils.rb} +5 -2
- data/test/unittests/pattern_test.rb +27 -0
- metadata +40 -17
- data/lib/scrubyt/extractor.rb +0 -279
- data/lib/scrubyt/post_processor.rb +0 -73
@@ -1,73 +0,0 @@
|
|
1
|
-
module Scrubyt
  ##
  #=<tt>Post processing results after the extraction</tt>
  #
  #Some things cannot be carried out during evaluation - for example
  #the ensure_presence_of_pattern constraint (since the evaluation is top
  #to bottom, at a given point we don't know yet whether the currently
  #evaluated pattern will have a child pattern or not) or removing unneeded
  #results caused by evaluating multiple filters.
  #
  #The sole purpose of this class is to execute these post-processing tasks.
  class PostProcessor
    ##
    #Remove unneeded results of a pattern (caused by evaluating multiple filters).
    #See for example the B&N scenario - the book titles are extracted two times
    #for every pattern (since both examples generate the same XPath for them)
    #but since always only one of the results has a price, the other is discarded.
    #
    #Walks the whole pattern tree: every pattern whose +parent_of_leaf+ is
    #truthy is cleaned, then the method recurses into each child.
    #
    #+pattern+:: object responding to +parent_of_leaf+, +children+ and +result+
    #
    #Returns +pattern.children+ (the value of the final +each+).
    def self.remove_multiple_filter_duplicates(pattern)
      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
      pattern.children.each { |child| remove_multiple_filter_duplicates(child) }
    end

    ##
    #Issue an error report if the document did not extract anything.
    #Probably this is because the structure of the page changed or
    #because of some rather nasty bug - in any case, something wrong
    #is going on, and we need to inform the user about this!
    #
    #The warning is written to standard output, and only when none of the
    #children of +root_pattern+ has a non-empty +result.childmap+.
    #
    #Returns +nil+.
    def self.report_if_no_results(root_pattern)
      return if root_pattern.children.any? { |child| child.result.childmap.size > 0 }

      puts
      puts "!!!!!! WARNING: The extractor did not find any result instances"
      puts "Most probably this is wrong. Check your extractor and if you are"
      puts "sure it should work, report a bug!"
      puts
    end

    ##
    #Does the actual duplicate removal for a single pattern.
    #
    #1. For every result instance of +pattern+, collect what each child
    #   pattern extracted for it (+child.result.lookup+), pad those arrays
    #   with +nil+ to a common length and transpose them into rows.
    #2. For every instance with more than one row, the last row is taken
    #   as the set of duplicates.
    #3. Delete the entries of that row from the value arrays stored in the
    #   children's childmaps under the same key.
    def self.remove_multiple_filter_duplicates_intern(pattern)
      possible_duplicates = {}
      pattern.result.childmap.each do |result_map|
        result_map.each do |_key, instances|
          instances.each do |instance|
            child_results = pattern.children.map { |child| child.result.lookup(instance) }.compact
            next if child_results.size <= 1

            longest = child_results.map { |res| res.size }.max
            #The arrays returned by lookup are padded in place so that
            #transpose gets rows of equal length.
            #NOTE(review): if lookup hands out the stored arrays themselves,
            #this padding is visible in the children's results - confirm.
            child_results.each { |res| res << nil while res.size < longest }
            possible_duplicates[instance] = child_results.transpose
          end
        end
      end

      #Determine the 'real' duplicates: the last row of every instance
      #that produced more than one row
      real_duplicates = {}
      possible_duplicates.each do |instance, rows|
        next if rows.size <= 1
        real_duplicates[instance] = rows.last
      end

      #Finally, remove them!
      pattern.children.each do |child|
        child.result.childmap.each do |result_map|
          result_map.each do |key, values|
            next unless real_duplicates.key?(key)
            real_duplicates[key].each { |entry| values.delete(entry) }
          end
        end
      end
    end
    #A bare 'private' does not apply to methods defined with 'def self.',
    #so the helper is hidden explicitly.
    private_class_method :remove_multiple_filter_duplicates_intern
  end
end
|