andyverprauskus-scrubyt 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +355 -0
- data/COPYING +340 -0
- data/README.rdoc +121 -0
- data/Rakefile +101 -0
- data/lib/scrubyt.rb +53 -0
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +318 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +312 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +63 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +107 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/constraint.rb +169 -0
- data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
- data/lib/scrubyt/core/scraping/pattern.rb +359 -0
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
- data/lib/scrubyt/core/shared/extractor.rb +183 -0
- data/lib/scrubyt/logging.rb +154 -0
- data/lib/scrubyt/output/post_processor.rb +139 -0
- data/lib/scrubyt/output/result.rb +44 -0
- data/lib/scrubyt/output/result_dumper.rb +154 -0
- data/lib/scrubyt/output/result_node.rb +145 -0
- data/lib/scrubyt/output/scrubyt_result.rb +42 -0
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
- data/lib/scrubyt/utils/shared_utils.rb +58 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
- data/lib/scrubyt/utils/xpathutils.rb +202 -0
- data/test/blackbox_test.rb +60 -0
- data/test/blackbox_tests/basic/multi_root.rb +6 -0
- data/test/blackbox_tests/basic/simple.rb +5 -0
- data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
- data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
- data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
- data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
- metadata +120 -0
@@ -0,0 +1,14 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Apply different functions on the input document</tt>
  #Before the document is passed to Hpricot for parsing, we may need
  #to do different stuff with it which are clumsy/not appropriate/impossible
  #to do once the document is loaded.
  class PreFilterDocument
    ##
    #Replace <br>, <br/> and <br /> tags (case-insensitive) with "\r\n" and
    #non-breaking spaces with ordinary spaces.
    #
    #doc:: the raw document text; it is not modified
    #
    #Returns a new String containing the converted text.
    def self.br_to_newline(doc)
      #gsub is non-destructive, so its result has to be carried forward
      #(it used to be thrown away, leaving every <br> in place)
      with_newlines = doc.gsub(/<br[ \/]*>/i, "\r\n")

      #Ruby 1.8 strings are plain byte sequences: replace the Latin-1 NBSP byte
      return with_newlines.tr("\240", " ") unless with_newlines.respond_to?(:encoding)

      #On encoding-aware Rubies "\240" is an invalid UTF-8 sequence and makes
      #String#tr raise, so spell the NBSP in the encoding of the document
      nbsp = begin
        [0xA0].pack("U").encode(with_newlines.encoding)
      rescue EncodingError
        #No NBSP in this encoding; raw binary data gets the Latin-1 byte
        with_newlines.encoding == Encoding::BINARY ? [0xA0].pack("C") : nil
      end
      nbsp ? with_newlines.tr(nbsp, " ") : with_newlines
    end #end of function br_to_newline
  end #end of class PreFilterDocument
end #end of module Scrubyt
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Selecting results based on indices</tt>
  #
  #If the results is list-like (as opposed to a 'hard' result, like a _price_ or a _title_),
  #probably with a variable count of results (like tags, authors etc.), you may need just
  #specific elements - like the last one, every third one, or at specific indices.
  #In this case you should use the select_indices syntax.
  class ResultIndexer
    attr_reader :indices_to_extract

    #Only the first argument is interpreted; see select_indices for the
    #accepted index specifications.
    def initialize(*args)
      select_indices(*args)
    end

    ##
    #Perform selection of the desired result instances, based on their indices
    #
    #ary:: the array of results; it is filtered *in place*
    #
    #Returns +ary+ itself, reduced to the elements whose position was selected
    #(original order is kept). If no valid index specification was given,
    #+ary+ is returned untouched.
    def select_indices_to_extract(ary)
      return ary if @indices_to_extract.nil?

      last_index = ary.size - 1
      to_keep = []
      @indices_to_extract.each do |e|
        case e
        when :first         then to_keep << 0
        when :last          then to_keep << last_index
        when :all_but_last  then to_keep.concat((0..last_index - 1).to_a)
        when :all_but_first then to_keep.concat((1..last_index).to_a)
        when :every_even    then to_keep.concat(indices_with_remainder(ary.size, 2, 1))
        when :every_odd     then to_keep.concat(indices_with_remainder(ary.size, 2, 0))
        when :every_second  then to_keep.concat(indices_with_remainder(ary.size, 2, 0))
        when :every_third   then to_keep.concat(indices_with_remainder(ary.size, 3, 0))
        when :every_fourth  then to_keep.concat(indices_with_remainder(ary.size, 4, 0))
        else
          #unknown symbols are ignored, everything else is taken as a literal index
          to_keep << e unless e.is_a?(Symbol)
        end
      end

      #Select by *position*. Looking the position up with ary.index(e) finds
      #the first equal element and therefore breaks on duplicate values.
      kept = []
      ary.each_with_index { |element, i| kept << element if to_keep.include?(i) }
      ary.replace(kept)
    end

    private
    ##
    #Do not return the whole result set, just specified indices - like
    #first,last, every odd index, indices from [1..3] etc.
    #
    #This method can accept:
    #- a range, like (2..3)
    #- an array of indices, like [1,2,3]
    #  (it may also contain ranges, arrays of indices and keywords)
    #- specified set of keywords:
    #  - :first
    #  - :last
    #  - :all_but_first
    #  - :all_but_last
    #  - :every_even
    #  - :every_odd
    #  - :every_second
    #  - :every_third
    #  - :every_fourth
    #
    #Anything else prints "Invalid index specification" and leaves the
    #indexer without a selection (all results are kept).
    def select_indices(*args)
      indices_to_grab = args[0]
      case indices_to_grab
      when Range
        @indices_to_extract = indices_to_grab.to_a
      when Array
        @indices_to_extract = flatten_index_specification(indices_to_grab)
      when Symbol
        #parse this when we already have the results
        @indices_to_extract = [indices_to_grab]
      else
        puts "Invalid index specification"
      end
    end #end of function select_indices

    #Expands the ranges/arrays nested one level deep in +spec+ into a flat
    #list: top-level plain entries first, then the not yet listed members
    #of the nested containers. +spec+ itself is left unchanged.
    def flatten_index_specification(spec)
      nested, flat = spec.partition { |e| index_container?(e) }
      nested.each do |group|
        group.to_a.each { |e| flat << e unless index_container?(e) || flat.include?(e) }
      end
      flat
    end

    #True for entries which hold further indices (arrays and ranges)
    def index_container?(entry)
      entry.is_a?(Array) || entry.is_a?(Range)
    end

    #All positions i in 0...size with i % step == remainder
    def indices_with_remainder(size, step, remainder)
      (0...size).select { |i| i % step == remainder }
    end
  end #end of class ResultIndexer
end #end of module Scrubyt
|
@@ -0,0 +1,183 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Driving the whole extraction process</tt>
  #
  #Extractor is a performer class - it gets an extractor definition and carries
  #out the actions and evaluates the wrappers sequentially.
  #
  #Originally also the navigation actions were here, but since the class got too
  #big, they were factored out to an own class, NavigationAction.
  class Extractor
    include FetchAction

    attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url

    #The definition of the extractor is passed through this method
    #
    #mode:: nil, a symbol such as :production, or an options hash; with
    #       :agent => :firefox the Firewatir navigation is mixed into
    #       FetchAction, otherwise Mechanize is used
    #
    #Returns the result of the extraction (see Extractor#result).
    def self.define(mode=nil, &extractor_definition)
      agent = if mode.is_a?(Hash) && mode[:agent] == :firefox
        Navigation::Firewatir
      else
        Navigation::Mechanize
      end
      FetchAction.class_eval { include agent }

      extractor = self.new(mode, extractor_definition)
      extractor.result
    end

    #Read an extractor definition from +filename+ and run it.
    #
    #NOTE(review): the file content is passed to eval, i.e. it is executed
    #as Ruby code - only load definitions from trusted files.
    def self.load(filename)
      define(&eval(IO.read(filename)))
    end

    #Evaluates +extractor_definition+ (a proc) to collect the root patterns
    #and the optional next_page pattern, runs the extraction and stores the
    #outcome in @result.
    def initialize(mode, extractor_definition)
      @mode = mode
      @root_patterns = []
      @next_page_pattern = nil
      # @hpricot_doc = nil
      # @hpricot_doc_url = nil
      @evaluating_extractor_definition = false
      @next_page_list = []
      @processed_pages = []

      #The file part of the second backtrace entry is recorded as the source of the definition
      backtrace = SharedUtils.get_backtrace
      parts = backtrace[1].split(':')
      source_file = parts[0]

      Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'

      @evaluating_extractor_definition = true
      #The definition is evaluated against a throw-away object: navigation
      #actions come from NavigationActions, next_page defines the paging
      #pattern and every other method call creates a root pattern
      context = Object.new
      context.extend NavigationActions
      context.instance_eval do
        def extractor=(value)
          @extractor = value
        end

        def next_page(*args)
          @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
        end

        def method_missing(method_name, *args, &block)
          root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
          @extractor.root_patterns << root_pattern
          root_pattern
        end
      end
      FetchAction.extractor = self
      context.extractor = self
      context.instance_eval(&extractor_definition)
      @evaluating_extractor_definition = false

      if @root_patterns.empty?
        # TODO: this should be an exception
        Scrubyt.log :ERROR, 'No extractor defined, exiting...'
        exit
      end

      #Once all is set up, evaluate the extractor from the root pattern!
      #(the results are accumulated in @root_results)
      evaluate_extractor
      FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]

      @result = ScrubytResult.new('root')
      @result.push(*@root_results)
      @result.root_patterns = @root_patterns
      @result.source_file = source_file
      @result.source_proc = extractor_definition

      Scrubyt.log :INFO, 'Extraction finished succesfully!'
    end

    def get_hpricot_doc
      FetchAction.get_hpricot_doc
    end

    def get_current_doc_url
      FetchAction.get_current_doc_url
    end

    def get_detail_pattern_relations
      @detail_pattern_relations
    end

    def get_mode
      @mode
    end

    def get_original_host_name
      @original_host_name
    end

    #Queue the page referenced by +result_node+ for a later visit.
    #
    #For an Hpricot element the href of the nearest node carrying one is
    #used (with &amp; unescaped), for a String the string itself. Nothing
    #is queued if no address can be determined.
    def add_to_next_page_list(result_node)
      if result_node.result.is_a? Hpricot::Elem
        node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
        return if node == nil || node.attributes['href'] == nil
        href = node.attributes['href'].gsub('&amp;') {'&'}
      elsif result_node.result.is_a? String
        href = result_node.result
      end
      #neither an element nor a string: do not queue nil as an address
      return if href.nil?
      url = href #TODO need absolute address here 1/4
      @next_page_list << url
    end

    #Evaluate all root patterns on the current document, then keep moving
    #to the next page (found through the next_page pattern or taken from
    #the list filled by add_to_next_page_list) until there is none left or
    #the :limit option of the next_page pattern is reached.
    #
    #Returns the accumulated root results.
    def evaluate_extractor
      @root_results ||= []
      current_page_count = 1
      xpath = nil
      catch :quit_next_page_loop do
        loop do
          url = get_current_doc_url #TODO need absolute address here 2/4
          @processed_pages << url
          @root_patterns.each do |root_pattern|
            @root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
          end

          node = nil
          #look for an address which was not processed yet
          while @processed_pages.include? url #TODO need absolute address here 3/4
            if !@next_page_pattern.nil?
              if @next_page_pattern.options[:limit] == current_page_count
                throw :quit_next_page_loop
              end
              unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
                throw :quit_next_page_loop
              end
              xpath = @next_page_pattern.filters[0].xpath
              node = (get_hpricot_doc/xpath).last
              node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
              if node == nil || node.attributes['href'] == nil
                throw :quit_next_page_loop
              end
              href = node.attributes['href'].gsub('&amp;') {'&'}
              url = href #TODO need absolute address here 4/4
            else
              throw :quit_next_page_loop if @next_page_list.empty?
              url = @next_page_list.pop
            end
          end

          restore_host_name
          #a bare "#" can not be fetched - click the matched element instead
          if url == "#"
            FetchAction.click_by_xpath_without_evaluate(xpath)
          else
            FetchAction.fetch(url)
          end

          current_page_count += 1
        end
      end
      @root_patterns = []
      @root_results
    end

  end
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#
|
2
|
+
# TODO: if multiline messages aren't needed, then remove them.
|
3
|
+
#
|
4
|
+
# TODO: switch to the conventional Ruby logger interface,
|
5
|
+
# or create an adapter to it. If the former, then decided what to
|
6
|
+
# do with the unit tests.
|
7
|
+
#
|
8
|
+
|
9
|
+
module Scrubyt
  # Logging is disabled by default. It can be enabled as follows:
  #
  #   Scrubyt.logger = Scrubyt::Logger.new # logs *all* messages to STDERR
  #
  def self.logger=(logger)
    @logger = logger
  end

  # Simple logger implementation, based on Scrubyt's original logging style.
  # Messages will be sent to STDERR. Logging can be limited to certain message
  # levels by specifying them on initialization, e.g.
  #
  #   Scrubyt::Logger.new(:ACTION, :ERROR) # will only log action/error messages
  #
  class Logger
    # A single-line message, rendered as "[LEVEL] text".
    class Message
      def initialize(level, text)
        @level, @text = level.to_s, text.to_s
      end

      def to_s
        prefix + @text
      end

      protected

      def prefix
        @prefix ||= "[#{@level}] "
      end
    end

    # A message given as an array of lines: the first line is rendered like
    # a Message, the remaining ones are indented by the width of the prefix.
    class MultiLineMessage < Message
      def initialize(level, lines)
        # Destructure instead of lines.shift, so that the array of the
        # caller keeps all of its lines.
        first_line, *remaining_lines = lines
        super(level, first_line)

        @lines = remaining_lines
      end

      def to_s
        # Joining the lines directly avoids a dangling "\n" when there is
        # no continuation line.
        [ super, *indented_lines ].join("\n")
      end

      private

      def indented_lines
        @lines.map { |line| indented(line) }
      end

      def indented(line)
        ' ' * prefix.length + line.to_s
      end
    end

    # levels:: the message levels to log; if none is given, everything is logged
    def initialize(*levels)
      @levels = levels
    end

    # Write +message+ (a String, or an Array of lines) to the output stream,
    # unless the logger is limited to levels other than +level+.
    def log(level, message)
      return unless logging?(level)

      message_class = message.is_a?(Array) ? MultiLineMessage : Message

      output_stream.puts message_class.new(level, message)
    end

    # The stream messages are written to; STDERR unless one was assigned.
    def output_stream
      @output_stream || STDERR
    end

    attr_writer :output_stream

    private

    def logging?(level)
      @levels.empty? || @levels.include?(level)
    end
  end

  # Hand +message+ to the configured logger; does nothing if there is none.
  def self.log(level, message)
    return if logger.nil?

    logger.log(level, message)
  end

  # The currently configured logger (nil if logging is disabled).
  # A bare `private` does not apply to `def self.` methods, so this reader
  # has always been callable from outside; it is deliberately kept that way.
  def self.logger
    @logger
  end
end
|
100
|
+
|
101
|
+
|
102
|
+
# Self-test of the logging facilities; only runs when this file is executed directly.
if __FILE__ == $0 then

  require 'test/unit'

  class ScrubytLoggingTestCase < Test::Unit::TestCase
    # Collects everything written with puts as strings, so the tests can inspect it
    class FauxOutputStream < Array
      def puts(object)
        self << object.to_s
      end
    end

    # Installs a logger (created with +logger_args+) which writes to @stream
    def setup_logger_with_faux_output_stream!(*logger_args)
      @stream = FauxOutputStream.new
      logger = Scrubyt::Logger.new(*logger_args)
      logger.output_stream = @stream
      Scrubyt.logger = logger
    end

    def test_that_logging_works_with_nil_logger
      Scrubyt.logger = nil
      assert_nothing_raised { Scrubyt.log(:ERROR, 'message') }
    end

    def test_simple_messages_are_output_correctly
      setup_logger_with_faux_output_stream!

      Scrubyt.log :ACTION, 'i just did something'

      assert_equal 1, @stream.size
      assert_equal '[ACTION] i just did something', @stream.first
    end

    def test_that_multiline_messages_are_output_correctly
      setup_logger_with_faux_output_stream!

      Scrubyt.log :ERROR, ['something bad happened', 'dear oh dear']

      assert_equal 1, @stream.size
      # continuation lines are indented by the width of the "[ERROR] " prefix (8 characters)
      assert_equal "[ERROR] something bad happened\n        dear oh dear", @stream.first
    end

    def test_that_loggers_can_be_limited_to_specfied_message_levels
      setup_logger_with_faux_output_stream! :ERROR

      Scrubyt.log :ACTION, 'i just did something'
      Scrubyt.log :ERROR, 'something bad happened'

      assert_equal 1, @stream.size
      assert_equal '[ERROR] something bad happened', @stream.first
    end
  end

end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
module Scrubyt

  ########################################## NOT USED ANY MORE ##########################################
  require 'set'
  ##
  #=<tt>Post processing results after the extraction</tt>
  #Some things can not be carried out during evaluation - for example
  #the ensure_presence_of_pattern constraint (since the evaluation is top
  #to bottom, at a given point we don't know yet whether the currently
  #evaluated pattern will have a child pattern or not) or removing unneeded
  #results caused by evaluating multiple filters.
  #
  #The sole purpose of this class is to execute these post-processing tasks.
  class PostProcessor
    ##
    #This is just a convenience method to call all the postprocessing
    #functionality and checks
    def self.apply_post_processing(root_pattern)
      ensure_presence_of_pattern_full(root_pattern)
      #a root pattern without children has nothing to deduplicate
      first_child = root_pattern.children[0]
      remove_multiple_filter_duplicates(root_pattern) if first_child && first_child.filters.size > 1
      report_if_no_results(root_pattern) if root_pattern.evaluation_context.extractor.get_mode != :production
    end

    ##
    #Apply the ensure_presence_of_pattern constraint on
    #the full extractor
    def self.ensure_presence_of_pattern_full(pattern)
      ensure_presence_of_pattern(pattern)
      pattern.children.each {|child| ensure_presence_of_pattern_full(child)}
    end

    ##
    #Remove unneeded results of a pattern (caused by evaluating multiple filters)
    #See for example the B&N scenario - the book titles are extracted two times
    #for every pattern (since both examples generate the same XPath for them)
    #but since always only one of the results has a price, the other is discarded
    def self.remove_multiple_filter_duplicates(pattern)
      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
      pattern.children.each {|child| remove_multiple_filter_duplicates(child)}
    end

    ##
    #Issue an error report if the document did not extract anything.
    #Probably this is because the structure of the page changed or
    #because of some rather nasty bug - in any case, something wrong
    #is going on, and we need to inform the user about this!
    def self.report_if_no_results(root_pattern)
      #a single child with results is enough to stay silent
      return if root_pattern.children.any? {|child| child.result.childmap.size > 0}

      Scrubyt.log :WARNING, [
        "The extractor did not find any result instances. Most probably this is wrong.",
        "Check your extractor and if you are sure it should work, report a bug!"
      ]
    end

    #The methods below are internal helpers. They are class methods, which a
    #bare 'private' does not affect, so they stay callable as before.

    #Drop those result instances of +pattern+ which lack a descendant
    #extracted by a child pattern named in an ensure_presence_of_pattern
    #constraint.
    def self.ensure_presence_of_pattern(pattern)
      #holds the name of those child patterns which have to be present as children of the input parameter
      epop_names = pattern.constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
      return if epop_names.empty?
      #all_parent_values holds instances extracted by pattern
      all_parent_values = []
      pattern.result.childmap.each { |h| all_parent_values << h.values }
      all_parent_values.flatten!
      #indices of result instances (of pattern) we are going to remove
      results_to_remove = Set.new
      pattern.children.each do |child_pattern|
        #all_child_values holds instances extracted by child_pattern
        all_child_values = []
        child_pattern.result.childmap.each { |h| all_child_values << h.values }
        all_child_values.flatten!

        #populate results_to_remove
        all_parent_values.each_with_index do |parent_value, i|
          #Hey! Not just the direct children but all the ancestors
          @found_ancestor = false
          check_ancestors(parent_value, all_child_values)

          results_to_remove << i if (!@found_ancestor && (epop_names.include? child_pattern.name))
        end
      end
      #based on results_to_remove, populate the array 'rejected' which holds the actual instances
      #(and not indices, as in the case of results_to_remove!). In other words, we are mapping
      #results_to_remove indices to their actual instances
      #NOTE(review): 'rejected' is overwritten for every key of every hash, so
      #only the last value list contributes - confirm this is intended
      rejected = []
      i = -1
      pattern.result.childmap.each do |h|
        h.each { |k,v| rejected = v.reject {|e| i += 1; !results_to_remove.include? i } }
      end

      #Finally, do the actual delete!
      pattern.result.childmap.each { |h| h.each { |k,v| rejected.each { |r| v.delete(r)} } }
    end

    #Set @found_ancestor if any node below +parent_value+ (at any depth) is
    #one of +all_child_values+. Only Hpricot elements are descended into.
    def self.check_ancestors(parent_value, all_child_values)
      return unless parent_value.is_a? Hpricot::Elem
      parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
      parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
    end

    #Does the actual work of remove_multiple_filter_duplicates for one pattern
    def self.remove_multiple_filter_duplicates_intern(pattern)
      possible_duplicates = {}
      pattern.result.childmap.each { |result_hash|
        result_hash.each do |k,v|
          v.each do |x|
            all_child_results = []
            pattern.children.each { |child|
              temp_res = child.result.lookup(x)
              all_child_results << temp_res if temp_res != nil
            }
            next if all_child_results.size <= 1
            #pad the shorter results with nil so that they can be transposed
            longest_result = all_child_results.map {|e| e.size}.max
            all_child_results.each { |child_result| (child_result.size+1).upto(longest_result) { child_result << nil } }
            possible_duplicates[x] = all_child_results.transpose
          end
        end
      }
      #Determine the 'real' duplicates: the last row of every candidate
      #which has more than one row
      real_duplicates = {}
      possible_duplicates.each { |k,v|
        next if v.size == 1
        real_duplicates[k] = v.last unless v.empty?
      }

      #Finally, remove them!
      pattern.children.each { |child|
        child.result.childmap.each { |result_hash|
          result_hash.each { |k,v|
            real_duplicates[k].each {|e| v.delete e} if real_duplicates.key? k
          }
        }
      }
    end #end of function
  end #end of class PostProcessor
end #end of module Scrubyt
|