scrubber-scrubyt 0.4.11
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +343 -0
- data/COPYING +340 -0
- data/README +99 -0
- data/Rakefile +101 -0
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/constraint.rb +169 -0
- data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
- data/lib/scrubyt/core/scraping/pattern.rb +359 -0
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
- data/lib/scrubyt/core/shared/extractor.rb +167 -0
- data/lib/scrubyt/logging.rb +154 -0
- data/lib/scrubyt/output/post_processor.rb +139 -0
- data/lib/scrubyt/output/result.rb +44 -0
- data/lib/scrubyt/output/result_dumper.rb +154 -0
- data/lib/scrubyt/output/result_node.rb +140 -0
- data/lib/scrubyt/output/scrubyt_result.rb +42 -0
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
- data/lib/scrubyt/utils/shared_utils.rb +58 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
- data/lib/scrubyt/utils/xpathutils.rb +202 -0
- data/lib/scrubyt.rb +43 -0
- data/test/blackbox_test.rb +60 -0
- data/test/blackbox_tests/basic/multi_root.rb +6 -0
- data/test/blackbox_tests/basic/simple.rb +5 -0
- data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
- data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
- data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
- data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
- metadata +115 -0
@@ -0,0 +1,112 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Filter out relevant pieces from the parent pattern</tt>
  #
  #A Scrubyt extractor is almost like a waterfall: the HTML document is poured
  #in at the top and travels down through the pattern tree. Filters are the
  #stations on the way down: each one keeps the interesting parts of what
  #arrives from above and discards the rest.
  #
  #Example layout:
  #
  # root --> This pattern is called a 'root pattern', It is invisible to you
  #  |       and basically it represents the document; it has no filters
  #  +-- item --> Filter what's coming from above (the whole document) to get
  #  |            relevant pieces of data (in this case webshop items)
  #      +-- name --> Again, filter what's coming from above (a webshop item) and
  #      |            leave only item names after this operation
  #      +-- image --> This time filter the image of the item
  #          |
  #          +-- href --> And finally, from the image elements, get the attribute 'href'
  class BaseFilter
    #Type of the example this filter is extracted with

    #XPath example, like html/body/tr/td[1] etc.
    EXAMPLE_TYPE_XPATH = 0
    #String from the document, for example 'Canon EOS 300 D'.
    EXAMPLE_TYPE_STRING = 1
    #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
    EXAMPLE_TYPE_IMAGE = 2
    #No example - the actual XPath is determined from the children XPaths (their LCA)
    EXAMPLE_TYPE_CHILDREN = 3

    #Regexp example, like /\d+@*\d+[a-z]/
    EXAMPLE_TYPE_REGEXP = 4
    #Compound example, like :contains => 'goodies'
    EXAMPLE_TYPE_COMPOUND = 5

    attr_accessor(:example_type, :parent_pattern, :temp_sink,
                  :constraints, :xpath, :regexp, :example, :final_result)

    #Factory method: build the concrete filter subclass matching the parent
    #pattern's type (e.g. :tree -> TreeFilter); the root pattern gets a plain
    #BaseFilter since it performs no filtering itself.
    def self.create(parent_pattern, example=nil)
      filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
      if filter_name == 'RootFilter'
        BaseFilter.new(parent_pattern, example)
      else
        #Resolve the class by name instead of eval'ing a generated string --
        #same lookup, without instance_eval on constructed code.
        Scrubyt.const_get(filter_name).new(parent_pattern, example)
      end
    end

    #Dispatcher method to add constraints; of course, as with any method_missing, this method
    #should not be called directly

    #TODO still used?
    alias_method :throw_method_missing, :method_missing
    def method_missing(method_name, *args, &block)
      case method_name.to_s
      when /^ensure.+/
        #Delegate ensure_* calls to the Constraint factory (add_ensure_*).
        constraints << Constraint.send("add_#{method_name}", self, *args)
      else
        throw_method_missing(method_name, *args, &block)
      end
    end

    private
    #We don't want this to be accessible from outside
    def initialize(parent_pattern, example)
      case parent_pattern.example_type
      when :xpath
        @example_type = EXAMPLE_TYPE_XPATH
      else
        @example_type = BaseFilter.determine_example_type(example)
      end
      @parent_pattern = parent_pattern
      @example = example
      @xpath = nil #The xpath to evaluate this filter
      @constraints = [] #list of constraints
    end

    #Classify the example into one of the EXAMPLE_TYPE_* constants based on
    #its Ruby type (Regexp/Hash) or, for strings, its shape (image URL,
    #XPath-looking path, or plain document text).
    def self.determine_example_type(example)
      if example.instance_of? Regexp
        EXAMPLE_TYPE_REGEXP
      elsif example.instance_of? Hash
        EXAMPLE_TYPE_COMPOUND
      else
        case example
        when nil
          EXAMPLE_TYPE_CHILDREN
        when /\.(jpg|png|gif|jpeg)(\[\d+\])?$/
          EXAMPLE_TYPE_IMAGE
        when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*(\[@.+=.+\])?(\/@.+)?$/
          #Parenthesized explicitly: the original `a.include? '/' || b`
          #parsed as a.include?('/' || b), silently dropping the '[' check.
          (example.include?('/') || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
        else
          EXAMPLE_TYPE_STRING
        end
      end
    end #end of method
  end #End of class
end #End of module
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Scrubyt
  #Follows a link from the current document to a detail page, runs a nested
  #extractor on that page, then restores the original page and host name.
  class DetailPageFilter < BaseFilter

    def evaluate(source)
      #A String source is already the URL; otherwise locate the nearest node
      #carrying an 'href' attribute and use its value.
      url = if source.is_a?(String)
              source
            else
              XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
            end

      #Remember where we were so the detour can be undone afterwards.
      @parent_pattern.extractor.store_page
      original_host_name = @parent_pattern.extractor.get_host_name
      @parent_pattern.extractor.restore_host_name

      begin
        FetchAction.fetch url, :resolve => @parent_pattern.resolve
      rescue
        Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
      end

      #Build the nested extractor lazily on first use; afterwards just
      #re-evaluate it against the freshly fetched page.
      root_results =
        if @detail_extractor.nil?
          @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
          @detail_extractor.result
        else
          @detail_extractor.evaluate_extractor
        end

      @parent_pattern.extractor.restore_page
      @parent_pattern.extractor.store_host_name original_host_name

      root_results
    end

  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'net/http'
require 'fileutils'

module Scrubyt
  #Downloads the file referenced by the evaluated source into the directory
  #given as the filter's example, deduplicating clashing file names with a
  #_1, _2, ... suffix.
  class DownloadFilter < BaseFilter

    #Returns the (possibly deduplicated) local file name of the download.
    def evaluate(source)
      download_file(source)
    end #end of method

    private
    #Fetch +source+ over HTTP and write it under @example (the target dir).
    #Returns '' for degenerate sources, nil for excluded file names, or the
    #bare file name ('[FAILED]...' prefixed on timeout).
    def download_file(source)
      return '' if source.size < 4
      #Absolute URLs carry their own host; otherwise fall back to the
      #extractor's current host.
      host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
      outfile = nil
      host_name += "/" if host_name[-1..-1] != "/"
      base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
      file_name = source.scan(/.+\/(.*)/)[0][0]
      return nil if @parent_pattern.except.include? file_name
      Net::HTTP.start(base_url) { |http|
        Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
        begin
          ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
          #The original passed the raw scan result (an array of arrays) to
          #http.get; extract the actual path string.
          path = host_name.scan(/http:\/\/#{base_url}(.+)\//)[0][0]
          resp = http.get(path, {'User-Agent'=> ua})
          outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
          FileUtils.mkdir_p @example
          #File.open instead of Kernel#open: a scraped name starting with
          #'|' would otherwise be executed as a shell command.
          File.open(outfile, 'wb') {|f| f.write(resp.body) }
        rescue Timeout::Error
          outfile = "[FAILED]#{file_name}"
        end
      }
      outfile.scan(/.+\/(.*)/)[0][0]
    end

    #Return a file name that does not exist yet: on the first clash insert
    #'_1' (before the extension if there is one), then keep bumping the
    #counter until the name is free. Mutates and returns +file_name+.
    def self.find_nonexisting_file_name(file_name)
      already_found = false
      loop do
        #File.exists? was removed in Ruby 3.2; File.exist? is the survivor.
        if File.exist? file_name
          if already_found
            if file_name.include?('.')
              last_no = file_name.scan(/_(\d+)\./)[0][0]
              file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
            else
              last_no = file_name.scan(/_(\d+)$/)[0][0]
              file_name.sub!(/_#{last_no}$/) {"_#{(last_no.to_i+1).to_s}"}
            end
          else
            #NOTE(review): sub! replaces the FIRST '.' in the whole path, so a
            #dot inside a directory component would be mangled -- confirm
            #callers only pass dot-free directories.
            if file_name.include?('.')
              file_name.sub!(/\./) {"_1\."}
              already_found = true
            else
              file_name << '_1'
              already_found = true
            end
          end
        else
          break
        end
      end
      file_name
    end #end of method
  end #End of class DownloadFilter
end #End of module Scrubyt
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module Scrubyt
  #Applies the regular expression given as the example (@example) to the
  #incoming content and returns all captures, flattened into one array.
  class RegexpFilter < BaseFilter

    #Scan either a plain string or an Hpricot node (markup stripped first)
    #with the example regexp.
    def evaluate(source)
      text = source.is_a?(String) ? source : source.inner_html.gsub(/<.*?>/, '')
      text.scan(@example).flatten
    end

  end #End of class RegexpFilter
end #End of module Scrubyt
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Scrubyt
  #Delegates extraction to a user-supplied callable (@example): the source
  #text is handed to it and its return value becomes the filter result.
  class ScriptFilter < BaseFilter

    #Pass the source (markup stripped unless it is already a string) to the
    #example callable.
    def evaluate(source)
      input = source.is_a?(String) ? source : source.inner_html.gsub(/<.*?>/, "")
      @example.call input
    end

  end #End of class ScriptFilter
end #End of module Scrubyt
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Scrubyt
  #Extracts elements by textual example, e.g. "td[Some text]:0",
  #"td[Some text]:all", or "find(foo|bar)".
  class TextFilter < BaseFilter

    #Evaluate the textual example against +source+. Returns an array of
    #ancestor elements named +final_element_name+ around the match(es),
    #or "" when nothing matched.
    def evaluate(source)
      return find_string(source) if @example =~ /^find\(/
      #Example shape: "element_name[text]:index" -- pull the pieces apart.
      final_element_name = @example.scan(/^(.+?)\[/)[0][0]
      text = Regexp.escape(@example.scan(/\[(.+?)\]/)[0][0])

      index = @example.scan(/\]:(.+)/).flatten
      index = 0 if index.empty?
      index = index[0].to_i unless index[0] == "all"
      #Fixnum was removed in Ruby 3.2; Integer covers the same values.
      #Single traversal instead of the duplicated call in each ternary arm.
      matches = SharedUtils.traverse_for_match(source, /#{text}/)
      result = index.is_a?(Integer) ? matches[index] : matches
      return "" unless result

      #When index is an Integer, index[0] is Integer#[] (bit access) and can
      #never equal "all" -- preserved quirk of the original dispatch.
      if index[0] == "all"
        result.map { |r| XPathUtils.traverse_up_until_name(r, final_element_name) }
      else
        [XPathUtils.traverse_up_until_name(result, final_element_name)]
      end
    end

    #find(...) form: return the first candidate string (|-separated) that
    #occurs in the source, as a one-element array; [] when none match.
    def find_string(source)
      str = @example.scan(/find\((.+)\)/).flatten[0]
      strings_to_find = str.include?('|') ? str.split('|') : [str]
      strings_to_find.each do |s|
        result = SharedUtils.traverse_for_match(source, /#{s}/i)
        return [s] unless result.empty?
      end
      []
    end

  end #End of class TextFilter
end #End of module Scrubyt
|
34
|
+
|
@@ -0,0 +1,138 @@
|
|
1
|
+
module Scrubyt
  #Tree filter: selects document subtrees via an XPath generated from the
  #example, optionally post-filtering the matches with a generated regexp.
  class TreeFilter < BaseFilter

    #Evaluate this filter against +source+ (an Hpricot node/doc). Returns an
    #array of nodes/attribute strings, or regexp capture strings when
    #@regexp is set. A pre-computed @final_result short-circuits everything.
    def evaluate(source)
      return [@final_result] if @final_result
      #Crude hack! Drop it after it will be supported in Hpricot
      #(attribute steps "/@attr" are split off the XPath and applied by hand
      #below; note this mutates @example and @xpath).
      if @xpath =~ /.+\/@.+$/
        @example = @xpath
        @xpath = @xpath.scan(/^(.+?)\/@/)[0][0]
      end
      result = source/@xpath

      Scrubyt.log :ACTION, "Evaluating #{@parent_pattern.name} with #{@xpath}"

      #Normalize: Hpricot may return a single node or an Elements collection.
      xpath_results = Hpricot::Elements === result ? result : [result]

      #Apply the attribute step stripped off above, if any.
      if @example =~ /.+\/@.+$/
        result_attribute = @example.scan(/.+\/@(.+?)$/)[0][0]
        xpath_results.map! {|r| r.attributes[result_attribute] }
      end
      if @regexp == nil
        xpath_results
      else
        regexp_results = []
        xpath_results.each do |entry|
          #NOTE(review): this reads result.inner_html, not entry.inner_html,
          #so the block variable is unused and every iteration inspects the
          #same text -- looks like a bug; confirm intended behavior.
          text = SharedUtils.prepare_text_for_comparison(result.inner_html)
          if text =~ @regexp
            regexp_results << $1
          end
        end
        regexp_results
      end
    end

    #Derive @regexp from the string example: the example's surrounding text
    #is frozen into an anchored pattern, with the changing ranges replaced by
    #wildcards and the matched range by a capture group. No-op unless the
    #example is a partial string match inside a larger text node.
    def generate_regexp_for_example
      return if @example_type != EXAMPLE_TYPE_STRING
      return if @temp_sink.nil?
      return if @temp_sink.is_a? String
      return if @example =~ /.+\[.+\]$/

      text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_html.gsub(/<.*?>/, ''))
      match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
      #The example covers the whole text -- no regexp needed.
      return if match_range == (0..text.length)

      @regexp = text
      #Replace ranges back-to-front so earlier offsets stay valid.
      @temp_sink.changing_ranges.sort.reverse.each do |range|
        @regexp[range] = if range == match_range then '<<<regexp_selection>>>' else '<<<regexp_changing>>>' end
      end
      @regexp = Regexp.escape(@regexp)
      @regexp = @regexp.gsub('<<<regexp_changing>>>', '.*?')
      @regexp = @regexp.gsub('<<<regexp_selection>>>', '(.*?)')
      @regexp = '^' + @regexp + '$'
      @regexp = /#{@regexp}/
    end


    #For all the tree patterns, generate an XPath based on the given example
    #Also this method should not be called directly; It is automatically called for every tree
    #pattern directly after wrapper definition
    def generate_XPath_for_example(next_page_example=false)
      #puts "generating example for: #{@parent_pattern.name}"
      #puts @example_type
      case @example_type
      when EXAMPLE_TYPE_XPATH
        #The example already is the XPath.
        @xpath = @example
      when EXAMPLE_TYPE_STRING
        #Locate the node containing the example text in the current document.
        @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
                                                             @example,
                                                             next_page_example)
        return if @temp_sink == nil
        #A String sink means the lookup resolved to a final value directly.
        if @temp_sink.is_a? String
          @final_result = @temp_sink
          return
        end

        #Attach a 'changing_ranges' accessor to this particular node so
        #generate_regexp_for_example can later see which span matched.
        #NOTE(review): `@changing_ranges ||= [] << range` parses as
        #`||= ([] << range)` -- later calls on the same node do not append;
        #confirm that is intended.
        mark_changing_ranges = lambda { |element, range|
          element.instance_eval do
            @changing_ranges ||= [] << range
            def changing_ranges
              @changing_ranges
            end
          end
        }
        mark_changing_ranges.call(@temp_sink, @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0))
        #Next-page examples always need exact indices; otherwise indices are
        #written only when the pattern is not generalized.
        write_indices = next_page_example ? true : !@parent_pattern.generalize
        @xpath = XPathUtils.generate_XPath(@temp_sink, nil, write_indices)
      when EXAMPLE_TYPE_CHILDREN
        #No own example: derive this pattern's XPath as the lowest common
        #ancestor (LCA) of the children's example nodes, once per example
        #index, and rewrite the children's XPaths relative to it.
        current_example_index = 0
        loop do
          all_child_temp_sinks = []
          @parent_pattern.children.each do |child_pattern|
            all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink if child_pattern.filters[current_example_index].temp_sink
          end
          result = all_child_temp_sinks.pop
          if all_child_temp_sinks.empty?
            #Single child: its parent node stands in for the LCA.
            result = result.parent
          else
            all_child_temp_sinks.each do |child_sink|
              result = XPathUtils.lowest_common_ancestor(result, child_sink)
            end
          end
          xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(result, nil, false) :
                                               XPathUtils.generate_XPath(result, nil, true)
          #Grow our own filter list so there is a slot for this example index.
          if @parent_pattern.filters.size < current_example_index + 1
            @parent_pattern.filters << Scrubyt::BaseFilter.create(@parent_pattern)
          end
          @parent_pattern.filters[current_example_index].xpath = xpath
          @parent_pattern.filters[current_example_index].temp_sink = result
          #Rewrite each child's XPath relative to the freshly found LCA.
          @parent_pattern.children.each do |child_pattern|
            next if child_pattern.type == :detail_page
            child_pattern.filters[current_example_index].xpath =
              child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) :
                                         XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result)
          end
          break if @parent_pattern.children[0].filters.size == current_example_index + 1
          current_example_index += 1
        end
      when EXAMPLE_TYPE_IMAGE
        #Find the <img> node whose source matches the example URL.
        @temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
        @xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
      when EXAMPLE_TYPE_COMPOUND
        #Compound (hash) example, e.g. :contains => '...'.
        @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
                                                                          @example,
                                                                          next_page_example)
        @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
                                              XPathUtils.generate_XPath(@temp_sink, nil, true)
      end
    end

    #Rewrite @xpath to be relative to +parent_xpath+; attribute-predicate
    #parents are first expanded to a full XPath against the current document.
    def generate_relative_XPath(parent_xpath)
      parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
                                              parent_xpath,
                                              @parent_pattern.parent.generalize) if parent_xpath =~ /(\[@.+=.+\])$/
      @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
    end

  end #End of class TreeFilter
end #End of module Scrubyt
|