scrubber-scrubyt 0.4.11
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +343 -0
- data/COPYING +340 -0
- data/README +99 -0
- data/Rakefile +101 -0
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/constraint.rb +169 -0
- data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
- data/lib/scrubyt/core/scraping/pattern.rb +359 -0
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
- data/lib/scrubyt/core/shared/extractor.rb +167 -0
- data/lib/scrubyt/logging.rb +154 -0
- data/lib/scrubyt/output/post_processor.rb +139 -0
- data/lib/scrubyt/output/result.rb +44 -0
- data/lib/scrubyt/output/result_dumper.rb +154 -0
- data/lib/scrubyt/output/result_node.rb +140 -0
- data/lib/scrubyt/output/scrubyt_result.rb +42 -0
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
- data/lib/scrubyt/utils/shared_utils.rb +58 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
- data/lib/scrubyt/utils/xpathutils.rb +202 -0
- data/lib/scrubyt.rb +43 -0
- data/test/blackbox_test.rb +60 -0
- data/test/blackbox_tests/basic/multi_root.rb +6 -0
- data/test/blackbox_tests/basic/simple.rb +5 -0
- data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
- data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
- data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
- data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
- metadata +115 -0
@@ -0,0 +1,50 @@
|
|
1
|
+
module Scrubyt
  #=<tt>Lookup of compound examples</tt>
  #There are two types of string examples in scRUBYt! right now:
  #the simple example and the compound example.
  #
  #This class is responsible for finding elements matched by compound examples.
  #In the future probably more sophisticated matching algorithms will be added
  #(e.g. match the n-th which matches the text, or element that matches the
  #text but also contains a specific attribute etc.)
  class CompoundExampleLookup
    #Find the node matched by a compound example.
    #
    #*parameters*
    #
    #_doc_ - the document (or subtree) handed to SharedUtils.traverse_for_match
    #
    #_compound_example_ - hash of descriptors (:contains, :begins_with, :ends_with)
    #
    #_next_link_ - accepted for interface compatibility; not used by this method
    #
    #_index_ - which of the matching nodes to return (0 = the first one)
    #
    #Returns the matching node, or nil if there is no match at that index.
    def self.find_node_from_compund_example(doc, compound_example, next_link=false, index = 0)
      @partial_results = []
      self.lookup_compound_example(doc, compound_example, index)
    end

    #NOTE: the helpers below were preceded by a bare +private+, which has no
    #effect on methods defined with <tt>def self.</tt>. They are therefore
    #callable from outside and are left that way for backward compatibility.

    #Lookup the element at position _index_ which is matched by this compound example
    #
    #A compound example is specified with :contains, :begins_with and
    #:ends_with descriptors - which can be both regexps or strings
    #
    #Example:
    #
    #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
    #
    #Every descriptor has to match: the first one seeds the candidate list,
    #each following one only narrows it down.
    def self.lookup_compound_example(doc, compound_example, index)
      first_descriptor = true
      compound_example.each do |k,v|
        v = Regexp.escape(v) if v.is_a? String
        case k
        when :contains
          v = /#{v}/
        when :begins_with
          v = /^\s*#{v}/
        when :ends_with
          v = /#{v}\s*$/
        end
        #Only the very first descriptor may start a fresh search. An empty
        #candidate list after that means an earlier descriptor failed, and
        #it must stay empty instead of restarting from the document.
        if first_descriptor && (@partial_results.nil? || @partial_results.empty?)
          @partial_results = SharedUtils.traverse_for_match(doc, v)
        else
          refine_partial_results(v)
        end
        first_descriptor = false
      end
      (@partial_results || [])[index]
    end

    #Keep only those candidates whose text (inner HTML with the tags stripped)
    #matches _regexp_ as well.
    def self.refine_partial_results(regexp)
      @partial_results = (@partial_results || []).select {|pr| pr.inner_html.gsub(/<.*?>/, '') =~ regexp}
    end

  end #End of class CompoundExampleLookup
end #End of module Scrubyt
|
@@ -0,0 +1,85 @@
|
|
1
|
+
#Class-level macros for declaring options that are stored in the
#<tt>@options</tt> hash of the instance.
class Module
  #Define a reader method for every key of _key_default_hash_.
  #
  #The reader returns <tt>@options[key]</tt>, or the default if that value is
  #nil (or <tt>@options</tt> itself has not been set up). A Proc default is
  #evaluated in the context of the instance on every call.
  def option_reader(key_default_hash)
    key_default_hash.each do |key, default|
      define_method(key) {
        stored = @options.nil? ? nil : @options[key]
        if stored.nil?
          if default.is_a? Proc
            #instance_eval would always yield the receiver as an argument,
            #which zero-arity lambdas reject with an ArgumentError; blocks
            #that declare parameters still receive the instance as before
            default.arity == 0 ? instance_exec(&default) : instance_exec(self, &default)
          else
            default
          end
        else
          stored
        end
      }
    end
  end

  #Define a <tt>key=</tt> writer for every given key; the value is stored in
  #<tt>@options</tt> (which is created on demand).
  def option_writer(*keys)
    keys.each do |key|
      define_method("#{key.to_s}=".to_sym) { |value|
        (@options ||= {})[key] = value
      }
    end
  end

  #Define a reader for _key_ with the given _default_, and a writer as well
  #if _writable_ is true.
  def option(key, default=nil, writable=false)
    option_reader(key => default)
    option_writer(key) if writable
  end

  #Define both reader and writer for every key of _key_default_hash_.
  def option_accessor(key_default_hash)
    key_default_hash.each do |key, default|
      option(key, default, true)
    end
  end
end
|
37
|
+
|
38
|
+
#Extensions for comparing and shifting ranges.
class Range
  #Order ranges by their start value.
  #
  #Returns nil (the usual "not comparable" answer of <tt><=></tt>) when
  #_other_ has no +begin+.
  def <=>(other)
    return nil unless other.respond_to?(:begin)
    self.begin <=> other.begin
  end

  #Shift both ends of the range up by _amount_. The result excludes its end
  #exactly when the receiver does.
  def +(amount)
    Range.new(self.begin + amount, self.end + amount, exclude_end?)
  end

  #Shift both ends of the range down by _amount_. The result excludes its end
  #exactly when the receiver does.
  def -(amount)
    Range.new(self.begin - amount, self.end - amount, exclude_end?)
  end
end
|
51
|
+
|
52
|
+
module Math
  #Smaller of the two arguments; _b_ is returned unless _a_ is strictly
  #less than it.
  def self.min(a, b)
    return a if a < b
    b
  end

  #Larger of the two arguments; _b_ is returned unless _a_ is strictly
  #greater than it.
  def self.max(a, b)
    return a if a > b
    b
  end
end
|
61
|
+
|
62
|
+
#dec 16: Dropped - causes some errors w/ Rails
|
63
|
+
#just some hack here to allow current examples' syntax:
|
64
|
+
#table_data.to_xml.write(open('result.xml', 'w'), 1)
|
65
|
+
#class String
|
66
|
+
# def write(stringio, add_indent=0)
|
67
|
+
# stringio.write((self.split("\n").collect { |line| (' ' * add_indent) + line }).join("\n"))
|
68
|
+
# end
|
69
|
+
#end
|
70
|
+
|
71
|
+
#Hack to simulate the ancestor::tag selector of XPath
module Hpricot
  class Elem
    #Walk from this element up towards the document.
    #
    #Without _tag_, returns an Hpricot::Elements holding this element followed
    #by each of its ancestors (the Hpricot::Doc itself is not included).
    #
    #With _tag_, returns the first element on that walk (starting with this
    #element itself) whose name equals _tag_. If there is none, the collected
    #Hpricot::Elements is returned instead.
    def ancestors(tag = nil)
      element = self
      path = Hpricot::Elements.new
      #Stop at the document node - or when a detached subtree simply runs
      #out of parents, in which case element becomes nil
      while element && element.class != Hpricot::Doc
        return element if tag && tag == element.name
        path.push element
        element = element.parent
      end
      path
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
|
4
|
+
#
|
5
|
+
class SharedUtils
|
6
|
+
#Entities to replace - need to make this more complete, or install htmlentities or similar package
|
7
|
+
ENTITIES = {
|
8
|
+
'quot' => '"',
|
9
|
+
'apos' => "'",
|
10
|
+
'amp' => '&',
|
11
|
+
'lt' => '<',
|
12
|
+
'gt' => '>',
|
13
|
+
'nbsp' => ' '}
|
14
|
+
|
15
|
+
def self.prepare_text_for_comparison(text)
|
16
|
+
unescape_entities text
|
17
|
+
text.strip!
|
18
|
+
text
|
19
|
+
end
|
20
|
+
|
21
|
+
#Unescape the entities in the HTML!
|
22
|
+
def self.unescape_entities(text)
|
23
|
+
ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
|
24
|
+
text
|
25
|
+
end
|
26
|
+
|
27
|
+
#Entry point for finding the elements specified by examples
|
28
|
+
def self.traverse_for_match(node, regexp)
|
29
|
+
results = []
|
30
|
+
traverse_for_match_inner = lambda { |node, regexp|
|
31
|
+
ft = prepare_text_for_comparison(node.inner_html.gsub(/<.*?>/, ''))
|
32
|
+
if ft =~ regexp
|
33
|
+
node.instance_eval do
|
34
|
+
@match_data = $~
|
35
|
+
def match_data
|
36
|
+
@match_data
|
37
|
+
end
|
38
|
+
end
|
39
|
+
results << node
|
40
|
+
results.delete node.parent if node.is_a? Hpricot::Elem
|
41
|
+
end
|
42
|
+
node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
|
43
|
+
}
|
44
|
+
traverse_for_match_inner.call(node,regexp)
|
45
|
+
results
|
46
|
+
end
|
47
|
+
|
48
|
+
def self.get_backtrace
|
49
|
+
begin
|
50
|
+
raise
|
51
|
+
rescue Exception => ex
|
52
|
+
backtrace = ex.backtrace
|
53
|
+
end
|
54
|
+
backtrace.slice!(0)
|
55
|
+
backtrace
|
56
|
+
end
|
57
|
+
end #end of class SharedUtils
|
58
|
+
end #end of module Scrubyt
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Scrubyt
  #=<tt>Lookup of simple examples</tt>
  #There are two types of string examples in scRUBYt! right now:
  #the simple example and the compound example.
  #
  #This class is responsible for finding elements matched by simple examples.
  #In the future probably more sophisticated matching algorithms will be added
  #(e.g. match the n-th which matches the text, or element that matches the
  #text but also contains a specific attribute etc.)
  class SimpleExampleLookup
    #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
    #The text can be also a mixed content text, e.g.
    #
    #  <a>Bon <b>nuit</b>, monsieur!</a>
    #
    #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
    #
    #The example may carry one of these suffixes/wrappers:
    #
    #  text/@attribute   - only the part before /@ is looked up
    #  text[2]           - look up text, but take the match with index 2
    #  element[text]     - look up text anywhere in a node, then climb up
    #                      to the enclosing node named element
    #
    #*parameters*
    #
    #_doc_ - the document handed to SharedUtils.traverse_for_match
    #
    #_text_ - the example string; it is not modified
    #
    #_next_link_ - accepted for interface compatibility; not used by this method
    #
    #_index_ - which of the matching nodes to return (overridden by text[n])
    #
    #Returns the node found, or nil.
    def self.find_node_from_text(doc, text, next_link=false, index = 0)
      #Work on a copy, so the caller's string is left alone (and frozen
      #strings are accepted).
      #NOTE(review): as it reads here both arguments are the same character,
      #which makes the replacement a no-op; presumably the second one was an
      #HTML entity originally - TODO confirm against the upstream source
      text = text.gsub('»', '»')
      final_element_name = nil
      #Process immediate attribute extraction (like "go to google.com/@href")
      if text =~ /.+\/@.+$/
        text = text.scan(/^(.+?)\/@.+$/)[0][0]
      elsif text =~ /.+\[\d+\]$/
        res = text.scan(/(.+)\[(\d+)\]$/)
        text = res[0][0]
        index = res[0][1].to_i
      elsif text =~ /.+\[.+\]$/
        final_element_name = text.scan(/^(.+?)\[/)[0][0]
        text = text.scan(/\[(.+?)\]/)[0][0]
      end
      escaped = Regexp.escape(text)
      if final_element_name
        #The text only has to occur somewhere inside the node here
        result = SharedUtils.traverse_for_match(doc,/#{escaped}/)[index]
        XPathUtils.traverse_up_until_name(result,final_element_name)
      else
        #A plain example has to match the text of the node as a whole
        SharedUtils.traverse_for_match(doc,/^#{escaped}$/)[index]
      end
    end
  end #End of class SimpleExampleLookup
end #End of module Scrubyt
|
@@ -0,0 +1,202 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
|
4
|
+
module Scrubyt
  ##
  #=<tt>Various XPath utility functions</tt>
  class XPathUtils

    #Find the LCA (Lowest Common Ancestor) of two nodes
    #
    #If both nodes have the same path to the root, the parent of _node1_ is
    #returned.
    def self.lowest_common_ancestor(node1, node2)
      path1 = traverse_up(node1)
      path2 = traverse_up(node2)
      return node1.parent if path1 == path2

      #Both paths end with the topmost element, so they are compared from
      #the top downwards; the first pair that differs hangs directly under
      #the common ancestor
      while (!path1.empty? && !path2.empty?)
        closure = path1.pop
        return closure.parent if (closure != path2.pop)
      end
      #One of the paths ran out: one node is an ancestor of the other
      path1.size > path2.size ? path1.last.parent : path2.last.parent
    end

    ##
    #Generate XPath for the given node
    #
    #*parameters*
    #
    #_node_ - The node we are looking up the XPath for
    #
    #_stopnode_ - The XPath generation is stopped and the XPath that
    #was generated so far is returned if this node is reached.
    #
    #_write_indices_ - whether the index inside the parent should be
    #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
    #
    #Returns the XPath as a string starting with "/", or nil if a _stopnode_
    #was given but is not among the ancestors of _node_.
    def self.generate_XPath(node, stopnode=nil, write_indices=false)
      steps = []
      found = false
      while !node.nil? && node.class != Hpricot::Doc do
        if node == stopnode
          found = true
          break
        end
        #Walking upwards, so every new step goes in front of the others
        steps.unshift(write_indices ? "#{node.name}[#{find_index(node)}]" : "#{node.name}")
        node = node.parent
      end
      #This condition ensures that if there is a stopnode, and we did not find it along the way,
      #we return nil (since the stopnode is not contained in the path at all)
      return nil if stopnode != nil && !found
      "/" + steps.join("/")
    end

    #Generate an XPath of the node with indices, relatively to the given
    #relative_root.
    #
    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
    #and the relative root's XPath is /a[1]/b[3], the result of the function will
    #be /c[5].
    #
    #Returns nil if _elem_ is the _relative_root_ itself.
    def self.generate_relative_XPath( elem,relative_root )
      return nil if (elem == relative_root)
      generate_XPath(elem, relative_root, true)
    end

    #Generate a generalized XPath (i.e. without indices) of the node,
    #relatively to the given relative_root.
    #
    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
    #and the relative root's XPath is /a[1]/b[3], the result of the function will
    #be /c.
    #
    #Returns nil if _elem_ is the _relative_root_ itself.
    def self.generate_generalized_relative_XPath( elem,relative_root )
      return nil if (elem == relative_root)
      generate_XPath(elem, relative_root, false)
    end

    #Find an image based on the src of the example
    #
    #*parameters*
    #
    #_doc_ - The containing document
    #
    #_example_ - The value of the src attribute of the img tag
    #This is convenient, since if the user right-clicks an image and
    #copies image location, this string will be copied to the clipboard
    #and thus can be easily pasted as an example. A trailing [n] after a
    #.jpg/.png/.gif/.jpeg name overrides _index_.
    #
    #_index_ - there might be more images with the same src on the page -
    #most typically the user will need the 0th - but if this is not the
    #case, there is the possibility to override this
    def self.find_image(doc, example, index=0)
      if example =~ /\.(jpg|png|gif|jpeg)(\[\d+\])$/
        res = example.scan(/(.+)\[(\d+)\]$/)
        example = res[0][0]
        index = res[0][1].to_i
      end
      #NOTE(review): the src is pasted into the XPath as is, so a value
      #containing a single quote will presumably break the expression
      (doc/"//img[@src='#{example}']")[index]
    end

    ##
    #Used to find the parent of a node with the given name - for example
    #find the <form> node which is the parent of the <input> node
    #
    #The search starts with _node_ itself. Returns nil when a nil node is
    #reached; if the walk arrives at the Hpricot::Doc without a hit, the
    #document is returned.
    def self.traverse_up_until_name(node, name)
      while node.class != Hpricot::Doc do
        #raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
        return nil unless node
        break if node.name == name
        node = node.parent
      end
      node
    end

    ##
    #Used when automatically looking up href attributes (for detail or next links)
    #If the detail pattern did not extract a link, we first look up its
    #children - and if we don't find a link, traverse up
    #
    #Returns _node_ itself if it carries the attribute, otherwise the first
    #descendant (in document order) that does, otherwise the nearest
    #ancestor that does, otherwise nil.
    def self.find_nearest_node_with_attribute(node, attribute)
      @node = nil
      return node if node.is_a?(Hpricot::Elem) && node[attribute]
      first_child_node_with_attribute(node, attribute)
      first_parent_node_with_attribute(node, attribute) if !@node
      @node
    end

    ##
    #Generate a relative XPath from two XPaths: a parent one, (which points higher in the tree),
    #and a child one. The result of the method is the relative XPath of the node pointed to
    #by the second XPath to the node pointed to by the first XPath.
    #
    #Indices are ignored while looking for the point where the two paths
    #part, but the returned steps keep the indices of _child_xpath_.
    def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
      original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
      child_steps = to_general_XPath(child_xpath).split('/').reject{|s|s==""}
      parent_steps = to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
      i = 0
      child_steps.zip(parent_steps).each_with_index do |pair,index|
        i = index
        break if pair[0] != pair[1]
      end
      "/" + original_child_xpath_parts[i..-1].join('/')
    end

    #Evaluate _xpath_ on _doc_ and generate the absolute XPath of the (first)
    #element found.
    #
    #NOTE(review): _generalize_ is handed to generate_XPath as write_indices,
    #so a true value produces an XPath *with* indices - confirm that this is
    #what the callers expect.
    def self.to_full_XPath(doc, xpath, generalize)
      elem = doc/xpath
      #elem.map[0] relied on map returning an array when called without a
      #block; newer Rubies return an enumerator there, so use first instead
      elem = elem.first if elem.is_a? Hpricot::Elements
      XPathUtils.generate_XPath(elem, nil, generalize)
    end

    #NOTE: the helpers below were preceded by a bare +private+, which has no
    #effect on methods defined with <tt>def self.</tt>. They are therefore
    #callable from outside and are left that way for backward compatibility.

    #Find the index of the child inside the parent
    #For example:
    #
    #        tr
    #      / | \
    #    td  td  td
    #    1   2   3
    #
    #The last row contains the indices of the td's from the
    #row above.
    #
    #Only siblings with the same name are counted, and - as in classic
    #XPath - the indices start with 1 (rather than 0).
    def self.find_index(node)
      c = 0
      node.parent.children.each do |child|
        if child.class == Hpricot::Elem
          c += 1 if (child.name == node.name)
          break if (node == child)
        end
      end
      c
    end

    #Collect _node_ and its ancestors (bottom-up) until the document or
    #_stopnode_ is reached; neither of those is part of the result.
    def self.traverse_up(node, stopnode=nil)
      path = []
      while node.class != Hpricot::Doc do
        break if node == stopnode
        path.push node
        node = node.parent
      end
      path
    end

    #Depth-first search in _node_ and its descendants for the first element
    #having _attribute_; the hit is stored in @node.
    def self.first_child_node_with_attribute(node, attribute)
      #Parentheses matter: "!node.instance_of? Hpricot::Elem || @node" is read
      #as instance_of?(Hpricot::Elem || @node) and never looks at @node, so
      #later hits used to overwrite the first one
      return if @node || !node.instance_of?(Hpricot::Elem)
      @node = node if node.attributes[attribute]
      #The nil guard is defensive: presumably children can be nil for
      #empty elements - TODO confirm against Hpricot
      (node.children || []).each { |child| first_child_node_with_attribute(child, attribute) }
    end

    #Walk from _node_ upwards and store the nearest element having
    #_attribute_ in @node.
    def self.first_parent_node_with_attribute(node, attribute)
      #See first_child_node_with_attribute for the parentheses; without them
      #the outermost ancestor with the attribute used to win
      return if @node || !node.instance_of?(Hpricot::Elem)
      @node = node if node.attributes[attribute]
      first_parent_node_with_attribute(node.parent, attribute)
    end

    #Strip every [...] predicate from _xpath_.
    def self.to_general_XPath(xpath)
      xpath.gsub(/\[.+?\]/) {""}
    end #End of method to_general_XPath
  end #End of class XPathUtils
end #End of module Scrubyt
|
data/lib/scrubyt.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
$KCODE = "u"
|
2
|
+
require "jcode"
|
3
|
+
|
4
|
+
#ruby core
|
5
|
+
require "open-uri"
|
6
|
+
require "erb"
|
7
|
+
|
8
|
+
#gems
|
9
|
+
require "rexml/text"
|
10
|
+
require "rubygems"
|
11
|
+
require "mechanize"
|
12
|
+
require "hpricot"
|
13
|
+
|
14
|
+
#scrubyt
|
15
|
+
require "#{File.dirname(__FILE__)}/scrubyt/logging"
|
16
|
+
require "#{File.dirname(__FILE__)}/scrubyt/utils/ruby_extensions.rb"
|
17
|
+
require "#{File.dirname(__FILE__)}/scrubyt/utils/xpathutils.rb"
|
18
|
+
require "#{File.dirname(__FILE__)}/scrubyt/utils/shared_utils.rb"
|
19
|
+
require "#{File.dirname(__FILE__)}/scrubyt/utils/simple_example_lookup.rb"
|
20
|
+
require "#{File.dirname(__FILE__)}/scrubyt/utils/compound_example_lookup.rb"
|
21
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint_adder.rb"
|
22
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint.rb"
|
23
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/result_indexer.rb"
|
24
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pre_filter_document.rb"
|
25
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
|
26
|
+
require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
|
27
|
+
require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
|
28
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
|
29
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
|
30
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
|
31
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
|
32
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
|
33
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/base_filter.rb"
|
34
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/attribute_filter.rb"
|
35
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/constant_filter.rb"
|
36
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/script_filter.rb"
|
37
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/text_filter.rb"
|
38
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/detail_page_filter.rb"
|
39
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/download_filter.rb"
|
40
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/html_subtree_filter.rb"
|
41
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/regexp_filter.rb"
|
42
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/tree_filter.rb"
|
43
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pattern.rb"
|
@@ -0,0 +1,60 @@
|
|
1
|
+
$lib_path = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
$:.unshift $lib_path
|
3
|
+
|
4
|
+
require 'scrubyt'
|
5
|
+
require 'test/unit'
|
6
|
+
|
7
|
+
#Run one blackbox test.
#
#The extractor in _test_path_ is loaded and run, exported with the 'lambda'
#template and run again from the exported code; both results have to equal
#the contents of the <name>.expected.xml file next to the test.
#
#*parameters*
#
#_test_path_ - path of the extractor file (a .rb file)
#
#_detailed_ - if true, nothing is silenced and the extractor, the exported
#code, the expected XML and both results are printed
#
#The working directory and $stdout are restored afterwards in any case.
def perform_test(test_path, detailed = false)
  out = $stdout
  $stdout = StringIO.new unless detailed
  cwd = Dir.getwd
  Dir.chdir(File.dirname(test_path))

  out.puts "Test: #{test_path}" if detailed
  out.puts "========== Print Output ==========" if detailed

  begin
    #[0..-4] cuts the three-character ".rb" extension off
    expected_xml = File.read(File.basename(test_path)[0..-4] + ".expected.xml")

    scrubyt_result_native = Scrubyt::Extractor.load(File.basename(test_path))

    exported_code = scrubyt_result_native.export({:template => 'lambda'})
    scrubyt_result_exported = Scrubyt::Extractor.define(&eval(exported_code))
  ensure
    if detailed
      out.puts "========== Native Extractor =========="
      out.puts IO.read(File.basename(test_path))
      out.puts "========== Exported Extractor =========="
      out.puts exported_code
      out.puts "========== Expected =========="
      out.puts expected_xml
      #If one of the steps above raised, the results are still nil here;
      #calling to_xml on them would hide the real error behind a NoMethodError
      out.puts "========== Result (native) =========="
      out.puts(scrubyt_result_native && scrubyt_result_native.to_xml)
      out.puts "========== Result (exported) =========="
      out.puts(scrubyt_result_exported && scrubyt_result_exported.to_xml)
    end
  end

  assert_equal expected_xml, scrubyt_result_native.to_xml
  assert_equal expected_xml, scrubyt_result_exported.to_xml
ensure
  #cwd is nil if the failure happened before it was recorded
  Dir.chdir(cwd) if cwd
  $stdout = out
end
|
44
|
+
|
45
|
+
#When this file is executed directly and a test path is given on the command
#line, run just that test in detailed mode and quit without starting the
#whole suite.
if __FILE__ == $0 && ARGV.first
  include Test::Unit::Assertions
  perform_test(ARGV.first, true)
  exit
end
|
50
|
+
|
51
|
+
#Defines one test method for every .rb file found (recursively) in the
#blackbox_tests directory next to this file; the methods are defined in
#sorted path order.
class BlackboxTest < Test::Unit::TestCase
  blackbox_dir = File.join(File.dirname(__FILE__), 'blackbox_tests', '**', '*.rb')

  Dir.glob(blackbox_dir).sort.each do |test_path|
    #Slashes are replaced, so the path can serve as the method name
    define_method("test_#{test_path.gsub('/', '_')}") { perform_test(test_path) }
  end
end
|