scrubyt 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +41 -0
- data/Rakefile +55 -0
- data/lib/scrubyt.rb +9 -0
- data/lib/scrubyt/constraint.rb +185 -0
- data/lib/scrubyt/constraint_adder.rb +86 -0
- data/lib/scrubyt/export.rb +187 -0
- data/lib/scrubyt/extractor.rb +187 -0
- data/lib/scrubyt/filter.rb +144 -0
- data/lib/scrubyt/pattern.rb +263 -0
- data/lib/scrubyt/result.rb +43 -0
- data/lib/scrubyt/result_dumper.rb +84 -0
- data/lib/scrubyt/xpathutils.rb +196 -0
- data/test/unittests/constraint_test.rb +106 -0
- data/test/unittests/extractor_test.rb +93 -0
- data/test/unittests/filter_test.rb +71 -0
- data/test/unittests/input/constraint_test.html +55 -0
- data/test/unittests/input/test.html +39 -0
- data/test/unittests/xpathutils_test.rb +165 -0
- metadata +63 -0
@@ -0,0 +1,43 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Represents the results of a pattern</tt>
|
4
|
+
class Result
  #childmap - array of single-entry hashes, each mapping a source to the
  #array of results recorded for it, in the order the sources were first added
  #instances - never assigned inside this class, so the reader returns nil
  #unless something else sets @instances
  attr_reader :childmap, :instances

  def initialize
    @childmap ||= []
  end

  #Record result under source.
  #
  #If an entry whose key == source already exists, the result is appended to
  #that entry's array; otherwise a new {source => [result]} entry is added.
  #The entry is located the same way lookup does it (by ==), so a source that
  #is == but not eql? to the stored key (e.g. 1 and 1.0) is appended to the
  #existing entry instead of raising on a failed hash access.
  def add_result(source, result)
    existing = lookup(source)
    if existing
      existing << result
      return
    end
    @childmap << {source => [result]}
  end

  #Return the array of results recorded for the first entry whose key ==
  #last_result, or nil if there is no such entry.
  def lookup(last_result)
    @childmap.each do |entry|
      entry.each { |key, value| return value if (key == last_result) }
    end
    nil
  end#end of method lookup
end#end of class Result
|
28
|
+
end#end of module Scrubyt
|
29
|
+
|
30
|
+
#It roughly works like this:
|
31
|
+
#
|
32
|
+
# root
|
33
|
+
# source: nil
|
34
|
+
# childmap: [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
|
35
|
+
|
36
|
+
#table
|
37
|
+
# source: doc1
|
38
|
+
# childmap: [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
|
39
|
+
|
40
|
+
#row
|
41
|
+
# source: table1s1, table2s1, table3s1
|
42
|
+
# childmap: [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row4s1, row5s1]},
|
43
|
+
# {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row4s2, row5s2]}]
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
|
3
|
+
module Scrubyt
|
4
|
+
##
|
5
|
+
#=<tt>Dumping the result in various formats and providing statistics on the results</tt>
|
6
|
+
class ResultDumper
  ##
  #Output the results as XML
  #
  #Builds a REXML document with a single <root> element and renders every
  #entry of pattern.last_result below it. While an entry is rendered,
  #pattern.last_result points at that entry; the original value is put back
  #afterwards so that the method can be called again on the same pattern.
  #
  #Returns the REXML::Document.
  def self.to_xml(pattern)
    doc = REXML::Document.new
    root = REXML::Element.new('root')
    doc.add_element(root)
    all_extracted_docs = pattern.last_result
    begin
      all_extracted_docs.each do |lr|
        pattern.last_result = lr
        to_xml_recursive(pattern, root)
      end
    ensure
      #Undo the temporary re-pointing of last_result done in the loop above
      pattern.last_result = all_extracted_docs
    end
    doc
  end

  ##
  #Output the text of the pattern; If this pattern is a tree, collect the text from its
  #result instance node; otherwise rely on the last_result
  def self.to_text(pattern)
    last_result = pattern.last_result
    return last_result unless pattern.type == Scrubyt::Pattern::PATTERN_TYPE_TREE
    result = ""
    last_result.traverse_text { |t| result += t.to_s }
    result
  end

  ##
  #Print some simple statistics on the extracted results, like the count of extracted
  #instances by each pattern
  def self.print_statistics(pattern)
    puts "\n" * 2
    print_statistics_recursive(pattern, 0)
    puts
  end

  #Internal helpers follow. They are deliberately NOT marked with a bare
  #'private': that keyword does not affect 'def self.' methods, so these have
  #always been publicly callable and stay that way.

  #Render every child pattern of pattern below element.
  #
  #For each child, the results recorded for the parent's current last_result
  #are looked up. When there are some, one element per result is generated
  #(see generate_children). When the lookup returns nil, the parent's text is
  #collected and assigned to element, but only if child.parent.size == 0 and
  #element is not the document's top-level element.
  #NOTE(review): what child.parent.size measures is not visible here - confirm
  #against Pattern.
  def self.to_xml_recursive(pattern, element)
    pattern.children.each do |child|
      childresults = child.result.lookup(child.parent.last_result)
      unless childresults.nil?
        generate_children(child, childresults, element)
        next
      end
      #Output text for leaf nodes only; Maybe add possibility to customize this later
      res = ""
      child.parent.last_result.traverse_text { |t| res += t.to_s }
      next unless child.parent.size == 0
      #NOTE(review): as rendered here the gsub swaps a space for a space; the
      #first literal was presumably a non-breaking space - confirm
      element.text = (res.gsub(' '){' '}).strip unless element.parent.is_a? REXML::Document
    end
  end

  #Append one element named after child to element for every entry of
  #childresults, then recurse into child's own children.
  #
  #child.last_result is set to the entry being rendered, which is what the
  #recursive lookup in to_xml_recursive relies on. The element gets a text
  #only when child has no children; String results are used verbatim, other
  #non-nil results have their text collected with traverse_text.
  def self.generate_children(child, childresults, element)
    childresults.size.times do |num|
      child.last_result = childresults[num]
      res = ""
      if child.last_result.instance_of? String
        res = child.last_result
      elsif child.last_result != nil
        child.last_result.traverse_text { |t| res += t.to_s }
      end
      child_node = REXML::Element.new(child.name)
      #NOTE(review): as rendered here the gsub swaps a space for a space; the
      #first literal was presumably a non-breaking space - confirm
      child_node.text = (res.gsub(' '){' '}).strip if (child.children.size == 0)
      element.add_element(child_node)
      to_xml_recursive(child, child_node)
    end
  end

  #Print one line per pattern (the pattern named 'root' is skipped), indented
  #by depth spaces, then recurse into the children with depth + 4.
  def self.print_statistics_recursive(pattern, depth)
    if pattern.name != 'root'
      puts((' ' * depth) + "#{pattern.name} extracted #{pattern.get_instance_count[pattern.name]} instances.")
    end
    pattern.children.each do |child|
      print_statistics_recursive(child, depth + 4)
    end
  end#end of method print_statistics_recursive
end #end of class ResultDumper
|
84
|
+
end #end of module Scrubyt
|
@@ -0,0 +1,196 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
|
4
|
+
module Scrubyt
|
5
|
+
##
|
6
|
+
#=<tt>Various XPath utility functions</tt>
|
7
|
+
class XPathUtils
|
8
|
+
#When looking up examples, do NOT recurse into these tags since they won't contain any usable info
|
9
|
+
NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
|
10
|
+
|
11
|
+
#From the example text defined by the user, find the lowest possible node with the text 'text'.
|
12
|
+
#The text can be also a mixed content text, e.g.
|
13
|
+
#
|
14
|
+
# <a>Bon <b>nuit</b>, monsieur!</a>
|
15
|
+
#
|
16
|
+
#In this case, <a>'s text is considered to be "Bon nuit, monsieur"
|
17
|
+
def self.find_node_from_text(doc, text)
|
18
|
+
@node = nil
|
19
|
+
@found = false
|
20
|
+
self.traverse_for_text(doc,text)
|
21
|
+
self.lowest_possible_node_with_text(@node, text)
|
22
|
+
#$Logger.warn("Node for example #{text} Not found!") if (@found == false)
|
23
|
+
puts "Node for example #{text} Not found!" if (@found == false)
|
24
|
+
@node
|
25
|
+
end
|
26
|
+
|
27
|
+
#Full text of the node; this is equivalent to Hpricot's inner_text. Will be
|
28
|
+
#replaced if Hpricot 0.5 will be released
|
29
|
+
def self.full_text(node)
|
30
|
+
result = ""
|
31
|
+
node.traverse_text { |t| result += t.to_s }
|
32
|
+
result
|
33
|
+
end
|
34
|
+
|
35
|
+
#Find the LCA (Lowest Common Ancestor) of two nodes
|
36
|
+
def self.lowest_common_ancestor(node1, node2)
|
37
|
+
path1 = traverse_up(node1)
|
38
|
+
path2 = traverse_up(node2)
|
39
|
+
return node1.parent if path1 == path2
|
40
|
+
|
41
|
+
closure = nil
|
42
|
+
while (!path1.empty? && !path2.empty?)
|
43
|
+
closure = path1.pop
|
44
|
+
return closure.parent if (closure != path2.pop)
|
45
|
+
end
|
46
|
+
path1.size > path2.size ? path1.last.parent : path2.last.parent
|
47
|
+
end
|
48
|
+
|
49
|
+
##
|
50
|
+
#Generate XPath for the given node
|
51
|
+
#
|
52
|
+
#*parameters*
|
53
|
+
#
|
54
|
+
#_node_ - The node we are looking the XPath for
|
55
|
+
#
|
56
|
+
#_stopnode_ - The Xpath generation is stopped and the XPath that
|
57
|
+
#was generated so far is returned if this node is reached.
|
58
|
+
#
|
59
|
+
#_write_indices_ - whether the index inside the parent shuold be
|
60
|
+
#added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
|
61
|
+
def self.generate_XPath(node, stopnode=nil, write_indices=false)
|
62
|
+
path = []
|
63
|
+
indices = []
|
64
|
+
found = false
|
65
|
+
while node.class != Hpricot::Doc do
|
66
|
+
if node == stopnode
|
67
|
+
found = true
|
68
|
+
break
|
69
|
+
end
|
70
|
+
path.push node.name
|
71
|
+
indices.push find_index(node) if write_indices
|
72
|
+
node = node.parent
|
73
|
+
end
|
74
|
+
#This condition ensures that if there is a stopnode, and we did not found it along the way,
|
75
|
+
#we return nil (since the stopnode is not contained in the path at all)
|
76
|
+
return nil if stopnode != nil && !found
|
77
|
+
result = ""
|
78
|
+
if write_indices
|
79
|
+
path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" }
|
80
|
+
else
|
81
|
+
path.reverse.each{ |node| result += "#{node}/" }
|
82
|
+
end
|
83
|
+
"/" + result.chop
|
84
|
+
end
|
85
|
+
|
86
|
+
#Generate an XPath of the node with indices, relatively to the given
|
87
|
+
#relative_root.
|
88
|
+
#
|
89
|
+
#For example if the elem's absolute XPath is /a/b/c,
|
90
|
+
#and the relative root's Xpath is a/b, the result of the function will
|
91
|
+
#be /c.
|
92
|
+
def self.generate_relative_XPath( elem,relative_root )
|
93
|
+
return nil if (elem == relative_root)
|
94
|
+
generate_XPath(elem, relative_root, true)
|
95
|
+
end
|
96
|
+
|
97
|
+
#Generate a generalized XPath (i.e. without indices) of the node,
|
98
|
+
#relatively to the given relative_root.
|
99
|
+
#
|
100
|
+
#For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
|
101
|
+
#and the relative root's Xpath is a[1]/b[3], the result of the function will
|
102
|
+
#be /c.
|
103
|
+
def self.generate_generalized_relative_XPath( elem,relative_root )
|
104
|
+
return nil if (elem == relative_root)
|
105
|
+
generate_XPath(elem, relative_root, false)
|
106
|
+
end
|
107
|
+
|
108
|
+
#Find an image based on the src of the example
|
109
|
+
#
|
110
|
+
#*parameters*
|
111
|
+
#
|
112
|
+
#_doc_ - The containing document
|
113
|
+
#
|
114
|
+
#_example_ - The value of the src attribute of the img tag
|
115
|
+
#This is convenient, since if the users rigth-clicks an image and
|
116
|
+
#copies image location, this string will be copied to the clipboard
|
117
|
+
#and thus can be easily pasted as an examle
|
118
|
+
#
|
119
|
+
#_index_ - there might be more images with the same src on the page -
|
120
|
+
#most typically the user will need the 0th - but if this is not the
|
121
|
+
#case, there is the possibility to override this
|
122
|
+
def self.find_image(doc, example, index=0)
|
123
|
+
(doc/"img[@src='#{example}']")[index]
|
124
|
+
end
|
125
|
+
|
126
|
+
##
|
127
|
+
#Used to find the parent of a node with the given name - for example
|
128
|
+
#find the <form> node which is the parent of the <input> node
|
129
|
+
def self.traverse_up_until_name(node, name)
|
130
|
+
while node.class != Hpricot::Doc do
|
131
|
+
break if node.name == name
|
132
|
+
node = node.parent
|
133
|
+
end
|
134
|
+
node
|
135
|
+
end
|
136
|
+
|
137
|
+
|
138
|
+
private
|
139
|
+
#Find the index of the child inside the parent
|
140
|
+
#For example:
|
141
|
+
#
|
142
|
+
# tr
|
143
|
+
# / | \
|
144
|
+
# td td td
|
145
|
+
# 0 1 2
|
146
|
+
#
|
147
|
+
#The last row contains the indices of the td's from the
|
148
|
+
#tow above.
|
149
|
+
#
|
150
|
+
#Note that in classic XPath, the indices start with 1 (rather
|
151
|
+
#than 0).
|
152
|
+
def self.find_index(node)
|
153
|
+
c = -1
|
154
|
+
node.parent.children.each do |child|
|
155
|
+
if child.class == Hpricot::Elem
|
156
|
+
c += 1 if (child.name == node.name)
|
157
|
+
break if (node == child)
|
158
|
+
end
|
159
|
+
end
|
160
|
+
c
|
161
|
+
end
|
162
|
+
|
163
|
+
def self.traverse_up(node, stopnode=nil)
|
164
|
+
path = []
|
165
|
+
while node.class != Hpricot::Doc do
|
166
|
+
break if node == stopnode
|
167
|
+
path.push node
|
168
|
+
node = node.parent
|
169
|
+
end
|
170
|
+
path
|
171
|
+
end
|
172
|
+
|
173
|
+
def self.traverse_for_text(node, text)
|
174
|
+
return if @found
|
175
|
+
if (node.instance_of? Hpricot::Elem)
|
176
|
+
@node = node
|
177
|
+
ft = full_text(node)
|
178
|
+
@found = true if (ft.gsub(' '){' '} == text)
|
179
|
+
end
|
180
|
+
node.children.each do |child|
|
181
|
+
traverse_nodes child if child.instance_of? Hpricot::Doc
|
182
|
+
if child.instance_of? Hpricot::Elem
|
183
|
+
traverse_for_text(child, text) unless NON_CONTENT_TAGS.include? child.name
|
184
|
+
end
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
def self.lowest_possible_node_with_text(node, text)
|
189
|
+
return if node.instance_of? Hpricot::Text
|
190
|
+
@node = node if full_text(node) == text
|
191
|
+
node.children.each do |child|
|
192
|
+
lowest_possible_node_with_text(child, text)
|
193
|
+
end
|
194
|
+
end #End of method lowest_possible_node_with_text
|
195
|
+
end #End of class XPathUtils
|
196
|
+
end #End of module Scrubyt
|
@@ -0,0 +1,106 @@
|
|
1
|
+
#require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint')
|
2
|
+
#require File.join(File.dirname(__FILE__), '../..', 'lib', 'extractor')
|
3
|
+
#require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint_adder')
|
4
|
+
require 'scrubyt'
|
5
|
+
require 'test/unit'
|
6
|
+
|
7
|
+
#Tests for the constraints that can be chained onto a pattern in an extractor
#definition. All tests run against input/constraint_test.html.
#
#assert_equal is always called as (expected, actual) so that failure
#messages label the two values correctly.
class ConstraintTest < Test::Unit::TestCase

  #Chained attribute constraints end up on the pattern's first filter, in
  #call order, with the matching constraint types
  def test_presence_of_attribute_constraints
    data = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'ruby_diamond').ensure_presence_of_attribute('color' => 'red').
        ensure_absence_of_attribute('fill' => 'small_circles')
    end

    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE,
                 data.children[0].filters[0].constraints[0].type)
    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE,
                 data.children[0].filters[0].constraints[1].type)
  end

  #Same as above, for the ancestor node constraints
  def test_presence_of_ancestor_node_constraints
    data = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
        ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
    end

    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE,
                 data.children[0].filters[0].constraints[0].type)
    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE,
                 data.children[0].filters[0].constraints[1].type)
  end

  #Ancestor node constraints, alone and combined with other constraints,
  #checked through the XML output of the extraction
  def test_ancestor_node_constraints
    data0 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham')
    end

    data1 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
    end

    data2 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
        ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
    end

    data3 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'line').ensure_presence_of_ancestor_node(:contains, 'name' => 'fungus_ooze').
        ensure_presence_of_ancestor_node(:intersects_with, 'object' => 'funky_lemon')
    end

    data4 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'chunky_bacon').
        ensure_absence_of_attribute 'thickness' => '2'
    end

    assert_equal("<root><shape>blue_circle</shape><shape>splatted_ellipse</shape></root>", data0.to_xml.to_s)
    assert_equal("<root><shape>splatted_ellipse</shape></root>", data1.to_xml.to_s)
    assert_equal("<root><shape>blue_circle</shape></root>", data2.to_xml.to_s)
    assert_equal("<root><shape>big_rectangle</shape></root>", data3.to_xml.to_s)
    assert_equal("<root><shape>ruby_diamond</shape></root>", data4.to_xml.to_s)
  end

  #Attribute presence constraints with one attribute, several attributes and
  #nil attribute values, checked through the XML output of the extraction
  def test_attribute_constraints
    data0 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red'
    end

    data1 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red', 'size' => '10x20'
    end

    data2 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red', 'size' => nil
    end

    data3 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
      (shape 'ruby_diamond').ensure_presence_of_attribute 'thickness' => nil
    end

    assert_equal("<root><shape>funky_rectangle</shape><shape>blue_circle</shape><shape>shiny_diamond</shape><shape>clunky_ellipse</shape><shape>twinky_line</shape></root>", data0.to_xml.to_s)
    assert_equal("<root><shape>shiny_diamond</shape><shape>clunky_ellipse</shape></root>", data1.to_xml.to_s)
    assert_equal("<root><shape>funky_rectangle</shape><shape>blue_circle</shape><shape>shiny_diamond</shape><shape>clunky_ellipse</shape></root>", data2.to_xml.to_s)
    assert_equal("<root><shape>twinky_line</shape><shape>line</shape><shape>chunky_line</shape></root>", data3.to_xml.to_s)
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#require File.join(File.dirname(__FILE__), '../../lib', 'extractor.rb')
|
2
|
+
#require File.join(File.dirname(__FILE__), '../../lib', 'pattern.rb')
|
3
|
+
require 'scrubyt'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
#Tests for the pattern tree built by Scrubyt::Extractor.define. All tests run
#against input/test.html.
#
#assert_equal is always called as (expected, actual) so that failure
#messages label the two values correctly.
class ExtractorTest < Test::Unit::TestCase
  #A single pattern becomes the only child of the implicit root pattern
  def test_create_one_pattern
    pattern = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      pattern "x"
    end
    assert_instance_of(Scrubyt::Pattern, pattern)

    assert_equal("root", pattern.name)
    assert_equal('pattern', pattern.children[0].name)
    assert_equal(Scrubyt::Pattern::PATTERN_TYPE_ROOT, pattern.type)
    assert_equal(Scrubyt::Pattern::OUTPUT_TYPE_MODEL, pattern.output_type)

    assert_equal(false, pattern.generalize)
    assert_equal(true, pattern.children[0].generalize)
  end

  #A pattern given a block becomes a tree pattern below the root
  def test_create_child_pattern
    pattern = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      parent { child "x" }
    end

    assert_equal("root", pattern.name)
    assert_equal(Scrubyt::Pattern::PATTERN_TYPE_ROOT, pattern.type)
    assert_equal(Scrubyt::Pattern::OUTPUT_TYPE_MODEL, pattern.output_type)

    assert_equal("parent", pattern.children[0].name)
    assert_equal(Scrubyt::Pattern::PATTERN_TYPE_TREE, pattern.children[0].type)
    assert_equal(Scrubyt::Pattern::OUTPUT_TYPE_MODEL, pattern.children[0].output_type)
  end

  #Sibling patterns share the same parent and have no children of their own
  def test_create_more_children
    pattern = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      parent do
        child1 'x'
        child2 'y'
        child3 'z'
        child4 'a'
      end
    end

    siblings = pattern.children[0].children
    assert_equal(4, siblings.size)

    #Compare every sibling with its predecessor
    (1..3).each do |i|
      assert_equal(siblings[i - 1].parent, siblings[i].parent)
      assert_equal([], siblings[i].children)
    end
    assert_equal(pattern.children[0], siblings[3].parent)
    assert_equal(pattern, siblings[3].parent.parent)
  end

  #Nested blocks produce a chain of nested patterns
  def test_create_hierarchy
    tree = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      a { b { c { d { e "x" } } } }
    end

    assert_equal("root", tree.name)
    assert_equal("a", tree.children[0].name)
    assert_equal("b", tree.children[0].children[0].name)
    assert_equal("c", tree.children[0].children[0].children[0].name)
    assert_equal("d", tree.children[0].children[0].children[0].children[0].name)
  end


  #Every pattern gets a filter, but only patterns defined with an example
  #string have an example
  def test_empty_filter
    tree = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      a do
        b 'x'
        c 'y'
      end
    end

    assert_not_nil(tree.filters[0])
    assert_nil(tree.example)
    assert_not_nil(tree.children[0].filters[0])
    assert_nil(tree.children[0].example)
    assert_not_nil(tree.children[0].children[0].filters[0])
    assert_equal('x', tree.children[0].children[0].example)
    assert_not_nil(tree.children[0].children[1].filters[0])
    assert_equal('y', tree.children[0].children[1].example)
  end
end
|