scrubyt 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,43 @@
1
module Scrubyt
  ##
  #=<tt>Represents the results of a pattern</tt>
  #
  #Results are stored in +childmap+: an array of single-entry hashes, each
  #mapping a source (the value passed to add_result) to the array of results
  #recorded for that source.
  class Result
    attr_reader :childmap, :instances

    def initialize
      @childmap = []
      #Nothing in this class ever assigns @instances; it is set explicitly so
      #the reader keeps returning nil without touching an undefined variable.
      @instances = nil
    end

    ##
    #Record +result+ under +source+.
    #
    #If an entry for +source+ already exists (its key compares equal with ==),
    #the result is appended to that entry and nil is returned; otherwise a new
    #single-entry hash is appended to the childmap and the childmap is returned.
    def add_result(source, result)
      entry = @childmap.find { |hash| hash.keys[0] == source }
      return @childmap << {source => [result]} unless entry

      #Append through the entry that matched instead of looking up
      #hash[source] again: the match above uses ==, while Hash lookup uses
      #eql?/hash, and the two can disagree (e.g. 1 == 1.0 but {1 => x}[1.0] is nil).
      entry.values[0] << result
      nil
    end

    ##
    #Return the array of results recorded for the source that compares equal
    #(==) to +last_result+, or nil if there is no such source.
    def lookup(last_result)
      @childmap.each do |hash|
        hash.each { |key, value| return value if key == last_result }
      end
      nil
    end
  end
end
29
+
30
+ #It roughly works like this:
31
+ #
32
+ # root
33
+ # source: nil
34
+ # childmap: [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
35
+
36
+ #table
37
+ # source: doc1
38
+ # childmap: [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
39
+
40
+ #row
41
+ # source: table1s1, table2s1, table3s1
42
+ # childmap: [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row3s1, row5s1]},
43
+ # {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row3s2, row5s2]}]
@@ -0,0 +1,84 @@
1
+ require 'rexml/document'
2
+
3
module Scrubyt
  ##
  #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
  class ResultDumper
    ##
    #Output the results as XML
    #
    #Builds a REXML::Document with a single <root> element and fills it by
    #walking the pattern tree once for every entry of pattern.last_result.
    #
    #While walking, pattern.last_result is temporarily set to the entry being
    #processed; the original value is put back afterwards (even if an error
    #is raised) so that the pattern can be dumped again.
    #
    #Returns the REXML::Document.
    def self.to_xml(pattern)
      doc = REXML::Document.new
      root = REXML::Element.new('root')
      doc.add_element(root)
      all_extracted_docs = pattern.last_result
      begin
        all_extracted_docs.each do |lr|
          pattern.last_result = lr
          to_xml_recursive(pattern, root)
        end
      ensure
        #Without this the pattern is left pointing at the last single
        #document and a second to_xml call would iterate the wrong object.
        pattern.last_result = all_extracted_docs
      end
      doc
    end

    ##
    #Output the text of the pattern; If this pattern is a tree, collect the text from its
    #result instance node; otherwise rely on the last_result
    def self.to_text(pattern)
      last_result = pattern.last_result
      return last_result unless pattern.type == Scrubyt::Pattern::PATTERN_TYPE_TREE
      collect_text(last_result)
    end

    ##
    #Print some simple statistics on the extracted results, like the count of extracted
    #instances by each pattern
    def self.print_statistics(pattern)
      puts "\n" * 2
      print_statistics_recursive(pattern, 0)
      puts
    end

    #The three methods below are internal helpers. NOTE: the original file put
    #a bare `private` in front of them, which has no effect on `def self.`
    #methods, so they have always been publicly callable. They are left public
    #here so that no existing caller breaks.

    ##
    #For every child of +pattern+, look up the results the child extracted
    #from the parent's current last_result and add them below +element+.
    #
    #When the lookup yields nothing, the parent's own text is written into
    #+element+ instead, but only if child.parent.size is 0 and +element+ is
    #not attached directly to the document.
    def self.to_xml_recursive(pattern, element)
      pattern.children.each do |child|
        childresults = child.result.lookup(child.parent.last_result)
        #Output text for leaf nodes only; Maybe add possibility to customize this later
        if childresults.nil?
          if child.parent.size == 0 && !element.parent.is_a?(REXML::Document)
            element.text = clean_text(collect_text(child.parent.last_result))
          end
          next
        end
        generate_children(child, childresults, element)
      end
    end

    ##
    #Add one XML element named after +child+ below +element+ for every entry
    #of +childresults+, then recurse into the child's own children.
    #Only childless patterns get a text value.
    def self.generate_children(child, childresults, element)
      childresults.each do |childresult|
        #last_result is the traversal cursor read by to_xml_recursive below
        child.last_result = childresult
        child_node = REXML::Element.new(child.name)
        child_node.text = clean_text(text_of(child.last_result)) if child.children.size == 0
        element.add_element(child_node)
        to_xml_recursive(child, child_node)
      end
    end

    ##
    #Print one indented line per pattern (the pattern named 'root' is
    #skipped); every level of the tree is indented by 4 more spaces.
    def self.print_statistics_recursive(pattern, depth)
      if pattern.name != 'root'
        puts((' ' * depth.to_i) + "#{pattern.name} extracted #{pattern.get_instance_count[pattern.name]} instances.")
      end
      pattern.children.each { |child| print_statistics_recursive(child, depth + 4) }
    end

    #Concatenate every text fragment yielded by node.traverse_text
    def self.collect_text(node)
      buffer = ''.dup
      node.traverse_text { |t| buffer << t.to_s }
      buffer
    end

    #Text of a single result: strings are used as they are, nil gives an
    #empty string, anything else is asked for its text via traverse_text
    def self.text_of(result)
      return result if result.instance_of?(String)
      return '' if result.nil?
      collect_text(result)
    end

    #Replace the literal '&nbsp;' entity with a space and trim the ends
    def self.clean_text(text)
      text.gsub('&nbsp;') { ' ' }.strip
    end

    private_class_method :collect_text, :text_of, :clean_text
  end
end
@@ -0,0 +1,196 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+
4
module Scrubyt
  ##
  #=<tt>Various XPath utility functions</tt>
  #
  #All helpers are class-level methods that work on Hpricot nodes.
  class XPathUtils
    #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
    NON_CONTENT_TAGS = ['form', 'option', 'input', 'script', 'noscript'].freeze

    #From the example text defined by the user, find the lowest possible node with the text 'text'.
    #The text can be also a mixed content text, e.g.
    #
    #  <a>Bonne <b>nuit</b>, monsieur!</a>
    #
    #In this case, <a>'s text is considered to be "Bonne nuit, monsieur!"
    #
    #The search state is kept in the class-level instance variables @node and
    #@found, so overlapping calls (e.g. from several threads) would interfere
    #with each other.
    #
    #NOTE: when no element matches, a message is printed and the node returned
    #is whatever the search visited last (nil only if no element was visited
    #at all). This is the historical behaviour and is kept for compatibility.
    def self.find_node_from_text(doc, text)
      @node = nil
      @found = false
      traverse_for_text(doc, text)
      #@node is still nil when the document contains no element at all
      lowest_possible_node_with_text(@node, text) unless @node.nil?
      #$Logger.warn("Node for example #{text} Not found!") if (@found == false)
      puts "Node for example #{text} Not found!" if (@found == false)
      @node
    end

    #Full text of the node; this is equivalent to Hpricot's inner_text. Will be
    #replaced if Hpricot 0.5 will be released
    def self.full_text(node)
      result = ''.dup
      node.traverse_text { |t| result << t.to_s }
      result
    end

    #Find the LCA (Lowest Common Ancestor) of two nodes
    #
    #If both nodes are the same node, its parent is returned. If one node is
    #an ancestor of the other, that ancestor is returned.
    def self.lowest_common_ancestor(node1, node2)
      path1 = traverse_up(node1)
      path2 = traverse_up(node2)
      return node1.parent if path1 == path2

      #Both paths end at the topmost element, so compare them from the end
      #and stop at the first level where they differ
      until path1.empty? || path2.empty?
        closure = path1.pop
        return closure.parent if closure != path2.pop
      end
      #One path ran out first: the shorter one belonged to an ancestor
      path1.size > path2.size ? path1.last.parent : path2.last.parent
    end

    ##
    #Generate XPath for the given node
    #
    #*parameters*
    #
    #_node_ - The node we are looking the XPath for
    #
    #_stopnode_ - The XPath generation is stopped and the XPath that
    #was generated so far is returned if this node is reached.
    #
    #_write_indices_ - whether the index inside the parent should be
    #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]. The indices are the
    #ones computed by find_index.
    #
    #Returns nil if a stopnode was given but is not an ancestor of the node.
    def self.generate_XPath(node, stopnode=nil, write_indices=false)
      steps = []
      found = false
      while node.class != Hpricot::Doc
        if node == stopnode
          found = true
          break
        end
        steps.push(write_indices ? "#{node.name}[#{find_index(node)}]" : node.name.to_s)
        node = node.parent
      end
      #This condition ensures that if there is a stopnode, and we did not find it along the way,
      #we return nil (since the stopnode is not contained in the path at all)
      return nil if !stopnode.nil? && !found
      #The steps were collected from the node upwards, hence the reverse
      '/' + steps.reverse.join('/')
    end

    #Generate an XPath of the node with indices, relatively to the given
    #relative_root.
    #
    #For example if the elem's absolute XPath is /a/b/c,
    #and the relative root's XPath is /a/b, the result of the function will
    #be /c (plus the index of c).
    def self.generate_relative_XPath( elem,relative_root )
      return nil if (elem == relative_root)
      generate_XPath(elem, relative_root, true)
    end

    #Generate a generalized XPath (i.e. without indices) of the node,
    #relatively to the given relative_root.
    #
    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
    #and the relative root's XPath is /a[1]/b[3], the result of the function will
    #be /c.
    def self.generate_generalized_relative_XPath( elem,relative_root )
      return nil if (elem == relative_root)
      generate_XPath(elem, relative_root, false)
    end

    #Find an image based on the src of the example
    #
    #*parameters*
    #
    #_doc_ - The containing document
    #
    #_example_ - The value of the src attribute of the img tag
    #This is convenient, since if the user right-clicks an image and
    #copies image location, this string will be copied to the clipboard
    #and thus can be easily pasted as an example
    #
    #_index_ - there might be more images with the same src on the page -
    #most typically the user will need the 0th - but if this is not the
    #case, there is the possibility to override this
    #
    #NOTE(review): _example_ is interpolated into the selector unescaped, so a
    #src containing a single quote produces a malformed selector.
    def self.find_image(doc, example, index=0)
      (doc/"img[@src='#{example}']")[index]
    end

    ##
    #Used to find the parent of a node with the given name - for example
    #find the <form> node which is the parent of the <input> node
    #
    #The node itself is returned if it already has that name; the document
    #is returned if no ancestor has it.
    def self.traverse_up_until_name(node, name)
      while node.class != Hpricot::Doc
        break if node.name == name
        node = node.parent
      end
      node
    end

    #The methods below are internal helpers. NOTE: the original file put a
    #bare `private` in front of them, which has no effect on `def self.`
    #methods, so they have always been publicly callable. They are left public
    #here so that no existing caller breaks.

    #Find the index of the child inside the parent
    #For example:
    #
    #       tr
    #     / | \
    #    td td td
    #    0  1  2
    #
    #The last row contains the indices of the td's from the
    #row above. Only element siblings with the same name are counted.
    #
    #Note that in classic XPath, the indices start with 1 (rather
    #than 0).
    def self.find_index(node)
      c = -1
      node.parent.children.each do |child|
        next unless child.class == Hpricot::Elem
        c += 1 if child.name == node.name
        break if node == child
      end
      c
    end

    #Collect the node and all of its ancestors (nearest first), stopping
    #before the document or before +stopnode+ if that is reached first
    def self.traverse_up(node, stopnode=nil)
      path = []
      while node.class != Hpricot::Doc
        break if node == stopnode
        path.push node
        node = node.parent
      end
      path
    end

    #Depth-first search for the first element whose full text (with '&nbsp;'
    #replaced by a space) equals +text+. Every visited element is stored in
    #@node; @found is set once a match is seen, which stops the search.
    #Subtrees of NON_CONTENT_TAGS elements are not entered.
    def self.traverse_for_text(node, text)
      return if @found
      if node.instance_of?(Hpricot::Elem)
        @node = node
        @found = true if normalize_nbsp(full_text(node)) == text
      end
      #children is guarded against nil defensively - presumably childless
      #nodes can report nil here; TODO confirm against Hpricot
      (node.children || []).each do |child|
        if child.instance_of?(Hpricot::Doc)
          #The original called an undefined method (traverse_nodes) here
          traverse_for_text(child, text)
        elsif child.instance_of?(Hpricot::Elem)
          traverse_for_text(child, text) unless NON_CONTENT_TAGS.include?(child.name)
        end
      end
    end

    #Starting from +node+, move @node down to the last descendant (in
    #document order) whose full text still equals +text+. The text is
    #compared the same way as in traverse_for_text, i.e. with '&nbsp;'
    #replaced by a space.
    def self.lowest_possible_node_with_text(node, text)
      return if node.instance_of?(Hpricot::Text)
      @node = node if normalize_nbsp(full_text(node)) == text
      (node.children || []).each do |child|
        lowest_possible_node_with_text(child, text)
      end
    end

    #Replace the literal '&nbsp;' entity with a plain space
    def self.normalize_nbsp(text)
      text.gsub('&nbsp;') { ' ' }
    end

    private_class_method :normalize_nbsp
  end
end
@@ -0,0 +1,106 @@
1
+ #require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint')
2
+ #require File.join(File.dirname(__FILE__), '../..', 'lib', 'extractor')
3
+ #require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint_adder')
4
+ require 'scrubyt'
5
+ require 'test/unit'
6
+
7
##
#Tests for the constraints that can be attached to a pattern: presence and
#absence of attributes and of ancestor nodes.
#
#All assert_equal calls use the (expected, actual) argument order so that
#failure messages label the two values correctly.
class ConstraintTest < Test::Unit::TestCase
  #HTML fixture used by every extractor in this file. It is a constant
  #(resolved lexically), so it can be referenced inside the
  #Scrubyt::Extractor.define blocks.
  INPUT_FILE = File.join(File.dirname(__FILE__), "input/constraint_test.html")

  #Chained attribute constraints are stored on the filter in call order
  def test_presence_of_attribute_constraints
    data = Scrubyt::Extractor.define do
      fetch INPUT_FILE

      (shape 'ruby_diamond').ensure_presence_of_attribute('color' => 'red').
        ensure_absence_of_attribute('fill' => 'small_circles')
    end

    constraints = data.children[0].filters[0].constraints
    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE,
                 constraints[0].type)
    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE,
                 constraints[1].type)
  end

  #Chained ancestor node constraints are stored on the filter in call order
  def test_presence_of_ancestor_node_constraints
    data = Scrubyt::Extractor.define do
      fetch INPUT_FILE

      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
        ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
    end

    constraints = data.children[0].filters[0].constraints
    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE,
                 constraints[0].type)
    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE,
                 constraints[1].type)
  end

  #Ancestor node constraints narrow down the extracted shapes
  def test_ancestor_node_constraints
    data0 = Scrubyt::Extractor.define do
      fetch INPUT_FILE

      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham')
    end

    data1 = Scrubyt::Extractor.define do
      fetch INPUT_FILE

      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
    end

    data2 = Scrubyt::Extractor.define do
      fetch INPUT_FILE

      (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
        ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
    end

    data3 = Scrubyt::Extractor.define do
      fetch INPUT_FILE

      (shape 'line').ensure_presence_of_ancestor_node(:contains, 'name' => 'fungus_ooze').
        ensure_presence_of_ancestor_node(:intersects_with, 'object' => 'funky_lemon')
    end

    data4 = Scrubyt::Extractor.define do
      fetch INPUT_FILE

      (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'chunky_bacon').
        ensure_absence_of_attribute 'thickness' => '2'
    end

    assert_equal("<root><shape>blue_circle</shape><shape>splatted_ellipse</shape></root>", data0.to_xml.to_s)
    assert_equal("<root><shape>splatted_ellipse</shape></root>", data1.to_xml.to_s)
    assert_equal("<root><shape>blue_circle</shape></root>", data2.to_xml.to_s)
    assert_equal("<root><shape>big_rectangle</shape></root>", data3.to_xml.to_s)
    assert_equal("<root><shape>ruby_diamond</shape></root>", data4.to_xml.to_s)
  end

  #Attribute constraints narrow down the extracted shapes
  def test_attribute_constraints
    data0 = Scrubyt::Extractor.define do
      fetch INPUT_FILE

      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red'
    end

    data1 = Scrubyt::Extractor.define do
      fetch INPUT_FILE

      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red', 'size' => '10x20'
    end

    data2 = Scrubyt::Extractor.define do
      fetch INPUT_FILE
      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red', 'size' => nil
    end

    data3 = Scrubyt::Extractor.define do
      fetch INPUT_FILE
      (shape 'ruby_diamond').ensure_presence_of_attribute 'thickness' => nil
    end

    assert_equal("<root><shape>funky_rectangle</shape><shape>blue_circle</shape><shape>shiny_diamond</shape><shape>clunky_ellipse</shape><shape>twinky_line</shape></root>", data0.to_xml.to_s)
    assert_equal("<root><shape>shiny_diamond</shape><shape>clunky_ellipse</shape></root>", data1.to_xml.to_s)
    assert_equal("<root><shape>funky_rectangle</shape><shape>blue_circle</shape><shape>shiny_diamond</shape><shape>clunky_ellipse</shape></root>", data2.to_xml.to_s)
    assert_equal("<root><shape>twinky_line</shape><shape>line</shape><shape>chunky_line</shape></root>", data3.to_xml.to_s)
  end
end
@@ -0,0 +1,93 @@
1
+ #require File.join(File.dirname(__FILE__), '../../lib', 'extractor.rb')
2
+ #require File.join(File.dirname(__FILE__), '../../lib', 'pattern.rb')
3
+ require 'scrubyt'
4
+ require 'test/unit'
5
+
6
##
#Tests for building pattern trees with Scrubyt::Extractor.define.
#
#All assert_equal calls use the (expected, actual) argument order so that
#failure messages label the two values correctly.
class ExtractorTest < Test::Unit::TestCase
  #A single pattern becomes the only child of the implicit root pattern
  def test_create_one_pattern
    pattern = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      pattern "x"
    end
    assert_instance_of(Scrubyt::Pattern, pattern)

    assert_equal("root", pattern.name)
    assert_equal('pattern', pattern.children[0].name)
    assert_equal(Scrubyt::Pattern::PATTERN_TYPE_ROOT, pattern.type)
    assert_equal(Scrubyt::Pattern::OUTPUT_TYPE_MODEL, pattern.output_type)

    assert_equal(false, pattern.generalize)
    assert_equal(true, pattern.children[0].generalize)
  end

  #A pattern with a block becomes a tree pattern below the root
  def test_create_child_pattern
    pattern = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      parent { child "x" }
    end

    assert_equal("root", pattern.name)
    assert_equal(Scrubyt::Pattern::PATTERN_TYPE_ROOT, pattern.type)
    assert_equal(Scrubyt::Pattern::OUTPUT_TYPE_MODEL, pattern.output_type)

    assert_equal("parent", pattern.children[0].name)
    assert_equal(Scrubyt::Pattern::PATTERN_TYPE_TREE, pattern.children[0].type)
    assert_equal(Scrubyt::Pattern::OUTPUT_TYPE_MODEL, pattern.children[0].output_type)
  end

  #Sibling patterns share the same parent and have no children of their own
  def test_create_more_children
    pattern = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      parent do
        child1 'x'
        child2 'y'
        child3 'z'
        child4 'a'
      end
    end

    parent_pattern = pattern.children[0]
    siblings = parent_pattern.children
    assert_equal(4, siblings.size)

    #Checked per sibling instead of with an index that was incremented
    #inside an argument list
    siblings.each do |sibling|
      assert_equal(parent_pattern, sibling.parent)
      assert_equal([], sibling.children)
    end
    assert_equal(pattern, siblings[3].parent.parent)
  end

  #Nested blocks produce one level of the pattern tree each
  def test_create_hierarchy
    tree = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      a { b { c { d { e "x" } } } }
    end

    assert_equal("root", tree.name)
    node = tree
    #Walk down the single-child chain, innermost pattern included
    ["a", "b", "c", "d", "e"].each do |expected_name|
      node = node.children[0]
      assert_equal(expected_name, node.name)
    end
  end

  #Every pattern gets a filter, but only patterns given an example have one
  def test_empty_filter
    tree = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      a do
        b 'x'
        c 'y'
      end
    end

    assert_not_nil(tree.filters[0])
    assert_nil(tree.example)
    assert_not_nil(tree.children[0].filters[0])
    assert_nil(tree.children[0].example)
    assert_not_nil(tree.children[0].children[0].filters[0])
    assert_equal('x', tree.children[0].children[0].example)
    assert_not_nil(tree.children[0].children[1].filters[0])
    assert_equal('y', tree.children[0].children[1].example)
  end
end