scrubyt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
module Scrubyt
  ##
  #=<tt>Represents the results of a pattern</tt>
  #
  #The results live in +childmap+, an array of single-entry hashes; each
  #hash maps one source to the array of results recorded for that source.
  class Result
    attr_reader :childmap, :instances

    #Start out with an empty childmap (unless one is already set)
    def initialize
      @childmap = [] unless @childmap
    end

    #Record +result+ under +source+.
    #
    #If an entry for +source+ already exists the result is appended to it
    #and nil is returned; otherwise a new single-entry hash is appended
    #to the childmap (and the childmap is returned).
    def add_result(source, result)
      existing = @childmap.find { |entry| entry.keys[0] == source }
      if existing
        existing[source] << result
        return
      end
      @childmap << {source => [result]}
    end

    #Return the array of results stored for +last_result+,
    #or nil if no entry has that key.
    def lookup(last_result)
      @childmap.each do |entry|
        match = entry.find { |key, _value| key == last_result }
        return match[1] if match
      end
      nil
    end
  end
end
29
+
30
+ #It roughly works like this:
31
+ #
32
+ # root
33
+ # source: nil
34
+ # childmap: [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
35
+
36
+ #table
37
+ # source: doc1
38
+ # childmap: [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
39
+
40
+ #row
41
+ # source: table1s1, table2s1, table3s1
42
+ # childmap: [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row4s1, row5s1]},
43
+ # {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row4s2, row5s2]}]
@@ -0,0 +1,84 @@
1
+ require 'rexml/document'
2
+
3
module Scrubyt
  ##
  #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
  class ResultDumper
    ##
    #Output the results as XML
    #
    #*parameters*
    #
    #_pattern_ - the root pattern; its last_result must respond to +each+
    #and yield the extracted documents one by one
    #
    #Returns a REXML::Document with a single <root> element under which
    #one element per extracted child result is generated.
    #
    #While dumping, the pattern's last_result is temporarily pointed at each
    #document in turn; it is restored afterwards, so the same pattern can
    #be dumped more than once.
    def self.to_xml(pattern)
      doc = REXML::Document.new
      root = REXML::Element.new('root')
      doc.add_element(root)
      all_extracted_docs = pattern.last_result
      begin
        all_extracted_docs.each do |lr|
          pattern.last_result = lr
          to_xml_recursive(pattern, root)
        end
      ensure
        #Put the original collection back - otherwise the pattern is left
        #pointing at the last single document
        pattern.last_result = all_extracted_docs
      end
      doc
    end

    ##
    #Output the text of the pattern; If this pattern is a tree, collect the text from its
    #result instance node; otherwise rely on the last_result
    def self.to_text(pattern)
      last_result = pattern.last_result
      return last_result unless pattern.type == Scrubyt::Pattern::PATTERN_TYPE_TREE
      text_of(last_result)
    end

    ##
    #Print some simple statistics on the extracted results, like the count of extracted
    #instances by each pattern
    def self.print_statistics(pattern)
      puts "\n" * 2
      print_statistics_recursive(pattern, 0)
      puts
    end

    #The methods below are internal helpers. Note that a bare 'private' has no
    #effect on methods defined with 'def self.', so the three recursive helpers
    #stay callable from outside (as they always were); only the helpers listed
    #in private_class_method at the bottom are really private.

    #Generate the XML for all the children of _pattern_ below _element_
    def self.to_xml_recursive(pattern, element)
      pattern.children.each do |child|
        childresults = child.result.lookup(child.parent.last_result)
        #Output text for leaf nodes only; Maybe add possibility to customize this later
        if childresults.nil?
          #NOTE(review): relies on the parent pattern responding to #size,
          #which is not defined in this file - confirm against Pattern
          if child.parent.size == 0 && !element.parent.is_a?(REXML::Document)
            element.text = clean_text(text_of(child.parent.last_result))
          end
          next
        end
        generate_children(child, childresults, element)
      end
    end

    #Add one element named after _child_ to _element_ for every entry of
    #_childresults_, then descend into the child's own children. Only
    #patterns without children get a text value.
    def self.generate_children(child, childresults, element)
      childresults.each do |childresult|
        #to_xml_recursive looks up the grandchildren by the child's last_result
        child.last_result = childresult
        child_node = REXML::Element.new(child.name)
        child_node.text = clean_text(text_of(child.last_result)) if child.children.empty?
        element.add_element(child_node)
        to_xml_recursive(child, child_node)
      end
    end

    #Print one line per pattern (except 'root'), indented by _depth_ spaces
    def self.print_statistics_recursive(pattern, depth)
      if pattern.name != 'root'
        puts((' ' * depth) + "#{pattern.name} extracted #{pattern.get_instance_count[pattern.name]} instances.")
      end
      pattern.children.each do |child|
        print_statistics_recursive(child, depth + 4)
      end
    end

    #Text of a result: a String is returned as is, nil yields an empty
    #string, anything else is asked for its text nodes via traverse_text
    def self.text_of(result)
      return result if result.instance_of? String
      return "" if result.nil?
      fragments = []
      result.traverse_text { |t| fragments << t.to_s }
      fragments.join
    end

    #Replace the &nbsp; entities with spaces and strip the surrounding whitespace
    def self.clean_text(text)
      text.gsub('&nbsp;') { ' ' }.strip
    end

    private_class_method :text_of, :clean_text
  end
end
@@ -0,0 +1,196 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+
4
module Scrubyt
  ##
  #=<tt>Various XPath utility functions</tt>
  class XPathUtils
    #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
    NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']

    #From the example text defined by the user, find the lowest possible node with the text 'text'.
    #The text can be also a mixed content text, e.g.
    #
    # <a>Bon <b>nuit</b>, monsieur!</a>
    #
    #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
    #
    #A message is printed if no element with the given text was found.
    #
    #NOTE(review): when nothing matches, the last element visited during
    #the search is still returned (nil only if the document contains no
    #element at all). This is the historical behaviour - confirm whether
    #callers would rather get nil.
    def self.find_node_from_text(doc, text)
      #The search state is kept in class-level instance variables,
      #so this method is not reentrant
      @node = nil
      @found = false
      traverse_for_text(doc, text)
      #@node is still nil if the document has no element at all
      lowest_possible_node_with_text(@node, text) unless @node.nil?
      #$Logger.warn("Node for example #{text} Not found!") if (@found == false)
      puts "Node for example #{text} Not found!" if (@found == false)
      @node
    end

    #Full text of the node; this is equivalent to Hpricot's inner_text. Will be
    #replaced if Hpricot 0.5 will be released
    def self.full_text(node)
      fragments = []
      node.traverse_text { |t| fragments << t.to_s }
      fragments.join
    end

    #Find the LCA (Lowest Common Ancestor) of two nodes
    #
    #If both nodes have the same path to the root (e.g. they are the
    #same node), the parent of the first node is returned.
    def self.lowest_common_ancestor(node1, node2)
      path1 = traverse_up(node1)
      path2 = traverse_up(node2)
      return node1.parent if path1 == path2

      #The paths end with the topmost node, so popping walks them from the
      #top down; the parent of the first differing node is the LCA
      while (!path1.empty? && !path2.empty?)
        closure = path1.pop
        return closure.parent if (closure != path2.pop)
      end
      #One path ran out: that node is an ancestor of the other one
      path1.size > path2.size ? path1.last.parent : path2.last.parent
    end

    ##
    #Generate XPath for the given node
    #
    #*parameters*
    #
    #_node_ - The node we are looking the XPath for
    #
    #_stopnode_ - The XPath generation is stopped and the XPath that
    #was generated so far is returned if this node is reached.
    #
    #_write_indices_ - whether the index inside the parent should be
    #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
    #
    #Returns the XPath as a String, or nil if a stopnode was given but
    #it is not an ancestor of the node.
    def self.generate_XPath(node, stopnode=nil, write_indices=false)
      steps = []
      found = false
      while node.class != Hpricot::Doc do
        if node == stopnode
          found = true
          break
        end
        step = "#{node.name}"
        step += "[#{find_index(node)}]" if write_indices
        steps.push step
        node = node.parent
      end
      #This condition ensures that if there is a stopnode, and we did not find it along the way,
      #we return nil (since the stopnode is not contained in the path at all)
      return nil if stopnode != nil && !found
      #The steps were collected bottom-up
      "/" + steps.reverse.join("/")
    end

    #Generate an XPath of the node with indices, relatively to the given
    #relative_root.
    #
    #For example if the elem's absolute XPath is /a/b/c,
    #and the relative root's XPath is a/b, the result of the function will
    #be /c.
    def self.generate_relative_XPath( elem,relative_root )
      return nil if (elem == relative_root)
      generate_XPath(elem, relative_root, true)
    end

    #Generate a generalized XPath (i.e. without indices) of the node,
    #relatively to the given relative_root.
    #
    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
    #and the relative root's XPath is a[1]/b[3], the result of the function will
    #be /c.
    def self.generate_generalized_relative_XPath( elem,relative_root )
      return nil if (elem == relative_root)
      generate_XPath(elem, relative_root, false)
    end

    #Find an image based on the src of the example
    #
    #*parameters*
    #
    #_doc_ - The containing document
    #
    #_example_ - The value of the src attribute of the img tag
    #This is convenient, since if the user right-clicks an image and
    #copies image location, this string will be copied to the clipboard
    #and thus can be easily pasted as an example
    #
    #_index_ - there might be more images with the same src on the page -
    #most typically the user will need the 0th - but if this is not the
    #case, there is the possibility to override this
    #
    #NOTE(review): the example is interpolated into the expression as is,
    #so a src containing a single quote will produce a broken expression.
    def self.find_image(doc, example, index=0)
      (doc/"img[@src='#{example}']")[index]
    end

    ##
    #Used to find the parent of a node with the given name - for example
    #find the <form> node which is the parent of the <input> node
    #
    #The search starts with the node itself; if no node with the given
    #name is found, the document is returned.
    def self.traverse_up_until_name(node, name)
      while node.class != Hpricot::Doc do
        break if node.name == name
        node = node.parent
      end
      node
    end

    #The methods below are internal helpers. Note that a bare 'private' has
    #no effect on methods defined with 'def self.', so they have always been
    #callable from outside; this is left unchanged.

    #Find the index of the child inside the parent
    #For example:
    #
    #        tr
    #      / | \
    #    td  td  td
    #    0   1   2
    #
    #The last row contains the indices of the td's from the
    #row above.
    #
    #Note that in classic XPath, the indices start with 1 (rather
    #than 0).
    def self.find_index(node)
      index = -1
      node.parent.children.each do |child|
        #Only elements with the same name are counted
        next unless child.class == Hpricot::Elem
        index += 1 if (child.name == node.name)
        break if (node == child)
      end
      index
    end

    #Collect the node and its ancestors, bottom-up, until the document
    #(or the stopnode) is reached; neither of these is included
    def self.traverse_up(node, stopnode=nil)
      path = []
      while node.class != Hpricot::Doc do
        break if node == stopnode
        path.push node
        node = node.parent
      end
      path
    end

    #Depth-first search for the first element whose full text (with the
    #&nbsp; entities replaced by spaces) equals _text_. The result is left
    #in @node and @found; the search stops once @found is set.
    def self.traverse_for_text(node, text)
      return if @found
      if (node.instance_of? Hpricot::Elem)
        @node = node
        @found = true if (full_text(node).gsub('&nbsp;'){' '} == text)
      end
      #presumably children can be nil for an empty element - TODO confirm
      #against the Hpricot version in use
      (node.children || []).each do |child|
        if child.instance_of? Hpricot::Doc
          #This used to call traverse_nodes, which is not defined anywhere
          #in this class
          traverse_for_text(child, text)
        elsif child.instance_of? Hpricot::Elem
          traverse_for_text(child, text) unless NON_CONTENT_TAGS.include? child.name
        end
      end
    end

    #Descend from _node_ and remember (in @node) the last visited node whose
    #full text equals _text_; text nodes are skipped. Unlike in
    #traverse_for_text, the &nbsp; entities are not replaced here.
    def self.lowest_possible_node_with_text(node, text)
      return if node.instance_of? Hpricot::Text
      @node = node if full_text(node) == text
      (node.children || []).each do |child|
        lowest_possible_node_with_text(child, text)
      end
    end
  end
end
@@ -0,0 +1,106 @@
1
+ #require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint')
2
+ #require File.join(File.dirname(__FILE__), '../..', 'lib', 'extractor')
3
+ #require File.join(File.dirname(__FILE__), '../..', 'lib', 'constraint_adder')
4
+ require 'scrubyt'
5
+ require 'test/unit'
6
+
7
#Tests for the constraints that can be attached to a pattern in an
#extractor definition. All the extractors read input/constraint_test.html
#(located next to this file).
#
#The assertions pass the expected value first and the actual value second,
#as assert_equal expects them, so that the failure messages read correctly.
class ConstraintTest < Test::Unit::TestCase

  #Chained attribute constraints are stored on the first filter of the
  #pattern, in the order they were added
  def test_presence_of_attribute_constraints
    data = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'ruby_diamond').ensure_presence_of_attribute('color' => 'red').
        ensure_absence_of_attribute('fill' => 'small_circles')
    end

    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE,
                 data.children[0].filters[0].constraints[0].type)
    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE,
                 data.children[0].filters[0].constraints[1].type)
  end

  #Chained ancestor node constraints are stored on the first filter of the
  #pattern, in the order they were added
  def test_presence_of_ancestor_node_constraints
    data = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
        ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
    end

    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE,
                 data.children[0].filters[0].constraints[0].type)
    assert_equal(Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE,
                 data.children[0].filters[0].constraints[1].type)
  end

  #The XML output of extractors using ancestor node constraints
  #(alone, chained, and combined with an attribute constraint)
  def test_ancestor_node_constraints
    data0 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham')
    end

    data1 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'funky_rectangle').ensure_presence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
    end

    data2 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'crispy_ham').
        ensure_absence_of_ancestor_node(:intersects_with, 'name' => 'spaghetti_ice')
    end

    data3 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'line').ensure_presence_of_ancestor_node(:contains, 'name' => 'fungus_ooze').
        ensure_presence_of_ancestor_node(:intersects_with, 'object' => 'funky_lemon')
    end

    data4 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'ruby_diamond').ensure_presence_of_ancestor_node(:contains, 'name' => 'chunky_bacon').
        ensure_absence_of_attribute 'thickness' => '2'
    end

    assert_equal("<root><shape>blue_circle</shape><shape>splatted_ellipse</shape></root>", data0.to_xml.to_s)
    assert_equal("<root><shape>splatted_ellipse</shape></root>", data1.to_xml.to_s)
    assert_equal("<root><shape>blue_circle</shape></root>", data2.to_xml.to_s)
    assert_equal("<root><shape>big_rectangle</shape></root>", data3.to_xml.to_s)
    assert_equal("<root><shape>ruby_diamond</shape></root>", data4.to_xml.to_s)
  end

  #The XML output of extractors using attribute presence constraints:
  #one attribute, two attributes, and attributes given with a nil value
  def test_attribute_constraints
    data0 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red'
    end

    data1 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")

      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red', 'size' => '10x20'
    end

    data2 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
      (shape 'ruby_diamond').ensure_presence_of_attribute 'color' => 'red', 'size' => nil
    end

    data3 = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/constraint_test.html")
      (shape 'ruby_diamond').ensure_presence_of_attribute 'thickness' => nil
    end

    assert_equal("<root><shape>funky_rectangle</shape><shape>blue_circle</shape><shape>shiny_diamond</shape><shape>clunky_ellipse</shape><shape>twinky_line</shape></root>", data0.to_xml.to_s)
    assert_equal("<root><shape>shiny_diamond</shape><shape>clunky_ellipse</shape></root>", data1.to_xml.to_s)
    assert_equal("<root><shape>funky_rectangle</shape><shape>blue_circle</shape><shape>shiny_diamond</shape><shape>clunky_ellipse</shape></root>", data2.to_xml.to_s)
    assert_equal("<root><shape>twinky_line</shape><shape>line</shape><shape>chunky_line</shape></root>", data3.to_xml.to_s)
  end
end
@@ -0,0 +1,93 @@
1
+ #require File.join(File.dirname(__FILE__), '../../lib', 'extractor.rb')
2
+ #require File.join(File.dirname(__FILE__), '../../lib', 'pattern.rb')
3
+ require 'scrubyt'
4
+ require 'test/unit'
5
+
6
#Tests for the pattern hierarchy built by Scrubyt::Extractor.define.
#All the extractors read input/test.html (located next to this file).
#
#The assertions pass the expected value first and the actual value second,
#as assert_equal expects them, so that the failure messages read correctly.
class ExtractorTest < Test::Unit::TestCase
  #A single pattern ends up as the only child of the root pattern
  def test_create_one_pattern
    pattern = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      pattern "x"
    end
    assert_instance_of(Scrubyt::Pattern, pattern)

    assert_equal("root", pattern.name)
    assert_equal('pattern', pattern.children[0].name)
    assert_equal(Scrubyt::Pattern::PATTERN_TYPE_ROOT, pattern.type)
    assert_equal(Scrubyt::Pattern::OUTPUT_TYPE_MODEL, pattern.output_type)

    assert_equal(false, pattern.generalize)
    assert_equal(true, pattern.children[0].generalize)
  end

  #A pattern defined with a block is a tree pattern below the root
  def test_create_child_pattern
    pattern = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      parent { child "x" }
    end

    assert_equal("root", pattern.name)
    assert_equal(Scrubyt::Pattern::PATTERN_TYPE_ROOT, pattern.type)
    assert_equal(Scrubyt::Pattern::OUTPUT_TYPE_MODEL, pattern.output_type)

    assert_equal("parent", pattern.children[0].name)
    assert_equal(Scrubyt::Pattern::PATTERN_TYPE_TREE, pattern.children[0].type)
    assert_equal(Scrubyt::Pattern::OUTPUT_TYPE_MODEL, pattern.children[0].output_type)
  end

  #All the patterns defined in one block share the same parent
  def test_create_more_children
    pattern = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      parent do
        child1 'x'
        child2 'y'
        child3 'z'
        child4 'a'
      end
    end

    assert_equal(4, pattern.children[0].children.size)

    #Every child has the same parent as the next one, and the 2nd, 3rd
    #and 4th child have no children of their own
    (0...3).each do |i|
      assert_equal(pattern.children[0].children[i].parent,
                   pattern.children[0].children[i + 1].parent)
      assert_equal([], pattern.children[0].children[i + 1].children)
    end
    assert_equal(pattern.children[0], pattern.children[0].children[3].parent)
    assert_equal(pattern, pattern.children[0].children[3].parent.parent)
  end

  #Nested blocks create nested patterns
  def test_create_hierarchy
    tree = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      a { b { c { d { e "x" } } } }
    end

    assert_equal("root", tree.name)
    assert_equal("a", tree.children[0].name)
    assert_equal("b", tree.children[0].children[0].name)
    assert_equal("c", tree.children[0].children[0].children[0].name)
    assert_equal("d", tree.children[0].children[0].children[0].children[0].name)
  end


  #Every pattern has a first filter, even if it was defined without
  #an example
  def test_empty_filter
    tree = Scrubyt::Extractor.define do
      fetch File.join(File.dirname(__FILE__), "input/test.html")
      a do
        b 'x'
        c 'y'
      end
    end

    assert_not_nil(tree.filters[0])
    assert_nil(tree.example)
    assert_not_nil(tree.children[0].filters[0])
    assert_nil(tree.children[0].example)
    assert_not_nil(tree.children[0].children[0].filters[0])
    assert_equal('x', tree.children[0].children[0].example)
    assert_not_nil(tree.children[0].children[1].filters[0])
    assert_equal('y', tree.children[0].children[1].example)
  end
end