scrubyt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,71 @@
1
+ #require File.join(File.dirname(__FILE__), '../..', 'lib', 'filter')
2
+ require 'scrubyt'
3
+ require 'test/unit'
4
+
5
+ class FilterTest < Test::Unit::TestCase
6
+ def test_determine_example_type
7
+ #Test children example
8
+ assert_equal(Scrubyt::Filter.determine_example_type(nil),
9
+ Scrubyt::Filter::EXAMPLE_TYPE_CHILDREN)
10
+ #Test image example
11
+ assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.png'),
12
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
13
+ assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.gif'),
14
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
15
+ assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.jpg'),
16
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
17
+ assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.jpeg'),
18
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
19
+ assert_not_equal(Scrubyt::Filter.determine_example_type('scrubyt.zip'),
20
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
21
+ assert_not_equal(Scrubyt::Filter.determine_example_type('scrubyt.pif'),
22
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
23
+ #Test XPaths
24
+ assert_equal(Scrubyt::Filter.determine_example_type('/p/img'),
25
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
26
+ assert_equal(Scrubyt::Filter.determine_example_type('/p'),
27
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
28
+ assert_equal(Scrubyt::Filter.determine_example_type('//p'),
29
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
30
+ assert_equal(Scrubyt::Filter.determine_example_type('/p//img'),
31
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
32
+ assert_equal(Scrubyt::Filter.determine_example_type('//p//img'),
33
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
34
+ assert_equal(Scrubyt::Filter.determine_example_type('/p[0]/img'),
35
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
36
+ assert_equal(Scrubyt::Filter.determine_example_type('/p[0]'),
37
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
38
+ assert_equal(Scrubyt::Filter.determine_example_type('//p[1]'),
39
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
40
+ assert_equal(Scrubyt::Filter.determine_example_type('/p[1]//img[2]'),
41
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
42
+ assert_equal(Scrubyt::Filter.determine_example_type('//p[1]//img'),
43
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
44
+ assert_equal(Scrubyt::Filter.determine_example_type('/table/tr/td//span/b'),
45
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
46
+ assert_equal(Scrubyt::Filter.determine_example_type('/table[0]//tr/td[1]/span[2]/b'),
47
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
48
+ assert_not_equal(Scrubyt::Filter.determine_example_type('table[0]//tr/td[1]/span[2]/b'),
49
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
50
+ assert_not_equal(Scrubyt::Filter.determine_example_type('/table[a]//tr/td[1]/span[2]/b'),
51
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
52
+ assert_not_equal(Scrubyt::Filter.determine_example_type('/tab2le[a]//tr/td[1]/span[2]/b'),
53
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
54
+ assert_not_equal(Scrubyt::Filter.determine_example_type('/table[a]///tr/td[1]/span[2]/b'),
55
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
56
+ #Test string example
57
+ assert_equal(Scrubyt::Filter.determine_example_type('Hello, world!'),
58
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
59
+ assert_equal(Scrubyt::Filter.determine_example_type('$1022'),
60
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
61
+ assert_equal(Scrubyt::Filter.determine_example_type('CANON'),
62
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
63
+ assert_equal(Scrubyt::Filter.determine_example_type('This is a string'),
64
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
65
+ assert_equal(Scrubyt::Filter.determine_example_type('45'),
66
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
67
+ assert_equal(Scrubyt::Filter.determine_example_type('td'),
68
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
69
+
70
+ end
71
+ end
@@ -0,0 +1,55 @@
1
+ <shapes>
2
+
3
+ <!-- red shapes -->
4
+ <shape color='red' size='10x10' fill='none'>funky_rectangle<contains name='stuff' color='blue'/></shape>
5
+ <shape color='red' size='20x10' fill='small_circles'>blue_circle
6
+ <contains name='crispy_ham' color='blue'/>
7
+ <intersects_with object='banana'/>
8
+ </shape>
9
+ <shape color='red' size='10x20' fill='big_boxes'>
10
+ shiny_diamond
11
+ <intersects_with object='chunky_bacon'/>
12
+ </shape>
13
+ <shape color='red' size='10x20'>clunky_ellipse
14
+ <contains name='sliced_orange' color='blue'/>
15
+ <intersects_with object='nintendo_wii'/>
16
+ </shape>
17
+ <shape color='red' thickness='2'>twinky_line</shape>
18
+
19
+ <!-- green shapes -->
20
+ <shape color='green' size='18x5' fill='big_boxes'>boxy_rectangle
21
+ <contains name='banana_fluff' color='blue'/>
22
+ <intersects_with object='bacon_fudge'/>
23
+ </shape>
24
+ <shape color='green' size='20x10' fill='chunky_bacon'>whammed_circle
25
+ <contains name='big_ham' color='blue'/>
26
+ <intersects_with object='crispy_ham'/>
27
+ </shape>
28
+ <shape color='green' size='30x15' fill='chunky_bacon'>
29
+ spherical_diamond
30
+ <intersects_with object='crispy_orange'/>
31
+ </shape>
32
+ <shape color='green' size='20x30' fill='none'>avocado_ellipse
33
+ <contains name='avocado_fudge' color='blue'/>
34
+ </shape>
35
+ <shape color='green' thickness='2'>line</shape>
36
+
37
+ <!-- blue shapes -->
38
+ <shape color='blue' size='120x10' fill='small_circles'>big_rectangle
39
+ <contains name='fungus_ooze' color='blue'/>
40
+ <intersects_with object='funky_lemon'/>
41
+ </shape>
42
+ <shape color='blue' size='50x20' fill='small_circles'>crazy_circle
43
+ <contains name='nut_shake' color='blue'/>
44
+ <intersects_with object='crazy_nut'/>
45
+ </shape>
46
+ <shape color='blue' size='30x30' fill='big_boxes'>ruby_diamond<contains name='chunky_bacon' color='blue'/></shape>
47
+ <shape color='blue' size='5x12' fill='chunky_bacon'>splatted_ellipse
48
+ <contains name='crispy_ham' color='blue'/>
49
+ <intersects_with name='spaghetti_ice' color='blue'/>
50
+ </shape>
51
+ <shape color='blue' thickness='2'>chunky_line
52
+ <contains name='chunky_bacon' color='blue'/>
53
+ </shape>
54
+
55
+ </shapes>
@@ -0,0 +1,39 @@
1
+ <html>
2
+ <body>
3
+ <table border=1 padding=10>
4
+ <tr>
5
+ <td>1</td>
6
+ <td>2</td>
7
+ </tr>
8
+ <tr>
9
+ <td>3</td>
10
+ <td>4</td>
11
+ <td>5</td>
12
+ </tr>
13
+ </table>
14
+ <br/>
15
+ <table border=1 padding=10>
16
+ <tr>
17
+ <td>6</td>
18
+ </tr>
19
+ <tr>
20
+ <td>7</td>
21
+ </tr>
22
+ <tr>
23
+ <td>8</td>
24
+ </tr>
25
+ </table>
26
+ <br>
27
+ <table border=1 padding=10>
28
+ <tr>
29
+ <td>9</td>
30
+ <td>10</td>
31
+ </tr>
32
+ <tr>
33
+ </tr>
34
+ <tr>
35
+ <td>11</td>
36
+ </tr>
37
+ </table>
38
+ </body>
39
+ </html>
@@ -0,0 +1,165 @@
1
+ #require File.join(File.dirname(__FILE__), '../../lib', 'xpathutils')
2
+ require 'scrubyt'
3
+ require 'test/unit'
4
+
5
+ class XPathUtilsTest < Test::Unit::TestCase
6
+
7
+ def setup
8
+ doc1 = <<-DOC
9
+ <a>
10
+ <b>
11
+ <c/>
12
+ <d>dddd</d>
13
+ <e>
14
+ <f>fff</f>
15
+ <k>kk</k>
16
+ <j/>
17
+ <l>lll</l>
18
+ <m/>
19
+ <n>nnn</n>
20
+ <n>nnnnnn</n>
21
+ <n>
22
+ nnnnnnnnn
23
+ <q/>
24
+ <r>rrr</r>
25
+ </n>
26
+ <o>ooo</o>
27
+ <n>nnnnnnnnnnnn</n>
28
+ <p>ppp</p>
29
+ </e>
30
+ </b>
31
+ <g>ggg</g>
32
+ </a>
33
+ DOC
34
+ @doc1 = Hpricot(doc1)
35
+ @a = @doc1.children[1]
36
+ @b = @a.children[1]
37
+ @c = @b.children[1]
38
+ @d = @b.children[3]
39
+ @e = @b.children[5]
40
+ @f = @e.children[1]
41
+ @g = @a.children[@a.children.size-2]
42
+ @k = @e.children[3]
43
+ @j = @e.children[5]
44
+ @l = @e.children[7]
45
+ @m = @e.children[9]
46
+ @n_1 = @e.children[11]
47
+ @n_2 = @e.children[13]
48
+ @n_3 = @e.children[15]
49
+ @o = @e.children[17]
50
+ @n_4 = @e.children[19]
51
+ @p = @e.children[21]
52
+ @q = @n_3.children[1]
53
+ @r = @n_3.children[3]
54
+ #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
55
+ end
56
+
57
+ def test_find_node_from_text
58
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff")
59
+ assert_instance_of(Hpricot::Elem, elem)
60
+ assert_equal(elem, @f)
61
+
62
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd")
63
+ assert_equal(elem, @d)
64
+
65
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr")
66
+ assert_equal(elem, @r)
67
+
68
+ end
69
+
70
+ def test_lowest_common_ancestor
71
+ lca_b_g = Scrubyt::XPathUtils.lowest_common_ancestor(@b,@g)
72
+ lca_f_d = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@d)
73
+ lca_f_g = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@g)
74
+ lca_f_f = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@f)
75
+ lca_f_k = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@k)
76
+ lca_a_g = Scrubyt::XPathUtils.lowest_common_ancestor(@a,@g)
77
+ lca_q_r = Scrubyt::XPathUtils.lowest_common_ancestor(@q,@r)
78
+ lca_m_r = Scrubyt::XPathUtils.lowest_common_ancestor(@m,@r)
79
+ lca_n1_e = Scrubyt::XPathUtils.lowest_common_ancestor(@n_1,@e)
80
+ lca_r_b = Scrubyt::XPathUtils.lowest_common_ancestor(@r,@b)
81
+ lca_a_a = Scrubyt::XPathUtils.lowest_common_ancestor(@a,@a)
82
+
83
+ assert_equal(lca_b_g, @a)
84
+ assert_equal(lca_f_d, @b)
85
+ assert_equal(lca_f_g, @a)
86
+ assert_equal(lca_f_f, @e)
87
+ assert_equal(lca_f_k, @e)
88
+ assert_equal(lca_q_r, @n_3)
89
+ assert_equal(lca_m_r, @e)
90
+ assert_equal(lca_n1_e, @e)
91
+ assert_equal(lca_a_g, @a)
92
+ assert_equal(lca_a_a, @doc1)
93
+ assert_equal(lca_r_b, @b)
94
+ end
95
+
96
+ def test_find_index
97
+ assert_equal(Scrubyt::XPathUtils.find_index(@a), 0)
98
+ assert_equal(Scrubyt::XPathUtils.find_index(@b), 0)
99
+ assert_equal(Scrubyt::XPathUtils.find_index(@c), 0)
100
+ assert_equal(Scrubyt::XPathUtils.find_index(@d), 0)
101
+ assert_equal(Scrubyt::XPathUtils.find_index(@n_1), 0)
102
+ assert_equal(Scrubyt::XPathUtils.find_index(@n_2), 1)
103
+ assert_equal(Scrubyt::XPathUtils.find_index(@n_3), 2)
104
+ assert_equal(Scrubyt::XPathUtils.find_index(@n_4), 3)
105
+ assert_equal(Scrubyt::XPathUtils.find_index(@r), 0)
106
+ end
107
+
108
+ def test_generate_XPath
109
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@a), "/a")
110
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@b), "/a/b")
111
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@c), "/a/b/c")
112
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@d), "/a/b/d")
113
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@e), "/a/b/e")
114
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@f), "/a/b/e/f")
115
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_1), "/a/b/e/n")
116
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_2), "/a/b/e/n")
117
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_3), "/a/b/e/n")
118
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_4), "/a/b/e/n")
119
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@r), "/a/b/e/n/r")
120
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@g), "/a/g")
121
+ end
122
+
123
+ def test_generate_XPath_with_indices
124
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@a, nil, true), "/a[0]")
125
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@b, nil, true), "/a[0]/b[0]")
126
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@c, nil, true), "/a[0]/b[0]/c[0]")
127
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@d, nil, true), "/a[0]/b[0]/d[0]")
128
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@e, nil, true), "/a[0]/b[0]/e[0]")
129
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@f, nil, true), "/a[0]/b[0]/e[0]/f[0]")
130
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_1, nil, true), "/a[0]/b[0]/e[0]/n[0]")
131
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_2, nil, true), "/a[0]/b[0]/e[0]/n[1]")
132
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_3, nil, true), "/a[0]/b[0]/e[0]/n[2]")
133
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_4, nil, true), "/a[0]/b[0]/e[0]/n[3]")
134
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@p, nil, true), "/a[0]/b[0]/e[0]/p[0]")
135
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@r, nil, true), "/a[0]/b[0]/e[0]/n[2]/r[0]")
136
+ end
137
+
138
+ def test_generate_relative_XPath
139
+ assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@a,@a))
140
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@b, @a), "/b[0]")
141
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@c, @a), "/b[0]/c[0]")
142
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@d, @a), "/b[0]/d[0]")
143
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@f, @a), "/b[0]/e[0]/f[0]")
144
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_1, @a), "/b[0]/e[0]/n[0]")
145
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_2, @a), "/b[0]/e[0]/n[1]")
146
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_3, @a), "/b[0]/e[0]/n[2]")
147
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_4, @a), "/b[0]/e[0]/n[3]")
148
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@r, @b), "/e[0]/n[2]/r[0]")
149
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@q, @e), "/n[2]/q[0]")
150
+
151
+ assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@r, @n_2))
152
+ assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@q, @g))
153
+ assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@n_3, @n_2))
154
+ end
155
+
156
+ def test_generate_generalized_relative_XPath
157
+ assert_nil(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@b,@b))
158
+ assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@b, @a), "/b")
159
+ assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@f, @a), "/b/e/f")
160
+ assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@r, @n_3), "/r")
161
+
162
+ assert_nil(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@r, @n_2))
163
+ end
164
+
165
+ end
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: scrubyt
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2007-01-15 00:00:00 +01:00
8
+ summary: A powerful Web-scraping framework
9
+ require_paths:
10
+ - lib
11
+ email: peter@rubyrailways.com
12
+ homepage: http://www.scrubyt.rubyforge.org
13
+ rubyforge_project:
14
+ description: scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. Its most interesting part is a Web-scraping DSL built on Hpricot and WWW::Mechanize, which allows you to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Peter Szinek
31
+ files:
32
+ - README
33
+ - Rakefile
34
+ - lib/scrubyt.rb
35
+ - lib/scrubyt/constraint_adder.rb
36
+ - lib/scrubyt/constraint.rb
37
+ - lib/scrubyt/result_dumper.rb
38
+ - lib/scrubyt/export.rb
39
+ - lib/scrubyt/extractor.rb
40
+ - lib/scrubyt/filter.rb
41
+ - lib/scrubyt/pattern.rb
42
+ - lib/scrubyt/result.rb
43
+ - lib/scrubyt/xpathutils.rb
44
+ test_files:
45
+ - test/unittests/filter_test.rb
46
+ - test/unittests/input
47
+ - test/unittests/extractor_test.rb
48
+ - test/unittests/xpathutils_test.rb
49
+ - test/unittests/constraint_test.rb
50
+ - test/unittests/input/constraint_test.html
51
+ - test/unittests/input/test.html
52
+ rdoc_options: []
53
+
54
+ extra_rdoc_files: []
55
+
56
+ executables: []
57
+
58
+ extensions: []
59
+
60
+ requirements: []
61
+
62
+ dependencies: []
63
+