scrubyt 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,71 @@
1
+ #require File.join(File.dirname(__FILE__), '../..', 'lib', 'filter')
2
+ require 'scrubyt'
3
+ require 'test/unit'
4
+
5
+ class FilterTest < Test::Unit::TestCase
6
+ def test_determine_example_type
7
+ #Test children example
8
+ assert_equal(Scrubyt::Filter.determine_example_type(nil),
9
+ Scrubyt::Filter::EXAMPLE_TYPE_CHILDREN)
10
+ #Test image example
11
+ assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.png'),
12
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
13
+ assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.gif'),
14
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
15
+ assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.jpg'),
16
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
17
+ assert_equal(Scrubyt::Filter.determine_example_type('scrubyt.jpeg'),
18
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
19
+ assert_not_equal(Scrubyt::Filter.determine_example_type('scrubyt.zip'),
20
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
21
+ assert_not_equal(Scrubyt::Filter.determine_example_type('scrubyt.pif'),
22
+ Scrubyt::Filter::EXAMPLE_TYPE_IMAGE)
23
+ #Test XPaths
24
+ assert_equal(Scrubyt::Filter.determine_example_type('/p/img'),
25
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
26
+ assert_equal(Scrubyt::Filter.determine_example_type('/p'),
27
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
28
+ assert_equal(Scrubyt::Filter.determine_example_type('//p'),
29
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
30
+ assert_equal(Scrubyt::Filter.determine_example_type('/p//img'),
31
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
32
+ assert_equal(Scrubyt::Filter.determine_example_type('//p//img'),
33
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
34
+ assert_equal(Scrubyt::Filter.determine_example_type('/p[0]/img'),
35
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
36
+ assert_equal(Scrubyt::Filter.determine_example_type('/p[0]'),
37
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
38
+ assert_equal(Scrubyt::Filter.determine_example_type('//p[1]'),
39
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
40
+ assert_equal(Scrubyt::Filter.determine_example_type('/p[1]//img[2]'),
41
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
42
+ assert_equal(Scrubyt::Filter.determine_example_type('//p[1]//img'),
43
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
44
+ assert_equal(Scrubyt::Filter.determine_example_type('/table/tr/td//span/b'),
45
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
46
+ assert_equal(Scrubyt::Filter.determine_example_type('/table[0]//tr/td[1]/span[2]/b'),
47
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
48
+ assert_not_equal(Scrubyt::Filter.determine_example_type('table[0]//tr/td[1]/span[2]/b'),
49
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
50
+ assert_not_equal(Scrubyt::Filter.determine_example_type('/table[a]//tr/td[1]/span[2]/b'),
51
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
52
+ assert_not_equal(Scrubyt::Filter.determine_example_type('/tab2le[a]//tr/td[1]/span[2]/b'),
53
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
54
+ assert_not_equal(Scrubyt::Filter.determine_example_type('/table[a]///tr/td[1]/span[2]/b'),
55
+ Scrubyt::Filter::EXAMPLE_TYPE_XPATH)
56
+ #Test string example
57
+ assert_equal(Scrubyt::Filter.determine_example_type('Hello, world!'),
58
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
59
+ assert_equal(Scrubyt::Filter.determine_example_type('$1022'),
60
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
61
+ assert_equal(Scrubyt::Filter.determine_example_type('CANON'),
62
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
63
+ assert_equal(Scrubyt::Filter.determine_example_type('This is a string'),
64
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
65
+ assert_equal(Scrubyt::Filter.determine_example_type('45'),
66
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
67
+ assert_equal(Scrubyt::Filter.determine_example_type('td'),
68
+ Scrubyt::Filter::EXAMPLE_TYPE_STRING)
69
+
70
+ end
71
+ end
@@ -0,0 +1,55 @@
1
+ <shapes>
2
+
3
+ <!-- red shapes -->
4
+ <shape color='red' size='10x10' fill='none'>funky_rectangle<contains name='stuff' color='blue'/></shape>
5
+ <shape color='red' size='20x10' fill='small_circles'>blue_circle
6
+ <contains name='crispy_ham' color='blue'/>
7
+ <intersects_with object='banana'/>
8
+ </shape>
9
+ <shape color='red' size='10x20' fill='big_boxes'>
10
+ shiny_diamond
11
+ <intersects_with object='chunky_bacon'/>
12
+ </shape>
13
+ <shape color='red' size='10x20'>clunky_ellipse
14
+ <contains name='sliced_orange' color='blue'/>
15
+ <intersects_with object='nintendo_wii'/>
16
+ </shape>
17
+ <shape color='red' thickness='2'>twinky_line</shape>
18
+
19
+ <!-- green shapes -->
20
+ <shape color='green' size='18x5' fill='big_boxes'>boxy_rectangle
21
+ <contains name='banana_fluff' color='blue'/>
22
+ <intersects_with object='bacon_fudge'/>
23
+ </shape>
24
+ <shape color='green' size='20x10' fill='chunky_bacon'>whammed_circle
25
+ <contains name='big_ham' color='blue'/>
26
+ <intersects_with object='crispy_ham'/>
27
+ </shape>
28
+ <shape color='green' size='30x15' fill='chunky_bacon'>
29
+ spherical_diamond
30
+ <intersects_with object='crispy_orange'/>
31
+ </shape>
32
+ <shape color='green' size='20x30' fill='none'>avocado_ellipse
33
+ <contains name='avocado_fudge' color='blue'/>
34
+ </shape>
35
+ <shape color='green' thickness='2'>line</shape>
36
+
37
+ <!-- blue shapes -->
38
+ <shape color='blue' size='120x10' fill='small_circles'>big_rectangle
39
+ <contains name='fungus_ooze' color='blue'/>
40
+ <intersects_with object='funky_lemon'/>
41
+ </shape>
42
+ <shape color='blue' size='50x20' fill='small_circles'>crazy_circle
43
+ <contains name='nut_shake' color='blue'/>
44
+ <intersects_with object='crazy_nut'/>
45
+ </shape>
46
+ <shape color='blue' size='30x30' fill='big_boxes'>ruby_diamond<contains name='chunky_bacon' color='blue'/></shape>
47
+ <shape color='blue' size='5x12' fill='chunky_bacon'>splatted_ellipse
48
+ <contains name='crispy_ham' color='blue'/>
49
+ <intersects_with name='spaghetti_ice' color='blue'/>
50
+ </shape>
51
+ <shape color='blue' thickness='2'>chunky_line
52
+ <contains name='chunky_bacon' color='blue'/>
53
+ </shape>
54
+
55
+ </shapes>
@@ -0,0 +1,39 @@
1
+ <html>
2
+ <body>
3
+ <table border=1 padding=10>
4
+ <tr>
5
+ <td>1</td>
6
+ <td>2</td>
7
+ </tr>
8
+ <tr>
9
+ <td>3</td>
10
+ <td>4</td>
11
+ <td>5</td>
12
+ </tr>
13
+ </table>
14
+ <br/>
15
+ <table border=1 padding=10>
16
+ <tr>
17
+ <td>6</td>
18
+ </tr>
19
+ <tr>
20
+ <td>7</td>
21
+ </tr>
22
+ <tr>
23
+ <td>8</td>
24
+ </tr>
25
+ </table>
26
+ <br>
27
+ <table border=1 padding=10>
28
+ <tr>
29
+ <td>9</td>
30
+ <td>10</td>
31
+ </tr>
32
+ <tr>
33
+ </tr>
34
+ <tr>
35
+ <td>11</td>
36
+ </tr>
37
+ </table>
38
+ </body>
39
+ </html>
@@ -0,0 +1,165 @@
1
+ #require File.join(File.dirname(__FILE__), '../../lib', 'xpathutils')
2
+ require 'scrubyt'
3
+ require 'test/unit'
4
+
5
+ class XPathUtilsTest < Test::Unit::TestCase
6
+
7
+ def setup
8
+ doc1 = <<-DOC
9
+ <a>
10
+ <b>
11
+ <c/>
12
+ <d>dddd</d>
13
+ <e>
14
+ <f>fff</f>
15
+ <k>kk</k>
16
+ <j/>
17
+ <l>lll</l>
18
+ <m/>
19
+ <n>nnn</n>
20
+ <n>nnnnnn</n>
21
+ <n>
22
+ nnnnnnnnn
23
+ <q/>
24
+ <r>rrr</r>
25
+ </n>
26
+ <o>ooo</o>
27
+ <n>nnnnnnnnnnnn</n>
28
+ <p>ppp</p>
29
+ </e>
30
+ </b>
31
+ <g>ggg</g>
32
+ </a>
33
+ DOC
34
+ @doc1 = Hpricot(doc1)
35
+ @a = @doc1.children[1]
36
+ @b = @a.children[1]
37
+ @c = @b.children[1]
38
+ @d = @b.children[3]
39
+ @e = @b.children[5]
40
+ @f = @e.children[1]
41
+ @g = @a.children[@a.children.size-2]
42
+ @k = @e.children[3]
43
+ @j = @e.children[5]
44
+ @l = @e.children[7]
45
+ @m = @e.children[9]
46
+ @n_1 = @e.children[11]
47
+ @n_2 = @e.children[13]
48
+ @n_3 = @e.children[15]
49
+ @o = @e.children[17]
50
+ @n_4 = @e.children[19]
51
+ @p = @e.children[21]
52
+ @q = @n_3.children[1]
53
+ @r = @n_3.children[3]
54
+ #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
55
+ end
56
+
57
+ def test_find_node_from_text
58
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff")
59
+ assert_instance_of(Hpricot::Elem, elem)
60
+ assert_equal(elem, @f)
61
+
62
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd")
63
+ assert_equal(elem, @d)
64
+
65
+ elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr")
66
+ assert_equal(elem, @r)
67
+
68
+ end
69
+
70
+ def test_lowest_common_ancestor
71
+ lca_b_g = Scrubyt::XPathUtils.lowest_common_ancestor(@b,@g)
72
+ lca_f_d = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@d)
73
+ lca_f_g = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@g)
74
+ lca_f_f = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@f)
75
+ lca_f_k = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@k)
76
+ lca_a_g = Scrubyt::XPathUtils.lowest_common_ancestor(@a,@g)
77
+ lca_q_r = Scrubyt::XPathUtils.lowest_common_ancestor(@q,@r)
78
+ lca_m_r = Scrubyt::XPathUtils.lowest_common_ancestor(@m,@r)
79
+ lca_n1_e = Scrubyt::XPathUtils.lowest_common_ancestor(@n_1,@e)
80
+ lca_r_b = Scrubyt::XPathUtils.lowest_common_ancestor(@r,@b)
81
+ lca_a_a = Scrubyt::XPathUtils.lowest_common_ancestor(@a,@a)
82
+
83
+ assert_equal(lca_b_g, @a)
84
+ assert_equal(lca_f_d, @b)
85
+ assert_equal(lca_f_g, @a)
86
+ assert_equal(lca_f_f, @e)
87
+ assert_equal(lca_f_k, @e)
88
+ assert_equal(lca_q_r, @n_3)
89
+ assert_equal(lca_m_r, @e)
90
+ assert_equal(lca_n1_e, @e)
91
+ assert_equal(lca_a_g, @a)
92
+ assert_equal(lca_a_a, @doc1)
93
+ assert_equal(lca_r_b, @b)
94
+ end
95
+
96
+ def test_find_index
97
+ assert_equal(Scrubyt::XPathUtils.find_index(@a), 0)
98
+ assert_equal(Scrubyt::XPathUtils.find_index(@b), 0)
99
+ assert_equal(Scrubyt::XPathUtils.find_index(@c), 0)
100
+ assert_equal(Scrubyt::XPathUtils.find_index(@d), 0)
101
+ assert_equal(Scrubyt::XPathUtils.find_index(@n_1), 0)
102
+ assert_equal(Scrubyt::XPathUtils.find_index(@n_2), 1)
103
+ assert_equal(Scrubyt::XPathUtils.find_index(@n_3), 2)
104
+ assert_equal(Scrubyt::XPathUtils.find_index(@n_4), 3)
105
+ assert_equal(Scrubyt::XPathUtils.find_index(@r), 0)
106
+ end
107
+
108
+ def test_generate_XPath
109
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@a), "/a")
110
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@b), "/a/b")
111
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@c), "/a/b/c")
112
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@d), "/a/b/d")
113
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@e), "/a/b/e")
114
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@f), "/a/b/e/f")
115
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_1), "/a/b/e/n")
116
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_2), "/a/b/e/n")
117
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_3), "/a/b/e/n")
118
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_4), "/a/b/e/n")
119
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@r), "/a/b/e/n/r")
120
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@g), "/a/g")
121
+ end
122
+
123
+ def test_generate_XPath_with_indices
124
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@a, nil, true), "/a[0]")
125
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@b, nil, true), "/a[0]/b[0]")
126
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@c, nil, true), "/a[0]/b[0]/c[0]")
127
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@d, nil, true), "/a[0]/b[0]/d[0]")
128
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@e, nil, true), "/a[0]/b[0]/e[0]")
129
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@f, nil, true), "/a[0]/b[0]/e[0]/f[0]")
130
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_1, nil, true), "/a[0]/b[0]/e[0]/n[0]")
131
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_2, nil, true), "/a[0]/b[0]/e[0]/n[1]")
132
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_3, nil, true), "/a[0]/b[0]/e[0]/n[2]")
133
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_4, nil, true), "/a[0]/b[0]/e[0]/n[3]")
134
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@p, nil, true), "/a[0]/b[0]/e[0]/p[0]")
135
+ assert_equal(Scrubyt::XPathUtils.generate_XPath(@r, nil, true), "/a[0]/b[0]/e[0]/n[2]/r[0]")
136
+ end
137
+
138
+ def test_generate_relative_XPath
139
+ assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@a,@a))
140
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@b, @a), "/b[0]")
141
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@c, @a), "/b[0]/c[0]")
142
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@d, @a), "/b[0]/d[0]")
143
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@f, @a), "/b[0]/e[0]/f[0]")
144
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_1, @a), "/b[0]/e[0]/n[0]")
145
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_2, @a), "/b[0]/e[0]/n[1]")
146
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_3, @a), "/b[0]/e[0]/n[2]")
147
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_4, @a), "/b[0]/e[0]/n[3]")
148
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@r, @b), "/e[0]/n[2]/r[0]")
149
+ assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@q, @e), "/n[2]/q[0]")
150
+
151
+ assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@r, @n_2))
152
+ assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@q, @g))
153
+ assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@n_3, @n_2))
154
+ end
155
+
156
+ def test_generate_generalized_relative_XPath
157
+ assert_nil(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@b,@b))
158
+ assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@b, @a), "/b")
159
+ assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@f, @a), "/b/e/f")
160
+ assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@r, @n_3), "/r")
161
+
162
+ assert_nil(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@r, @n_2))
163
+ end
164
+
165
+ end
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: scrubyt
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2007-01-15 00:00:00 +01:00
8
+ summary: A powerful Web-scraping framework
9
+ require_paths:
10
+ - lib
11
+ email: peter@rubyrailways.com
12
+ homepage: http://www.scrubyt.rubyforge.org
13
+ rubyforge_project:
14
+ description: scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. Its most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows you to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Peter Szinek
31
+ files:
32
+ - README
33
+ - Rakefile
34
+ - lib/scrubyt.rb
35
+ - lib/scrubyt/constraint_adder.rb
36
+ - lib/scrubyt/constraint.rb
37
+ - lib/scrubyt/result_dumper.rb
38
+ - lib/scrubyt/export.rb
39
+ - lib/scrubyt/extractor.rb
40
+ - lib/scrubyt/filter.rb
41
+ - lib/scrubyt/pattern.rb
42
+ - lib/scrubyt/result.rb
43
+ - lib/scrubyt/xpathutils.rb
44
+ test_files:
45
+ - test/unittests/filter_test.rb
46
+ - test/unittests/input
47
+ - test/unittests/extractor_test.rb
48
+ - test/unittests/xpathutils_test.rb
49
+ - test/unittests/constraint_test.rb
50
+ - test/unittests/input/constraint_test.html
51
+ - test/unittests/input/test.html
52
+ rdoc_options: []
53
+
54
+ extra_rdoc_files: []
55
+
56
+ executables: []
57
+
58
+ extensions: []
59
+
60
+ requirements: []
61
+
62
+ dependencies: []
63
+