scrubyt 0.2.8 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +32 -2
- data/Rakefile +25 -20
- data/lib/scrubyt.rb +24 -5
- data/lib/scrubyt/core/navigation/fetch_action.rb +76 -42
- data/lib/scrubyt/core/navigation/navigation_actions.rb +24 -6
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +2 -2
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +2 -1
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -2
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +37 -12
- data/lib/scrubyt/core/scraping/pattern.rb +82 -90
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +2 -1
- data/lib/scrubyt/core/shared/evaluation_context.rb +14 -37
- data/lib/scrubyt/core/shared/extractor.rb +55 -54
- data/lib/scrubyt/logging.rb +16 -0
- data/lib/scrubyt/output/export.rb +1 -1
- data/lib/scrubyt/output/post_processor.rb +6 -5
- data/lib/scrubyt/output/result.rb +1 -0
- data/lib/scrubyt/output/result_dumper.rb +4 -3
- data/lib/scrubyt/output/result_node.rb +73 -0
- data/lib/scrubyt/output/scrubyt_result.rb +28 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +8 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +14 -1
- data/lib/scrubyt/utils/xpathutils.rb +11 -0
- metadata +7 -12
- data/test/unittests/constraint_test.rb +0 -107
- data/test/unittests/extractor_test.rb +0 -91
- data/test/unittests/filter_test.rb +0 -79
- data/test/unittests/input/constraint_test.html +0 -55
- data/test/unittests/input/test.html +0 -39
- data/test/unittests/pattern_test.rb +0 -27
- data/test/unittests/simple_example_lookup_test.rb +0 -68
- data/test/unittests/xpathutils_test.rb +0 -152
@@ -1,39 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<body>
|
3
|
-
<table border=1 padding=10>
|
4
|
-
<tr>
|
5
|
-
<td>1</td>
|
6
|
-
<td>2</td>
|
7
|
-
</tr>
|
8
|
-
<tr>
|
9
|
-
<td>3</td>
|
10
|
-
<td>4</td>
|
11
|
-
<td>5</td>
|
12
|
-
</tr>
|
13
|
-
</table>
|
14
|
-
<br/>
|
15
|
-
<table border=1 padding=10>
|
16
|
-
<tr>
|
17
|
-
<td>6</td>
|
18
|
-
</tr>
|
19
|
-
<tr>
|
20
|
-
<td>7</td>
|
21
|
-
</tr>
|
22
|
-
<tr>
|
23
|
-
<td>8</td>
|
24
|
-
</tr>
|
25
|
-
</table>
|
26
|
-
<br>
|
27
|
-
<table border=1 padding=10>
|
28
|
-
<tr>
|
29
|
-
<td>9</td>
|
30
|
-
<td>10</td>
|
31
|
-
</tr>
|
32
|
-
<tr>
|
33
|
-
</tr>
|
34
|
-
<tr>
|
35
|
-
<td>11</td>
|
36
|
-
</tr>
|
37
|
-
</table>
|
38
|
-
</body>
|
39
|
-
</html>
|
@@ -1,27 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'scrubyt'
|
3
|
-
require 'test/unit'
|
4
|
-
|
5
|
-
class PatternTest < Test::Unit::TestCase
|
6
|
-
|
7
|
-
def test_select_indices
|
8
|
-
some_pattern = Scrubyt::Pattern.new('some_pattern')
|
9
|
-
some_pattern.select_indices(1..3)
|
10
|
-
assert_equal(some_pattern.result_indexer.indices_to_extract, [1,2,3])
|
11
|
-
some_pattern.select_indices([1])
|
12
|
-
assert_equal(some_pattern.result_indexer.indices_to_extract, [1])
|
13
|
-
some_pattern.select_indices([1,2,3])
|
14
|
-
assert_equal(some_pattern.result_indexer.indices_to_extract, [1,2,3])
|
15
|
-
some_pattern.select_indices(:first)
|
16
|
-
assert_equal(some_pattern.result_indexer.indices_to_extract, [:first])
|
17
|
-
some_pattern.select_indices([:first, :last])
|
18
|
-
assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,:last])
|
19
|
-
some_pattern.select_indices([:first, [5,6]])
|
20
|
-
assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,5,6])
|
21
|
-
some_pattern.select_indices([:first, 1..2])
|
22
|
-
assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,1,2])
|
23
|
-
some_pattern.select_indices([4..5, :first, [5,6]])
|
24
|
-
assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,4,5,6])
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
@@ -1,68 +0,0 @@
|
|
1
|
-
require 'scrubyt'
|
2
|
-
require 'test/unit'
|
3
|
-
|
4
|
-
class SimpleExampleLookupTest
|
5
|
-
|
6
|
-
def setup
|
7
|
-
doc1 = <<-DOC
|
8
|
-
<a>
|
9
|
-
<b>
|
10
|
-
<c/>
|
11
|
-
<d>dddd</d>
|
12
|
-
<e>
|
13
|
-
<f>fff</f>
|
14
|
-
<k>kk</k>
|
15
|
-
<j/>
|
16
|
-
<l>lll</l>
|
17
|
-
<m/>
|
18
|
-
<n>nnn</n>
|
19
|
-
<n>nnnnnn</n>
|
20
|
-
<n>
|
21
|
-
nnnnnnnnn
|
22
|
-
<q/>
|
23
|
-
<r>rrr</r>
|
24
|
-
</n>
|
25
|
-
<o>ooo</o>
|
26
|
-
<n>nnnnnnnnnnnn</n>
|
27
|
-
<p>ppp</p>
|
28
|
-
</e>
|
29
|
-
</b>
|
30
|
-
<g>ggg</g>
|
31
|
-
</a>
|
32
|
-
DOC
|
33
|
-
@doc1 = Hpricot(doc1)
|
34
|
-
@a = @doc1.children[1]
|
35
|
-
@b = @a.children[1]
|
36
|
-
@c = @b.children[1]
|
37
|
-
@d = @b.children[3]
|
38
|
-
@e = @b.children[5]
|
39
|
-
@f = @e.children[1]
|
40
|
-
@g = @a.children[@a.children.size-2]
|
41
|
-
@k = @e.children[3]
|
42
|
-
@j = @e.children[5]
|
43
|
-
@l = @e.children[7]
|
44
|
-
@m = @e.children[9]
|
45
|
-
@n_1 = @e.children[11]
|
46
|
-
@n_2 = @e.children[13]
|
47
|
-
@n_3 = @e.children[15]
|
48
|
-
@o = @e.children[17]
|
49
|
-
@n_4 = @e.children[19]
|
50
|
-
@p = @e.children[21]
|
51
|
-
@q = @n_3.children[1]
|
52
|
-
@r = @n_3.children[3]
|
53
|
-
#@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
|
54
|
-
end
|
55
|
-
|
56
|
-
def test_find_node_from_text
|
57
|
-
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
|
58
|
-
assert_instance_of(Hpricot::Elem, elem)
|
59
|
-
assert_equal(elem, @f)
|
60
|
-
|
61
|
-
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
|
62
|
-
assert_equal(elem, @d)
|
63
|
-
|
64
|
-
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
|
65
|
-
assert_equal(elem, @r)
|
66
|
-
|
67
|
-
end
|
68
|
-
end
|
@@ -1,152 +0,0 @@
|
|
1
|
-
#require File.join(File.dirname(__FILE__), '../../lib', 'xpathutils')
|
2
|
-
require 'scrubyt'
|
3
|
-
require 'test/unit'
|
4
|
-
|
5
|
-
class XPathUtilsTest < Test::Unit::TestCase
|
6
|
-
|
7
|
-
def setup
|
8
|
-
doc1 = <<-DOC
|
9
|
-
<a>
|
10
|
-
<b>
|
11
|
-
<c/>
|
12
|
-
<d>dddd</d>
|
13
|
-
<e>
|
14
|
-
<f>fff</f>
|
15
|
-
<k>kk</k>
|
16
|
-
<j/>
|
17
|
-
<l>lll</l>
|
18
|
-
<m/>
|
19
|
-
<n>nnn</n>
|
20
|
-
<n>nnnnnn</n>
|
21
|
-
<n>
|
22
|
-
nnnnnnnnn
|
23
|
-
<q/>
|
24
|
-
<r>rrr</r>
|
25
|
-
</n>
|
26
|
-
<o>ooo</o>
|
27
|
-
<n>nnnnnnnnnnnn</n>
|
28
|
-
<p>ppp</p>
|
29
|
-
</e>
|
30
|
-
</b>
|
31
|
-
<g>ggg</g>
|
32
|
-
</a>
|
33
|
-
DOC
|
34
|
-
@doc1 = Hpricot(doc1)
|
35
|
-
@a = @doc1.children[1]
|
36
|
-
@b = @a.children[1]
|
37
|
-
@c = @b.children[1]
|
38
|
-
@d = @b.children[3]
|
39
|
-
@e = @b.children[5]
|
40
|
-
@f = @e.children[1]
|
41
|
-
@g = @a.children[@a.children.size-2]
|
42
|
-
@k = @e.children[3]
|
43
|
-
@j = @e.children[5]
|
44
|
-
@l = @e.children[7]
|
45
|
-
@m = @e.children[9]
|
46
|
-
@n_1 = @e.children[11]
|
47
|
-
@n_2 = @e.children[13]
|
48
|
-
@n_3 = @e.children[15]
|
49
|
-
@o = @e.children[17]
|
50
|
-
@n_4 = @e.children[19]
|
51
|
-
@p = @e.children[21]
|
52
|
-
@q = @n_3.children[1]
|
53
|
-
@r = @n_3.children[3]
|
54
|
-
#@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
|
55
|
-
end
|
56
|
-
|
57
|
-
def test_lowest_common_ancestor
|
58
|
-
lca_b_g = Scrubyt::XPathUtils.lowest_common_ancestor(@b,@g)
|
59
|
-
lca_f_d = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@d)
|
60
|
-
lca_f_g = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@g)
|
61
|
-
lca_f_f = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@f)
|
62
|
-
lca_f_k = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@k)
|
63
|
-
lca_a_g = Scrubyt::XPathUtils.lowest_common_ancestor(@a,@g)
|
64
|
-
lca_q_r = Scrubyt::XPathUtils.lowest_common_ancestor(@q,@r)
|
65
|
-
lca_m_r = Scrubyt::XPathUtils.lowest_common_ancestor(@m,@r)
|
66
|
-
lca_n1_e = Scrubyt::XPathUtils.lowest_common_ancestor(@n_1,@e)
|
67
|
-
lca_r_b = Scrubyt::XPathUtils.lowest_common_ancestor(@r,@b)
|
68
|
-
lca_a_a = Scrubyt::XPathUtils.lowest_common_ancestor(@a,@a)
|
69
|
-
|
70
|
-
assert_equal(lca_b_g, @a)
|
71
|
-
assert_equal(lca_f_d, @b)
|
72
|
-
assert_equal(lca_f_g, @a)
|
73
|
-
assert_equal(lca_f_f, @e)
|
74
|
-
assert_equal(lca_f_k, @e)
|
75
|
-
assert_equal(lca_q_r, @n_3)
|
76
|
-
assert_equal(lca_m_r, @e)
|
77
|
-
assert_equal(lca_n1_e, @e)
|
78
|
-
assert_equal(lca_a_g, @a)
|
79
|
-
assert_equal(lca_a_a, @doc1)
|
80
|
-
assert_equal(lca_r_b, @b)
|
81
|
-
end
|
82
|
-
|
83
|
-
def test_find_index
|
84
|
-
assert_equal(Scrubyt::XPathUtils.find_index(@a), 1)
|
85
|
-
assert_equal(Scrubyt::XPathUtils.find_index(@b), 1)
|
86
|
-
assert_equal(Scrubyt::XPathUtils.find_index(@c), 1)
|
87
|
-
assert_equal(Scrubyt::XPathUtils.find_index(@d), 1)
|
88
|
-
assert_equal(Scrubyt::XPathUtils.find_index(@n_1), 1)
|
89
|
-
assert_equal(Scrubyt::XPathUtils.find_index(@n_2), 2)
|
90
|
-
assert_equal(Scrubyt::XPathUtils.find_index(@n_3), 3)
|
91
|
-
assert_equal(Scrubyt::XPathUtils.find_index(@n_4), 4)
|
92
|
-
assert_equal(Scrubyt::XPathUtils.find_index(@r), 1)
|
93
|
-
end
|
94
|
-
|
95
|
-
def test_generate_XPath
|
96
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@a), "/a")
|
97
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@b), "/a/b")
|
98
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@c), "/a/b/c")
|
99
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@d), "/a/b/d")
|
100
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@e), "/a/b/e")
|
101
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@f), "/a/b/e/f")
|
102
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_1), "/a/b/e/n")
|
103
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_2), "/a/b/e/n")
|
104
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_3), "/a/b/e/n")
|
105
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_4), "/a/b/e/n")
|
106
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@r), "/a/b/e/n/r")
|
107
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@g), "/a/g")
|
108
|
-
end
|
109
|
-
|
110
|
-
def test_generate_XPath_with_indices
|
111
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@a, nil, true), "/a[1]")
|
112
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@b, nil, true), "/a[1]/b[1]")
|
113
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@c, nil, true), "/a[1]/b[1]/c[1]")
|
114
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@d, nil, true), "/a[1]/b[1]/d[1]")
|
115
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@e, nil, true), "/a[1]/b[1]/e[1]")
|
116
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@f, nil, true), "/a[1]/b[1]/e[1]/f[1]")
|
117
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_1, nil, true), "/a[1]/b[1]/e[1]/n[1]")
|
118
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_2, nil, true), "/a[1]/b[1]/e[1]/n[2]")
|
119
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_3, nil, true), "/a[1]/b[1]/e[1]/n[3]")
|
120
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_4, nil, true), "/a[1]/b[1]/e[1]/n[4]")
|
121
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@p, nil, true), "/a[1]/b[1]/e[1]/p[1]")
|
122
|
-
assert_equal(Scrubyt::XPathUtils.generate_XPath(@r, nil, true), "/a[1]/b[1]/e[1]/n[3]/r[1]")
|
123
|
-
end
|
124
|
-
|
125
|
-
def test_generate_relative_XPath
|
126
|
-
assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@a,@a))
|
127
|
-
assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@b, @a), "/b[1]")
|
128
|
-
assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@c, @a), "/b[1]/c[1]")
|
129
|
-
assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@d, @a), "/b[1]/d[1]")
|
130
|
-
assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@f, @a), "/b[1]/e[1]/f[1]")
|
131
|
-
assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_1, @a), "/b[1]/e[1]/n[1]")
|
132
|
-
assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_2, @a), "/b[1]/e[1]/n[2]")
|
133
|
-
assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_3, @a), "/b[1]/e[1]/n[3]")
|
134
|
-
assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_4, @a), "/b[1]/e[1]/n[4]")
|
135
|
-
assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@r, @b), "/e[1]/n[3]/r[1]")
|
136
|
-
assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@q, @e), "/n[3]/q[1]")
|
137
|
-
|
138
|
-
assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@r, @n_2))
|
139
|
-
assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@q, @g))
|
140
|
-
assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@n_3, @n_2))
|
141
|
-
end
|
142
|
-
|
143
|
-
def test_generate_generalized_relative_XPath
|
144
|
-
assert_nil(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@b,@b))
|
145
|
-
assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@b, @a), "/b")
|
146
|
-
assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@f, @a), "/b/e/f")
|
147
|
-
assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@r, @n_3), "/r")
|
148
|
-
|
149
|
-
assert_nil(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@r, @n_2))
|
150
|
-
end
|
151
|
-
|
152
|
-
end
|