scrubyt 0.2.8 → 0.3.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (33)
  1. data/CHANGELOG +32 -2
  2. data/Rakefile +25 -20
  3. data/lib/scrubyt.rb +24 -5
  4. data/lib/scrubyt/core/navigation/fetch_action.rb +76 -42
  5. data/lib/scrubyt/core/navigation/navigation_actions.rb +24 -6
  6. data/lib/scrubyt/core/scraping/filters/base_filter.rb +5 -5
  7. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +2 -2
  8. data/lib/scrubyt/core/scraping/filters/download_filter.rb +2 -1
  9. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -2
  10. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +37 -12
  11. data/lib/scrubyt/core/scraping/pattern.rb +82 -90
  12. data/lib/scrubyt/core/scraping/pre_filter_document.rb +2 -1
  13. data/lib/scrubyt/core/shared/evaluation_context.rb +14 -37
  14. data/lib/scrubyt/core/shared/extractor.rb +55 -54
  15. data/lib/scrubyt/logging.rb +16 -0
  16. data/lib/scrubyt/output/export.rb +1 -1
  17. data/lib/scrubyt/output/post_processor.rb +6 -5
  18. data/lib/scrubyt/output/result.rb +1 -0
  19. data/lib/scrubyt/output/result_dumper.rb +4 -3
  20. data/lib/scrubyt/output/result_node.rb +73 -0
  21. data/lib/scrubyt/output/scrubyt_result.rb +28 -0
  22. data/lib/scrubyt/utils/ruby_extensions.rb +8 -0
  23. data/lib/scrubyt/utils/simple_example_lookup.rb +14 -1
  24. data/lib/scrubyt/utils/xpathutils.rb +11 -0
  25. metadata +7 -12
  26. data/test/unittests/constraint_test.rb +0 -107
  27. data/test/unittests/extractor_test.rb +0 -91
  28. data/test/unittests/filter_test.rb +0 -79
  29. data/test/unittests/input/constraint_test.html +0 -55
  30. data/test/unittests/input/test.html +0 -39
  31. data/test/unittests/pattern_test.rb +0 -27
  32. data/test/unittests/simple_example_lookup_test.rb +0 -68
  33. data/test/unittests/xpathutils_test.rb +0 -152
@@ -1,39 +0,0 @@
1
- <html>
2
- <body>
3
- <table border=1 padding=10>
4
- <tr>
5
- <td>1</td>
6
- <td>2</td>
7
- </tr>
8
- <tr>
9
- <td>3</td>
10
- <td>4</td>
11
- <td>5</td>
12
- </tr>
13
- </table>
14
- <br/>
15
- <table border=1 padding=10>
16
- <tr>
17
- <td>6</td>
18
- </tr>
19
- <tr>
20
- <td>7</td>
21
- </tr>
22
- <tr>
23
- <td>8</td>
24
- </tr>
25
- </table>
26
- <br>
27
- <table border=1 padding=10>
28
- <tr>
29
- <td>9</td>
30
- <td>10</td>
31
- </tr>
32
- <tr>
33
- </tr>
34
- <tr>
35
- <td>11</td>
36
- </tr>
37
- </table>
38
- </body>
39
- </html>
@@ -1,27 +0,0 @@
1
- require 'rubygems'
2
- require 'scrubyt'
3
- require 'test/unit'
4
-
5
- class PatternTest < Test::Unit::TestCase
6
-
7
- def test_select_indices
8
- some_pattern = Scrubyt::Pattern.new('some_pattern')
9
- some_pattern.select_indices(1..3)
10
- assert_equal(some_pattern.result_indexer.indices_to_extract, [1,2,3])
11
- some_pattern.select_indices([1])
12
- assert_equal(some_pattern.result_indexer.indices_to_extract, [1])
13
- some_pattern.select_indices([1,2,3])
14
- assert_equal(some_pattern.result_indexer.indices_to_extract, [1,2,3])
15
- some_pattern.select_indices(:first)
16
- assert_equal(some_pattern.result_indexer.indices_to_extract, [:first])
17
- some_pattern.select_indices([:first, :last])
18
- assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,:last])
19
- some_pattern.select_indices([:first, [5,6]])
20
- assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,5,6])
21
- some_pattern.select_indices([:first, 1..2])
22
- assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,1,2])
23
- some_pattern.select_indices([4..5, :first, [5,6]])
24
- assert_equal(some_pattern.result_indexer.indices_to_extract, [:first,4,5,6])
25
- end
26
-
27
- end
@@ -1,68 +0,0 @@
1
- require 'scrubyt'
2
- require 'test/unit'
3
-
4
- class SimpleExampleLookupTest
5
-
6
- def setup
7
- doc1 = <<-DOC
8
- <a>
9
- <b>
10
- <c/>
11
- <d>dddd</d>
12
- <e>
13
- <f>fff</f>
14
- <k>kk</k>
15
- <j/>
16
- <l>lll</l>
17
- <m/>
18
- <n>nnn</n>
19
- <n>nnnnnn</n>
20
- <n>
21
- nnnnnnnnn
22
- <q/>
23
- <r>rrr</r>
24
- </n>
25
- <o>ooo</o>
26
- <n>nnnnnnnnnnnn</n>
27
- <p>ppp</p>
28
- </e>
29
- </b>
30
- <g>ggg</g>
31
- </a>
32
- DOC
33
- @doc1 = Hpricot(doc1)
34
- @a = @doc1.children[1]
35
- @b = @a.children[1]
36
- @c = @b.children[1]
37
- @d = @b.children[3]
38
- @e = @b.children[5]
39
- @f = @e.children[1]
40
- @g = @a.children[@a.children.size-2]
41
- @k = @e.children[3]
42
- @j = @e.children[5]
43
- @l = @e.children[7]
44
- @m = @e.children[9]
45
- @n_1 = @e.children[11]
46
- @n_2 = @e.children[13]
47
- @n_3 = @e.children[15]
48
- @o = @e.children[17]
49
- @n_4 = @e.children[19]
50
- @p = @e.children[21]
51
- @q = @n_3.children[1]
52
- @r = @n_3.children[3]
53
- #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
54
- end
55
-
56
- def test_find_node_from_text
57
- elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
58
- assert_instance_of(Hpricot::Elem, elem)
59
- assert_equal(elem, @f)
60
-
61
- elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
62
- assert_equal(elem, @d)
63
-
64
- elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
65
- assert_equal(elem, @r)
66
-
67
- end
68
- end
@@ -1,152 +0,0 @@
1
- #require File.join(File.dirname(__FILE__), '../../lib', 'xpathutils')
2
- require 'scrubyt'
3
- require 'test/unit'
4
-
5
- class XPathUtilsTest < Test::Unit::TestCase
6
-
7
- def setup
8
- doc1 = <<-DOC
9
- <a>
10
- <b>
11
- <c/>
12
- <d>dddd</d>
13
- <e>
14
- <f>fff</f>
15
- <k>kk</k>
16
- <j/>
17
- <l>lll</l>
18
- <m/>
19
- <n>nnn</n>
20
- <n>nnnnnn</n>
21
- <n>
22
- nnnnnnnnn
23
- <q/>
24
- <r>rrr</r>
25
- </n>
26
- <o>ooo</o>
27
- <n>nnnnnnnnnnnn</n>
28
- <p>ppp</p>
29
- </e>
30
- </b>
31
- <g>ggg</g>
32
- </a>
33
- DOC
34
- @doc1 = Hpricot(doc1)
35
- @a = @doc1.children[1]
36
- @b = @a.children[1]
37
- @c = @b.children[1]
38
- @d = @b.children[3]
39
- @e = @b.children[5]
40
- @f = @e.children[1]
41
- @g = @a.children[@a.children.size-2]
42
- @k = @e.children[3]
43
- @j = @e.children[5]
44
- @l = @e.children[7]
45
- @m = @e.children[9]
46
- @n_1 = @e.children[11]
47
- @n_2 = @e.children[13]
48
- @n_3 = @e.children[15]
49
- @o = @e.children[17]
50
- @n_4 = @e.children[19]
51
- @p = @e.children[21]
52
- @q = @n_3.children[1]
53
- @r = @n_3.children[3]
54
- #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
55
- end
56
-
57
- def test_lowest_common_ancestor
58
- lca_b_g = Scrubyt::XPathUtils.lowest_common_ancestor(@b,@g)
59
- lca_f_d = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@d)
60
- lca_f_g = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@g)
61
- lca_f_f = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@f)
62
- lca_f_k = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@k)
63
- lca_a_g = Scrubyt::XPathUtils.lowest_common_ancestor(@a,@g)
64
- lca_q_r = Scrubyt::XPathUtils.lowest_common_ancestor(@q,@r)
65
- lca_m_r = Scrubyt::XPathUtils.lowest_common_ancestor(@m,@r)
66
- lca_n1_e = Scrubyt::XPathUtils.lowest_common_ancestor(@n_1,@e)
67
- lca_r_b = Scrubyt::XPathUtils.lowest_common_ancestor(@r,@b)
68
- lca_a_a = Scrubyt::XPathUtils.lowest_common_ancestor(@a,@a)
69
-
70
- assert_equal(lca_b_g, @a)
71
- assert_equal(lca_f_d, @b)
72
- assert_equal(lca_f_g, @a)
73
- assert_equal(lca_f_f, @e)
74
- assert_equal(lca_f_k, @e)
75
- assert_equal(lca_q_r, @n_3)
76
- assert_equal(lca_m_r, @e)
77
- assert_equal(lca_n1_e, @e)
78
- assert_equal(lca_a_g, @a)
79
- assert_equal(lca_a_a, @doc1)
80
- assert_equal(lca_r_b, @b)
81
- end
82
-
83
- def test_find_index
84
- assert_equal(Scrubyt::XPathUtils.find_index(@a), 1)
85
- assert_equal(Scrubyt::XPathUtils.find_index(@b), 1)
86
- assert_equal(Scrubyt::XPathUtils.find_index(@c), 1)
87
- assert_equal(Scrubyt::XPathUtils.find_index(@d), 1)
88
- assert_equal(Scrubyt::XPathUtils.find_index(@n_1), 1)
89
- assert_equal(Scrubyt::XPathUtils.find_index(@n_2), 2)
90
- assert_equal(Scrubyt::XPathUtils.find_index(@n_3), 3)
91
- assert_equal(Scrubyt::XPathUtils.find_index(@n_4), 4)
92
- assert_equal(Scrubyt::XPathUtils.find_index(@r), 1)
93
- end
94
-
95
- def test_generate_XPath
96
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@a), "/a")
97
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@b), "/a/b")
98
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@c), "/a/b/c")
99
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@d), "/a/b/d")
100
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@e), "/a/b/e")
101
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@f), "/a/b/e/f")
102
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_1), "/a/b/e/n")
103
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_2), "/a/b/e/n")
104
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_3), "/a/b/e/n")
105
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_4), "/a/b/e/n")
106
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@r), "/a/b/e/n/r")
107
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@g), "/a/g")
108
- end
109
-
110
- def test_generate_XPath_with_indices
111
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@a, nil, true), "/a[1]")
112
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@b, nil, true), "/a[1]/b[1]")
113
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@c, nil, true), "/a[1]/b[1]/c[1]")
114
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@d, nil, true), "/a[1]/b[1]/d[1]")
115
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@e, nil, true), "/a[1]/b[1]/e[1]")
116
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@f, nil, true), "/a[1]/b[1]/e[1]/f[1]")
117
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_1, nil, true), "/a[1]/b[1]/e[1]/n[1]")
118
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_2, nil, true), "/a[1]/b[1]/e[1]/n[2]")
119
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_3, nil, true), "/a[1]/b[1]/e[1]/n[3]")
120
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@n_4, nil, true), "/a[1]/b[1]/e[1]/n[4]")
121
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@p, nil, true), "/a[1]/b[1]/e[1]/p[1]")
122
- assert_equal(Scrubyt::XPathUtils.generate_XPath(@r, nil, true), "/a[1]/b[1]/e[1]/n[3]/r[1]")
123
- end
124
-
125
- def test_generate_relative_XPath
126
- assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@a,@a))
127
- assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@b, @a), "/b[1]")
128
- assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@c, @a), "/b[1]/c[1]")
129
- assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@d, @a), "/b[1]/d[1]")
130
- assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@f, @a), "/b[1]/e[1]/f[1]")
131
- assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_1, @a), "/b[1]/e[1]/n[1]")
132
- assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_2, @a), "/b[1]/e[1]/n[2]")
133
- assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_3, @a), "/b[1]/e[1]/n[3]")
134
- assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@n_4, @a), "/b[1]/e[1]/n[4]")
135
- assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@r, @b), "/e[1]/n[3]/r[1]")
136
- assert_equal(Scrubyt::XPathUtils.generate_relative_XPath(@q, @e), "/n[3]/q[1]")
137
-
138
- assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@r, @n_2))
139
- assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@q, @g))
140
- assert_nil(Scrubyt::XPathUtils.generate_relative_XPath(@n_3, @n_2))
141
- end
142
-
143
- def test_generate_generalized_relative_XPath
144
- assert_nil(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@b,@b))
145
- assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@b, @a), "/b")
146
- assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@f, @a), "/b/e/f")
147
- assert_equal(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@r, @n_3), "/r")
148
-
149
- assert_nil(Scrubyt::XPathUtils.generate_generalized_relative_XPath(@r, @n_2))
150
- end
151
-
152
- end