scrapes 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,50 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
module Scrapes
  ##############################################################################
  # Links pages together.  Useful when one page only contains links to the
  # next page: every link yielded by +from+ is requested from the session as
  # a +to+ page.
  class ToProxy
    ############################################################################
    # from:: object responding to <tt>extract(data, uri, session)</tt> and
    #        yielding one link per iteration.
    # to::   value handed to <tt>session.page</tt> as its first argument.
    def initialize(from, to)
      @from = from
      @to = to
    end

    ############################################################################
    # Delegates extraction to +from+ and, for each link it yields, calls
    # <tt>session.page(to, link)</tt>, forwarding the caller's block (if any).
    # Returns whatever <tt>from.extract</tt> returns.
    def extract(data, uri, session, &block)
      @from.extract(data, uri, session) do |link|
        session.page(@to, link, &block)
      end
    end

    ############################################################################
    # Continue the string of connections: the returned proxy uses this proxy
    # as its +from+ and +next_in_line+ as its +to+.
    def to(next_in_line)
      self.class.new(self, next_in_line)
    end
  end
  ##############################################################################
end
50
+ ################################################################################
@@ -0,0 +1,75 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'fileutils'
26
+ require 'test/lib/server'
27
+ require 'scrapes/cache'
28
+ require 'test/unit'
29
+
30
# Tests for Scrapes::Cache, run against the local test HTTP server.
class TestCache < Test::Unit::TestCase
  include LocalHTTPServer

  # Starts the local server and creates the cache under test.
  def setup
    start_server
    @cache = Scrapes::Cache.new
  end

  # Stops the server and removes the 'cache' directory if one exists.
  def teardown
    stop_server
    FileUtils.remove_dir 'cache' if File.exist? 'cache'
  end

  # Sanity check that setup produced both fixtures.
  def test_truth
    assert @server
    assert @cache
  end

  # Expects the directory to start as the expanded path of 'cache' and to be
  # assignable.  assert_equal takes (expected, actual).
  def test_directory_attr
    assert_equal File.expand_path('cache'), @cache.directory
    assert_nothing_raised { @cache.directory = 'cache' }
    assert_equal 'cache', @cache.directory
  end

  # Expects the cache to start with enabled == false and to be switchable on.
  def test_enabled_attr
    assert_equal false, @cache.enabled
    assert_nothing_raised { @cache.enabled = true }
    assert @cache.enabled
  end

  # Expects update to leave check falsy until the cache has been enabled.
  def test_update
    assert_nothing_raised { @cache.update 'foo.txt', localhost_http_get('foo.txt') }
    assert(!@cache.check('foo.txt'))
    assert_nothing_raised { @cache.enabled = true }
    assert_nothing_raised { @cache.update 'foo.txt', localhost_http_get('foo.txt') }
    assert(@cache.check('foo.txt'))
  end

  # Expects check to stay falsy for updates made inside without_cache, even
  # though the cache was enabled beforehand.
  def test_without_cache
    assert_nothing_raised { @cache.enabled = true }
    @cache.without_cache do
      assert_nothing_raised { @cache.update 'foo.txt', localhost_http_get('foo.txt') }
      assert(!@cache.check('foo.txt'))
    end
  end
end
@@ -0,0 +1,34 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'scrapes/cookies'
26
+ require 'test/unit'
27
+
28
# Tests for Scrapes::Cookies header handling.
class TestCookies < Test::Unit::TestCase
  # Feeds in a cookie header carrying domain and path attributes and expects
  # only the name=value pair to be produced by to_header.
  def test_parser
    jar = Scrapes::Cookies.new
    jar.from_header 'sid=21;domain=.example.com;Path=/'
    assert_equal 'sid=21', jar.to_header
  end
end
@@ -0,0 +1,69 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'fileutils'
26
+ require 'test/lib/server'
27
+ require 'scrapes/crawler'
28
+ require 'scrapes/session'
29
+ require 'test/unit'
30
+
31
# Tests for Scrapes::Crawler, run against the local test HTTP server.
class TestCrawler < Test::Unit::TestCase
  include LocalHTTPServer

  # Creates a session, starts the local server and builds the crawler.
  def setup
    @session = Scrapes::Session.new
    start_server
    @crawler = Scrapes::Crawler.new @session
  end

  # Stops the server and removes the 'cache' directory if one exists.
  def teardown
    stop_server
    FileUtils.remove_dir 'cache' if File.exist? 'cache'
  end

  # Sanity check that setup produced all fixtures.
  def test_truth
    assert @session
    assert @server
    assert @crawler
  end

  # Expects the crawler to start with a cache and to accept a replacement.
  # assert_equal takes (expected, actual).
  def test_cache_attr
    assert @crawler.cache
    cache = Scrapes::Cache.new
    assert_nothing_raised { @crawler.cache = cache }
    assert_equal cache, @crawler.cache
  end

  # Expects the log to start as nil and to accept any object.
  def test_log_attr
    assert_nil @crawler.log
    log = Object.new
    assert_nothing_raised { @crawler.log = log }
    assert_equal log, @crawler.log
  end

  # Expects a truthy result for an existing document and a
  # Net::HTTPNotFound response object for a missing one.
  def test_fetch
    assert @crawler.fetch(localhost_url('foo.txt'))
    assert_instance_of Net::HTTPNotFound, @crawler.fetch(localhost_url('dummy'))
  end
end
@@ -0,0 +1,55 @@
1
+ ################################################################################ #
2
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #
23
+ ################################################################################
24
+ require 'rubygems'
25
+ ################################################################################
26
+ require 'scrapes'
27
+ ################################################################################
28
+ require 'test/lib/server'
29
+ require 'test/unit'
30
+
31
# Smoke test: loads the page classes found under 'test' and fetches the
# redhanded fixture page from the local test HTTP server.
class TestRedhandedPage < Test::Unit::TestCase
  include LocalHTTPServer

  # Starts the local server, then runs the initializer with 'test' as the
  # parent directory of the page definitions.
  def setup
    start_server
    Scrapes::Initializer.run do |init|
      init.pages_parent = 'test'
      init.process
    end
  end

  # Stops the local server.
  def teardown
    stop_server
  end

  # Sanity check that the server fixture exists.
  def test_truth
    assert(@server)
  end

  # Fetches the page inside a session; makes no assertions of its own.
  def test_texts
    Scrapes::Session.start do |sess|
      @page = sess.page(LocalRedhanded, localhost_url('redhanded.html'))
    end
  end
end
@@ -0,0 +1,54 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'rubygems'
26
+ require 'scrapes/initializer'
27
+ require 'scrapes/page'
28
+ require 'test/unit'
29
+
30
# Tests for Scrapes::Initializer.
class TestInitializer < Test::Unit::TestCase
  # Builds a fresh initializer; construction itself must not raise.
  def setup
    assert_nothing_raised { @initializer = Scrapes::Initializer.new }
  end

  # Running with an empty block must not raise.
  def test_run
    assert_nothing_raised { Scrapes::Initializer.run {} }
  end

  # Expects pages_parent to start as the directory of the running script and
  # to be assignable.  assert_equal takes (expected, actual).
  def test_pages_parent
    assert_equal File.dirname($0), @initializer.pages_parent
    assert_nothing_raised { @initializer.pages_parent = 'foobar' }
    assert_equal 'foobar', @initializer.pages_parent
  end

  # Expects pages_dir to start as 'pages' and to be assignable.
  def test_pages_dir
    assert_equal 'pages', @initializer.pages_dir
    assert_nothing_raised { @initializer.pages_dir = 'foobar' }
    assert_equal 'foobar', @initializer.pages_dir
  end

  # Processing with the initial settings must not raise.
  def test_process
    assert_nothing_raised { @initializer.process }
  end
end
@@ -0,0 +1,63 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ # TODO figure out how to suppress get output
26
+ ################################################################################
27
+ require 'logger'
28
+ require 'webrick'
29
+ require 'net/http'
30
+ ################################################################################
31
+ # webrick localhost http server
32
module LocalHTTPServer
  # Port shared by the server and by the URLs built for it.
  LOCALHOST_PORT = 4270

  ##############################################################################
  # Start the server on a background thread and return the server.
  # Logging is directed to a Logger built with nil and the access log list is
  # empty; documents are served from the expanded path of 'test/public'.
  def start_server
    @server = WEBrick::HTTPServer.new(
      :Port         => LOCALHOST_PORT,
      :Logger       => Logger.new(nil),
      :DocumentRoot => File.expand_path('test/public'),
      :AccessLog    => []
    )
    @server_thread = Thread.new { @server.start }
    # Return the server itself, as documented, rather than the thread.
    @server
  end

  ##############################################################################
  # Shut the server down, wait for its thread to finish and return the
  # server.  Returns nil when no server was started.
  def stop_server
    return unless @server

    @server.shutdown
    @server_thread.join
    @server
  end

  ##############################################################################
  # Return a localhost URL for the given document path (server root when
  # +path+ is nil).
  def localhost_url(path = nil)
    "http://localhost:#{LOCALHOST_PORT}/#{path}"
  end

  ##############################################################################
  # GET a document from the localhost HTTP server and return the result of
  # Net::HTTP.get for its URL.
  def localhost_http_get(path = nil)
    Net::HTTP.get(URI.parse(localhost_url(path)))
  end
end
63
+ ################################################################################
@@ -0,0 +1,77 @@
1
+ ################################################################################ #
2
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #
23
+ ################################################################################
24
+ require 'rubygems'
25
+ ################################################################################
26
+ require 'scrapes'
27
+ ################################################################################
28
+ require 'test/lib/server'
29
+ require 'test/unit'
30
+
31
# Checks the page classes found under 'test' against fixture documents
# served by the local test HTTP server.
class TestRedhandedPage < Test::Unit::TestCase
  include LocalHTTPServer

  # Starts the local server, then runs the initializer with 'test' as the
  # parent directory of the page definitions.
  def setup
    start_server
    Scrapes::Initializer.run do |init|
      init.pages_parent = 'test'
      init.process
    end
  end

  # Stops the local server.
  def teardown
    stop_server
  end

  # Sanity check that the server fixture exists.
  def test_truth
    assert(@server)
  end

  # Fetches the redhanded fixture as a LocalRedhanded page and checks the
  # values extracted from it.
  def test_local_redhanded
    Scrapes::Session.start do |sess|
      @page = sess.page(LocalRedhanded, localhost_url('redhanded.html'))
    end
    assert_equal(Array, @page.syndicate_content.class)
    assert_equal(2, @page.syndicate_content.size)
    assert_equal('/index.xml', @page.syndicate_link)
    assert_equal('JavaScript', @page.script_language)
    assert_equal(274, @page.links.size)
    #assert_equal 0 , @page.element.size
    assert_equal('RSS', @page.syndicate_content[0])
    assert_equal('2.0', @page.syndicate_content[1])
  end

  # Fetches the same fixture as LocalRedhandedEntries and expects 20 entries.
  def test_local_redhanded_enties
    Scrapes::Session.start do |sess|
      @entries = sess.page(LocalRedhandedEntries, localhost_url('redhanded.html'))
    end
    assert_equal(20, @entries.size)
    #assert_equal "Denver Accord#", @entries[0].entry_title
  end

  # Fetches the pagination fixture; makes no assertions of its own.
  def test_local_pagination_1
    Scrapes::Session.start do |sess|
      @foil = sess.page(LocalPagination, localhost_url('foil74.html'))
    end
  end
end