scrapes 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,50 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
module Scrapes
  ################################################################################
  # Link pages together. Useful for when one only contains links to the next page.
  #
  # A ToProxy pairs a +from+ extractor with a +to+ target: every link the
  # extractor yields is handed to <tt>session.page</tt> together with +to+.
  class ToProxy
    ################################################################################
    # from:: object whose <tt>extract(data, uri, session)</tt> yields links
    # to::   passed as the first argument to <tt>session.page</tt> for each link
    #        (presumably a page class -- confirm against Session#page)
    def initialize(from, to)
      @from = from
      @to = to
    end

    ################################################################################
    # Run the +from+ extractor and, for each link it yields, ask +session+ for
    # the +to+ page at that link. The caller's block (if any) is forwarded to
    # <tt>session.page</tt>. Returns whatever <tt>@from.extract</tt> returns.
    def extract(data, uri, session, &block)
      @from.extract(data, uri, session) do |link|
        session.page(@to, link, &block)
      end
    end

    ################################################################################
    # Continue the string of connections: the returned proxy uses this proxy
    # as its +from+ and +next_in_line+ as its +to+.
    def to(next_in_line)
      self.class.new(self, next_in_line)
    end
  end
  ################################################################################
end
50
+ ################################################################################
@@ -0,0 +1,75 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'fileutils'
26
+ require 'test/lib/server'
27
+ require 'scrapes/cache'
28
+ require 'test/unit'
29
+
30
# Exercises Scrapes::Cache, using the local test HTTP server as a data source.
class TestCache < Test::Unit::TestCase
  include LocalHTTPServer

  # Start the local server and build a fresh cache before every test.
  def setup
    start_server
    @cache = Scrapes::Cache.new
  end

  # Stop the server and remove any 'cache' directory a test left behind.
  def teardown
    stop_server
    FileUtils.remove_dir 'cache' if File.exist? 'cache'
  end

  def test_truth
    assert @server
    assert @cache
  end

  # assert_equal takes (expected, actual); keeping that order makes failure
  # messages report the right value as "expected".
  def test_directory_attr
    assert_equal File.expand_path('cache'), @cache.directory
    assert_nothing_raised { @cache.directory = 'cache' }
    assert_equal 'cache', @cache.directory
  end

  def test_enabled_attr
    assert_equal false, @cache.enabled
    assert_nothing_raised { @cache.enabled = true }
    assert @cache.enabled
  end

  # With the cache disabled an update must not be visible through check;
  # once enabled, the same update must be.
  def test_update
    assert_nothing_raised { @cache.update 'foo.txt', localhost_http_get('foo.txt') }
    assert(!@cache.check('foo.txt'))
    assert_nothing_raised { @cache.enabled = true }
    assert_nothing_raised { @cache.update 'foo.txt', localhost_http_get('foo.txt') }
    assert(@cache.check('foo.txt'))
  end

  # Inside without_cache an update must not be visible even though the
  # cache itself has been enabled.
  def test_without_cache
    assert_nothing_raised { @cache.enabled = true }
    @cache.without_cache do
      assert_nothing_raised { @cache.update 'foo.txt', localhost_http_get('foo.txt') }
      assert(!@cache.check('foo.txt'))
    end
  end
end
@@ -0,0 +1,34 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'scrapes/cookies'
26
+ require 'test/unit'
27
+
28
# Checks header parsing in Scrapes::Cookies.
class TestCookies < Test::Unit::TestCase
  # A header carrying domain and path attributes should come back out of
  # to_header as just the name=value pair.
  def test_parser
    jar = Scrapes::Cookies.new
    jar.from_header 'sid=21;domain=.example.com;Path=/'
    assert_equal 'sid=21', jar.to_header
  end
end
@@ -0,0 +1,69 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'fileutils'
26
+ require 'test/lib/server'
27
+ require 'scrapes/crawler'
28
+ require 'scrapes/session'
29
+ require 'test/unit'
30
+
31
# Exercises Scrapes::Crawler attributes and fetching against the local
# test HTTP server.
class TestCrawler < Test::Unit::TestCase
  include LocalHTTPServer

  # Build a session, start the local server, and attach a crawler to the session.
  def setup
    @session = Scrapes::Session.new
    start_server
    @crawler = Scrapes::Crawler.new @session
  end

  # Stop the server and remove any 'cache' directory a test left behind.
  def teardown
    stop_server
    FileUtils.remove_dir 'cache' if File.exist? 'cache'
  end

  def test_truth
    assert @session
    assert @server
    assert @crawler
  end

  # A crawler starts with a cache and accepts a replacement.
  def test_cache_attr
    assert @crawler.cache
    cache = Scrapes::Cache.new
    assert_nothing_raised { @crawler.cache = cache }
    assert_equal cache, @crawler.cache
  end

  # A crawler starts without a log and accepts any object as one.
  def test_log_attr
    assert_nil @crawler.log
    log = Object.new
    assert_nothing_raised { @crawler.log = log }
    assert_equal log, @crawler.log
  end

  # Fetching an existing document yields a truthy result; a missing one
  # yields a Net::HTTPNotFound response object.
  def test_fetch
    assert @crawler.fetch(localhost_url('foo.txt'))
    assert_instance_of Net::HTTPNotFound, @crawler.fetch(localhost_url('dummy'))
  end
end
@@ -0,0 +1,55 @@
1
+ ################################################################################ #
2
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #
23
+ ################################################################################
24
+ require 'rubygems'
25
+ ################################################################################
26
+ require 'scrapes'
27
+ ################################################################################
28
+ require 'test/lib/server'
29
+ require 'test/unit'
30
+
31
# Smoke test: load the page definitions found under 'test' and fetch the
# redhanded fixture through a session. No assertions are made on the page.
class TestRedhandedPage < Test::Unit::TestCase
  include LocalHTTPServer

  # Start the local server, then run the initializer with 'test' as the
  # parent directory for page definitions.
  def setup
    start_server
    Scrapes::Initializer.run do |init|
      init.pages_parent = 'test'
      init.process
    end
  end

  def teardown
    stop_server
  end

  def test_truth
    assert(@server)
  end

  # Passes as long as fetching the fixture page raises nothing.
  def test_texts
    fixture_url = localhost_url('redhanded.html')
    Scrapes::Session.start do |session|
      @page = session.page(LocalRedhanded, fixture_url)
    end
  end
end
@@ -0,0 +1,54 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'rubygems'
26
+ require 'scrapes/initializer'
27
+ require 'scrapes/page'
28
+ require 'test/unit'
29
+
30
# Exercises the configuration attributes and entry points of
# Scrapes::Initializer.
class TestInitializer < Test::Unit::TestCase
  # Constructing an initializer must not raise.
  def setup
    assert_nothing_raised { @initializer = Scrapes::Initializer.new }
  end

  def test_run
    assert_nothing_raised { Scrapes::Initializer.run { } }
  end

  # pages_parent starts as the directory of the running script and is writable.
  # assert_equal takes (expected, actual).
  def test_pages_parent
    assert_equal File.dirname($0), @initializer.pages_parent
    assert_nothing_raised { @initializer.pages_parent = 'foobar' }
    assert_equal 'foobar', @initializer.pages_parent
  end

  # pages_dir starts as 'pages' and is writable.
  def test_pages_dir
    assert_equal 'pages', @initializer.pages_dir
    assert_nothing_raised { @initializer.pages_dir = 'foobar' }
    assert_equal 'foobar', @initializer.pages_dir
  end

  def test_process
    assert_nothing_raised { @initializer.process }
  end
end
@@ -0,0 +1,63 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ # TODO figure out how to suppress get output
26
+ ################################################################################
27
+ require 'logger'
28
+ require 'webrick'
29
+ require 'net/http'
30
+ ################################################################################
31
+ # webrick localhost http server
32
module LocalHTTPServer
  # Port the test server listens on. Kept in one place so start_server and
  # localhost_url cannot drift apart.
  LOCALHOST_PORT = 4270

  ################################################################################
  # Start the server in a background thread and return that thread.
  # The server is stored in @server and serves files from test/public with
  # logging and access logging silenced.
  def start_server
    @server = WEBrick::HTTPServer.new(
      :Port         => LOCALHOST_PORT,
      :Logger       => Logger.new(nil),
      :DocumentRoot => File.expand_path('test/public'),
      :AccessLog    => []
    )
    @server_thread = Thread.new { @server.start }
  end

  ################################################################################
  # Shut the server down, wait for its thread to finish, and return the
  # server. Returns nil when no server was started.
  def stop_server
    return unless @server

    @server.shutdown
    @server_thread.join
    @server
  end

  ################################################################################
  # Return a localhost URL for the given document path (the server root
  # when +path+ is nil).
  def localhost_url(path = nil)
    "http://localhost:#{LOCALHOST_PORT}/#{path}"
  end

  ################################################################################
  # GET a document from the localhost HTTP server and return its body.
  def localhost_http_get(path = nil)
    Net::HTTP.get(URI.parse(localhost_url(path)))
  end
end
63
+ ################################################################################
@@ -0,0 +1,77 @@
1
+ ################################################################################ #
2
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #
23
+ ################################################################################
24
+ require 'rubygems'
25
+ ################################################################################
26
+ require 'scrapes'
27
+ ################################################################################
28
+ require 'test/lib/server'
29
+ require 'test/unit'
30
+
31
# Scrapes the redhanded and pagination fixtures from the local test server
# using the page definitions found under 'test'.
class TestRedhandedPage < Test::Unit::TestCase
  include LocalHTTPServer

  # Start the local server, then run the initializer with 'test' as the
  # parent directory for page definitions.
  def setup
    start_server
    Scrapes::Initializer.run do |init|
      init.pages_parent = 'test'
      init.process
    end
  end

  def teardown
    stop_server
  end

  def test_truth
    assert(@server)
  end

  # Checks the values LocalRedhanded extracts from the redhanded fixture.
  def test_local_redhanded
    fixture_url = localhost_url('redhanded.html')
    Scrapes::Session.start do |session|
      @page = session.page(LocalRedhanded, fixture_url)
    end
    assert_equal(Array, @page.syndicate_content.class)
    assert_equal(2, @page.syndicate_content.size)
    assert_equal('/index.xml', @page.syndicate_link)
    assert_equal('JavaScript', @page.script_language)
    assert_equal(274, @page.links.size)
    # assert_equal 0, @page.element.size
    assert_equal('RSS', @page.syndicate_content[0])
    assert_equal('2.0', @page.syndicate_content[1])
  end

  # Checks how many entries LocalRedhandedEntries finds in the same fixture.
  def test_local_redhanded_enties
    fixture_url = localhost_url('redhanded.html')
    Scrapes::Session.start do |session|
      @entries = session.page(LocalRedhandedEntries, fixture_url)
    end
    assert_equal(20, @entries.size)
    # assert_equal "Denver Accord#", @entries[0].entry_title
  end

  # Passes as long as fetching the pagination fixture raises nothing.
  def test_local_pagination_1
    fixture_url = localhost_url('foil74.html')
    Scrapes::Session.start do |session|
      @foil = session.page(LocalPagination, fixture_url)
    end
  end
end