discovery-mission 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ pkg
2
+ example.ru
data/MIT_LICENCE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2009 - 2010 Mickael Riga
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,37 @@
1
+ = Discovery Mission
2
+
3
+ A SPACE ODYSSEY
4
+
5
+ This is just a simple, basic (dummy?) and lightweight crawler.
6
+ It collects all the URLs of your website so that you can record them in a database and use them in a sitemap manager.
7
+
8
+ Just install it:
9
+
10
+ # sudo gem install discovery-mission
11
+
12
+ And then you're ready to take off:
13
+
14
+ require 'discovery_mission'
15
+
16
+ # Use a block to do something with each path
17
+ # response is a 'net/http' response object, therefore responds to :body, :code ...
18
+
19
+ DiscoveryMission.for("http://www.my-domain.com") do |url, response|
20
+ puts "Dave Bowman landed on #{url}"
21
+ case response
22
+ when Net::HTTPSuccess
23
+ puts "Fuck you HAL 9000"
24
+ when Net::HTTPRedirection
25
+ puts "Shit! That's Jupiter!"
26
+ else
27
+ puts "Bad trip indeed"
28
+ end
29
+ end
30
+
31
+ # Or call it without a block: an Array with all paths is returned
32
+
33
+ my_sitemap_entries = DiscoveryMission.for("http://www.my-domain.com")
34
+
35
+ -
36
+
37
+ Copyright (c) 2009 - 2010 Mickael Riga. See MIT_LICENCE file for details.
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
# Packaging tasks for the discovery-mission gem (`rake gem`, `rake package`,
# `rake repackage`, `rake clobber_package`).
#
# Gem::PackageTask replaces Rake::GemPackageTask, which is no longer shipped
# with Rake; it must be required explicitly.
require 'rubygems/package_task'

# Reuse the gemspec instead of keeping a second, hand-copied specification
# here, so the gem name, summary and file list are defined in one place only.
gemspec_path = File.expand_path('discovery_mission.gemspec', File.dirname(__FILE__))
spec = Gem::Specification.load(gemspec_path)

Gem::PackageTask.new(spec) do |pkg|
  # Also produce a .tgz archive next to the .gem file.
  pkg.need_tar = true
end
@@ -0,0 +1,13 @@
1
# Gem specification for discovery-mission, a small website crawler.
Gem::Specification.new do |s|
  s.name = 'discovery-mission'
  s.version = "0.0.1"
  s.platform = Gem::Platform::RUBY
  s.summary = "A simple website crawler"
  s.description = "Discovery Mission is an easy-to-use website crawler. Use it for generating sitemaps."
  # The package ships the MIT licence text (MIT_LICENCE); declare it so the
  # registry and tooling can report it instead of an empty licence list.
  s.licenses = ['MIT']
  # Every file tracked by git is packaged; must be run from the repository.
  s.files = `git ls-files`.split("\n").sort
  s.test_files = ['spec.rb']
  # The library file lives at the repository root, hence '.'.
  s.require_path = '.'
  s.author = "Mickael Riga"
  s.email = "mig@mypeplum.com"
  s.homepage = "http://github.com/mig-hub/discovery_mission"
end
@@ -0,0 +1,56 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
# A minimal same-host website crawler.
#
# Starting from the path of the given URL, it fetches each known path over
# HTTP, records the paths of the links found in <a href="..."> tags and keeps
# going until no unvisited path remains.
#
#   DiscoveryMission.for("http://www.my-domain.com") { |path, response| ... }
#   paths = DiscoveryMission.for("http://www.my-domain.com")
class DiscoveryMission
  # Hash of path => visited flag (false until the path has been fetched).
  attr_accessor :roadmap

  # Crawls +domain+ and returns an Array with every path discovered.
  # When a block is given it is called with (path, response) for each path;
  # response is nil when the request failed.
  def self.for(domain, &block)
    new(domain).launch(&block)
  end

  # domain:: String URL of the site to crawl; an empty path is treated as "/".
  def initialize(domain)
    @domain = URI(domain)
    @domain.path = "/" if @domain.path == ""
    reset
    puts "\nDiscovery Mission Planned for #{@domain}\nEverything is under control Dave\n\n"
  end

  # Visits every unvisited path of the roadmap, yielding (path, response) to
  # the optional block, then resets the roadmap for a future launch.
  # Returns the Array of all paths that were on the roadmap.
  def launch
    # Hash#key gives the first path whose flag is still false, or nil.
    while (destination = @roadmap.key(false))
      response = land_on(destination)
      yield(destination, response) if block_given?
      explore(destination, response)
      @roadmap[destination] = true
    end
    uri_list = @roadmap.keys
    reset
    uri_list
  end

  # Puts the roadmap back to its initial state: only the start path, unvisited.
  def reset
    @roadmap = {@domain.path => false}
  end

  # Fetches +destination+ (a path) from the crawled site.
  # Returns the Net::HTTP response, or nil after printing the error when the
  # request could not be made.
  def land_on(destination)
    # Build the target from the start URL so its scheme and port are honoured
    # (a host/path pair alone would always go to plain HTTP on port 80).
    target = @domain.dup
    target.path = destination
    target.query = nil
    target.fragment = nil
    Net::HTTP.get_response(target)
  rescue StandardError => e
    # Best effort: one unreachable page must not abort the whole crawl, but
    # Interrupt/SystemExit are deliberately not caught.
    puts "Error: #{e}"
    nil
  end

  # Scans the body of +response+ for links and adds every new same-host path
  # to the roadmap as unvisited. Relative links are appended to +destination+.
  # Does nothing when +response+ is nil or has no body.
  def explore(destination, response)
    html = response && response.body
    return if html.nil?

    base = destination[/\/$/] ? destination : destination + '/'
    html.scan(/<a href\s*=\s*["']([^"']+)["']/i) do |captures|
      # scan yields the Array of capture groups; the href is its only element.
      uri_found = parse_link(captures.first)
      next if uri_found.nil? || pointless?(uri_found)

      path = uri_found.path
      path = base + path unless path[%r{\A/}]
      @roadmap.store(path, false) unless @roadmap.key?(path)
    end
  end

  private

  # Parses +href+ into a URI; returns nil when it is not a valid URI.
  def parse_link(href)
    URI(href)
  rescue StandardError
    nil
  end

  # True for links that must not be followed: another host (this also covers
  # scheme-only links such as javascript: or mailto:, which have no host),
  # or no usable path.
  def pointless?(uri)
    return true if uri.absolute? && uri.host != @domain.host

    path = uri.path
    path.nil? || path == '' || path == '#' || !path[/^javascript/].nil?
  end
end
data/spec.rb ADDED
@@ -0,0 +1,65 @@
1
require 'rubygems'
require 'bacon'
Bacon.summary_on_exit

# Minimal stand-in for an HTTP response: the crawler only reads #body.
MyResponse = Struct.new(:body)

# Wraps +content+ in the HTML skeleton shared by every fixture page.
def template_this(content)
  "<html><head><title></title</head><body>#{content}</body></html>"
end

require File.join(File.dirname(File.expand_path(__FILE__)), 'discovery_mission')

# Network-free variant of the crawler used by the examples below.
class DiscoveryMission
  attr_reader :roadmap

  # Only the root page contains a link; every other page is empty.
  def land_on(destination)
    return MyResponse.new('') unless destination == '/'

    MyResponse.new(template_this("<a href='/good/destination'>click</a>"))
  end
end

describe DiscoveryMission do
  describe 'explore' do
    before do
      @crawler = DiscoveryMission.new('http://www.domain.com')
    end

    it 'Should add correct destinations to roadmap' do
      page = MyResponse.new(template_this("<a href='/good/destination'>click</a>"))
      @crawler.explore('/', page)
      @crawler.roadmap.keys.should.include?('/good/destination')
    end

    it 'Should not raise when bad URI' do
      page = MyResponse.new(template_this("<a href='http://'>click</a>"))
      @crawler.explore('/', page)
      @crawler.roadmap.keys.should == ['/']
    end

    it 'Should skip pointless uri' do
      links = <<-EOT
        <a href='http://www.another_domain.com'>click</a>
        <a href='http://www.another_domain.com/with/path'>click</a>
        <a href='#'>click</a>
        <a href='javascript:void(0);'>click</a>
      EOT
      @crawler.explore('/', MyResponse.new(template_this(links)))
      @crawler.roadmap.keys.should == ['/']
    end

    it 'Sould add current path if uri is relative to it' do
      page = MyResponse.new(template_this("<a href='chapter_one'>click</a>"))
      @crawler.explore('/novel', page)
      @crawler.roadmap.keys.should.include?('/novel/chapter_one')
    end

    it 'Should not duplicate entries' do
      links = <<-EOT
        <a href='/'>click</a>
        <a href='http://www.domain.com'>click</a>
      EOT
      @crawler.explore('/', MyResponse.new(template_this(links)))
      @crawler.roadmap.keys.should == ['/']
    end
  end

  describe 'launch' do
    it 'Should be reseted for next launch' do
      crawler = DiscoveryMission.new('http://www.domain.com')
      # Root page plus the single link it contains.
      crawler.launch.size.should == 2
      # After the launch only the start path is left, unvisited.
      crawler.roadmap.size.should == 1
    end
  end
end
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: discovery-mission
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Mickael Riga
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-06-02 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Discovery Mission is an easy-to-use website crawler. Use it for generating sitemaps.
23
+ email: mig@mypeplum.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - .gitignore
32
+ - MIT_LICENCE
33
+ - README.rdoc
34
+ - Rakefile
35
+ - discovery_mission.gemspec
36
+ - discovery_mission.rb
37
+ - spec.rb
38
+ has_rdoc: true
39
+ homepage: http://github.com/mig-hub/discovery_mission
40
+ licenses: []
41
+
42
+ post_install_message:
43
+ rdoc_options: []
44
+
45
+ require_paths:
46
+ - .
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ none: false
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ hash: 3
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ hash: 3
62
+ segments:
63
+ - 0
64
+ version: "0"
65
+ requirements: []
66
+
67
+ rubyforge_project:
68
+ rubygems_version: 1.3.7
69
+ signing_key:
70
+ specification_version: 3
71
+ summary: A simple website crawler
72
+ test_files:
73
+ - spec.rb