discovery-mission 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ pkg
2
+ example.ru
data/MIT_LICENCE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2009 - 2010 Mickael Riga
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,37 @@
1
+ = Discovery Mission
2
+
3
+ A SPACE ODYSSEY
4
+
5
+ This is just a simple, basic (dummy?) and lightweight crawler.
6
+ It collects all the URLs of your website so that you can record them in a database and use them in a sitemap manager.
7
+
8
+ Just install it:
9
+
10
+ # sudo gem install discovery-mission
11
+
12
+ And then you're ready to take off:
13
+
14
+ require 'discovery_mission'
15
+
16
+ # Use a block to do something with each path
17
+ # response is a 'net/http' response object, therefore responds to :body, :code ...
18
+
19
+ DiscoveryMission.for("http://www.my-domain.com") do |url, response|
20
+ puts "Dave Bowman landed on #{url}"
21
+ case response
22
+ when Net::HTTPSuccess
23
+ puts "Fuck you HAL 9000"
24
+ when Net::HTTPRedirection
25
+ puts "Shit! That's Jupiter!"
26
+ else
27
+ puts "Bad trip indeed"
28
+ end
29
+ end
30
+
31
+ # Or call it without a block: an Array with all paths is returned
32
+
33
+ my_sitemap_entries = DiscoveryMission.for("http://www.my-domain.com")
34
+
35
+ -
36
+
37
+ Copyright (c) 2009 - 2010 Mickael Riga. See MIT_LICENCE file for details.
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
# Packaging tasks for the gem (`rake package`, `rake gem`, ...).
# Gem::PackageTask lives in RubyGems itself and must be required explicitly.
require 'rubygems/package_task'

spec = Gem::Specification.new do |s|
  # Same name as the gemspec and the README (`gem install discovery-mission`).
  s.name = 'discovery-mission'
  s.version = "0.0.1"
  s.platform = Gem::Platform::RUBY
  s.summary = "This is just a simple, basic (dummy?) and lightweight crawler."
  s.files = ['discovery_mission.rb']
  s.test_files = ['spec.rb']
  s.require_path = '.'
  s.author = "Mickael Riga"
  s.email = "mig@mypeplum.com"
  s.homepage = "http://github.com/mig-hub/discovery_mission"
end

# Also build a tarball next to the .gem file.
Gem::PackageTask.new(spec) do |pkg|
  pkg.need_tar = true
end
@@ -0,0 +1,13 @@
1
# Gem specification for discovery-mission.
Gem::Specification.new do |s|
  s.name = 'discovery-mission'
  s.version = "0.0.1"
  s.platform = Gem::Platform::RUBY
  s.summary = "A simple website crawler"
  s.description = "Discovery Mission is an easy-to-use website crawler. Use it for generating sitemaps."
  # Ship every file tracked by git; this shells out, so it must be built from a git checkout.
  s.files = `git ls-files`.split("\n").sort
  s.test_files = ['spec.rb']
  # The library file sits at the root of the gem rather than under lib/.
  s.require_path = '.'
  s.author = "Mickael Riga"
  s.email = "mig@mypeplum.com"
  s.homepage = "http://github.com/mig-hub/discovery_mission"
end
@@ -0,0 +1,56 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
# A minimal same-host website crawler.
#
# Starting from the path of the given URL, it fetches every page it knows
# about, collects the links found in anchor tags and queues the paths that
# belong to the same host, until no unvisited path is left.
class DiscoveryMission
  # Captures the href value of an anchor tag, wherever the attribute sits
  # inside the tag (attributes such as data-href are not matched).
  HREF_PATTERN = /<a\s(?:[^>]*?\s)?href\s*=\s*["']([^"']+)["']/i

  # Hash of path => visited flag (false until the path has been fetched).
  attr_accessor :roadmap

  # Builds a mission for +domain+ and launches it right away.
  # The optional block is forwarded to #launch.
  # Returns the Array of paths that were discovered.
  def self.for(domain, &block)
    new(domain).launch(&block)
  end

  # domain - String URL of the site to crawl, e.g. "http://www.my-domain.com".
  # An empty path is normalised to "/". Prints a banner on stdout.
  def initialize(domain)
    @domain = URI(domain)
    @domain.path = "/" if @domain.path == ""
    reset
    puts "\nDiscovery Mission Planned for #{@domain}\nEverything is under control Dave\n\n"
  end

  # Visits every path of the roadmap until none is left unvisited.
  # Yields each path with its response (nil when the request failed) if a
  # block is given. Returns the Array of all known paths and resets the
  # roadmap so that the instance can be launched again.
  def launch
    while (destination = @roadmap.key(false))
      response = land_on(destination)
      yield(destination, response) if block_given?
      explore(destination, response)
      @roadmap[destination] = true
    end
    uri_list = @roadmap.keys
    reset
    uri_list
  end

  # Forgets everything but the starting path.
  def reset
    @roadmap = {@domain.path => false}
  end

  # Fetches +destination+ (a path) using the scheme, host and port of the
  # start URL. Returns the Net::HTTP response, or nil when the request
  # failed; the error is reported on stdout.
  def land_on(destination)
    target = @domain.dup
    target.path = destination
    target.query = nil
    target.fragment = nil
    Net::HTTP.get_response(target)
  rescue StandardError => e
    puts "Error: #{e}"
    nil
  end

  # Scans the body of +response+ for anchors and adds every new same-host
  # path to the roadmap as unvisited. Links that are not rooted are appended
  # to +destination+. A nil response or a nil body is ignored.
  def explore(destination, response)
    html = response && response.body
    return if html.nil?

    base = destination.end_with?('/') ? destination : "#{destination}/"
    html.scan(HREF_PATTERN) do |match|
      # scan yields an Array of captures: take the String, not the Array.
      path = local_path(match.first.strip, base)
      @roadmap[path] = false if path && !@roadmap.key?(path)
    end
  end

  private

  # Turns an href into a path worth visiting, or nil when the link must be
  # skipped: unparsable URI, other host, scheme without a host (mailto:,
  # javascript:), or no path at all (pure fragment or query).
  def local_path(href, base)
    uri = URI(href)
    # Covers absolute links as well as protocol-relative ones (//host/path).
    return nil if uri.host && uri.host != @domain.host
    return nil if uri.absolute? && uri.host.nil?

    path = uri.path
    return nil if path.nil? || path.empty? || path.start_with?('javascript')

    path.start_with?('/') ? path : base + path
  rescue URI::Error
    nil
  end
end
data/spec.rb ADDED
@@ -0,0 +1,65 @@
1
+ require 'rubygems'
2
+ require 'bacon'
3
+ Bacon.summary_on_exit
4
+
5
+ MyResponse = Struct::new(:body)
6
+
7
# Wraps +content+ in a minimal, well-formed HTML page used as a fixture.
def template_this(content)
  "<html><head><title></title></head><body>#{content}</body></html>"
end
10
+
11
+ require File.expand_path('./discovery_mission', File.dirname(__FILE__))
12
# Test double: replaces the network call with canned pages.
class DiscoveryMission
  attr_reader :roadmap

  # The root page links to /good/destination; every other page is empty.
  def land_on(destination)
    body = if destination == '/'
      "<html><head><title></title</head><body><a href='/good/destination'>click</a></body></html>"
    else
      ''
    end
    MyResponse.new(body)
  end
end
18
+
19
# Bacon specs for DiscoveryMission. land_on is stubbed earlier in this file,
# so nothing here touches the network.
describe DiscoveryMission do
  describe 'explore' do
    # Every example starts from a fresh mission whose roadmap only holds "/".
    before do
      @dm = DiscoveryMission.new('http://www.domain.com')
    end
    it 'Should add correct destinations to roadmap' do
      html = template_this("<a href='/good/destination'>click</a>")
      @dm.explore("/", MyResponse.new(html))
      @dm.roadmap.keys.should.include?('/good/destination')
    end
    # A link with a scheme but no host must be ignored, not blow up.
    it 'Should not raise when bad URI' do
      html = template_this("<a href='http://'>click</a>")
      @dm.explore("/", MyResponse.new(html))
      @dm.roadmap.keys.should==["/"]
    end
    # Other hosts, pure fragments and javascript: links are not destinations.
    it 'Should skip pointless uri' do
      html = template_this(<<-EOT)
        <a href='http://www.another_domain.com'>click</a>
        <a href='http://www.another_domain.com/with/path'>click</a>
        <a href='#'>click</a>
        <a href='javascript:void(0);'>click</a>
      EOT
      @dm.explore("/", MyResponse.new(html))
      @dm.roadmap.keys.should==["/"]
    end
    # A link that is not rooted is appended to the path being explored.
    it 'Sould add current path if uri is relative to it' do
      html = template_this("<a href='chapter_one'>click</a>")
      @dm.explore("/novel", MyResponse.new(html))
      @dm.roadmap.keys.should.include?("/novel/chapter_one")
    end
    # "/" is already on the roadmap; the bare domain URL has an empty path.
    it 'Should not duplicate entries' do
      html = template_this(<<-EOT)
        <a href='/'>click</a>
        <a href='http://www.domain.com'>click</a>
      EOT
      @dm.explore("/", MyResponse.new(html))
      @dm.roadmap.keys.should==["/"]
    end
  end
  describe 'launch' do
    # The stubbed root page links to one other page, hence two paths found;
    # afterwards the roadmap is back to the single starting path.
    it 'Should be reseted for next launch' do
      dm = DiscoveryMission.new('http://www.domain.com')
      dm.launch.size.should==2
      dm.roadmap.size.should==1
    end
  end
end
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: discovery-mission
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Mickael Riga
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-06-02 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Discovery Mission is an easy-to-use website crawler. Use it for generating sitemaps.
23
+ email: mig@mypeplum.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - .gitignore
32
+ - MIT_LICENCE
33
+ - README.rdoc
34
+ - Rakefile
35
+ - discovery_mission.gemspec
36
+ - discovery_mission.rb
37
+ - spec.rb
38
+ has_rdoc: true
39
+ homepage: http://github.com/mig-hub/discovery_mission
40
+ licenses: []
41
+
42
+ post_install_message:
43
+ rdoc_options: []
44
+
45
+ require_paths:
46
+ - .
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ none: false
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ hash: 3
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ hash: 3
62
+ segments:
63
+ - 0
64
+ version: "0"
65
+ requirements: []
66
+
67
+ rubyforge_project:
68
+ rubygems_version: 1.3.7
69
+ signing_key:
70
+ specification_version: 3
71
+ summary: A simple website crawler
72
+ test_files:
73
+ - spec.rb