w3map 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in w3map.gemspec
4
+ gemspec
5
+
data/LICENSE.txt ADDED
@@ -0,0 +1 @@
1
+ Copyright (c) 2013 Samuel Sanchez
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # W3map
2
+
3
+ w3map is a distributed system for processing OSINT.
4
+ Open Source Intelligence (OSINT) is an information processing discipline that involves finding,
5
+ selecting, mining and acquiring information from publicly available sources and analyzing it to produce actionable intelligence/knowledge.
6
+ The term "open" refers to overt, publicly available sources (as opposed to covert or classified sources);
7
+ it is not related to open-source software.
8
+
9
+ w3map is in alpha version.
10
+ For the time being:
11
+ - you can't configure redis server
12
+ - you can't add your own processing system
13
+ - you can't use your own reporting system
14
+ - potentially you can discover bugs
15
+
16
+ ## Installation
17
+
18
+ Install it yourself as:
19
+
20
+ $ gem install w3map
21
+
22
+ ## Usage
23
+
24
+ WARNING: A Redis server must be running on localhost.
25
+
26
+ $ export W3MAP_SESSION=<your-processing-session-name>
27
+ $ w3map push http://www.example.com
28
+
29
+ Start bot:
30
+
31
+ $ w3map bot # you can start many bots
32
+
33
+ Session information:
34
+
35
+ $ w3map queuelen # show queue length
36
+ $ w3map queue 5 # show last 5 entries in queue
37
+ $ w3map urls # show processing urls list and data store
38
+ $ w3map report ResponseCodeReport # execute standard report
39
+
40
+ ## Contributing
41
+
42
+ 1. Fork it
43
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
44
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
45
+ 4. Push to the branch (`git push origin my-new-feature`)
46
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core/rake_task'
4
+
5
+ task :default => :spec
6
+ desc 'Run tests with RSpec'
7
+ RSpec::Core::RakeTask.new(:spec)
data/bin/w3map ADDED
@@ -0,0 +1,63 @@
1
#!/usr/bin/env ruby
#encoding: utf-8

# Command-line front end for w3map.
#
#   w3map <command> [argument]   run a single command
#   w3map -i                     start an interactive prompt
#
# The session name is read from the W3MAP_SESSION environment variable.

$:.unshift(File.dirname(__FILE__) + "/../lib")
require 'w3map'

begin
  @session = W3map::Session.new(ENV['W3MAP_SESSION'])
rescue W3map::InvalidSessionName
  # Fail with a readable message instead of an uncaught-exception backtrace.
  abort "W3MAP_SESSION must be set to a non-empty session name"
end

# Looks up +name+ among the constants defined directly in W3map::Reports.
#
# Returns the class only when it is a strict subclass of
# W3map::Reports::Report, otherwise nil. This keeps user input from
# resolving to arbitrary top-level constants through const_get.
def find_report(name)
  return nil if name.nil?
  return nil unless W3map::Reports.constants.map { |c| c.to_s }.include?(name)

  klass = W3map::Reports.const_get(name)
  if klass.is_a?(Class) && klass < W3map::Reports::Report
    klass
  else
    nil
  end
end

# Executes one command.
#
# cmd - Array of Strings: the command name followed by its optional argument.
#
# TODO: Refactor this !
def exec_cmd(cmd)
  case cmd[0]
  when 'flushall'
    @session.queue.flush_processing_urls
    @session.queue.flush_queue
  when 'queuelen'
    puts @session.queue.len
  when 'queue'
    @session.queue.last_entries(cmd[1].to_i).reverse.each_with_index do |url, idx|
      puts "[#{idx}] #{url}"
    end
  when 'pop'
    puts @session.queue.pop
  when 'push'
    if cmd[1].nil?
      puts "Usage: push <url>"
    else
      @session.queue.push cmd[1]
    end
  when 'bot'
    begin
      bot = W3map::Bot.new(@session)
      # TODO: load neurons
      bot.start
    rescue StandardError => e
      # Signals and other non-StandardError exceptions propagate untouched.
      puts
      puts "-- ERROR: #{e.message}"
      puts "-- #{e.backtrace.first}"
    end
  when 'report' # TODO: load reports ...
    report = find_report(cmd[1])
    if report
      report.new(@session).exec
    else
      puts "Unknown report: #{cmd[1]}"
    end
  when 'urls'
    @session.queue.processing_urls.each do |url|
      puts url
      @session.dstore.read_hash(url).each do |field, value|
        puts " #{field}: #{value}"
      end
    end
  else
    puts "Unknown command: #{cmd[0]}"
  end
end

if ARGV[0] == '-i'
  puts "Welcome on w3map #{W3map::VERSION} #{Time.now}"
  puts "Session: #{@session.name} loaded"
  puts
  loop do
    print "#{@session.name}>> "
    line = STDIN.gets
    break if line.nil? # EOF: gets returns nil, so leave the prompt cleanly

    cmd = line.strip.split(' ')
    next if cmd.empty? # blank line: just show the prompt again
    break if cmd[0] == 'quit' || cmd[0] == 'exit'
    exec_cmd cmd
  end
else
  exec_cmd ARGV
end
data/lib/ext/string.rb ADDED
@@ -0,0 +1,11 @@
1
class String
  # Converts the receiver's encoding.
  #
  # On Ruby 1.8 the conversion is delegated to Iconv; on every other
  # version the result of encode('UTF-8') is returned.
  #
  # NOTE(review): Iconv.conv takes (to, from, str), so the 1.8 branch looks
  # like it converts UTF-8 into ISO-8859-1 rather than the reverse - confirm
  # the intent before relying on it.
  def to_utf8
    #why not self.force_encoding('UTF-8') ???
    return encode('UTF-8') unless RUBY_VERSION.to_f == 1.8

    require 'iconv'
    Iconv.conv('ISO-8859-1//TRANSLIT', 'utf-8', self)
  end
end
data/lib/w3map.rb ADDED
@@ -0,0 +1,179 @@
1
+ require 'w3map/version'
2
+ require 'ext/string'
3
+
4
+ require 'uri'
5
+ require 'cgi'
6
+ require 'net/http'
7
+ require 'redis'
8
+ require 'nokogiri'
9
+
10
+ require 'w3map/neurons/neuron'
11
+ require 'w3map/neurons/simple_crawler'
12
+
13
+ require 'w3map/reports/report'
14
+ require 'w3map/reports/response_code_report'
15
+
16
+ module W3map
17
+ REDIS_PREFIX = "w3map"
18
+ class InvalidSessionName < Exception; end
19
+
20
+ class Session
21
+ attr_reader :name, :queue, :dstore
22
+ def initialize(name=nil, redis_config={})
23
+ raise InvalidSessionName if name.nil? || name.length == 0
24
+ @name = name
25
+ @redis_config = redis_config
26
+ end
27
+ def queue
28
+ @queue ||= DistributedQueue.new(@name, @redis_config)
29
+ end
30
+ def dstore
31
+ @dstore ||= DataStore.new(@name, @redis_config)
32
+ end
33
+ end
34
+
35
+ class DataStore
36
+ def initialize(session_name, config={})
37
+ @session_name = session_name
38
+ @redis = Redis.new(config)
39
+ end
40
+ def save(key, field, value)
41
+ @redis.hset schema(key), field, value
42
+ end
43
+ def save_hash(key, hash)
44
+ @redis.hmset schema(key), hash.flatten
45
+ end
46
+ def read(key, field)
47
+ @redis.hget schema(key), field
48
+ end
49
+ def read_hash(key)
50
+ @redis.hgetall schema(key)
51
+ end
52
+ def flush
53
+ @redis.keys(schema('*')).each { |key| @redis.del key }
54
+ end
55
+ def close
56
+ @redis.quit
57
+ end
58
+ private
59
+ def schema(key=nil)
60
+ key.nil? ? "#{REDIS_PREFIX}:#{@session_name}:dstore" : "#{REDIS_PREFIX}:#{@session_name}:dstore:#{key}"
61
+ end
62
+ end
63
+
64
+ class DistributedQueue
65
+ def initialize(session_name, config={})
66
+ @session_name = session_name
67
+ @redis = Redis.new config
68
+ end
69
+ def push(url)
70
+ add_to_processing_urls url
71
+ @redis.lpush queue_schema, url
72
+ end
73
+ def add_to_processing_urls(url)
74
+ @redis.sadd urls_schema, url
75
+ end
76
+ def close
77
+ @redis.quit
78
+ end
79
+ def last_entries(limit)
80
+ limit = limit + -1 if limit > 0
81
+ @redis.lrange queue_schema, 0, limit
82
+ end
83
+ def len
84
+ @redis.llen queue_schema
85
+ end
86
+ def already_processed?(url)
87
+ @redis.sismember urls_schema, url
88
+ end
89
+ def processing_urls
90
+ @redis.smembers(urls_schema)
91
+ end
92
+ def pop
93
+ _, url = @redis.brpop queue_schema
94
+ url
95
+ end
96
+ def flush_processing_urls
97
+ @redis.del urls_schema
98
+ end
99
+ def flush_queue
100
+ @redis.del queue_schema
101
+ end
102
+ private
103
+ def queue_schema
104
+ "#{REDIS_PREFIX}:#{@session_name}:queue"
105
+ end
106
+ def urls_schema
107
+ "#{REDIS_PREFIX}:#{@session_name}:urls"
108
+ end
109
+ end
110
+
111
+ class Bot
112
+ attr_reader :session
113
+ def initialize(session)
114
+ @session = session
115
+ @http_engine = HttpEngine.new(self)
116
+ # TODO: create class NeuronStack (NeuronPipeline?)
117
+ @neurons = []
118
+ @neurons << Neurons::SimpleCrawler.new(self)
119
+ end
120
+ def start
121
+ loop do
122
+ puts "Bot waiting for task (#{Time.now})"
123
+ url = @session.queue.pop
124
+ # break if url.nil?
125
+ puts "PROCESS #{url}"
126
+ response = @http_engine.response_for(url)
127
+ # save url, 'response-code', response.code
128
+ data = {'response-code' => response.code}
129
+ @neurons.each { |neuron| neuron.process url, response, data } #TODO: use NeuronStack or other
130
+ @session.dstore.save_hash url, data
131
+ end
132
+ rescue SignalException => e
133
+ puts
134
+ end
135
+ def stop
136
+ # @session.close ?
137
+ end
138
+ end
139
+
140
+ class HttpEngine
141
+ attr_reader :bot
142
+ def initialize(bot)
143
+ @bot = bot
144
+ end
145
+ def response_for(url)
146
+ uri = URI.parse(url)
147
+ http = Net::HTTP.new(uri.host, uri.port)
148
+ if uri.port == 443
149
+ http.use_ssl = true
150
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
151
+ end
152
+ get = Net::HTTP::Get.new(uri.request_uri)
153
+ http.request(get)
154
+ end
155
+ end
156
+
157
+ module HtmlHelpers
158
+ module_function
159
+ def normalize(url)
160
+ url = url.to_utf8
161
+ begin
162
+ uri = URI.parse URI.unescape(url)
163
+ rescue URI::InvalidURIError
164
+ uri = URI.parse URI.escape(url)
165
+ end
166
+ uri.fragment = nil
167
+ uri.to_s
168
+ end
169
+ def make_absolute(url, href)
170
+ url, href = url.to_utf8, href.to_utf8
171
+ a = URI.parse(URI.encode(url)) + URI.encode(href)
172
+ a.to_s
173
+ end
174
+ def is_internal?(url1, url2)
175
+ url1, url2 = url1.to_utf8, url2.to_utf8
176
+ URI.parse(URI.encode(url1)).host.to_s.downcase == URI.parse(URI.encode(url2)).host.to_s.downcase
177
+ end
178
+ end
179
+ end
@@ -0,0 +1,14 @@
1
module W3map
  module Neurons

    # Base class for processing steps applied to each fetched URL.
    class Neuron
      # bot - the object this neuron works for; stored in @bot.
      def initialize(bot)
        @bot = bot
      end

      # Hook for subclasses; the base implementation always raises.
      #
      # url      - the URL being processed.
      # response - the response obtained for url.
      # data     - hash passed in by the caller for this URL.
      #
      # Raises NotImplementedError naming the class that lacks an override.
      def process(url, response, data)
        raise NotImplementedError, "#{self.class.name}#process is not implemented"
      end
    end

  end
end
@@ -0,0 +1,24 @@
1
module W3map
  module Neurons

    # Extracts the page title and the links of HTML responses.
    class SimpleCrawler < Neuron
      # Parses an HTML response, stores its title in data['page-title'] and
      # walks over every <a href>. Same-host links that are not yet recorded
      # are pushed onto the queue; every other link is only recorded in the
      # URL set. Responses whose Content-Type lacks "text/html" are ignored.
      def process(url, response, data)
        return unless response.header['Content-Type'] =~ %r{text/html}

        doc = Nokogiri::HTML(response.body)
        data['page-title'] = doc.search('head/title').text || ''
        queue = @bot.session.queue
        doc.css('a').each do |a|
          next if a[:href].nil?

          begin
            found_url = HtmlHelpers.normalize HtmlHelpers.make_absolute(url, a[:href])
            if HtmlHelpers.is_internal?(url, found_url) && !queue.already_processed?(found_url)
              queue.push found_url
            else
              queue.add_to_processing_urls(found_url)
            end
          rescue URI::Error
            # A single malformed href must not abort the rest of the page.
            next
          end
        end
      end
    end

  end
end
@@ -0,0 +1,14 @@
1
module W3map
  module Reports

    # Base class for reports run against a session.
    class Report
      # session - stored in @session for use by subclasses.
      def initialize(session)
        @session = session
      end

      # Hook for subclasses; the base implementation always raises.
      #
      # Raises NotImplementedError naming the class that lacks an override.
      def exec
        raise NotImplementedError, "#{self.class.name}#exec is not implemented"
      end
    end

  end
end
@@ -0,0 +1,25 @@
1
module W3map
  module Reports

    # Prints "[<response-code>] <url>" for every URL recorded in the session.
    class ResponseCodeReport < Report
      def exec
        @session.queue.processing_urls.each do |url|
          stored = @session.dstore.read_hash(url)
          code = stored['response-code']
          puts "[#{code}] #{url}"
        end
      end
    end

    # Prints each recorded URL followed by its stored page title.
    class PageTitleReport < Report
      def exec
        @session.queue.processing_urls.each do |url|
          title = @session.dstore.read_hash(url)['page-title']
          puts
          puts url
          puts " ## #{title}"
        end
      end
    end

  end
end
@@ -0,0 +1,3 @@
1
module W3map
  # Version string of the gem.
  VERSION = '0.0.1'
end
@@ -0,0 +1,31 @@
1
require 'spec_helper'

# Written with the expect syntax: `should`, `be_true` and `be_false` are no
# longer available by default in RSpec 3, and the gemspec does not pin rspec.
describe HtmlHelpers do

  it '.normalize' do
    normalized_url = 'http://www.w3map.org/mypage'
    expect(HtmlHelpers.normalize('http://www.w3map.org/mypage')).to eq(normalized_url)
    expect(HtmlHelpers.normalize('http://www.w3map.org/mypage%23')).to eq(normalized_url)
    expect(HtmlHelpers.normalize('http://www.w3map.org/mypage#23')).to eq(normalized_url)
    expect(HtmlHelpers.normalize(' http://www.w3map.org/mypage')).to eq(normalized_url)
    expect(HtmlHelpers.normalize('%20http://www.w3map.org/mypage')).to eq(normalized_url)
    expect(HtmlHelpers.normalize('http://www.w3map.org/images/Sublime Text 2.icns'))
      .to eq('http://www.w3map.org/images/Sublime%20Text%202.icns')
    expect(HtmlHelpers.normalize('http://www.foo.com/reseaux/10/05112009/media-sociaux-entreprise-reseaux-professionnels-usage-podcast-facebook-twitter-38937-.html%23xtor=EPR-233-[HTML]-20091105'))
      .to eq('http://www.foo.com/reseaux/10/05112009/media-sociaux-entreprise-reseaux-professionnels-usage-podcast-facebook-twitter-38937-.html')
  end

  it '.is_internal?' do
    initial_url = 'http://www.w3map.org'
    expect(HtmlHelpers.is_internal?(initial_url, 'http://www.w3map.org/foo/bar')).to be(true)
    expect(HtmlHelpers.is_internal?(initial_url, 'http://www.w3map.org/foo?bar=baz')).to be(true)
    expect(HtmlHelpers.is_internal?(initial_url, 'http://www.google.com')).to be(false)
  end

  it '.make_absolute' do
    initial_url = 'http://www.w3map.org'
    expect(HtmlHelpers.make_absolute(initial_url, '/foo/bar')).to eq("#{initial_url}/foo/bar")
    expect(HtmlHelpers.make_absolute("#{initial_url}/", '/foo/bar')).to eq("#{initial_url}/foo/bar")
    expect(HtmlHelpers.make_absolute(initial_url, 'http://www.google.com/foo/bar')).to eq('http://www.google.com/foo/bar')
    expect(HtmlHelpers.make_absolute("#{initial_url}/", 'http://www.google.com/foo/bar')).to eq('http://www.google.com/foo/bar')
  end
end
@@ -0,0 +1,21 @@
1
require 'spec_helper'

# Written with `double` and the expect syntax: `stub(...)` and `should` are
# no longer available by default in RSpec 3.
describe HttpEngine do
  before(:each) do
    @bot = double('Bot')
    @http_engine = HttpEngine.new(@bot)
  end
  it 'have a bot' do
    expect(@http_engine.bot).not_to be_nil
    expect(@http_engine.bot).to be(@bot)
  end
  # NOTE: this example performs a real HTTP request, so it needs network access.
  it '.response_for' do
    resp = @http_engine.response_for 'http://www.w3map.org/'
    expect(resp).not_to be_nil
    expect(resp.code).to eq(200.to_s)
    expect(resp.body).to match(/w3map/)
  end

  # TODO:
  # - test ssl
end
@@ -0,0 +1,5 @@
1
+ require 'rspec'
2
+
3
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
4
+ require 'w3map'
5
+ include W3map
data/w3map.gemspec ADDED
@@ -0,0 +1,28 @@
1
# -*- encoding: utf-8 -*-
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'w3map/version'

Gem::Specification.new do |gem|
  gem.name          = "w3map"
  gem.version       = W3map::VERSION
  gem.authors       = ["Sam"]
  gem.email         = ["samuel@pagedegeek.com"]
  gem.description   = %q{w3map is a distributed system for process OSINT.
  Open Source Intelligence (OSINT) is an information processing discipline that involves finding,
  selecting, mining and acquiring information from publicly available sources and analyzing it to produce actionable intelligence/knowledge.
  The term "open" refers to overt, publicly available sources (as opposed to covert or classified sources);
  it is not related to open-source software.}
  gem.summary       = %q{Web scanner for gathering, mining, process, improve information open/public sources (OSINT).}
  gem.homepage      = "http://www.w3map.org"

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  # Only lib/ goes on the load path; bin/ holds the executable listed in
  # gem.executables and contains nothing to require.
  gem.require_paths = ["lib"]

  gem.add_dependency 'redis'
  gem.add_dependency 'nokogiri'

  gem.add_development_dependency 'rspec'
end
metadata ADDED
@@ -0,0 +1,121 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: w3map
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Sam
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: redis
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: nokogiri
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: ! "w3map is a distributed system for process OSINT.\n Open Source
63
+ Intelligence (OSINT) is an information processing discipline that involves finding,\n
64
+ \ selecting, mining and acquiring information from publicly available sources
65
+ and analyzing it to produce actionable intelligence/knowledge.\n The term \"open\"
66
+ refers to overt, publicly available sources (as opposed to covert or classified
67
+ sources);\n it is not related to open-source software."
68
+ email:
69
+ - samuel@pagedegeek.com
70
+ executables:
71
+ - w3map
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - .gitignore
76
+ - Gemfile
77
+ - LICENSE.txt
78
+ - README.md
79
+ - Rakefile
80
+ - bin/w3map
81
+ - lib/ext/string.rb
82
+ - lib/w3map.rb
83
+ - lib/w3map/neurons/neuron.rb
84
+ - lib/w3map/neurons/simple_crawler.rb
85
+ - lib/w3map/reports/report.rb
86
+ - lib/w3map/reports/response_code_report.rb
87
+ - lib/w3map/version.rb
88
+ - spec/html_helpers_spec.rb
89
+ - spec/http_engine_helpers_spec.rb
90
+ - spec/spec_helper.rb
91
+ - w3map.gemspec
92
+ homepage: http://www.w3map.org
93
+ licenses: []
94
+ post_install_message:
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ - bin
99
+ required_ruby_version: !ruby/object:Gem::Requirement
100
+ none: false
101
+ requirements:
102
+ - - ! '>='
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ required_rubygems_version: !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ requirements: []
112
+ rubyforge_project:
113
+ rubygems_version: 1.8.25
114
+ signing_key:
115
+ specification_version: 3
116
+ summary: Web scanner for gathering, mining, process, improve information open/public
117
+ sources (OSINT).
118
+ test_files:
119
+ - spec/html_helpers_spec.rb
120
+ - spec/http_engine_helpers_spec.rb
121
+ - spec/spec_helper.rb