w3map 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in w3map.gemspec
4
+ gemspec
5
+
data/LICENSE.txt ADDED
@@ -0,0 +1 @@
1
+ Copyright (c) 2013 Samuel Sanchez
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # W3map
2
+
3
+ w3map is a distributed system for processing OSINT.
4
+ Open Source Intelligence (OSINT) is an information processing discipline that involves finding,
5
+ selecting, mining and acquiring information from publicly available sources and analyzing it to produce actionable intelligence/knowledge.
6
+ The term "open" refers to overt, publicly available sources (as opposed to covert or classified sources);
7
+ it is not related to open-source software.
8
+
9
+ w3map is in alpha version.
10
+ For the time being:
11
+ - you can't configure redis server
12
+ - you can't add your own processing system
13
+ - you can't use your own reporting system
14
+ - potentially you can discover bugs
15
+
16
+ ## Installation
17
+
18
+ Install it yourself as:
19
+
20
+ $ gem install w3map
21
+
22
+ ## Usage
23
+
24
+ WARNING: You must have a Redis server running on localhost.
25
+
26
+ $ export W3MAP_SESSION=<your-processing-session-name>
27
+ $ w3map push http://www.example.com
28
+
29
+ Start bot:
30
+
31
+ $ w3map bot # you can start many bots
32
+
33
+ Session information:
34
+
35
+ $ w3map queuelen # show queue length
36
+ $ w3map queue 5 # show last 5 entries in queue
37
+ $ w3map urls # show processing urls list and data store
38
+ $ w3map report ResponseCodeReport # execute standard report
39
+
40
+ ## Contributing
41
+
42
+ 1. Fork it
43
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
44
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
45
+ 4. Push to the branch (`git push origin my-new-feature`)
46
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core/rake_task'
4
+
5
+ task :default => :spec
6
+ desc 'Run tests with RSpec'
7
+ RSpec::Core::RakeTask.new(:spec)
data/bin/w3map ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+ #encoding: utf-8
3
+
4
+ $:.unshift(File.dirname(__FILE__) + "/../lib")
5
+ require 'w3map'
6
+
7
+ @session = W3map::Session.new(ENV['W3MAP_SESSION'])
8
+
9
+ # TODO: Refactor this !
10
# Resolves a report name typed by the user to a report class.
#
# Only constants defined directly inside W3map::Reports that inherit from
# W3map::Reports::Report are accepted, so arbitrary global constants
# (String, Kernel, ...) cannot be instantiated from the command line.
#
# @param name [String, nil] report class name, e.g. "ResponseCodeReport"
# @return [Class, nil] the report class, or nil when the name is not a report
def report_class_for(name)
  return nil unless name.to_s =~ /\A[A-Z]\w*\z/
  return nil unless W3map::Reports.const_defined?(name, false)
  klass = W3map::Reports.const_get(name, false)
  klass if klass.is_a?(Class) && klass <= W3map::Reports::Report
end

# Runs one w3map command against the session held in the top-level
# @session instance variable.
#
# TODO: Refactor this !
#
# @param cmd [Array<String>] command name followed by its arguments,
#   e.g. ["queue", "5"] or ["push", "http://www.example.com"]
# @return [void]
def exec_cmd(cmd)
  name, arg = cmd
  case name
  when 'flushall'
    @session.queue.flush_processing_urls
    @session.queue.flush_queue
  when 'queuelen'
    puts @session.queue.len
  when 'queue'
    # nil.to_i is 0, so a missing count is forwarded as 0.
    @session.queue.last_entries(arg.to_i).reverse.each_with_index do |url, idx|
      puts "[#{idx}] #{url}"
    end
  when 'pop'
    puts @session.queue.pop
  when 'push'
    if arg.nil? || arg.empty?
      puts "Usage: push <url>"
    else
      @session.queue.push arg
    end
  when 'bot'
    begin
      bot = W3map::Bot.new(@session)
      # TODO: load neurons
      bot.start
    rescue StandardError, ScriptError => e
      # Signals, SystemExit and other non-application exceptions are not
      # listed here and therefore propagate untouched. ScriptError is kept
      # because neurons signal "abstract method" with NotImplementedError.
      puts
      puts "-- ERROR: #{e.message}"
      puts "-- #{e.backtrace.first}"
    end
  when 'report' # TODO: load reports ...
    report_class = report_class_for(arg)
    if report_class
      report_class.new(@session).exec
    else
      puts "Unknown report: #{arg}"
    end
  when 'urls'
    @session.queue.processing_urls.each do |url|
      puts url
      @session.dstore.read_hash(url).each do |field, value|
        puts " #{field}: #{value}"
      end
    end
  else
    puts "Unknown command: #{name}"
  end
end
50
+
51
# Entry point: `-i` opens an interactive prompt on the current session;
# any other invocation runs ARGV as a single command.
if ARGV[0] == '-i'
  puts "Welcome on w3map #{W3map::VERSION} #{Time.now}"
  puts "Session: #{@session.name} loaded"
  puts
  loop do
    print "#{@session.name}>> "
    line = STDIN.gets
    # gets returns nil on end of input (Ctrl-D / closed pipe): leave the
    # prompt instead of failing on nil.strip.
    break if line.nil?
    cmd = line.strip.split(' ')
    # A blank line carries no command; just show the prompt again.
    next if cmd.empty?
    break if cmd[0] == 'quit' || cmd[0] == 'exit'
    exec_cmd cmd
  end
else
  exec_cmd ARGV
end
data/lib/ext/string.rb ADDED
@@ -0,0 +1,11 @@
1
class String
  # Returns this string transcoded to UTF-8.
  #
  # On Ruby 1.8 (no String#encode) the conversion goes through Iconv; on
  # later versions String#encode is used, which transcodes from the
  # string's current encoding. force_encoding is not used because it only
  # relabels the bytes without converting them.
  #
  # @return [String] the UTF-8 version of the receiver
  def to_utf8
    if RUBY_VERSION =~ /\A1\.8\./
      require 'iconv'
      # Iconv.conv takes (to, from, str); the target comes first.
      # NOTE(review): assumes 1.8 input is ISO-8859-1, as the encoding pair
      # used by the previous implementation suggests — confirm.
      Iconv.conv('UTF-8//TRANSLIT', 'ISO-8859-1', self)
    else
      encode('UTF-8')
    end
  end
end
data/lib/w3map.rb ADDED
@@ -0,0 +1,179 @@
1
+ require 'w3map/version'
2
+ require 'ext/string'
3
+
4
+ require 'uri'
5
+ require 'cgi'
6
+ require 'net/http'
7
+ require 'redis'
8
+ require 'nokogiri'
9
+
10
+ require 'w3map/neurons/neuron'
11
+ require 'w3map/neurons/simple_crawler'
12
+
13
+ require 'w3map/reports/report'
14
+ require 'w3map/reports/response_code_report'
15
+
16
+ module W3map
17
+ REDIS_PREFIX = "w3map"
18
# Raised by Session#initialize when the session name is nil or empty.
# Derives from StandardError (not Exception) so that a plain `rescue`
# clause catches it like any other application error.
class InvalidSessionName < StandardError; end
19
+
20
# A named working session. The name scopes the Redis-backed queue and data
# store that are created lazily on first access.
class Session
  # queue and dstore are defined explicitly below, so only :name gets a
  # generated reader (declaring all three caused method-redefinition warnings).
  attr_reader :name

  # @param name [String] session name; must be non-nil and non-empty
  # @param redis_config [Hash] options handed unchanged to the queue and
  #   data store constructors
  # @raise [InvalidSessionName] when name is nil or empty
  def initialize(name = nil, redis_config = {})
    raise InvalidSessionName if name.nil? || name.empty?
    @name = name
    @redis_config = redis_config
  end

  # @return [DistributedQueue] the session queue, created on first call
  def queue
    @queue ||= DistributedQueue.new(@name, @redis_config)
  end

  # @return [DataStore] the session data store, created on first call
  def dstore
    @dstore ||= DataStore.new(@name, @redis_config)
  end
end
34
+
35
# Per-session key/value store backed by Redis hashes. Every key is stored
# under "<REDIS_PREFIX>:<session>:dstore:<key>".
class DataStore
  # @param session_name [String] name used to namespace the Redis keys
  # @param config [Hash] options passed to Redis.new
  def initialize(session_name, config = {})
    @session_name = session_name
    @redis = Redis.new(config)
  end

  # Sets a single field of the hash stored for key.
  def save(key, field, value)
    @redis.hset schema(key), field, value
  end

  # Stores every field/value pair of hash for key.
  #
  # @param hash [Hash] fields to store; an empty hash is a no-op
  def save_hash(key, hash)
    # HMSET needs at least one field/value pair; sending none is an error.
    return if hash.empty?
    @redis.hmset schema(key), hash.flatten
  end

  # Reads a single field of the hash stored for key.
  def read(key, field)
    @redis.hget schema(key), field
  end

  # Reads the whole hash stored for key.
  def read_hash(key)
    @redis.hgetall schema(key)
  end

  # Deletes every data-store entry of this session.
  #
  # @return [Array<String>] the Redis keys that matched the session pattern
  def flush
    keys = @redis.keys(schema('*'))
    # One DEL for all keys instead of one round trip per key.
    @redis.del(*keys) unless keys.empty?
    keys
  end

  # Closes the Redis connection.
  def close
    @redis.quit
  end

  private

  # Builds the Redis key for key, or the bare data-store prefix when nil.
  def schema(key = nil)
    base = "#{REDIS_PREFIX}:#{@session_name}:dstore"
    key.nil? ? base : "#{base}:#{key}"
  end
end
63
+
64
# Redis-backed work queue for a session: a list of URLs waiting to be
# processed ("<prefix>:<session>:queue") and a set of every URL seen so far
# ("<prefix>:<session>:urls").
class DistributedQueue
  # @param session_name [String] name used to namespace the Redis keys
  # @param config [Hash] options passed to Redis.new
  def initialize(session_name, config = {})
    @session_name = session_name
    @redis = Redis.new config
  end

  # Records url in the URL set, then pushes it at the head of the queue.
  def push(url)
    add_to_processing_urls url
    @redis.lpush queue_schema, url
  end

  # Records url in the URL set without queueing it.
  def add_to_processing_urls(url)
    @redis.sadd urls_schema, url
  end

  # Closes the Redis connection.
  def close
    @redis.quit
  end

  # Returns entries from the head of the queue (where push inserts).
  #
  # @param limit [Integer] number of entries wanted; 0 yields no entries and
  #   a negative value is passed to LRANGE as the stop index (-1 = all)
  # @return [Array<String>]
  def last_entries(limit)
    # LRANGE bounds are inclusive, so "0..0" would return one entry.
    return [] if limit == 0
    stop = limit > 0 ? limit - 1 : limit
    @redis.lrange queue_schema, 0, stop
  end

  # @return [Integer] number of queued URLs
  def len
    @redis.llen queue_schema
  end

  # @return [Boolean] whether url is already in the URL set
  def already_processed?(url)
    @redis.sismember urls_schema, url
  end

  # @return [Array<String>] every URL in the URL set
  def processing_urls
    @redis.smembers(urls_schema)
  end

  # Takes the next URL from the tail of the queue using BRPOP.
  #
  # @return [String, nil] the URL part of the BRPOP reply
  def pop
    _, url = @redis.brpop queue_schema
    url
  end

  # Deletes the URL set.
  def flush_processing_urls
    @redis.del urls_schema
  end

  # Deletes the pending queue.
  def flush_queue
    @redis.del queue_schema
  end

  private

  def queue_schema
    "#{REDIS_PREFIX}:#{@session_name}:queue"
  end

  def urls_schema
    "#{REDIS_PREFIX}:#{@session_name}:urls"
  end
end
110
+
111
# Worker that pops URLs from the session queue, fetches each one and hands
# the response to its neurons before saving the collected data.
class Bot
  attr_reader :session

  # @param session [Session] session providing the queue and data store
  def initialize(session)
    @session = session
    @http_engine = HttpEngine.new(self)
    # TODO: create class NeuronStack (NeuronPipeline?)
    @neurons = [Neurons::SimpleCrawler.new(self)]
  end

  # Processes queued URLs until a signal is received.
  #
  # For every URL: fetch it, seed the data hash with the response code, let
  # each neuron add to that hash, then store the hash under the URL.
  # A SignalException ends the loop quietly (only a newline is printed).
  def start
    loop do
      puts "Bot waiting for task (#{Time.now})"
      url = @session.queue.pop
      # break if url.nil?
      puts "PROCESS #{url}"
      response = @http_engine.response_for(url)
      data = { 'response-code' => response.code }
      # TODO: use NeuronStack or other
      @neurons.each { |neuron| neuron.process url, response, data }
      @session.dstore.save_hash url, data
    end
  rescue SignalException
    puts
  end

  # Placeholder; currently does nothing.
  def stop
    # @session.close ?
  end
end
139
+
140
# Performs the HTTP GET requests for a bot.
class HttpEngine
  attr_reader :bot

  # @param bot [Bot] the bot that owns this engine
  def initialize(bot)
    @bot = bot
  end

  # Fetches url with a GET request.
  #
  # @param url [String] absolute http or https URL
  # @return [Net::HTTPResponse]
  def response_for(url)
    uri = URI.parse(url)
    connection_for(uri).request(Net::HTTP::Get.new(uri.request_uri))
  end

  private

  # Builds the (not yet opened) connection object for uri.
  # TLS is selected from the URL scheme rather than the port number, so
  # https on a non-standard port is encrypted and http on port 443 is not.
  def connection_for(uri)
    http = Net::HTTP.new(uri.host, uri.port)
    if uri.scheme.to_s.downcase == 'https'
      http.use_ssl = true
      # NOTE(review): certificate verification stays disabled, as before.
      # Responses can therefore be spoofed; switch to VERIFY_PEER if that
      # matters for the collected data.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    http
  end
end
156
+
157
# URL helpers used while crawling.
module HtmlHelpers
  # URI.escape / URI.unescape / URI.encode no longer exist on current
  # Rubies; the RFC 2396 parser object offers the same escaping rules.
  URI_PARSER = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  module_function

  # Returns url without its fragment, percent-decoded where the decoded
  # form is a valid URI and percent-encoded otherwise. Surrounding
  # whitespace (literal or encoded as %20) is dropped.
  #
  # @param url [String]
  # @return [String]
  # @raise [URI::InvalidURIError] when neither form can be parsed
  def normalize(url)
    url = url.to_utf8
    begin
      uri = URI.parse URI_PARSER.unescape(url).strip
    rescue URI::InvalidURIError
      uri = URI.parse URI_PARSER.escape(url.strip)
    end
    uri.fragment = nil
    uri.to_s
  end

  # Resolves href against url.
  #
  # @param url [String] base URL
  # @param href [String] absolute or relative reference
  # @return [String] the resulting absolute URL
  def make_absolute(url, href)
    url, href = url.to_utf8, href.to_utf8
    (URI.parse(URI_PARSER.escape(url)) + URI_PARSER.escape(href)).to_s
  end

  # Tells whether both URLs have the same host, ignoring case.
  #
  # @return [Boolean]
  def is_internal?(url1, url2)
    url1, url2 = url1.to_utf8, url2.to_utf8
    host1 = URI.parse(URI_PARSER.escape(url1)).host.to_s.downcase
    host2 = URI.parse(URI_PARSER.escape(url2)).host.to_s.downcase
    host1 == host2
  end
end
179
+ end
@@ -0,0 +1,14 @@
1
module W3map
  module Neurons
    # Base class for processing steps ("neurons") run by a bot on every
    # fetched page. Subclasses override #process.
    class Neuron
      # @param bot the bot this neuron belongs to; kept in @bot
      def initialize(bot)
        @bot = bot
      end

      # Handles one fetched page. Must be overridden.
      #
      # @param url the URL that was fetched
      # @param response the response obtained for url
      # @param data hash of values collected for url
      # @raise [NotImplementedError] always, in this base class
      def process(url, response, data)
        raise NotImplementedError.new
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ module W3map
2
+ module Neurons
3
+
4
+ class SimpleCrawler < Neuron
5
# Extracts the page title and the links of an HTML response.
#
# The title is stored in data['page-title']. Every <a href> is made
# absolute and normalized; links on the same host as url that were not seen
# before are pushed onto the session queue, all other links are only
# recorded in the session's URL set. Responses whose Content-Type does not
# contain "text/html" are ignored.
#
# @param url [String] URL the response was fetched from
# @param response the HTTP response (header and body are read)
# @param data [Hash] collected values for url; modified in place
def process(url, response, data)
  return unless response.header['Content-Type'] =~ /(.*)text\/html(.*)/

  doc = Nokogiri::HTML(response.body)
  data['page-title'] = doc.search('head/title').text || ''
  queue = @bot.session.queue

  doc.css('a').each do |a|
    href = a[:href]
    next if href.nil?

    begin
      founded_url = HtmlHelpers.normalize HtmlHelpers.make_absolute(url, href)
      internal = HtmlHelpers.is_internal?(url, founded_url)
    rescue URI::Error
      # A single malformed href must not abort the rest of the page.
      next
    end

    if internal && !queue.already_processed?(founded_url)
      queue.push founded_url
    else
      queue.add_to_processing_urls(founded_url)
    end
  end
end
21
+ end
22
+
23
+ end
24
+ end
@@ -0,0 +1,14 @@
1
module W3map
  module Reports
    # Base class for reports run against a session. Subclasses override
    # #exec.
    class Report
      # @param session the session to report on; kept in @session
      def initialize(session)
        @session = session
      end

      # Produces the report. Must be overridden.
      #
      # @raise [NotImplementedError] always, in this base class
      def exec
        raise NotImplementedError.new
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module W3map
  module Reports
    # Prints one "[<response-code>] <url>" line for every URL in the
    # session's URL set.
    class ResponseCodeReport < Report
      def exec
        @session.queue.processing_urls.each do |url|
          stored = @session.dstore.read_hash(url)
          code = stored['response-code']
          puts "[#{code}] #{url}"
        end
      end
    end

    # Prints, for every URL in the session's URL set, a blank line, the URL
    # and the page title stored for it.
    class PageTitleReport < Report
      def exec
        @session.queue.processing_urls.each do |url|
          stored = @session.dstore.read_hash(url)
          title = stored['page-title']
          puts
          puts url
          puts " ## #{title}"
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module W3map
  # Version of the w3map gem.
  VERSION = '0.0.1'
end
@@ -0,0 +1,31 @@
1
+ require 'spec_helper'
2
+
3
# Specs for the URL helpers in HtmlHelpers.
describe HtmlHelpers do
  it '.normalize' do
    normalized_url = 'http://www.w3map.org/mypage'

    # Variants that must all collapse to the same normalized URL.
    [
      'http://www.w3map.org/mypage',
      'http://www.w3map.org/mypage%23',
      'http://www.w3map.org/mypage#23',
      ' http://www.w3map.org/mypage',
      '%20http://www.w3map.org/mypage'
    ].each do |raw_url|
      HtmlHelpers.normalize(raw_url).should == normalized_url
    end

    # Spaces inside the path are percent-encoded.
    HtmlHelpers.normalize('http://www.w3map.org/images/Sublime Text 2.icns').should == 'http://www.w3map.org/images/Sublime%20Text%202.icns'

    # An encoded fragment (%23...) is dropped as well.
    HtmlHelpers.normalize('http://www.foo.com/reseaux/10/05112009/media-sociaux-entreprise-reseaux-professionnels-usage-podcast-facebook-twitter-38937-.html%23xtor=EPR-233-[HTML]-20091105')
      .should == 'http://www.foo.com/reseaux/10/05112009/media-sociaux-entreprise-reseaux-professionnels-usage-podcast-facebook-twitter-38937-.html'
  end

  it '.is_internal?' do
    initial_url = 'http://www.w3map.org'

    ['http://www.w3map.org/foo/bar', 'http://www.w3map.org/foo?bar=baz'].each do |same_host|
      HtmlHelpers.is_internal?(initial_url, same_host).should be_true
    end
    HtmlHelpers.is_internal?(initial_url, 'http://www.google.com').should be_false
  end

  it '.make_absolute' do
    initial_url = 'http://www.w3map.org'

    # The base URL is tried both without and with a trailing slash.
    [initial_url, "#{initial_url}/"].each do |base|
      HtmlHelpers.make_absolute(base, '/foo/bar').should == "#{initial_url}/foo/bar"
    end
    [initial_url, "#{initial_url}/"].each do |base|
      HtmlHelpers.make_absolute(base, 'http://www.google.com/foo/bar').should == 'http://www.google.com/foo/bar'
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'spec_helper'
2
+
3
# Specs for HttpEngine.
describe HttpEngine do
  before(:each) do
    @bot = stub(Bot)
    @http_engine = HttpEngine.new(@bot)
  end

  it 'have a bot' do
    engine_bot = @http_engine.bot
    engine_bot.should_not be_nil
    engine_bot.should be(@bot)
  end

  it '.response_for' do
    # Performs a real request against www.w3map.org.
    resp = @http_engine.response_for 'http://www.w3map.org/'
    resp.should_not be_nil
    resp.code.should == '200'
    resp.body.should match(/w3map/)
  end

  # TODO:
  # - test ssl
end
@@ -0,0 +1,5 @@
1
+ require 'rspec'
2
+
3
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
4
+ require 'w3map'
5
+ include W3map
data/w3map.gemspec ADDED
@@ -0,0 +1,28 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'w3map/version'
5
+
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "w3map"
8
+ gem.version = W3map::VERSION
9
+ gem.authors = ["Sam"]
10
+ gem.email = ["samuel@pagedegeek.com"]
11
+ gem.description = %q{w3map is a distributed system for process OSINT.
12
+ Open Source Intelligence (OSINT) is an information processing discipline that involves finding,
13
+ selecting, mining and acquiring information from publicly available sources and analyzing it to produce actionable intelligence/knowledge.
14
+ The term "open" refers to overt, publicly available sources (as opposed to covert or classified sources);
15
+ it is not related to open-source software.}
16
+ gem.summary = %q{Web scanner for gathering, mining, process, improve information open/public sources (OSINT).}
17
+ gem.homepage = "http://www.w3map.org"
18
+
19
+ gem.files = `git ls-files`.split($/)
20
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
21
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
22
+ gem.require_paths = ["lib", "bin"]
23
+
24
+ gem.add_dependency 'redis'
25
+ gem.add_dependency 'nokogiri'
26
+
27
+ gem.add_development_dependency 'rspec'
28
+ end
metadata ADDED
@@ -0,0 +1,121 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: w3map
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Sam
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: redis
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: nokogiri
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: ! "w3map is a distributed system for process OSINT.\n Open Source
63
+ Intelligence (OSINT) is an information processing discipline that involves finding,\n
64
+ \ selecting, mining and acquiring information from publicly available sources
65
+ and analyzing it to produce actionable intelligence/knowledge.\n The term \"open\"
66
+ refers to overt, publicly available sources (as opposed to covert or classified
67
+ sources);\n it is not related to open-source software."
68
+ email:
69
+ - samuel@pagedegeek.com
70
+ executables:
71
+ - w3map
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - .gitignore
76
+ - Gemfile
77
+ - LICENSE.txt
78
+ - README.md
79
+ - Rakefile
80
+ - bin/w3map
81
+ - lib/ext/string.rb
82
+ - lib/w3map.rb
83
+ - lib/w3map/neurons/neuron.rb
84
+ - lib/w3map/neurons/simple_crawler.rb
85
+ - lib/w3map/reports/report.rb
86
+ - lib/w3map/reports/response_code_report.rb
87
+ - lib/w3map/version.rb
88
+ - spec/html_helpers_spec.rb
89
+ - spec/http_engine_helpers_spec.rb
90
+ - spec/spec_helper.rb
91
+ - w3map.gemspec
92
+ homepage: http://www.w3map.org
93
+ licenses: []
94
+ post_install_message:
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ - bin
99
+ required_ruby_version: !ruby/object:Gem::Requirement
100
+ none: false
101
+ requirements:
102
+ - - ! '>='
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ required_rubygems_version: !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ requirements: []
112
+ rubyforge_project:
113
+ rubygems_version: 1.8.25
114
+ signing_key:
115
+ specification_version: 3
116
+ summary: Web scanner for gathering, mining, process, improve information open/public
117
+ sources (OSINT).
118
+ test_files:
119
+ - spec/html_helpers_spec.rb
120
+ - spec/http_engine_helpers_spec.rb
121
+ - spec/spec_helper.rb