sledgehammer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +4 -0
- data/.rspec +2 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +79 -0
- data/LICENSE +21 -0
- data/README.md +61 -0
- data/Rakefile +10 -0
- data/coverage/.last_run.json +5 -0
- data/db/migrate/20140626075744_create_pages.rb +12 -0
- data/db/migrate/20140626080142_create_contacts.rb +9 -0
- data/db/migrate/20140626105612_create_websites.rb +9 -0
- data/db/migrate/20140704070249_create_page_contacts.rb +8 -0
- data/lib/generators/sledgehammer/USAGE +11 -0
- data/lib/generators/sledgehammer/install_generator.rb +16 -0
- data/lib/sledgehammer.rb +11 -0
- data/lib/sledgehammer/models/contact.rb +6 -0
- data/lib/sledgehammer/models/page.rb +12 -0
- data/lib/sledgehammer/models/page_contact.rb +4 -0
- data/lib/sledgehammer/models/website.rb +4 -0
- data/lib/sledgehammer/version.rb +3 -0
- data/lib/sledgehammer/workers/crawl_worker.rb +113 -0
- data/sledgehammer.gemspec +29 -0
- data/spec/fixtures/example2_com.html +13 -0
- data/spec/fixtures/example_com.html +21 -0
- data/spec/fixtures/example_com_testing.html +13 -0
- data/spec/models/contact_spec.rb +3 -0
- data/spec/models/page_contact.rb +3 -0
- data/spec/models/page_spec.rb +3 -0
- data/spec/models/website_spec.rb +3 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/support/active_record_helper.rb +13 -0
- data/spec/workers/crawl_worker_spec.rb +33 -0
- metadata +201 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8391efc4413b6922054b8531950a5412f3d95429
|
4
|
+
data.tar.gz: 1b8b0b2301cfc4ee217d92c208bc4cd88ecde5f2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 398055d333de9da4f07d110dd602c9ab66d78fd4ddbfdcd4e5ca0bdea92954da343d96f7ba5fb5c3e83b511179a0e6ddf8449ef5011cb885fa4ac4366be3a61e
|
7
|
+
data.tar.gz: 055c2d8e2852fa8dd605c2236b8e962373be94f2022d359f021dbf09d4e84a6aea393c65b95f1ef3b7d31f49084bde26a45bb40ceccc6a4d30b3a6d290d347fa
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
sledgehammer (0.1.0)
|
5
|
+
activerecord (~> 4.1)
|
6
|
+
sidekiq (~> 3.1)
|
7
|
+
typhoeus (~> 0.6)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
activemodel (4.1.4)
|
13
|
+
activesupport (= 4.1.4)
|
14
|
+
builder (~> 3.1)
|
15
|
+
activerecord (4.1.4)
|
16
|
+
activemodel (= 4.1.4)
|
17
|
+
activesupport (= 4.1.4)
|
18
|
+
arel (~> 5.0.0)
|
19
|
+
activesupport (4.1.4)
|
20
|
+
i18n (~> 0.6, >= 0.6.9)
|
21
|
+
json (~> 1.7, >= 1.7.7)
|
22
|
+
minitest (~> 5.1)
|
23
|
+
thread_safe (~> 0.1)
|
24
|
+
tzinfo (~> 1.1)
|
25
|
+
arel (5.0.1.20140414130214)
|
26
|
+
builder (3.2.2)
|
27
|
+
celluloid (0.15.2)
|
28
|
+
timers (~> 1.1.0)
|
29
|
+
connection_pool (2.0.0)
|
30
|
+
diff-lcs (1.2.5)
|
31
|
+
ethon (0.7.1)
|
32
|
+
ffi (>= 1.3.0)
|
33
|
+
ffi (1.9.3)
|
34
|
+
i18n (0.6.11)
|
35
|
+
json (1.8.1)
|
36
|
+
minitest (5.4.0)
|
37
|
+
rake (10.3.2)
|
38
|
+
redis (3.1.0)
|
39
|
+
redis-namespace (1.5.0)
|
40
|
+
redis (~> 3.0, >= 3.0.4)
|
41
|
+
rspec (3.0.0)
|
42
|
+
rspec-core (~> 3.0.0)
|
43
|
+
rspec-expectations (~> 3.0.0)
|
44
|
+
rspec-mocks (~> 3.0.0)
|
45
|
+
rspec-core (3.0.2)
|
46
|
+
rspec-support (~> 3.0.0)
|
47
|
+
rspec-expectations (3.0.2)
|
48
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
49
|
+
rspec-support (~> 3.0.0)
|
50
|
+
rspec-mocks (3.0.2)
|
51
|
+
rspec-support (~> 3.0.0)
|
52
|
+
rspec-sidekiq (1.0.0)
|
53
|
+
rspec (>= 2.0.0)
|
54
|
+
sidekiq (>= 2.4.0)
|
55
|
+
rspec-support (3.0.2)
|
56
|
+
sidekiq (3.2.1)
|
57
|
+
celluloid (>= 0.15.2)
|
58
|
+
connection_pool (>= 2.0.0)
|
59
|
+
json
|
60
|
+
redis (>= 3.0.6)
|
61
|
+
redis-namespace (>= 1.3.1)
|
62
|
+
sqlite3 (1.3.9)
|
63
|
+
thread_safe (0.3.4)
|
64
|
+
timers (1.1.0)
|
65
|
+
typhoeus (0.6.9)
|
66
|
+
ethon (>= 0.7.1)
|
67
|
+
tzinfo (1.2.1)
|
68
|
+
thread_safe (~> 0.1)
|
69
|
+
|
70
|
+
PLATFORMS
|
71
|
+
ruby
|
72
|
+
|
73
|
+
DEPENDENCIES
|
74
|
+
bundler (~> 1.5)
|
75
|
+
rake (~> 10.0)
|
76
|
+
rspec (~> 3.0)
|
77
|
+
rspec-sidekiq (~> 1.0.0)
|
78
|
+
sledgehammer!
|
79
|
+
sqlite3
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2014 Growth Republic Ltd.
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# Sledgehammer
|
2
|
+
|
3
|
+
Sledgehammer is a gem that crawls websites in search of email addresses.
|
4
|
+
It uses Typhoeus and Sidekiq to spawn ultra-fast workers which gather data in no time.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Include the gem in your Gemfile
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
gem "sledgehammer"
|
12
|
+
```
|
13
|
+
|
14
|
+
Bundle the Gemfile
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
bundle install
|
18
|
+
```
|
19
|
+
|
20
|
+
Run the install generator, which copies the gem's migration files into your application.
|
21
|
+
|
22
|
+
```ruby
|
23
|
+
bundle exec rails generate sledgehammer:install
|
24
|
+
```
|
25
|
+
|
26
|
+
Migrate your database
|
27
|
+
|
28
|
+
```ruby
|
29
|
+
bundle exec rake db:migrate
|
30
|
+
```
|
31
|
+
|
32
|
+
## Setup
|
33
|
+
|
34
|
+
Be careful when using this gem in an application backed by an SQLite3 database.
|
35
|
+
Due to the multi-threaded nature of the gem, you will be greeted with "SQLite3::BusyException: database is locked" errors.
|
36
|
+
PostgreSQL, MySQL or MongoDB should be just fine.
|
37
|
+
|
38
|
+
## Usage
|
39
|
+
|
40
|
+
Run the Sidekiq worker from your code:
|
41
|
+
|
42
|
+
|
43
|
+
```ruby
|
44
|
+
Sledgehammer::CrawlWorker.perform_async ARRAY_OF_URLS, [OPTIONS]
|
45
|
+
```
|
46
|
+
|
47
|
+
Here is sample usage:
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
Sledgehammer::CrawlWorker.perform_async ['http://example.com'], { depth_limit: 3 }
|
51
|
+
```
|
52
|
+
|
53
|
+
## Contributors
|
54
|
+
|
55
|
+
[d4rky-pl](https://github.com/d4rky-pl)
|
56
|
+
|
57
|
+
[rabsztok](https://github.com/rabsztok)
|
58
|
+
|
59
|
+
## License
|
60
|
+
|
61
|
+
Sledgehammer is Copyright © 2014 Growth Republic. It is free software, and may be redistributed under the terms specified in the LICENSE file.
|
data/Rakefile
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Description:
|
2
|
+
Generates the ActiveRecord migrations needed to use this gem in a Rails application
|
3
|
+
|
4
|
+
Example:
|
5
|
+
rails generate sledgehammer:install
|
6
|
+
|
7
|
+
This will create:
|
8
|
+
db/migrate/20140626075744_create_pages.rb
|
9
|
+
db/migrate/20140626080142_create_contacts.rb
|
10
|
+
db/migrate/20140626105612_create_websites.rb
|
11
|
+
db/migrate/20140704070249_create_page_contacts.rb
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'rails/generators/base'
|
2
|
+
|
3
|
+
module Sledgehammer
  module Generators
    # Rails generator behind `rails generate sledgehammer:install`.
    #
    # Copies the migrations shipped with the gem into the host application,
    # keeping their relative paths and file names unchanged.
    class InstallGenerator < Rails::Generators::Base
      # Source files are resolved relative to the directory four levels above
      # this file (the gem root, given lib/generators/sledgehammer/).
      source_root File.expand_path('../../../../', __FILE__)

      # Copies each bundled migration to the same db/migrate path in the
      # destination application.
      def generate_migrations
        %w(
          20140626075744_create_pages
          20140626080142_create_contacts
          20140626105612_create_websites
          20140704070249_create_page_contacts
        ).each do |migration|
          copy_file "db/migrate/#{migration}.rb"
        end
      end
    end
  end
end
|
data/lib/sledgehammer.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
require 'typhoeus'
|
3
|
+
require 'sledgehammer/version'
|
4
|
+
require 'sledgehammer/models/contact'
|
5
|
+
require 'sledgehammer/models/page'
|
6
|
+
require 'sledgehammer/models/page_contact'
|
7
|
+
require 'sledgehammer/models/website'
|
8
|
+
require 'sledgehammer/workers/crawl_worker'
|
9
|
+
|
10
|
+
module Sledgehammer
|
11
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# A crawled page. Each page belongs to the website (hostname) it was found on
# and is linked to the contacts (e-mail addresses) discovered in its body
# through the page_contacts join model.
class Sledgehammer::Page < ActiveRecord::Base
  belongs_to :website
  has_many :page_contacts
  has_many :contacts, through: :page_contacts
  before_create :create_website!

  protected

  # before_create callback: makes sure the page is attached to a website.
  #
  # When the caller already assigned a website it is kept as-is, which avoids
  # a redundant lookup. Otherwise the website is found or created from the
  # host part of the page's url.
  def create_website!
    self.website ||= Sledgehammer::Website.find_or_create_by(hostname: URI.parse(url).host)
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# Sidekiq worker that downloads a list of URLs with Typhoeus, stores every
# e-mail address found in the response bodies and enqueues a follow-up job
# for the links found on those pages until the depth limit is reached.
#
# Options (string or symbol keys):
#   depth       - depth of the current job (default 0)
#   depth_limit - crawling stops once depth reaches this value (default 1)
#   queue       - Sidekiq queue used for follow-up jobs (default 'default')
class Sledgehammer::CrawlWorker
  include ::Sidekiq::Worker

  MAIL_REGEX = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.(?!jpg|gif|png)[A-Z0-9]+/i
  URL_REGEX = /<a\s+(?:[^>]*?\s+)?href="((?:http|\/)[^"]+)"/
  DEFAULT_OPTIONS = { depth: 0, depth_limit: 1, queue: 'default' }

  #
  # Callbacks to overload in application
  #

  # Called once with the full url list before any request is queued.
  def before_queue(urls)
    # stub
  end

  #
  # Stops element from being added to queue if returns false
  #
  def on_queue(url)
    true
  end

  # Called once with the full url list after the queue has been run.
  def after_queue(urls)
    # stub
  end

  # Called for every finished request. Pages already marked as completed are
  # skipped; otherwise e-mails and links are extracted and the page is marked
  # as completed.
  def on_complete(response)
    page = find_or_create_page!(response.request.url)
    unless page.completed?
      parse_emails(response, page)
      parse_urls(response)
      page.update_attributes completed: true
    end
  end

  #
  # There shouldn't be any need to overload methods below
  #

  # Sidekiq entry point.
  #
  # urls - Array of URLs to crawl (a single URL string is accepted as well).
  # opts - Hash overriding DEFAULT_OPTIONS.
  def perform(urls, opts = {})
    @options = HashWithIndifferentAccess.new(DEFAULT_OPTIONS)
    @options.merge!(opts || {})

    # `>=` rather than `==`: a job whose depth is already past the limit must
    # not crawl either.
    return if @options[:depth] >= @options[:depth_limit]

    before_queue(urls)
    Array(urls).each { |site| queue(site) }
    run_queue
    after_queue(urls)
  end

  # Queues a request for url unless on_queue vetoes it or the url cannot be
  # parsed.
  def queue(url)
    return unless on_queue(url) && valid_url?(url)

    request = Typhoeus::Request.new(url)
    request.on_complete { |response| on_complete(response) }

    hydra.queue(request)
  end

  # Runs all queued requests; returns when every request has completed.
  def run_queue
    hydra.run
  end

  protected

  # Hydra owned by this worker instance. The memoized Typhoeus::Hydra.hydra
  # is deliberately not used: it is a single shared object, so concurrently
  # running jobs would queue into and run each other's requests.
  def hydra
    @hydra ||= Typhoeus::Hydra.new
  end

  # Returns the Page for request_url, creating it (together with its website)
  # when it does not exist yet. An existing page recorded at a smaller depth
  # than the current job is flagged as not completed so it is parsed again.
  def find_or_create_page!(request_url)
    page = Sledgehammer::Page.find_by(url: request_url)

    if page.blank?
      hostname = URI.parse(request_url).host
      website = Sledgehammer::Website.find_or_create_by(hostname: hostname)
      page = Sledgehammer::Page.create!(url: request_url, depth: @options[:depth], website: website)
    elsif page.depth < @options[:depth]
      page.update_attributes completed: false
    end
    page
  end

  # Stores every distinct e-mail address found in the response body and links
  # it to page.
  def parse_emails(response, page)
    response.body.scan(MAIL_REGEX).uniq.each do |email|
      contact = Sledgehammer::Contact.find_or_create_by(email: email)
      Sledgehammer::PageContact.find_or_create_by page: page, contact: contact
    end
  end

  # Extracts the links from the response body and pushes a follow-up job one
  # level deeper, unless that level hits the depth limit or no links remain.
  def parse_urls(response)
    request_url = response.request.url
    request_url = "http://#{request_url}" unless request_url.match(/^http/)

    url_list = response.body.scan(URL_REGEX).flatten.map do |url|
      next unless valid_url?(url)

      # Resolve relative links first so that a relative link pointing back
      # at the current page is filtered out as well.
      absolute = url.start_with?('/') ? URI.join(request_url, url).to_s : url
      absolute unless absolute == request_url
    end.compact.uniq

    opts = @options.dup
    opts[:depth] += 1

    unless opts[:depth] >= opts[:depth_limit] || url_list.empty?
      Sidekiq::Client.push('queue' => opts[:queue],
                           'class' => self.class,
                           'args'  => [url_list, opts])
    end
  end

  # True when url can be parsed by URI.parse; any error raised while parsing
  # counts as invalid.
  def valid_url?(url)
    URI.parse(url)
    true
  rescue StandardError
    false
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
$:.push File.expand_path("../lib", __FILE__)
|
2
|
+
require 'sledgehammer/version'
|
3
|
+
|
4
|
+
# Gem specification for sledgehammer.
Gem::Specification.new do |spec|
  spec.name    = 'sledgehammer'
  spec.version = Sledgehammer::VERSION

  spec.authors     = ['Michał Matyas', 'Maciej Walusiak']
  spec.email       = ['michal@higher.lv', 'rabsztok@gmail.com']
  spec.summary     = 'Crawls websites and harvests e-mails'
  spec.description = 'Website crawler harvesting e-mails. Uses Sidekiq and Typhoeus.'
  spec.homepage    = 'https://github.com/growthrepublic/sledgehammer'
  spec.license     = 'MIT'

  # Package everything tracked by git except generated coverage reports,
  # which are build artifacts and do not belong in the released gem.
  spec.files         = `git ls-files`.split($/).reject { |path| path.start_with?('coverage/') }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency 'bundler', '~> 1.5'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'rspec-sidekiq', '~> 1.0.0'
  spec.add_development_dependency 'sqlite3'

  spec.add_runtime_dependency 'activerecord', '~> 4.1'
  spec.add_runtime_dependency 'typhoeus', '~> 0.6'
  spec.add_runtime_dependency 'sidekiq', '~> 3.1'

end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="utf-8">
|
5
|
+
<title>Example website 2: Tests Attack</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<div class="container">This is a second example site to test following links.</div>
|
9
|
+
<div>
|
10
|
+
test4@example.com
|
11
|
+
</div>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -0,0 +1,21 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="utf-8">
|
5
|
+
<title>Example website</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<div class="container">This is an example site for the testing purposes. It contains two external links to different test pages and three e-mails.</div>
|
9
|
+
<div>
|
10
|
+
<a class="abc" href="#">This link should not be found</a>
|
11
|
+
<a href="/testing">This is relative link, it should be properly translated and followed</a>
|
12
|
+
<a href="http://www.example2.com">This is external link, it should be followed</a>
|
13
|
+
<a href="http:/www(broken).\com">This is corrupted external link, it should not be followed</a>
|
14
|
+
</div>
|
15
|
+
<div>
|
16
|
+
<a href="mailto:test1@example.com">This is first e-mail, inside link</a>
|
17
|
+
test2@example.com This is second e-mail, outside antyhing and surrounded only by whitespaces
|
18
|
+
john.doe@especially.long.tld.xn--clchc0ea0b2g2a9gcd This is third e-mail, with the longest possible TLD
|
19
|
+
</div>
|
20
|
+
</body>
|
21
|
+
</html>
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'active_record'
|
2
|
+
|
3
|
+
# Run the specs against a throwaway in-memory SQLite database.
ActiveRecord::Base.establish_connection adapter: "sqlite3", database: ":memory:"
# Apply the migrations from db/migrate. The path is relative, so this
# presumably expects the specs to be started from the gem root.
ActiveRecord::Migrator.up "db/migrate"

RSpec.configure do |config|
  # Wrap every example in a transaction and always roll it back, so records
  # created by one example are not visible to the next.
  config.around do |example|
    ActiveRecord::Base.transaction do
      example.run
      raise ActiveRecord::Rollback
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'sidekiq/testing'
|
2
|
+
|
3
|
+
# Exercises Sledgehammer::CrawlWorker against stubbed HTTP responses built
# from the HTML fixtures in spec/fixtures.
RSpec.describe Sledgehammer::CrawlWorker, sidekiq: :fake do

  # Stub the three URLs the crawl touches so no real network request is made.
  before(:example) do
    fixture_directory = File.expand_path('../../fixtures', __FILE__)
    Typhoeus.stub('http://www.example.com').and_return Typhoeus::Response.new(code: 200, body: File.read(File.join(fixture_directory, 'example_com.html')))
    Typhoeus.stub('http://www.example.com/testing').and_return Typhoeus::Response.new(code: 200, body: File.read(File.join(fixture_directory, 'example_com_testing.html')))
    Typhoeus.stub('http://www.example2.com').and_return Typhoeus::Response.new(code: 200, body: File.read(File.join(fixture_directory, 'example2_com.html')))
  end

  let(:worker) { Sledgehammer::CrawlWorker.new }

  describe "#perform" do
    # With the default options only the start page is parsed; the
    # example_com.html fixture describes itself as holding three e-mails.
    it "finds all e-mail addresses on the first site" do
      worker.perform(['http://www.example.com'])
      expect(Sledgehammer::Contact.count).to eq(3)
    end

    # depth equal to depth_limit makes perform return before queueing anything.
    it "doesn't work when the depth limit is hit" do
      worker.perform(['http://www.example.com'], { 'depth' => 2, 'depth_limit' => 2 })
      expect(Sledgehammer::Contact.count).to eq(0)
    end

    # inline! runs the follow-up jobs pushed by the worker immediately, so the
    # linked pages are crawled within this example.
    it "crawls all pages and finds e-mails on them" do
      Sidekiq::Testing.inline! do
        worker.perform(['http://www.example.com'], { 'depth_limit' => 3 })
        expect(Sledgehammer::Contact.count).to eq(5)
        expect(Sledgehammer::Page.count).to eq(3)
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sledgehammer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Michał Matyas
|
8
|
+
- Maciej Walusiak
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-07-10 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bundler
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.5'
|
21
|
+
type: :development
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '1.5'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: rake
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '10.0'
|
35
|
+
type: :development
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '10.0'
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: rspec
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - "~>"
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '3.0'
|
49
|
+
type: :development
|
50
|
+
prerelease: false
|
51
|
+
version_requirements: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - "~>"
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '3.0'
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: rspec-sidekiq
|
58
|
+
requirement: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - "~>"
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 1.0.0
|
63
|
+
type: :development
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.0.0
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: sqlite3
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - ">="
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
84
|
+
- !ruby/object:Gem::Dependency
|
85
|
+
name: activerecord
|
86
|
+
requirement: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - "~>"
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '4.1'
|
91
|
+
type: :runtime
|
92
|
+
prerelease: false
|
93
|
+
version_requirements: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - "~>"
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '4.1'
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
name: typhoeus
|
100
|
+
requirement: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - "~>"
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0.6'
|
105
|
+
type: :runtime
|
106
|
+
prerelease: false
|
107
|
+
version_requirements: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - "~>"
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0.6'
|
112
|
+
- !ruby/object:Gem::Dependency
|
113
|
+
name: sidekiq
|
114
|
+
requirement: !ruby/object:Gem::Requirement
|
115
|
+
requirements:
|
116
|
+
- - "~>"
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: '3.1'
|
119
|
+
type: :runtime
|
120
|
+
prerelease: false
|
121
|
+
version_requirements: !ruby/object:Gem::Requirement
|
122
|
+
requirements:
|
123
|
+
- - "~>"
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '3.1'
|
126
|
+
description: Website crawler harvesting e-mails. Uses Sidekiq and Typhoeus.
|
127
|
+
email:
|
128
|
+
- michal@higher.lv
|
129
|
+
- rabsztok@gmail.com
|
130
|
+
executables: []
|
131
|
+
extensions: []
|
132
|
+
extra_rdoc_files: []
|
133
|
+
files:
|
134
|
+
- ".gitignore"
|
135
|
+
- ".rspec"
|
136
|
+
- Gemfile
|
137
|
+
- Gemfile.lock
|
138
|
+
- LICENSE
|
139
|
+
- README.md
|
140
|
+
- Rakefile
|
141
|
+
- coverage/.last_run.json
|
142
|
+
- db/migrate/20140626075744_create_pages.rb
|
143
|
+
- db/migrate/20140626080142_create_contacts.rb
|
144
|
+
- db/migrate/20140626105612_create_websites.rb
|
145
|
+
- db/migrate/20140704070249_create_page_contacts.rb
|
146
|
+
- lib/generators/sledgehammer/USAGE
|
147
|
+
- lib/generators/sledgehammer/install_generator.rb
|
148
|
+
- lib/sledgehammer.rb
|
149
|
+
- lib/sledgehammer/models/contact.rb
|
150
|
+
- lib/sledgehammer/models/page.rb
|
151
|
+
- lib/sledgehammer/models/page_contact.rb
|
152
|
+
- lib/sledgehammer/models/website.rb
|
153
|
+
- lib/sledgehammer/version.rb
|
154
|
+
- lib/sledgehammer/workers/crawl_worker.rb
|
155
|
+
- sledgehammer.gemspec
|
156
|
+
- spec/fixtures/example2_com.html
|
157
|
+
- spec/fixtures/example_com.html
|
158
|
+
- spec/fixtures/example_com_testing.html
|
159
|
+
- spec/models/contact_spec.rb
|
160
|
+
- spec/models/page_contact.rb
|
161
|
+
- spec/models/page_spec.rb
|
162
|
+
- spec/models/website_spec.rb
|
163
|
+
- spec/spec_helper.rb
|
164
|
+
- spec/support/active_record_helper.rb
|
165
|
+
- spec/workers/crawl_worker_spec.rb
|
166
|
+
homepage: https://github.com/growthrepublic/sledgehammer
|
167
|
+
licenses:
|
168
|
+
- MIT
|
169
|
+
metadata: {}
|
170
|
+
post_install_message:
|
171
|
+
rdoc_options: []
|
172
|
+
require_paths:
|
173
|
+
- lib
|
174
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
175
|
+
requirements:
|
176
|
+
- - ">="
|
177
|
+
- !ruby/object:Gem::Version
|
178
|
+
version: '0'
|
179
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
180
|
+
requirements:
|
181
|
+
- - ">="
|
182
|
+
- !ruby/object:Gem::Version
|
183
|
+
version: '0'
|
184
|
+
requirements: []
|
185
|
+
rubyforge_project:
|
186
|
+
rubygems_version: 2.2.2
|
187
|
+
signing_key:
|
188
|
+
specification_version: 4
|
189
|
+
summary: Crawls websites and harvests e-mails
|
190
|
+
test_files:
|
191
|
+
- spec/fixtures/example2_com.html
|
192
|
+
- spec/fixtures/example_com.html
|
193
|
+
- spec/fixtures/example_com_testing.html
|
194
|
+
- spec/models/contact_spec.rb
|
195
|
+
- spec/models/page_contact.rb
|
196
|
+
- spec/models/page_spec.rb
|
197
|
+
- spec/models/website_spec.rb
|
198
|
+
- spec/spec_helper.rb
|
199
|
+
- spec/support/active_record_helper.rb
|
200
|
+
- spec/workers/crawl_worker_spec.rb
|
201
|
+
has_rdoc:
|