sledgehammer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +4 -0
- data/.rspec +2 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +79 -0
- data/LICENSE +21 -0
- data/README.md +61 -0
- data/Rakefile +10 -0
- data/coverage/.last_run.json +5 -0
- data/db/migrate/20140626075744_create_pages.rb +12 -0
- data/db/migrate/20140626080142_create_contacts.rb +9 -0
- data/db/migrate/20140626105612_create_websites.rb +9 -0
- data/db/migrate/20140704070249_create_page_contacts.rb +8 -0
- data/lib/generators/sledgehammer/USAGE +11 -0
- data/lib/generators/sledgehammer/install_generator.rb +16 -0
- data/lib/sledgehammer.rb +11 -0
- data/lib/sledgehammer/models/contact.rb +6 -0
- data/lib/sledgehammer/models/page.rb +12 -0
- data/lib/sledgehammer/models/page_contact.rb +4 -0
- data/lib/sledgehammer/models/website.rb +4 -0
- data/lib/sledgehammer/version.rb +3 -0
- data/lib/sledgehammer/workers/crawl_worker.rb +113 -0
- data/sledgehammer.gemspec +29 -0
- data/spec/fixtures/example2_com.html +13 -0
- data/spec/fixtures/example_com.html +21 -0
- data/spec/fixtures/example_com_testing.html +13 -0
- data/spec/models/contact_spec.rb +3 -0
- data/spec/models/page_contact.rb +3 -0
- data/spec/models/page_spec.rb +3 -0
- data/spec/models/website_spec.rb +3 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/support/active_record_helper.rb +13 -0
- data/spec/workers/crawl_worker_spec.rb +33 -0
- metadata +201 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8391efc4413b6922054b8531950a5412f3d95429
|
4
|
+
data.tar.gz: 1b8b0b2301cfc4ee217d92c208bc4cd88ecde5f2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 398055d333de9da4f07d110dd602c9ab66d78fd4ddbfdcd4e5ca0bdea92954da343d96f7ba5fb5c3e83b511179a0e6ddf8449ef5011cb885fa4ac4366be3a61e
|
7
|
+
data.tar.gz: 055c2d8e2852fa8dd605c2236b8e962373be94f2022d359f021dbf09d4e84a6aea393c65b95f1ef3b7d31f49084bde26a45bb40ceccc6a4d30b3a6d290d347fa
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
sledgehammer (0.1.0)
|
5
|
+
activerecord (~> 4.1)
|
6
|
+
sidekiq (~> 3.1)
|
7
|
+
typhoeus (~> 0.6)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
activemodel (4.1.4)
|
13
|
+
activesupport (= 4.1.4)
|
14
|
+
builder (~> 3.1)
|
15
|
+
activerecord (4.1.4)
|
16
|
+
activemodel (= 4.1.4)
|
17
|
+
activesupport (= 4.1.4)
|
18
|
+
arel (~> 5.0.0)
|
19
|
+
activesupport (4.1.4)
|
20
|
+
i18n (~> 0.6, >= 0.6.9)
|
21
|
+
json (~> 1.7, >= 1.7.7)
|
22
|
+
minitest (~> 5.1)
|
23
|
+
thread_safe (~> 0.1)
|
24
|
+
tzinfo (~> 1.1)
|
25
|
+
arel (5.0.1.20140414130214)
|
26
|
+
builder (3.2.2)
|
27
|
+
celluloid (0.15.2)
|
28
|
+
timers (~> 1.1.0)
|
29
|
+
connection_pool (2.0.0)
|
30
|
+
diff-lcs (1.2.5)
|
31
|
+
ethon (0.7.1)
|
32
|
+
ffi (>= 1.3.0)
|
33
|
+
ffi (1.9.3)
|
34
|
+
i18n (0.6.11)
|
35
|
+
json (1.8.1)
|
36
|
+
minitest (5.4.0)
|
37
|
+
rake (10.3.2)
|
38
|
+
redis (3.1.0)
|
39
|
+
redis-namespace (1.5.0)
|
40
|
+
redis (~> 3.0, >= 3.0.4)
|
41
|
+
rspec (3.0.0)
|
42
|
+
rspec-core (~> 3.0.0)
|
43
|
+
rspec-expectations (~> 3.0.0)
|
44
|
+
rspec-mocks (~> 3.0.0)
|
45
|
+
rspec-core (3.0.2)
|
46
|
+
rspec-support (~> 3.0.0)
|
47
|
+
rspec-expectations (3.0.2)
|
48
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
49
|
+
rspec-support (~> 3.0.0)
|
50
|
+
rspec-mocks (3.0.2)
|
51
|
+
rspec-support (~> 3.0.0)
|
52
|
+
rspec-sidekiq (1.0.0)
|
53
|
+
rspec (>= 2.0.0)
|
54
|
+
sidekiq (>= 2.4.0)
|
55
|
+
rspec-support (3.0.2)
|
56
|
+
sidekiq (3.2.1)
|
57
|
+
celluloid (>= 0.15.2)
|
58
|
+
connection_pool (>= 2.0.0)
|
59
|
+
json
|
60
|
+
redis (>= 3.0.6)
|
61
|
+
redis-namespace (>= 1.3.1)
|
62
|
+
sqlite3 (1.3.9)
|
63
|
+
thread_safe (0.3.4)
|
64
|
+
timers (1.1.0)
|
65
|
+
typhoeus (0.6.9)
|
66
|
+
ethon (>= 0.7.1)
|
67
|
+
tzinfo (1.2.1)
|
68
|
+
thread_safe (~> 0.1)
|
69
|
+
|
70
|
+
PLATFORMS
|
71
|
+
ruby
|
72
|
+
|
73
|
+
DEPENDENCIES
|
74
|
+
bundler (~> 1.5)
|
75
|
+
rake (~> 10.0)
|
76
|
+
rspec (~> 3.0)
|
77
|
+
rspec-sidekiq (~> 1.0.0)
|
78
|
+
sledgehammer!
|
79
|
+
sqlite3
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2014 Growth Republic Ltd.
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# Sledgehammer
|
2
|
+
|
3
|
+
Sledgehammer is a gem that lets you crawl websites in search of email addresses.
|
4
|
+
It uses Typhoeus and Sidekiq to spawn ultra-fast workers which gather data in no time.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Include the gem in your Gemfile
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
gem "sledgehammer"
|
12
|
+
```
|
13
|
+
|
14
|
+
Bundle the Gemfile
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
bundle install
|
18
|
+
```
|
19
|
+
|
20
|
+
Run the install script which will create a migration file and a config file.
|
21
|
+
|
22
|
+
```ruby
|
23
|
+
bundle exec rails generate sledgehammer:install
|
24
|
+
```
|
25
|
+
|
26
|
+
Migrate your database
|
27
|
+
|
28
|
+
```ruby
|
29
|
+
bundle exec rake db:migrate
|
30
|
+
```
|
31
|
+
|
32
|
+
## Setup
|
33
|
+
|
34
|
+
Be careful when using this gem in an application backed by a SQLite3 database.
|
35
|
+
Due to the multi-threaded nature of the gem, you will be greeted with "SQLite3::BusyException: database is locked" errors.
|
36
|
+
PostgreSQL, MySQL or MongoDB should be just fine.
|
37
|
+
|
38
|
+
## Usage
|
39
|
+
|
40
|
+
Run the Sidekiq worker from your code:
|
41
|
+
|
42
|
+
|
43
|
+
```ruby
|
44
|
+
Sledgehammer::CrawlWorker.perform_async ARRAY_OF_URLS, [OPTIONS]
|
45
|
+
```
|
46
|
+
|
47
|
+
Here is sample usage:
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
Sledgehammer::CrawlWorker.perform_async ['http://example.com'], { depth_limit: 3 }
|
51
|
+
```
|
52
|
+
|
53
|
+
## Contributors
|
54
|
+
|
55
|
+
[d4rky-pl](https://github.com/d4rky-pl)
|
56
|
+
|
57
|
+
[rabsztok](https://github.com/rabsztok)
|
58
|
+
|
59
|
+
## License
|
60
|
+
|
61
|
+
Sledgehammer is Copyright © 2014 Growth Republic. It is free software, and may be redistributed under the terms specified in the LICENSE file.
|
data/Rakefile
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Description:
|
2
|
+
Generates ActiveRecord migrations needed to use this gem in a Rails application
|
3
|
+
|
4
|
+
Example:
|
5
|
+
rails generate sledgehammer:install
|
6
|
+
|
7
|
+
This will create:
|
8
|
+
db/migrate/20140626075744_create_pages.rb
|
9
|
+
db/migrate/20140626080142_create_contacts.rb
|
10
|
+
db/migrate/20140626105612_create_websites.rb
|
11
|
+
db/migrate/20140704070249_create_page_contacts.rb
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'rails/generators/base'
|
2
|
+
|
3
|
+
module Sledgehammer
  module Generators
    # Rails generator invoked as `rails generate sledgehammer:install`.
    # It copies the migrations shipped with the gem into the host application.
    class InstallGenerator < Rails::Generators::Base
      # Source files are looked up relative to the directory four levels
      # above this file.
      source_root File.expand_path('../../../../', __FILE__)

      # Copies every bundled migration; each file keeps its relative
      # "db/migrate/..." path because copy_file is given a single argument.
      def generate_migrations
        %w[
          20140626075744_create_pages
          20140626080142_create_contacts
          20140626105612_create_websites
          20140704070249_create_page_contacts
        ].each do |migration|
          copy_file "db/migrate/#{migration}.rb"
        end
      end
    end
  end
end
|
data/lib/sledgehammer.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
require 'typhoeus'
|
3
|
+
require 'sledgehammer/version'
|
4
|
+
require 'sledgehammer/models/contact'
|
5
|
+
require 'sledgehammer/models/page'
|
6
|
+
require 'sledgehammer/models/page_contact'
|
7
|
+
require 'sledgehammer/models/website'
|
8
|
+
require 'sledgehammer/workers/crawl_worker'
|
9
|
+
|
10
|
+
# Namespace module of the gem. It is intentionally empty here: the files
# required above define their classes as Sledgehammer::<Name>.
module Sledgehammer; end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# ActiveRecord model for a single URL. A page belongs to a website and is
# linked to contacts through the page_contacts join model.
class Sledgehammer::Page < ActiveRecord::Base
  belongs_to :website
  has_many :page_contacts
  has_many :contacts, through: :page_contacts

  before_create :create_website!

  protected

  # before_create hook: assigns the Website whose hostname equals the host
  # part of this page's url, creating that Website when none exists yet.
  def create_website!
    host = URI.parse(url).host
    website_record = Sledgehammer::Website.find_or_create_by(hostname: host)
    self.website = website_record
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# Sidekiq worker that fetches a batch of URLs with Typhoeus, stores every
# e-mail address found in the response bodies and pushes the links found on
# each page as a new job one depth level deeper.
#
# Options (string or symbol keys, merged over DEFAULT_OPTIONS):
#   depth       - depth of the URLs handled by this job
#   depth_limit - jobs whose depth has reached this value do nothing
#   queue       - Sidekiq queue used for the follow-up jobs
class Sledgehammer::CrawlWorker
  include ::Sidekiq::Worker

  # E-mail pattern; the negative lookahead rejects a final segment that
  # starts with jpg, gif or png.
  MAIL_REGEX = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.(?!jpg|gif|png)[A-Z0-9]+/i
  # Captures the double-quoted href of <a> tags when it starts with "http" or "/".
  URL_REGEX = /<a\s+(?:[^>]*?\s+)?href="((?:http|\/)[^"]+)"/
  DEFAULT_OPTIONS = { depth: 0, depth_limit: 1, queue: 'default' }

  #
  # Callbacks to overload in application
  #

  # Called with the full URL list before any request is queued.
  def before_queue(urls)
    # stub
  end

  #
  # Stops element from being added to queue if returns false
  #
  def on_queue(url)
    true
  end

  # Called with the full URL list after the queued requests have been run.
  def after_queue(urls)
    # stub
  end

  # Called for every finished request. Pages already marked as completed
  # are skipped; otherwise e-mails and links are extracted from the response
  # and the page is marked as completed.
  def on_complete(response)
    page = find_or_create_page!(response.request.url)
    unless page.completed?
      parse_emails(response, page)
      parse_urls(response)
      page.update_attributes completed: true
    end
  end

  #
  # There shouldn't be any need to overload methods below
  #

  # Sidekiq entry point.
  #
  # urls - Array of URL strings to fetch.
  # opts - Hash overriding DEFAULT_OPTIONS.
  def perform(urls, opts = {})
    @options = HashWithIndifferentAccess.new(DEFAULT_OPTIONS)
    @options.merge!(opts)

    # ">=" (not "==") so a job whose depth is already past the limit stops
    # as well; this matches the check used in parse_urls.
    return if @options[:depth] >= @options[:depth_limit]

    before_queue(urls)
    urls.each { |site| queue(site) }
    run_queue
    after_queue(urls)
  end

  # Queues a request for url unless on_queue vetoes it or the URL cannot be
  # parsed.
  def queue(url)
    return unless on_queue(url) && valid_url?(url)

    request = Typhoeus::Request.new(url)
    request.on_complete { |response| on_complete(response) }

    hydra.queue(request)
  end

  # Runs all requests queued so far; returns once they have finished.
  def run_queue
    hydra.run
  end

  protected

  # Hydra owned by this worker instance. The memoized Typhoeus::Hydra.hydra
  # is shared by the whole process, so concurrent Sidekiq jobs would run each
  # other's queued requests and callbacks.
  def hydra
    @hydra ||= Typhoeus::Hydra.new
  end

  # Returns the Page stored for request_url, creating it (together with its
  # Website) when it does not exist yet.
  def find_or_create_page!(request_url)
    page = Sledgehammer::Page.find_by(url: request_url)

    if page.blank?
      hostname = URI.parse(request_url).host
      website = Sledgehammer::Website.find_or_create_by(hostname: hostname)
      page = Sledgehammer::Page.create!(url: request_url, depth: @options[:depth], website: website)
    elsif page.depth < @options[:depth]
      # NOTE(review): this re-opens a page that was first stored at a
      # shallower depth than the current job; confirm the comparison
      # direction is the intended one.
      page.update_attributes completed: false
    end
    page
  end

  # Stores every address matching MAIL_REGEX in the response body as a
  # Contact and links it to page.
  def parse_emails(response, page)
    mail_list = response.body.scan(MAIL_REGEX)
    mail_list.each do |email|
      contact = Sledgehammer::Contact.find_or_create_by(email: email)
      Sledgehammer::PageContact.find_or_create_by page: page, contact: contact
    end
  end

  # TODO: remove url == '/' because we not always start at root page
  #
  # Collects the links of the response body and pushes them as one new job
  # with depth + 1, unless that depth reaches the limit or no link is left.
  def parse_urls(response)
    request_url = response.request.url
    request_url = "http://#{request_url}" unless request_url.match(/^http/)

    url_list = response.body.scan(URL_REGEX).flatten.map do |url|
      if url == request_url || !valid_url?(url)
        nil
      elsif url.starts_with?('/')
        # Relative link: resolve it against the URL of the current page.
        URI.join(request_url, url).to_s
      else
        url
      end
    end.compact

    opts = @options.dup
    opts[:depth] += 1

    unless opts[:depth] >= opts[:depth_limit] || url_list.empty?
      Sidekiq::Client.push('queue' => opts[:queue],
                           'class' => self.class,
                           'args'  => [url_list, opts])
    end
  end

  # True when URI.parse accepts url, false when it raises.
  def valid_url?(url)
    !!URI.parse(url) rescue false
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
$:.push File.expand_path("../lib", __FILE__)
|
2
|
+
require 'sledgehammer/version'
|
3
|
+
|
4
|
+
# Gem specification for sledgehammer: package metadata, the packaged file
# list and the dependency declarations.
Gem::Specification.new do |s|
  s.name          = 'sledgehammer'
  s.version       = Sledgehammer::VERSION

  s.authors       = ['Michał Matyas', 'Maciej Walusiak']
  s.email         = ['michal@higher.lv', 'rabsztok@gmail.com']
  s.summary       = 'Crawls websites and harvests e-mails'
  s.description   = 'Website crawler harvesting e-mails. Uses Sidekiq and Typhoeus.'
  s.homepage      = 'https://github.com/growthrepublic/sledgehammer'
  s.license       = 'MIT'

  # Everything tracked by git is packaged; test files are the subset living
  # under test/, spec/ or features/.
  s.files         = `git ls-files`.split($/)
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ["lib"]

  # Development dependencies, declared in the original order.
  [
    ['bundler',       '~> 1.5'],
    ['rake',          '~> 10.0'],
    ['rspec',         '~> 3.0'],
    ['rspec-sidekiq', '~> 1.0.0'],
    ['sqlite3']
  ].each { |dependency| s.add_development_dependency(*dependency) }

  # Runtime dependencies, declared in the original order.
  [
    ['activerecord', '~> 4.1'],
    ['typhoeus',     '~> 0.6'],
    ['sidekiq',      '~> 3.1']
  ].each { |dependency| s.add_runtime_dependency(*dependency) }
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="utf-8">
|
5
|
+
<title>Example website 2: Tests Attack</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<div class="container">This is a second example site to test following links.</div>
|
9
|
+
<div>
|
10
|
+
test4@example.com
|
11
|
+
</div>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -0,0 +1,21 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="utf-8">
|
5
|
+
<title>Example website</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<div class="container">This is an example site for the testing purposes. It contains two external links to different test pages and three e-mails.</div>
|
9
|
+
<div>
|
10
|
+
<a class="abc" href="#">This link should not be found</a>
|
11
|
+
<a href="/testing">This is relative link, it should be properly translated and followed</a>
|
12
|
+
<a href="http://www.example2.com">This is external link, it should be followed</a>
|
13
|
+
<a href="http:/www(broken).\com">This is corrupted external link, it should not be followed</a>
|
14
|
+
</div>
|
15
|
+
<div>
|
16
|
+
<a href="mailto:test1@example.com">This is first e-mail, inside link</a>
|
17
|
+
test2@example.com This is second e-mail, outside antyhing and surrounded only by whitespaces
|
18
|
+
john.doe@especially.long.tld.xn--clchc0ea0b2g2a9gcd This is third e-mail, with the longest possible TLD
|
19
|
+
</div>
|
20
|
+
</body>
|
21
|
+
</html>
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'active_record'
|
2
|
+
|
3
|
+
# Specs run against an in-memory SQLite database whose schema is built from
# the migrations in "db/migrate" (resolved against the working directory).
ActiveRecord::Base.establish_connection(adapter: "sqlite3", database: ":memory:")
ActiveRecord::Migrator.up("db/migrate")

RSpec.configure do |rspec|
  # Every example runs inside a transaction that is rolled back afterwards,
  # so records created by one example are not visible to the next one.
  rspec.around do |example|
    ActiveRecord::Base.transaction do
      example.run
      raise ActiveRecord::Rollback
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'sidekiq/testing'
|
2
|
+
|
3
|
+
# Specs for the crawl worker. All HTTP traffic is stubbed with the HTML
# fixtures stored next to the specs.
RSpec.describe Sledgehammer::CrawlWorker, sidekiq: :fake do

  let(:worker) { Sledgehammer::CrawlWorker.new }

  before(:example) do
    fixture_directory = File.expand_path('../../fixtures', __FILE__)

    # URL => fixture file served as the body of a 200 response.
    {
      'http://www.example.com'         => 'example_com.html',
      'http://www.example.com/testing' => 'example_com_testing.html',
      'http://www.example2.com'        => 'example2_com.html'
    }.each do |url, fixture|
      html = File.read(File.join(fixture_directory, fixture))
      Typhoeus.stub(url).and_return Typhoeus::Response.new(code: 200, body: html)
    end
  end

  describe "#perform" do
    it "finds all e-mail addresses on the first site" do
      worker.perform(['http://www.example.com'])

      expect(Sledgehammer::Contact.count).to eq(3)
    end

    it "doesn't work when the depth limit is hit" do
      worker.perform(['http://www.example.com'], { 'depth' => 2, 'depth_limit' => 2 })

      expect(Sledgehammer::Contact.count).to eq(0)
    end

    it "crawls all pages and finds e-mails on them" do
      # Inline mode executes the follow-up jobs immediately, so linked pages
      # are crawled within this example.
      Sidekiq::Testing.inline! do
        worker.perform(['http://www.example.com'], { 'depth_limit' => 3 })

        expect(Sledgehammer::Contact.count).to eq(5)
        expect(Sledgehammer::Page.count).to eq(3)
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sledgehammer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Michał Matyas
|
8
|
+
- Maciej Walusiak
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-07-10 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bundler
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.5'
|
21
|
+
type: :development
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '1.5'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: rake
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '10.0'
|
35
|
+
type: :development
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '10.0'
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: rspec
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - "~>"
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '3.0'
|
49
|
+
type: :development
|
50
|
+
prerelease: false
|
51
|
+
version_requirements: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - "~>"
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '3.0'
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: rspec-sidekiq
|
58
|
+
requirement: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - "~>"
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 1.0.0
|
63
|
+
type: :development
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.0.0
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: sqlite3
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - ">="
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
84
|
+
- !ruby/object:Gem::Dependency
|
85
|
+
name: activerecord
|
86
|
+
requirement: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - "~>"
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '4.1'
|
91
|
+
type: :runtime
|
92
|
+
prerelease: false
|
93
|
+
version_requirements: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - "~>"
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '4.1'
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
name: typhoeus
|
100
|
+
requirement: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - "~>"
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0.6'
|
105
|
+
type: :runtime
|
106
|
+
prerelease: false
|
107
|
+
version_requirements: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - "~>"
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0.6'
|
112
|
+
- !ruby/object:Gem::Dependency
|
113
|
+
name: sidekiq
|
114
|
+
requirement: !ruby/object:Gem::Requirement
|
115
|
+
requirements:
|
116
|
+
- - "~>"
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: '3.1'
|
119
|
+
type: :runtime
|
120
|
+
prerelease: false
|
121
|
+
version_requirements: !ruby/object:Gem::Requirement
|
122
|
+
requirements:
|
123
|
+
- - "~>"
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '3.1'
|
126
|
+
description: Website crawler harvesting e-mails. Uses Sidekiq and Typhoeus.
|
127
|
+
email:
|
128
|
+
- michal@higher.lv
|
129
|
+
- rabsztok@gmail.com
|
130
|
+
executables: []
|
131
|
+
extensions: []
|
132
|
+
extra_rdoc_files: []
|
133
|
+
files:
|
134
|
+
- ".gitignore"
|
135
|
+
- ".rspec"
|
136
|
+
- Gemfile
|
137
|
+
- Gemfile.lock
|
138
|
+
- LICENSE
|
139
|
+
- README.md
|
140
|
+
- Rakefile
|
141
|
+
- coverage/.last_run.json
|
142
|
+
- db/migrate/20140626075744_create_pages.rb
|
143
|
+
- db/migrate/20140626080142_create_contacts.rb
|
144
|
+
- db/migrate/20140626105612_create_websites.rb
|
145
|
+
- db/migrate/20140704070249_create_page_contacts.rb
|
146
|
+
- lib/generators/sledgehammer/USAGE
|
147
|
+
- lib/generators/sledgehammer/install_generator.rb
|
148
|
+
- lib/sledgehammer.rb
|
149
|
+
- lib/sledgehammer/models/contact.rb
|
150
|
+
- lib/sledgehammer/models/page.rb
|
151
|
+
- lib/sledgehammer/models/page_contact.rb
|
152
|
+
- lib/sledgehammer/models/website.rb
|
153
|
+
- lib/sledgehammer/version.rb
|
154
|
+
- lib/sledgehammer/workers/crawl_worker.rb
|
155
|
+
- sledgehammer.gemspec
|
156
|
+
- spec/fixtures/example2_com.html
|
157
|
+
- spec/fixtures/example_com.html
|
158
|
+
- spec/fixtures/example_com_testing.html
|
159
|
+
- spec/models/contact_spec.rb
|
160
|
+
- spec/models/page_contact.rb
|
161
|
+
- spec/models/page_spec.rb
|
162
|
+
- spec/models/website_spec.rb
|
163
|
+
- spec/spec_helper.rb
|
164
|
+
- spec/support/active_record_helper.rb
|
165
|
+
- spec/workers/crawl_worker_spec.rb
|
166
|
+
homepage: https://github.com/growthrepublic/sledgehammer
|
167
|
+
licenses:
|
168
|
+
- MIT
|
169
|
+
metadata: {}
|
170
|
+
post_install_message:
|
171
|
+
rdoc_options: []
|
172
|
+
require_paths:
|
173
|
+
- lib
|
174
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
175
|
+
requirements:
|
176
|
+
- - ">="
|
177
|
+
- !ruby/object:Gem::Version
|
178
|
+
version: '0'
|
179
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
180
|
+
requirements:
|
181
|
+
- - ">="
|
182
|
+
- !ruby/object:Gem::Version
|
183
|
+
version: '0'
|
184
|
+
requirements: []
|
185
|
+
rubyforge_project:
|
186
|
+
rubygems_version: 2.2.2
|
187
|
+
signing_key:
|
188
|
+
specification_version: 4
|
189
|
+
summary: Crawls websites and harvests e-mails
|
190
|
+
test_files:
|
191
|
+
- spec/fixtures/example2_com.html
|
192
|
+
- spec/fixtures/example_com.html
|
193
|
+
- spec/fixtures/example_com_testing.html
|
194
|
+
- spec/models/contact_spec.rb
|
195
|
+
- spec/models/page_contact.rb
|
196
|
+
- spec/models/page_spec.rb
|
197
|
+
- spec/models/website_spec.rb
|
198
|
+
- spec/spec_helper.rb
|
199
|
+
- spec/support/active_record_helper.rb
|
200
|
+
- spec/workers/crawl_worker_spec.rb
|
201
|
+
has_rdoc:
|