sledgehammer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8391efc4413b6922054b8531950a5412f3d95429
4
+ data.tar.gz: 1b8b0b2301cfc4ee217d92c208bc4cd88ecde5f2
5
+ SHA512:
6
+ metadata.gz: 398055d333de9da4f07d110dd602c9ab66d78fd4ddbfdcd4e5ca0bdea92954da343d96f7ba5fb5c3e83b511179a0e6ddf8449ef5011cb885fa4ac4366be3a61e
7
+ data.tar.gz: 055c2d8e2852fa8dd605c2236b8e962373be94f2022d359f021dbf09d4e84a6aea393c65b95f1ef3b7d31f49084bde26a45bb40ceccc6a4d30b3a6d290d347fa
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ .idea
2
+ tmp/
3
+ .DS_Store
4
+ */.DS_Store
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source "https://rubygems.org"
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,79 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ sledgehammer (0.1.0)
5
+ activerecord (~> 4.1)
6
+ sidekiq (~> 3.1)
7
+ typhoeus (~> 0.6)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ activemodel (4.1.4)
13
+ activesupport (= 4.1.4)
14
+ builder (~> 3.1)
15
+ activerecord (4.1.4)
16
+ activemodel (= 4.1.4)
17
+ activesupport (= 4.1.4)
18
+ arel (~> 5.0.0)
19
+ activesupport (4.1.4)
20
+ i18n (~> 0.6, >= 0.6.9)
21
+ json (~> 1.7, >= 1.7.7)
22
+ minitest (~> 5.1)
23
+ thread_safe (~> 0.1)
24
+ tzinfo (~> 1.1)
25
+ arel (5.0.1.20140414130214)
26
+ builder (3.2.2)
27
+ celluloid (0.15.2)
28
+ timers (~> 1.1.0)
29
+ connection_pool (2.0.0)
30
+ diff-lcs (1.2.5)
31
+ ethon (0.7.1)
32
+ ffi (>= 1.3.0)
33
+ ffi (1.9.3)
34
+ i18n (0.6.11)
35
+ json (1.8.1)
36
+ minitest (5.4.0)
37
+ rake (10.3.2)
38
+ redis (3.1.0)
39
+ redis-namespace (1.5.0)
40
+ redis (~> 3.0, >= 3.0.4)
41
+ rspec (3.0.0)
42
+ rspec-core (~> 3.0.0)
43
+ rspec-expectations (~> 3.0.0)
44
+ rspec-mocks (~> 3.0.0)
45
+ rspec-core (3.0.2)
46
+ rspec-support (~> 3.0.0)
47
+ rspec-expectations (3.0.2)
48
+ diff-lcs (>= 1.2.0, < 2.0)
49
+ rspec-support (~> 3.0.0)
50
+ rspec-mocks (3.0.2)
51
+ rspec-support (~> 3.0.0)
52
+ rspec-sidekiq (1.0.0)
53
+ rspec (>= 2.0.0)
54
+ sidekiq (>= 2.4.0)
55
+ rspec-support (3.0.2)
56
+ sidekiq (3.2.1)
57
+ celluloid (>= 0.15.2)
58
+ connection_pool (>= 2.0.0)
59
+ json
60
+ redis (>= 3.0.6)
61
+ redis-namespace (>= 1.3.1)
62
+ sqlite3 (1.3.9)
63
+ thread_safe (0.3.4)
64
+ timers (1.1.0)
65
+ typhoeus (0.6.9)
66
+ ethon (>= 0.7.1)
67
+ tzinfo (1.2.1)
68
+ thread_safe (~> 0.1)
69
+
70
+ PLATFORMS
71
+ ruby
72
+
73
+ DEPENDENCIES
74
+ bundler (~> 1.5)
75
+ rake (~> 10.0)
76
+ rspec (~> 3.0)
77
+ rspec-sidekiq (~> 1.0.0)
78
+ sledgehammer!
79
+ sqlite3
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Growth Republic Ltd.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # Sledgehammer
2
+
3
+ Sledgehammer is a gem that crawls websites in search of email addresses.
4
+ It uses Typhoeus and Sidekiq to spawn ultra-fast workers which gather data in no time.
5
+
6
+ ## Installation
7
+
8
+ Include the gem in your Gemfile
9
+
10
+ ```ruby
11
+ gem "sledgehammer"
12
+ ```
13
+
14
+ Bundle the Gemfile
15
+
16
+ ```ruby
17
+ bundle install
18
+ ```
19
+
20
+ Run the install generator, which copies the gem's migration files into your application.
21
+
22
+ ```ruby
23
+ bundle exec rails generate sledgehammer:install
24
+ ```
25
+
26
+ Migrate your database
27
+
28
+ ```ruby
29
+ bundle exec rake db:migrate
30
+ ```
31
+
32
+ ## Setup
33
+
34
+ Be careful when using this gem in an application backed by an sqlite3 database.
35
+ Due to the multi-threaded nature of the gem, you will be greeted with "SQLite3::BusyException: database is locked" errors.
36
+ PostgreSQL, MySQL or MongoDB should be just fine.
37
+
38
+ ## Usage
39
+
40
+ Run the Sidekiq worker from your code:
41
+
42
+
43
+ ```ruby
44
+ Sledgehammer::CrawlWorker.perform_async ARRAY_OF_URLS, [OPTIONS]
45
+ ```
46
+
47
+ Here is sample usage:
48
+
49
+ ```ruby
50
+ Sledgehammer::CrawlWorker.perform_async ['http://example.com'], { depth_limit: 3 }
51
+ ```
52
+
53
+ ## Contributors
54
+
55
+ [d4rky-pl](https://github.com/d4rky-pl)
56
+
57
+ [rabsztok](https://github.com/rabsztok)
58
+
59
+ ## License
60
+
61
+ Sledgehammer is Copyright © 2014 Growth Republic. It is free software, and may be redistributed under the terms specified in the LICENSE file.
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require 'rspec/core/rake_task'
2
+ require 'bundler/gem_tasks'
3
+
4
+ # Defines the `spec` task; RSpec's rake task looks for spec files under `spec/` by default.
5
+ # Run with `rake spec`
6
+ RSpec::Core::RakeTask.new(:spec) do |task|
7
+ task.rspec_opts = ['--color'] # same --color flag as in .rspec
8
+ end
9
+
10
+ task :default => :spec # a bare `rake` runs the spec task
@@ -0,0 +1,5 @@
1
+ {
2
+ "result": {
3
+ "covered_percent": 99.31
4
+ }
5
+ }
@@ -0,0 +1,12 @@
1
+ class CreatePages < ActiveRecord::Migration # creates the `pages` table (presumably backing Sledgehammer::Page)
2
+ def change
3
+ create_table :pages do |t|
4
+ t.references :website, index: true # indexed reference to the owning website
5
+ t.string :url
6
+ t.integer :depth # the crawl worker stores its current depth option here when it creates a page
7
+ t.boolean :completed # no default; the crawl worker sets it to true after parsing a page
8
+
9
+ t.timestamps
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,9 @@
1
+ class CreateContacts < ActiveRecord::Migration
2
+ def change
3
+ create_table :contacts do |t|
4
+ t.string :email, unique: true
5
+
6
+ t.timestamps
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ class CreateWebsites < ActiveRecord::Migration # creates the `websites` table
2
+ def change
3
+ create_table :websites do |t|
4
+ t.string :hostname # the code looks websites up by the host part of a page URL
5
+
6
+ t.timestamps
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,8 @@
1
+ class CreatePageContacts < ActiveRecord::Migration # creates the join table between pages and contacts
2
+ def change
3
+ create_table :page_contacts do |t|
4
+ t.references :page, index: true
5
+ t.references :contact, index: true
6
+ end # no timestamps on this table
7
+ end
8
+ end
@@ -0,0 +1,11 @@
1
+ Description:
2
+ Generates ActiveRecord migrations needed to use this gem in Rails application
3
+
4
+ Example:
5
+ rails generate sledgehammer:install
6
+
7
+ This will create:
8
+ db/migrate/20140626075744_create_pages.rb
9
+ db/migrate/20140626080142_create_contacts.rb
10
+ db/migrate/20140626105612_create_websites.rb
11
+ db/migrate/20140704070249_create_page_contacts.rb
@@ -0,0 +1,16 @@
1
+ require 'rails/generators/base'
2
+
3
+ module Sledgehammer
4
+ module Generators
5
+ class InstallGenerator < Rails::Generators::Base # invoked as `rails generate sledgehammer:install` (see USAGE)
6
+ source_root File.expand_path('../../../../', __FILE__) # the copy_file paths below are resolved against this directory
7
+
8
+ def generate_migrations # copies the four bundled migration files, keeping their relative paths
9
+ copy_file "db/migrate/20140626075744_create_pages.rb"
10
+ copy_file "db/migrate/20140626080142_create_contacts.rb"
11
+ copy_file "db/migrate/20140626105612_create_websites.rb"
12
+ copy_file "db/migrate/20140704070249_create_page_contacts.rb"
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,11 @@
1
+ require 'sidekiq'
2
+ require 'typhoeus'
3
+ require 'sledgehammer/version'
4
+ require 'sledgehammer/models/contact'
5
+ require 'sledgehammer/models/page'
6
+ require 'sledgehammer/models/page_contact'
7
+ require 'sledgehammer/models/website'
8
+ require 'sledgehammer/workers/crawl_worker'
9
+
10
+ module Sledgehammer
11
+ end
@@ -0,0 +1,6 @@
1
+ class Sledgehammer::Contact < ActiveRecord::Base # one e-mail address found by the crawler
2
+ has_many :page_contacts
3
+ has_many :pages, through: :page_contacts # pages reached through the page_contacts join model
4
+
5
+ validates :email, uniqueness: true # model-level check only
6
+ end
@@ -0,0 +1,12 @@
1
+ class Sledgehammer::Page < ActiveRecord::Base
2
+ belongs_to :website
3
+ has_many :page_contacts
4
+ has_many :contacts, through: :page_contacts
5
+ before_create :create_website!
6
+
7
+ protected
8
+ def create_website!
9
+ hostname = URI.parse(url).host
10
+ self.website = Sledgehammer::Website.find_or_create_by(hostname: hostname)
11
+ end
12
+ end
@@ -0,0 +1,4 @@
1
+ class Sledgehammer::PageContact < ActiveRecord::Base # join model: records that a contact was found on a page
2
+ belongs_to :page
3
+ belongs_to :contact
4
+ end
@@ -0,0 +1,4 @@
1
+ class Sledgehammer::Website < ActiveRecord::Base # a host; pages are grouped under it
2
+ has_many :pages
3
+ has_many :contacts, through: :pages # contacts found on any page of this website
4
+ end
@@ -0,0 +1,3 @@
1
+ module Sledgehammer
2
+ VERSION = '0.1.0' # read by sledgehammer.gemspec as the gem version
3
+ end
@@ -0,0 +1,113 @@
1
+ class Sledgehammer::CrawlWorker
2
+ include ::Sidekiq::Worker
3
+ MAIL_REGEX = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.(?!jpg|gif|png)[A-Z0-9]+/i
4
+ URL_REGEX = /<a\s+(?:[^>]*?\s+)?href="((?:http|\/)[^"]+)"/
5
+ DEFAULT_OPTIONS = { depth: 0, depth_limit: 1, queue: 'default' }
6
+
7
+ #
8
+ # Callbacks to overload in application
9
+ #
10
+ def before_queue(urls)
11
+ # stub
12
+ end
13
+
14
+ #
15
+ # Stops element from being added to queue if returns false
16
+ #
17
+ def on_queue(url)
18
+ true
19
+ end
20
+
21
+ def after_queue(urls)
22
+ # stub
23
+ end
24
+
25
+ def on_complete(response)
26
+ page = self.find_or_create_page!(response.request.url)
27
+ unless page.completed?
28
+ self.parse_emails(response, page)
29
+ self.parse_urls(response)
30
+ page.update_attributes completed: true
31
+ end
32
+ end
33
+
34
+ #
35
+ # There shouldn't be any need to overload methods below
36
+ #
37
+
38
+ def perform(urls, opts = {})
39
+ @options = HashWithIndifferentAccess.new(DEFAULT_OPTIONS)
40
+ @options.merge!(opts)
41
+
42
+ return if @options[:depth] == @options[:depth_limit]
43
+
44
+ before_queue(urls)
45
+ urls.each { |site| self.queue(site) }
46
+ run_queue
47
+ after_queue(urls)
48
+ end
49
+
50
+ def queue(url)
51
+ return unless self.on_queue(url) && valid_url?(url)
52
+
53
+ request = Typhoeus::Request.new(url)
54
+ request.on_complete { |response| self.on_complete(response) }
55
+
56
+ Typhoeus::Hydra.hydra.queue(request)
57
+ end
58
+
59
+ def run_queue
60
+ Typhoeus::Hydra.hydra.run
61
+ end
62
+
63
+ protected
64
+ def find_or_create_page!(request_url)
65
+ page = Sledgehammer::Page.find_by(url: request_url)
66
+
67
+ if page.blank?
68
+ hostname = URI.parse(request_url).host
69
+ website = Sledgehammer::Website.find_or_create_by(hostname: hostname)
70
+ page = Sledgehammer::Page.create!(url: request_url, depth: @options[:depth], website: website)
71
+ elsif page.depth < @options[:depth]
72
+ page.update_attributes completed: false
73
+ end
74
+ page
75
+ end
76
+
77
+ def parse_emails(response, page)
78
+ mail_list = response.body.scan MAIL_REGEX
79
+ mail_list.each do |email|
80
+ contact = Sledgehammer::Contact.find_or_create_by(email: email)
81
+ Sledgehammer::PageContact.find_or_create_by page: page, contact: contact
82
+ end
83
+ end
84
+
85
+ # TODO: remove url == '/' because we not always start at root page
86
+ def parse_urls(response)
87
+ request_url = response.request.url
88
+ request_url = "http://#{request_url}" unless request_url.match /^http/
89
+
90
+ url_list = response.body.scan(URL_REGEX).flatten.map do |url|
91
+ if url == request_url || !valid_url?(url)
92
+ nil
93
+ elsif url.starts_with?('/')
94
+ URI.join(request_url, url).to_s
95
+ else
96
+ url
97
+ end
98
+ end.compact
99
+
100
+ opts = @options.dup
101
+ opts[:depth] += 1
102
+
103
+ unless opts[:depth] >= opts[:depth_limit] || url_list.empty?
104
+ Sidekiq::Client.push('queue' => opts[:queue],
105
+ 'class' => self.class,
106
+ 'args' => [url_list, opts])
107
+ end
108
+ end
109
+
110
+ def valid_url?(url)
111
+ !!URI.parse(url) rescue false
112
+ end
113
+ end
@@ -0,0 +1,29 @@
1
+ $:.push File.expand_path("../lib", __FILE__)
2
+ require 'sledgehammer/version'
3
+
4
+ Gem::Specification.new do |spec|
5
+ spec.name = 'sledgehammer'
6
+ spec.version = Sledgehammer::VERSION # defined in sledgehammer/version, required above
7
+
8
+ spec.authors = ['Michał Matyas', 'Maciej Walusiak']
9
+ spec.email = ['michal@higher.lv', 'rabsztok@gmail.com']
10
+ spec.summary = 'Crawls websites and harvests e-mails'
11
+ spec.description = 'Website crawler harvesting e-mails. Uses Sidekiq and Typhoeus.'
12
+ spec.homepage = 'https://github.com/growthrepublic/sledgehammer'
13
+ spec.license = 'MIT'
14
+
15
+ spec.files = `git ls-files`.split($/) # file list comes from the output of `git ls-files`
16
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
17
+ spec.require_paths = ["lib"]
18
+ # Development-only dependencies.
19
+ spec.add_development_dependency 'bundler', '~> 1.5'
20
+ spec.add_development_dependency 'rake', '~> 10.0'
21
+ spec.add_development_dependency 'rspec', '~> 3.0'
22
+ spec.add_development_dependency 'rspec-sidekiq', '~> 1.0.0'
23
+ spec.add_development_dependency 'sqlite3'
24
+ # Runtime dependencies.
25
+ spec.add_runtime_dependency 'activerecord', '~> 4.1'
26
+ spec.add_runtime_dependency 'typhoeus', '~> 0.6'
27
+ spec.add_runtime_dependency 'sidekiq', '~> 3.1'
28
+
29
+ end
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Example website 2: Tests Attack</title>
6
+ </head>
7
+ <body>
8
+ <div class="container">This is a second example site to test following links.</div>
9
+ <div>
10
+ test4@example.com
11
+ </div>
12
+ </body>
13
+ </html>
@@ -0,0 +1,21 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Example website</title>
6
+ </head>
7
+ <body>
8
+ <div class="container">This is an example site for the testing purposes. It contains two external links to different test pages and three e-mails.</div>
9
+ <div>
10
+ <a class="abc" href="#">This link should not be found</a>
11
+ <a href="/testing">This is relative link, it should be properly translated and followed</a>
12
+ <a href="http://www.example2.com">This is external link, it should be followed</a>
13
+ <a href="http:/www(broken).\com">This is corrupted external link, it should not be followed</a>
14
+ </div>
15
+ <div>
16
+ <a href="mailto:test1@example.com">This is first e-mail, inside link</a>
17
+ test2@example.com This is second e-mail, outside antyhing and surrounded only by whitespaces
18
+ john.doe@especially.long.tld.xn--clchc0ea0b2g2a9gcd This is third e-mail, with the longest possible TLD
19
+ </div>
20
+ </body>
21
+ </html>
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Example website / Testing</title>
6
+ </head>
7
+ <body>
8
+ <div class="container">This is a subpage of Example website.</div>
9
+ <div>
10
+ test3@example.com
11
+ </div>
12
+ </body>
13
+ </html>
@@ -0,0 +1,3 @@
1
+ RSpec.describe Sledgehammer::Contact, :type => :model do
2
+ pending "add some examples to (or delete) #{__FILE__}" # placeholder: no examples written yet
3
+ end
@@ -0,0 +1,3 @@
1
+ RSpec.describe Sledgehammer::PageContact, :type => :model do
2
+ pending "add some examples to (or delete) #{__FILE__}" # placeholder: no examples written yet
3
+ end
@@ -0,0 +1,3 @@
1
+ RSpec.describe Sledgehammer::Page, :type => :model do
2
+ pending "add some examples to (or delete) #{__FILE__}" # placeholder: no examples written yet
3
+ end
@@ -0,0 +1,3 @@
1
+ RSpec.describe Sledgehammer::Website, :type => :model do
2
+ pending "add some examples to (or delete) #{__FILE__}" # placeholder: no examples written yet
3
+ end
@@ -0,0 +1,8 @@
1
+ require File.expand_path('../support/active_record_helper', __FILE__)
2
+ require File.expand_path('../../lib/sledgehammer.rb', __FILE__)
3
+
4
+ RSpec.configure do |config|
5
+ config.before :each do
6
+ Typhoeus::Expectation.clear # presumably drops Typhoeus stubs left over from earlier examples
7
+ end
8
+ end
@@ -0,0 +1,13 @@
1
+ require 'active_record'
2
+
3
+ ActiveRecord::Base.establish_connection adapter: "sqlite3", database: ":memory:"
4
+ ActiveRecord::Migrator.up "db/migrate"
5
+
6
+ RSpec.configure do |config|
7
+ config.around do |example|
8
+ ActiveRecord::Base.transaction do
9
+ example.run
10
+ raise ActiveRecord::Rollback
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,33 @@
1
+ require 'sidekiq/testing'
2
+
3
+ RSpec.describe Sledgehammer::CrawlWorker, sidekiq: :fake do
4
+ # Stub the three fixture sites with canned HTML bodies.
5
+ before(:example) do
6
+ fixture_directory = File.expand_path('../../fixtures', __FILE__)
7
+ Typhoeus.stub('http://www.example.com').and_return Typhoeus::Response.new(code: 200, body: File.read(File.join(fixture_directory, 'example_com.html')))
8
+ Typhoeus.stub('http://www.example.com/testing').and_return Typhoeus::Response.new(code: 200, body: File.read(File.join(fixture_directory, 'example_com_testing.html')))
9
+ Typhoeus.stub('http://www.example2.com').and_return Typhoeus::Response.new(code: 200, body: File.read(File.join(fixture_directory, 'example2_com.html')))
10
+ end
11
+
12
+ let(:worker) { Sledgehammer::CrawlWorker.new }
13
+
14
+ describe "#perform" do
15
+ it "finds all e-mail addresses on the first site" do
16
+ worker.perform(['http://www.example.com'])
17
+ expect(Sledgehammer::Contact.count).to eq(3) # example_com.html contains three addresses
18
+ end
19
+
20
+ it "doesn't work when the depth limit is hit" do
21
+ worker.perform(['http://www.example.com'], { 'depth' => 2, 'depth_limit' => 2 })
22
+ expect(Sledgehammer::Contact.count).to eq(0) # perform returns before queueing anything
23
+ end
24
+
25
+ it "crawls all pages and finds e-mails on them" do
26
+ Sidekiq::Testing.inline! do
27
+ worker.perform(['http://www.example.com'], { 'depth_limit' => 3 })
28
+ expect(Sledgehammer::Contact.count).to eq(5) # three from the first page plus one from each linked page
29
+ expect(Sledgehammer::Page.count).to eq(3)
30
+ end
31
+ end
32
+ end
33
+ end
metadata ADDED
@@ -0,0 +1,201 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sledgehammer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Michał Matyas
8
+ - Maciej Walusiak
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-07-10 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.5'
21
+ type: :development
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '1.5'
28
+ - !ruby/object:Gem::Dependency
29
+ name: rake
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '10.0'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '10.0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: rspec
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '3.0'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '3.0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: rspec-sidekiq
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - "~>"
61
+ - !ruby/object:Gem::Version
62
+ version: 1.0.0
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: 1.0.0
70
+ - !ruby/object:Gem::Dependency
71
+ name: sqlite3
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: activerecord
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '4.1'
91
+ type: :runtime
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - "~>"
96
+ - !ruby/object:Gem::Version
97
+ version: '4.1'
98
+ - !ruby/object:Gem::Dependency
99
+ name: typhoeus
100
+ requirement: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - "~>"
103
+ - !ruby/object:Gem::Version
104
+ version: '0.6'
105
+ type: :runtime
106
+ prerelease: false
107
+ version_requirements: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - "~>"
110
+ - !ruby/object:Gem::Version
111
+ version: '0.6'
112
+ - !ruby/object:Gem::Dependency
113
+ name: sidekiq
114
+ requirement: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - "~>"
117
+ - !ruby/object:Gem::Version
118
+ version: '3.1'
119
+ type: :runtime
120
+ prerelease: false
121
+ version_requirements: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - "~>"
124
+ - !ruby/object:Gem::Version
125
+ version: '3.1'
126
+ description: Website crawler harvesting e-mails. Uses Sidekiq and Typhoeus.
127
+ email:
128
+ - michal@higher.lv
129
+ - rabsztok@gmail.com
130
+ executables: []
131
+ extensions: []
132
+ extra_rdoc_files: []
133
+ files:
134
+ - ".gitignore"
135
+ - ".rspec"
136
+ - Gemfile
137
+ - Gemfile.lock
138
+ - LICENSE
139
+ - README.md
140
+ - Rakefile
141
+ - coverage/.last_run.json
142
+ - db/migrate/20140626075744_create_pages.rb
143
+ - db/migrate/20140626080142_create_contacts.rb
144
+ - db/migrate/20140626105612_create_websites.rb
145
+ - db/migrate/20140704070249_create_page_contacts.rb
146
+ - lib/generators/sledgehammer/USAGE
147
+ - lib/generators/sledgehammer/install_generator.rb
148
+ - lib/sledgehammer.rb
149
+ - lib/sledgehammer/models/contact.rb
150
+ - lib/sledgehammer/models/page.rb
151
+ - lib/sledgehammer/models/page_contact.rb
152
+ - lib/sledgehammer/models/website.rb
153
+ - lib/sledgehammer/version.rb
154
+ - lib/sledgehammer/workers/crawl_worker.rb
155
+ - sledgehammer.gemspec
156
+ - spec/fixtures/example2_com.html
157
+ - spec/fixtures/example_com.html
158
+ - spec/fixtures/example_com_testing.html
159
+ - spec/models/contact_spec.rb
160
+ - spec/models/page_contact.rb
161
+ - spec/models/page_spec.rb
162
+ - spec/models/website_spec.rb
163
+ - spec/spec_helper.rb
164
+ - spec/support/active_record_helper.rb
165
+ - spec/workers/crawl_worker_spec.rb
166
+ homepage: https://github.com/growthrepublic/sledgehammer
167
+ licenses:
168
+ - MIT
169
+ metadata: {}
170
+ post_install_message:
171
+ rdoc_options: []
172
+ require_paths:
173
+ - lib
174
+ required_ruby_version: !ruby/object:Gem::Requirement
175
+ requirements:
176
+ - - ">="
177
+ - !ruby/object:Gem::Version
178
+ version: '0'
179
+ required_rubygems_version: !ruby/object:Gem::Requirement
180
+ requirements:
181
+ - - ">="
182
+ - !ruby/object:Gem::Version
183
+ version: '0'
184
+ requirements: []
185
+ rubyforge_project:
186
+ rubygems_version: 2.2.2
187
+ signing_key:
188
+ specification_version: 4
189
+ summary: Crawls websites and harvests e-mails
190
+ test_files:
191
+ - spec/fixtures/example2_com.html
192
+ - spec/fixtures/example_com.html
193
+ - spec/fixtures/example_com_testing.html
194
+ - spec/models/contact_spec.rb
195
+ - spec/models/page_contact.rb
196
+ - spec/models/page_spec.rb
197
+ - spec/models/website_spec.rb
198
+ - spec/spec_helper.rb
199
+ - spec/support/active_record_helper.rb
200
+ - spec/workers/crawl_worker_spec.rb
201
+ has_rdoc: