sledgehammer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8391efc4413b6922054b8531950a5412f3d95429
4
+ data.tar.gz: 1b8b0b2301cfc4ee217d92c208bc4cd88ecde5f2
5
+ SHA512:
6
+ metadata.gz: 398055d333de9da4f07d110dd602c9ab66d78fd4ddbfdcd4e5ca0bdea92954da343d96f7ba5fb5c3e83b511179a0e6ddf8449ef5011cb885fa4ac4366be3a61e
7
+ data.tar.gz: 055c2d8e2852fa8dd605c2236b8e962373be94f2022d359f021dbf09d4e84a6aea393c65b95f1ef3b7d31f49084bde26a45bb40ceccc6a4d30b3a6d290d347fa
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ .idea
2
+ tmp/
3
+ .DS_Store
4
+ */.DS_Store
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
# Fetch gems from the public RubyGems index.
source 'https://rubygems.org'

# Take the dependency list from the gemspec that sits next to this file.
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,79 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ sledgehammer (0.1.0)
5
+ activerecord (~> 4.1)
6
+ sidekiq (~> 3.1)
7
+ typhoeus (~> 0.6)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ activemodel (4.1.4)
13
+ activesupport (= 4.1.4)
14
+ builder (~> 3.1)
15
+ activerecord (4.1.4)
16
+ activemodel (= 4.1.4)
17
+ activesupport (= 4.1.4)
18
+ arel (~> 5.0.0)
19
+ activesupport (4.1.4)
20
+ i18n (~> 0.6, >= 0.6.9)
21
+ json (~> 1.7, >= 1.7.7)
22
+ minitest (~> 5.1)
23
+ thread_safe (~> 0.1)
24
+ tzinfo (~> 1.1)
25
+ arel (5.0.1.20140414130214)
26
+ builder (3.2.2)
27
+ celluloid (0.15.2)
28
+ timers (~> 1.1.0)
29
+ connection_pool (2.0.0)
30
+ diff-lcs (1.2.5)
31
+ ethon (0.7.1)
32
+ ffi (>= 1.3.0)
33
+ ffi (1.9.3)
34
+ i18n (0.6.11)
35
+ json (1.8.1)
36
+ minitest (5.4.0)
37
+ rake (10.3.2)
38
+ redis (3.1.0)
39
+ redis-namespace (1.5.0)
40
+ redis (~> 3.0, >= 3.0.4)
41
+ rspec (3.0.0)
42
+ rspec-core (~> 3.0.0)
43
+ rspec-expectations (~> 3.0.0)
44
+ rspec-mocks (~> 3.0.0)
45
+ rspec-core (3.0.2)
46
+ rspec-support (~> 3.0.0)
47
+ rspec-expectations (3.0.2)
48
+ diff-lcs (>= 1.2.0, < 2.0)
49
+ rspec-support (~> 3.0.0)
50
+ rspec-mocks (3.0.2)
51
+ rspec-support (~> 3.0.0)
52
+ rspec-sidekiq (1.0.0)
53
+ rspec (>= 2.0.0)
54
+ sidekiq (>= 2.4.0)
55
+ rspec-support (3.0.2)
56
+ sidekiq (3.2.1)
57
+ celluloid (>= 0.15.2)
58
+ connection_pool (>= 2.0.0)
59
+ json
60
+ redis (>= 3.0.6)
61
+ redis-namespace (>= 1.3.1)
62
+ sqlite3 (1.3.9)
63
+ thread_safe (0.3.4)
64
+ timers (1.1.0)
65
+ typhoeus (0.6.9)
66
+ ethon (>= 0.7.1)
67
+ tzinfo (1.2.1)
68
+ thread_safe (~> 0.1)
69
+
70
+ PLATFORMS
71
+ ruby
72
+
73
+ DEPENDENCIES
74
+ bundler (~> 1.5)
75
+ rake (~> 10.0)
76
+ rspec (~> 3.0)
77
+ rspec-sidekiq (~> 1.0.0)
78
+ sledgehammer!
79
+ sqlite3
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Growth Republic Ltd.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # Sledgehammer
2
+
3
+ Sledgehammer is a gem that crawls websites in search of email addresses.
4
+ It uses Typhoeus and Sidekiq to spawn ultra-fast workers which gather data in no time.
5
+
6
+ ## Installation
7
+
8
+ Include the gem in your Gemfile
9
+
10
+ ```ruby
11
+ gem "sledgehammer"
12
+ ```
13
+
14
+ Bundle the Gemfile
15
+
16
+ ```ruby
17
+ bundle install
18
+ ```
19
+
20
+ Run the install generator, which copies the gem's migration files into your application.
21
+
22
+ ```ruby
23
+ bundle exec rails generate sledgehammer:install
24
+ ```
25
+
26
+ Migrate your database
27
+
28
+ ```ruby
29
+ bundle exec rake db:migrate
30
+ ```
31
+
32
+ ## Setup
33
+
34
+ Be careful when using this gem in an application backed by a SQLite3 database.
35
+ Because the gem is multi-threaded, you will be greeted with "SQLite3::BusyException: database is locked" errors.
36
+ PostgreSQL, MySQL or MongoDB should be just fine.
37
+
38
+ ## Usage
39
+
40
+ Run the Sidekiq worker from your code:
41
+
42
+
43
+ ```ruby
44
+ Sledgehammer::CrawlWorker.perform_async ARRAY_OF_URLS, [OPTIONS]
45
+ ```
46
+
47
+ Here is sample usage:
48
+
49
+ ```ruby
50
+ Sledgehammer::CrawlWorker.perform_async ['http://example.com'], { depth_limit: 3 }
51
+ ```
52
+
53
+ ## Contributors
54
+
55
+ [d4rky-pl](https://github.com/d4rky-pl)
56
+
57
+ [rabsztok](https://github.com/rabsztok)
58
+
59
+ ## License
60
+
61
+ Sledgehammer is Copyright © 2014 Growth Republic. It is free software, and may be redistributed under the terms specified in the LICENSE file.
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
require 'rspec/core/rake_task'
require 'bundler/gem_tasks'

# `rake spec` runs the RSpec suite with coloured output.
RSpec::Core::RakeTask.new(:spec) { |spec_task| spec_task.rspec_opts = %w[--color] }

# A bare `rake` runs the specs.
task default: :spec
@@ -0,0 +1,5 @@
1
+ {
2
+ "result": {
3
+ "covered_percent": 99.31
4
+ }
5
+ }
@@ -0,0 +1,12 @@
1
# Creates the +pages+ table: a URL, the crawl depth recorded for it, a
# completion flag and a reference to the owning website.
class CreatePages < ActiveRecord::Migration
  def change
    create_table(:pages) do |table|
      table.belongs_to :website, index: true
      table.column :url, :string
      table.column :depth, :integer
      table.column :completed, :boolean

      table.timestamps
    end
  end
end
@@ -0,0 +1,9 @@
1
# Creates the +contacts+ table holding one row per harvested e-mail address.
class CreateContacts < ActiveRecord::Migration
  def change
    create_table :contacts do |t|
      t.string :email

      t.timestamps
    end

    # Uniqueness has to be declared on an index: a `unique: true` option
    # passed to the column definition itself creates no constraint.
    add_index :contacts, :email, unique: true
  end
end
@@ -0,0 +1,9 @@
1
# Creates the +websites+ table: one row per hostname.
class CreateWebsites < ActiveRecord::Migration
  def change
    create_table(:websites) do |table|
      table.column :hostname, :string

      table.timestamps
    end
  end
end
@@ -0,0 +1,8 @@
1
# Creates the +page_contacts+ join table linking pages to contacts.
class CreatePageContacts < ActiveRecord::Migration
  def change
    create_table(:page_contacts) do |table|
      table.belongs_to :page, index: true
      table.belongs_to :contact, index: true
    end
  end
end
@@ -0,0 +1,11 @@
1
+ Description:
2
+ Generates ActiveRecord migrations needed to use this gem in Rails application
3
+
4
+ Example:
5
+ rails generate sledgehammer:install
6
+
7
+ This will create:
8
+ db/migrate/20140626075744_create_pages.rb
9
+ db/migrate/20140626080142_create_contacts.rb
10
+ db/migrate/20140626105612_create_websites.rb
11
+ db/migrate/20140704070249_create_page_contacts.rb
@@ -0,0 +1,16 @@
1
require 'rails/generators/base'

module Sledgehammer
  module Generators
    # Generator behind `rails generate sledgehammer:install`: copies the
    # gem's migrations into the host application under the same paths.
    class InstallGenerator < Rails::Generators::Base
      # Templates are resolved four directory levels above this file.
      source_root File.expand_path('../../../../', __FILE__)

      # Copies each bundled migration, keeping its original file name.
      def generate_migrations
        migration_files = %w[
          20140626075744_create_pages.rb
          20140626080142_create_contacts.rb
          20140626105612_create_websites.rb
          20140704070249_create_page_contacts.rb
        ]

        migration_files.each { |file_name| copy_file "db/migrate/#{file_name}" }
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
# Entry point of the gem: loads the external libraries first, then the
# gem's own version, models and worker.

# The models subclass ActiveRecord::Base and parse URLs with URI, so both
# libraries must be loaded before the model files are required.
require 'uri'
require 'active_record'
require 'sidekiq'
require 'typhoeus'
require 'sledgehammer/version'
require 'sledgehammer/models/contact'
require 'sledgehammer/models/page'
require 'sledgehammer/models/page_contact'
require 'sledgehammer/models/website'
require 'sledgehammer/workers/crawl_worker'

# Namespace for the gem's models and workers.
module Sledgehammer
end
@@ -0,0 +1,6 @@
1
# An e-mail address, linked to the pages it is associated with through
# +page_contacts+.
class Sledgehammer::Contact < ActiveRecord::Base
  has_many :page_contacts
  has_many :pages, through: :page_contacts

  # Each address may be stored only once.
  validates_uniqueness_of :email
end
@@ -0,0 +1,12 @@
1
require 'uri'

# A single URL that belongs to a website and is linked to contacts through
# +page_contacts+.
class Sledgehammer::Page < ActiveRecord::Base
  belongs_to :website
  has_many :page_contacts
  has_many :contacts, through: :page_contacts
  before_create :assign_website

  protected

  # Attaches the page to the website matching the host of its URL, creating
  # that website when it does not exist yet.
  #
  # Deliberately not named +create_website!+: `belongs_to :website` already
  # generates a builder with that name, and redefining it here would replace
  # the association method.
  def assign_website
    hostname = URI.parse(url).host
    self.website = Sledgehammer::Website.find_or_create_by(hostname: hostname)
  end
end
@@ -0,0 +1,4 @@
1
# Join model connecting one page with one contact.
class Sledgehammer::PageContact < ActiveRecord::Base
  belongs_to(:page)
  belongs_to(:contact)
end
@@ -0,0 +1,4 @@
1
# A website that owns pages and, through them, contacts.
class Sledgehammer::Website < ActiveRecord::Base
  has_many(:pages)
  has_many(:contacts, through: :pages)
end
@@ -0,0 +1,3 @@
1
module Sledgehammer
  # Gem version, read by the gemspec. Frozen so it cannot be mutated at runtime.
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,113 @@
1
require 'uri'

# Sidekiq worker that fetches a batch of URLs through Typhoeus, stores the
# e-mail addresses found in each response and pushes the links it discovers
# as a follow-up job one level deeper, until +depth_limit+ is reached.
class Sledgehammer::CrawlWorker
  include ::Sidekiq::Worker

  # E-mail-like tokens. The negative lookahead keeps image file names such
  # as "logo@2x.png" from being treated as addresses.
  MAIL_REGEX = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.(?!jpg|gif|png)[A-Z0-9]+/i
  # Captures double-quoted href values of <a> tags that start with "http" or "/".
  URL_REGEX = /<a\s+(?:[^>]*?\s+)?href="((?:http|\/)[^"]+)"/
  # Defaults merged with the options given to #perform.
  DEFAULT_OPTIONS = { depth: 0, depth_limit: 1, queue: 'default' }.freeze

  #
  # Callbacks to overload in application
  #

  # Called with the URL list before any request is queued.
  def before_queue(urls)
    # stub
  end

  #
  # Stops element from being added to queue if returns false
  #
  def on_queue(url)
    true
  end

  # Called with the URL list after all queued requests have run.
  def after_queue(urls)
    # stub
  end

  # Handles a finished request: records the page and, unless the page is
  # already marked completed, harvests its e-mails and links.
  #
  # @param response [Typhoeus::Response]
  def on_complete(response)
    page = find_or_create_page!(response.request.url)
    return if page.completed?

    parse_emails(response, page)
    parse_urls(response)
    page.update completed: true
  end

  #
  # There shouldn't be any need to overload methods below
  #

  # Sidekiq entry point.
  #
  # @param urls [Array<String>, String] URLs to fetch in this job
  # @param opts [Hash] overrides for DEFAULT_OPTIONS (:depth, :depth_limit,
  #   :queue); string and symbol keys are both accepted
  def perform(urls, opts = {})
    @options = ActiveSupport::HashWithIndifferentAccess.new(DEFAULT_OPTIONS)
    @options.merge!(opts || {})

    # `>=` matches the check in #parse_urls, so a depth beyond the limit
    # also stops the crawl.
    return if @options[:depth] >= @options[:depth_limit]

    before_queue(urls)
    Array(urls).each { |site| queue(site) }
    run_queue
    after_queue(urls)
  end

  # Queues a request for +url+ unless #on_queue vetoes it or the URL does
  # not parse.
  def queue(url)
    return unless on_queue(url) && valid_url?(url)

    request = Typhoeus::Request.new(url)
    request.on_complete { |response| on_complete(response) }

    Typhoeus::Hydra.hydra.queue(request)
  end

  # Runs every request queued so far.
  def run_queue
    Typhoeus::Hydra.hydra.run
  end

  protected

  # Finds the page stored for +request_url+ or creates it, together with its
  # website, at the current depth.
  #
  # @return [Sledgehammer::Page]
  def find_or_create_page!(request_url)
    page = Sledgehammer::Page.find_by(url: request_url)

    if page.nil?
      hostname = URI.parse(request_url).host
      website = Sledgehammer::Website.find_or_create_by(hostname: hostname)
      page = Sledgehammer::Page.create!(url: request_url, depth: @options[:depth], website: website)
    elsif page.depth < @options[:depth]
      # NOTE(review): this re-opens a page that was stored at a shallower
      # depth than the current one, so pages linking to each other are
      # parsed again on every level. Kept as-is; confirm the comparison is
      # not meant to be the other way round.
      page.update completed: false
    end
    page
  end

  # Stores every distinct address found in the response body and links it
  # to +page+.
  def parse_emails(response, page)
    # to_s guards against a response without a body.
    response.body.to_s.scan(MAIL_REGEX).uniq.each do |email|
      contact = Sledgehammer::Contact.find_or_create_by(email: email)
      Sledgehammer::PageContact.find_or_create_by page: page, contact: contact
    end
  end

  # Collects the links of the response body and pushes them as one job at
  # depth + 1, unless that depth reaches the limit or no link was found.
  def parse_urls(response)
    request_url = response.request.url
    request_url = "http://#{request_url}" unless request_url =~ /\Ahttp/

    url_list = response.body.to_s.scan(URL_REGEX).flatten.map do |href|
      target = absolute_url(request_url, href)
      # Skip links pointing back at the page that is being parsed.
      target unless target == request_url
    end.compact.uniq

    opts = @options.dup
    opts[:depth] += 1
    return if opts[:depth] >= opts[:depth_limit] || url_list.empty?

    Sidekiq::Client.push('queue' => opts[:queue],
                         'class' => self.class,
                         'args' => [url_list, opts])
  end

  # Resolves +href+ against +base+ when it is a path; returns nil for links
  # that cannot be parsed or joined.
  def absolute_url(base, href)
    return nil unless valid_url?(href)

    href.start_with?('/') ? URI.join(base, href).to_s : href
  rescue URI::Error
    nil
  end

  # True when +url+ is a String that URI can parse.
  def valid_url?(url)
    return false unless url.is_a?(String)

    URI.parse(url)
    true
  rescue URI::InvalidURIError
    false
  end
end
@@ -0,0 +1,29 @@
1
# Make lib/ loadable so the version constant can be read below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require 'sledgehammer/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name = 'sledgehammer'
  gem.version = Sledgehammer::VERSION

  # Authorship and description
  gem.authors = ['Michał Matyas', 'Maciej Walusiak']
  gem.email = ['michal@higher.lv', 'rabsztok@gmail.com']
  gem.summary = 'Crawls websites and harvests e-mails'
  gem.description = 'Website crawler harvesting e-mails. Uses Sidekiq and Typhoeus.'
  gem.homepage = 'https://github.com/growthrepublic/sledgehammer'
  gem.license = 'MIT'

  # Package every file tracked by git; test files are the tracked files
  # under test/, spec/ or features/.
  tracked_files = `git ls-files`.split($/)
  gem.files = tracked_files
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Development-only dependencies
  {
    'bundler' => '~> 1.5',
    'rake' => '~> 10.0',
    'rspec' => '~> 3.0',
    'rspec-sidekiq' => '~> 1.0.0'
  }.each { |dep_name, requirement| gem.add_development_dependency dep_name, requirement }
  gem.add_development_dependency 'sqlite3'

  # Runtime dependencies
  {
    'activerecord' => '~> 4.1',
    'typhoeus' => '~> 0.6',
    'sidekiq' => '~> 3.1'
  }.each { |dep_name, requirement| gem.add_runtime_dependency dep_name, requirement }
end
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Example website 2: Tests Attack</title>
6
+ </head>
7
+ <body>
8
+ <div class="container">This is a second example site to test following links.</div>
9
+ <div>
10
+ test4@example.com
11
+ </div>
12
+ </body>
13
+ </html>
@@ -0,0 +1,21 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Example website</title>
6
+ </head>
7
+ <body>
8
+ <div class="container">This is an example site for the testing purposes. It contains two external links to different test pages and three e-mails.</div>
9
+ <div>
10
+ <a class="abc" href="#">This link should not be found</a>
11
+ <a href="/testing">This is relative link, it should be properly translated and followed</a>
12
+ <a href="http://www.example2.com">This is external link, it should be followed</a>
13
+ <a href="http:/www(broken).\com">This is corrupted external link, it should not be followed</a>
14
+ </div>
15
+ <div>
16
+ <a href="mailto:test1@example.com">This is first e-mail, inside link</a>
17
+ test2@example.com This is second e-mail, outside antyhing and surrounded only by whitespaces
18
+ john.doe@especially.long.tld.xn--clchc0ea0b2g2a9gcd This is third e-mail, with the longest possible TLD
19
+ </div>
20
+ </body>
21
+ </html>
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Example website / Testing</title>
6
+ </head>
7
+ <body>
8
+ <div class="container">This is a subpage of Example website.</div>
9
+ <div>
10
+ test3@example.com
11
+ </div>
12
+ </body>
13
+ </html>
@@ -0,0 +1,3 @@
1
# Placeholder spec for Sledgehammer::Contact; it contains no examples yet.
RSpec.describe(Sledgehammer::Contact, type: :model) do
  pending "add some examples to (or delete) #{__FILE__}"
end
@@ -0,0 +1,3 @@
1
# Placeholder spec for Sledgehammer::PageContact; it contains no examples yet.
RSpec.describe(Sledgehammer::PageContact, type: :model) do
  pending "add some examples to (or delete) #{__FILE__}"
end
@@ -0,0 +1,3 @@
1
# Placeholder spec for Sledgehammer::Page; it contains no examples yet.
RSpec.describe(Sledgehammer::Page, type: :model) do
  pending "add some examples to (or delete) #{__FILE__}"
end
@@ -0,0 +1,3 @@
1
# Placeholder spec for Sledgehammer::Website; it contains no examples yet.
RSpec.describe(Sledgehammer::Website, type: :model) do
  pending "add some examples to (or delete) #{__FILE__}"
end
@@ -0,0 +1,8 @@
1
# The database helper goes first: it loads ActiveRecord before the gem's
# models are required.
require_relative 'support/active_record_helper'
require_relative '../lib/sledgehammer'

RSpec.configure do |rspec|
  # Drop the Typhoeus stubs registered so far before each example.
  rspec.before(:each) { Typhoeus::Expectation.clear }
end
@@ -0,0 +1,13 @@
1
require 'active_record'

# The specs run against an in-memory SQLite database built from the gem's
# own migrations.
ActiveRecord::Base.establish_connection adapter: "sqlite3", database: ":memory:"

# Resolve the migration directory from this file's location so the suite
# does not depend on the directory rspec is started from.
ActiveRecord::Migrator.up File.expand_path('../../../db/migrate', __FILE__)

RSpec.configure do |config|
  # Wrap every example in a transaction that is always rolled back, so no
  # example sees rows written by another.
  config.around do |example|
    ActiveRecord::Base.transaction do
      example.run
      raise ActiveRecord::Rollback
    end
  end
end
@@ -0,0 +1,33 @@
1
require 'sidekiq/testing'

RSpec.describe Sledgehammer::CrawlWorker, sidekiq: :fake do
  # Each stubbed URL is answered with the HTML fixture listed next to it.
  before(:example) do
    fixture_directory = File.expand_path('../../fixtures', __FILE__)
    stubbed_pages = {
      'http://www.example.com' => 'example_com.html',
      'http://www.example.com/testing' => 'example_com_testing.html',
      'http://www.example2.com' => 'example2_com.html'
    }

    stubbed_pages.each do |address, fixture_name|
      html = File.read(File.join(fixture_directory, fixture_name))
      Typhoeus.stub(address).and_return Typhoeus::Response.new(code: 200, body: html)
    end
  end

  let(:worker) { Sledgehammer::CrawlWorker.new }

  describe "#perform" do
    it "finds all e-mail addresses on the first site" do
      worker.perform(['http://www.example.com'])

      expect(Sledgehammer::Contact.count).to eq(3)
    end

    it "doesn't work when the depth limit is hit" do
      limit_reached = { 'depth' => 2, 'depth_limit' => 2 }
      worker.perform(['http://www.example.com'], limit_reached)

      expect(Sledgehammer::Contact.count).to eq(0)
    end

    it "crawls all pages and finds e-mails on them" do
      # Inline mode runs the follow-up jobs pushed by the worker immediately.
      Sidekiq::Testing.inline! do
        worker.perform(['http://www.example.com'], { 'depth_limit' => 3 })

        expect(Sledgehammer::Contact.count).to eq(5)
        expect(Sledgehammer::Page.count).to eq(3)
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,201 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sledgehammer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Michał Matyas
8
+ - Maciej Walusiak
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-07-10 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.5'
21
+ type: :development
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '1.5'
28
+ - !ruby/object:Gem::Dependency
29
+ name: rake
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '10.0'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '10.0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: rspec
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '3.0'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '3.0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: rspec-sidekiq
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - "~>"
61
+ - !ruby/object:Gem::Version
62
+ version: 1.0.0
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: 1.0.0
70
+ - !ruby/object:Gem::Dependency
71
+ name: sqlite3
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: activerecord
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '4.1'
91
+ type: :runtime
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - "~>"
96
+ - !ruby/object:Gem::Version
97
+ version: '4.1'
98
+ - !ruby/object:Gem::Dependency
99
+ name: typhoeus
100
+ requirement: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - "~>"
103
+ - !ruby/object:Gem::Version
104
+ version: '0.6'
105
+ type: :runtime
106
+ prerelease: false
107
+ version_requirements: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - "~>"
110
+ - !ruby/object:Gem::Version
111
+ version: '0.6'
112
+ - !ruby/object:Gem::Dependency
113
+ name: sidekiq
114
+ requirement: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - "~>"
117
+ - !ruby/object:Gem::Version
118
+ version: '3.1'
119
+ type: :runtime
120
+ prerelease: false
121
+ version_requirements: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - "~>"
124
+ - !ruby/object:Gem::Version
125
+ version: '3.1'
126
+ description: Website crawler harvesting e-mails. Uses Sidekiq and Typhoeus.
127
+ email:
128
+ - michal@higher.lv
129
+ - rabsztok@gmail.com
130
+ executables: []
131
+ extensions: []
132
+ extra_rdoc_files: []
133
+ files:
134
+ - ".gitignore"
135
+ - ".rspec"
136
+ - Gemfile
137
+ - Gemfile.lock
138
+ - LICENSE
139
+ - README.md
140
+ - Rakefile
141
+ - coverage/.last_run.json
142
+ - db/migrate/20140626075744_create_pages.rb
143
+ - db/migrate/20140626080142_create_contacts.rb
144
+ - db/migrate/20140626105612_create_websites.rb
145
+ - db/migrate/20140704070249_create_page_contacts.rb
146
+ - lib/generators/sledgehammer/USAGE
147
+ - lib/generators/sledgehammer/install_generator.rb
148
+ - lib/sledgehammer.rb
149
+ - lib/sledgehammer/models/contact.rb
150
+ - lib/sledgehammer/models/page.rb
151
+ - lib/sledgehammer/models/page_contact.rb
152
+ - lib/sledgehammer/models/website.rb
153
+ - lib/sledgehammer/version.rb
154
+ - lib/sledgehammer/workers/crawl_worker.rb
155
+ - sledgehammer.gemspec
156
+ - spec/fixtures/example2_com.html
157
+ - spec/fixtures/example_com.html
158
+ - spec/fixtures/example_com_testing.html
159
+ - spec/models/contact_spec.rb
160
+ - spec/models/page_contact.rb
161
+ - spec/models/page_spec.rb
162
+ - spec/models/website_spec.rb
163
+ - spec/spec_helper.rb
164
+ - spec/support/active_record_helper.rb
165
+ - spec/workers/crawl_worker_spec.rb
166
+ homepage: https://github.com/growthrepublic/sledgehammer
167
+ licenses:
168
+ - MIT
169
+ metadata: {}
170
+ post_install_message:
171
+ rdoc_options: []
172
+ require_paths:
173
+ - lib
174
+ required_ruby_version: !ruby/object:Gem::Requirement
175
+ requirements:
176
+ - - ">="
177
+ - !ruby/object:Gem::Version
178
+ version: '0'
179
+ required_rubygems_version: !ruby/object:Gem::Requirement
180
+ requirements:
181
+ - - ">="
182
+ - !ruby/object:Gem::Version
183
+ version: '0'
184
+ requirements: []
185
+ rubyforge_project:
186
+ rubygems_version: 2.2.2
187
+ signing_key:
188
+ specification_version: 4
189
+ summary: Crawls websites and harvests e-mails
190
+ test_files:
191
+ - spec/fixtures/example2_com.html
192
+ - spec/fixtures/example_com.html
193
+ - spec/fixtures/example_com_testing.html
194
+ - spec/models/contact_spec.rb
195
+ - spec/models/page_contact.rb
196
+ - spec/models/page_spec.rb
197
+ - spec/models/website_spec.rb
198
+ - spec/spec_helper.rb
199
+ - spec/support/active_record_helper.rb
200
+ - spec/workers/crawl_worker_spec.rb
201
+ has_rdoc: