scruber-mongo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +3 -0
- data/.travis.yml +5 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +90 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/scruber/core/extensions/mongo_output.rb +94 -0
- data/lib/scruber/helpers/fetcher_agent_adapters/mongo.rb +68 -0
- data/lib/scruber/mongo.rb +34 -0
- data/lib/scruber/mongo/cli/generators.rb +32 -0
- data/lib/scruber/mongo/cli/templates/mongo.tt +106 -0
- data/lib/scruber/mongo/configuration.rb +23 -0
- data/lib/scruber/mongo/factory.rb +26 -0
- data/lib/scruber/mongo/version.rb +5 -0
- data/lib/scruber/queue_adapters/mongo.rb +98 -0
- data/scruber-mongo.gemspec +40 -0
- metadata +163 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: bcedbbaa8337e7002b6bbcfc8f5c87394fd534d3
|
4
|
+
data.tar.gz: 7e95834b4b2b1a39efed8fc984401db78ef26bbb
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e3ed0f1aa68d4a720527fc35a6e444f77da821ab234f088bb16aaad1af66c6c8543a1bde1d7bc08541d176d2f7ee1b3fdee8a51ca04ee99653853d5622b4ca8b
|
7
|
+
data.tar.gz: 6372a6bb978b68e8bc8d748a22f2ac20dbdcd5c436ddd4c5b2f65a5f8c9814dd9fd9d6f6072f24bfa65d06ad7d64939655b5aac80dee68302aee8c283d9f3347
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
scruber-mongo (0.1.0)
|
5
|
+
mongo (~> 2.4)
|
6
|
+
scruber (~> 0.1.3)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
activesupport (5.1.5)
|
12
|
+
concurrent-ruby (~> 1.0, >= 1.0.2)
|
13
|
+
i18n (~> 0.7)
|
14
|
+
minitest (~> 5.1)
|
15
|
+
tzinfo (~> 1.1)
|
16
|
+
addressable (2.5.2)
|
17
|
+
public_suffix (>= 2.0.2, < 4.0)
|
18
|
+
bson (4.3.0)
|
19
|
+
concurrent-ruby (1.0.5)
|
20
|
+
crack (0.4.3)
|
21
|
+
safe_yaml (~> 1.0.0)
|
22
|
+
database_cleaner (1.6.2)
|
23
|
+
diff-lcs (1.3)
|
24
|
+
domain_name (0.5.20170404)
|
25
|
+
unf (>= 0.0.5, < 1.0.0)
|
26
|
+
ethon (0.11.0)
|
27
|
+
ffi (>= 1.3.0)
|
28
|
+
ffi (1.9.23)
|
29
|
+
hashdiff (0.3.7)
|
30
|
+
http-cookie (1.0.3)
|
31
|
+
domain_name (~> 0.5)
|
32
|
+
i18n (0.9.5)
|
33
|
+
concurrent-ruby (~> 1.0)
|
34
|
+
mini_portile2 (2.3.0)
|
35
|
+
minitest (5.11.3)
|
36
|
+
mongo (2.5.1)
|
37
|
+
bson (>= 4.3.0, < 5.0.0)
|
38
|
+
nokogiri (1.8.2)
|
39
|
+
mini_portile2 (~> 2.3.0)
|
40
|
+
pickup (0.0.11)
|
41
|
+
public_suffix (3.0.2)
|
42
|
+
rake (10.5.0)
|
43
|
+
rspec (3.7.0)
|
44
|
+
rspec-core (~> 3.7.0)
|
45
|
+
rspec-expectations (~> 3.7.0)
|
46
|
+
rspec-mocks (~> 3.7.0)
|
47
|
+
rspec-core (3.7.1)
|
48
|
+
rspec-support (~> 3.7.0)
|
49
|
+
rspec-expectations (3.7.0)
|
50
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
51
|
+
rspec-support (~> 3.7.0)
|
52
|
+
rspec-mocks (3.7.0)
|
53
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
54
|
+
rspec-support (~> 3.7.0)
|
55
|
+
rspec-support (3.7.1)
|
56
|
+
safe_yaml (1.0.4)
|
57
|
+
scruber (0.1.3)
|
58
|
+
activesupport (= 5.1.5)
|
59
|
+
http-cookie (= 1.0.3)
|
60
|
+
nokogiri (= 1.8.2)
|
61
|
+
pickup (= 0.0.11)
|
62
|
+
thor (= 0.20.0)
|
63
|
+
typhoeus (= 1.1.2)
|
64
|
+
thor (0.20.0)
|
65
|
+
thread_safe (0.3.6)
|
66
|
+
typhoeus (1.1.2)
|
67
|
+
ethon (>= 0.9.0)
|
68
|
+
tzinfo (1.2.5)
|
69
|
+
thread_safe (~> 0.1)
|
70
|
+
unf (0.1.4)
|
71
|
+
unf_ext
|
72
|
+
unf_ext (0.0.7.5)
|
73
|
+
webmock (3.0.1)
|
74
|
+
addressable (>= 2.3.6)
|
75
|
+
crack (>= 0.3.2)
|
76
|
+
hashdiff
|
77
|
+
|
78
|
+
PLATFORMS
|
79
|
+
ruby
|
80
|
+
|
81
|
+
DEPENDENCIES
|
82
|
+
bundler (~> 1.16)
|
83
|
+
database_cleaner (~> 1.6.0)
|
84
|
+
rake (~> 10.0)
|
85
|
+
rspec (~> 3.0)
|
86
|
+
scruber-mongo!
|
87
|
+
webmock (= 3.0.1)
|
88
|
+
|
89
|
+
BUNDLED WITH
|
90
|
+
1.16.1
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2018 Ivan Goncharov
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# Scruber::Mongo
|
2
|
+
|
3
|
+
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/scruber/mongo`. To experiment with that code, run `bin/console` for an interactive prompt.
|
4
|
+
|
5
|
+
TODO: Delete this and the text above, and describe your gem
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'scruber-mongo'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install scruber-mongo
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
TODO: Write usage instructions here
|
26
|
+
|
27
|
+
## Development
|
28
|
+
|
29
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
|
+
|
31
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
|
+
|
33
|
+
## Contributing
|
34
|
+
|
35
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/scruber/scruber-mongo.
|
36
|
+
|
37
|
+
## License
|
38
|
+
|
39
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Development console: boots Bundler, loads the scruber-mongo gem and
# drops into an interactive IRB session.

require "bundler/setup"
require "scruber/mongo"
require "irb"

# Fixtures and/or initialization code can be added here to make
# experimenting with the gem easier.
#
# A different console can be used instead of IRB, e.g. Pry
# (don't forget to add pry to your Gemfile!):
#   require "pry"
#   Pry.start

IRB.start(__FILE__)
|
data/bin/setup
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
module Scruber
  module Core
    module Extensions
      # Crawler extension that stores scraped records in MongoDB collections.
      #
      # Collection names are built from the configured 'collections_prefix',
      # the scraper name and a suffix (see .out_collection_name). The default
      # suffix is 'records'; other suffixes are reachable through dynamically
      # resolved methods such as `mongo_out_<suffix>`, `mongo_find_<suffix>`
      # and `mongo_<suffix>_collection`.
      class MongoOutput < Base
        # Methods exposed on the crawler instance. Each one delegates to the
        # class-level implementation using the crawler's `scraper_name`.
        module CoreMethods

          # Insert or upsert +fields+ into the default output collection.
          #
          # @param fields [Hash] document attributes; an `_id` triggers an upsert
          # @param options [Hash] extra options merged into the upsert call
          def mongo_out(fields, options={})
            Scruber::Core::Extensions::MongoOutput.mongo_out self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name, fields, options
          end

          # Look up documents in the default output collection.
          #
          # @param id [Object, Hash] an `_id` value, or a Hash used as a filter
          def mongo_find(id)
            Scruber::Core::Extensions::MongoOutput.mongo_find self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name, id
          end

          # @return the default output collection object
          def mongo_collection
            Scruber::Core::Extensions::MongoOutput.mongo_collection self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name
          end

          # Registers method_missing handlers on the crawler for the
          # suffix-based method families. Each handler defines the concrete
          # method on first use (so later calls bypass method_missing) and
          # then performs the requested operation for the current call.
          def self.included(base)
            Scruber::Core::Crawler.register_method_missing(/\Amongo_out_(\w+)\Z/) do |meth, scan_results, args|
              suffix = scan_results.first.first.to_sym
              # NOTE(review): the shape of +args+ is decided by
              # Crawler.register_method_missing, which is not visible here.
              # Accept both a nested `[[fields, options], ...]` and a flat
              # `[fields, options, ...]` layout; the previous
              # `fields, options = args.first` left options nil (and dropped
              # caller-supplied options) whenever args.first was a Hash.
              leading = args.first
              fields, options = leading.is_a?(Array) ? leading : args
              fields = {} if fields.nil?
              # Anything that is not a Hash (nil, a trailing block, ...) is
              # treated as "no options" so Hash#merge never receives nil.
              options = {} unless options.is_a?(Hash)
              Scruber::Core::Crawler.class_eval do
                define_method "mongo_out_#{suffix}".to_sym do |fields, opts={}|
                  Scruber::Core::Extensions::MongoOutput.mongo_out(self.scraper_name, suffix, fields, opts)
                end
              end
              Scruber::Core::Extensions::MongoOutput.mongo_out(self.scraper_name, suffix, fields, options)
            end
            Scruber::Core::Crawler.register_method_missing(/\Amongo_find_(\w+)\Z/) do |meth, scan_results, args|
              suffix = scan_results.first.first.to_sym
              id = args.first
              Scruber::Core::Crawler.class_eval do
                define_method "mongo_find_#{suffix}".to_sym do |id|
                  Scruber::Core::Extensions::MongoOutput.mongo_find(self.scraper_name, suffix, id)
                end
              end
              Scruber::Core::Extensions::MongoOutput.mongo_find(self.scraper_name, suffix, id)
            end
            Scruber::Core::Crawler.register_method_missing(/\Amongo_(\w+)_collection\Z/) do |meth, scan_results, args|
              suffix = scan_results.first.first.to_sym
              Scruber::Core::Crawler.class_eval do
                define_method "mongo_#{suffix}_collection".to_sym do
                  Scruber::Core::Extensions::MongoOutput.mongo_collection(self.scraper_name, suffix)
                end
              end
              Scruber::Core::Extensions::MongoOutput.mongo_collection(self.scraper_name, suffix)
            end
          end
        end

        class << self
          attr_writer :default_suffix_name

          # Suffix used by the un-suffixed helpers; defaults to 'records'.
          def default_suffix_name
            @default_suffix_name ||= 'records'
          end

          # Store +fields+ in the output collection for +scraper_name+/+suffix+.
          #
          # Without an `_id` the document is inserted; with an `_id` it is
          # upserted via find_one_and_update.
          #
          # @param scraper_name [String, Symbol]
          # @param suffix [String, Symbol]
          # @param fields [Hash] document attributes
          # @param options [Hash, nil] merged over the default upsert options;
          #   nil is accepted and treated as an empty Hash
          def mongo_out(scraper_name, suffix, fields, options={})
            fields = fields.with_indifferent_access
            if fields[:_id].blank?
              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].insert_one(fields)
            else
              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find_one_and_update(
                {"_id" => fields[:_id] },
                {'$set' => fields },
                # `options || {}` guards against callers passing nil, which
                # would otherwise raise TypeError inside Hash#merge.
                {return_document: :before, upsert: true}.merge(options || {})
              )
            end
          end

          # Find by filter or by `_id`.
          #
          # @param id [Hash, Object] a Hash is passed to `find` as-is and the
          #   resulting view is returned; any other value is matched against
          #   `_id` and the first match (or nil) is returned
          def mongo_find(scraper_name, suffix, id)
            if id.is_a?(Hash)
              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find(id)
            else
              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find({_id: id}).first
            end
          end

          # @return the collection object for +scraper_name+/+suffix+
          def mongo_collection(scraper_name, suffix)
            Scruber::Mongo.client[out_collection_name(scraper_name, suffix)]
          end

          # Build "<prefix>_<scraper_name>_<suffix>", skipping blank parts.
          def out_collection_name(scraper_name, suffix)
            [Scruber::Mongo.configuration.options['collections_prefix'], scraper_name, suffix].select(&:present?).map(&:to_s).join('_')
          end

        end

      end
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Scruber
  module Helpers
    module FetcherAgentAdapters
      # Fetcher agent adapter that persists agents in a MongoDB collection.
      class Mongo < AbstractAdapter
        # @param options [Hash] agent attributes; the identifier is read from
        #   `_id`, falling back to `id`, and is nil when neither is given
        def initialize(options={})
          indifferent = options.with_indifferent_access
          super(indifferent)
          @id = indifferent.fetch(:_id) { indifferent.fetch(:id) { nil } }
        end

        # Attributes written to MongoDB. Cookies are serialized first; `_id`
        # is only included when the agent already has an identifier.
        def attrs
          serialize_cookies
          document = {
            user_agent: @user_agent,
            proxy_id: @proxy_id,
            headers: @headers,
            cookie_jar: @cookie_jar,
            disable_proxy: @disable_proxy,
            updated_at: @updated_at,
            created_at: @created_at,
          }
          document[:_id] = id if id.present?
          document
        end

        # Persist the agent and remember the identifier returned by the store.
        def save
          @id = Scruber::Helpers::FetcherAgentAdapters::Mongo.store(self)
        end

        # Remove the agent's document from the collection.
        def delete
          Scruber::Helpers::FetcherAgentAdapters::Mongo.delete(self)
        end

        class << self
          # @return [Mongo, nil] the agent stored under +id+, or nil
          def find(id)
            document = mongo_collection.find({_id: id}).first
            return nil if document.nil?

            new(document)
          end

          # Collection object that holds fetcher agents.
          def mongo_collection
            Scruber::Mongo.client[agents_collection_name]
          end

          # "<collections_prefix>_fetcher_agents".
          # NOTE(review): unlike the other collection-name builders in this
          # gem, a blank prefix is not filtered out here, so the name then
          # starts with "_" — confirm whether that is intended.
          def agents_collection_name
            prefix = Scruber::Mongo.configuration.options['collections_prefix']
            [prefix, 'fetcher_agents'].join('_')
          end

          # Insert a new agent or upsert an existing one.
          #
          # @param fetcher_agent [Mongo] agent to persist
          # @param options [Hash] merged over the default upsert options
          # @return the `_id` of the stored document
          def store(fetcher_agent, options={})
            if fetcher_agent.id.blank?
              return mongo_collection.insert_one(fetcher_agent.attrs).inserted_id
            end

            filter = {"_id" => fetcher_agent.id }
            changes = {'$set' => fetcher_agent.attrs }
            update_options = {return_document: :after, upsert: true}.merge(options)
            mongo_collection.find_one_and_update(filter, changes, update_options)[:_id]
          end

          # Delete the document whose `_id` matches the agent's id.
          def delete(fetcher_agent)
            mongo_collection.find({_id: fetcher_agent.id}).delete_one
          end
        end

      end
    end
  end
end

Scruber::Helpers::FetcherAgent.add_adapter(:mongo, Scruber::Helpers::FetcherAgentAdapters::Mongo)
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'scruber'
|
3
|
+
require 'mongo'
|
4
|
+
require "scruber/mongo/version"
|
5
|
+
require "scruber/mongo/configuration"
|
6
|
+
require "scruber/mongo/factory"
|
7
|
+
require "scruber/mongo/cli/generators"
|
8
|
+
|
9
|
+
require "scruber/queue_adapters/mongo"
|
10
|
+
require "scruber/core/extensions/mongo_output"
|
11
|
+
require "scruber/helpers/fetcher_agent_adapters/mongo"
|
12
|
+
|
13
|
+
module Scruber
  # Entry point of the MongoDB integration: holds the shared configuration
  # and a per-name cache of client objects.
  module Mongo
    class << self
      attr_writer :configuration, :clients

      # Lazily created configuration object.
      def configuration
        @configuration ||= ::Scruber::Mongo::Configuration.new
      end

      # Yields the configuration object to the given block.
      def configure(&block)
        yield configuration
      end

      # Returns the client registered under +client_name+, asking the
      # factory to create (and caching) it on first access.
      def client(client_name=:default)
        cache = (@clients ||= {})
        cache[client_name] ||= Scruber::Mongo::Factory.create_client(client_name)
      end

    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require "thor"
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
module Scruber
  module CLI
    class Generators < Thor

      # `mongo:install` generator: copies the Mongo config template into the
      # project and switches the application's adapters to :mongo.
      # The public methods below are kept in their original definition order.
      class MongoInstall < Thor::Group
        include Thor::Actions

        # Directory that holds the generator templates.
        def self.source_root
          File.dirname(__FILE__) + '/templates'
        end

        # Abort unless APP_PATH is defined.
        def check_for_project
          raise ::Thor::Error, "ERROR: Scruber project not found." unless defined?(APP_PATH)
        end

        # Render the mongo.tt template to config/mongo.yml (path resolved
        # relative to APP_PATH).
        def create_files
          template 'mongo.tt', File.expand_path('../../config/mongo.yml', APP_PATH)
        end

        # Point both adapter settings in config/application.rb at :mongo.
        def change_config
          application_rb = File.expand_path('../../config/application.rb', APP_PATH)
          gsub_file application_rb, /config\.fetcher_agent_adapter\s*=\s*\:(\w+)/, 'config.fetcher_agent_adapter = :mongo'
          # The dot is escaped so only a literal "config.queue_adapter"
          # matches, consistent with the fetcher_agent_adapter pattern above.
          gsub_file application_rb, /config\.queue_adapter\s*=\s*\:(\w+)/, 'config.queue_adapter = :mongo'
        end
      end

      register MongoInstall, 'mongo:install', 'mongo:install', 'Install mongo'
    end
  end
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
# This file is adapted from the Mongoid configuration template.
|
2
|
+
# Configure available database clients. (required)
|
3
|
+
clients:
|
4
|
+
# Defines the default client. (required)
|
5
|
+
default:
|
6
|
+
# Defines the name of the default database that Scruber can connect to.
|
7
|
+
# (required).
|
8
|
+
database: scruber
|
9
|
+
# Provides the hosts the default client can connect to. Must be an array
|
10
|
+
# of host:port pairs. (required)
|
11
|
+
hosts:
|
12
|
+
- localhost:27017
|
13
|
+
options:
|
14
|
+
# Change the default write concern. (default = { w: 1 })
|
15
|
+
# write:
|
16
|
+
# w: 1
|
17
|
+
|
18
|
+
# Change the default read preference. Valid options for mode are: :secondary,
|
19
|
+
# :secondary_preferred, :primary, :primary_preferred, :nearest
|
20
|
+
# (default: primary)
|
21
|
+
# read:
|
22
|
+
# mode: :secondary_preferred
|
23
|
+
# tag_sets:
|
24
|
+
# - use: web
|
25
|
+
|
26
|
+
# The name of the user for authentication.
|
27
|
+
# user: 'user'
|
28
|
+
|
29
|
+
# The password of the user for authentication.
|
30
|
+
# password: 'password'
|
31
|
+
|
32
|
+
# The user's database roles.
|
33
|
+
# roles:
|
34
|
+
# - 'dbOwner'
|
35
|
+
|
36
|
+
# Change the default authentication mechanism. Valid options are: :scram,
|
37
|
+
# :mongodb_cr, :mongodb_x509, and :plain. Note that all authentication
|
38
|
+
# mechanisms require username and password, with the exception of :mongodb_x509.
|
39
|
+
# The default on MongoDB 3.0 is :scram; the default on 2.4 and 2.6 is :plain.
|
40
|
+
# auth_mech: :scram
|
41
|
+
|
42
|
+
# The database or source to authenticate the user against.
|
43
|
+
# (default: the database specified above or admin)
|
44
|
+
# auth_source: admin
|
45
|
+
|
46
|
+
# Force the driver cluster to behave in a certain manner instead of auto-
|
47
|
+
# discovering. Can be one of: :direct, :replica_set, :sharded. Set to :direct
|
48
|
+
# when connecting to hidden members of a replica set.
|
49
|
+
# connect: :direct
|
50
|
+
|
51
|
+
# Changes the default time in seconds the server monitors refresh their status
|
52
|
+
# via ismaster commands. (default: 10)
|
53
|
+
# heartbeat_frequency: 10
|
54
|
+
|
55
|
+
# The time in seconds for selecting servers for a near read preference. (default: 0.015)
|
56
|
+
# local_threshold: 0.015
|
57
|
+
|
58
|
+
# The timeout in seconds for selecting a server for an operation. (default: 30)
|
59
|
+
# server_selection_timeout: 30
|
60
|
+
|
61
|
+
# The maximum number of connections in the connection pool. (default: 5)
|
62
|
+
# max_pool_size: 5
|
63
|
+
|
64
|
+
# The minimum number of connections in the connection pool. (default: 1)
|
65
|
+
# min_pool_size: 1
|
66
|
+
|
67
|
+
# The time to wait, in seconds, in the connection pool for a connection
|
68
|
+
# to be checked in before timing out. (default: 5)
|
69
|
+
# wait_queue_timeout: 5
|
70
|
+
|
71
|
+
# The time to wait to establish a connection before timing out, in seconds.
|
72
|
+
# (default: 5)
|
73
|
+
# connect_timeout: 5
|
74
|
+
|
75
|
+
# The timeout to wait to execute operations on a socket before raising an error.
|
76
|
+
# (default: 5)
|
77
|
+
# socket_timeout: 5
|
78
|
+
|
79
|
+
# The name of the replica set to connect to. Servers provided as seeds that do
|
80
|
+
# not belong to this replica set will be ignored.
|
81
|
+
# replica_set: name
|
82
|
+
|
83
|
+
# Whether to connect to the servers via ssl. (default: false)
|
84
|
+
# ssl: true
|
85
|
+
|
86
|
+
# The certificate file used to identify the connection against MongoDB.
|
87
|
+
# ssl_cert: /path/to/my.cert
|
88
|
+
|
89
|
+
# The private keyfile used to identify the connection against MongoDB.
|
90
|
+
# Note that even if the key is stored in the same file as the certificate,
|
91
|
+
# both need to be explicitly specified.
|
92
|
+
# ssl_key: /path/to/my.key
|
93
|
+
|
94
|
+
# A passphrase for the private key.
|
95
|
+
# ssl_key_pass_phrase: password
|
96
|
+
|
97
|
+
# Whether or not to do peer certification validation. (default: true)
|
98
|
+
# ssl_verify: true
|
99
|
+
|
100
|
+
# The file containing a set of concatenated certification authority certifications
|
101
|
+
# used to validate certs passed from the other end of the connection.
|
102
|
+
# ssl_ca_cert: /path/to/ca.cert
|
103
|
+
|
104
|
+
options:
|
105
|
+
# Prefix for collections that will be created
|
106
|
+
collections_prefix: 'scruber'
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Scruber
  module Mongo
    # Holds the MongoDB client definitions and the gem-level options.
    class Configuration
      attr_accessor :clients, :options

      # Starts with no clients and no options configured.
      def initialize
        @clients = {}
        @options = {}
      end

      # Load client definitions and options from a YAML file.
      #
      # The top-level 'clients' and 'options' keys are read. A missing key,
      # or an empty file, falls back to an empty Hash, so that later
      # `configured?` and `options[...]` lookups do not hit nil.
      #
      # @param path [String] path to the YAML configuration file
      def load!(path)
        # YAML.load_file yields a falsy value for an empty document.
        config = (YAML.load_file(path) || {}).with_indifferent_access
        @clients = config['clients'] || {}
        @options = config['options'] || {}
      end

      # @param client_name [Symbol, String] defaults to :default
      # @return [Boolean] whether a client with that name is defined
      def configured?(client_name=:default)
        @clients.key?(client_name)
      end

    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Scruber
  module Mongo
    # Builds ::Mongo::Client instances from the loaded configuration.
    module Factory
      extend self

      # Create a client for the named configuration entry.
      #
      # When the entry has a :uri, the client is built from it; otherwise it
      # is built from :hosts with :database merged into the options.
      #
      # @param client_name [Symbol, String] defaults to :default
      # @raise [Scruber::ArgumentError] when no such client is configured
      def create_client(client_name=:default)
        settings = Scruber::Mongo.configuration
        unless settings.configured?(client_name)
          raise Scruber::ArgumentError.new("Not configured")
        end

        client_config = settings.clients[client_name]
        driver_options = options(client_config)
        uri = client_config[:uri]
        return ::Mongo::Client.new(uri, driver_options) if uri

        ::Mongo::Client.new(
          client_config[:hosts],
          driver_options.merge(database: client_config[:database])
        )
      end

      # Extract the :options sub-hash of a client entry (empty when absent),
      # drop a :hosts key and return it as a Hash with symbol keys. The entry
      # is duplicated first so the caller's hash is left untouched.
      def options(configuration)
        extracted = configuration.dup.delete(:options) || {}
        extracted.reject { |key, _value| key == :hosts }.to_hash.symbolize_keys!
      end
    end
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
module Scruber
  module QueueAdapters
    # Queue adapter that keeps pages in a MongoDB collection.
    class Mongo < AbstractAdapter
      attr_reader :error_pages

      # A queued page backed by a MongoDB document.
      class Page < Scruber::QueueAdapters::AbstractAdapter::Page
        # Document identifier: `_id` from the options, falling back to `id`.
        def id
          primary = @options[:_id]
          primary || @options[:id]
        end

        # Insert the page when it has no id yet, otherwise upsert it by `_id`.
        #
        # @param options [Hash] merged over the default upsert options
        def save(options={})
          return @queue.collection.insert_one(attrs) if id.blank?

          filter = {"_id" => self.id }
          changes = {'$set' => attrs }
          defaults = {return_document: :before, upsert: true, projection: {_id: 1}}
          @queue.collection.find_one_and_update(filter, changes, defaults.merge(options))
        end

        # Attributes to persist: the page options (minus id/_id), the `_id`
        # when present, and every instance variable except @options, @queue
        # and those whose name contains "@_".
        def attrs
          document = @options.with_indifferent_access.except('id', '_id')
          document = document.merge(id.present? ? {_id: id} : {})
          persisted_ivars = instance_variables.reject { |ivar| ivar.to_s =~ /\@_/ } - [:@options, :@queue]
          ivar_values = persisted_ivars.each_with_object({}) do |ivar, acc|
            # Strip the leading "@" to get the attribute name.
            acc[ivar[1..-1]] = instance_variable_get(ivar)
          end
          document.merge(ivar_values.with_indifferent_access)
        end

        # Remove the page's document; does nothing when the page has no id.
        def delete
          return unless self.id.present?

          @queue.collection.find({"_id" => self.id }).delete_one
        end
      end

      # def initialize(options={})
      #   super(options)
      # end

      # Add a page to the queue.
      #
      # @param url_or_page [Page, Object] an existing Page (re-bound to this
      #   queue and saved with +options+) or a url used to build a new Page
      # @param options [Hash]
      def push(url_or_page, options={})
        unless url_or_page.is_a?(Page)
          return Page.new(self, url_or_page, options).save
        end

        url_or_page.queue = self
        url_or_page.save(options)
      end
      alias_method :add, :push

      # Number of documents in the pages collection.
      def queue_size
        collection.count
      end

      # @return [Page, nil] the page stored under +id+
      def find(id)
        build_pages collection.find({_id: id}).first
      end

      # Pages with fetched_at > 0 and processed_at == 0.
      #
      # @param count [Integer, nil] nil returns a single Page (or nil);
      #   otherwise an Array of at most +count+ pages
      def fetch_downloaded(count=nil)
        matching = collection.find({fetched_at: {"$gt" => 0}, processed_at: 0})
        build_pages(count.nil? ? matching.first : matching.limit(count).to_a)
      end

      # Pages with fetched_at == 0 whose retry_at is not in the future.
      #
      # @param count [Integer, nil] nil returns a single Page (or nil);
      #   otherwise an Array of at most +count+ pages
      def fetch_pending(count=nil)
        matching = collection.find({fetched_at: 0, retry_at: {"$lte" => Time.now.to_i}})
        build_pages(count.nil? ? matching.first : matching.limit(count).to_a)
      end

      # True when there is at least one pending or downloaded page.
      def has_work?
        return true if fetch_pending.present?

        fetch_downloaded.present?
      end

      # Collection object that stores this queue's pages.
      def collection
        Scruber::Mongo.client[pages_collection_name]
      end

      private

        # Wrap raw document(s) into Page objects: nil stays nil, an Array is
        # mapped element-wise, anything else is treated as one document.
        def build_pages(pages)
          case pages
          when nil
            nil
          when Array
            pages.map { |doc| Page.new(self, doc['url'], doc.with_indifferent_access ) }
          else
            Page.new(self, pages['url'], pages.with_indifferent_access )
          end
        end

        # "<prefix>_<scraper_name>_pages", skipping blank parts; memoized.
        def pages_collection_name
          @_pages_collection_name ||= begin
            parts = [Scruber::Mongo.configuration.options['collections_prefix'], @options[:scraper_name], 'pages']
            parts.select(&:present?).map(&:to_s).join('_')
          end
        end

    end
  end
end

Scruber::Queue.add_adapter(:mongo, Scruber::QueueAdapters::Mongo)
|
@@ -0,0 +1,40 @@
|
|
1
|
+
|
2
|
+
# Make the gem's lib directory loadable so the version constant can be read.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "scruber/mongo/version"

Gem::Specification.new do |spec|
  spec.name          = "scruber-mongo"
  spec.version       = Scruber::Mongo::VERSION
  spec.authors       = ["Ivan Goncharov"]
  spec.email         = ["revis0r.mob@gmail.com"]

  spec.summary       = "Mongo support for Scruber"
  spec.description   = "Mongo support for Scruber"
  spec.homepage      = "https://github.com/scruber/scruber-mongo"
  spec.license       = "MIT"

  # Restrict `gem push` to a single host via the 'allowed_push_host' metadata
  # key; this requires a RubyGems version that supports spec metadata.
  unless spec.respond_to?(:metadata)
    raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
  end
  spec.metadata["allowed_push_host"] = "https://rubygems.org"

  # Package every git-tracked file except tests, specs and features.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files         = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependencies.
  spec.add_dependency "scruber", "~> 0.1.3"
  spec.add_dependency "mongo", "~> 2.4"

  # Development dependencies, declared in the original order.
  {
    "bundler"          => "~> 1.16",
    "rake"             => "~> 10.0",
    "rspec"            => "~> 3.0",
    "database_cleaner" => "~> 1.6.0",
    "webmock"          => "3.0.1",
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
|
metadata
ADDED
@@ -0,0 +1,163 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scruber-mongo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ivan Goncharov
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-03-17 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: scruber
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.1.3
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.1.3
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: mongo
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.4'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.4'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.16'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.16'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: database_cleaner
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 1.6.0
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 1.6.0
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: webmock
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - '='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 3.0.1
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - '='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 3.0.1
|
111
|
+
description: Mongo support for Scruber
|
112
|
+
email:
|
113
|
+
- revis0r.mob@gmail.com
|
114
|
+
executables: []
|
115
|
+
extensions: []
|
116
|
+
extra_rdoc_files: []
|
117
|
+
files:
|
118
|
+
- ".gitignore"
|
119
|
+
- ".rspec"
|
120
|
+
- ".travis.yml"
|
121
|
+
- Gemfile
|
122
|
+
- Gemfile.lock
|
123
|
+
- LICENSE.txt
|
124
|
+
- README.md
|
125
|
+
- Rakefile
|
126
|
+
- bin/console
|
127
|
+
- bin/setup
|
128
|
+
- lib/scruber/core/extensions/mongo_output.rb
|
129
|
+
- lib/scruber/helpers/fetcher_agent_adapters/mongo.rb
|
130
|
+
- lib/scruber/mongo.rb
|
131
|
+
- lib/scruber/mongo/cli/generators.rb
|
132
|
+
- lib/scruber/mongo/cli/templates/mongo.tt
|
133
|
+
- lib/scruber/mongo/configuration.rb
|
134
|
+
- lib/scruber/mongo/factory.rb
|
135
|
+
- lib/scruber/mongo/version.rb
|
136
|
+
- lib/scruber/queue_adapters/mongo.rb
|
137
|
+
- scruber-mongo.gemspec
|
138
|
+
homepage: https://github.com/scruber/scruber-mongo
|
139
|
+
licenses:
|
140
|
+
- MIT
|
141
|
+
metadata:
|
142
|
+
allowed_push_host: https://rubygems.org
|
143
|
+
post_install_message:
|
144
|
+
rdoc_options: []
|
145
|
+
require_paths:
|
146
|
+
- lib
|
147
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - ">="
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0'
|
152
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
153
|
+
requirements:
|
154
|
+
- - ">="
|
155
|
+
- !ruby/object:Gem::Version
|
156
|
+
version: '0'
|
157
|
+
requirements: []
|
158
|
+
rubyforge_project:
|
159
|
+
rubygems_version: 2.6.14
|
160
|
+
signing_key:
|
161
|
+
specification_version: 4
|
162
|
+
summary: Mongo support for Scruber
|
163
|
+
test_files: []
|