scruber-mongo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: bcedbbaa8337e7002b6bbcfc8f5c87394fd534d3
4
+ data.tar.gz: 7e95834b4b2b1a39efed8fc984401db78ef26bbb
5
+ SHA512:
6
+ metadata.gz: e3ed0f1aa68d4a720527fc35a6e444f77da821ab234f088bb16aaad1af66c6c8543a1bde1d7bc08541d176d2f7ee1b3fdee8a51ca04ee99653853d5622b4ca8b
7
+ data.tar.gz: 6372a6bb978b68e8bc8d748a22f2ac20dbdcd5c436ddd4c5b2f65a5f8c9814dd9fd9d6f6072f24bfa65d06ad7d64939655b5aac80dee68302aee8c283d9f3347
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/config.yml
9
+ /tmp/
10
+
11
+ # rspec failure tracking
12
+ .rspec_status
13
+ .ruby-version
14
+ .ruby-gemset
15
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.1
5
+ before_install: gem install bundler -v 1.16.1
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in scruber-mongo.gemspec
6
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,90 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ scruber-mongo (0.1.0)
5
+ mongo (~> 2.4)
6
+ scruber (~> 0.1.3)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ activesupport (5.1.5)
12
+ concurrent-ruby (~> 1.0, >= 1.0.2)
13
+ i18n (~> 0.7)
14
+ minitest (~> 5.1)
15
+ tzinfo (~> 1.1)
16
+ addressable (2.5.2)
17
+ public_suffix (>= 2.0.2, < 4.0)
18
+ bson (4.3.0)
19
+ concurrent-ruby (1.0.5)
20
+ crack (0.4.3)
21
+ safe_yaml (~> 1.0.0)
22
+ database_cleaner (1.6.2)
23
+ diff-lcs (1.3)
24
+ domain_name (0.5.20170404)
25
+ unf (>= 0.0.5, < 1.0.0)
26
+ ethon (0.11.0)
27
+ ffi (>= 1.3.0)
28
+ ffi (1.9.23)
29
+ hashdiff (0.3.7)
30
+ http-cookie (1.0.3)
31
+ domain_name (~> 0.5)
32
+ i18n (0.9.5)
33
+ concurrent-ruby (~> 1.0)
34
+ mini_portile2 (2.3.0)
35
+ minitest (5.11.3)
36
+ mongo (2.5.1)
37
+ bson (>= 4.3.0, < 5.0.0)
38
+ nokogiri (1.8.2)
39
+ mini_portile2 (~> 2.3.0)
40
+ pickup (0.0.11)
41
+ public_suffix (3.0.2)
42
+ rake (10.5.0)
43
+ rspec (3.7.0)
44
+ rspec-core (~> 3.7.0)
45
+ rspec-expectations (~> 3.7.0)
46
+ rspec-mocks (~> 3.7.0)
47
+ rspec-core (3.7.1)
48
+ rspec-support (~> 3.7.0)
49
+ rspec-expectations (3.7.0)
50
+ diff-lcs (>= 1.2.0, < 2.0)
51
+ rspec-support (~> 3.7.0)
52
+ rspec-mocks (3.7.0)
53
+ diff-lcs (>= 1.2.0, < 2.0)
54
+ rspec-support (~> 3.7.0)
55
+ rspec-support (3.7.1)
56
+ safe_yaml (1.0.4)
57
+ scruber (0.1.3)
58
+ activesupport (= 5.1.5)
59
+ http-cookie (= 1.0.3)
60
+ nokogiri (= 1.8.2)
61
+ pickup (= 0.0.11)
62
+ thor (= 0.20.0)
63
+ typhoeus (= 1.1.2)
64
+ thor (0.20.0)
65
+ thread_safe (0.3.6)
66
+ typhoeus (1.1.2)
67
+ ethon (>= 0.9.0)
68
+ tzinfo (1.2.5)
69
+ thread_safe (~> 0.1)
70
+ unf (0.1.4)
71
+ unf_ext
72
+ unf_ext (0.0.7.5)
73
+ webmock (3.0.1)
74
+ addressable (>= 2.3.6)
75
+ crack (>= 0.3.2)
76
+ hashdiff
77
+
78
+ PLATFORMS
79
+ ruby
80
+
81
+ DEPENDENCIES
82
+ bundler (~> 1.16)
83
+ database_cleaner (~> 1.6.0)
84
+ rake (~> 10.0)
85
+ rspec (~> 3.0)
86
+ scruber-mongo!
87
+ webmock (= 3.0.1)
88
+
89
+ BUNDLED WITH
90
+ 1.16.1
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 Ivan Goncharov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # Scruber::Mongo
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/scruber/mongo`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'scruber-mongo'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install scruber-mongo
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/scruber/scruber-mongo.
36
+
37
+ ## License
38
+
39
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "scruber/mongo"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,94 @@
1
+ module Scruber
2
+ module Core
3
+ module Extensions
4
+ class MongoOutput < Base
5
+ module CoreMethods
6
+
7
+ def mongo_out(fields, options={})
8
+ Scruber::Core::Extensions::MongoOutput.mongo_out self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name, fields, options
9
+ end
10
+
11
+ def mongo_find(id)
12
+ Scruber::Core::Extensions::MongoOutput.mongo_find self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name, id
13
+ end
14
+
15
+ def mongo_collection
16
+ Scruber::Core::Extensions::MongoOutput.mongo_collection self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name
17
+ end
18
+
19
+ def self.included(base)
20
+ Scruber::Core::Crawler.register_method_missing /\Amongo_out_(\w+)\Z/ do |meth, scan_results, args|
21
+ suffix = scan_results.first.first.to_sym
22
+ fields, options = args.first
23
+ fields = {} if fields.nil?
24
+ Scruber::Core::Crawler.class_eval do
25
+ define_method "mongo_out_#{suffix}".to_sym do |fields, opts={}|
26
+ Scruber::Core::Extensions::MongoOutput.mongo_out(self.scraper_name, suffix, fields, opts)
27
+ end
28
+ end
29
+ Scruber::Core::Extensions::MongoOutput.mongo_out(self.scraper_name, suffix, fields, options)
30
+ end
31
+ Scruber::Core::Crawler.register_method_missing /\Amongo_find_(\w+)\Z/ do |meth, scan_results, args|
32
+ suffix = scan_results.first.first.to_sym
33
+ id = args.first
34
+ Scruber::Core::Crawler.class_eval do
35
+ define_method "mongo_find_#{suffix}".to_sym do |id|
36
+ Scruber::Core::Extensions::MongoOutput.mongo_find(self.scraper_name, suffix, id)
37
+ end
38
+ end
39
+ Scruber::Core::Extensions::MongoOutput.mongo_find(self.scraper_name, suffix, id)
40
+ end
41
+ Scruber::Core::Crawler.register_method_missing /\Amongo_(\w+)_collection\Z/ do |meth, scan_results, args|
42
+ suffix = scan_results.first.first.to_sym
43
+ Scruber::Core::Crawler.class_eval do
44
+ define_method "mongo_#{suffix}_collection".to_sym do
45
+ Scruber::Core::Extensions::MongoOutput.mongo_collection(self.scraper_name, suffix)
46
+ end
47
+ end
48
+ Scruber::Core::Extensions::MongoOutput.mongo_collection(self.scraper_name, suffix)
49
+ end
50
+ end
51
+ end
52
+
53
+ class << self
54
+ attr_writer :default_suffix_name
55
+
56
+ def default_suffix_name
57
+ @default_suffix_name ||= 'records'
58
+ end
59
+
60
+ def mongo_out(scraper_name, suffix, fields, options={})
61
+ fields = fields.with_indifferent_access
62
+ if fields[:_id].blank?
63
+ Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].insert_one(fields)
64
+ else
65
+ Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find_one_and_update(
66
+ {"_id" => fields[:_id] },
67
+ {'$set' => fields },
68
+ {return_document: :before, upsert: true}.merge(options)
69
+ )
70
+ end
71
+ end
72
+
73
+ def mongo_find(scraper_name, suffix, id)
74
+ if id.is_a?(Hash)
75
+ Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find(id)
76
+ else
77
+ Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find({_id: id}).first
78
+ end
79
+ end
80
+
81
+ def mongo_collection(scraper_name, suffix)
82
+ Scruber::Mongo.client[out_collection_name(scraper_name, suffix)]
83
+ end
84
+
85
+ def out_collection_name(scraper_name, suffix)
86
+ [Scruber::Mongo.configuration.options['collections_prefix'], scraper_name, suffix].select(&:present?).map(&:to_s).join('_')
87
+ end
88
+
89
+ end
90
+
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,68 @@
1
+ module Scruber
2
+ module Helpers
3
+ module FetcherAgentAdapters
4
+ class Mongo < AbstractAdapter
5
+ def initialize(options={})
6
+ options = options.with_indifferent_access
7
+ super(options)
8
+ @id = options.fetch(:_id){ options.fetch(:id){ nil } }
9
+ end
10
+
11
+ def attrs
12
+ serialize_cookies
13
+ {
14
+ user_agent: @user_agent,
15
+ proxy_id: @proxy_id,
16
+ headers: @headers,
17
+ cookie_jar: @cookie_jar,
18
+ disable_proxy: @disable_proxy,
19
+ updated_at: @updated_at,
20
+ created_at: @created_at,
21
+ }.merge((id.present? ? {_id: id} : {}))
22
+ end
23
+
24
+ def save
25
+ @id = Scruber::Helpers::FetcherAgentAdapters::Mongo.store(self)
26
+ end
27
+
28
+ def delete
29
+ Scruber::Helpers::FetcherAgentAdapters::Mongo.delete(self)
30
+ end
31
+
32
+ class << self
33
+ def find(id)
34
+ obj = mongo_collection.find({_id: id}).first
35
+ obj.nil? ? nil : new(obj)
36
+ end
37
+
38
+ def mongo_collection
39
+ Scruber::Mongo.client[agents_collection_name]
40
+ end
41
+
42
+ def agents_collection_name
43
+ [Scruber::Mongo.configuration.options['collections_prefix'], 'fetcher_agents'].join('_')
44
+ end
45
+
46
+ def store(fetcher_agent, options={})
47
+ if fetcher_agent.id.blank?
48
+ mongo_collection.insert_one(fetcher_agent.attrs).inserted_id
49
+ else
50
+ mongo_collection.find_one_and_update(
51
+ {"_id" => fetcher_agent.id },
52
+ {'$set' => fetcher_agent.attrs },
53
+ {return_document: :after, upsert: true}.merge(options)
54
+ )[:_id]
55
+ end
56
+ end
57
+
58
+ def delete(fetcher_agent)
59
+ mongo_collection.find({_id: fetcher_agent.id}).delete_one
60
+ end
61
+ end
62
+
63
+ end
64
+ end
65
+ end
66
+ end
67
+
68
+ Scruber::Helpers::FetcherAgent.add_adapter(:mongo, Scruber::Helpers::FetcherAgentAdapters::Mongo)
@@ -0,0 +1,34 @@
1
+ require 'yaml'
2
+ require 'scruber'
3
+ require 'mongo'
4
+ require "scruber/mongo/version"
5
+ require "scruber/mongo/configuration"
6
+ require "scruber/mongo/factory"
7
+ require "scruber/mongo/cli/generators"
8
+
9
+ require "scruber/queue_adapters/mongo"
10
+ require "scruber/core/extensions/mongo_output"
11
+ require "scruber/helpers/fetcher_agent_adapters/mongo"
12
+
13
+ module Scruber
14
+ module Mongo
15
+ class << self
16
+ attr_writer :configuration
17
+ attr_writer :clients
18
+
19
+ def configuration
20
+ @configuration ||= ::Scruber::Mongo::Configuration.new
21
+ end
22
+
23
+ def configure(&block)
24
+ yield configuration
25
+ end
26
+
27
+ def client(client_name=:default)
28
+ @clients ||= {}
29
+ @clients[client_name] ||= Scruber::Mongo::Factory.create_client(client_name)
30
+ end
31
+
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,32 @@
1
+ require "thor"
2
+ require 'fileutils'
3
+
4
+ module Scruber
5
+ module CLI
6
+ class Generators < Thor
7
+
8
+ class MongoInstall < Thor::Group
9
+ include Thor::Actions
10
+
11
+ def self.source_root
12
+ File.dirname(__FILE__) + '/templates'
13
+ end
14
+
15
+ def check_for_project
16
+ raise ::Thor::Error, "ERROR: Scruber project not found." unless defined?(APP_PATH)
17
+ end
18
+
19
+ def create_files
20
+ template 'mongo.tt', File.expand_path('../../config/mongo.yml', APP_PATH)
21
+ end
22
+
23
+ def change_config
24
+ gsub_file File.expand_path('../../config/application.rb', APP_PATH), /config\.fetcher_agent_adapter\s*=\s*\:(\w+)/, 'config.fetcher_agent_adapter = :mongo'
25
+ gsub_file File.expand_path('../../config/application.rb', APP_PATH), /config.queue_adapter\s*=\s*\:(\w+)/, 'config.queue_adapter = :mongo'
26
+ end
27
+ end
28
+
29
+ register MongoInstall, 'mongo:install', 'mongo:install', 'Install mongo'
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,106 @@
1
+ # This file is based on the Mongoid configuration template.
2
+ # Configure available database clients. (required)
3
+ clients:
4
+ # Defines the default client. (required)
5
+ default:
6
+ # Defines the name of the default database that Scruber can connect to.
7
+ # (required).
8
+ database: scruber
9
+ # Provides the hosts the default client can connect to. Must be an array
10
+ # of host:port pairs. (required)
11
+ hosts:
12
+ - localhost:27017
13
+ options:
14
+ # Change the default write concern. (default = { w: 1 })
15
+ # write:
16
+ # w: 1
17
+
18
+ # Change the default read preference. Valid options for mode are: :secondary,
19
+ # :secondary_preferred, :primary, :primary_preferred, :nearest
20
+ # (default: primary)
21
+ # read:
22
+ # mode: :secondary_preferred
23
+ # tag_sets:
24
+ # - use: web
25
+
26
+ # The name of the user for authentication.
27
+ # user: 'user'
28
+
29
+ # The password of the user for authentication.
30
+ # password: 'password'
31
+
32
+ # The user's database roles.
33
+ # roles:
34
+ # - 'dbOwner'
35
+
36
+ # Change the default authentication mechanism. Valid options are: :scram,
37
+ # :mongodb_cr, :mongodb_x509, and :plain. Note that all authentication
38
+ # mechanisms require username and password, with the exception of :mongodb_x509.
39
+ # Default on MongoDB 3.0 is :scram, default on 2.4 and 2.6 is :plain.
40
+ # auth_mech: :scram
41
+
42
+ # The database or source to authenticate the user against.
43
+ # (default: the database specified above or admin)
44
+ # auth_source: admin
45
+
46
+ # Force the driver cluster to behave in a certain manner instead of auto-
47
+ # discovering. Can be one of: :direct, :replica_set, :sharded. Set to :direct
48
+ # when connecting to hidden members of a replica set.
49
+ # connect: :direct
50
+
51
+ # Changes the default time in seconds the server monitors refresh their status
52
+ # via ismaster commands. (default: 10)
53
+ # heartbeat_frequency: 10
54
+
55
+ # The time in seconds for selecting servers for a near read preference. (default: 0.015)
56
+ # local_threshold: 0.015
57
+
58
+ # The timeout in seconds for selecting a server for an operation. (default: 30)
59
+ # server_selection_timeout: 30
60
+
61
+ # The maximum number of connections in the connection pool. (default: 5)
62
+ # max_pool_size: 5
63
+
64
+ # The minimum number of connections in the connection pool. (default: 1)
65
+ # min_pool_size: 1
66
+
67
+ # The time to wait, in seconds, in the connection pool for a connection
68
+ # to be checked in before timing out. (default: 5)
69
+ # wait_queue_timeout: 5
70
+
71
+ # The time to wait to establish a connection before timing out, in seconds.
72
+ # (default: 5)
73
+ # connect_timeout: 5
74
+
75
+ # The timeout to wait to execute operations on a socket before raising an error.
76
+ # (default: 5)
77
+ # socket_timeout: 5
78
+
79
+ # The name of the replica set to connect to. Servers provided as seeds that do
80
+ # not belong to this replica set will be ignored.
81
+ # replica_set: name
82
+
83
+ # Whether to connect to the servers via ssl. (default: false)
84
+ # ssl: true
85
+
86
+ # The certificate file used to identify the connection against MongoDB.
87
+ # ssl_cert: /path/to/my.cert
88
+
89
+ # The private keyfile used to identify the connection against MongoDB.
90
+ # Note that even if the key is stored in the same file as the certificate,
91
+ # both need to be explicitly specified.
92
+ # ssl_key: /path/to/my.key
93
+
94
+ # A passphrase for the private key.
95
+ # ssl_key_pass_phrase: password
96
+
97
+ # Whether or not to do peer certificate validation. (default: true)
98
+ # ssl_verify: true
99
+
100
+ # The file containing a set of concatenated certificate authority certificates
101
+ # used to validate certs passed from the other end of the connection.
102
+ # ssl_ca_cert: /path/to/ca.cert
103
+
104
+ options:
105
+ # Prefix for collections that will be created
106
+ collections_prefix: 'scruber'
@@ -0,0 +1,23 @@
1
+ module Scruber
2
+ module Mongo
3
+ class Configuration
4
+ attr_accessor :clients, :options
5
+
6
+ def initialize
7
+ @clients = {}
8
+ @options = {}
9
+ end
10
+
11
+ def load!(path)
12
+ config = YAML.load_file(path).with_indifferent_access
13
+ @clients = config['clients']
14
+ @options = config['options']
15
+ end
16
+
17
+ def configured?(client_name=:default)
18
+ @clients.key?(client_name)
19
+ end
20
+
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,26 @@
1
+ module Scruber
2
+ module Mongo
3
+ module Factory
4
+ extend self
5
+
6
+ def create_client(client_name=:default)
7
+ raise Scruber::ArgumentError.new("Not configured") unless Scruber::Mongo.configuration.configured?(client_name)
8
+ configuration = Scruber::Mongo.configuration.clients[client_name]
9
+ if configuration[:uri]
10
+ ::Mongo::Client.new(configuration[:uri], options(configuration))
11
+ else
12
+ ::Mongo::Client.new(
13
+ configuration[:hosts],
14
+ options(configuration).merge(database: configuration[:database])
15
+ )
16
+ end
17
+ end
18
+
19
+ def options(configuration)
20
+ config = configuration.dup
21
+ options = config.delete(:options) || {}
22
+ options.reject{ |k, v| k == :hosts }.to_hash.symbolize_keys!
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,5 @@
1
+ module Scruber
2
+ module Mongo
3
+ VERSION = "0.1.0"
4
+ end
5
+ end
@@ -0,0 +1,98 @@
1
+ module Scruber
2
+ module QueueAdapters
3
+ class Mongo < AbstractAdapter
4
+ attr_reader :error_pages
5
+
6
+ class Page < Scruber::QueueAdapters::AbstractAdapter::Page
7
+ def id
8
+ @options[:_id] || @options[:id]
9
+ end
10
+
11
+ def save(options={})
12
+ if id.blank?
13
+ @queue.collection.insert_one(attrs)
14
+ else
15
+ @queue.collection.find_one_and_update(
16
+ {"_id" => self.id },
17
+ {'$set' => attrs },
18
+ {return_document: :before, upsert: true, projection: {_id: 1}}.merge(options)
19
+ )
20
+ end
21
+ end
22
+
23
+ def attrs
24
+ @options.with_indifferent_access.except('id', '_id').merge(id.present? ? {_id: id} : {}).merge (instance_variables.select{|ivar| !(ivar.to_s =~ /\@_/) }-[:@options, :@queue]).inject({}){|acc,ivar| acc[ivar[1..-1]] = instance_variable_get(ivar);acc }.with_indifferent_access
25
+ end
26
+
27
+ def delete
28
+ @queue.collection.find({"_id" => self.id }).delete_one if self.id.present?
29
+ end
30
+ end
31
+
32
+ # def initialize(options={})
33
+ # super(options)
34
+ # end
35
+
36
+ def push(url_or_page, options={})
37
+ if url_or_page.is_a?(Page)
38
+ url_or_page.queue = self
39
+ url_or_page.save(options)
40
+ else
41
+ Page.new(self, url_or_page, options).save
42
+ end
43
+ end
44
+ alias_method :add, :push
45
+
46
+ def queue_size
47
+ collection.count
48
+ end
49
+
50
+ def find(id)
51
+ build_pages collection.find({_id: id}).first
52
+ end
53
+
54
+ def fetch_downloaded(count=nil)
55
+ if count.nil?
56
+ build_pages collection.find({fetched_at: {"$gt" => 0}, processed_at: 0}).first
57
+ else
58
+ build_pages collection.find({fetched_at: {"$gt" => 0}, processed_at: 0}).limit(count).to_a
59
+ end
60
+ end
61
+
62
+ def fetch_pending(count=nil)
63
+ if count.nil?
64
+ build_pages collection.find({fetched_at: 0, retry_at: {"$lte" => Time.now.to_i}}).first
65
+ else
66
+ build_pages collection.find({fetched_at: 0, retry_at: {"$lte" => Time.now.to_i}}).limit(count).to_a
67
+ end
68
+ end
69
+
70
+ def has_work?
71
+ fetch_pending.present? || fetch_downloaded.present?
72
+ end
73
+
74
+ def collection
75
+ Scruber::Mongo.client[pages_collection_name]
76
+ end
77
+
78
+ private
79
+
80
+ def build_pages(pages)
81
+ if pages.nil?
82
+ nil
83
+ elsif pages.is_a?(Array)
84
+ pages.map{|p| Page.new(self, p['url'], p.with_indifferent_access )}
85
+ else
86
+ Page.new(self, pages['url'], pages.with_indifferent_access )
87
+ end
88
+ end
89
+
90
+ def pages_collection_name
91
+ @_pages_collection_name ||= [Scruber::Mongo.configuration.options['collections_prefix'], @options[:scraper_name], 'pages'].select(&:present?).map(&:to_s).join('_')
92
+ end
93
+
94
+ end
95
+ end
96
+ end
97
+
98
+ Scruber::Queue.add_adapter(:mongo, Scruber::QueueAdapters::Mongo)
@@ -0,0 +1,40 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "scruber/mongo/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "scruber-mongo"
8
+ spec.version = Scruber::Mongo::VERSION
9
+ spec.authors = ["Ivan Goncharov"]
10
+ spec.email = ["revis0r.mob@gmail.com"]
11
+
12
+ spec.summary = %q{Mongo support for Scruber}
13
+ spec.description = %q{Mongo support for Scruber}
14
+ spec.homepage = "https://github.com/scruber/scruber-mongo"
15
+ spec.license = "MIT"
16
+
17
+ # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
18
+ # to allow pushing to a single host or delete this section to allow pushing to any host.
19
+ if spec.respond_to?(:metadata)
20
+ spec.metadata["allowed_push_host"] = "https://rubygems.org"
21
+ else
22
+ raise "RubyGems 2.0 or newer is required to protect against " \
23
+ "public gem pushes."
24
+ end
25
+
26
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
27
+ f.match(%r{^(test|spec|features)/})
28
+ end
29
+ spec.bindir = "exe"
30
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
31
+ spec.require_paths = ["lib"]
32
+
33
+ spec.add_dependency "scruber", "~> 0.1.3"
34
+ spec.add_dependency "mongo", "~> 2.4"
35
+ spec.add_development_dependency "bundler", "~> 1.16"
36
+ spec.add_development_dependency "rake", "~> 10.0"
37
+ spec.add_development_dependency "rspec", "~> 3.0"
38
+ spec.add_development_dependency "database_cleaner", "~> 1.6.0"
39
+ spec.add_development_dependency "webmock", "3.0.1"
40
+ end
metadata ADDED
@@ -0,0 +1,163 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scruber-mongo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ivan Goncharov
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-03-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: scruber
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.1.3
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.1.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: mongo
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.4'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.4'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.16'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.16'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: database_cleaner
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 1.6.0
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 1.6.0
97
+ - !ruby/object:Gem::Dependency
98
+ name: webmock
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '='
102
+ - !ruby/object:Gem::Version
103
+ version: 3.0.1
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '='
109
+ - !ruby/object:Gem::Version
110
+ version: 3.0.1
111
+ description: Mongo support for Scruber
112
+ email:
113
+ - revis0r.mob@gmail.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".gitignore"
119
+ - ".rspec"
120
+ - ".travis.yml"
121
+ - Gemfile
122
+ - Gemfile.lock
123
+ - LICENSE.txt
124
+ - README.md
125
+ - Rakefile
126
+ - bin/console
127
+ - bin/setup
128
+ - lib/scruber/core/extensions/mongo_output.rb
129
+ - lib/scruber/helpers/fetcher_agent_adapters/mongo.rb
130
+ - lib/scruber/mongo.rb
131
+ - lib/scruber/mongo/cli/generators.rb
132
+ - lib/scruber/mongo/cli/templates/mongo.tt
133
+ - lib/scruber/mongo/configuration.rb
134
+ - lib/scruber/mongo/factory.rb
135
+ - lib/scruber/mongo/version.rb
136
+ - lib/scruber/queue_adapters/mongo.rb
137
+ - scruber-mongo.gemspec
138
+ homepage: https://github.com/scruber/scruber-mongo
139
+ licenses:
140
+ - MIT
141
+ metadata:
142
+ allowed_push_host: https://rubygems.org
143
+ post_install_message:
144
+ rdoc_options: []
145
+ require_paths:
146
+ - lib
147
+ required_ruby_version: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ required_rubygems_version: !ruby/object:Gem::Requirement
153
+ requirements:
154
+ - - ">="
155
+ - !ruby/object:Gem::Version
156
+ version: '0'
157
+ requirements: []
158
+ rubyforge_project:
159
+ rubygems_version: 2.6.14
160
+ signing_key:
161
+ specification_version: 4
162
+ summary: Mongo support for Scruber
163
+ test_files: []