scruber-mongo 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/Gemfile +1 -1
- data/Gemfile.lock +19 -13
- data/README.md +26 -14
- data/lib/scruber/core/extensions/mongo_output.rb +89 -3
- data/lib/scruber/mongo/cli/generators.rb +1 -0
- data/lib/scruber/mongo/cli/templates/mongo_initializer.tt +3 -0
- data/lib/scruber/mongo/version.rb +1 -1
- data/lib/scruber/queue_adapters/mongo.rb +126 -21
- data/scruber-mongo.gemspec +2 -2
- metadata +11 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5c298bffe030c11d719251f76cd10f7e12c18c39
+  data.tar.gz: 458c6f63660ef791c504480cd4b6b640b5cca9f9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: acfc91d30bf3f9fb48b344a3b9ad904c9f46eff2411675538fa46ab7de544ea6864ff20dec23b73afcaffb255e4f6c458925b4f36aaa9553c7f079aea2dc4da8
+  data.tar.gz: d4e10f9ffd106090beef149dd346785cc40ec6b8e3f2133a53a773f586f1f8d91b34c54ee373acdf9fd34dbff611d5e8e385eaf14cccf0ac489898b47ebd649f
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,16 +1,16 @@
 PATH
   remote: .
   specs:
-    scruber-mongo (0.1.0)
+    scruber-mongo (0.1.1)
       mongo (~> 2.4)
-      scruber (~> 0.1.
+      scruber (~> 0.1.6)

 GEM
   remote: https://rubygems.org/
   specs:
-    activesupport (5.
+    activesupport (5.2.0)
       concurrent-ruby (~> 1.0, >= 1.0.2)
-      i18n (
+      i18n (>= 0.7, < 2)
       minitest (~> 5.1)
       tzinfo (~> 1.1)
     addressable (2.5.2)
@@ -21,15 +21,16 @@ GEM
       safe_yaml (~> 1.0.0)
     database_cleaner (1.6.2)
     diff-lcs (1.3)
-    domain_name (0.5.
+    domain_name (0.5.20180417)
       unf (>= 0.0.5, < 1.0.0)
     ethon (0.11.0)
       ffi (>= 1.3.0)
     ffi (1.9.23)
     hashdiff (0.3.7)
+    hashie (3.5.7)
     http-cookie (1.0.3)
       domain_name (~> 0.5)
-    i18n (0.
+    i18n (1.0.1)
       concurrent-ruby (~> 1.0)
     mini_portile2 (2.3.0)
     minitest (5.11.3)
@@ -37,7 +38,10 @@ GEM
       bson (>= 4.3.0, < 5.0.0)
     nokogiri (1.8.2)
       mini_portile2 (~> 2.3.0)
+    paint (2.0.1)
     pickup (0.0.11)
+    powerbar (2.0.1)
+      hashie (>= 1.1.0)
     public_suffix (3.0.2)
     rake (10.5.0)
     rspec (3.7.0)
@@ -54,16 +58,18 @@ GEM
       rspec-support (~> 3.7.0)
     rspec-support (3.7.1)
     safe_yaml (1.0.4)
-    scruber (0.1.
-      activesupport (
+    scruber (0.1.6)
+      activesupport (~> 5.1, >= 5.1.5)
       http-cookie (= 1.0.3)
-      nokogiri (
-
+      nokogiri (~> 1.8, >= 1.8.2)
+      paint (~> 2.0, >= 2.0.1)
+      pickup (~> 0.0.11)
+      powerbar (~> 2.0, >= 2.0.1)
       thor (= 0.20.0)
-      typhoeus (
+      typhoeus (~> 1.1, >= 1.1.2)
     thor (0.20.0)
     thread_safe (0.3.6)
-    typhoeus (1.
+    typhoeus (1.3.0)
       ethon (>= 0.9.0)
     tzinfo (1.2.5)
       thread_safe (~> 0.1)
@@ -80,7 +86,7 @@ PLATFORMS

 DEPENDENCIES
   bundler (~> 1.16)
-  database_cleaner (~> 1.6.0)
+  database_cleaner (~> 1.6, >= 1.6.0)
   rake (~> 10.0)
   rspec (~> 3.0)
   scruber-mongo!
data/README.md
CHANGED
@@ -1,38 +1,50 @@
-# Scruber
+# Scruber-mongo

-
-
-TODO: Delete this and the text above, and describe your gem
+This gem provides Mongo support for Scruber

 ## Installation

-Add this line to your application's Gemfile:
+1. Add this line to your application's Gemfile:

 ```ruby
 gem 'scruber-mongo'
 ```

-And then execute:
+2. And then execute:

     $ bundle

-
+3. Install gem
+
+    $ scruber generate mongo:install

-
+This gem provides Queue driver, Output driver and FetcherAgent driver for mongo.

-##
+## Sample scraper

-
+```ruby
+Scruber.run do
+  get "http://example.abc/product"
+
+  parse :html do |page, doc|
+    id = mongo_out_product title: doc.at('title').text

-
+    get_reviews URI.join(page.url, doc.at('a.review_link').attr('href')).to_s, product_id: id
+  end

-
+  parse_reviews :html do |page,doc|
+    product = mongo_find_product page.options[:product_id]

-
+    product[:reviews] = doc.search('.review').map{|r| {author: r.at('.author').text, text: r.at('.text').text } }
+
+    mongo_out_product product
+  end
+end
+```

 ## Contributing

-Bug reports and pull requests are welcome on GitHub at https://github.com/
+Bug reports and pull requests are welcome on GitHub at https://github.com/scruber/scruber-mongo.

 ## License

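A note on the sample above: `mongo_out_product` and `mongo_find_product` are dynamically generated variants of the core `mongo_out`/`mongo_find` methods documented in `mongo_output.rb` below; the suffix in the method name selects the target collection. A minimal sketch of the mapping (the `:shop` scraper name and an empty `collections_prefix` are assumptions for illustration):

```ruby
# Assumes a scraper named :shop and the default (empty) collections_prefix.
Scruber.run :shop do
  get 'http://example.com/product'

  parse :html do |page, doc|
    mongo_out title: doc.at('title').text          # -> "shop_records" collection
    mongo_out_product title: doc.at('title').text  # -> "shop_product" collection
  end
end
```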
data/lib/scruber/core/extensions/mongo_output.rb
CHANGED
@@ -1,17 +1,67 @@
 module Scruber
   module Core
     module Extensions
+
+      #
+      # Extension for writing results to mongo collections.
+      # It registers methods for writing documents:
+      #   mongo_out({..})          # writing document to {prefix}_{scraper_name}_records
+      #   mongo_out_product({..})  # writing document to {prefix}_{scraper_name}_product
+      # Searching methods:
+      #   mongo_find({..})         # searching document in {prefix}_{scraper_name}_records
+      #   mongo_find_product({..}) # searching document in {prefix}_{scraper_name}_product
+      # Accessing to mongo collection:
+      #   mongo_collection({..})          # Direct access to {prefix}_{scraper_name}_records
+      #   mongo_product_collection({..})  # Direct access to {prefix}_{scraper_name}_product
+      #
+      # @example Writing products data and companies
+      #   Scruber.run :simple do
+      #     get_product 'http://example.com/product'
+      #     get_company 'http://example.com/product'
+      #
+      #     parse_product :html do |page,doc|
+      #       id = mongo_out_product {title: doc.at('h1').text, price: doc.at('.price').text }
+      #       record = mongo_find_product id
+      #       record[:description] = doc.at('.desc').text
+      #       mongo_out_product record
+      #       log "Count: #{mongo_product_collection.count}"
+      #     end
+      #
+      #     parse_company :html do |page,doc|
+      #       mongo_out_company {name: doc.at('h1').text, phone: doc.at('.phone').text }
+      #     end
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class MongoOutput < Base
         module CoreMethods

+          #
+          # Mongo out default method. By default it uses suffix *_records*
+          #
+          # @param fields [Hash] Fields to output
+          # @param options [Hash] Output options, see https://docs.mongodb.com/manual/reference/method/db.collection.findOneAndUpdate/
+          #
+          # @return [Object] id of writed record
           def mongo_out(fields, options={})
             Scruber::Core::Extensions::MongoOutput.mongo_out self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name, fields, options
           end

+          #
+          # Find mongo document by id
+          #
+          # @param id [Object] id of document
+          #
+          # @return [Hash] mongo document
           def mongo_find(id)
             Scruber::Core::Extensions::MongoOutput.mongo_find self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name, id
           end

+          #
+          # Direct access to mongo collection
+          #
+          # @return [Mongo::Collection] Mongo collection instance
           def mongo_collection
             Scruber::Core::Extensions::MongoOutput.mongo_collection self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name
           end
@@ -51,25 +101,47 @@ module Scruber
         end

         class << self
+          # Default mongo collection suffix name
           attr_writer :default_suffix_name

+          #
+          # Default mongo collection suffix name
+          #
+          # @return [String] Default mongo collection suffix name
           def default_suffix_name
             @default_suffix_name ||= 'records'
           end

+          #
+          # Writing results to mongo collection
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          # @param fields [Hash] Document to output
+          # @param options [Hash] Options for updating record (when *_id* not set), see https://docs.mongodb.com/manual/reference/method/db.collection.findOneAndUpdate/
+          #
+          # @return [type] [description]
           def mongo_out(scraper_name, suffix, fields, options={})
             fields = fields.with_indifferent_access
             if fields[:_id].blank?
-              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].insert_one(fields)
+              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].insert_one(fields).inserted_id
             else
               Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find_one_and_update(
                 {"_id" => fields[:_id] },
                 {'$set' => fields },
-                {return_document: :
-              )
+                {return_document: :after, upsert: true}.merge(options)
+              )[:_id]
             end
           end

+          #
+          # Searching document in mongo
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          # @param id [Object] id of document
+          #
+          # @return [Hash] document
           def mongo_find(scraper_name, suffix, id)
             if id.is_a?(Hash)
               Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find(id)
@@ -78,10 +150,24 @@ module Scruber
             end
           end

+          #
+          # Access to mongo collection
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          #
+          # @return [Mongo::Collection] instance of Mongo::Collection
           def mongo_collection(scraper_name, suffix)
             Scruber::Mongo.client[out_collection_name(scraper_name, suffix)]
           end

+          #
+          # Collection name builder
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          #
+          # @return [String] name of collection for given scraper_name and suffix
           def out_collection_name(scraper_name, suffix)
             [Scruber::Mongo.configuration.options['collections_prefix'], scraper_name, suffix].select(&:present?).map(&:to_s).join('_')
           end
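The substantive change here is the return value: `mongo_out` now returns the document id from both branches (`insert_one(...).inserted_id` for new documents, `find_one_and_update(...)[:_id]` for upserts), which is what lets the README sample chain `id = mongo_out_product ...` into a later `mongo_find_product id`. A sketch of that round trip (scraper name and fields are illustrative):

```ruby
Scruber.run :shop do
  get 'http://example.com/product'

  parse :html do |page, doc|
    # 0.1.1: mongo_out returns the _id for inserts as well as updates
    id = mongo_out(title: doc.at('title').text)

    record = mongo_find(id)             # read the document back by its id
    record[:checked_at] = Time.now.to_i
    mongo_out(record)                   # _id present -> upsert via $set, same id returned
  end
end
```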
data/lib/scruber/queue_adapters/mongo.rb
CHANGED
@@ -1,56 +1,114 @@
 module Scruber
   module QueueAdapters
     class Mongo < AbstractAdapter
-      attr_reader :error_pages

       class Page < Scruber::QueueAdapters::AbstractAdapter::Page
         def id
-          @options[:_id] || @
+          @options[:_id] || @id
         end

-
+        #
+        # Saving page to queue
+        # @param options [Hash] saving options
+        # @param save_options={} [type] [description]
+        #
+        # @return [type] [description]
+        def save(options={}, save_options={})
           if id.blank?
             @queue.collection.insert_one(attrs)
           else
-
-
-
-
-
+            if options[:new]
+              @queue.collection.find_one_and_update(
+                {"_id" => self.id },
+                {'$setOnInsert' => attrs },
+                {return_document: :after, upsert: true, projection: {_id: 1}}.merge(options)
+              )
+            else
+              @queue.collection.find_one_and_update(
+                {"_id" => self.id },
+                {'$set' => attrs },
+                {return_document: :after, upsert: true, projection: {_id: 1}}.merge(options)
+              )
+            end
           end
         end

+        #
+        # Mark page as processed by parser and save it
+        #
+        # @return [void]
+        def processed!
+          # Monkey patch for processing error pages.
+          if @fetched_at == 0
+            @fetched_at = -1
+          end
+          super
+        end
+
+        #
+        # Generating hash with mongo doc attributes
+        #
+        # @return [Hash] hash with page attributes
         def attrs
           @options.with_indifferent_access.except('id', '_id').merge(id.present? ? {_id: id} : {}).merge (instance_variables.select{|ivar| !(ivar.to_s =~ /\@_/) }-[:@options, :@queue]).inject({}){|acc,ivar| acc[ivar[1..-1]] = instance_variable_get(ivar);acc }.with_indifferent_access
         end

+        #
+        # Delete record from Mongo collection
+        #
+        # @return [void]
         def delete
           @queue.collection.find({"_id" => self.id }).delete_one if self.id.present?
         end
       end

-      #
-      #
-      #
-
-
+      #
+      # Add page to queue
+      # @param url [String] URL of page
+      # @param options [Hash] Other options, see {Scruber::QueueAdapters::AbstractAdapter::Page}
+      #
+      # @return [void]
+      def add(url_or_page, options={})
         if url_or_page.is_a?(Page)
           url_or_page.queue = self
-          url_or_page.save(options)
+          url_or_page.save({new: true}.merge(options))
         else
-          Page.new(self, url_or_page
+          Page.new(self, options.merge(url: url_or_page)).save({new: true})
         end
       end
-      alias_method :
+      alias_method :push, :add

-
+      #
+      # Size of queue
+      #
+      # @return [Integer] count of pages in queue
+      def size
         collection.count
       end

+      #
+      # Count of downloaded pages
+      # Using to show downloading progress.
+      #
+      # @return [Integer] count of downloaded pages
+      def downloaded_count
+        collection.find({fetched_at: {"$gt" => 0}}).count
+      end
+
+      #
+      # Search page by id
+      # @param id [Object] id of page
+      #
+      # @return [Page] page object
       def find(id)
         build_pages collection.find({_id: id}).first
       end

+      #
+      # Fetch downloaded and not processed pages for feching
+      # @param count=nil [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
       def fetch_downloaded(count=nil)
         if count.nil?
           build_pages collection.find({fetched_at: {"$gt" => 0}, processed_at: 0}).first
@@ -59,34 +117,81 @@ module Scruber
         end
       end

+      #
+      # Fetch pending page for fetching
+      # @param count=nil [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
       def fetch_pending(count=nil)
         if count.nil?
-          build_pages collection.find({fetched_at: 0, retry_at: {"$lte" => Time.now.to_i}}).first
+          build_pages collection.find({fetched_at: 0, retry_count: {"$lt" => ::Scruber.configuration.fetcher_options[:max_retry_times]}, retry_at: {"$lte" => Time.now.to_i}}).first
         else
-          build_pages collection.find({fetched_at: 0, retry_at: {"$lte" => Time.now.to_i}}).limit(count).to_a
+          build_pages collection.find({fetched_at: 0, retry_count: {"$lt" => ::Scruber.configuration.fetcher_options[:max_retry_times]}, retry_at: {"$lte" => Time.now.to_i}}).limit(count).to_a
         end
       end

+      #
+      # Fetch error page
+      # @param count=nil [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
+      def fetch_error(count=nil)
+        if count.nil?
+          build_pages collection.find({fetched_at: 0, retry_count: {"$gte" => ::Scruber.configuration.fetcher_options[:max_retry_times]}}).first
+        else
+          build_pages collection.find({fetched_at: 0, retry_count: {"$gte" => ::Scruber.configuration.fetcher_options[:max_retry_times]}}).limit(count).to_a
+        end
+      end
+
+      #
+      # Used by Core. It checks for pages that are
+      # not downloaded or not parsed yet.
+      #
+      # @return [Boolean] true if queue still has work for scraper
       def has_work?
         fetch_pending.present? || fetch_downloaded.present?
       end

+      #
+      # Accessing to mongo collection instance
+      #
+      # @return [Mongo::Collection] Mongo collection instance
       def collection
         Scruber::Mongo.client[pages_collection_name]
       end

+      #
+      # Check if queue was initialized.
+      # Using for `seed` method. If queue was initialized,
+      # then no need to run seed block.
+      #
+      # @return [Boolean] true if queue already was initialized
+      def initialized?
+        Scruber::Mongo.client[pages_collection_name].find.first.present?
+      end
+
       private

+      #
+      # Wrapping mongo objects into queue Page objects
+      #
+      # @param pages [Hash|Array<Hash>] Mongo document or array of mongo documents
+      #
+      # @return [type] [description]
       def build_pages(pages)
         if pages.nil?
           nil
         elsif pages.is_a?(Array)
-          pages.map{|p| Page.new(self, p['url']
+          pages.map{|p| Page.new(self, p.with_indifferent_access.merge(url: p['url']) )}
         else
-          Page.new(self, pages['url']
+          Page.new(self, pages.with_indifferent_access.merge(url: pages['url']) )
         end
       end

+      #
+      # Generating mongo pages collection name
+      #
+      # @return [String] name of pages collection
       def pages_collection_name
         @_pages_collection_name ||= [Scruber::Mongo.configuration.options['collections_prefix'], @options[:scraper_name], 'pages'].select(&:present?).map(&:to_s).join('_')
       end
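Taken together, the queue changes split the old pending pool in two: `fetch_pending` now skips pages whose `retry_count` has reached `fetcher_options[:max_retry_times]`, and the new `fetch_error` returns exactly those pages. A rough usage sketch; the adapter constructor arguments are inferred from `@options[:scraper_name]` in this diff, not from separate documentation:

```ruby
queue = Scruber::QueueAdapters::Mongo.new(scraper_name: :shop)

queue.add 'http://example.com/product'  # upserts with $setOnInsert, so re-adding won't clobber state
queue.size                              # total pages in the collection
queue.downloaded_count                  # pages with fetched_at > 0, used for progress display

pending = queue.fetch_pending(10)       # pages still under the retry limit
failed  = queue.fetch_error(10)         # pages that exhausted max_retry_times
```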
data/scruber-mongo.gemspec
CHANGED
@@ -30,11 +30,11 @@ Gem::Specification.new do |spec|
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]

-  spec.add_dependency "scruber", "~> 0.1.
+  spec.add_dependency "scruber", "~> 0.1.6"
   spec.add_dependency "mongo", "~> 2.4"
   spec.add_development_dependency "bundler", "~> 1.16"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec", "~> 3.0"
-  spec.add_development_dependency "database_cleaner",
+  spec.add_development_dependency "database_cleaner", '~> 1.6', '>= 1.6.0'
   spec.add_development_dependency "webmock", "3.0.1"
 end
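The loosened `database_cleaner` constraint is the common pessimistic-plus-floor pattern: `'~> 1.6', '>= 1.6.0'` admits any 1.x release from 1.6.0 up, where the old `~> 1.6.0` stopped below 1.7. A quick check with stock RubyGems (not part of this package):

```ruby
require 'rubygems'

req = Gem::Requirement.new('~> 1.6', '>= 1.6.0')
req.satisfied_by?(Gem::Version.new('1.6.2'))  # => true
req.satisfied_by?(Gem::Version.new('1.9.0'))  # => true  ("~> 1.6.0" would have rejected this)
req.satisfied_by?(Gem::Version.new('2.0.0'))  # => false ("~> 1.6" caps below 2.0)
```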
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scruber-mongo
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Ivan Goncharov
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-
+date: 2018-04-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: scruber
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.
+        version: 0.1.6
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.
+        version: 0.1.6
 - !ruby/object:Gem::Dependency
   name: mongo
   requirement: !ruby/object:Gem::Requirement
@@ -85,6 +85,9 @@ dependencies:
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.0
   type: :development
@@ -92,6 +95,9 @@ dependencies:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.0
 - !ruby/object:Gem::Dependency
@@ -130,6 +136,7 @@ files:
 - lib/scruber/mongo.rb
 - lib/scruber/mongo/cli/generators.rb
 - lib/scruber/mongo/cli/templates/mongo.tt
+- lib/scruber/mongo/cli/templates/mongo_initializer.tt
 - lib/scruber/mongo/configuration.rb
 - lib/scruber/mongo/factory.rb
 - lib/scruber/mongo/version.rb