scruber-mongo 0.1.0 → 0.1.1
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/Gemfile +1 -1
- data/Gemfile.lock +19 -13
- data/README.md +26 -14
- data/lib/scruber/core/extensions/mongo_output.rb +89 -3
- data/lib/scruber/mongo/cli/generators.rb +1 -0
- data/lib/scruber/mongo/cli/templates/mongo_initializer.tt +3 -0
- data/lib/scruber/mongo/version.rb +1 -1
- data/lib/scruber/queue_adapters/mongo.rb +126 -21
- data/scruber-mongo.gemspec +2 -2
- metadata +11 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5c298bffe030c11d719251f76cd10f7e12c18c39
+  data.tar.gz: 458c6f63660ef791c504480cd4b6b640b5cca9f9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: acfc91d30bf3f9fb48b344a3b9ad904c9f46eff2411675538fa46ab7de544ea6864ff20dec23b73afcaffb255e4f6c458925b4f36aaa9553c7f079aea2dc4da8
+  data.tar.gz: d4e10f9ffd106090beef149dd346785cc40ec6b8e3f2133a53a773f586f1f8d91b34c54ee373acdf9fd34dbff611d5e8e385eaf14cccf0ac489898b47ebd649f
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,16 +1,16 @@
 PATH
   remote: .
   specs:
-    scruber-mongo (0.1.
+    scruber-mongo (0.1.1)
       mongo (~> 2.4)
-      scruber (~> 0.1.
+      scruber (~> 0.1.6)
 
 GEM
   remote: https://rubygems.org/
   specs:
-    activesupport (5.
+    activesupport (5.2.0)
       concurrent-ruby (~> 1.0, >= 1.0.2)
-      i18n (
+      i18n (>= 0.7, < 2)
       minitest (~> 5.1)
       tzinfo (~> 1.1)
     addressable (2.5.2)
@@ -21,15 +21,16 @@ GEM
       safe_yaml (~> 1.0.0)
     database_cleaner (1.6.2)
     diff-lcs (1.3)
-    domain_name (0.5.
+    domain_name (0.5.20180417)
       unf (>= 0.0.5, < 1.0.0)
     ethon (0.11.0)
       ffi (>= 1.3.0)
     ffi (1.9.23)
     hashdiff (0.3.7)
+    hashie (3.5.7)
     http-cookie (1.0.3)
       domain_name (~> 0.5)
-    i18n (0.
+    i18n (1.0.1)
       concurrent-ruby (~> 1.0)
     mini_portile2 (2.3.0)
     minitest (5.11.3)
@@ -37,7 +38,10 @@ GEM
       bson (>= 4.3.0, < 5.0.0)
     nokogiri (1.8.2)
       mini_portile2 (~> 2.3.0)
+    paint (2.0.1)
     pickup (0.0.11)
+    powerbar (2.0.1)
+      hashie (>= 1.1.0)
     public_suffix (3.0.2)
     rake (10.5.0)
     rspec (3.7.0)
@@ -54,16 +58,18 @@ GEM
       rspec-support (~> 3.7.0)
     rspec-support (3.7.1)
     safe_yaml (1.0.4)
-    scruber (0.1.
-      activesupport (
+    scruber (0.1.6)
+      activesupport (~> 5.1, >= 5.1.5)
       http-cookie (= 1.0.3)
-      nokogiri (
-
+      nokogiri (~> 1.8, >= 1.8.2)
+      paint (~> 2.0, >= 2.0.1)
+      pickup (~> 0.0.11)
+      powerbar (~> 2.0, >= 2.0.1)
       thor (= 0.20.0)
-      typhoeus (
+      typhoeus (~> 1.1, >= 1.1.2)
     thor (0.20.0)
     thread_safe (0.3.6)
-    typhoeus (1.
+    typhoeus (1.3.0)
       ethon (>= 0.9.0)
     tzinfo (1.2.5)
       thread_safe (~> 0.1)
@@ -80,7 +86,7 @@ PLATFORMS
 
 DEPENDENCIES
   bundler (~> 1.16)
-  database_cleaner (~> 1.6.0)
+  database_cleaner (~> 1.6, >= 1.6.0)
   rake (~> 10.0)
   rspec (~> 3.0)
   scruber-mongo!
data/README.md
CHANGED
@@ -1,38 +1,50 @@
-# Scruber
+# Scruber-mongo
 
-
-
-TODO: Delete this and the text above, and describe your gem
+This gem provides Mongo support for Scruber
 
 ## Installation
 
-Add this line to your application's Gemfile:
+1. Add this line to your application's Gemfile:
 
 ```ruby
 gem 'scruber-mongo'
 ```
 
-And then execute:
+2. And then execute:
 
     $ bundle
 
-
+3. Install gem
+
+    $ scruber generate mongo:install
 
-
+This gem provides Queue driver, Output driver and FetcherAgent driver for mongo.
 
-##
+## Sample scraper
 
-
+```ruby
+Scruber.run do
+  get "http://example.abc/product"
+
+  parse :html do |page, doc|
+    id = mongo_out_product title: doc.at('title').text
 
-
+    get_reviews URI.join(page.url, doc.at('a.review_link').attr('href')).to_s, product_id: id
+  end
 
-
+  parse_reviews :html do |page,doc|
+    product = mongo_find_product page.options[:product_id]
 
-
+    product[:reviews] = doc.search('.review').map{|r| {author: r.at('.author').text, text: r.at('.text').text } }
+
+    mongo_out_product product
+  end
+end
+```
 
 ## Contributing
 
-Bug reports and pull requests are welcome on GitHub at https://github.com/
+Bug reports and pull requests are welcome on GitHub at https://github.com/scruber/scruber-mongo.
 
 ## License
 
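The `mongo_out_product` and `mongo_find_product` calls in the sample are suffixed variants of the generic `mongo_out`/`mongo_find` registered by the `MongoOutput` extension shown further down in this diff; the suffix only changes which collection the document goes to, named `{prefix}_{scraper_name}_{suffix}`. A minimal plain-Ruby sketch of that naming rule (the prefix is passed in explicitly here instead of being read from `Scruber::Mongo.configuration`, and ActiveSupport's `present?` is approximated with a nil/empty check):

```ruby
# Stand-in for out_collection_name from this diff, runnable without Scruber.
def out_collection_name(prefix, scraper_name, suffix)
  [prefix, scraper_name, suffix]
    .reject { |part| part.nil? || part.to_s.empty? } # approximates select(&:present?)
    .map(&:to_s)
    .join('_')
end

puts out_collection_name('scruber', :products_scraper, 'records') # => scruber_products_scraper_records
puts out_collection_name(nil, :products_scraper, 'product')       # => products_scraper_product
```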
data/lib/scruber/core/extensions/mongo_output.rb
CHANGED
@@ -1,17 +1,67 @@
 module Scruber
   module Core
     module Extensions
+
+      #
+      # Extension for writing results to mongo collections.
+      # It registers methods for writing documents:
+      #   mongo_out({..}) # writing document to {prefix}_{scraper_name}_records
+      #   mongo_out_product({..}) # writing document to {prefix}_{scraper_name}_product
+      # Searching methods:
+      #   mongo_find({..}) # searching document in {prefix}_{scraper_name}_records
+      #   mongo_find_product({..}) # searching document in {prefix}_{scraper_name}_product
+      # Accessing to mongo collection:
+      #   mongo_collection({..}) # Direct access to {prefix}_{scraper_name}_records
+      #   mongo_product_collection({..}) # Direct access to {prefix}_{scraper_name}_product
+      #
+      # @example Writing products data and companies
+      #   Scruber.run :simple do
+      #     get_product 'http://example.com/product'
+      #     get_company 'http://example.com/product'
+      #
+      #     parse_product :html do |page,doc|
+      #       id = mongo_out_product {title: doc.at('h1').text, price: doc.at('.price').text }
+      #       record = mongo_find_product id
+      #       record[:description] = doc.at('.desc').text
+      #       mongo_out_product record
+      #       log "Count: #{mongo_product_collection.count}"
+      #     end
+      #
+      #     parse_company :html do |page,doc|
+      #       mongo_out_company {name: doc.at('h1').text, phone: doc.at('.phone').text }
+      #     end
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class MongoOutput < Base
         module CoreMethods
 
+          #
+          # Mongo out default method. By default it uses suffix *_records*
+          #
+          # @param fields [Hash] Fields to output
+          # @param options [Hash] Output options, see https://docs.mongodb.com/manual/reference/method/db.collection.findOneAndUpdate/
+          #
+          # @return [Object] id of writed record
           def mongo_out(fields, options={})
             Scruber::Core::Extensions::MongoOutput.mongo_out self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name, fields, options
           end
 
+          #
+          # Find mongo document by id
+          #
+          # @param id [Object] id of document
+          #
+          # @return [Hash] mongo document
           def mongo_find(id)
             Scruber::Core::Extensions::MongoOutput.mongo_find self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name, id
           end
 
+          #
+          # Direct access to mongo collection
+          #
+          # @return [Mongo::Collection] Mongo collection instance
           def mongo_collection
             Scruber::Core::Extensions::MongoOutput.mongo_collection self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name
           end
@@ -51,25 +101,47 @@ module Scruber
         end
 
         class << self
+          # Default mongo collection suffix name
           attr_writer :default_suffix_name
 
+          #
+          # Default mongo collection suffix name
+          #
+          # @return [String] Default mongo collection suffix name
          def default_suffix_name
            @default_suffix_name ||= 'records'
          end
 
+          #
+          # Writing results to mongo collection
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          # @param fields [Hash] Document to output
+          # @param options [Hash] Options for updating record (when *_id* not set), see https://docs.mongodb.com/manual/reference/method/db.collection.findOneAndUpdate/
+          #
+          # @return [type] [description]
          def mongo_out(scraper_name, suffix, fields, options={})
            fields = fields.with_indifferent_access
            if fields[:_id].blank?
-              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].insert_one(fields)
+              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].insert_one(fields).inserted_id
            else
              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find_one_and_update(
                {"_id" => fields[:_id] },
                {'$set' => fields },
-                {return_document: :
-              )
+                {return_document: :after, upsert: true}.merge(options)
+              )[:_id]
            end
          end
 
+          #
+          # Searching document in mongo
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          # @param id [Object] id of document
+          #
+          # @return [Hash] document
          def mongo_find(scraper_name, suffix, id)
            if id.is_a?(Hash)
              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find(id)
@@ -78,10 +150,24 @@ module Scruber
            end
          end
 
+          #
+          # Access to mongo collection
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          #
+          # @return [Mongo::Collection] instance of Mongo::Collection
          def mongo_collection(scraper_name, suffix)
            Scruber::Mongo.client[out_collection_name(scraper_name, suffix)]
          end
 
+          #
+          # Collection name builder
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          #
+          # @return [String] name of collection for given scraper_name and suffix
          def out_collection_name(scraper_name, suffix)
            [Scruber::Mongo.configuration.options['collections_prefix'], scraper_name, suffix].select(&:present?).map(&:to_s).join('_')
          end
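The behavioral change in `MongoOutput.mongo_out` above is its return value: a plain insert now returns `inserted_id`, and an update upserts with `return_document: :after` and returns the document's `_id`, so a scraper can keep the id and re-save the same record later (as the README sample does). A short sketch of that pattern with the `mongo` driver used directly; the localhost connection and the `scruber_test` database name are assumptions for illustration only:

```ruby
require 'mongo'

client  = Mongo::Client.new(['127.0.0.1:27017'], database: 'scruber_test')
records = client['scruber_simple_records']

# No _id given: plain insert, keep the generated id (mirrors `.inserted_id` above).
id = records.insert_one(title: 'Example product').inserted_id

# _id given: upsert and return the document after the update (mirrors `[:_id]` above).
doc = records.find_one_and_update(
  { '_id' => id },
  { '$set' => { title: 'Example product', price: '9.99' } },
  return_document: :after, upsert: true
)
puts doc[:_id] == id # => true
```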
data/lib/scruber/queue_adapters/mongo.rb
CHANGED
@@ -1,56 +1,114 @@
 module Scruber
   module QueueAdapters
     class Mongo < AbstractAdapter
-      attr_reader :error_pages
 
       class Page < Scruber::QueueAdapters::AbstractAdapter::Page
         def id
-          @options[:_id] || @
+          @options[:_id] || @id
         end
 
-
+        #
+        # Saving page to queue
+        # @param options [Hash] saving options
+        # @param save_options={} [type] [description]
+        #
+        # @return [type] [description]
+        def save(options={}, save_options={})
          if id.blank?
            @queue.collection.insert_one(attrs)
          else
-
-
-
-
-
+            if options[:new]
+              @queue.collection.find_one_and_update(
+                {"_id" => self.id },
+                {'$setOnInsert' => attrs },
+                {return_document: :after, upsert: true, projection: {_id: 1}}.merge(options)
+              )
+            else
+              @queue.collection.find_one_and_update(
+                {"_id" => self.id },
+                {'$set' => attrs },
+                {return_document: :after, upsert: true, projection: {_id: 1}}.merge(options)
+              )
+            end
          end
        end
 
+        #
+        # Mark page as processed by parser and save it
+        #
+        # @return [void]
+        def processed!
+          # Monkey patch for processing error pages.
+          if @fetched_at == 0
+            @fetched_at = -1
+          end
+          super
+        end
+
+        #
+        # Generating hash with mongo doc attributes
+        #
+        # @return [Hash] hash with page attributes
        def attrs
          @options.with_indifferent_access.except('id', '_id').merge(id.present? ? {_id: id} : {}).merge (instance_variables.select{|ivar| !(ivar.to_s =~ /\@_/) }-[:@options, :@queue]).inject({}){|acc,ivar| acc[ivar[1..-1]] = instance_variable_get(ivar);acc }.with_indifferent_access
        end
 
+        #
+        # Delete record from Mongo collection
+        #
+        # @return [void]
        def delete
          @queue.collection.find({"_id" => self.id }).delete_one if self.id.present?
        end
      end
 
-      #
-      #
-      #
-
-
+      #
+      # Add page to queue
+      # @param url [String] URL of page
+      # @param options [Hash] Other options, see {Scruber::QueueAdapters::AbstractAdapter::Page}
+      #
+      # @return [void]
+      def add(url_or_page, options={})
        if url_or_page.is_a?(Page)
          url_or_page.queue = self
-          url_or_page.save(options)
+          url_or_page.save({new: true}.merge(options))
        else
-          Page.new(self, url_or_page
+          Page.new(self, options.merge(url: url_or_page)).save({new: true})
        end
      end
-      alias_method :
+      alias_method :push, :add
 
-
+      #
+      # Size of queue
+      #
+      # @return [Integer] count of pages in queue
+      def size
        collection.count
      end
 
+      #
+      # Count of downloaded pages
+      # Using to show downloading progress.
+      #
+      # @return [Integer] count of downloaded pages
+      def downloaded_count
+        collection.find({fetched_at: {"$gt" => 0}}).count
+      end
+
+      #
+      # Search page by id
+      # @param id [Object] id of page
+      #
+      # @return [Page] page object
      def find(id)
        build_pages collection.find({_id: id}).first
      end
 
+      #
+      # Fetch downloaded and not processed pages for feching
+      # @param count=nil [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
      def fetch_downloaded(count=nil)
        if count.nil?
          build_pages collection.find({fetched_at: {"$gt" => 0}, processed_at: 0}).first
@@ -59,34 +117,81 @@ module Scruber
        end
      end
 
+      #
+      # Fetch pending page for fetching
+      # @param count=nil [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
      def fetch_pending(count=nil)
        if count.nil?
-          build_pages collection.find({fetched_at: 0, retry_at: {"$lte" => Time.now.to_i}}).first
+          build_pages collection.find({fetched_at: 0, retry_count: {"$lt" => ::Scruber.configuration.fetcher_options[:max_retry_times]}, retry_at: {"$lte" => Time.now.to_i}}).first
        else
-          build_pages collection.find({fetched_at: 0, retry_at: {"$lte" => Time.now.to_i}}).limit(count).to_a
+          build_pages collection.find({fetched_at: 0, retry_count: {"$lt" => ::Scruber.configuration.fetcher_options[:max_retry_times]}, retry_at: {"$lte" => Time.now.to_i}}).limit(count).to_a
        end
      end
 
+      #
+      # Fetch error page
+      # @param count=nil [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
+      def fetch_error(count=nil)
+        if count.nil?
+          build_pages collection.find({fetched_at: 0, retry_count: {"$gte" => ::Scruber.configuration.fetcher_options[:max_retry_times]}}).first
+        else
+          build_pages collection.find({fetched_at: 0, retry_count: {"$gte" => ::Scruber.configuration.fetcher_options[:max_retry_times]}}).limit(count).to_a
+        end
+      end
+
+      #
+      # Used by Core. It checks for pages that are
+      # not downloaded or not parsed yet.
+      #
+      # @return [Boolean] true if queue still has work for scraper
      def has_work?
        fetch_pending.present? || fetch_downloaded.present?
      end
 
+      #
+      # Accessing to mongo collection instance
+      #
+      # @return [Mongo::Collection] Mongo collection instance
      def collection
        Scruber::Mongo.client[pages_collection_name]
      end
 
+      #
+      # Check if queue was initialized.
+      # Using for `seed` method. If queue was initialized,
+      # then no need to run seed block.
+      #
+      # @return [Boolean] true if queue already was initialized
+      def initialized?
+        Scruber::Mongo.client[pages_collection_name].find.first.present?
+      end
+
      private
 
+      #
+      # Wrapping mongo objects into queue Page objects
+      #
+      # @param pages [Hash|Array<Hash>] Mongo document or array of mongo documents
+      #
+      # @return [type] [description]
      def build_pages(pages)
        if pages.nil?
          nil
        elsif pages.is_a?(Array)
-          pages.map{|p| Page.new(self, p['url']
+          pages.map{|p| Page.new(self, p.with_indifferent_access.merge(url: p['url']) )}
        else
-          Page.new(self, pages['url']
+          Page.new(self, pages.with_indifferent_access.merge(url: pages['url']) )
        end
      end
 
+      #
+      # Generating mongo pages collection name
+      #
+      # @return [String] name of pages collection
      def pages_collection_name
        @_pages_collection_name ||= [Scruber::Mongo.configuration.options['collections_prefix'], @options[:scraper_name], 'pages'].select(&:present?).map(&:to_s).join('_')
      end
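The queue adapter now gates `fetch_pending` on `retry_count` staying below `Scruber.configuration.fetcher_options[:max_retry_times]`, while the new `fetch_error` returns exactly the pages that have exhausted their retries (the `processed!` patch above lets those pages still be marked as handled). A plain-Ruby sketch of that split; the page hashes and the retry limit of 5 are made up for illustration:

```ruby
MAX_RETRY_TIMES = 5 # stands in for Scruber.configuration.fetcher_options[:max_retry_times]

# Mirrors the fetch_pending query: not fetched, retries left, retry time reached.
def pending?(page, now = Time.now.to_i)
  page[:fetched_at] == 0 &&
    page[:retry_count] < MAX_RETRY_TIMES &&
    page[:retry_at] <= now
end

# Mirrors the fetch_error query: not fetched and out of retries.
def error?(page)
  page[:fetched_at] == 0 && page[:retry_count] >= MAX_RETRY_TIMES
end

pages = [
  { url: 'http://example.abc/ok',     fetched_at: 0, retry_count: 1, retry_at: 0 },
  { url: 'http://example.abc/broken', fetched_at: 0, retry_count: 5, retry_at: 0 }
]

p pages.select { |pg| pending?(pg) }.map { |pg| pg[:url] } # => ["http://example.abc/ok"]
p pages.select { |pg| error?(pg) }.map { |pg| pg[:url] }   # => ["http://example.abc/broken"]
```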
data/scruber-mongo.gemspec
CHANGED
@@ -30,11 +30,11 @@ Gem::Specification.new do |spec|
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
 
-  spec.add_dependency "scruber", "~> 0.1.
+  spec.add_dependency "scruber", "~> 0.1.6"
   spec.add_dependency "mongo", "~> 2.4"
   spec.add_development_dependency "bundler", "~> 1.16"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec", "~> 3.0"
-  spec.add_development_dependency "database_cleaner",
+  spec.add_development_dependency "database_cleaner", '~> 1.6', '>= 1.6.0'
   spec.add_development_dependency "webmock", "3.0.1"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scruber-mongo
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.1
 platform: ruby
 authors:
 - Ivan Goncharov
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-
+date: 2018-04-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: scruber
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.
+        version: 0.1.6
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: 0.1.
+        version: 0.1.6
 - !ruby/object:Gem::Dependency
   name: mongo
   requirement: !ruby/object:Gem::Requirement
@@ -85,6 +85,9 @@ dependencies:
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.0
   type: :development
@@ -92,6 +95,9 @@ dependencies:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.0
 - !ruby/object:Gem::Dependency
@@ -130,6 +136,7 @@ files:
 - lib/scruber/mongo.rb
 - lib/scruber/mongo/cli/generators.rb
 - lib/scruber/mongo/cli/templates/mongo.tt
+- lib/scruber/mongo/cli/templates/mongo_initializer.tt
 - lib/scruber/mongo/configuration.rb
 - lib/scruber/mongo/factory.rb
 - lib/scruber/mongo/version.rb