polipus-cassandra 0.0.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +35 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +2 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +2 -0
- data/lib/polipus-cassandra.rb +5 -0
- data/lib/polipus-cassandra/policies/default.rb +30 -0
- data/lib/polipus-cassandra/policies/policies.rb +1 -0
- data/lib/polipus-cassandra/queue_overflow/cassandra_queue.rb +307 -0
- data/lib/polipus-cassandra/storage/cassandra_store.rb +244 -0
- data/polipus-cassandra.gemspec +30 -0
- data/spec/polipus-cassandra/storage/cassandra_store_spec.rb +174 -0
- data/spec/spec_helper.rb +44 -0
- metadata +145 -17
- checksums.yaml +0 -15
data/.gitignore
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/test/tmp/
|
9
|
+
/test/version_tmp/
|
10
|
+
/tmp/
|
11
|
+
|
12
|
+
## Specific to RubyMotion:
|
13
|
+
.dat*
|
14
|
+
.repl_history
|
15
|
+
build/
|
16
|
+
|
17
|
+
## Documentation cache and generated files:
|
18
|
+
/.yardoc/
|
19
|
+
/_yardoc/
|
20
|
+
/doc/
|
21
|
+
/rdoc/
|
22
|
+
|
23
|
+
## Environment normalisation:
|
24
|
+
/.bundle/
|
25
|
+
/vendor/bundle
|
26
|
+
/lib/bundler/man/
|
27
|
+
|
28
|
+
# for a library or gem, you might want to ignore these files since the code is
|
29
|
+
# intended to run in multiple environments; otherwise, check them in:
|
30
|
+
Gemfile.lock
|
31
|
+
# .ruby-version
|
32
|
+
# .ruby-gemset
|
33
|
+
|
34
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
35
|
+
.rvmrc
|
data/.rspec
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.9.3-p551
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 Stefano Fontanelli
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Polipus: addons for Cassandra
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'polipus-cassandra'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install polipus-cassandra
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it ( http://github.com/<my-github-username>/polipus-storage-s3/fork )
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'cassandra'
|
3
|
+
|
4
|
+
module Corm
|
5
|
+
module Retry
|
6
|
+
module Policies
|
7
|
+
class Default
|
8
|
+
include Cassandra::Retry::Policy
|
9
|
+
|
10
|
+
def read_timeout(_statement, consistency, _required, _received, retrieved, retries)
|
11
|
+
return reraise if retries >= 5
|
12
|
+
sleep(retries.to_f + Random.rand(0.0..1.0))
|
13
|
+
retrieved ? reraise : try_again(consistency)
|
14
|
+
end
|
15
|
+
|
16
|
+
def write_timeout(_statement, consistency, _type, _required, _received, retries)
|
17
|
+
return reraise if retries >= 5
|
18
|
+
sleep(retries.to_f + Random.rand(0.0..1.0))
|
19
|
+
try_again(consistency)
|
20
|
+
end
|
21
|
+
|
22
|
+
def unavailable(_statement, consistency, _required, _alive, retries)
|
23
|
+
return reraise if retries >= 5
|
24
|
+
sleep(retries.to_f + Random.rand(0.0..1.0))
|
25
|
+
try_again(consistency)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'polipus-cassandra/policies/default'
|
@@ -0,0 +1,307 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'cassandra'
|
3
|
+
require 'polipus'
|
4
|
+
|
5
|
+
module Polipus
|
6
|
+
module QueueOverflow
|
7
|
+
class CassandraQueue
|
8
|
+
|
9
|
+
# CassandraQueue wants to persist documents (please, still ignore the
|
10
|
+
# jargon inherited from Mongo) like the following JSON-ish entry.
|
11
|
+
#
|
12
|
+
# There is no superclass here but I've in mind the interface implicitly
|
13
|
+
# defined by Polipus::QueueOverflow::DevNullQueue that, more or less has:
|
14
|
+
#
|
15
|
+
# def initialize
|
16
|
+
# def length
|
17
|
+
# def empty?
|
18
|
+
# def clear
|
19
|
+
# def push(_data)
|
20
|
+
# def pop(_ = false)
|
21
|
+
#
|
22
|
+
# Taking some data from our backend.production.*****.com/polipus
|
23
|
+
# I found:
|
24
|
+
#
|
25
|
+
# mongos> db.getCollectionNames()
|
26
|
+
# [
|
27
|
+
# "data-com-companies",
|
28
|
+
# "data_com_companies",
|
29
|
+
# "googleplus",
|
30
|
+
# "linkedin",
|
31
|
+
# "linkedin-companies",
|
32
|
+
# "linkedin_companies_parsed",
|
33
|
+
# "linkedin_jobs",
|
34
|
+
# "linkedin_jobs_parsed",
|
35
|
+
# "linkedin_pages_errors",
|
36
|
+
# "polipus_q_overflow_data-com-companies_queue_overflow",
|
37
|
+
# "polipus_q_overflow_data_com_companies_queue_overflow",
|
38
|
+
# "polipus_q_overflow_googleplus_queue_overflow",
|
39
|
+
# "polipus_q_overflow_linkedin-companies_queue_overflow",
|
40
|
+
# "polipus_q_overflow_linkedin_jobs_queue_overflow",
|
41
|
+
# "polipus_q_overflow_linkedin_jobs_queue_overflow_old",
|
42
|
+
# "polipus_q_overflow_linkedin_refresh_queue_overflow",
|
43
|
+
# "system.indexes"
|
44
|
+
# ]
|
45
|
+
#
|
46
|
+
# mongos> db.getCollection("polipus_q_overflow_linkedin_jobs_queue_overflow").find().limit(1)
|
47
|
+
# {
|
48
|
+
# "_id" : ObjectId("54506b98e3d55b20c40b32d3"),
|
49
|
+
# "payload" : "{\"url\":\"https://www.linkedin.com/job/product-designer-jobs/?page_num=7&trk=jserp_pagination_next\",\"depth\":6,\"referer\":\"https://www.linkedin.com/job/product-designer-jobs/?page_num=6&trk=jserp_pagination_6\",\"fetched\":false}"
|
50
|
+
# }
|
51
|
+
#
|
52
|
+
# mongos> db.polipus_q_overflow_linkedin_refresh_queue_overflow.find().limit(10)
|
53
|
+
# {
|
54
|
+
# "_id" : ObjectId("544072b6e3d55b0db7000001"),
|
55
|
+
# "payload" : "{\"url\":\"http://www.linkedin.com/in/*****\",\"depth\":0,\"fetched\":false}"
|
56
|
+
# }
|
57
|
+
#
|
58
|
+
# We also assume this MonkeyPatch:
|
59
|
+
# Polipus::QueueOverflow.cassandra_queue(namespace, options = {})
|
60
|
+
# that returns instances of this class.
|
61
|
+
|
62
|
+
attr_accessor :cluster, :keyspace, :table
|
63
|
+
|
64
|
+
# There is a validation enforced to `:keyspace` and `:table` because
|
65
|
+
# Cassandra is not happy when a keyspace or a table name contains an
|
66
|
+
# hyphen.
|
67
|
+
def initialize(options = {})
|
68
|
+
raise ArgumentError unless options_are_valid?(options)
|
69
|
+
@cluster = options[:cluster]
|
70
|
+
@keyspace = options[:keyspace].gsub("-", "_")
|
71
|
+
@table = options[:table].gsub("-", "_")
|
72
|
+
@semaphore = Mutex.new
|
73
|
+
@options = options
|
74
|
+
@timeuuid_generator = Cassandra::Uuid::Generator.new
|
75
|
+
@logger = @options[:logger] ||= Logger.new(STDOUT).tap { |l| l.level = Logger::INFO }
|
76
|
+
end
|
77
|
+
|
78
|
+
# Length aka Size aka Count is supported in Cassandra... like your POSQL
|
79
|
+
# you can COUNT.
|
80
|
+
#
|
81
|
+
# SELECT COUNT (*) FROM keyspace.table_name;
|
82
|
+
#
|
83
|
+
# TBH I'm not sure if being "defensive" and returning 0/nil in case
|
84
|
+
# the results is_empty? ... I'm leaving (now) the code simple and noisy
|
85
|
+
# if something went wrong in the COUNT.
|
86
|
+
def length
|
87
|
+
table_ = [keyspace, table].compact.join '.'
|
88
|
+
statement = "SELECT COUNT (*) FROM #{table_} ;"
|
89
|
+
result = session.execute(statement)
|
90
|
+
result.first['count']
|
91
|
+
end
|
92
|
+
|
93
|
+
# Return true if the table has no rows.
|
94
|
+
# This is achieved with a 'SELECT WITH LIMIT 1' query.
|
95
|
+
def empty?
|
96
|
+
return get.first.nil?
|
97
|
+
end
|
98
|
+
|
99
|
+
# Clear is a fancy name for a DROP TABLE IF EXISTS <table_>.
|
100
|
+
def clear
|
101
|
+
table_ = [keyspace, table].compact.join '.'
|
102
|
+
statement = "DROP TABLE IF EXISTS #{table_} ;"
|
103
|
+
session.execute(statement)
|
104
|
+
end
|
105
|
+
|
106
|
+
# push is your the "write into Cassandra" method.
|
107
|
+
def push(data)
|
108
|
+
return nil if data.nil?
|
109
|
+
obj = MultiJson.decode(data)
|
110
|
+
|
111
|
+
table_ = [keyspace, table].compact.join('.')
|
112
|
+
queue_name = @keyspace
|
113
|
+
created_at = @timeuuid_generator.now
|
114
|
+
|
115
|
+
begin
|
116
|
+
@semaphore.synchronize do
|
117
|
+
|
118
|
+
if obj.has_key?('payload') && !obj['payload'].empty?
|
119
|
+
payload = MultiJson.encode(obj['payload'])
|
120
|
+
else
|
121
|
+
payload = nil
|
122
|
+
end
|
123
|
+
|
124
|
+
column_names = %w[ queue_name created_at payload ]
|
125
|
+
values_placeholders = column_names.map{|_| '?'}.join(',')
|
126
|
+
statement = "INSERT INTO #{table_} ( #{column_names.join(',')} ) VALUES (#{values_placeholders});"
|
127
|
+
|
128
|
+
session.execute(
|
129
|
+
session.prepare(statement),
|
130
|
+
arguments: [
|
131
|
+
queue_name,
|
132
|
+
created_at,
|
133
|
+
payload
|
134
|
+
])
|
135
|
+
end
|
136
|
+
rescue Encoding::UndefinedConversionError
|
137
|
+
puts $!.error_char.dump
|
138
|
+
puts $!.error_char.encoding
|
139
|
+
end
|
140
|
+
|
141
|
+
@logger.debug { "Writing this entry [#{[queue_name, created_at].to_s}]" }
|
142
|
+
[queue_name, created_at].to_s
|
143
|
+
end
|
144
|
+
|
145
|
+
# Pop removes 'n' entries from the overflow table (treated as a queue)
|
146
|
+
# and returns a paged result.
|
147
|
+
# results.class #=> Cassandra::Results::Paged
|
148
|
+
#
|
149
|
+
# Polipus is expecting a String, that will be JSONparsed with the purpose
|
150
|
+
# to build a
|
151
|
+
def pop(n = 1)
|
152
|
+
# A recap: pop should remove oldest N messages and return to the caller.
|
153
|
+
#
|
154
|
+
# Let's see how this queue is implemented.
|
155
|
+
# In redis, messages are LPUSH-ed:
|
156
|
+
#
|
157
|
+
# 4 - 3 - 2 - 1 --> REDIS
|
158
|
+
# 4 - 3 - 2 --> REDIS
|
159
|
+
# 4 - 3 --> REDIS
|
160
|
+
# 4 --> REDIS
|
161
|
+
#
|
162
|
+
# Then, in the fast_dequeue, are RPOP-ped:
|
163
|
+
#
|
164
|
+
# REDIS --> 1
|
165
|
+
# REDIS --> 2 - 1
|
166
|
+
# REDIS --> 3 - 2 - 1
|
167
|
+
# REDIS --> 4 - 3 - 2 - 1
|
168
|
+
#
|
169
|
+
# Then, are received in this order:
|
170
|
+
# [1] -> TimeUUID(1) = ...
|
171
|
+
# [2] -> TimeUUID(1) = ...
|
172
|
+
# [3] -> TimeUUID(1) = ...
|
173
|
+
# [4] -> TimeUUID(1) = ...
|
174
|
+
#
|
175
|
+
# As you can see below, are ORDER BY (created_at ASC)... that means
|
176
|
+
# "olders first". When using 'LIMIT n' in a query, you get the 'n'
|
177
|
+
# olders entries.
|
178
|
+
#
|
179
|
+
# cqlsh> SELECT * FROM polipus_queue_overflow_linkedin.linkedin_overflow ;
|
180
|
+
#
|
181
|
+
# queue_name | created_at | payload
|
182
|
+
# ---------------------------------+--------------------------------------+---------
|
183
|
+
# polipus_queue_overflow_linkedin | 4632d49c-1c04-11e5-844b-0b314c777502 | "1"
|
184
|
+
# polipus_queue_overflow_linkedin | 46339f8a-1c04-11e5-844b-0b314c777502 | "2"
|
185
|
+
# polipus_queue_overflow_linkedin | 46349962-1c04-11e5-844b-0b314c777502 | "3"
|
186
|
+
# polipus_queue_overflow_linkedin | 46351860-1c04-11e5-844b-0b314c777502 | "4"
|
187
|
+
#
|
188
|
+
# (4 rows)
|
189
|
+
# cqlsh> SELECT * FROM polipus_queue_overflow_linkedin.linkedin_overflow LIMIT 1;
|
190
|
+
#
|
191
|
+
# queue_name | created_at | payload
|
192
|
+
# ---------------------------------+--------------------------------------+---------
|
193
|
+
# polipus_queue_overflow_linkedin | 4632d49c-1c04-11e5-844b-0b314c777502 | "1"
|
194
|
+
#
|
195
|
+
# (1 rows)
|
196
|
+
#
|
197
|
+
table_ = [keyspace, table].compact.join '.'
|
198
|
+
results = get(n)
|
199
|
+
results.each do |entry|
|
200
|
+
statement = "DELETE FROM #{table_} WHERE queue_name = '#{entry['queue_name']}' AND created_at = #{entry['created_at']} ;"
|
201
|
+
session.execute(statement)
|
202
|
+
end
|
203
|
+
|
204
|
+
# Let's respect the API as expected by Polipus.
|
205
|
+
# Otherwise the execute returns a Cassandra::Results::Paged
|
206
|
+
if !results.nil? && results.respond_to?(:count) && results.count == 1
|
207
|
+
return results.first['payload']
|
208
|
+
end
|
209
|
+
return results
|
210
|
+
end
|
211
|
+
|
212
|
+
alias_method :size, :length
|
213
|
+
alias_method :dec, :pop
|
214
|
+
alias_method :shift, :pop
|
215
|
+
alias_method :enc, :push
|
216
|
+
alias_method :<<, :push
|
217
|
+
|
218
|
+
def keyspace!(replication = nil, durable_writes = true)
|
219
|
+
replication ||= "{'class': 'SimpleStrategy', 'replication_factor': '3'}"
|
220
|
+
statement = "CREATE KEYSPACE IF NOT EXISTS #{keyspace} WITH replication = #{replication} AND durable_writes = #{durable_writes};"
|
221
|
+
cluster.connect.execute(statement)
|
222
|
+
end
|
223
|
+
|
224
|
+
def session
|
225
|
+
@session ||= @cluster.connect(keyspace)
|
226
|
+
end
|
227
|
+
|
228
|
+
# Taking a look in the Cassandra KEYSPACE you will found:
|
229
|
+
#
|
230
|
+
# cqlsh> DESCRIBE KEYSPACE polipus_queue_overflow_linkedin ;
|
231
|
+
#
|
232
|
+
# CREATE KEYSPACE polipus_queue_overflow_linkedin WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '3'} AND durable_writes = true;
|
233
|
+
#
|
234
|
+
# CREATE TABLE polipus_queue_overflow_linkedin.linkedin_overflow (
|
235
|
+
# queue_name text,
|
236
|
+
# created_at timeuuid,
|
237
|
+
# payload text,
|
238
|
+
# PRIMARY KEY (queue_name, created_at)
|
239
|
+
# ) WITH CLUSTERING ORDER BY (created_at ASC)
|
240
|
+
# AND bloom_filter_fp_chance = 0.01
|
241
|
+
# AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
|
242
|
+
# AND comment = ''
|
243
|
+
# AND compaction = {'min_threshold': '4', 'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32'}
|
244
|
+
# AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
|
245
|
+
# AND dclocal_read_repair_chance = 0.1
|
246
|
+
# AND default_time_to_live = 0
|
247
|
+
# AND gc_grace_seconds = 864000
|
248
|
+
# AND max_index_interval = 2048
|
249
|
+
# AND memtable_flush_period_in_ms = 0
|
250
|
+
# AND min_index_interval = 128
|
251
|
+
# AND read_repair_chance = 0.0
|
252
|
+
# AND speculative_retry = '99.0PERCENTILE';
|
253
|
+
#
|
254
|
+
# This means that:
|
255
|
+
# - queue_name is partition key;
|
256
|
+
# - created_at is clustering key;
|
257
|
+
#
|
258
|
+
# With sample data:
|
259
|
+
#
|
260
|
+
# cqlsh> SELECT * FROM polipus_queue_overflow_linkedin.linkedin_overflow LIMIT 1 ;
|
261
|
+
#
|
262
|
+
# queue_name | created_at | payload
|
263
|
+
# ---------------------------------+--------------------------------------+---------------------------------------------------------------------------------+
|
264
|
+
# polipus_queue_overflow_linkedin | de17ece6-1e5e-11e5-b997-47a87c40c422 | "{\"url\":\"http://www.linkedin.com/in/foobar\",\"depth\":0,\"fetched\":false}"
|
265
|
+
#
|
266
|
+
# (1 rows)
|
267
|
+
# cqlsh>
|
268
|
+
#
|
269
|
+
def table!(properties = nil)
|
270
|
+
table_ = [keyspace, table].compact.join '.'
|
271
|
+
def_ = "CREATE TABLE IF NOT EXISTS #{table_}
|
272
|
+
(
|
273
|
+
queue_name TEXT,
|
274
|
+
created_at TIMEUUID,
|
275
|
+
payload TEXT,
|
276
|
+
PRIMARY KEY (queue_name, created_at)
|
277
|
+
)"
|
278
|
+
props = Array(properties).join(' AND ')
|
279
|
+
statement = props.empty? ? "#{def_};" : "#{def_} WITH #{props};"
|
280
|
+
session.execute(statement)
|
281
|
+
end
|
282
|
+
|
283
|
+
private
|
284
|
+
|
285
|
+
def options_are_valid?(options)
|
286
|
+
options.has_key?(:cluster) && options.has_key?(:keyspace) && options.has_key?(:table)
|
287
|
+
end
|
288
|
+
|
289
|
+
def limit_is_valid?(limit)
|
290
|
+
!limit.nil? && limit.respond_to?(:to_i) && limit.to_i > 0
|
291
|
+
end
|
292
|
+
|
293
|
+
# results.class => Cassandra::Results::Paged
|
294
|
+
def get(limit = 1)
|
295
|
+
# coerce to int if a TrueClass/FalseClass is given.
|
296
|
+
limit = 1 if [true, false].include?(limit)
|
297
|
+
|
298
|
+
raise ArgumentError.new("Invalid limit value: must be an INTEGER greater than 1 (got #{limit.inspect}).") unless limit_is_valid?(limit)
|
299
|
+
table_ = [keyspace, table].compact.join '.'
|
300
|
+
statement = "SELECT queue_name, created_at, payload FROM #{table_} LIMIT #{limit.to_i} ;"
|
301
|
+
@semaphore.synchronize do
|
302
|
+
return session.execute(session.prepare(statement), arguments: [])
|
303
|
+
end
|
304
|
+
end
|
305
|
+
end
|
306
|
+
end
|
307
|
+
end
|
@@ -0,0 +1,244 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'cassandra'
|
3
|
+
require 'multi_json'
|
4
|
+
require 'polipus'
|
5
|
+
require 'thread'
|
6
|
+
require 'zlib'
|
7
|
+
|
8
|
+
module Polipus
|
9
|
+
module Storage
|
10
|
+
class CassandraStore < Base
|
11
|
+
|
12
|
+
# CassandraStore wants to persist documents (please ignore the jargon
|
13
|
+
# inherited from MongoDB) like the following JSON-ish entry:
|
14
|
+
#
|
15
|
+
# > db['linkedin-refresh'].find({})
|
16
|
+
#
|
17
|
+
# {
|
18
|
+
# "_id" : ObjectId("...."),
|
19
|
+
# "url" : "https://www.awesome.org/meh",
|
20
|
+
# "code" : 200,
|
21
|
+
# "depth" : 0,
|
22
|
+
# "referer" : "",
|
23
|
+
# "redirect_to" : "",
|
24
|
+
# "response_time" : 1313,
|
25
|
+
# "fetched" : true,
|
26
|
+
# "user_data" :
|
27
|
+
# {
|
28
|
+
# "imported" : false,
|
29
|
+
# "is_developer" : false,
|
30
|
+
# "last_modified" : null
|
31
|
+
# },
|
32
|
+
# "fetched_at" : 1434977757,
|
33
|
+
# "error" : "",
|
34
|
+
# "uuid" : "4ddce293532ea2454356a4210e61c363"
|
35
|
+
# }
|
36
|
+
|
37
|
+
attr_accessor :cluster, :keyspace, :table
|
38
|
+
|
39
|
+
BINARY_FIELDS = %w(body headers user_data)
|
40
|
+
|
41
|
+
def initialize(options = {})
|
42
|
+
@cluster = options[:cluster]
|
43
|
+
@keyspace = options[:keyspace]
|
44
|
+
@table = options[:table]
|
45
|
+
@except = options[:except] || []
|
46
|
+
@semaphore = Mutex.new
|
47
|
+
end
|
48
|
+
|
49
|
+
# {
|
50
|
+
# 'url' => @url.to_s,
|
51
|
+
# 'headers' => Marshal.dump(@headers),
|
52
|
+
# 'body' => @body,
|
53
|
+
# 'links' => links.map(&:to_s),
|
54
|
+
# 'code' => @code,
|
55
|
+
# 'depth' => @depth,
|
56
|
+
# 'referer' => @referer.to_s,
|
57
|
+
# 'redirect_to' => @redirect_to.to_s,
|
58
|
+
# 'response_time' => @response_time,
|
59
|
+
# 'fetched' => @fetched,
|
60
|
+
# 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
|
61
|
+
# 'fetched_at' => @fetched_at,
|
62
|
+
# 'error' => @error.to_s
|
63
|
+
# }
|
64
|
+
|
65
|
+
def add(page)
|
66
|
+
@semaphore.synchronize do
|
67
|
+
table_ = [keyspace, table].compact.join '.'
|
68
|
+
uuid_ = uuid(page)
|
69
|
+
obj = page.to_hash
|
70
|
+
Array(@except).each { |e| obj.delete(e.to_s) }
|
71
|
+
|
72
|
+
begin
|
73
|
+
BINARY_FIELDS.each do |field|
|
74
|
+
obj[field] = obj[field].to_s.encode('UTF-8', {
|
75
|
+
invalid: :replace,
|
76
|
+
undef: :replace,
|
77
|
+
replace: '?' }) if can_be_converted?(obj[field])
|
78
|
+
# ec = Encoding::Converter.new("ASCII-8BIT", "UTF-8")
|
79
|
+
# obj[field] = ec.convert(obj[field]) if can_be_converted?(obj[field])
|
80
|
+
# obj[field] = obj[field].force_encoding('ASCII-8BIT').force_encoding('UTF-8') if can_be_converted?(obj[field])
|
81
|
+
end
|
82
|
+
|
83
|
+
json = MultiJson.encode(obj)
|
84
|
+
|
85
|
+
url = obj.fetch('url', nil)
|
86
|
+
code = obj.fetch('code', nil)
|
87
|
+
depth = obj.fetch('depth', nil)
|
88
|
+
referer = obj.fetch('referer', nil)
|
89
|
+
redirectto = obj.fetch('redirect_to', nil)
|
90
|
+
response_time = obj.fetch('response_time', nil)
|
91
|
+
fetched = obj.fetch('fetched', nil)
|
92
|
+
error = obj.fetch('error', nil)
|
93
|
+
page = Zlib::Deflate.deflate(json)
|
94
|
+
|
95
|
+
if obj.has_key?('user_data') && !obj['user_data'].empty?
|
96
|
+
user_data = MultiJson.encode(obj['user_data'])
|
97
|
+
else
|
98
|
+
user_data = nil
|
99
|
+
end
|
100
|
+
|
101
|
+
value = obj.fetch('fetched_at', nil)
|
102
|
+
fetched_at = case value
|
103
|
+
when Fixnum
|
104
|
+
Time.at(value)
|
105
|
+
when String
|
106
|
+
Time.parse(value)
|
107
|
+
else
|
108
|
+
nil
|
109
|
+
end
|
110
|
+
|
111
|
+
column_names = %w[ uuid url code depth referer redirect_to response_time fetched user_data fetched_at error page ]
|
112
|
+
values_placeholders = column_names.map{|_| '?'}.join(',')
|
113
|
+
statement = "INSERT INTO #{table_} ( #{column_names.join(',')} ) VALUES (#{values_placeholders});"
|
114
|
+
|
115
|
+
session.execute(
|
116
|
+
session.prepare(statement),
|
117
|
+
arguments: [
|
118
|
+
uuid_,
|
119
|
+
url,
|
120
|
+
code,
|
121
|
+
depth,
|
122
|
+
referer,
|
123
|
+
redirectto,
|
124
|
+
response_time,
|
125
|
+
fetched,
|
126
|
+
user_data,
|
127
|
+
fetched_at,
|
128
|
+
error,
|
129
|
+
page
|
130
|
+
])
|
131
|
+
|
132
|
+
rescue Encoding::UndefinedConversionError
|
133
|
+
puts $!.error_char.dump
|
134
|
+
puts $!.error_char.encoding
|
135
|
+
end
|
136
|
+
|
137
|
+
uuid_
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|
141
|
+
def clear
|
142
|
+
table_ = [keyspace, table].compact.join '.'
|
143
|
+
statement = "DROP TABLE #{table_};"
|
144
|
+
session.execute statement
|
145
|
+
end
|
146
|
+
|
147
|
+
# TBH I'm not sure if being "defensive" and returning 0/nil in case
|
148
|
+
# the results is_empty? ... I'm leaving (now) the code simple and noisy
|
149
|
+
# if something went wrong in the COUNT.
|
150
|
+
def count
|
151
|
+
table_ = [keyspace, table].compact.join '.'
|
152
|
+
statement = "SELECT COUNT (*) FROM #{table_} ;"
|
153
|
+
result = session.execute(statement)
|
154
|
+
result.first['count']
|
155
|
+
end
|
156
|
+
|
157
|
+
def each
|
158
|
+
table_ = [keyspace, table].compact.join '.'
|
159
|
+
statement = "SELECT * FROM #{table_};"
|
160
|
+
session.execute(statement).each do |data|
|
161
|
+
page = load_page(data) unless data.nil?
|
162
|
+
yield data['uuid'], page
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
def exists?(page)
|
167
|
+
@semaphore.synchronize do
|
168
|
+
table_ = [keyspace, table].compact.join '.'
|
169
|
+
statement = "SELECT uuid FROM #{table_} WHERE uuid = ? LIMIT 1;"
|
170
|
+
results = session.execute(session.prepare(statement),
|
171
|
+
arguments: [uuid(page)])
|
172
|
+
!results.first.nil?
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
def get(page)
|
177
|
+
@semaphore.synchronize do
|
178
|
+
table_ = [keyspace, table].compact.join '.'
|
179
|
+
statement = "SELECT * FROM #{table_} WHERE uuid = ? LIMIT 1;"
|
180
|
+
results = session.execute(session.prepare(statement),
|
181
|
+
arguments: [uuid(page)])
|
182
|
+
data = results.first
|
183
|
+
load_page(data) unless data.nil?
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
def keyspace!(replication = nil, durable_writes = true)
|
188
|
+
replication ||= "{'class': 'SimpleStrategy', 'replication_factor': '3'}"
|
189
|
+
statement = "CREATE KEYSPACE IF NOT EXISTS #{keyspace} WITH replication = #{replication} AND durable_writes = #{durable_writes};"
|
190
|
+
cluster.connect.execute statement
|
191
|
+
end
|
192
|
+
|
193
|
+
def remove(page)
|
194
|
+
@semaphore.synchronize do
|
195
|
+
table_ = [keyspace, table].compact.join '.'
|
196
|
+
statement = "DELETE FROM #{table_} WHERE uuid = ?;"
|
197
|
+
session.execute(session.prepare(statement),
|
198
|
+
arguments: [uuid(page)])
|
199
|
+
true
|
200
|
+
end
|
201
|
+
end
|
202
|
+
|
203
|
+
def session
|
204
|
+
@session ||= @cluster.connect(keyspace)
|
205
|
+
end
|
206
|
+
|
207
|
+
def table!(properties = nil)
|
208
|
+
table_ = [keyspace, table].compact.join '.'
|
209
|
+
def_ = "CREATE TABLE IF NOT EXISTS #{table_}
|
210
|
+
(
|
211
|
+
uuid TEXT PRIMARY KEY,
|
212
|
+
url TEXT,
|
213
|
+
code INT,
|
214
|
+
depth INT,
|
215
|
+
referer TEXT,
|
216
|
+
redirect_to TEXT,
|
217
|
+
response_time BIGINT,
|
218
|
+
fetched BOOLEAN,
|
219
|
+
user_data TEXT,
|
220
|
+
fetched_at TIMESTAMP,
|
221
|
+
error TEXT,
|
222
|
+
page BLOB
|
223
|
+
)"
|
224
|
+
props = properties.to_a.join(' AND ')
|
225
|
+
statement = props.empty? ? "#{def_};" : "#{def_} WITH #{props};"
|
226
|
+
session.execute statement
|
227
|
+
end
|
228
|
+
|
229
|
+
def load_page(data)
|
230
|
+
json = Zlib::Inflate.inflate(data['page'])
|
231
|
+
hash = MultiJson.decode(json)
|
232
|
+
page = Page.from_hash(hash)
|
233
|
+
page.fetched_at = 0 if page.fetched_at.nil?
|
234
|
+
page
|
235
|
+
end
|
236
|
+
|
237
|
+
private
|
238
|
+
|
239
|
+
def can_be_converted?(field)
|
240
|
+
!field.nil? && field.is_a?(String) && !field.empty?
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
244
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = 'polipus-cassandra'
|
7
|
+
spec.version = '0.1.3'
|
8
|
+
spec.authors = ['Stefano Fontanelli', 'Edoardo Rossi']
|
9
|
+
spec.email = ['s.fontanelli@gmail.com', 'edoardo@gild.com']
|
10
|
+
spec.summary = 'Add support for Cassandra in Polipus crawler'
|
11
|
+
spec.description = 'Add support for Cassandra in Polipus crawler'
|
12
|
+
spec.homepage = 'https://github.com/stefanofontanelli/polipus-cassandra'
|
13
|
+
spec.license = 'MIT'
|
14
|
+
|
15
|
+
spec.files = `git ls-files -z`.split("\x0")
|
16
|
+
spec.executables = spec.files.grep(/^bin\//) { |f| File.basename(f) }
|
17
|
+
spec.test_files = spec.files.grep(/^(test|spec|features)\//)
|
18
|
+
spec.require_paths = ['lib']
|
19
|
+
|
20
|
+
spec.add_runtime_dependency 'cassandra-driver', '~> 2.0.1', '>= 2.0.1'
|
21
|
+
spec.add_runtime_dependency 'multi_json', '~> 1.11.0', '>= 1.11.0'
|
22
|
+
spec.add_runtime_dependency 'polipus', '~> 0.3', '>= 0.3.0'
|
23
|
+
|
24
|
+
spec.add_development_dependency 'rake', '~> 10.3'
|
25
|
+
spec.add_development_dependency 'rspec', '~> 3.1.0'
|
26
|
+
spec.add_development_dependency 'flexmock', '~> 1.3'
|
27
|
+
spec.add_development_dependency 'vcr', '~> 2.9.0'
|
28
|
+
spec.add_development_dependency 'webmock', '~> 1.20.0'
|
29
|
+
spec.add_development_dependency 'coveralls'
|
30
|
+
end
|
@@ -0,0 +1,174 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'cassandra'
|
3
|
+
require 'logger'
|
4
|
+
require 'polipus-cassandra'
|
5
|
+
require 'spec_helper'
|
6
|
+
|
7
|
+
describe Polipus::Storage::CassandraStore do
|
8
|
+
before(:all)do
|
9
|
+
@logger = Logger.new(STDOUT).tap { |logger| logger.level = Logger::WARN }
|
10
|
+
@cluster = Cassandra.cluster hosts: ['127.0.0.1'], logger: @logger
|
11
|
+
@keyspace = 'polipus_cassandra_test'
|
12
|
+
@table = 'cassandra_store_test'
|
13
|
+
@storage = Polipus::Storage::CassandraStore.new(
|
14
|
+
cluster: @cluster,
|
15
|
+
keyspace: @keyspace,
|
16
|
+
table: @table,
|
17
|
+
)
|
18
|
+
|
19
|
+
@storage.keyspace!
|
20
|
+
@storage.table!
|
21
|
+
|
22
|
+
@storage_without_code_and_body = Polipus::Storage::CassandraStore.new(
|
23
|
+
cluster: @cluster,
|
24
|
+
keyspace: @keyspace,
|
25
|
+
table: @table,
|
26
|
+
except: ['code', 'body']
|
27
|
+
)
|
28
|
+
end
|
29
|
+
|
30
|
+
after(:all) do
|
31
|
+
@storage.clear
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should store a page' do
|
35
|
+
p = page_factory 'http://www.google.com'
|
36
|
+
uuid = @storage.add p
|
37
|
+
expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
|
38
|
+
p = @storage.get p
|
39
|
+
expect(p.url.to_s).to eq('http://www.google.com')
|
40
|
+
expect(p.body).to eq('<html></html>')
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should store all the relevant data from the page' do
|
44
|
+
url = "http://www.duckduckgo.com"
|
45
|
+
referer = "http://www.actually.nowhere.com"
|
46
|
+
redirectto = "#{url}/your_super_awesome_results?page=42"
|
47
|
+
now = Time.now.to_i
|
48
|
+
p = page_factory(
|
49
|
+
url,
|
50
|
+
{
|
51
|
+
referer: referer,
|
52
|
+
redirect_to: redirectto,
|
53
|
+
fetched_at: now
|
54
|
+
})
|
55
|
+
uuid = @storage.add p
|
56
|
+
expect(uuid).to eq('3cd657f53c74f22c1a21b420ce3863fd')
|
57
|
+
p = @storage.get p
|
58
|
+
|
59
|
+
expect(p.url.to_s).to eq(url)
|
60
|
+
expect(p.referer.to_s).to eq(referer)
|
61
|
+
expect(p.redirect_to.to_s).to eq(redirectto)
|
62
|
+
expect(p.fetched_at).to eq(now)
|
63
|
+
expect(p.body).to eq('<html></html>')
|
64
|
+
|
65
|
+
# for the sake of the other tests...
|
66
|
+
expect(@storage.remove(p)).to be_truthy
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'should update a page' do
|
70
|
+
p = page_factory 'http://www.google.com', code: 301
|
71
|
+
@storage.add p
|
72
|
+
p = @storage.get p
|
73
|
+
expect(p.code).to eq(301)
|
74
|
+
end
|
75
|
+
|
76
|
+
it 'should iterate over stored pages' do
|
77
|
+
@storage.each do |k, page|
|
78
|
+
expect(k).to eq('ed646a3334ca891fd3467db131372140')
|
79
|
+
expect(page.url.to_s).to eq('http://www.google.com')
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
it 'should delete a page' do
|
84
|
+
p = page_factory 'http://www.google.com', code: 301
|
85
|
+
@storage.remove p
|
86
|
+
expect(@storage.get(p)).to be_nil
|
87
|
+
end
|
88
|
+
|
89
|
+
it 'should store a page removing a query string from the uuid generation' do
|
90
|
+
p = page_factory 'http://www.asd.com/?asd=lol'
|
91
|
+
p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1'
|
92
|
+
@storage.include_query_string_in_uuid = false
|
93
|
+
@storage.add p
|
94
|
+
expect(@storage.exists?(p_no_query)).to be_truthy
|
95
|
+
@storage.remove p
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'should store a page removing a query string from the uuid generation no ending slash' do
|
99
|
+
p = page_factory 'http://www.asd.com?asd=lol'
|
100
|
+
p_no_query = page_factory 'http://www.asd.com'
|
101
|
+
@storage.include_query_string_in_uuid = false
|
102
|
+
@storage.add p
|
103
|
+
expect(@storage.exists?(p_no_query)).to be_truthy
|
104
|
+
@storage.remove p
|
105
|
+
end
|
106
|
+
|
107
|
+
it 'should store a page with user data associated' do
|
108
|
+
p = page_factory 'http://www.user.com'
|
109
|
+
p.user_data.name = 'Test User Data'
|
110
|
+
@storage.add p
|
111
|
+
expect(@storage.exists?(p)).to be_truthy
|
112
|
+
p = @storage.get(p)
|
113
|
+
expect(p.user_data.name).to eq('Test User Data')
|
114
|
+
@storage.remove p
|
115
|
+
end
|
116
|
+
|
117
|
+
it 'should honor the except parameters' do
|
118
|
+
pag = page_factory 'http://www.user-doo.com'
|
119
|
+
expect(pag.code).to eq(200)
|
120
|
+
expect(pag.body).to eq('<html></html>')
|
121
|
+
|
122
|
+
@storage_without_code_and_body.add(pag)
|
123
|
+
pag = @storage_without_code_and_body.get(pag)
|
124
|
+
|
125
|
+
expect(pag.body).to be_nil
|
126
|
+
expect(pag.code).to eq(0)
|
127
|
+
@storage_without_code_and_body.remove(pag)
|
128
|
+
end
|
129
|
+
|
130
|
+
it 'should return false if a doc not exists' do
|
131
|
+
@storage.include_query_string_in_uuid = false
|
132
|
+
p_other = page_factory 'http://www.asdrrrr.com'
|
133
|
+
expect(@storage.exists?(p_other)).to be_falsey
|
134
|
+
@storage.add p_other
|
135
|
+
expect(@storage.exists?(p_other)).to be_truthy
|
136
|
+
p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol'
|
137
|
+
expect(@storage.exists?(p_other)).to be_truthy
|
138
|
+
@storage.include_query_string_in_uuid = true
|
139
|
+
expect(@storage.exists?(p_other)).to be_falsey
|
140
|
+
@storage.include_query_string_in_uuid = false
|
141
|
+
@storage.remove p_other
|
142
|
+
end
|
143
|
+
|
144
|
+
it 'should set page.fetched_at based on the id creation' do
|
145
|
+
p = page_factory 'http://www.user-doojo.com'
|
146
|
+
@storage.add p
|
147
|
+
expect(p.fetched_at).to be_nil
|
148
|
+
p = @storage.get p
|
149
|
+
expect(p.fetched_at).not_to be_nil
|
150
|
+
@storage.remove p
|
151
|
+
end
|
152
|
+
|
153
|
+
it 'should NOT set page.fetched_at if already present' do
|
154
|
+
p = page_factory 'http://www.user-doojooo.com'
|
155
|
+
p.fetched_at = 10
|
156
|
+
@storage.add p
|
157
|
+
p = @storage.get p
|
158
|
+
expect(p.fetched_at).to be 10
|
159
|
+
@storage.remove p
|
160
|
+
end
|
161
|
+
|
162
|
+
it 'should store two pages and the count will be two' do
|
163
|
+
pages = ['http://www.google.com', 'http://www.duckduckgo.com'].map do |url|
|
164
|
+
page_factory(url).tap do |page|
|
165
|
+
@storage.add(page)
|
166
|
+
end
|
167
|
+
end
|
168
|
+
expect(@storage.count).to be 2
|
169
|
+
pages.each do |page|
|
170
|
+
@storage.remove(page)
|
171
|
+
end
|
172
|
+
expect(@storage.count).to be 0
|
173
|
+
end
|
174
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# Require this file using `require "spec_helper"`
|
2
|
+
# to ensure that it is only loaded once.
|
3
|
+
#
|
4
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
5
|
+
require 'digest/md5'
|
6
|
+
require 'coveralls'
|
7
|
+
require 'vcr'
|
8
|
+
require 'webmock/rspec'
|
9
|
+
|
10
|
+
Coveralls.wear!
|
11
|
+
|
12
|
+
VCR.configure do |c|
|
13
|
+
c.cassette_library_dir = "#{File.dirname(__FILE__)}/cassettes"
|
14
|
+
c.hook_into :webmock
|
15
|
+
end
|
16
|
+
|
17
|
+
require 'polipus'
|
18
|
+
|
19
|
+
RSpec.configure do |config|
|
20
|
+
config.run_all_when_everything_filtered = true
|
21
|
+
config.filter_run :focus
|
22
|
+
|
23
|
+
# Run specs in random order to surface order dependencies. If you find an
|
24
|
+
# order dependency and want to debug it, you can fix the order by providing
|
25
|
+
# the seed, which is printed after each run.
|
26
|
+
# --seed 1234
|
27
|
+
config.order = 'random'
|
28
|
+
config.mock_with :flexmock
|
29
|
+
config.around(:each) do |example|
|
30
|
+
t = Time.now
|
31
|
+
print example.metadata[:full_description]
|
32
|
+
VCR.use_cassette(Digest::MD5.hexdigest(example.metadata[:full_description])) do
|
33
|
+
example.run
|
34
|
+
puts " [#{Time.now - t}s]"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
config.before(:each) { Polipus::SignalHandler.disable }
|
38
|
+
end
|
39
|
+
|
40
|
+
def page_factory(url, params = {})
|
41
|
+
params[:code] = 200 unless params.has_key?(:code)
|
42
|
+
params[:body] = '<html></html>' unless params.has_key?(:body)
|
43
|
+
Polipus::Page.new url, params
|
44
|
+
end
|
metadata
CHANGED
@@ -1,18 +1,65 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polipus-cassandra
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.1.3
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Stefano Fontanelli
|
9
|
+
- Edoardo Rossi
|
8
10
|
autorequire:
|
9
11
|
bindir: bin
|
10
12
|
cert_chain: []
|
11
|
-
date: 2015-
|
13
|
+
date: 2015-07-13 00:00:00.000000000 Z
|
12
14
|
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: cassandra-driver
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 2.0.1
|
23
|
+
- - ! '>='
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: 2.0.1
|
26
|
+
type: :runtime
|
27
|
+
prerelease: false
|
28
|
+
version_requirements: !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 2.0.1
|
34
|
+
- - ! '>='
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: 2.0.1
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: multi_json
|
39
|
+
requirement: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ~>
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: 1.11.0
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.11.0
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ~>
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: 1.11.0
|
56
|
+
- - ! '>='
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: 1.11.0
|
13
59
|
- !ruby/object:Gem::Dependency
|
14
60
|
name: polipus
|
15
61
|
requirement: !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
16
63
|
requirements:
|
17
64
|
- - ~>
|
18
65
|
- !ruby/object:Gem::Version
|
@@ -23,6 +70,7 @@ dependencies:
|
|
23
70
|
type: :runtime
|
24
71
|
prerelease: false
|
25
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
26
74
|
requirements:
|
27
75
|
- - ~>
|
28
76
|
- !ruby/object:Gem::Version
|
@@ -31,28 +79,89 @@ dependencies:
|
|
31
79
|
- !ruby/object:Gem::Version
|
32
80
|
version: 0.3.0
|
33
81
|
- !ruby/object:Gem::Dependency
|
34
|
-
name:
|
82
|
+
name: rake
|
35
83
|
requirement: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
36
85
|
requirements:
|
37
86
|
- - ~>
|
38
87
|
- !ruby/object:Gem::Version
|
39
|
-
version:
|
40
|
-
|
88
|
+
version: '10.3'
|
89
|
+
type: :development
|
90
|
+
prerelease: false
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ~>
|
41
95
|
- !ruby/object:Gem::Version
|
42
|
-
version:
|
43
|
-
|
96
|
+
version: '10.3'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
none: false
|
101
|
+
requirements:
|
102
|
+
- - ~>
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: 3.1.0
|
105
|
+
type: :development
|
44
106
|
prerelease: false
|
45
107
|
version_requirements: !ruby/object:Gem::Requirement
|
108
|
+
none: false
|
46
109
|
requirements:
|
47
110
|
- - ~>
|
48
111
|
- !ruby/object:Gem::Version
|
49
|
-
version:
|
50
|
-
|
112
|
+
version: 3.1.0
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: flexmock
|
115
|
+
requirement: !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ~>
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '1.3'
|
121
|
+
type: :development
|
122
|
+
prerelease: false
|
123
|
+
version_requirements: !ruby/object:Gem::Requirement
|
124
|
+
none: false
|
125
|
+
requirements:
|
126
|
+
- - ~>
|
51
127
|
- !ruby/object:Gem::Version
|
52
|
-
version:
|
128
|
+
version: '1.3'
|
53
129
|
- !ruby/object:Gem::Dependency
|
54
|
-
name:
|
130
|
+
name: vcr
|
131
|
+
requirement: !ruby/object:Gem::Requirement
|
132
|
+
none: false
|
133
|
+
requirements:
|
134
|
+
- - ~>
|
135
|
+
- !ruby/object:Gem::Version
|
136
|
+
version: 2.9.0
|
137
|
+
type: :development
|
138
|
+
prerelease: false
|
139
|
+
version_requirements: !ruby/object:Gem::Requirement
|
140
|
+
none: false
|
141
|
+
requirements:
|
142
|
+
- - ~>
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: 2.9.0
|
145
|
+
- !ruby/object:Gem::Dependency
|
146
|
+
name: webmock
|
147
|
+
requirement: !ruby/object:Gem::Requirement
|
148
|
+
none: false
|
149
|
+
requirements:
|
150
|
+
- - ~>
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 1.20.0
|
153
|
+
type: :development
|
154
|
+
prerelease: false
|
155
|
+
version_requirements: !ruby/object:Gem::Requirement
|
156
|
+
none: false
|
157
|
+
requirements:
|
158
|
+
- - ~>
|
159
|
+
- !ruby/object:Gem::Version
|
160
|
+
version: 1.20.0
|
161
|
+
- !ruby/object:Gem::Dependency
|
162
|
+
name: coveralls
|
55
163
|
requirement: !ruby/object:Gem::Requirement
|
164
|
+
none: false
|
56
165
|
requirements:
|
57
166
|
- - ! '>='
|
58
167
|
- !ruby/object:Gem::Version
|
@@ -60,6 +169,7 @@ dependencies:
|
|
60
169
|
type: :development
|
61
170
|
prerelease: false
|
62
171
|
version_requirements: !ruby/object:Gem::Requirement
|
172
|
+
none: false
|
63
173
|
requirements:
|
64
174
|
- - ! '>='
|
65
175
|
- !ruby/object:Gem::Version
|
@@ -67,33 +177,51 @@ dependencies:
|
|
67
177
|
description: Add support for Cassandra in Polipus crawler
|
68
178
|
email:
|
69
179
|
- s.fontanelli@gmail.com
|
180
|
+
- edoardo@gild.com
|
70
181
|
executables: []
|
71
182
|
extensions: []
|
72
183
|
extra_rdoc_files: []
|
73
|
-
files:
|
184
|
+
files:
|
185
|
+
- .gitignore
|
186
|
+
- .rspec
|
187
|
+
- .ruby-version
|
188
|
+
- Gemfile
|
189
|
+
- LICENSE.txt
|
190
|
+
- README.md
|
191
|
+
- Rakefile
|
192
|
+
- lib/polipus-cassandra.rb
|
193
|
+
- lib/polipus-cassandra/policies/default.rb
|
194
|
+
- lib/polipus-cassandra/policies/policies.rb
|
195
|
+
- lib/polipus-cassandra/queue_overflow/cassandra_queue.rb
|
196
|
+
- lib/polipus-cassandra/storage/cassandra_store.rb
|
197
|
+
- polipus-cassandra.gemspec
|
198
|
+
- spec/polipus-cassandra/storage/cassandra_store_spec.rb
|
199
|
+
- spec/spec_helper.rb
|
74
200
|
homepage: https://github.com/stefanofontanelli/polipus-cassandra
|
75
201
|
licenses:
|
76
202
|
- MIT
|
77
|
-
metadata: {}
|
78
203
|
post_install_message:
|
79
204
|
rdoc_options: []
|
80
205
|
require_paths:
|
81
206
|
- lib
|
82
207
|
required_ruby_version: !ruby/object:Gem::Requirement
|
208
|
+
none: false
|
83
209
|
requirements:
|
84
210
|
- - ! '>='
|
85
211
|
- !ruby/object:Gem::Version
|
86
212
|
version: '0'
|
87
213
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
214
|
+
none: false
|
88
215
|
requirements:
|
89
216
|
- - ! '>='
|
90
217
|
- !ruby/object:Gem::Version
|
91
218
|
version: '0'
|
92
219
|
requirements: []
|
93
220
|
rubyforge_project:
|
94
|
-
rubygems_version:
|
221
|
+
rubygems_version: 1.8.23.2
|
95
222
|
signing_key:
|
96
|
-
specification_version:
|
223
|
+
specification_version: 3
|
97
224
|
summary: Add support for Cassandra in Polipus crawler
|
98
|
-
test_files:
|
99
|
-
|
225
|
+
test_files:
|
226
|
+
- spec/polipus-cassandra/storage/cassandra_store_spec.rb
|
227
|
+
- spec/spec_helper.rb
|
checksums.yaml
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
---
|
2
|
-
!binary "U0hBMQ==":
|
3
|
-
metadata.gz: !binary |-
|
4
|
-
NzYzOTkyMmJmOThiZTA5ZmQwNDJkZDM3MjA2NTk3MWQwMTcxMjEyMg==
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
Njg3ODc1ZmJiYTBkZmQ3NTY2MThjOGQ4Yjg0ZDFlZjcyOTUzMDI4MA==
|
7
|
-
SHA512:
|
8
|
-
metadata.gz: !binary |-
|
9
|
-
OWZiMWEwOTU1NTZlMDNhODNkZTdkZmY5MmIyMDc2YTBmZWVmMTI0MjU3ZWNm
|
10
|
-
YzcyMGQ0NDQyOTc0MGIxOTE1YjJjOTk5MjYyYjg4NDJkOTQ5NjI1NWIyMzk4
|
11
|
-
MzExOGJlNjM3MGEyNzFlZGIxNTkxYzlkMDQ0NjJhMDQ4OGQ3NDk=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
NTM0MWI0M2Q2ZTlkMjcxMGQ1ZTkwYTBjY2M2NTRmNjcyYWM2Nzc0ZjExNzQ5
|
14
|
-
MGZhNmZmYTExMzgzMGMyYmQ0ZTYyMDYzZmQ1MjE2YjM2MDI3NTQzNDlmZDBk
|
15
|
-
NmZmN2M1NjY5NGZjY2QyMzk1MjRhZjBlMWUwY2FhZmU0MDdhMWE=
|