polipus-cassandra 0.0.1 → 0.1.3

data/.gitignore ADDED
@@ -0,0 +1,35 @@
+ *.gem
+ *.rbc
+ /.config
+ /coverage/
+ /InstalledFiles
+ /pkg/
+ /spec/reports/
+ /test/tmp/
+ /test/version_tmp/
+ /tmp/
+
+ ## Specific to RubyMotion:
+ .dat*
+ .repl_history
+ build/
+
+ ## Documentation cache and generated files:
+ /.yardoc/
+ /_yardoc/
+ /doc/
+ /rdoc/
+
+ ## Environment normalisation:
+ /.bundle/
+ /vendor/bundle
+ /lib/bundler/man/
+
+ # for a library or gem, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ Gemfile.lock
+ # .ruby-version
+ # .ruby-gemset
+
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
+ .rvmrc
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --format progress
data/.ruby-version ADDED
@@ -0,0 +1 @@
+ 1.9.3-p551
data/Gemfile ADDED
@@ -0,0 +1,2 @@
+ source 'https://rubygems.org'
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2015 Stefano Fontanelli
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # Polipus: addons for Cassandra
+
+ TODO: Write a gem description
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ gem 'polipus-cassandra'
+
+ And then execute:
+
+ $ bundle
+
+ Or install it yourself as:
+
+ $ gem install polipus-cassandra
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it ( http://github.com/<my-github-username>/polipus-cassandra/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
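Usage is still a TODO in the README above; as a rough, non-authoritative sketch, the two classes added by this gem could be wired into a Polipus crawler roughly as below. The constructor arguments mirror the specs later in this diff, while the crawler option names (`storage`, `queue_overflow_adapter`), the keyspace/table names, and the seed URL are assumptions to double-check against the Polipus documentation.

    require 'cassandra'
    require 'polipus'
    require 'polipus-cassandra'

    cluster = Cassandra.cluster(hosts: ['127.0.0.1'])

    # Page storage backed by Cassandra; keyspace and table are created on demand.
    storage = Polipus::Storage::CassandraStore.new(
      cluster: cluster,
      keyspace: 'polipus_crawler',
      table: 'pages'
    )
    storage.keyspace!
    storage.table!

    # Overflow queue backed by Cassandra; hyphens in keyspace/table names are
    # replaced with underscores by the class itself.
    overflow = Polipus::QueueOverflow::CassandraQueue.new(
      cluster: cluster,
      keyspace: 'polipus_crawler',
      table: 'queue_overflow'
    )
    overflow.keyspace!
    overflow.table!

    Polipus.crawler('my-crawler', 'https://www.example.com/',
                    storage: storage,
                    queue_overflow_adapter: overflow) do |crawler|
      crawler.on_page_downloaded do |page|
        puts "Fetched #{page.url}"
      end
    end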
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ # coding: utf-8
+ require 'bundler/gem_tasks'
data/lib/polipus-cassandra.rb ADDED
@@ -0,0 +1,5 @@
+ # encoding: UTF-8
+
+ require 'polipus-cassandra/policies/policies'
+ require 'polipus-cassandra/queue_overflow/cassandra_queue'
+ require 'polipus-cassandra/storage/cassandra_store'
data/lib/polipus-cassandra/policies/default.rb ADDED
@@ -0,0 +1,30 @@
+ # encoding: utf-8
+ require 'cassandra'
+
+ module Corm
+ module Retry
+ module Policies
+ class Default
+ include Cassandra::Retry::Policy
+
+ def read_timeout(_statement, consistency, _required, _received, retrieved, retries)
+ return reraise if retries >= 5
+ sleep(retries.to_f + Random.rand(0.0..1.0))
+ retrieved ? reraise : try_again(consistency)
+ end
+
+ def write_timeout(_statement, consistency, _type, _required, _received, retries)
+ return reraise if retries >= 5
+ sleep(retries.to_f + Random.rand(0.0..1.0))
+ try_again(consistency)
+ end
+
+ def unavailable(_statement, consistency, _required, _alive, retries)
+ return reraise if retries >= 5
+ sleep(retries.to_f + Random.rand(0.0..1.0))
+ try_again(consistency)
+ end
+ end
+ end
+ end
+ end
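A note on wiring: `Corm::Retry::Policies::Default` includes `Cassandra::Retry::Policy` from the cassandra-driver gem, so it is meant to be handed to the driver when the cluster is built; nothing else in this diff applies it automatically. A minimal sketch (the host list is an assumption):

    require 'cassandra'
    require 'polipus-cassandra'

    # Retries up to five times, sleeping with a small random jitter in between.
    cluster = Cassandra.cluster(
      hosts: ['127.0.0.1'],
      retry_policy: Corm::Retry::Policies::Default.new
    )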
data/lib/polipus-cassandra/policies/policies.rb ADDED
@@ -0,0 +1 @@
+ require 'polipus-cassandra/policies/default'
data/lib/polipus-cassandra/queue_overflow/cassandra_queue.rb ADDED
@@ -0,0 +1,307 @@
+ # encoding: UTF-8
+ require 'cassandra'
+ require 'polipus'
+
+ module Polipus
+ module QueueOverflow
+ class CassandraQueue
+
+ # CassandraQueue wants to persist documents (please, still ignore the
+ # jargon inherited from Mongo) like the following JSON-ish entry.
+ #
+ # There is no superclass here, but I have in mind the interface implicitly
+ # defined by Polipus::QueueOverflow::DevNullQueue which, more or less, has:
+ #
+ # def initialize
+ # def length
+ # def empty?
+ # def clear
+ # def push(_data)
+ # def pop(_ = false)
+ #
+ # Taking some data from our backend.production.*****.com/polipus
+ # I found:
+ #
+ # mongos> db.getCollectionNames()
+ # [
+ # "data-com-companies",
+ # "data_com_companies",
+ # "googleplus",
+ # "linkedin",
+ # "linkedin-companies",
+ # "linkedin_companies_parsed",
+ # "linkedin_jobs",
+ # "linkedin_jobs_parsed",
+ # "linkedin_pages_errors",
+ # "polipus_q_overflow_data-com-companies_queue_overflow",
+ # "polipus_q_overflow_data_com_companies_queue_overflow",
+ # "polipus_q_overflow_googleplus_queue_overflow",
+ # "polipus_q_overflow_linkedin-companies_queue_overflow",
+ # "polipus_q_overflow_linkedin_jobs_queue_overflow",
+ # "polipus_q_overflow_linkedin_jobs_queue_overflow_old",
+ # "polipus_q_overflow_linkedin_refresh_queue_overflow",
+ # "system.indexes"
+ # ]
+ #
+ # mongos> db.getCollection("polipus_q_overflow_linkedin_jobs_queue_overflow").find().limit(1)
+ # {
+ # "_id" : ObjectId("54506b98e3d55b20c40b32d3"),
+ # "payload" : "{\"url\":\"https://www.linkedin.com/job/product-designer-jobs/?page_num=7&trk=jserp_pagination_next\",\"depth\":6,\"referer\":\"https://www.linkedin.com/job/product-designer-jobs/?page_num=6&trk=jserp_pagination_6\",\"fetched\":false}"
+ # }
+ #
+ # mongos> db.polipus_q_overflow_linkedin_refresh_queue_overflow.find().limit(10)
+ # {
+ # "_id" : ObjectId("544072b6e3d55b0db7000001"),
+ # "payload" : "{\"url\":\"http://www.linkedin.com/in/*****\",\"depth\":0,\"fetched\":false}"
+ # }
+ #
+ # We also assume this MonkeyPatch:
+ # Polipus::QueueOverflow.cassandra_queue(namespace, options = {})
+ # that returns instances of this class.
+
+ attr_accessor :cluster, :keyspace, :table
+
+ # There is a validation enforced on `:keyspace` and `:table` because
+ # Cassandra is not happy when a keyspace or a table name contains a
+ # hyphen.
+ def initialize(options = {})
+ raise ArgumentError unless options_are_valid?(options)
+ @cluster = options[:cluster]
+ @keyspace = options[:keyspace].gsub("-", "_")
+ @table = options[:table].gsub("-", "_")
+ @semaphore = Mutex.new
+ @options = options
+ @timeuuid_generator = Cassandra::Uuid::Generator.new
+ @logger = @options[:logger] ||= Logger.new(STDOUT).tap { |l| l.level = Logger::INFO }
+ end
+
+ # Length aka Size aka Count is supported in Cassandra... just like in
+ # plain old SQL, you can COUNT:
+ #
+ # SELECT COUNT (*) FROM keyspace.table_name;
+ #
+ # TBH I'm not sure whether to be "defensive" and return 0/nil when the
+ # result is empty... for now I'm leaving the code simple and noisy if
+ # something goes wrong in the COUNT.
+ def length
+ table_ = [keyspace, table].compact.join '.'
+ statement = "SELECT COUNT (*) FROM #{table_} ;"
+ result = session.execute(statement)
+ result.first['count']
+ end
+
+ # Return true if the table has no rows.
+ # This is achieved with a 'SELECT ... LIMIT 1' query.
+ def empty?
+ return get.first.nil?
+ end
+
+ # Clear is a fancy name for a DROP TABLE IF EXISTS <table_>.
+ def clear
+ table_ = [keyspace, table].compact.join '.'
+ statement = "DROP TABLE IF EXISTS #{table_} ;"
+ session.execute(statement)
+ end
+
+ # push is the "write into Cassandra" method.
+ def push(data)
+ return nil if data.nil?
+ obj = MultiJson.decode(data)
+
+ table_ = [keyspace, table].compact.join('.')
+ queue_name = @keyspace
+ created_at = @timeuuid_generator.now
+
+ begin
+ @semaphore.synchronize do
+
+ if obj.has_key?('payload') && !obj['payload'].empty?
+ payload = MultiJson.encode(obj['payload'])
+ else
+ payload = nil
+ end
+
+ column_names = %w[ queue_name created_at payload ]
+ values_placeholders = column_names.map{|_| '?'}.join(',')
+ statement = "INSERT INTO #{table_} ( #{column_names.join(',')} ) VALUES (#{values_placeholders});"
+
+ session.execute(
+ session.prepare(statement),
+ arguments: [
+ queue_name,
+ created_at,
+ payload
+ ])
+ end
+ rescue Encoding::UndefinedConversionError
+ puts $!.error_char.dump
+ puts $!.error_char.encoding
+ end
+
+ @logger.debug { "Writing this entry [#{[queue_name, created_at].to_s}]" }
+ [queue_name, created_at].to_s
+ end
+
+ # Pop removes 'n' entries from the overflow table (treated as a queue)
+ # and returns a paged result.
+ # results.class #=> Cassandra::Results::Paged
+ #
+ # Polipus expects a String, which will be JSON-parsed in order to rebuild
+ # the queued message.
+ def pop(n = 1)
+ # A recap: pop should remove the oldest N messages and return them to the caller.
+ #
+ # Let's see how this queue is implemented.
+ # In redis, messages are LPUSH-ed:
+ #
+ # 4 - 3 - 2 - 1 --> REDIS
+ # 4 - 3 - 2 --> REDIS
+ # 4 - 3 --> REDIS
+ # 4 --> REDIS
+ #
+ # Then, in the fast_dequeue, they are RPOP-ped:
+ #
+ # REDIS --> 1
+ # REDIS --> 2 - 1
+ # REDIS --> 3 - 2 - 1
+ # REDIS --> 4 - 3 - 2 - 1
+ #
+ # Then, they are received in this order:
+ # [1] -> TimeUUID(1) = ...
+ # [2] -> TimeUUID(1) = ...
+ # [3] -> TimeUUID(1) = ...
+ # [4] -> TimeUUID(1) = ...
+ #
+ # As you can see below, rows are ORDER BY (created_at ASC)... that means
+ # "oldest first". When using 'LIMIT n' in a query, you get the 'n'
+ # oldest entries.
+ #
+ # cqlsh> SELECT * FROM polipus_queue_overflow_linkedin.linkedin_overflow ;
+ #
+ # queue_name | created_at | payload
+ # ---------------------------------+--------------------------------------+---------
+ # polipus_queue_overflow_linkedin | 4632d49c-1c04-11e5-844b-0b314c777502 | "1"
+ # polipus_queue_overflow_linkedin | 46339f8a-1c04-11e5-844b-0b314c777502 | "2"
+ # polipus_queue_overflow_linkedin | 46349962-1c04-11e5-844b-0b314c777502 | "3"
+ # polipus_queue_overflow_linkedin | 46351860-1c04-11e5-844b-0b314c777502 | "4"
+ #
+ # (4 rows)
+ # cqlsh> SELECT * FROM polipus_queue_overflow_linkedin.linkedin_overflow LIMIT 1;
+ #
+ # queue_name | created_at | payload
+ # ---------------------------------+--------------------------------------+---------
+ # polipus_queue_overflow_linkedin | 4632d49c-1c04-11e5-844b-0b314c777502 | "1"
+ #
+ # (1 rows)
+ #
+ table_ = [keyspace, table].compact.join '.'
+ results = get(n)
+ results.each do |entry|
+ statement = "DELETE FROM #{table_} WHERE queue_name = '#{entry['queue_name']}' AND created_at = #{entry['created_at']} ;"
+ session.execute(statement)
+ end
+
+ # Let's respect the API expected by Polipus; otherwise
+ # execute returns a Cassandra::Results::Paged.
+ if !results.nil? && results.respond_to?(:count) && results.count == 1
+ return results.first['payload']
+ end
+ return results
+ end
+
+ alias_method :size, :length
+ alias_method :dec, :pop
+ alias_method :shift, :pop
+ alias_method :enc, :push
+ alias_method :<<, :push
+
+ def keyspace!(replication = nil, durable_writes = true)
+ replication ||= "{'class': 'SimpleStrategy', 'replication_factor': '3'}"
+ statement = "CREATE KEYSPACE IF NOT EXISTS #{keyspace} WITH replication = #{replication} AND durable_writes = #{durable_writes};"
+ cluster.connect.execute(statement)
+ end
+
+ def session
+ @session ||= @cluster.connect(keyspace)
+ end
+
+ # Taking a look at the Cassandra KEYSPACE you will find:
+ #
+ # cqlsh> DESCRIBE KEYSPACE polipus_queue_overflow_linkedin ;
+ #
+ # CREATE KEYSPACE polipus_queue_overflow_linkedin WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '3'} AND durable_writes = true;
+ #
+ # CREATE TABLE polipus_queue_overflow_linkedin.linkedin_overflow (
+ # queue_name text,
+ # created_at timeuuid,
+ # payload text,
+ # PRIMARY KEY (queue_name, created_at)
+ # ) WITH CLUSTERING ORDER BY (created_at ASC)
+ # AND bloom_filter_fp_chance = 0.01
+ # AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
+ # AND comment = ''
+ # AND compaction = {'min_threshold': '4', 'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32'}
+ # AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
+ # AND dclocal_read_repair_chance = 0.1
+ # AND default_time_to_live = 0
+ # AND gc_grace_seconds = 864000
+ # AND max_index_interval = 2048
+ # AND memtable_flush_period_in_ms = 0
+ # AND min_index_interval = 128
+ # AND read_repair_chance = 0.0
+ # AND speculative_retry = '99.0PERCENTILE';
+ #
+ # This means that:
+ # - queue_name is the partition key;
+ # - created_at is the clustering key.
+ #
+ # With sample data:
+ #
+ # cqlsh> SELECT * FROM polipus_queue_overflow_linkedin.linkedin_overflow LIMIT 1 ;
+ #
+ # queue_name | created_at | payload
+ # ---------------------------------+--------------------------------------+---------------------------------------------------------------------------------+
+ # polipus_queue_overflow_linkedin | de17ece6-1e5e-11e5-b997-47a87c40c422 | "{\"url\":\"http://www.linkedin.com/in/foobar\",\"depth\":0,\"fetched\":false}"
+ #
+ # (1 rows)
+ # cqlsh>
+ #
+ def table!(properties = nil)
+ table_ = [keyspace, table].compact.join '.'
+ def_ = "CREATE TABLE IF NOT EXISTS #{table_}
+ (
+ queue_name TEXT,
+ created_at TIMEUUID,
+ payload TEXT,
+ PRIMARY KEY (queue_name, created_at)
+ )"
+ props = Array(properties).join(' AND ')
+ statement = props.empty? ? "#{def_};" : "#{def_} WITH #{props};"
+ session.execute(statement)
+ end
+
+ private
+
+ def options_are_valid?(options)
+ options.has_key?(:cluster) && options.has_key?(:keyspace) && options.has_key?(:table)
+ end
+
+ def limit_is_valid?(limit)
+ !limit.nil? && limit.respond_to?(:to_i) && limit.to_i > 0
+ end
+
+ # results.class => Cassandra::Results::Paged
+ def get(limit = 1)
+ # coerce to int if a TrueClass/FalseClass is given.
+ limit = 1 if [true, false].include?(limit)
+
+ raise ArgumentError.new("Invalid limit value: must be an integer greater than 0 (got #{limit.inspect}).") unless limit_is_valid?(limit)
+ table_ = [keyspace, table].compact.join '.'
+ statement = "SELECT queue_name, created_at, payload FROM #{table_} LIMIT #{limit.to_i} ;"
+ @semaphore.synchronize do
+ return session.execute(session.prepare(statement), arguments: [])
+ end
+ end
+ end
+ end
+ end
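The comments above assume a `Polipus::QueueOverflow.cassandra_queue(namespace, options = {})` factory that is not defined anywhere in this release. A hypothetical sketch of such a monkeypatch, modelled loosely on Polipus' own QueueOverflow factory helpers, could look like this (the method body, keyspace naming, and defaults are all assumptions):

    # Hypothetical monkeypatch: returns a CassandraQueue per crawler namespace.
    module Polipus
      module QueueOverflow
        def self.cassandra_queue(namespace, options = {})
          CassandraQueue.new(
            cluster: options[:cluster],
            keyspace: "polipus_q_overflow_#{namespace}",
            table: options[:table] || 'queue_overflow',
            logger: options[:logger]
          )
        end
      end
    end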
data/lib/polipus-cassandra/storage/cassandra_store.rb ADDED
@@ -0,0 +1,244 @@
+ # encoding: UTF-8
+ require 'cassandra'
+ require 'multi_json'
+ require 'polipus'
+ require 'thread'
+ require 'zlib'
+
+ module Polipus
+ module Storage
+ class CassandraStore < Base
+
+ # CassandraStore wants to persist documents (please ignore the jargon
+ # inherited from MongoDB) like the following JSON-ish entry:
+ #
+ # > db['linkedin-refresh'].find({})
+ #
+ # {
+ # "_id" : ObjectId("...."),
+ # "url" : "https://www.awesome.org/meh",
+ # "code" : 200,
+ # "depth" : 0,
+ # "referer" : "",
+ # "redirect_to" : "",
+ # "response_time" : 1313,
+ # "fetched" : true,
+ # "user_data" :
+ # {
+ # "imported" : false,
+ # "is_developer" : false,
+ # "last_modified" : null
+ # },
+ # "fetched_at" : 1434977757,
+ # "error" : "",
+ # "uuid" : "4ddce293532ea2454356a4210e61c363"
+ # }
+
+ attr_accessor :cluster, :keyspace, :table
+
+ BINARY_FIELDS = %w(body headers user_data)
+
+ def initialize(options = {})
+ @cluster = options[:cluster]
+ @keyspace = options[:keyspace]
+ @table = options[:table]
+ @except = options[:except] || []
+ @semaphore = Mutex.new
+ end
+
+ # {
+ # 'url' => @url.to_s,
+ # 'headers' => Marshal.dump(@headers),
+ # 'body' => @body,
+ # 'links' => links.map(&:to_s),
+ # 'code' => @code,
+ # 'depth' => @depth,
+ # 'referer' => @referer.to_s,
+ # 'redirect_to' => @redirect_to.to_s,
+ # 'response_time' => @response_time,
+ # 'fetched' => @fetched,
+ # 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
+ # 'fetched_at' => @fetched_at,
+ # 'error' => @error.to_s
+ # }
+
+ def add(page)
+ @semaphore.synchronize do
+ table_ = [keyspace, table].compact.join '.'
+ uuid_ = uuid(page)
+ obj = page.to_hash
+ Array(@except).each { |e| obj.delete(e.to_s) }
+
+ begin
+ BINARY_FIELDS.each do |field|
+ obj[field] = obj[field].to_s.encode('UTF-8', {
+ invalid: :replace,
+ undef: :replace,
+ replace: '?' }) if can_be_converted?(obj[field])
+ # ec = Encoding::Converter.new("ASCII-8BIT", "UTF-8")
+ # obj[field] = ec.convert(obj[field]) if can_be_converted?(obj[field])
+ # obj[field] = obj[field].force_encoding('ASCII-8BIT').force_encoding('UTF-8') if can_be_converted?(obj[field])
+ end
+
+ json = MultiJson.encode(obj)
+
+ url = obj.fetch('url', nil)
+ code = obj.fetch('code', nil)
+ depth = obj.fetch('depth', nil)
+ referer = obj.fetch('referer', nil)
+ redirectto = obj.fetch('redirect_to', nil)
+ response_time = obj.fetch('response_time', nil)
+ fetched = obj.fetch('fetched', nil)
+ error = obj.fetch('error', nil)
+ page = Zlib::Deflate.deflate(json)
+
+ if obj.has_key?('user_data') && !obj['user_data'].empty?
+ user_data = MultiJson.encode(obj['user_data'])
+ else
+ user_data = nil
+ end
+
+ value = obj.fetch('fetched_at', nil)
+ fetched_at = case value
+ when Fixnum
+ Time.at(value)
+ when String
+ Time.parse(value)
+ else
+ nil
+ end
+
+ column_names = %w[ uuid url code depth referer redirect_to response_time fetched user_data fetched_at error page ]
+ values_placeholders = column_names.map{|_| '?'}.join(',')
+ statement = "INSERT INTO #{table_} ( #{column_names.join(',')} ) VALUES (#{values_placeholders});"
+
+ session.execute(
+ session.prepare(statement),
+ arguments: [
+ uuid_,
+ url,
+ code,
+ depth,
+ referer,
+ redirectto,
+ response_time,
+ fetched,
+ user_data,
+ fetched_at,
+ error,
+ page
+ ])
+
+ rescue Encoding::UndefinedConversionError
+ puts $!.error_char.dump
+ puts $!.error_char.encoding
+ end
+
+ uuid_
+ end
+ end
+
+ def clear
+ table_ = [keyspace, table].compact.join '.'
+ statement = "DROP TABLE #{table_};"
+ session.execute statement
+ end
+
+ # TBH I'm not sure whether to be "defensive" and return 0/nil when the
+ # result is empty... for now I'm leaving the code simple and noisy if
+ # something goes wrong in the COUNT.
+ def count
+ table_ = [keyspace, table].compact.join '.'
+ statement = "SELECT COUNT (*) FROM #{table_} ;"
+ result = session.execute(statement)
+ result.first['count']
+ end
+
+ def each
+ table_ = [keyspace, table].compact.join '.'
+ statement = "SELECT * FROM #{table_};"
+ session.execute(statement).each do |data|
+ page = load_page(data) unless data.nil?
+ yield data['uuid'], page
+ end
+ end
+
+ def exists?(page)
+ @semaphore.synchronize do
+ table_ = [keyspace, table].compact.join '.'
+ statement = "SELECT uuid FROM #{table_} WHERE uuid = ? LIMIT 1;"
+ results = session.execute(session.prepare(statement),
+ arguments: [uuid(page)])
+ !results.first.nil?
+ end
+ end
+
+ def get(page)
+ @semaphore.synchronize do
+ table_ = [keyspace, table].compact.join '.'
+ statement = "SELECT * FROM #{table_} WHERE uuid = ? LIMIT 1;"
+ results = session.execute(session.prepare(statement),
+ arguments: [uuid(page)])
+ data = results.first
+ load_page(data) unless data.nil?
+ end
+ end
+
+ def keyspace!(replication = nil, durable_writes = true)
+ replication ||= "{'class': 'SimpleStrategy', 'replication_factor': '3'}"
+ statement = "CREATE KEYSPACE IF NOT EXISTS #{keyspace} WITH replication = #{replication} AND durable_writes = #{durable_writes};"
+ cluster.connect.execute statement
+ end
+
+ def remove(page)
+ @semaphore.synchronize do
+ table_ = [keyspace, table].compact.join '.'
+ statement = "DELETE FROM #{table_} WHERE uuid = ?;"
+ session.execute(session.prepare(statement),
+ arguments: [uuid(page)])
+ true
+ end
+ end
+
+ def session
+ @session ||= @cluster.connect(keyspace)
+ end
+
+ def table!(properties = nil)
+ table_ = [keyspace, table].compact.join '.'
+ def_ = "CREATE TABLE IF NOT EXISTS #{table_}
+ (
+ uuid TEXT PRIMARY KEY,
+ url TEXT,
+ code INT,
+ depth INT,
+ referer TEXT,
+ redirect_to TEXT,
+ response_time BIGINT,
+ fetched BOOLEAN,
+ user_data TEXT,
+ fetched_at TIMESTAMP,
+ error TEXT,
+ page BLOB
+ )"
+ props = properties.to_a.join(' AND ')
+ statement = props.empty? ? "#{def_};" : "#{def_} WITH #{props};"
+ session.execute statement
+ end
+
+ def load_page(data)
+ json = Zlib::Inflate.inflate(data['page'])
+ hash = MultiJson.decode(json)
+ page = Page.from_hash(hash)
+ page.fetched_at = 0 if page.fetched_at.nil?
+ page
+ end
+
+ private
+
+ def can_be_converted?(field)
+ !field.nil? && field.is_a?(String) && !field.empty?
+ end
+ end
+ end
+ end
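Both `table!` methods accept an optional list of table properties that are joined with ' AND ' and appended as a WITH clause. A small usage sketch (the property strings are plain CQL and assumed valid for the target Cassandra version; `store` stands in for a CassandraStore instance built as in the specs below):

    store.table!([
      "compaction = {'class': 'SizeTieredCompactionStrategy'}",
      'gc_grace_seconds = 86400'
    ])
    # Executes: CREATE TABLE IF NOT EXISTS ... WITH compaction = {...} AND gc_grace_seconds = 86400;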
data/polipus-cassandra.gemspec ADDED
@@ -0,0 +1,30 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+ Gem::Specification.new do |spec|
+ spec.name = 'polipus-cassandra'
+ spec.version = '0.1.3'
+ spec.authors = ['Stefano Fontanelli', 'Edoardo Rossi']
+ spec.email = ['s.fontanelli@gmail.com', 'edoardo@gild.com']
+ spec.summary = 'Add support for Cassandra in Polipus crawler'
+ spec.description = 'Add support for Cassandra in Polipus crawler'
+ spec.homepage = 'https://github.com/stefanofontanelli/polipus-cassandra'
+ spec.license = 'MIT'
+
+ spec.files = `git ls-files -z`.split("\x0")
+ spec.executables = spec.files.grep(/^bin\//) { |f| File.basename(f) }
+ spec.test_files = spec.files.grep(/^(test|spec|features)\//)
+ spec.require_paths = ['lib']
+
+ spec.add_runtime_dependency 'cassandra-driver', '~> 2.0.1', '>= 2.0.1'
+ spec.add_runtime_dependency 'multi_json', '~> 1.11.0', '>= 1.11.0'
+ spec.add_runtime_dependency 'polipus', '~> 0.3', '>= 0.3.0'
+
+ spec.add_development_dependency 'rake', '~> 10.3'
+ spec.add_development_dependency 'rspec', '~> 3.1.0'
+ spec.add_development_dependency 'flexmock', '~> 1.3'
+ spec.add_development_dependency 'vcr', '~> 2.9.0'
+ spec.add_development_dependency 'webmock', '~> 1.20.0'
+ spec.add_development_dependency 'coveralls'
+ end
data/spec/polipus-cassandra/storage/cassandra_store_spec.rb ADDED
@@ -0,0 +1,174 @@
+ # encoding: UTF-8
+ require 'cassandra'
+ require 'logger'
+ require 'polipus-cassandra'
+ require 'spec_helper'
+
+ describe Polipus::Storage::CassandraStore do
+ before(:all) do
+ @logger = Logger.new(STDOUT).tap { |logger| logger.level = Logger::WARN }
+ @cluster = Cassandra.cluster hosts: ['127.0.0.1'], logger: @logger
+ @keyspace = 'polipus_cassandra_test'
+ @table = 'cassandra_store_test'
+ @storage = Polipus::Storage::CassandraStore.new(
+ cluster: @cluster,
+ keyspace: @keyspace,
+ table: @table,
+ )
+
+ @storage.keyspace!
+ @storage.table!
+
+ @storage_without_code_and_body = Polipus::Storage::CassandraStore.new(
+ cluster: @cluster,
+ keyspace: @keyspace,
+ table: @table,
+ except: ['code', 'body']
+ )
+ end
+
+ after(:all) do
+ @storage.clear
+ end
+
+ it 'should store a page' do
+ p = page_factory 'http://www.google.com'
+ uuid = @storage.add p
+ expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
+ p = @storage.get p
+ expect(p.url.to_s).to eq('http://www.google.com')
+ expect(p.body).to eq('<html></html>')
+ end
+
+ it 'should store all the relevant data from the page' do
+ url = "http://www.duckduckgo.com"
+ referer = "http://www.actually.nowhere.com"
+ redirectto = "#{url}/your_super_awesome_results?page=42"
+ now = Time.now.to_i
+ p = page_factory(
+ url,
+ {
+ referer: referer,
+ redirect_to: redirectto,
+ fetched_at: now
+ })
+ uuid = @storage.add p
+ expect(uuid).to eq('3cd657f53c74f22c1a21b420ce3863fd')
+ p = @storage.get p
+
+ expect(p.url.to_s).to eq(url)
+ expect(p.referer.to_s).to eq(referer)
+ expect(p.redirect_to.to_s).to eq(redirectto)
+ expect(p.fetched_at).to eq(now)
+ expect(p.body).to eq('<html></html>')
+
+ # for the sake of the other tests...
+ expect(@storage.remove(p)).to be_truthy
+ end
+
+ it 'should update a page' do
+ p = page_factory 'http://www.google.com', code: 301
+ @storage.add p
+ p = @storage.get p
+ expect(p.code).to eq(301)
+ end
+
+ it 'should iterate over stored pages' do
+ @storage.each do |k, page|
+ expect(k).to eq('ed646a3334ca891fd3467db131372140')
+ expect(page.url.to_s).to eq('http://www.google.com')
+ end
+ end
+
+ it 'should delete a page' do
+ p = page_factory 'http://www.google.com', code: 301
+ @storage.remove p
+ expect(@storage.get(p)).to be_nil
+ end
+
+ it 'should store a page removing a query string from the uuid generation' do
+ p = page_factory 'http://www.asd.com/?asd=lol'
+ p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1'
+ @storage.include_query_string_in_uuid = false
+ @storage.add p
+ expect(@storage.exists?(p_no_query)).to be_truthy
+ @storage.remove p
+ end
+
+ it 'should store a page removing a query string from the uuid generation no ending slash' do
+ p = page_factory 'http://www.asd.com?asd=lol'
+ p_no_query = page_factory 'http://www.asd.com'
+ @storage.include_query_string_in_uuid = false
+ @storage.add p
+ expect(@storage.exists?(p_no_query)).to be_truthy
+ @storage.remove p
+ end
+
+ it 'should store a page with user data associated' do
+ p = page_factory 'http://www.user.com'
+ p.user_data.name = 'Test User Data'
+ @storage.add p
+ expect(@storage.exists?(p)).to be_truthy
+ p = @storage.get(p)
+ expect(p.user_data.name).to eq('Test User Data')
+ @storage.remove p
+ end
+
+ it 'should honor the except parameters' do
+ pag = page_factory 'http://www.user-doo.com'
+ expect(pag.code).to eq(200)
+ expect(pag.body).to eq('<html></html>')
+
+ @storage_without_code_and_body.add(pag)
+ pag = @storage_without_code_and_body.get(pag)
+
+ expect(pag.body).to be_nil
+ expect(pag.code).to eq(0)
+ @storage_without_code_and_body.remove(pag)
+ end
+
+ it 'should return false if a doc does not exist' do
+ @storage.include_query_string_in_uuid = false
+ p_other = page_factory 'http://www.asdrrrr.com'
+ expect(@storage.exists?(p_other)).to be_falsey
+ @storage.add p_other
+ expect(@storage.exists?(p_other)).to be_truthy
+ p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol'
+ expect(@storage.exists?(p_other)).to be_truthy
+ @storage.include_query_string_in_uuid = true
+ expect(@storage.exists?(p_other)).to be_falsey
+ @storage.include_query_string_in_uuid = false
+ @storage.remove p_other
+ end
+
+ it 'should set page.fetched_at based on the id creation' do
+ p = page_factory 'http://www.user-doojo.com'
+ @storage.add p
+ expect(p.fetched_at).to be_nil
+ p = @storage.get p
+ expect(p.fetched_at).not_to be_nil
+ @storage.remove p
+ end
+
+ it 'should NOT set page.fetched_at if already present' do
+ p = page_factory 'http://www.user-doojooo.com'
+ p.fetched_at = 10
+ @storage.add p
+ p = @storage.get p
+ expect(p.fetched_at).to be 10
+ @storage.remove p
+ end
+
+ it 'should store two pages and the count will be two' do
+ pages = ['http://www.google.com', 'http://www.duckduckgo.com'].map do |url|
+ page_factory(url).tap do |page|
+ @storage.add(page)
+ end
+ end
+ expect(@storage.count).to be 2
+ pages.each do |page|
+ @storage.remove(page)
+ end
+ expect(@storage.count).to be 0
+ end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,44 @@
+ # Require this file using `require "spec_helper"`
+ # to ensure that it is only loaded once.
+ #
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ require 'digest/md5'
+ require 'coveralls'
+ require 'vcr'
+ require 'webmock/rspec'
+
+ Coveralls.wear!
+
+ VCR.configure do |c|
+ c.cassette_library_dir = "#{File.dirname(__FILE__)}/cassettes"
+ c.hook_into :webmock
+ end
+
+ require 'polipus'
+
+ RSpec.configure do |config|
+ config.run_all_when_everything_filtered = true
+ config.filter_run :focus
+
+ # Run specs in random order to surface order dependencies. If you find an
+ # order dependency and want to debug it, you can fix the order by providing
+ # the seed, which is printed after each run.
+ # --seed 1234
+ config.order = 'random'
+ config.mock_with :flexmock
+ config.around(:each) do |example|
+ t = Time.now
+ print example.metadata[:full_description]
+ VCR.use_cassette(Digest::MD5.hexdigest(example.metadata[:full_description])) do
+ example.run
+ puts " [#{Time.now - t}s]"
+ end
+ end
+ config.before(:each) { Polipus::SignalHandler.disable }
+ end
+
+ def page_factory(url, params = {})
+ params[:code] = 200 unless params.has_key?(:code)
+ params[:body] = '<html></html>' unless params.has_key?(:body)
+ Polipus::Page.new url, params
+ end
metadata CHANGED
@@ -1,18 +1,65 @@
  --- !ruby/object:Gem::Specification
  name: polipus-cassandra
  version: !ruby/object:Gem::Version
- version: 0.0.1
+ version: 0.1.3
+ prerelease:
  platform: ruby
  authors:
  - Stefano Fontanelli
+ - Edoardo Rossi
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-03-04 00:00:00.000000000 Z
+ date: 2015-07-13 00:00:00.000000000 Z
  dependencies:
+ - !ruby/object:Gem::Dependency
+ name: cassandra-driver
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 2.0.1
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: 2.0.1
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 2.0.1
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: 2.0.1
+ - !ruby/object:Gem::Dependency
+ name: multi_json
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 1.11.0
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: 1.11.0
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 1.11.0
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: 1.11.0
  - !ruby/object:Gem::Dependency
  name: polipus
  requirement: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
@@ -23,6 +70,7 @@ dependencies:
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
@@ -31,28 +79,89 @@ dependencies:
  - !ruby/object:Gem::Version
  version: 0.3.0
  - !ruby/object:Gem::Dependency
- name: cassandra-driver
+ name: rake
  requirement: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
- version: 2.1.1
- - - ! '>='
+ version: '10.3'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
  - !ruby/object:Gem::Version
- version: 2.1.1
- type: :runtime
+ version: '10.3'
+ - !ruby/object:Gem::Dependency
+ name: rspec
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 3.1.0
+ type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
- version: 2.1.1
- - - ! '>='
+ version: 3.1.0
+ - !ruby/object:Gem::Dependency
+ name: flexmock
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: '1.3'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
  - !ruby/object:Gem::Version
- version: 2.1.1
+ version: '1.3'
  - !ruby/object:Gem::Dependency
- name: rake
+ name: vcr
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 2.9.0
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 2.9.0
+ - !ruby/object:Gem::Dependency
+ name: webmock
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 1.20.0
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 1.20.0
+ - !ruby/object:Gem::Dependency
+ name: coveralls
  requirement: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
@@ -60,6 +169,7 @@ dependencies:
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
@@ -67,33 +177,51 @@ dependencies:
  description: Add support for Cassandra in Polipus crawler
  email:
  - s.fontanelli@gmail.com
+ - edoardo@gild.com
  executables: []
  extensions: []
  extra_rdoc_files: []
- files: []
+ files:
+ - .gitignore
+ - .rspec
+ - .ruby-version
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - lib/polipus-cassandra.rb
+ - lib/polipus-cassandra/policies/default.rb
+ - lib/polipus-cassandra/policies/policies.rb
+ - lib/polipus-cassandra/queue_overflow/cassandra_queue.rb
+ - lib/polipus-cassandra/storage/cassandra_store.rb
+ - polipus-cassandra.gemspec
+ - spec/polipus-cassandra/storage/cassandra_store_spec.rb
+ - spec/spec_helper.rb
  homepage: https://github.com/stefanofontanelli/polipus-cassandra
  licenses:
  - MIT
- metadata: {}
  post_install_message:
  rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.2.2
+ rubygems_version: 1.8.23.2
  signing_key:
- specification_version: 4
+ specification_version: 3
  summary: Add support for Cassandra in Polipus crawler
- test_files: []
- has_rdoc:
+ test_files:
+ - spec/polipus-cassandra/storage/cassandra_store_spec.rb
+ - spec/spec_helper.rb
checksums.yaml DELETED
@@ -1,15 +0,0 @@
- ---
- !binary "U0hBMQ==":
- metadata.gz: !binary |-
- NzYzOTkyMmJmOThiZTA5ZmQwNDJkZDM3MjA2NTk3MWQwMTcxMjEyMg==
- data.tar.gz: !binary |-
- Njg3ODc1ZmJiYTBkZmQ3NTY2MThjOGQ4Yjg0ZDFlZjcyOTUzMDI4MA==
- SHA512:
- metadata.gz: !binary |-
- OWZiMWEwOTU1NTZlMDNhODNkZTdkZmY5MmIyMDc2YTBmZWVmMTI0MjU3ZWNm
- YzcyMGQ0NDQyOTc0MGIxOTE1YjJjOTk5MjYyYjg4NDJkOTQ5NjI1NWIyMzk4
- MzExOGJlNjM3MGEyNzFlZGIxNTkxYzlkMDQ0NjJhMDQ4OGQ3NDk=
- data.tar.gz: !binary |-
- NTM0MWI0M2Q2ZTlkMjcxMGQ1ZTkwYTBjY2M2NTRmNjcyYWM2Nzc0ZjExNzQ5
- MGZhNmZmYTExMzgzMGMyYmQ0ZTYyMDYzZmQ1MjE2YjM2MDI3NTQzNDlmZDBk
- NmZmN2M1NjY5NGZjY2QyMzk1MjRhZjBlMWUwY2FhZmU0MDdhMWE=