polipus-cassandra 0.0.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,35 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /test/tmp/
9
+ /test/version_tmp/
10
+ /tmp/
11
+
12
+ ## Specific to RubyMotion:
13
+ .dat*
14
+ .repl_history
15
+ build/
16
+
17
+ ## Documentation cache and generated files:
18
+ /.yardoc/
19
+ /_yardoc/
20
+ /doc/
21
+ /rdoc/
22
+
23
+ ## Environment normalisation:
24
+ /.bundle/
25
+ /vendor/bundle
26
+ /lib/bundler/man/
27
+
28
+ # for a library or gem, you might want to ignore these files since the code is
29
+ # intended to run in multiple environments; otherwise, check them in:
30
+ Gemfile.lock
31
+ # .ruby-version
32
+ # .ruby-gemset
33
+
34
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
35
+ .rvmrc
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 1.9.3-p551
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Stefano Fontanelli
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Polipus: addons for Cassandra
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'polipus-cassandra'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install polipus-cassandra
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it ( http://github.com/<my-github-username>/polipus-cassandra/fork )
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ # coding: utf-8
2
+ require 'bundler/gem_tasks'
@@ -0,0 +1,5 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'polipus-cassandra/policies/policies'
4
+ require 'polipus-cassandra/queue_overflow/cassandra_queue'
5
+ require 'polipus-cassandra/storage/cassandra_store'
@@ -0,0 +1,30 @@
1
+ # encoding: utf-8
2
+ require 'cassandra'
3
+
4
+ module Corm
5
+ module Retry
6
+ module Policies
7
+ class Default
8
+ include Cassandra::Retry::Policy
9
+
10
+ def read_timeout(_statement, consistency, _required, _received, retrieved, retries)
11
+ return reraise if retries >= 5
12
+ sleep(retries.to_f + Random.rand(0.0..1.0))
13
+ retrieved ? reraise : try_again(consistency)
14
+ end
15
+
16
+ def write_timeout(_statement, consistency, _type, _required, _received, retries)
17
+ return reraise if retries >= 5
18
+ sleep(retries.to_f + Random.rand(0.0..1.0))
19
+ try_again(consistency)
20
+ end
21
+
22
+ def unavailable(_statement, consistency, _required, _alive, retries)
23
+ return reraise if retries >= 5
24
+ sleep(retries.to_f + Random.rand(0.0..1.0))
25
+ try_again(consistency)
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1 @@
1
+ require 'polipus-cassandra/policies/default'
@@ -0,0 +1,307 @@
1
+ # encoding: UTF-8
2
+ require 'cassandra'
3
+ require 'polipus'
4
+
5
+ module Polipus
6
+ module QueueOverflow
7
+ class CassandraQueue
8
+
9
+ # CassandraQueue wants to persist documents (please, still ignore the
10
+ # jargon inherited from Mongo) like the following JSON-ish entry.
11
+ #
12
+ # There is no superclass here but I have in mind the interface implicitly
13
+ # defined by Polipus::QueueOverflow::DevNullQueue that, more or less has:
14
+ #
15
+ # def initialize
16
+ # def length
17
+ # def empty?
18
+ # def clear
19
+ # def push(_data)
20
+ # def pop(_ = false)
21
+ #
22
+ # Taking some data from our backend.production.*****.com/polipus
23
+ # I found:
24
+ #
25
+ # mongos> db.getCollectionNames()
26
+ # [
27
+ # "data-com-companies",
28
+ # "data_com_companies",
29
+ # "googleplus",
30
+ # "linkedin",
31
+ # "linkedin-companies",
32
+ # "linkedin_companies_parsed",
33
+ # "linkedin_jobs",
34
+ # "linkedin_jobs_parsed",
35
+ # "linkedin_pages_errors",
36
+ # "polipus_q_overflow_data-com-companies_queue_overflow",
37
+ # "polipus_q_overflow_data_com_companies_queue_overflow",
38
+ # "polipus_q_overflow_googleplus_queue_overflow",
39
+ # "polipus_q_overflow_linkedin-companies_queue_overflow",
40
+ # "polipus_q_overflow_linkedin_jobs_queue_overflow",
41
+ # "polipus_q_overflow_linkedin_jobs_queue_overflow_old",
42
+ # "polipus_q_overflow_linkedin_refresh_queue_overflow",
43
+ # "system.indexes"
44
+ # ]
45
+ #
46
+ # mongos> db.getCollection("polipus_q_overflow_linkedin_jobs_queue_overflow").find().limit(1)
47
+ # {
48
+ # "_id" : ObjectId("54506b98e3d55b20c40b32d3"),
49
+ # "payload" : "{\"url\":\"https://www.linkedin.com/job/product-designer-jobs/?page_num=7&trk=jserp_pagination_next\",\"depth\":6,\"referer\":\"https://www.linkedin.com/job/product-designer-jobs/?page_num=6&trk=jserp_pagination_6\",\"fetched\":false}"
50
+ # }
51
+ #
52
+ # mongos> db.polipus_q_overflow_linkedin_refresh_queue_overflow.find().limit(10)
53
+ # {
54
+ # "_id" : ObjectId("544072b6e3d55b0db7000001"),
55
+ # "payload" : "{\"url\":\"http://www.linkedin.com/in/*****\",\"depth\":0,\"fetched\":false}"
56
+ # }
57
+ #
58
+ # We also assume this MonkeyPatch:
59
+ # Polipus::QueueOverflow.cassandra_queue(namespace, options = {})
60
+ # that returns instances of this class.
61
+
62
+ attr_accessor :cluster, :keyspace, :table
63
+
64
+ # There is a validation enforced to `:keyspace` and `:table` because
65
+ # Cassandra is not happy when a keyspace or a table name contains a
66
+ # hyphen.
67
+ def initialize(options = {})
68
+ raise ArgumentError unless options_are_valid?(options)
69
+ @cluster = options[:cluster]
70
+ @keyspace = options[:keyspace].gsub("-", "_")
71
+ @table = options[:table].gsub("-", "_")
72
+ @semaphore = Mutex.new
73
+ @options = options
74
+ @timeuuid_generator = Cassandra::Uuid::Generator.new
75
+ @logger = @options[:logger] ||= Logger.new(STDOUT).tap { |l| l.level = Logger::INFO }
76
+ end
77
+
78
+ # Length aka Size aka Count is supported in Cassandra... like your POSQL
79
+ # you can COUNT.
80
+ #
81
+ # SELECT COUNT (*) FROM keyspace.table_name;
82
+ #
83
+ # TBH I'm not sure if being "defensive" and returning 0/nil in case
84
+ # the result is_empty? ... I'm leaving (now) the code simple and noisy
85
+ # if something went wrong in the COUNT.
86
+ def length
87
+ table_ = [keyspace, table].compact.join '.'
88
+ statement = "SELECT COUNT (*) FROM #{table_} ;"
89
+ result = session.execute(statement)
90
+ result.first['count']
91
+ end
92
+
93
+ # Return true if the table has no rows.
94
+ # This is achieved with a 'SELECT WITH LIMIT 1' query.
95
+ def empty?
96
+ return get.first.nil?
97
+ end
98
+
99
+ # Clear is a fancy name for a DROP TABLE IF EXISTS <table_>.
100
+ def clear
101
+ table_ = [keyspace, table].compact.join '.'
102
+ statement = "DROP TABLE IF EXISTS #{table_} ;"
103
+ session.execute(statement)
104
+ end
105
+
106
+ # push is the "write into Cassandra" method.
107
+ def push(data)
108
+ return nil if data.nil?
109
+ obj = MultiJson.decode(data)
110
+
111
+ table_ = [keyspace, table].compact.join('.')
112
+ queue_name = @keyspace
113
+ created_at = @timeuuid_generator.now
114
+
115
+ begin
116
+ @semaphore.synchronize do
117
+
118
+ if obj.has_key?('payload') && !obj['payload'].empty?
119
+ payload = MultiJson.encode(obj['payload'])
120
+ else
121
+ payload = nil
122
+ end
123
+
124
+ column_names = %w[ queue_name created_at payload ]
125
+ values_placeholders = column_names.map{|_| '?'}.join(',')
126
+ statement = "INSERT INTO #{table_} ( #{column_names.join(',')} ) VALUES (#{values_placeholders});"
127
+
128
+ session.execute(
129
+ session.prepare(statement),
130
+ arguments: [
131
+ queue_name,
132
+ created_at,
133
+ payload
134
+ ])
135
+ end
136
+ rescue Encoding::UndefinedConversionError
137
+ puts $!.error_char.dump
138
+ puts $!.error_char.encoding
139
+ end
140
+
141
+ @logger.debug { "Writing this entry [#{[queue_name, created_at].to_s}]" }
142
+ [queue_name, created_at].to_s
143
+ end
144
+
145
+ # Pop removes 'n' entries from the overflow table (treated as a queue)
146
+ # and returns a paged result.
147
+ # results.class #=> Cassandra::Results::Paged
148
+ #
149
+ # Polipus is expecting a String, that will be JSONparsed with the purpose
150
+ # to build a
151
+ def pop(n = 1)
152
+ # A recap: pop should remove oldest N messages and return to the caller.
153
+ #
154
+ # Let's see how this queue is implemented.
155
+ # In redis, messages are LPUSH-ed:
156
+ #
157
+ # 4 - 3 - 2 - 1 --> REDIS
158
+ # 4 - 3 - 2 --> REDIS
159
+ # 4 - 3 --> REDIS
160
+ # 4 --> REDIS
161
+ #
162
+ # Then, in the fast_dequeue, are RPOP-ped:
163
+ #
164
+ # REDIS --> 1
165
+ # REDIS --> 2 - 1
166
+ # REDIS --> 3 - 2 - 1
167
+ # REDIS --> 4 - 3 - 2 - 1
168
+ #
169
+ # Then, are received in this order:
170
+ # [1] -> TimeUUID(1) = ...
171
+ # [2] -> TimeUUID(1) = ...
172
+ # [3] -> TimeUUID(1) = ...
173
+ # [4] -> TimeUUID(1) = ...
174
+ #
175
+ # As you can see below, are ORDER BY (created_at ASC)... that means
176
+ # "oldest first". When using 'LIMIT n' in a query, you get the 'n'
177
+ # oldest entries.
178
+ #
179
+ # cqlsh> SELECT * FROM polipus_queue_overflow_linkedin.linkedin_overflow ;
180
+ #
181
+ # queue_name | created_at | payload
182
+ # ---------------------------------+--------------------------------------+---------
183
+ # polipus_queue_overflow_linkedin | 4632d49c-1c04-11e5-844b-0b314c777502 | "1"
184
+ # polipus_queue_overflow_linkedin | 46339f8a-1c04-11e5-844b-0b314c777502 | "2"
185
+ # polipus_queue_overflow_linkedin | 46349962-1c04-11e5-844b-0b314c777502 | "3"
186
+ # polipus_queue_overflow_linkedin | 46351860-1c04-11e5-844b-0b314c777502 | "4"
187
+ #
188
+ # (4 rows)
189
+ # cqlsh> SELECT * FROM polipus_queue_overflow_linkedin.linkedin_overflow LIMIT 1;
190
+ #
191
+ # queue_name | created_at | payload
192
+ # ---------------------------------+--------------------------------------+---------
193
+ # polipus_queue_overflow_linkedin | 4632d49c-1c04-11e5-844b-0b314c777502 | "1"
194
+ #
195
+ # (1 rows)
196
+ #
197
+ table_ = [keyspace, table].compact.join '.'
198
+ results = get(n)
199
+ results.each do |entry|
200
+ statement = "DELETE FROM #{table_} WHERE queue_name = '#{entry['queue_name']}' AND created_at = #{entry['created_at']} ;"
201
+ session.execute(statement)
202
+ end
203
+
204
+ # Let's rispect the API as expected by Polipus.
205
+ # Otherwise the execute returns a Cassandra::Results::Paged
206
+ if !results.nil? && results.respond_to?(:count) && results.count == 1
207
+ return results.first['payload']
208
+ end
209
+ return results
210
+ end
211
+
212
+ alias_method :size, :length
213
+ alias_method :dec, :pop
214
+ alias_method :shift, :pop
215
+ alias_method :enc, :push
216
+ alias_method :<<, :push
217
+
218
+ def keyspace!(replication = nil, durable_writes = true)
219
+ replication ||= "{'class': 'SimpleStrategy', 'replication_factor': '3'}"
220
+ statement = "CREATE KEYSPACE IF NOT EXISTS #{keyspace} WITH replication = #{replication} AND durable_writes = #{durable_writes};"
221
+ cluster.connect.execute(statement)
222
+ end
223
+
224
+ def session
225
+ @session ||= @cluster.connect(keyspace)
226
+ end
227
+
228
+ # Taking a look in the Cassandra KEYSPACE you will found:
229
+ #
230
+ # cqlsh> DESCRIBE KEYSPACE polipus_queue_overflow_linkedin ;
231
+ #
232
+ # CREATE KEYSPACE polipus_queue_overflow_linkedin WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '3'} AND durable_writes = true;
233
+ #
234
+ # CREATE TABLE polipus_queue_overflow_linkedin.linkedin_overflow (
235
+ # queue_name text,
236
+ # created_at timeuuid,
237
+ # payload text,
238
+ # PRIMARY KEY (queue_name, created_at)
239
+ # ) WITH CLUSTERING ORDER BY (created_at ASC)
240
+ # AND bloom_filter_fp_chance = 0.01
241
+ # AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
242
+ # AND comment = ''
243
+ # AND compaction = {'min_threshold': '4', 'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32'}
244
+ # AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
245
+ # AND dclocal_read_repair_chance = 0.1
246
+ # AND default_time_to_live = 0
247
+ # AND gc_grace_seconds = 864000
248
+ # AND max_index_interval = 2048
249
+ # AND memtable_flush_period_in_ms = 0
250
+ # AND min_index_interval = 128
251
+ # AND read_repair_chance = 0.0
252
+ # AND speculative_retry = '99.0PERCENTILE';
253
+ #
254
+ # This means that:
255
+ # - queue_name is partition key;
256
+ # - created_at is clustering key;
257
+ #
258
+ # With sample data:
259
+ #
260
+ # cqlsh> SELECT * FROM polipus_queue_overflow_linkedin.linkedin_overflow LIMIT 1 ;
261
+ #
262
+ # queue_name | created_at | payload
263
+ # ---------------------------------+--------------------------------------+---------------------------------------------------------------------------------+
264
+ # polipus_queue_overflow_linkedin | de17ece6-1e5e-11e5-b997-47a87c40c422 | "{\"url\":\"http://www.linkedin.com/in/foobar\",\"depth\":0,\"fetched\":false}"
265
+ #
266
+ # (1 rows)
267
+ # cqlsh>
268
+ #
269
+ def table!(properties = nil)
270
+ table_ = [keyspace, table].compact.join '.'
271
+ def_ = "CREATE TABLE IF NOT EXISTS #{table_}
272
+ (
273
+ queue_name TEXT,
274
+ created_at TIMEUUID,
275
+ payload TEXT,
276
+ PRIMARY KEY (queue_name, created_at)
277
+ )"
278
+ props = Array(properties).join(' AND ')
279
+ statement = props.empty? ? "#{def_};" : "#{def_} WITH #{props};"
280
+ session.execute(statement)
281
+ end
282
+
283
+ private
284
+
285
+ def options_are_valid?(options)
286
+ options.has_key?(:cluster) && options.has_key?(:keyspace) && options.has_key?(:table)
287
+ end
288
+
289
+ def limit_is_valid?(limit)
290
+ !limit.nil? && limit.respond_to?(:to_i) && limit.to_i > 0
291
+ end
292
+
293
+ # results.class => Cassandra::Results::Paged
294
+ def get(limit = 1)
295
+ # coerce to int if a TrueClass/FalseClass is given.
296
+ limit = 1 if [true, false].include?(limit)
297
+
298
+ raise ArgumentError.new("Invalid limit value: must be an INTEGER greater than 1 (got #{limit.inspect}).") unless limit_is_valid?(limit)
299
+ table_ = [keyspace, table].compact.join '.'
300
+ statement = "SELECT queue_name, created_at, payload FROM #{table_} LIMIT #{limit.to_i} ;"
301
+ @semaphore.synchronize do
302
+ return session.execute(session.prepare(statement), arguments: [])
303
+ end
304
+ end
305
+ end
306
+ end
307
+ end
@@ -0,0 +1,244 @@
1
+ # encoding: UTF-8
2
+ require 'cassandra'
3
+ require 'multi_json'
4
+ require 'polipus'
5
+ require 'thread'
6
+ require 'zlib'
7
+
8
+ module Polipus
9
+ module Storage
10
+ class CassandraStore < Base
11
+
12
+ # CassandraStore wants to persists documents (please ignore the jargon
13
+ # inherited from MongoDB) like the following JSON-ish entry:
14
+ #
15
+ # > db['linkedin-refresh'].find({})
16
+ #
17
+ # {
18
+ # "_id" : ObjectId("...."),
19
+ # "url" : "https://www.awesome.org/meh",
20
+ # "code" : 200,
21
+ # "depth" : 0,
22
+ # "referer" : "",
23
+ # "redirect_to" : "",
24
+ # "response_time" : 1313,
25
+ # "fetched" : true,
26
+ # "user_data" :
27
+ # {
28
+ # "imported" : false,
29
+ # "is_developer" : false,
30
+ # "last_modified" : null
31
+ # },
32
+ # "fetched_at" : 1434977757,
33
+ # "error" : "",
34
+ # "uuid" : "4ddce293532ea2454356a4210e61c363"
35
+ # }
36
+
37
+ attr_accessor :cluster, :keyspace, :table
38
+
39
+ BINARY_FIELDS = %w(body headers user_data)
40
+
41
+ def initialize(options = {})
42
+ @cluster = options[:cluster]
43
+ @keyspace = options[:keyspace]
44
+ @table = options[:table]
45
+ @except = options[:except] || []
46
+ @semaphore = Mutex.new
47
+ end
48
+
49
+ # {
50
+ # 'url' => @url.to_s,
51
+ # 'headers' => Marshal.dump(@headers),
52
+ # 'body' => @body,
53
+ # 'links' => links.map(&:to_s),
54
+ # 'code' => @code,
55
+ # 'depth' => @depth,
56
+ # 'referer' => @referer.to_s,
57
+ # 'redirect_to' => @redirect_to.to_s,
58
+ # 'response_time' => @response_time,
59
+ # 'fetched' => @fetched,
60
+ # 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
61
+ # 'fetched_at' => @fetched_at,
62
+ # 'error' => @error.to_s
63
+ # }
64
+
65
+ def add(page)
66
+ @semaphore.synchronize do
67
+ table_ = [keyspace, table].compact.join '.'
68
+ uuid_ = uuid(page)
69
+ obj = page.to_hash
70
+ Array(@except).each { |e| obj.delete(e.to_s) }
71
+
72
+ begin
73
+ BINARY_FIELDS.each do |field|
74
+ obj[field] = obj[field].to_s.encode('UTF-8', {
75
+ invalid: :replace,
76
+ undef: :replace,
77
+ replace: '?' }) if can_be_converted?(obj[field])
78
+ # ec = Encoding::Converter.new("ASCII-8BIT", "UTF-8")
79
+ # obj[field] = ec.convert(obj[field]) if can_be_converted?(obj[field])
80
+ # obj[field] = obj[field].force_encoding('ASCII-8BIT').force_encoding('UTF-8') if can_be_converted?(obj[field])
81
+ end
82
+
83
+ json = MultiJson.encode(obj)
84
+
85
+ url = obj.fetch('url', nil)
86
+ code = obj.fetch('code', nil)
87
+ depth = obj.fetch('depth', nil)
88
+ referer = obj.fetch('referer', nil)
89
+ redirectto = obj.fetch('redirect_to', nil)
90
+ response_time = obj.fetch('response_time', nil)
91
+ fetched = obj.fetch('fetched', nil)
92
+ error = obj.fetch('error', nil)
93
+ page = Zlib::Deflate.deflate(json)
94
+
95
+ if obj.has_key?('user_data') && !obj['user_data'].empty?
96
+ user_data = MultiJson.encode(obj['user_data'])
97
+ else
98
+ user_data = nil
99
+ end
100
+
101
+ value = obj.fetch('fetched_at', nil)
102
+ fetched_at = case value
103
+ when Fixnum
104
+ Time.at(value)
105
+ when String
106
+ Time.parse(value)
107
+ else
108
+ nil
109
+ end
110
+
111
+ column_names = %w[ uuid url code depth referer redirect_to response_time fetched user_data fetched_at error page ]
112
+ values_placeholders = column_names.map{|_| '?'}.join(',')
113
+ statement = "INSERT INTO #{table_} ( #{column_names.join(',')} ) VALUES (#{values_placeholders});"
114
+
115
+ session.execute(
116
+ session.prepare(statement),
117
+ arguments: [
118
+ uuid_,
119
+ url,
120
+ code,
121
+ depth,
122
+ referer,
123
+ redirectto,
124
+ response_time,
125
+ fetched,
126
+ user_data,
127
+ fetched_at,
128
+ error,
129
+ page
130
+ ])
131
+
132
+ rescue Encoding::UndefinedConversionError
133
+ puts $!.error_char.dump
134
+ puts $!.error_char.encoding
135
+ end
136
+
137
+ uuid_
138
+ end
139
+ end
140
+
141
+ def clear
142
+ table_ = [keyspace, table].compact.join '.'
143
+ statement = "DROP TABLE #{table_};"
144
+ session.execute statement
145
+ end
146
+
147
+ # TBH I'm not sure if being "defensive" and returning 0/nil in case
148
+ # the results is_empty? ... I'm leaving (now) the code simple and noisy
149
+ # if something went wrong in the COUNT.
150
+ def count
151
+ table_ = [keyspace, table].compact.join '.'
152
+ statement = "SELECT COUNT (*) FROM #{table_} ;"
153
+ result = session.execute(statement)
154
+ result.first['count']
155
+ end
156
+
157
+ def each
158
+ table_ = [keyspace, table].compact.join '.'
159
+ statement = "SELECT * FROM #{table_};"
160
+ session.execute(statement).each do |data|
161
+ page = load_page(data) unless data.nil?
162
+ yield data['uuid'], page
163
+ end
164
+ end
165
+
166
+ def exists?(page)
167
+ @semaphore.synchronize do
168
+ table_ = [keyspace, table].compact.join '.'
169
+ statement = "SELECT uuid FROM #{table_} WHERE uuid = ? LIMIT 1;"
170
+ results = session.execute(session.prepare(statement),
171
+ arguments: [uuid(page)])
172
+ !results.first.nil?
173
+ end
174
+ end
175
+
176
+ def get(page)
177
+ @semaphore.synchronize do
178
+ table_ = [keyspace, table].compact.join '.'
179
+ statement = "SELECT * FROM #{table_} WHERE uuid = ? LIMIT 1;"
180
+ results = session.execute(session.prepare(statement),
181
+ arguments: [uuid(page)])
182
+ data = results.first
183
+ load_page(data) unless data.nil?
184
+ end
185
+ end
186
+
187
+ def keyspace!(replication = nil, durable_writes = true)
188
+ replication ||= "{'class': 'SimpleStrategy', 'replication_factor': '3'}"
189
+ statement = "CREATE KEYSPACE IF NOT EXISTS #{keyspace} WITH replication = #{replication} AND durable_writes = #{durable_writes};"
190
+ cluster.connect.execute statement
191
+ end
192
+
193
+ def remove(page)
194
+ @semaphore.synchronize do
195
+ table_ = [keyspace, table].compact.join '.'
196
+ statement = "DELETE FROM #{table_} WHERE uuid = ?;"
197
+ session.execute(session.prepare(statement),
198
+ arguments: [uuid(page)])
199
+ true
200
+ end
201
+ end
202
+
203
+ def session
204
+ @session ||= @cluster.connect(keyspace)
205
+ end
206
+
207
+ def table!(properties = nil)
208
+ table_ = [keyspace, table].compact.join '.'
209
+ def_ = "CREATE TABLE IF NOT EXISTS #{table_}
210
+ (
211
+ uuid TEXT PRIMARY KEY,
212
+ url TEXT,
213
+ code INT,
214
+ depth INT,
215
+ referer TEXT,
216
+ redirect_to TEXT,
217
+ response_time BIGINT,
218
+ fetched BOOLEAN,
219
+ user_data TEXT,
220
+ fetched_at TIMESTAMP,
221
+ error TEXT,
222
+ page BLOB
223
+ )"
224
+ props = properties.to_a.join(' AND ')
225
+ statement = props.empty? ? "#{def_};" : "#{def_} WITH #{props};"
226
+ session.execute statement
227
+ end
228
+
229
+ def load_page(data)
230
+ json = Zlib::Inflate.inflate(data['page'])
231
+ hash = MultiJson.decode(json)
232
+ page = Page.from_hash(hash)
233
+ page.fetched_at = 0 if page.fetched_at.nil?
234
+ page
235
+ end
236
+
237
+ private
238
+
239
+ def can_be_converted?(field)
240
+ !field.nil? && field.is_a?(String) && !field.empty?
241
+ end
242
+ end
243
+ end
244
+ end
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'polipus-cassandra'
7
+ spec.version = '0.1.3'
8
+ spec.authors = ['Stefano Fontanelli', 'Edoardo Rossi']
9
+ spec.email = ['s.fontanelli@gmail.com', 'edoardo@gild.com']
10
+ spec.summary = 'Add support for Cassandra in Polipus crawler'
11
+ spec.description = 'Add support for Cassandra in Polipus crawler'
12
+ spec.homepage = 'https://github.com/stefanofontanelli/polipus-cassandra'
13
+ spec.license = 'MIT'
14
+
15
+ spec.files = `git ls-files -z`.split("\x0")
16
+ spec.executables = spec.files.grep(/^bin\//) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(/^(test|spec|features)\//)
18
+ spec.require_paths = ['lib']
19
+
20
+ spec.add_runtime_dependency 'cassandra-driver', '~> 2.0.1', '>= 2.0.1'
21
+ spec.add_runtime_dependency 'multi_json', '~> 1.11.0', '>= 1.11.0'
22
+ spec.add_runtime_dependency 'polipus', '~> 0.3', '>= 0.3.0'
23
+
24
+ spec.add_development_dependency 'rake', '~> 10.3'
25
+ spec.add_development_dependency 'rspec', '~> 3.1.0'
26
+ spec.add_development_dependency 'flexmock', '~> 1.3'
27
+ spec.add_development_dependency 'vcr', '~> 2.9.0'
28
+ spec.add_development_dependency 'webmock', '~> 1.20.0'
29
+ spec.add_development_dependency 'coveralls'
30
+ end
@@ -0,0 +1,174 @@
1
+ # encoding: UTF-8
2
+ require 'cassandra'
3
+ require 'logger'
4
+ require 'polipus-cassandra'
5
+ require 'spec_helper'
6
+
7
+ describe Polipus::Storage::CassandraStore do
8
+ before(:all)do
9
+ @logger = Logger.new(STDOUT).tap { |logger| logger.level = Logger::WARN }
10
+ @cluster = Cassandra.cluster hosts: ['127.0.0.1'], logger: @logger
11
+ @keyspace = 'polipus_cassandra_test'
12
+ @table = 'cassandra_store_test'
13
+ @storage = Polipus::Storage::CassandraStore.new(
14
+ cluster: @cluster,
15
+ keyspace: @keyspace,
16
+ table: @table,
17
+ )
18
+
19
+ @storage.keyspace!
20
+ @storage.table!
21
+
22
+ @storage_without_code_and_body = Polipus::Storage::CassandraStore.new(
23
+ cluster: @cluster,
24
+ keyspace: @keyspace,
25
+ table: @table,
26
+ except: ['code', 'body']
27
+ )
28
+ end
29
+
30
+ after(:all) do
31
+ @storage.clear
32
+ end
33
+
34
+ it 'should store a page' do
35
+ p = page_factory 'http://www.google.com'
36
+ uuid = @storage.add p
37
+ expect(uuid).to eq('ed646a3334ca891fd3467db131372140')
38
+ p = @storage.get p
39
+ expect(p.url.to_s).to eq('http://www.google.com')
40
+ expect(p.body).to eq('<html></html>')
41
+ end
42
+
43
+ it 'should store all the relevant data from the page' do
44
+ url = "http://www.duckduckgo.com"
45
+ referer = "http://www.actually.nowhere.com"
46
+ redirectto = "#{url}/your_super_awesome_results?page=42"
47
+ now = Time.now.to_i
48
+ p = page_factory(
49
+ url,
50
+ {
51
+ referer: referer,
52
+ redirect_to: redirectto,
53
+ fetched_at: now
54
+ })
55
+ uuid = @storage.add p
56
+ expect(uuid).to eq('3cd657f53c74f22c1a21b420ce3863fd')
57
+ p = @storage.get p
58
+
59
+ expect(p.url.to_s).to eq(url)
60
+ expect(p.referer.to_s).to eq(referer)
61
+ expect(p.redirect_to.to_s).to eq(redirectto)
62
+ expect(p.fetched_at).to eq(now)
63
+ expect(p.body).to eq('<html></html>')
64
+
65
+ # for the sake of the other tests...
66
+ expect(@storage.remove(p)).to be_truthy
67
+ end
68
+
69
+ it 'should update a page' do
70
+ p = page_factory 'http://www.google.com', code: 301
71
+ @storage.add p
72
+ p = @storage.get p
73
+ expect(p.code).to eq(301)
74
+ end
75
+
76
+ it 'should iterate over stored pages' do
77
+ @storage.each do |k, page|
78
+ expect(k).to eq('ed646a3334ca891fd3467db131372140')
79
+ expect(page.url.to_s).to eq('http://www.google.com')
80
+ end
81
+ end
82
+
83
+ it 'should delete a page' do
84
+ p = page_factory 'http://www.google.com', code: 301
85
+ @storage.remove p
86
+ expect(@storage.get(p)).to be_nil
87
+ end
88
+
89
+ it 'should store a page removing a query string from the uuid generation' do
90
+ p = page_factory 'http://www.asd.com/?asd=lol'
91
+ p_no_query = page_factory 'http://www.asd.com/?asdas=dasda&adsda=1'
92
+ @storage.include_query_string_in_uuid = false
93
+ @storage.add p
94
+ expect(@storage.exists?(p_no_query)).to be_truthy
95
+ @storage.remove p
96
+ end
97
+
98
+ it 'should store a page removing a query string from the uuid generation no ending slash' do
99
+ p = page_factory 'http://www.asd.com?asd=lol'
100
+ p_no_query = page_factory 'http://www.asd.com'
101
+ @storage.include_query_string_in_uuid = false
102
+ @storage.add p
103
+ expect(@storage.exists?(p_no_query)).to be_truthy
104
+ @storage.remove p
105
+ end
106
+
107
+ it 'should store a page with user data associated' do
108
+ p = page_factory 'http://www.user.com'
109
+ p.user_data.name = 'Test User Data'
110
+ @storage.add p
111
+ expect(@storage.exists?(p)).to be_truthy
112
+ p = @storage.get(p)
113
+ expect(p.user_data.name).to eq('Test User Data')
114
+ @storage.remove p
115
+ end
116
+
117
+ it 'should honor the except parameters' do
118
+ pag = page_factory 'http://www.user-doo.com'
119
+ expect(pag.code).to eq(200)
120
+ expect(pag.body).to eq('<html></html>')
121
+
122
+ @storage_without_code_and_body.add(pag)
123
+ pag = @storage_without_code_and_body.get(pag)
124
+
125
+ expect(pag.body).to be_nil
126
+ expect(pag.code).to eq(0)
127
+ @storage_without_code_and_body.remove(pag)
128
+ end
129
+
130
+ it 'should return false if a doc not exists' do
131
+ @storage.include_query_string_in_uuid = false
132
+ p_other = page_factory 'http://www.asdrrrr.com'
133
+ expect(@storage.exists?(p_other)).to be_falsey
134
+ @storage.add p_other
135
+ expect(@storage.exists?(p_other)).to be_truthy
136
+ p_other = page_factory 'http://www.asdrrrr.com?trk=asd-lol'
137
+ expect(@storage.exists?(p_other)).to be_truthy
138
+ @storage.include_query_string_in_uuid = true
139
+ expect(@storage.exists?(p_other)).to be_falsey
140
+ @storage.include_query_string_in_uuid = false
141
+ @storage.remove p_other
142
+ end
143
+
144
+ it 'should set page.fetched_at based on the id creation' do
145
+ p = page_factory 'http://www.user-doojo.com'
146
+ @storage.add p
147
+ expect(p.fetched_at).to be_nil
148
+ p = @storage.get p
149
+ expect(p.fetched_at).not_to be_nil
150
+ @storage.remove p
151
+ end
152
+
153
+ it 'should NOT set page.fetched_at if already present' do
154
+ p = page_factory 'http://www.user-doojooo.com'
155
+ p.fetched_at = 10
156
+ @storage.add p
157
+ p = @storage.get p
158
+ expect(p.fetched_at).to be 10
159
+ @storage.remove p
160
+ end
161
+
162
+ it 'should store two pages and the count will be two' do
163
+ pages = ['http://www.google.com', 'http://www.duckduckgo.com'].map do |url|
164
+ page_factory(url).tap do |page|
165
+ @storage.add(page)
166
+ end
167
+ end
168
+ expect(@storage.count).to be 2
169
+ pages.each do |page|
170
+ @storage.remove(page)
171
+ end
172
+ expect(@storage.count).to be 0
173
+ end
174
+ end
@@ -0,0 +1,44 @@
1
+ # Require this file using `require "spec_helper"`
2
+ # to ensure that it is only loaded once.
3
+ #
4
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
5
+ require 'digest/md5'
6
+ require 'coveralls'
7
+ require 'vcr'
8
+ require 'webmock/rspec'
9
+
10
+ Coveralls.wear!
11
+
12
+ VCR.configure do |c|
13
+ c.cassette_library_dir = "#{File.dirname(__FILE__)}/cassettes"
14
+ c.hook_into :webmock
15
+ end
16
+
17
+ require 'polipus'
18
+
19
+ RSpec.configure do |config|
20
+ config.run_all_when_everything_filtered = true
21
+ config.filter_run :focus
22
+
23
+ # Run specs in random order to surface order dependencies. If you find an
24
+ # order dependency and want to debug it, you can fix the order by providing
25
+ # the seed, which is printed after each run.
26
+ # --seed 1234
27
+ config.order = 'random'
28
+ config.mock_with :flexmock
29
+ config.around(:each) do |example|
30
+ t = Time.now
31
+ print example.metadata[:full_description]
32
+ VCR.use_cassette(Digest::MD5.hexdigest(example.metadata[:full_description])) do
33
+ example.run
34
+ puts " [#{Time.now - t}s]"
35
+ end
36
+ end
37
+ config.before(:each) { Polipus::SignalHandler.disable }
38
+ end
39
+
40
+ def page_factory(url, params = {})
41
+ params[:code] = 200 unless params.has_key?(:code)
42
+ params[:body] = '<html></html>' unless params.has_key?(:body)
43
+ Polipus::Page.new url, params
44
+ end
metadata CHANGED
@@ -1,18 +1,65 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: polipus-cassandra
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.3
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Stefano Fontanelli
9
+ - Edoardo Rossi
8
10
  autorequire:
9
11
  bindir: bin
10
12
  cert_chain: []
11
- date: 2015-03-04 00:00:00.000000000 Z
13
+ date: 2015-07-13 00:00:00.000000000 Z
12
14
  dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: cassandra-driver
17
+ requirement: !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ~>
21
+ - !ruby/object:Gem::Version
22
+ version: 2.0.1
23
+ - - ! '>='
24
+ - !ruby/object:Gem::Version
25
+ version: 2.0.1
26
+ type: :runtime
27
+ prerelease: false
28
+ version_requirements: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 2.0.1
34
+ - - ! '>='
35
+ - !ruby/object:Gem::Version
36
+ version: 2.0.1
37
+ - !ruby/object:Gem::Dependency
38
+ name: multi_json
39
+ requirement: !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ version: 1.11.0
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: 1.11.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ~>
54
+ - !ruby/object:Gem::Version
55
+ version: 1.11.0
56
+ - - ! '>='
57
+ - !ruby/object:Gem::Version
58
+ version: 1.11.0
13
59
  - !ruby/object:Gem::Dependency
14
60
  name: polipus
15
61
  requirement: !ruby/object:Gem::Requirement
62
+ none: false
16
63
  requirements:
17
64
  - - ~>
18
65
  - !ruby/object:Gem::Version
@@ -23,6 +70,7 @@ dependencies:
23
70
  type: :runtime
24
71
  prerelease: false
25
72
  version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
26
74
  requirements:
27
75
  - - ~>
28
76
  - !ruby/object:Gem::Version
@@ -31,28 +79,89 @@ dependencies:
31
79
  - !ruby/object:Gem::Version
32
80
  version: 0.3.0
33
81
  - !ruby/object:Gem::Dependency
34
- name: cassandra-driver
82
+ name: rake
35
83
  requirement: !ruby/object:Gem::Requirement
84
+ none: false
36
85
  requirements:
37
86
  - - ~>
38
87
  - !ruby/object:Gem::Version
39
- version: 2.1.1
40
- - - ! '>='
88
+ version: '10.3'
89
+ type: :development
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ~>
41
95
  - !ruby/object:Gem::Version
42
- version: 2.1.1
43
- type: :runtime
96
+ version: '10.3'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ none: false
101
+ requirements:
102
+ - - ~>
103
+ - !ruby/object:Gem::Version
104
+ version: 3.1.0
105
+ type: :development
44
106
  prerelease: false
45
107
  version_requirements: !ruby/object:Gem::Requirement
108
+ none: false
46
109
  requirements:
47
110
  - - ~>
48
111
  - !ruby/object:Gem::Version
49
- version: 2.1.1
50
- - - ! '>='
112
+ version: 3.1.0
113
+ - !ruby/object:Gem::Dependency
114
+ name: flexmock
115
+ requirement: !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ~>
119
+ - !ruby/object:Gem::Version
120
+ version: '1.3'
121
+ type: :development
122
+ prerelease: false
123
+ version_requirements: !ruby/object:Gem::Requirement
124
+ none: false
125
+ requirements:
126
+ - - ~>
51
127
  - !ruby/object:Gem::Version
52
- version: 2.1.1
128
+ version: '1.3'
53
129
  - !ruby/object:Gem::Dependency
54
- name: rake
130
+ name: vcr
131
+ requirement: !ruby/object:Gem::Requirement
132
+ none: false
133
+ requirements:
134
+ - - ~>
135
+ - !ruby/object:Gem::Version
136
+ version: 2.9.0
137
+ type: :development
138
+ prerelease: false
139
+ version_requirements: !ruby/object:Gem::Requirement
140
+ none: false
141
+ requirements:
142
+ - - ~>
143
+ - !ruby/object:Gem::Version
144
+ version: 2.9.0
145
+ - !ruby/object:Gem::Dependency
146
+ name: webmock
147
+ requirement: !ruby/object:Gem::Requirement
148
+ none: false
149
+ requirements:
150
+ - - ~>
151
+ - !ruby/object:Gem::Version
152
+ version: 1.20.0
153
+ type: :development
154
+ prerelease: false
155
+ version_requirements: !ruby/object:Gem::Requirement
156
+ none: false
157
+ requirements:
158
+ - - ~>
159
+ - !ruby/object:Gem::Version
160
+ version: 1.20.0
161
+ - !ruby/object:Gem::Dependency
162
+ name: coveralls
55
163
  requirement: !ruby/object:Gem::Requirement
164
+ none: false
56
165
  requirements:
57
166
  - - ! '>='
58
167
  - !ruby/object:Gem::Version
@@ -60,6 +169,7 @@ dependencies:
60
169
  type: :development
61
170
  prerelease: false
62
171
  version_requirements: !ruby/object:Gem::Requirement
172
+ none: false
63
173
  requirements:
64
174
  - - ! '>='
65
175
  - !ruby/object:Gem::Version
@@ -67,33 +177,51 @@ dependencies:
67
177
  description: Add support for Cassandra in Polipus crawler
68
178
  email:
69
179
  - s.fontanelli@gmail.com
180
+ - edoardo@gild.com
70
181
  executables: []
71
182
  extensions: []
72
183
  extra_rdoc_files: []
73
- files: []
184
+ files:
185
+ - .gitignore
186
+ - .rspec
187
+ - .ruby-version
188
+ - Gemfile
189
+ - LICENSE.txt
190
+ - README.md
191
+ - Rakefile
192
+ - lib/polipus-cassandra.rb
193
+ - lib/polipus-cassandra/policies/default.rb
194
+ - lib/polipus-cassandra/policies/policies.rb
195
+ - lib/polipus-cassandra/queue_overflow/cassandra_queue.rb
196
+ - lib/polipus-cassandra/storage/cassandra_store.rb
197
+ - polipus-cassandra.gemspec
198
+ - spec/polipus-cassandra/storage/cassandra_store_spec.rb
199
+ - spec/spec_helper.rb
74
200
  homepage: https://github.com/stefanofontanelli/polipus-cassandra
75
201
  licenses:
76
202
  - MIT
77
- metadata: {}
78
203
  post_install_message:
79
204
  rdoc_options: []
80
205
  require_paths:
81
206
  - lib
82
207
  required_ruby_version: !ruby/object:Gem::Requirement
208
+ none: false
83
209
  requirements:
84
210
  - - ! '>='
85
211
  - !ruby/object:Gem::Version
86
212
  version: '0'
87
213
  required_rubygems_version: !ruby/object:Gem::Requirement
214
+ none: false
88
215
  requirements:
89
216
  - - ! '>='
90
217
  - !ruby/object:Gem::Version
91
218
  version: '0'
92
219
  requirements: []
93
220
  rubyforge_project:
94
- rubygems_version: 2.2.2
221
+ rubygems_version: 1.8.23.2
95
222
  signing_key:
96
- specification_version: 4
223
+ specification_version: 3
97
224
  summary: Add support for Cassandra in Polipus crawler
98
- test_files: []
99
- has_rdoc:
225
+ test_files:
226
+ - spec/polipus-cassandra/storage/cassandra_store_spec.rb
227
+ - spec/spec_helper.rb
checksums.yaml DELETED
@@ -1,15 +0,0 @@
1
- ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- NzYzOTkyMmJmOThiZTA5ZmQwNDJkZDM3MjA2NTk3MWQwMTcxMjEyMg==
5
- data.tar.gz: !binary |-
6
- Njg3ODc1ZmJiYTBkZmQ3NTY2MThjOGQ4Yjg0ZDFlZjcyOTUzMDI4MA==
7
- SHA512:
8
- metadata.gz: !binary |-
9
- OWZiMWEwOTU1NTZlMDNhODNkZTdkZmY5MmIyMDc2YTBmZWVmMTI0MjU3ZWNm
10
- YzcyMGQ0NDQyOTc0MGIxOTE1YjJjOTk5MjYyYjg4NDJkOTQ5NjI1NWIyMzk4
11
- MzExOGJlNjM3MGEyNzFlZGIxNTkxYzlkMDQ0NjJhMDQ4OGQ3NDk=
12
- data.tar.gz: !binary |-
13
- NTM0MWI0M2Q2ZTlkMjcxMGQ1ZTkwYTBjY2M2NTRmNjcyYWM2Nzc0ZjExNzQ5
14
- MGZhNmZmYTExMzgzMGMyYmQ0ZTYyMDYzZmQ1MjE2YjM2MDI3NTQzNDlmZDBk
15
- NmZmN2M1NjY5NGZjY2QyMzk1MjRhZjBlMWUwY2FhZmU0MDdhMWE=