pyper_rb 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/Gemfile +24 -0
- data/LICENSE.txt +22 -0
- data/README.md +178 -0
- data/Rakefile +10 -0
- data/lib/pyper/all.rb +4 -0
- data/lib/pyper/pipeline.rb +63 -0
- data/lib/pyper/pipes/cassandra/all_items_reader.rb +40 -0
- data/lib/pyper/pipes/cassandra/deleter.rb +19 -0
- data/lib/pyper/pipes/cassandra/mod_key.rb +32 -0
- data/lib/pyper/pipes/cassandra/mod_key_reader.rb +41 -0
- data/lib/pyper/pipes/cassandra/pagination_decoding.rb +22 -0
- data/lib/pyper/pipes/cassandra/pagination_encoding.rb +17 -0
- data/lib/pyper/pipes/cassandra/reader.rb +35 -0
- data/lib/pyper/pipes/cassandra/writer.rb +24 -0
- data/lib/pyper/pipes/cassandra.rb +8 -0
- data/lib/pyper/pipes/content/fetch.rb +30 -0
- data/lib/pyper/pipes/content/store.rb +36 -0
- data/lib/pyper/pipes/content.rb +2 -0
- data/lib/pyper/pipes/default_values.rb +15 -0
- data/lib/pyper/pipes/field_rename.rb +23 -0
- data/lib/pyper/pipes/force_enumerator.rb +13 -0
- data/lib/pyper/pipes/model/attribute_deserializer.rb +27 -0
- data/lib/pyper/pipes/model/attribute_serializer.rb +34 -0
- data/lib/pyper/pipes/model/attribute_validation.rb +57 -0
- data/lib/pyper/pipes/model/virtus_deserializer.rb +39 -0
- data/lib/pyper/pipes/model/virtus_parser.rb +13 -0
- data/lib/pyper/pipes/model.rb +5 -0
- data/lib/pyper/pipes/no_op.rb +15 -0
- data/lib/pyper/pipes/pry.rb +9 -0
- data/lib/pyper/pipes/remove_fields.rb +22 -0
- data/lib/pyper/pipes.rb +8 -0
- data/lib/pyper/version.rb +3 -0
- data/lib/pyper.rb +4 -0
- data/pyper_rb.gemspec +22 -0
- data/test/fixtures/cass_schema_config.yml +6 -0
- data/test/fixtures/test_datastore/schema.cql +23 -0
- data/test/test_helper.rb +34 -0
- data/test/unit/pyper/pipeline_test.rb +81 -0
- data/test/unit/pyper/pipes/cassandra/all_items_reader_test.rb +47 -0
- data/test/unit/pyper/pipes/cassandra/deleter_test.rb +37 -0
- data/test/unit/pyper/pipes/cassandra/mod_key_reader_test.rb +47 -0
- data/test/unit/pyper/pipes/cassandra/pagination_decoding_test.rb +29 -0
- data/test/unit/pyper/pipes/cassandra/pagination_encoding_test.rb +29 -0
- data/test/unit/pyper/pipes/cassandra/reader_test.rb +79 -0
- data/test/unit/pyper/pipes/cassandra/writer_test.rb +51 -0
- data/test/unit/pyper/pipes/content/fetch_test.rb +38 -0
- data/test/unit/pyper/pipes/content/store_test.rb +49 -0
- data/test/unit/pyper/pipes/field_rename_test.rb +24 -0
- data/test/unit/pyper/pipes/model/attribute_deserializer_test.rb +69 -0
- data/test/unit/pyper/pipes/model/attribute_serializer_test.rb +60 -0
- data/test/unit/pyper/pipes/model/attribute_validation_test.rb +96 -0
- data/test/unit/pyper/pipes/model/virtus_deserializer_test.rb +75 -0
- data/test/unit/pyper/pipes/no_op_test.rb +12 -0
- data/test/unit/pyper/pipes/remove_fields_test.rb +24 -0
- metadata +147 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 646bae638d6adae8b2456c3b6cfc07a7d0f0968f
|
4
|
+
data.tar.gz: 44c1fc2cc772dd12205a052f3e974ecec835d553
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 29e850650ad4223b4e34920b7fb2d0d87ebc7434e09867bf7d6232f19f1dd43fae76c9fcf7e44c4d83508a838b78854575b3b497790f4b2bc933e128eaddda90
|
7
|
+
data.tar.gz: 7f741d99c2d945128b6b8c454cfd84d6709d18955b5f0cc29c35dda57b4ca78a448797d68705f5cb352017491f02806387d1e3acbc334b17bdc5f3cc50e81531
|
data/.gitignore
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in pyper_rb.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
group :development, :test do
|
7
|
+
gem "pry"
|
8
|
+
gem "awesome_print"
|
9
|
+
gem 'm', :git => 'git@github.com:ANorwell/m.git', :branch => 'minitest_5'
|
10
|
+
gem 'cass_schema', :git => 'git@github.com:backupify/cass_schema.git', :tag => "0.0.4"
|
11
|
+
|
12
|
+
# make sure to use v2.0.1 to avoid issues with super column families
|
13
|
+
gem 'cassandra-driver', :git => 'git@github.com:datastax/ruby-driver.git', :tag => 'v2.0.1'
|
14
|
+
end
|
15
|
+
|
16
|
+
group :test do
|
17
|
+
gem 'minitest_should', :git => 'git@github.com:citrus/minitest_should.git'
|
18
|
+
gem "google-api-client", "0.7.1"
|
19
|
+
gem "mocha"
|
20
|
+
gem 'virtus'
|
21
|
+
end
|
22
|
+
|
23
|
+
gem 'storage_strategy', :git => 'git@github.com:backupify/storage_strategy.git'
|
24
|
+
gem 'cassava', :git => 'git@github.com:backupify/cassava.git'
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 Datto
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
# Pyper
|
2
|
+
|
3
|
+
Flexible pipelines for content storage and retrieval.
|
4
|
+
|
5
|
+
Pyper allows the construction of pipelines to store and retrieve data. Each pipe in the pipeline modifies the
|
6
|
+
information in the pipeline before passing it to the next step. By composing pipes in different ways, different
|
7
|
+
data access patterns can be created.
|
8
|
+
|
9
|
+
## Usage
|
10
|
+
|
11
|
+
Require the pyper library and the pipes that you need:
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
require 'pyper'
|
15
|
+
require 'pyper/pipes/model' # Import model-related pipes
|
16
|
+
require 'pyper/pipes/cassandra' # Import Cassandra-related pipes
|
17
|
+
require 'pyper/pipes/content' # Import content storage-related pipes
|
18
|
+
```
|
19
|
+
|
20
|
+
Or, import the entire library using `require 'pyper/all'`
|
21
|
+
|
22
|
+
Create a pipeline composed of a set of pipes:
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
write_pipeline = Pyper::Pipeline.create do
|
26
|
+
add Pyper::Pipes::Model::AttributeSerializer.new
|
27
|
+
add Pyper::Pipes::FieldRename.new(:to => :to_emails, :from => :from_email)
|
28
|
+
add Pyper::Pipes::Cassandra::ModKey.new
|
29
|
+
add Pyper::Pipes::Cassandra::Writer.new(:table_1, metadata_client)
|
30
|
+
add Pyper::Pipes::Cassandra::Writer.new(:table_2, indexes_client)
|
31
|
+
add Pyper::Pipes::Cassandra::Writer.new(:table_3, indexes_client)
|
32
|
+
end
|
33
|
+
```
|
34
|
+
|
35
|
+
Then, push data down the pipe:
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
result = write_pipeline.push(attributes)
|
39
|
+
```
|
40
|
+
|
41
|
+
View the value of the set of successive transformations performed by the pipe:
|
42
|
+
```ruby
|
43
|
+
result.value
|
44
|
+
```
|
45
|
+
|
46
|
+
A pipeline performs a bunch of sequential transformations to the data being passed down the pipe. It may also have side
|
47
|
+
effects, such as storing data. The specific pipes provided in this library are aimed at two uses: writing and
|
48
|
+
reading data.
|
49
|
+
|
50
|
+
A write pipeline takes an initial set of attributes, performing a set of transformations such as serialization and so on,
|
51
|
+
before storing the data in one or more storage outputs. For example, this gem provides storage pipes for Cassandra and
|
52
|
+
Amazon S3, but it is easy to write a pipe for other storage backends.
|
53
|
+
|
54
|
+
Conversely, a read pipeline initially takes a set of options. These options may be transformed by the pipeline, and then used
|
55
|
+
to read data from an external source. This data may then be transformed by the pipeline - for example, performing
|
56
|
+
deserialization or data mapping operations.
|
57
|
+
|
58
|
+
```ruby
|
59
|
+
read_pipeline = Pyper::Pipeline.create do
|
60
|
+
add Pyper::Pipes::Cassandra::PaginationDecoding.new
|
61
|
+
add Pyper::Pipes::Cassandra::Reader.new(:table, indexes_client)
|
62
|
+
add Pyper::Pipes::FieldRename.new(:to_emails => :to, :from_email => :from)
|
63
|
+
add Pyper::Pipes::Cassandra::PaginationEncoding.new
|
64
|
+
add Pyper::Pipes::Model::VirtusDeserializer.new(message_attributes)
|
65
|
+
add Pyper::Pipes::Model::VirtusParser.new(MyModelClass)
|
66
|
+
end
|
67
|
+
|
68
|
+
result = read_pipeline.push(:row => '1', :id => 'i', :page_token => 'sdf')
|
69
|
+
result.value # Enumerator with matching instances of MyModelClass
|
70
|
+
```
|
71
|
+
|
72
|
+
Note that pipe order matters. In the example read pipe above, `Cassandra::PaginationDecoding` decodes pagination options, thus
|
73
|
+
performing an operation on the initial options provided. The `Cassandra::Reader` pipe uses the options to retrieve items from
|
74
|
+
Cassandra, and subsequent elements of the pipeline are designed to transform this retrieved data. Thus, it would not be
|
75
|
+
sensible for the `Cassandra::PaginationDecoding` pipe to come after the `Cassandra::Reader` pipe.
|
76
|
+
|
77
|
+
### Creating and using pipelines
|
78
|
+
|
79
|
+
A pipeline is an instance of `Pyper::Pipeline`, to which pipes are appended using the `<<` or `add` operators.
|
80
|
+
|
81
|
+
```ruby
|
82
|
+
my_pipeline = Pyper::Pipeline.new <<
|
83
|
+
Pyper::Pipes::Cassandra::PaginationDecoding.new <<
|
84
|
+
Pyper::Pipes::Cassandra::Reader.new(:table, indexes_client) <<
|
85
|
+
Pyper::Pipes::Cassandra::PaginationEncoding.new
|
86
|
+
```
|
87
|
+
|
88
|
+
However, the `create` method makes pipeline construction easier. The above example becomes the following:
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
my_pipeline = Pyper::Pipeline.create do
|
92
|
+
add Pyper::Pipes::Cassandra::PaginationDecoding.new
|
93
|
+
add Pyper::Pipes::Cassandra::Reader.new(:table, indexes_client)
|
94
|
+
add Pyper::Pipes::Cassandra::PaginationEncoding.new
|
95
|
+
end
|
96
|
+
```
|
97
|
+
|
98
|
+
To invoke the pipeline, use the `push` method and provide the data to enter the pipeline:
|
99
|
+
|
100
|
+
```ruby
|
101
|
+
pipe_status = my_pipeline.push(:row => '1', :id => 'i')
|
102
|
+
```
|
103
|
+
|
104
|
+
Here, `pipe_status` is a `Pyper::PipeStatus` object, which contains two attributes, `pipe_status.value` and
|
105
|
+
`pipe_status.status`. The value is the returned result of the series of transformations applied by the pipeline. The status
|
106
|
+
contains metadata about the push operation that might be created by each pipe in the pipeline.
|
107
|
+
|
108
|
+
### Creating new pipes
|
109
|
+
|
110
|
+
A pipe must implement the `call` method (or, alternatively, a `pipe` method), which takes two arguments: the object entering the pipe, as well as the status. It
|
111
|
+
should return the object leaving the pipe:
|
112
|
+
|
113
|
+
```ruby
|
114
|
+
class MyPipe
|
115
|
+
def call(attributes, status = {})
|
116
|
+
attributes[:c] = attributes[:a] + attributes[:b]
|
117
|
+
status[:processed_by_my_pipe] = true
|
118
|
+
attributes
|
119
|
+
end
|
120
|
+
end
|
121
|
+
```
|
122
|
+
|
123
|
+
This example pipe above modifies `attributes` before returning it. It also sets a flag on the status object.
|
124
|
+
|
125
|
+
Note that because the pipe need only respond to `call`, lambdas and procs are valid pipes.
|
126
|
+
|
127
|
+
Generally, pipes in a write pipeline operate on an attributes hash (containing the attributes meant to be written to a data
|
128
|
+
store). Pipes in a read pipeline initially might modify arguments. A data retrieval pipe would then use the arguments to
|
129
|
+
fetch data, and subsequent pipes would perform operations on the enumeration of data items. Thus, a read pipe might look
|
130
|
+
something like:
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
class Deserialize
|
134
|
+
def call(items, status = {})
|
135
|
+
items.map { |item| deserialize(item) }
|
136
|
+
end
|
137
|
+
|
138
|
+
def deserialize(item)
|
139
|
+
# ...
|
140
|
+
end
|
141
|
+
end
|
142
|
+
```
|
143
|
+
|
144
|
+
### Debugging Pipelines
|
145
|
+
|
146
|
+
Because pipes are expected to respond to `#call` you can simply add Procs and Lambdas in your code to debug pipelines:
|
147
|
+
|
148
|
+
```rb
|
149
|
+
pl = Pyper::Pipeline.create do
|
150
|
+
add Pyper::Pipes::Model::AttributeSerializer.new
|
151
|
+
add -> (*args) { binding.pry }
|
152
|
+
add Pyper::Pipes::Cassandra::Writer.new(:my_table, client, fields)
|
153
|
+
end
|
154
|
+
```
|
155
|
+
|
156
|
+
## Installation
|
157
|
+
|
158
|
+
Add this line to your application's Gemfile:
|
159
|
+
|
160
|
+
```ruby
|
161
|
+
gem 'pyper_rb', :git => 'git@github.com:backupify/pyper.git'
|
162
|
+
```
|
163
|
+
|
164
|
+
And then execute:
|
165
|
+
|
166
|
+
$ bundle
|
167
|
+
|
168
|
+
Or install it yourself as:
|
169
|
+
|
170
|
+
$ gem install pyper_rb
|
171
|
+
|
172
|
+
## Contributing
|
173
|
+
|
174
|
+
1. Fork it ( https://github.com/backupify/pyper/fork )
|
175
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
176
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
177
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
178
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/lib/pyper/all.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require_relative 'pipes'
|
2
|
+
|
3
|
+
module Pyper
|
4
|
+
|
5
|
+
class PipeStatus < Struct.new(:value, :status); end
|
6
|
+
|
7
|
+
class Pipeline
|
8
|
+
class << self
|
9
|
+
|
10
|
+
# Provides an interface for creating a pipeline. The provided block will be called
|
11
|
+
# in the context of a newly-created pipeline, to which pipes can be added using #add.
|
12
|
+
# @return [Pyper::Pipeline] The created pipeline.
|
13
|
+
def create(&block)
|
14
|
+
new.tap do |pipeline|
|
15
|
+
if block_given?
|
16
|
+
original_self = eval('self', block.binding)
|
17
|
+
pipeline.instance_variable_set(:@original_self, original_self)
|
18
|
+
pipeline.instance_eval(&block)
|
19
|
+
pipeline.remove_instance_variable(:@original_self)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
attr_reader :pipes
|
26
|
+
|
27
|
+
def initialize(pipes = [])
|
28
|
+
@pipes = pipes
|
29
|
+
end
|
30
|
+
|
31
|
+
# @param pipe [#pipe|#call] A pipe to append to the pipeline
|
32
|
+
def <<(pipe)
|
33
|
+
pipes << pipe
|
34
|
+
self
|
35
|
+
end
|
36
|
+
|
37
|
+
alias_method :add, :<<
|
38
|
+
|
39
|
+
# Insert something into the pipeline to be processed
|
40
|
+
# @param input [Object] The original input data to enter the pipeline. This may be mutated by each pipe in the pipeline.
|
41
|
+
# @return [PipeStatus] the pipe status, containing both the value and a status hash.
|
42
|
+
def push(input)
|
43
|
+
status = {}
|
44
|
+
value = pipes.inject(input) do |attributes, p|
|
45
|
+
if p.respond_to?(:call)
|
46
|
+
p.call(attributes, status)
|
47
|
+
else
|
48
|
+
p.pipe(attributes, status)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
PipeStatus.new(value, status)
|
53
|
+
end
|
54
|
+
|
55
|
+
def method_missing(sym, *args, &block)
|
56
|
+
@original_self ? @original_self.send(sym, *args, &block) : super
|
57
|
+
end
|
58
|
+
|
59
|
+
def respond_to_missing?(sym, include_all = false)
|
60
|
+
@original_self ? @original_self.respond_to?(sym, include_all) : super
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
# A pipe for reading all items from a single row in cassandra
|
3
|
+
# @param [Symbol] table name
|
4
|
+
# @param [Cassava::Client] client to query cassandra with
|
5
|
+
class AllItemsReader < Struct.new(:table, :client)
|
6
|
+
attr_reader :page_size
|
7
|
+
|
8
|
+
# @param table [Symbol] the name of the cassandra table to fetch data from
|
9
|
+
# @param client [Cassava::Client]
|
10
|
+
# @param mod_size [Integer] the mod size
|
11
|
+
# @param page_size [Integer] the page size
|
12
|
+
def initialize(table, client, page_size = 1000)
|
13
|
+
@table = table
|
14
|
+
@client = client
|
15
|
+
@page_size = page_size
|
16
|
+
end
|
17
|
+
|
18
|
+
# @param arguments [Hash] Arguments passed to the cassandra client where statement
|
19
|
+
# @option arguments [Array] :order A pair [clustering_column, :desc|:asc] determining how to order the results.
|
20
|
+
# @option arguments [Integer] :page_size
|
21
|
+
# @param status [Hash] The mutable status field
|
22
|
+
# @return [Enumerator::Lazy<Hash>] enumerator of items
|
23
|
+
def pipe(arguments, status = {})
|
24
|
+
columns = arguments.delete(:columns)
|
25
|
+
enum = Enumerator.new do |yielder|
|
26
|
+
options = { :page_size => page_size }
|
27
|
+
paging_state = nil
|
28
|
+
loop do
|
29
|
+
options[:paging_state] = paging_state if paging_state.present?
|
30
|
+
result = @client.select(@table, columns).where(arguments).execute(options)
|
31
|
+
result.each { |item| yielder << item }
|
32
|
+
|
33
|
+
break if result.last_page?
|
34
|
+
paging_state = result.paging_state
|
35
|
+
end
|
36
|
+
end
|
37
|
+
enum.lazy
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
# Deletes from a specified cassandra table.
|
3
|
+
|
4
|
+
# @param table_name [Symbol] The table from which to delete
|
5
|
+
# @param client [Cassava::Client] client to query cassandra with
|
6
|
+
class Deleter < Struct.new(:table_name, :client)
|
7
|
+
# @param arguments [Hash] Should contain the primary keys to delete. Can contain a :columns key to remove specific values.
|
8
|
+
# @param status [Hash] The mutable status field
|
9
|
+
# @return [Hash] The original attributes
|
10
|
+
def pipe(arguments, status = {})
|
11
|
+
local_args = arguments.dup
|
12
|
+
columns = local_args.delete(:columns)
|
13
|
+
|
14
|
+
statement = columns.present? ? client.delete(table_name, columns) : client.delete(table_name)
|
15
|
+
statement.where(local_args).execute
|
16
|
+
arguments
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
# Adds the :mod_key field to the output attributes, which is based on the hash of
|
3
|
+
# a particular field in the input attributes.
|
4
|
+
# @example
|
5
|
+
# If the pipe is configured with an id field of :id, then the input
|
6
|
+
# { id: 'abc' }
|
7
|
+
# would result in an output of
|
8
|
+
# { id: 'abc', mod_key: 22 }
|
9
|
+
# Here the value 22 is within the range [0,mod_size - 1] and is uniquely
|
10
|
+
# determined by id.
|
11
|
+
class ModKey
|
12
|
+
attr_reader :mod_size, :id_field
|
13
|
+
|
14
|
+
# @param mod_size [Integer] mod keys will fall within the range [0, mod_size - 1]
|
15
|
+
# @param id_field [Symbol] the attribute to use when generating the mod key.
|
16
|
+
def initialize(mod_size = 100, id_field = :id)
|
17
|
+
@mod_size = mod_size
|
18
|
+
@id_field = id_field
|
19
|
+
end
|
20
|
+
|
21
|
+
# @param attributes [Hash] An attribute hash
|
22
|
+
# @param status [Hash] The mutable status field
|
23
|
+
# @return [Hash] The attribute hash with the mod_key field added
|
24
|
+
def pipe(attributes, status)
|
25
|
+
attributes.merge!(:mod_key => mod(attributes[id_field]))
|
26
|
+
end
|
27
|
+
|
28
|
+
def mod(value)
|
29
|
+
Zlib::crc32(value) % mod_size
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
|
3
|
+
# This pipe is for reading data from sharded rows in Cassandra. The table must have rows sharded by the 'mod_key' field.
|
4
|
+
# For a fixed number of such shards, this pipe reads all data from all of those shards, returning a lazy enumerator
|
5
|
+
# over all of those rows.
|
6
|
+
# For example, if mod_size is 100, it will read the 100 rows with mod_key between 0 and 99.
|
7
|
+
class ModKeyReader
|
8
|
+
|
9
|
+
# @param table [Symbol] the name of the cassandra table to fetch data from
|
10
|
+
# @param client [Cassava::Client]
|
11
|
+
# @param mod_size [Integer] the mod size
|
12
|
+
# @param page_size [Integer] the page size
|
13
|
+
attr_reader :table, :client, :mod_size, :page_size
|
14
|
+
def initialize(table, client, mod_size = 100, page_size = 1000)
|
15
|
+
@table = table
|
16
|
+
@client = client
|
17
|
+
@mod_size = mod_size
|
18
|
+
@page_size = page_size
|
19
|
+
end
|
20
|
+
|
21
|
+
# @param arguments [Hash] Arguments passed to the cassandra client where statement
|
22
|
+
# @param status [Hash] The mutable status field
|
23
|
+
# @return [Enumerator::Lazy<Hash>] enumerator of items from all rows
|
24
|
+
def pipe(arguments, status = {})
|
25
|
+
(Enumerator.new do |yielder|
|
26
|
+
(0...mod_size).each do |mod_id|
|
27
|
+
options = { :page_size => page_size }
|
28
|
+
paging_state = nil
|
29
|
+
loop do
|
30
|
+
options[:paging_state] = paging_state if paging_state.present?
|
31
|
+
result = client.select(table).where(arguments.merge(:mod_key => mod_id)).execute(options)
|
32
|
+
result.each { |item| yielder << item }
|
33
|
+
|
34
|
+
break if result.last_page?
|
35
|
+
paging_state = result.paging_state
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end).lazy
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'base64'
|
2
|
+
|
3
|
+
module Pyper::Pipes::Cassandra
|
4
|
+
# This pipe extracts an encoded paging_state, decodes it, and passes on a decoded
|
5
|
+
# paging state.
|
6
|
+
# This pipe is intended to be used before the Cassandra::Reader pipe, as that pipe
|
7
|
+
# can interpret the :paging_state argument.
|
8
|
+
#
|
9
|
+
# This pipe pairs with the PaginationEncoding pipe, which performs the reverse
|
10
|
+
# transformation
|
11
|
+
class PaginationDecoding
|
12
|
+
|
13
|
+
# @param args [Hash] Arguments that include an encoded :paging_state
|
14
|
+
# @param status [Hash] The mutable status field
|
15
|
+
# @return [Hash] The list of arguments with :paging_state decoded, if present
|
16
|
+
def pipe(args, status = {})
|
17
|
+
page_state = args[:paging_state]
|
18
|
+
args[:paging_state] = Base64.urlsafe_decode64(page_state) if page_state
|
19
|
+
args
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'base64'
|
2
|
+
|
3
|
+
module Pyper::Pipes::Cassandra
|
4
|
+
# Given a :paging_state in the status field, encodes it. This is the reverse transformation of
|
5
|
+
# the PaginationDecoding pipe.
|
6
|
+
class PaginationEncoding
|
7
|
+
|
8
|
+
# @param items [Enumerable<Hash>]
|
9
|
+
# @param status [Hash] The mutable status field
|
10
|
+
# @return [Enumerable<Hash>] The unchanged list of items
|
11
|
+
def pipe(items, status)
|
12
|
+
page_state = status[:paging_state]
|
13
|
+
status[:paging_state] = Base64.urlsafe_encode64(page_state) if page_state
|
14
|
+
items
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
# A pipe for reading items from a single row in cassandra
|
3
|
+
# @param [Symbol] table name
|
4
|
+
# @param [Cassava::Client] client to query cassandra with
|
5
|
+
# @param [Hash] Additional/default options to pass to the Cassava execute statement.
|
6
|
+
class Reader < Struct.new(:table, :client, :options)
|
7
|
+
# @param arguments [Hash] Arguments passed to the cassandra client where statement
|
8
|
+
# @option arguments [Integer] :limit
|
9
|
+
# @option arguments [Array] :order A pair [clustering_column, :desc|:asc] determining how to order the results.
|
10
|
+
# @option arguments [Object] :paging_state
|
11
|
+
# @option arguments [Integer] :page_size
|
12
|
+
# @param status [Hash] The mutable status field
|
13
|
+
# @return [Enumerator::Lazy<Hash>] enumerator of items
|
14
|
+
def pipe(arguments, status = {})
|
15
|
+
limit = arguments.delete(:limit)
|
16
|
+
page_size = arguments.delete(:page_size)
|
17
|
+
paging_state = arguments.delete(:paging_state)
|
18
|
+
order = arguments.delete(:order)
|
19
|
+
columns = arguments.delete(:columns)
|
20
|
+
|
21
|
+
opts = (options || {}).merge({ page_size: page_size, paging_state: paging_state})
|
22
|
+
|
23
|
+
query = client.select(table, columns).where(arguments)
|
24
|
+
query = query.limit(limit) if limit
|
25
|
+
query = query.order(order.first, order.last) if order
|
26
|
+
|
27
|
+
result = query.execute(opts)
|
28
|
+
|
29
|
+
status[:paging_state] = result.paging_state
|
30
|
+
status[:last_page] = result.last_page?
|
31
|
+
|
32
|
+
result.rows.lazy
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
# Writes a set of attributes to a specified cassandra table.
|
3
|
+
|
4
|
+
# @param table_name [Symbol] The table in which to store the attributes
|
5
|
+
# @param client [Cassava::Client] client to query cassandra with
|
6
|
+
# @param attribute_filter_set [Set] Optionally, a set of attributes which should be written. If none is provided,
|
7
|
+
# all attributes will be written.
|
8
|
+
class Writer < Struct.new(:table_name, :client, :attribute_filter_set)
|
9
|
+
|
10
|
+
# @param attributes [Hash] Attributes to store in cassandra
|
11
|
+
# @param status [Hash] The mutable status field
|
12
|
+
# @return [Hash] The original attributes
|
13
|
+
def pipe(attributes, status = {})
|
14
|
+
attributes_to_write = if attribute_filter_set
|
15
|
+
attributes.select { |k,v| attribute_filter_set.member?(k) }
|
16
|
+
else
|
17
|
+
attributes
|
18
|
+
end
|
19
|
+
|
20
|
+
client.insert(table_name, attributes_to_write)
|
21
|
+
attributes
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
require_relative 'cassandra/reader'
|
2
|
+
require_relative 'cassandra/writer'
|
3
|
+
require_relative 'cassandra/mod_key'
|
4
|
+
require_relative 'cassandra/mod_key_reader'
|
5
|
+
require_relative 'cassandra/all_items_reader'
|
6
|
+
require_relative 'cassandra/deleter'
|
7
|
+
require_relative 'cassandra/pagination_decoding'
|
8
|
+
require_relative 'cassandra/pagination_encoding'
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Pyper::Pipes::Content
|
2
|
+
class Fetch
|
3
|
+
|
4
|
+
attr_reader :storage_field, :storage_strategy_builder
|
5
|
+
|
6
|
+
# @param storage_field [Symbol] For each item hash, the field in which to insert the content
|
7
|
+
# @param storage_strategy_builder [Block] A block that takes an item and returns a StorageStrategy.
|
8
|
+
def initialize(storage_field, &storage_strategy_builder)
|
9
|
+
@storage_field = storage_field
|
10
|
+
@storage_strategy_builder = storage_strategy_builder
|
11
|
+
end
|
12
|
+
|
13
|
+
# @param items [Enumerable<Hash>] A list of items
|
14
|
+
# @param status [Hash] The mutable status field
|
15
|
+
# @return [Enumerable<Hash>] The items, with the retrieved content inserted in the storage field
|
16
|
+
def pipe(items, status = {})
|
17
|
+
items.map do |item|
|
18
|
+
strategy = storage_strategy_builder.call(item)
|
19
|
+
|
20
|
+
content =
|
21
|
+
begin
|
22
|
+
strategy.read
|
23
|
+
rescue Errno::ENOENT, StorageStrategy::NotFound
|
24
|
+
nil
|
25
|
+
end
|
26
|
+
item.merge(storage_field => content)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Pyper::Pipes::Content
|
2
|
+
# A pipe for storing content to an object store. Uses the StorageStrategy gem.
|
3
|
+
class Store
|
4
|
+
|
5
|
+
attr_reader :storage_field, :storage_strategy_builder
|
6
|
+
|
7
|
+
# @param storage_field [Symbol] The attributes field in which the content is located.
|
8
|
+
# @param storage_strategy_builder [Block] A block that takes an item and returns a StorageStrategy.
|
9
|
+
def initialize(storage_field, &storage_strategy_builder)
|
10
|
+
@storage_field = storage_field
|
11
|
+
@storage_strategy_builder = storage_strategy_builder
|
12
|
+
end
|
13
|
+
|
14
|
+
# Stores content using the specified storage strategy
|
15
|
+
# @param attributes [Hash] The attributes of the item for which content is to be stored
|
16
|
+
# @param status [Hash] The mutable status field
|
17
|
+
# @return [Hash] The item attributes, with the storage_field deleted.
|
18
|
+
def pipe(attributes, status = {})
|
19
|
+
strategy = storage_strategy_builder.call(attributes)
|
20
|
+
|
21
|
+
content = attributes.delete(storage_field)
|
22
|
+
|
23
|
+
raise ArgumentError.new("#{storage_field} must be present in ContentStorage") unless content
|
24
|
+
|
25
|
+
case content
|
26
|
+
when NilClass then # do nothing -- there's no content to write
|
27
|
+
when String then strategy.write(content)
|
28
|
+
else strategy.write_from(content)
|
29
|
+
end
|
30
|
+
|
31
|
+
attributes.merge!(strategy.metadata)
|
32
|
+
|
33
|
+
attributes
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Pyper::Pipes
|
2
|
+
# @param default_values [Hash] A hash of default values to set within the provided attrs if they are not already present.
|
3
|
+
class DefaultValues < Struct.new(:default_values)
|
4
|
+
|
5
|
+
# @param attrs [Hash] The attributes of the item
|
6
|
+
# @param status [Hash] The mutable status field
|
7
|
+
# @return [Hash] The item attributes with default values inserted
|
8
|
+
def pipe(attrs, status = {})
|
9
|
+
default_values.each do |field, value|
|
10
|
+
attrs[field] = value unless attrs[field]
|
11
|
+
end
|
12
|
+
attrs
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|