pyper_rb 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/Gemfile +24 -0
- data/LICENSE.txt +22 -0
- data/README.md +178 -0
- data/Rakefile +10 -0
- data/lib/pyper/all.rb +4 -0
- data/lib/pyper/pipeline.rb +63 -0
- data/lib/pyper/pipes/cassandra/all_items_reader.rb +40 -0
- data/lib/pyper/pipes/cassandra/deleter.rb +19 -0
- data/lib/pyper/pipes/cassandra/mod_key.rb +32 -0
- data/lib/pyper/pipes/cassandra/mod_key_reader.rb +41 -0
- data/lib/pyper/pipes/cassandra/pagination_decoding.rb +22 -0
- data/lib/pyper/pipes/cassandra/pagination_encoding.rb +17 -0
- data/lib/pyper/pipes/cassandra/reader.rb +35 -0
- data/lib/pyper/pipes/cassandra/writer.rb +24 -0
- data/lib/pyper/pipes/cassandra.rb +8 -0
- data/lib/pyper/pipes/content/fetch.rb +30 -0
- data/lib/pyper/pipes/content/store.rb +36 -0
- data/lib/pyper/pipes/content.rb +2 -0
- data/lib/pyper/pipes/default_values.rb +15 -0
- data/lib/pyper/pipes/field_rename.rb +23 -0
- data/lib/pyper/pipes/force_enumerator.rb +13 -0
- data/lib/pyper/pipes/model/attribute_deserializer.rb +27 -0
- data/lib/pyper/pipes/model/attribute_serializer.rb +34 -0
- data/lib/pyper/pipes/model/attribute_validation.rb +57 -0
- data/lib/pyper/pipes/model/virtus_deserializer.rb +39 -0
- data/lib/pyper/pipes/model/virtus_parser.rb +13 -0
- data/lib/pyper/pipes/model.rb +5 -0
- data/lib/pyper/pipes/no_op.rb +15 -0
- data/lib/pyper/pipes/pry.rb +9 -0
- data/lib/pyper/pipes/remove_fields.rb +22 -0
- data/lib/pyper/pipes.rb +8 -0
- data/lib/pyper/version.rb +3 -0
- data/lib/pyper.rb +4 -0
- data/pyper_rb.gemspec +22 -0
- data/test/fixtures/cass_schema_config.yml +6 -0
- data/test/fixtures/test_datastore/schema.cql +23 -0
- data/test/test_helper.rb +34 -0
- data/test/unit/pyper/pipeline_test.rb +81 -0
- data/test/unit/pyper/pipes/cassandra/all_items_reader_test.rb +47 -0
- data/test/unit/pyper/pipes/cassandra/deleter_test.rb +37 -0
- data/test/unit/pyper/pipes/cassandra/mod_key_reader_test.rb +47 -0
- data/test/unit/pyper/pipes/cassandra/pagination_decoding_test.rb +29 -0
- data/test/unit/pyper/pipes/cassandra/pagination_encoding_test.rb +29 -0
- data/test/unit/pyper/pipes/cassandra/reader_test.rb +79 -0
- data/test/unit/pyper/pipes/cassandra/writer_test.rb +51 -0
- data/test/unit/pyper/pipes/content/fetch_test.rb +38 -0
- data/test/unit/pyper/pipes/content/store_test.rb +49 -0
- data/test/unit/pyper/pipes/field_rename_test.rb +24 -0
- data/test/unit/pyper/pipes/model/attribute_deserializer_test.rb +69 -0
- data/test/unit/pyper/pipes/model/attribute_serializer_test.rb +60 -0
- data/test/unit/pyper/pipes/model/attribute_validation_test.rb +96 -0
- data/test/unit/pyper/pipes/model/virtus_deserializer_test.rb +75 -0
- data/test/unit/pyper/pipes/no_op_test.rb +12 -0
- data/test/unit/pyper/pipes/remove_fields_test.rb +24 -0
- metadata +147 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 646bae638d6adae8b2456c3b6cfc07a7d0f0968f
|
4
|
+
data.tar.gz: 44c1fc2cc772dd12205a052f3e974ecec835d553
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 29e850650ad4223b4e34920b7fb2d0d87ebc7434e09867bf7d6232f19f1dd43fae76c9fcf7e44c4d83508a838b78854575b3b497790f4b2bc933e128eaddda90
|
7
|
+
data.tar.gz: 7f741d99c2d945128b6b8c454cfd84d6709d18955b5f0cc29c35dda57b4ca78a448797d68705f5cb352017491f02806387d1e3acbc334b17bdc5f3cc50e81531
|
data/.gitignore
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in pyper_rb.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
group :development, :test do
|
7
|
+
gem "pry"
|
8
|
+
gem "awesome_print"
|
9
|
+
gem 'm', :git => 'git@github.com:ANorwell/m.git', :branch => 'minitest_5'
|
10
|
+
gem 'cass_schema', :git => 'git@github.com:backupify/cass_schema.git', :tag => "0.0.4"
|
11
|
+
|
12
|
+
# make sure to use v2.0.1 to avoid issues with super column families
|
13
|
+
gem 'cassandra-driver', :git => 'git@github.com:datastax/ruby-driver.git', :tag => 'v2.0.1'
|
14
|
+
end
|
15
|
+
|
16
|
+
group :test do
|
17
|
+
gem 'minitest_should', :git => 'git@github.com:citrus/minitest_should.git'
|
18
|
+
gem "google-api-client", "0.7.1"
|
19
|
+
gem "mocha"
|
20
|
+
gem 'virtus'
|
21
|
+
end
|
22
|
+
|
23
|
+
gem 'storage_strategy', :git => 'git@github.com:backupify/storage_strategy.git'
|
24
|
+
gem 'cassava', :git => 'git@github.com:backupify/cassava.git'
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 Datto
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
# Pyper
|
2
|
+
|
3
|
+
Flexible pipelines for content storage and retrieval.
|
4
|
+
|
5
|
+
Pyper allows the construction of pipelines to store and retrieve data. Each pipe in the pipeline modifies the
|
6
|
+
information in the pipeline before passing it to the next step. By composing pipes in different ways, different
|
7
|
+
data access patterns can be created.
|
8
|
+
|
9
|
+
## Usage
|
10
|
+
|
11
|
+
Require the pyper library and the pipes that you need:
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
require 'pyper'
|
15
|
+
require 'pyper/pipes/model' # Import model-related pipes
|
16
|
+
require 'pyper/pipes/cassandra' # Import Cassandra-related pipes
|
17
|
+
require 'pyper/pipes/content' # Import content storage-related pipes
|
18
|
+
```
|
19
|
+
|
20
|
+
Or, import the entire library using `require 'pyper/all'`
|
21
|
+
|
22
|
+
Create a pipeline composed of a set of pipes:
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
write_pipeline = Pyper::Pipeline.create do
|
26
|
+
add Pyper::Pipes::Model::AttributeSerializer.new
|
27
|
+
add Pyper::Pipes::FieldRename.new(:to => :to_emails, :from => :from_email)
|
28
|
+
add Pyper::Pipes::Cassandra::ModKey.new
|
29
|
+
add Pyper::Pipes::Cassandra::Writer.new(:table_1, metadata_client)
|
30
|
+
add Pyper::Pipes::Cassandra::Writer.new(:table_2, indexes_client)
|
31
|
+
add Pyper::Pipes::Cassandra::Writer.new(:table_3, indexes_client)
|
32
|
+
end
|
33
|
+
```
|
34
|
+
|
35
|
+
Then, push data down the pipe:
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
result = write_pipeline.push(attributes)
|
39
|
+
```
|
40
|
+
|
41
|
+
View the value of the set of successive transformations performed by the pipe:
|
42
|
+
```ruby
|
43
|
+
result.value
|
44
|
+
```
|
45
|
+
|
46
|
+
A pipeline performs a bunch of sequential transformations to the data being passed down the pipe. It may also have side
|
47
|
+
effects, such as storing data. The specific pipes provided in this library are aimed at two uses: writing and
|
48
|
+
reading data.
|
49
|
+
|
50
|
+
A write pipeline takes an initial set of attributes, performing a set of transformations such as serialization and so on,
|
51
|
+
before storing the data in one or more storage outputs. For example, this gem provides storage pipes for Cassandra and
|
52
|
+
Amazon S3, but it is easy to write a pipe for other storage backends.
|
53
|
+
|
54
|
+
Conversely, a read pipeline initially takes a set of options. These options may be transformed by the pipeline, and then used
|
55
|
+
to read data from an external source. This data may then be transformed by the pipeline - for example, performing
|
56
|
+
deserialization or data mapping operations.
|
57
|
+
|
58
|
+
```ruby
|
59
|
+
read_pipeline = Pyper::Pipeline.create do
|
60
|
+
add Pyper::Pipes::Cassandra::PaginationDecoding.new
|
61
|
+
add Pyper::Pipes::Cassandra::Reader.new(:table, indexes_client)
|
62
|
+
add Pyper::Pipes::FieldRename.new(:to_emails => :to, :from_email => :from)
|
63
|
+
add Pyper::Pipes::Cassandra::PaginationEncoding.new
|
64
|
+
add Pyper::Pipes::Model::VirtusDeserializer.new(message_attributes)
|
65
|
+
add Pyper::Pipes::Model::VirtusParser.new(MyModelClass)
|
66
|
+
end
|
67
|
+
|
68
|
+
result = read_pipeline.push(:row => '1', :id => 'i', :paging_state => 'sdf')
|
69
|
+
result.value # Enumerator with matching instances of MyModelClass
|
70
|
+
```
|
71
|
+
|
72
|
+
Note that pipe order matters. In the example read pipe above, `Cassandra::PaginationDecoding` decodes pagination options, thus
|
73
|
+
performing an operation on the initial options provided. The `Cassandra::Reader` pipe uses the options to retrieve items from
|
74
|
+
Cassandra, and subsequent elements of the pipeline are designed to transform this retrieved data. Thus, it would not be
|
75
|
+
sensible for the `Cassandra::PaginationDecoding` pipe to come after the `Cassandra::Reader` pipe.
|
76
|
+
|
77
|
+
### Creating and using pipelines
|
78
|
+
|
79
|
+
A pipeline is an instance of `Pyper::Pipeline`, to which pipes are appended using the `<<` or `add` operators.
|
80
|
+
|
81
|
+
```ruby
|
82
|
+
my_pipeline = Pyper::Pipeline.new <<
|
83
|
+
Pyper::Pipes::Cassandra::PaginationDecoding.new <<
|
84
|
+
Pyper::Pipes::Cassandra::Reader.new(:table, indexes_client) <<
|
85
|
+
Pyper::Pipes::Cassandra::PaginationEncoding.new
|
86
|
+
```
|
87
|
+
|
88
|
+
However, the `create` method makes pipeline construction easier. The above example becomes the following:
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
my_pipeline = Pyper::Pipeline.create do
|
92
|
+
add Pyper::Pipes::Cassandra::PaginationDecoding.new
|
93
|
+
add Pyper::Pipes::Cassandra::Reader.new(:table, indexes_client)
|
94
|
+
add Pyper::Pipes::Cassandra::PaginationEncoding.new
|
95
|
+
end
|
96
|
+
```
|
97
|
+
|
98
|
+
To invoke the pipeline, use the `push` method and provide the data to enter the pipeline:
|
99
|
+
|
100
|
+
```ruby
|
101
|
+
pipe_status = my_pipeline.push(:row => '1', :id => 'i')
|
102
|
+
```
|
103
|
+
|
104
|
+
Here, `pipe_status` is a `Pyper::PipeStatus` object, which contains two attributes, `pipe_status.value` and
|
105
|
+
`pipe_status.status`. The value is the returned result of the series of transformations applied by the pipeline. The status
|
106
|
+
contains metadata about the push operation that might be created by each pipe in the pipeline.
|
107
|
+
|
108
|
+
### Creating new pipes
|
109
|
+
|
110
|
+
A pipe must implement the `call` method, which takes two arguments: the object entering the pipe, as well as the status. It
|
111
|
+
should return the object leaving the pipe:
|
112
|
+
|
113
|
+
```ruby
|
114
|
+
class MyPipe
|
115
|
+
def call(attributes, status = {})
|
116
|
+
attributes[:c] = attributes[:a] + attributes[:b]
|
117
|
+
status[:processed_by_my_pipe] = true
|
118
|
+
attributes
|
119
|
+
end
|
120
|
+
end
|
121
|
+
```
|
122
|
+
|
123
|
+
This example pipe above modifies `attributes` before returning it. It also sets a flag on the status object.
|
124
|
+
|
125
|
+
Note that because the pipe need only respond to `call`, lambdas and procs are valid pipes.
|
126
|
+
|
127
|
+
Generally, pipes in a write pipeline operate on an attributes hash (containing the attributes meant to be written to a data
|
128
|
+
store). Pipes in a read pipeline initially might modify arguments. A data retrieval pipe would then use the arguments to
|
129
|
+
fetch data, and subsequent pipes would perform operations on the enumeration of data items. Thus, a read pipe might look
|
130
|
+
something like:
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
class Deserialize
|
134
|
+
def call(items, status = {})
|
135
|
+
items.map { |item| deserialize(item) }
|
136
|
+
end
|
137
|
+
|
138
|
+
def deserialize(item)
|
139
|
+
# ...
|
140
|
+
end
|
141
|
+
end
|
142
|
+
```
|
143
|
+
|
144
|
+
### Debugging Pipelines
|
145
|
+
|
146
|
+
Because pipes are expected to respond to `#call` you can simply add Procs and Lambdas in your code to debug pipelines:
|
147
|
+
|
148
|
+
```rb
|
149
|
+
pl = Pyper::Pipeline.create do
|
150
|
+
add Pyper::Pipes::Model::AttributeSerializer.new
|
151
|
+
add -> (*args) { binding.pry }
|
152
|
+
add Pyper::Pipes::Cassandra::Writer.new(:my_table, client, fields)
|
153
|
+
end
|
154
|
+
```
|
155
|
+
|
156
|
+
## Installation
|
157
|
+
|
158
|
+
Add this line to your application's Gemfile:
|
159
|
+
|
160
|
+
```ruby
|
161
|
+
gem 'pyper_rb', :git => 'git@github.com:backupify/pyper.git'
|
162
|
+
```
|
163
|
+
|
164
|
+
And then execute:
|
165
|
+
|
166
|
+
$ bundle
|
167
|
+
|
168
|
+
Or install it yourself as:
|
169
|
+
|
170
|
+
$ gem install pyper_rb
|
171
|
+
|
172
|
+
## Contributing
|
173
|
+
|
174
|
+
1. Fork it ( https://github.com/backupify/pyper/fork )
|
175
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
176
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
177
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
178
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/lib/pyper/all.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require_relative 'pipes'
|
2
|
+
|
3
|
+
module Pyper

  # The result of pushing data through a pipeline.
  # +value+ is what the last pipe returned; +status+ is the hash that was
  # handed to every pipe during the push.
  class PipeStatus < Struct.new(:value, :status); end

  # An ordered collection of pipes. Pushing an input through the pipeline feeds
  # it to the first pipe, feeds that pipe's return value to the next one, and so on.
  class Pipeline
    class << self

      # Provides an interface for creating a pipeline. The provided block will be called
      # in the context of a newly-created pipeline, to which pipes can be added using #add.
      # While the block runs, methods the pipeline does not define itself are forwarded
      # to the object that was `self` where the block was written.
      # @return [Pyper::Pipeline] The created pipeline.
      def create(&block)
        new.tap do |pipeline|
          if block_given?
            # The block is instance_eval'd below, so remember the block's own `self`
            # in order to forward unknown method calls to it (see #method_missing).
            original_self = eval('self', block.binding)
            pipeline.instance_variable_set(:@original_self, original_self)
            begin
              pipeline.instance_eval(&block)
            ensure
              # Always drop the reference, even when the block raises, so a pipeline
              # captured inside a failing block does not keep delegating.
              pipeline.remove_instance_variable(:@original_self)
            end
          end
        end
      end
    end

    attr_reader :pipes

    # @param pipes [Array<#pipe|#call>] The initial list of pipes
    def initialize(pipes = [])
      @pipes = pipes
    end

    # @param pipe [#pipe|#call] A pipe to append to the pipeline
    # @return [Pyper::Pipeline] self, so that appends can be chained
    def <<(pipe)
      pipes << pipe
      self
    end

    alias_method :add, :<<

    # Insert something into the pipeline to be processed
    # @param input [Object] The original input data to enter the pipeline. This may be mutated by each pipe in the pipeline.
    # @return [PipeStatus] the pipe status, containing both the value and a status hash.
    def push(input)
      status = {}
      value = pipes.inject(input) do |attributes, p|
        # Anything callable (procs, lambdas, objects with #call) takes precedence;
        # otherwise the pipe is expected to implement #pipe.
        if p.respond_to?(:call)
          p.call(attributes, status)
        else
          p.pipe(attributes, status)
        end
      end

      PipeStatus.new(value, status)
    end

    # Forwards unknown methods to the creator block's original `self` while
    # Pipeline.create is evaluating the block; otherwise behaves as usual.
    def method_missing(sym, *args, &block)
      @original_self ? @original_self.send(sym, *args, &block) : super
    end

    # Mirrors #method_missing so that respond_to? stays truthful.
    def respond_to_missing?(sym, include_all = false)
      @original_self ? @original_self.respond_to?(sym, include_all) : super
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
# A pipe for reading all items matching the given arguments from a cassandra table.
# Result pages are fetched one after another until the client reports the last page.
# @param [Symbol] table name
# @param [Cassava::Client] client to query cassandra with
class AllItemsReader < Struct.new(:table, :client)
  attr_reader :page_size

  # @param table [Symbol] the name of the cassandra table to fetch data from
  # @param client [Cassava::Client]
  # @param page_size [Integer] the page size passed to every execute call
  def initialize(table, client, page_size = 1000)
    # Populate the Struct members so that #table and #client (and Struct
    # equality) reflect the constructor arguments.
    super(table, client)
    @page_size = page_size
  end

  # @param arguments [Hash] Arguments passed to the cassandra client where statement
  # @option arguments [Array] :columns Columns to select; removed before building the where clause
  # @param status [Hash] The mutable status field
  # @return [Enumerator::Lazy<Hash>] enumerator of items
  def pipe(arguments, status = {})
    # Work on a copy: the caller's hash stays untouched, and the lazily-run
    # query below is not affected by later changes to it.
    where_args = arguments.dup
    columns = where_args.delete(:columns)

    rows = Enumerator.new do |yielder|
      options = { :page_size => page_size }
      paging_state = nil
      loop do
        # Only send a paging state once a previous page has provided one.
        options[:paging_state] = paging_state if paging_state && !paging_state.empty?
        result = client.select(table, columns).where(where_args).execute(options)
        result.each { |item| yielder << item }

        break if result.last_page?
        paging_state = result.paging_state
      end
    end
    rows.lazy
  end
end
|
40
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
# Deletes from a specified cassandra table.
#
# @param table_name [Symbol] The table from which to delete
# @param client [Cassava::Client] client to query cassandra with
class Deleter < Struct.new(:table_name, :client)
  # @param arguments [Hash] Should contain the primary keys to delete. Can contain a :columns key to remove specific values.
  # @param status [Hash] The mutable status field
  # @return [Hash] The original attributes
  def pipe(arguments, status = {})
    # Copy so that removing :columns does not alter the hash passed down the pipeline.
    local_args = arguments.dup
    columns = local_args.delete(:columns)

    statement = columns_given?(columns) ? client.delete(table_name, columns) : client.delete(table_name)
    statement.where(local_args).execute
    arguments
  end

  private

  # Plain-Ruby presence check: true unless +columns+ is nil, false or empty.
  def columns_given?(columns)
    return false unless columns
    !(columns.respond_to?(:empty?) && columns.empty?)
  end
end
|
19
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
# Zlib provides the CRC32 checksum used by ModKey#mod.
require 'zlib'

# Adds the :mod_key field to the output attributes, which is based on the hash of
# a particular field in the input attributes.
# @example
#   If the pipe is configured with an id field of :id, then the input
#     { id: 'abc' }
#   would result in an output of
#     { id: 'abc', mod_key: 22 }
#   Here the value 22 is within the range [0,mod_size - 1] and is uniquely
#   determined by id.
class ModKey
  attr_reader :mod_size, :id_field

  # @param mod_size [Integer] mod keys will fall within the range [0,mod_size - 1]
  # @param id_field [Symbol] the attribute to use when generating the mod key.
  def initialize(mod_size = 100, id_field = :id)
    @mod_size = mod_size
    @id_field = id_field
  end

  # @param attributes [Hash] An attribute hash; it is updated in place
  # @param status [Hash] The mutable status field (not used by this pipe)
  # @return [Hash] The attribute hash with the mod_key field added
  def pipe(attributes, status = {})
    attributes.merge!(:mod_key => mod(attributes[id_field]))
  end

  # @param value [#to_s] the value to derive the mod key from
  # @return [Integer] the CRC32 checksum of the value's string form, modulo mod_size
  def mod(value)
    # to_s lets non-String ids (integers, symbols, ...) be hashed as well;
    # it is a no-op for strings.
    Zlib::crc32(value.to_s) % mod_size
  end
end
|
32
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
|
3
|
+
# This pipe is for reading data from sharded rows in Cassandra. The table must have rows sharded by the 'mod_key' field.
# For a fixed number of such shards, this pipe reads all data from all of those shards, returning a lazy enumerator
# over all of those rows.
# For example, if mod_size is 100, it will read the 100 rows with mod_key between 0 and 99.
class ModKeyReader
  attr_reader :table, :client, :mod_size, :page_size

  # @param table [Symbol] the name of the cassandra table to fetch data from
  # @param client [Cassava::Client]
  # @param mod_size [Integer] the mod size
  # @param page_size [Integer] the page size
  def initialize(table, client, mod_size = 100, page_size = 1000)
    @table = table
    @client = client
    @mod_size = mod_size
    @page_size = page_size
  end

  # @param arguments [Hash] Arguments passed to the cassandra client where statement
  # @param status [Hash] The mutable status field
  # @return [Enumerator::Lazy<Hash>] enumerator of items from all rows
  def pipe(arguments, status = {})
    # Snapshot the arguments now: the queries only run when the enumerator is
    # consumed, and should not pick up changes made to the hash in the meantime.
    base_args = arguments.dup

    rows = Enumerator.new do |yielder|
      (0...mod_size).each do |mod_id|
        # Paging starts from scratch for every shard.
        options = { :page_size => page_size }
        paging_state = nil
        loop do
          options[:paging_state] = paging_state if paging_state && !paging_state.empty?
          result = client.select(table).where(base_args.merge(:mod_key => mod_id)).execute(options)
          result.each { |item| yielder << item }

          break if result.last_page?
          paging_state = result.paging_state
        end
      end
    end
    rows.lazy
  end
end
|
41
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'base64'
|
2
|
+
|
3
|
+
module Pyper::Pipes::Cassandra
|
4
|
+
# This pipe extracts an encoded paging_state, decodes it, and passes on a decoded
# paging state.
# This pipe is intended to be used before the Cassandra::Reader pipe, as that pipe
# can interpret the :paging_state argument.
#
# This pipe pairs with the PaginationEncoding pipe, which performs the reverse
# transformation
class PaginationDecoding

  # @param args [Hash] Arguments that may include a URL-safe Base64 encoded :paging_state
  # @param status [Hash] The mutable status field
  # @return [Hash] The list of arguments with :paging_state decoded, if present
  def pipe(args, status = {})
    encoded = args[:paging_state]
    return args unless encoded

    # Return a new hash instead of overwriting the caller's token in place;
    # otherwise pushing the same arguments twice would decode the token twice.
    args.merge(:paging_state => Base64.urlsafe_decode64(encoded))
  end
end
|
22
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'base64'
|
2
|
+
|
3
|
+
module Pyper::Pipes::Cassandra
|
4
|
+
# Given a :paging_state in the status field, encodes it. This is the reverse transformation of
# the PaginationDecoding pipe.
class PaginationEncoding

  # @param items [Enumerable<Hash>]
  # @param status [Hash] The mutable status field; its :paging_state entry, if any,
  #   is replaced by the URL-safe Base64 encoding of its value
  # @return [Enumerable<Hash>] The unchanged list of items
  def pipe(items, status = {})
    page_state = status[:paging_state]
    status[:paging_state] = Base64.urlsafe_encode64(page_state) if page_state
    items
  end
end
|
17
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
# A pipe for reading items from a single row in cassandra
# @param [Symbol] table name
# @param [Cassava::Client] client to query cassandra with
# @param [Hash] Additional/default options to pass to the Cassava execute statement.
class Reader < Struct.new(:table, :client, :options)
  # @param arguments [Hash] Arguments passed to the cassandra client where statement
  # @option arguments [Integer] :limit
  # @option arguments [Array] :order A pair [clustering_column, :desc|:asc] determining how to order the results.
  # @option arguments [Object] :paging_state
  # @option arguments [Integer] :page_size
  # @option arguments [Array] :columns Columns to select
  # @param status [Hash] The mutable status field; receives :paging_state and :last_page from the result
  # @return [Enumerator::Lazy<Hash>] enumerator of items
  def pipe(arguments, status = {})
    # Copy so the control keys can be stripped without altering the caller's hash.
    where_args = arguments.dup
    limit = where_args.delete(:limit)
    page_size = where_args.delete(:page_size)
    paging_state = where_args.delete(:paging_state)
    order = where_args.delete(:order)
    columns = where_args.delete(:columns)

    # Per-call values win, but a missing value must not wipe out a default
    # supplied through the options given at construction time.
    defaults = options || {}
    opts = defaults.merge(
      :page_size => page_size.nil? ? defaults[:page_size] : page_size,
      :paging_state => paging_state.nil? ? defaults[:paging_state] : paging_state
    )

    query = client.select(table, columns).where(where_args)
    query = query.limit(limit) if limit
    query = query.order(order.first, order.last) if order

    result = query.execute(opts)

    status[:paging_state] = result.paging_state
    status[:last_page] = result.last_page?

    result.rows.lazy
  end
end
|
35
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Pyper::Pipes::Cassandra
|
2
|
+
# Writes a set of attributes to a specified cassandra table.
#
# @param table_name [Symbol] The table in which to store the attributes
# @param client [Cassava::Client] client to query cassandra with
# @param attribute_filter_set [Set] Optionally, a set of attributes which should be written. If none is provided,
#   all attributes will be written.
class Writer < Struct.new(:table_name, :client, :attribute_filter_set)

  # Inserts the attributes (restricted to the filter set, when one was given)
  # through the client and hands the untouched attributes to the next pipe.
  # @param attributes [Hash] Attributes to store in cassandra
  # @param status [Hash] The mutable status field
  # @return [Hash] The original attributes
  def pipe(attributes, status = {})
    filter = attribute_filter_set
    payload = filter ? attributes.select { |key, _value| filter.member?(key) } : attributes

    client.insert(table_name, payload)
    attributes
  end
end
|
24
|
+
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
require_relative 'cassandra/reader'
|
2
|
+
require_relative 'cassandra/writer'
|
3
|
+
require_relative 'cassandra/mod_key'
|
4
|
+
require_relative 'cassandra/mod_key_reader'
|
5
|
+
require_relative 'cassandra/all_items_reader'
|
6
|
+
require_relative 'cassandra/deleter'
|
7
|
+
require_relative 'cassandra/pagination_decoding'
|
8
|
+
require_relative 'cassandra/pagination_encoding'
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Pyper::Pipes::Content
|
2
|
+
# A pipe that reads stored content for each item and adds it to the item
# under a configurable field.
class Fetch

  attr_reader :storage_field, :storage_strategy_builder

  # @param storage_field [Symbol] For each item hash, the field in which to insert the content
  # @param storage_strategy_builder [Block] A block that takes an item and returns a StorageStrategy.
  def initialize(storage_field, &storage_strategy_builder)
    @storage_field = storage_field
    @storage_strategy_builder = storage_strategy_builder
  end

  # @param items [Enumerable<Hash>] A list of items
  # @param status [Hash] The mutable status field
  # @return [Enumerable<Hash>] The items, with the retrieved content inserted in the storage field
  def pipe(items, status = {})
    items.map { |item| item.merge(storage_field => read_content(item)) }
  end

  private

  # Builds the strategy for the item and reads from it. Content that cannot be
  # found (Errno::ENOENT or StorageStrategy::NotFound) is reported as nil.
  def read_content(item)
    storage_strategy_builder.call(item).read
  rescue Errno::ENOENT, StorageStrategy::NotFound
    nil
  end
end
|
30
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Pyper::Pipes::Content
|
2
|
+
# A pipe for storing content to an object store. Uses the StorageStrategy gem.
class Store

  attr_reader :storage_field, :storage_strategy_builder

  # @param storage_field [Symbol] The attributes field in which the content is located.
  # @param storage_strategy_builder [Block] A block that takes an item and returns a StorageStrategy.
  def initialize(storage_field, &storage_strategy_builder)
    @storage_field = storage_field
    @storage_strategy_builder = storage_strategy_builder
  end

  # Stores content using the specified storage strategy
  # @param attributes [Hash] The attributes of the item for which content is to be stored
  # @param status [Hash] The mutable status field
  # @return [Hash] The item attributes, with the storage_field deleted and the
  #   strategy's metadata merged in.
  # @raise [ArgumentError] if the storage field holds no content
  def pipe(attributes, status = {})
    # The builder sees the attributes before the content is removed from them.
    strategy = storage_strategy_builder.call(attributes)

    content = attributes.delete(storage_field)

    raise ArgumentError, "#{storage_field} must be present in ContentStorage" unless content

    # Strings are written directly; anything else is handed to write_from.
    if content.is_a?(String)
      strategy.write(content)
    else
      strategy.write_from(content)
    end

    attributes.merge!(strategy.metadata)

    attributes
  end
end
|
36
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Pyper::Pipes
|
2
|
+
# A pipe that fills in missing attributes from a fixed set of defaults.
# @param default_values [Hash] A hash of default values to set within the provided attrs if they are not already present.
class DefaultValues < Struct.new(:default_values)

  # @param attrs [Hash] The attributes of the item; updated in place
  # @param status [Hash] The mutable status field
  # @return [Hash] The item attributes with default values inserted
  def pipe(attrs, status = {})
    default_values.each do |field, value|
      # Only a missing (nil) value counts as absent; an explicit false is kept.
      attrs[field] = value if attrs[field].nil?
    end
    attrs
  end
end
|
15
|
+
end
|