dataflow-rb 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +7 -0
  2. data/.env.test.example +6 -0
  3. data/.gitignore +14 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +4 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE +21 -0
  8. data/README.md +46 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +7 -0
  12. data/dataflow-rb.gemspec +42 -0
  13. data/lib/config/mongoid.yml +21 -0
  14. data/lib/dataflow/adapters/csv_adapter.rb +123 -0
  15. data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
  16. data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
  17. data/lib/dataflow/adapters/psql_adapter.rb +21 -0
  18. data/lib/dataflow/adapters/settings.rb +33 -0
  19. data/lib/dataflow/adapters/sql_adapter.rb +322 -0
  20. data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
  21. data/lib/dataflow/errors/not_implemented_error.rb +7 -0
  22. data/lib/dataflow/event_mixin.rb +77 -0
  23. data/lib/dataflow/extensions/mongo_driver.rb +21 -0
  24. data/lib/dataflow/extensions/msgpack.rb +19 -0
  25. data/lib/dataflow/logger.rb +27 -0
  26. data/lib/dataflow/node.rb +37 -0
  27. data/lib/dataflow/nodes/compute_node.rb +495 -0
  28. data/lib/dataflow/nodes/data_node.rb +331 -0
  29. data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
  30. data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
  31. data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
  32. data/lib/dataflow/nodes/filter/where_node.rb +44 -0
  33. data/lib/dataflow/nodes/join_node.rb +151 -0
  34. data/lib/dataflow/nodes/map_node.rb +50 -0
  35. data/lib/dataflow/nodes/merge_node.rb +33 -0
  36. data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
  37. data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
  38. data/lib/dataflow/nodes/select_keys_node.rb +39 -0
  39. data/lib/dataflow/nodes/snapshot_node.rb +77 -0
  40. data/lib/dataflow/nodes/sql_query_node.rb +50 -0
  41. data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
  42. data/lib/dataflow/nodes/upsert_node.rb +68 -0
  43. data/lib/dataflow/properties_mixin.rb +35 -0
  44. data/lib/dataflow/schema_mixin.rb +134 -0
  45. data/lib/dataflow/version.rb +4 -0
  46. data/lib/dataflow-rb.rb +72 -0
  47. metadata +371 -0
data/lib/dataflow/adapters/settings.rb
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    class Settings
+      attr_accessor :connection_uri, :db_name, :indexes, :adapter_type,
+                    :dataset_name, :read_dataset_name, :write_dataset_name, :schema
+
+      def initialize(data_node: nil, connection_uri: nil, db_name: nil,
+                     dataset_name: nil, indexes: nil, adapter_type: nil, schema: nil)
+        @connection_uri = connection_uri
+
+        # first, try to set the options based on the data node's settings
+        if data_node.present?
+          @db_name = data_node.db_name
+          @dataset_name = data_node.name
+          @read_dataset_name = data_node.read_dataset_name
+          @write_dataset_name = data_node.write_dataset_name
+          @indexes = data_node.indexes
+          @schema = data_node.schema
+        end
+
+        # override if needed
+        @db_name ||= db_name
+        @dataset_name ||= dataset_name
+        @read_dataset_name ||= dataset_name
+        @write_dataset_name ||= dataset_name
+        @indexes ||= indexes
+        @adapter_type ||= adapter_type
+        @schema ||= schema
+      end
+    end
+  end
+end
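
Note: Settings simply normalizes connection options, preferring the data node's values and falling back to the keyword arguments. A minimal usage sketch (the database and dataset names below are illustrative, not from the gem):

    settings = Dataflow::Adapters::Settings.new(
      adapter_type: 'postgresql',
      db_name: 'dataflow_example',  # hypothetical database name
      dataset_name: 'users'         # hypothetical dataset name
    )
    settings.read_dataset_name # => 'users' (defaults to dataset_name)
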
data/lib/dataflow/adapters/sql_adapter.rb
@@ -0,0 +1,322 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    # Interface between a data node and a SQL database.
+    # We use Sequel to perform all the store/retrieve operations.
+    class SqlAdapter
+      class << self
+        # Get (or create) a client that satisfies the given connection settings.
+        # @param settings [Dataflow::Adapters::Settings] Represents the connection settings to the DB.
+        # @param db_name [String] The database name to which the client will connect.
+        # @return [Sequel::Database] a sequel database object.
+        def client(settings, db_name: nil)
+          @clients ||= {}
+
+          case settings.adapter_type
+          when 'mysql2'
+            host = ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
+            port = ENV['MOJACO_MYSQL_PORT'] || '3306'
+            user = ENV['MOJACO_MYSQL_USER']
+            password = ENV['MOJACO_MYSQL_PASSWORD']
+          when 'postgresql'
+            host = ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
+            port = ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
+            user = ENV['MOJACO_POSTGRESQL_USER'] || 'eurico'
+            password = ENV['MOJACO_POSTGRESQL_PASSWORD'] || 'eurico'
+          end
+
+          db_name ||= settings.db_name
+          user_password = user
+          user_password += ":#{password}" if password.present?
+
+          uri = "#{settings.adapter_type}://#{user_password}@#{host}:#{port}"
+          connection_uri = settings.connection_uri || "#{uri}/#{db_name}"
+
+          return @clients[connection_uri] if @clients[connection_uri].present?
+
+          # first, make sure the DB is created (if it is not an external db)
+          is_external_db = settings.connection_uri.present?
+          try_create_db(uri, db_name, user, password) unless is_external_db
+
+          # then, create the connection object
+          @clients[connection_uri] ||= Sequel.connect("#{connection_uri}?encoding=utf8")
+        end
+
+        # Used internally to try to create the DB automatically.
+        # @param uri [String] the connection uri to the DB.
+        # @param db_name [String] the database name.
+        # @return [Boolean] whether the db was created or not.
+        def try_create_db(uri, db_name, user, password)
+          Sequel.connect(uri, user: user, password: password) do |db|
+            db.run("CREATE DATABASE #{db_name}")
+            true
+          end
+        rescue Sequel::DatabaseError
+          # ignore the error (e.g. the database already exists)
+          false
+        end
+
+        # Force the clients to disconnect their connections.
+        # Use before forking.
+        def disconnect_clients
+          @clients ||= {}
+          @clients.values.each(&:disconnect)
+        end
+      end
+
+      SYSTEM_ID = :_id
+
+      attr_reader :settings
+      attr_reader :client
+
+      def initialize(args)
+        update_settings(args)
+        @client = SqlAdapter.client(settings)
+        @schema = settings.schema || [] # TODO: detect if the table schema has a mismatch
+      end
+
+      def update_settings(args)
+        @settings = Dataflow::Adapters::Settings.new(args)
+      end
+
+      def set_schema(schema)
+        @schema = schema
+      end
+
+      # retrieve a single element from a data node
+      def find(where: {}, fields: [], sort: {}, offset: 0)
+        all(where: where, fields: fields, sort: sort, offset: offset, limit: 1).first
+      end
+
+      # retrieve all elements from a data node
+      def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+        res = client[settings.read_dataset_name.to_sym]
+
+        # if there are no fields given, automatically
+        # select all the fields except the system _id
+        fields = res.columns.reject { |x| x == SYSTEM_ID } if fields.blank?
+
+        res = res.select(*fields.map(&:to_sym)) if fields.present?
+        res = apply_query(res, where)
+
+        (sort || {}).each do |k, v|
+          sort_value = v == 1 ? k.to_sym : Sequel.desc(k.to_sym)
+          res = res.order(sort_value)
+        end
+
+        res = res.offset(offset) if offset > 0
+        res = res.limit(limit) if limit > 0
+
+        if block_given?
+          yield res
+        else
+          res.to_a
+        end
+      end
+
+      # Create queries that permit processing the whole dataset in parallel without using offsets.
+      def ordered_system_id_queries(batch_size:)
+        ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
+        queries_count = (ids.size / batch_size.to_f).ceil
+        Array.new(queries_count) do |i|
+          from = ids[i * batch_size]
+          to = ids[(i + 1) * batch_size] || ids[-1]
+          is_last = i == queries_count - 1
+
+          where_query = { SYSTEM_ID => { '>=' => from } }
+          operator = is_last ? '<=' : '<'
+          where_query[SYSTEM_ID][operator] = to
+
+          where_query
+        end
+      end
+
+      # count the number of records
+      def count(where: {})
+        res = client[settings.read_dataset_name.to_sym]
+        res = apply_query(res, where)
+        res.count
+      rescue Sequel::DatabaseError
+        0
+      end
+
+      # Save the given records.
+      # TODO: support a :replace_by parameter
+      def save(records:)
+        dataset = client[settings.write_dataset_name.to_sym]
+        columns = dataset.columns.reject { |x| x == SYSTEM_ID }
+
+        tabular_data = records.map do |record|
+          columns.map { |col| record[col] }
+        end
+
+        dataset.insert_ignore.import(columns, tabular_data)
+      end
+
+      # Delete records that match the options.
+      # @param where query to apply on the delete operation.
+      # @note this deletes on the read dataset,
+      #   i.e. changes are seen immediately in the case of double-buffered datasets
+      def delete(where: {})
+        res = client[settings.read_dataset_name.to_sym]
+        res = apply_query(res, where)
+        res.delete
+      end
+
+      # recreate the table/collection
+      def recreate_dataset(dataset: nil)
+        dataset ||= settings.write_dataset_name.to_sym
+        client.drop_table?(dataset)
+
+        unless @schema.present?
+          p 'WARNING: recreate dataset aborted: no schema'
+          return
+        end
+
+        create_table(dataset, @schema)
+      end
+
+      # Create the indexes on this dataset.
+      # @param dataset [String] Specify on which dataset the operation will be performed.
+      #   Default: the adapter's settings' dataset.
+      # @param type [Symbol] select which index types to create.
+      #   Can be :all (default), :unique_only, :non_unique_only.
+      # TODO: add support for a :drop_retry_on_error parameter.
+      def create_indexes(dataset: nil, type: :all)
+        dataset ||= settings.write_dataset_name
+        dataset = dataset.to_sym
+        indexes = (settings.indexes || [])
+
+        case type
+        when :unique_only
+          indexes = indexes.select { |idx| idx['unique'] }
+        when :non_unique_only
+          indexes = indexes.reject { |idx| idx['unique'] }
+        end
+
+        indexes.each do |index|
+          params = index_parameters(index)
+
+          begin
+            client.add_index(dataset, *params)
+          rescue Sequel::DatabaseError => e
+            # ignore "index already exists" errors
+            raise e unless e.wrapped_exception.is_a?(PG::DuplicateTable)
+          end
+        end
+      end
+
+      def usage(dataset:)
+        indexes = retrieve_collection_indexes(dataset)
+        table_usage = fetch_table_usage(dataset: dataset)
+        table_usage.merge(effective_indexes: indexes)
+      end
+
+      private
+
+      MAX_INT = 2_147_483_647
+      MAX_VARCHAR = 255
+
+      def create_table(dataset, schema)
+        client.create_table(dataset.to_sym) do
+          # always add an _id field to be used internally
+          primary_key SYSTEM_ID
+
+          schema.each do |column, info|
+            type = info[:type]
+            max_size = info[:max] || info.dig(:types, type, :max)
+
+            case type
+            when 'object', 'string'
+              max_size ||= info.dig(:types, 'string', :max) || MAX_VARCHAR + 1
+              col_type = if max_size <= MAX_VARCHAR
+                           "varchar(#{max_size})"
+                         else
+                           'text'
+                         end
+            when 'time'
+              col_type = 'timestamp'
+            when 'integer'
+              max_size ||= MAX_INT + 1
+              col_type = if max_size <= MAX_INT
+                           'integer'
+                         else
+                           'bigint'
+                         end
+            when 'numeric'
+              col_type = 'real'
+            when 'array', 'hash'
+              p "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
+              col_type = 'json'
+            else
+              p "Error: unexpected type '#{type}'. Keeping as-is."
+              col_type = type
+            end
+
+            # create a column with the given type
+            p "#{column} #{type} -> #{col_type}"
+            column(column.to_sym, col_type)
+          end
+        end
+      end
+
+      def apply_query(res, opts)
+        queries = transform_to_query(opts)
+        queries.each do |query_args|
+          res = res.where(*query_args)
+        end
+        res
+      end
+
+      def transform_to_query(opts)
+        # map to a series of AND-ed clauses
+        opts.flat_map do |k, v|
+          if v.is_a? Hash
+            v.map do |operator, value|
+              case operator
+              when '!='
+                if value.is_a? Array
+                  ["#{k} NOT IN ?", value]
+                else
+                  ["#{k} <> ?", value]
+                end
+              when '<'
+                ["#{k} < ?", value]
+              when '<='
+                ["#{k} <= ?", value]
+              when '>'
+                ["#{k} > ?", value]
+              when '>='
+                ["#{k} >= ?", value]
+              end
+            end
+          else
+            # e.g. simple matches { 'id' => 1 } or IN clauses { 'id' => [1, 2] }
+            # are supported with simple hashes
+            [[{ k.to_sym => v }]]
+          end
+        end
+      end
+
+      # Required index format for sequel:
+      # add_index(dataset, [:keys], unique: true)
+      def index_parameters(index)
+        index = index.with_indifferent_access
+        keys = Array(index[:key]).map(&:to_sym)
+        params = [keys]
+        params << { unique: true } if index[:unique]
+        params
+      end
+
+      def retrieve_collection_indexes(collection)
+        psql_indexes = client.indexes(collection)
+        psql_indexes.values.map do |idx|
+          cols = idx[:columns].map(&:to_s)
+          index = { 'key' => cols }
+          index['unique'] = true if idx[:unique]
+          index
+        end.compact
+      end
+    end
+  end
+end
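
Note: the where-clause format accepted by find/all/count/delete is the one handled by transform_to_query above: operator hashes are AND-ed together, while plain values and arrays map to equality and IN clauses. A minimal sketch, assuming a reachable PostgreSQL instance and an existing table (all names and values below are illustrative):

    adapter = Dataflow::Adapters::SqlAdapter.new(
      adapter_type: 'postgresql',
      db_name: 'dataflow_example',  # hypothetical database
      dataset_name: 'users'         # hypothetical table
    )
    # age >= 18 AND age < 65, sorted by id, first 10 rows
    adapter.all(where: { 'age' => { '>=' => 18, '<' => 65 } },
                fields: %w(id name), sort: { 'id' => 1 }, limit: 10)
    adapter.count(where: { 'id' => [1, 2, 3] }) # IN clause via a simple hash
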
data/lib/dataflow/errors/invalid_configuration_error.rb
@@ -0,0 +1,7 @@
+# frozen_string_literal: true
+module Dataflow
+  module Errors
+    class InvalidConfigurationError < StandardError
+    end
+  end
+end
data/lib/dataflow/errors/not_implemented_error.rb
@@ -0,0 +1,7 @@
+# frozen_string_literal: true
+module Dataflow
+  module Errors
+    class NotImplementedError < StandardError
+    end
+  end
+end
data/lib/dataflow/event_mixin.rb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+module Dataflow
+  module EventMixin
+    extend ActiveSupport::Concern
+
+    module ClassMethods
+      def event(event_name)
+        # re-open the base class
+        handlers_var_name = "@#{event_name}_handlers"
+
+        # Defines a class method called "event_name".
+        # It will serve as a class-level (global) event handler for this class.
+        # @yield (optional) the event handler to add
+        # @return Array the list of event handlers
+        define_singleton_method(event_name) do |&block|
+          handlers = instance_variable_get(handlers_var_name)
+
+          unless handlers
+            handlers = []
+            instance_variable_set(handlers_var_name, [])
+          end
+
+          if block.present?
+            handlers << block
+            instance_variable_set(handlers_var_name, handlers)
+          end
+
+          # return all the handlers from the class hierarchy
+          superclass_handlers = []
+          superclass = self.superclass
+          while superclass
+            superclass_handlers += superclass.instance_variable_get(
+              :"@#{event_name}_handlers"
+            ) || []
+            superclass = superclass.superclass
+          end
+
+          handlers + superclass_handlers
+        end
+
+        # Defines an instance method called "event_name".
+        # It will serve as an instance-level event handler.
+        # @yield (optional) the event handler to add
+        # @return Array the list of event handlers
+        define_method(event_name) do |&block|
+          handlers = instance_variable_get(handlers_var_name)
+
+          unless handlers
+            handlers = []
+            instance_variable_set(handlers_var_name, [])
+          end
+
+          if block.present?
+            handlers << block
+            instance_variable_set(handlers_var_name, handlers)
+          end
+
+          handlers
+        end
+
+        # Defines a way to fire the event: "on_event_name(evt)"
+        # @param *args a variable list of arguments passed to the handlers
+        define_method("on_#{event_name}") do |*args|
+          handlers = send(event_name) + self.class.send(event_name)
+          handlers.each do |handler|
+            begin
+              handler.call(self, *args)
+            rescue StandardError => e
+              @logger&.log("ERROR IN HANDLER [on_#{event_name}]: #{e}")
+              # ignore errors raised in handlers
+            end
+          end
+        end
+      end
+    end
+  end
+end
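
Note: the DSL above generates three methods per declared event: a class-level registrar, an instance-level registrar, and an on_* trigger that calls every handler with the receiver plus the trigger's arguments. A minimal sketch (the Job class and the :completed event are hypothetical):

    class Job
      include Dataflow::EventMixin
      event :completed
    end

    Job.completed { |job| puts "class-level handler for #{job}" }
    job = Job.new
    job.completed { |j, duration| puts "took #{duration}s" }
    job.on_completed(42) # fires the instance handler, then the class-level one
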
data/lib/dataflow/extensions/mongo_driver.rb
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+module Mongo
+  class Collection
+    class View
+      attr_reader :cursor
+
+      def initial_query
+        @cursor = nil
+        result = nil
+
+        read_with_retry do
+          server = read.select_server(cluster, false)
+          result = send_initial_query(server)
+          @cursor = Cursor.new(view, result, server)
+        end
+
+        result
+      end
+    end
+  end
+end
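
Note: this re-opens the driver's Collection::View to expose the cursor created by the first query, so callers can read the initial batch and keep iterating from the same cursor. A hedged sketch (the client and collection are illustrative, and since this patches mongo driver internals it is version-sensitive):

    view = client[:users].find({}, batch_size: 1_000)
    first_batch = view.initial_query # runs the query once
    cursor = view.cursor             # now readable via the attr_reader above
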
data/lib/dataflow/extensions/msgpack.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+Time.class_eval do
+  def to_msgpack(out = '')
+    iso8601.to_msgpack(out)
+  end
+end
+
+DateTime.class_eval do
+  def to_msgpack(out = '')
+    iso8601.to_msgpack(out)
+  end
+end
+
+Date.class_eval do
+  def to_msgpack(out = '')
+    iso8601.to_msgpack(out)
+  end
+end
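
Note: these monkey-patches let temporal values serialize through MessagePack as ISO 8601 strings instead of raising; they come back as plain strings on unpack, not as Time objects. A quick illustrative check (assumes the msgpack gem and stdlib time are loaded):

    require 'msgpack'
    require 'time'
    packed = Time.utc(2017, 1, 2, 3, 4, 5).to_msgpack
    MessagePack.unpack(packed) # => "2017-01-02T03:04:05Z"
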
data/lib/dataflow/logger.rb
@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+module Dataflow
+  class Logger
+    attr_accessor :prefix
+    attr_accessor :use_notifications
+
+    def initialize(prefix:, use_notifications: false)
+      @prefix = prefix
+      @use_notifications = use_notifications
+      @@impl = LoggerImpl.new
+    end
+
+    def log(str)
+      return if ENV['RACK_ENV'] == 'test'
+      now = DateTime.now.strftime('%y-%m-%d %H:%M:%S')
+      message = "[#{now}] #{prefix} :: #{str}"
+      logger_impl = @@impl
+      logger_impl.log(message)
+    end
+
+    class LoggerImpl
+      def log(message)
+        puts message
+      end
+    end
+  end
+end
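
Note: a minimal usage sketch (the prefix and message are illustrative); output is suppressed entirely when RACK_ENV=test:

    logger = Dataflow::Logger.new(prefix: 'Dataflow')
    logger.log('computing node users...')
    # prints e.g.: [17-04-01 12:00:00] Dataflow :: computing node users...
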
data/lib/dataflow/node.rb
@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+module Dataflow
+  # Defines a (default) common interface for nodes.
+  # These methods may be overridden with node-specific implementations.
+  module Node
+    # Returns either a DataNode or a ComputeNode that matches the given id.
+    def self.find(id)
+      begin
+        return Dataflow::Nodes::DataNode.find(id)
+      rescue Mongoid::Errors::DocumentNotFound
+        # try again against a compute node
+      end
+
+      Dataflow::Nodes::ComputeNode.find(id)
+    end
+
+    def updated?
+      true
+    end
+
+    def recompute(*args)
+      # Interface only, for recursion purposes
+    end
+
+    # Overridden in compute nodes
+    def valid_for_computation?
+      true
+    end
+
+    def validate!
+      # raise if the normal model validations do not pass.
+      valid = valid_for_computation?
+      raise Dataflow::Errors::InvalidConfigurationError, errors.messages unless valid
+      true
+    end
+  end
+end
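
Note: Node.find is the polymorphic entry point: it tries DataNode first and falls back to ComputeNode. A minimal sketch (the id is illustrative):

    # returns a DataNode if one matches, otherwise a ComputeNode
    node = Dataflow::Node.find('58c0e1a5b352dca47a000001')
    node.validate! # raises InvalidConfigurationError unless valid_for_computation?
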