dataflow-rb 0.9.0

Files changed (47)
  1. checksums.yaml +7 -0
  2. data/.env.test.example +6 -0
  3. data/.gitignore +14 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +4 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE +21 -0
  8. data/README.md +46 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +7 -0
  12. data/dataflow-rb.gemspec +42 -0
  13. data/lib/config/mongoid.yml +21 -0
  14. data/lib/dataflow/adapters/csv_adapter.rb +123 -0
  15. data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
  16. data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
  17. data/lib/dataflow/adapters/psql_adapter.rb +21 -0
  18. data/lib/dataflow/adapters/settings.rb +33 -0
  19. data/lib/dataflow/adapters/sql_adapter.rb +322 -0
  20. data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
  21. data/lib/dataflow/errors/not_implemented_error.rb +7 -0
  22. data/lib/dataflow/event_mixin.rb +77 -0
  23. data/lib/dataflow/extensions/mongo_driver.rb +21 -0
  24. data/lib/dataflow/extensions/msgpack.rb +19 -0
  25. data/lib/dataflow/logger.rb +27 -0
  26. data/lib/dataflow/node.rb +37 -0
  27. data/lib/dataflow/nodes/compute_node.rb +495 -0
  28. data/lib/dataflow/nodes/data_node.rb +331 -0
  29. data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
  30. data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
  31. data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
  32. data/lib/dataflow/nodes/filter/where_node.rb +44 -0
  33. data/lib/dataflow/nodes/join_node.rb +151 -0
  34. data/lib/dataflow/nodes/map_node.rb +50 -0
  35. data/lib/dataflow/nodes/merge_node.rb +33 -0
  36. data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
  37. data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
  38. data/lib/dataflow/nodes/select_keys_node.rb +39 -0
  39. data/lib/dataflow/nodes/snapshot_node.rb +77 -0
  40. data/lib/dataflow/nodes/sql_query_node.rb +50 -0
  41. data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
  42. data/lib/dataflow/nodes/upsert_node.rb +68 -0
  43. data/lib/dataflow/properties_mixin.rb +35 -0
  44. data/lib/dataflow/schema_mixin.rb +134 -0
  45. data/lib/dataflow/version.rb +4 -0
  46. data/lib/dataflow-rb.rb +72 -0
  47. metadata +371 -0
@@ -0,0 +1,33 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Adapters
+     class Settings
+       attr_accessor :connection_uri, :db_name, :indexes, :adapter_type,
+                     :dataset_name, :read_dataset_name, :write_dataset_name, :schema
+
+       def initialize(data_node: nil, connection_uri: nil, db_name: nil,
+                      dataset_name: nil, indexes: nil, adapter_type: nil, schema: nil)
+         @connection_uri = connection_uri
+
+         # first, try to set the options based on the data node's settings
+         if data_node.present?
+           @db_name = data_node.db_name
+           @dataset_name = data_node.name
+           @read_dataset_name = data_node.read_dataset_name
+           @write_dataset_name = data_node.write_dataset_name
+           @indexes = data_node.indexes
+           @schema = data_node.schema
+         end
+
+         # override if needed
+         @db_name ||= db_name
+         @dataset_name ||= dataset_name
+         @read_dataset_name ||= dataset_name
+         @write_dataset_name ||= dataset_name
+         @indexes ||= indexes
+         @adapter_type ||= adapter_type
+         @schema ||= schema
+       end
+     end
+   end
+ end
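
For context, a minimal sketch of constructing these settings directly, without a data node (the values below are hypothetical, not taken from the gem):

    settings = Dataflow::Adapters::Settings.new(
      db_name: 'dataflow_test',        # hypothetical database name
      dataset_name: 'users',
      adapter_type: 'postgresql',
      indexes: [{ 'key' => ['id'], 'unique' => true }]
    )
    settings.read_dataset_name   # => 'users' (falls back to dataset_name)
    settings.write_dataset_name  # => 'users'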
@@ -0,0 +1,322 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Adapters
+     # Interface between a data node and a SQL database.
+     # We use Sequel to perform all the store/retrieve operations.
+     class SqlAdapter
+       class << self
+         # Get (or create) a client that satisfies the given connection settings.
+         # @param settings [Hash] Represents the connection settings to the DB.
+         # @param db_name [String] The database name to which the client will connect.
+         # @return [Sequel::Database] a sequel database object.
+         def client(settings, db_name: nil)
+           @clients ||= {}
+
+           case settings.adapter_type
+           when 'mysql2'
+             host = ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
+             port = ENV['MOJACO_MYSQL_PORT'] || '3306'
+             user = ENV['MOJACO_MYSQL_USER']
+             password = ENV['MOJACO_MYSQL_PASSWORD']
+           when 'postgresql'
+             host = ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
+             port = ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
+             user = ENV['MOJACO_POSTGRESQL_USER'] || 'eurico'
+             password = ENV['MOJACO_POSTGRESQL_PASSWORD'] || 'eurico'
+           end
+
+           db_name ||= settings.db_name
+           user_password = user
+           user_password += ":#{password}" if password.present?
+
+           uri = "#{settings.adapter_type}://#{user_password}@#{host}:#{port}"
+           connection_uri = settings.connection_uri || "#{uri}/#{db_name}"
+
+           return @clients[connection_uri] if @clients[connection_uri].present?
+
+           # first, make sure the DB is created (if it is not an external db)
+           is_external_db = settings.connection_uri.present?
+           try_create_db(uri, db_name, user, password) unless is_external_db
+
+           # then, create the connection object
+           @clients[connection_uri] ||= Sequel.connect("#{connection_uri}?encoding=utf8")
+         end
+
+         # Used internally to try to create the DB automatically.
+         # @param uri [String] the connection uri to the DB.
+         # @param db_name [String] the database name.
+         # @param user [String] the database user.
+         # @param password [String] the database password.
+         # @return [Boolean] whether the db was created or not.
+         def try_create_db(uri, db_name, user, password)
+           Sequel.connect(uri, user: user, password: password) do |db|
+             db.run("CREATE DATABASE #{db_name}")
+             true
+           end
+         rescue Sequel::DatabaseError
+           # ignore the error (e.g. the database already exists)
+           false
+         end
+
+         # Force the clients to disconnect their connections.
+         # Use before forking.
+         def disconnect_clients
+           @clients ||= {}
+           @clients.values.each(&:disconnect)
+         end
+       end
+
+       SYSTEM_ID = :_id
+
+       attr_reader :settings
+       attr_reader :client
+
+       def initialize(args)
+         update_settings(args)
+         @client = SqlAdapter.client(settings)
+         @schema = settings.schema || [] # TODO: detect if the table schema has a mismatch
+       end
+
+       def update_settings(args)
+         @settings = Dataflow::Adapters::Settings.new(args)
+       end
+
+       def set_schema(schema)
+         @schema = schema
+       end
+
+       # retrieve a single element from a data node
+       def find(where: {}, fields: [], sort: {}, offset: 0)
+         all(where: where, fields: fields, sort: sort, offset: offset, limit: 1).first
+       end
+
+       # retrieve all elements from a data node
+       def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+         res = client[settings.read_dataset_name.to_sym]
+
+         # if no fields are given, automatically
+         # select all the fields except the system _id
+         fields = res.columns.reject { |x| x == SYSTEM_ID } if fields.blank?
+
+         res = res.select(*fields.map(&:to_sym)) if fields.present?
+         res = apply_query(res, where)
+
+         (sort || {}).each do |k, v|
+           sort_value = v == 1 ? k.to_sym : Sequel.desc(k.to_sym)
+           res = res.order(sort_value)
+         end
+
+         res = res.offset(offset) if offset > 0
+         res = res.limit(limit) if limit > 0
+
+         if block_given?
+           yield res
+         else
+           res.to_a
+         end
+       end
+
+       # Create queries that permit processing the whole dataset in parallel without using offsets.
+       def ordered_system_id_queries(batch_size:)
+         ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
+         queries_count = (ids.size / batch_size.to_f).ceil
+         Array.new(queries_count) do |i|
+           from = ids[i * batch_size]
+           to = ids[(i + 1) * batch_size] || ids[-1]
+           is_last = i == queries_count - 1
+
+           where_query = { SYSTEM_ID => { '>=' => from } }
+           operator = is_last ? '<=' : '<'
+           where_query[SYSTEM_ID][operator] = to
+
+           where_query
+         end
+       end
+
+       # count the number of records
+       def count(where: {})
+         res = client[settings.read_dataset_name.to_sym]
+         res = apply_query(res, where)
+         res.count
+       rescue Sequel::DatabaseError
+         0
+       end
+
+       # Save the given records
+       # TODO: support :replace_by parameter
+       def save(records:)
+         dataset = client[settings.write_dataset_name.to_sym]
+         columns = dataset.columns.reject { |x| x == SYSTEM_ID }
+
+         tabular_data = records.map do |record|
+           columns.map { |col| record[col] }
+         end
+
+         dataset.insert_ignore.import(columns, tabular_data)
+       end
+
+       # Delete records that match the options.
+       # @param where query to apply on the delete operation.
+       # @note this deletes on the read dataset,
+       #   i.e. changes are seen immediately in the case of double-buffered datasets
+       def delete(where: {})
+         res = client[settings.read_dataset_name.to_sym]
+         res = apply_query(res, where)
+         res.delete
+       end
+
+       # recreate the table/collection
+       def recreate_dataset(dataset: nil)
+         dataset ||= settings.write_dataset_name.to_sym
+         client.drop_table?(dataset)
+
+         unless @schema.present?
+           p 'WARNING: recreate dataset aborted: no schema'
+           return
+         end
+
+         create_table(dataset, @schema)
+       end
+
+       # Create the indexes on this dataset.
+       # @param dataset [String] Specify on which dataset the operation will be performed.
+       #   Default: the adapter's settings' dataset.
+       # @param type [Symbol] select which indexes type to create.
+       #   Can be :all (default), :unique_only, :non_unique_only.
+       # TODO: add support for a :drop_retry_on_error parameter.
+       def create_indexes(dataset: nil, type: :all)
+         dataset ||= settings.write_dataset_name
+         dataset = dataset.to_sym
+         indexes = (settings.indexes || [])
+
+         case type
+         when :unique_only
+           indexes = indexes.select { |idx| idx['unique'] }
+         when :non_unique_only
+           indexes = indexes.reject { |idx| idx['unique'] }
+         end
+
+         indexes.each do |index|
+           params = index_parameters(index)
+
+           begin
+             client.add_index(dataset, *params)
+           rescue Sequel::DatabaseError => e
+             # ignore if the index already exists
+             raise e unless e.wrapped_exception.is_a?(PG::DuplicateTable)
+           end
+         end
+       end
+
+       def usage(dataset:)
+         indexes = retrieve_collection_indexes(dataset)
+         table_usage = fetch_table_usage(dataset: dataset)
+         table_usage.merge(effective_indexes: indexes)
+       end
+
+       private
+
+       MAX_INT = 2_147_483_647
+       MAX_VARCHAR = 255
+
+       def create_table(dataset, schema)
+         client.create_table(dataset.to_sym) do
+           # always add an _id field to be used internally
+           primary_key SYSTEM_ID
+
+           schema.each do |column, info|
+             type = info[:type]
+             max_size = info[:max] || info.dig(:types, type, :max)
+
+             case type
+             when 'object', 'string'
+               max_size ||= info.dig(:types, 'string', :max) || MAX_VARCHAR + 1
+               col_type = if max_size <= MAX_VARCHAR
+                            "varchar(#{max_size})"
+                          else
+                            'text'
+                          end
+             when 'time'
+               col_type = 'timestamp'
+             when 'integer'
+               max_size ||= MAX_INT + 1
+               col_type = if max_size <= MAX_INT
+                            'integer'
+                          else
+                            'bigint'
+                          end
+             when 'numeric'
+               col_type = 'real'
+             when 'array', 'hash'
+               p "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
+               col_type = 'json'
+             else
+               p "Error: unexpected type '#{type}'. Keeping as-is."
+               col_type = type
+             end
+
+             # create a column with the given type
+             p "#{column} #{type} -> #{col_type}"
+             column(column.to_sym, col_type)
+           end
+         end
+       end
+
+       def apply_query(res, opts)
+         queries = transform_to_query(opts)
+         queries.each do |query_args|
+           res = res.where(*query_args)
+         end
+         res
+       end
+
+       def transform_to_query(opts)
+         # map to a series of AND-ed clauses
+         opts.flat_map do |k, v|
+           if v.is_a? Hash
+             v.map do |operator, value|
+               case operator
+               when '!='
+                 if value.is_a? Array
+                   ["#{k} NOT IN ?", value]
+                 else
+                   ["#{k} <> ?", value]
+                 end
+               when '<'
+                 ["#{k} < ?", value]
+               when '<='
+                 ["#{k} <= ?", value]
+               when '>'
+                 ["#{k} > ?", value]
+               when '>='
+                 ["#{k} >= ?", value]
+               end
+             end
+           else
+             # e.g. simple matches { 'id' => 1 } or IN clauses { 'id' => [1, 2] }
+             # are supported with simple hashes
+             [[{ k.to_sym => v }]]
+           end
+         end
+       end
+
+       # Required index format for sequel:
+       # :keys, unique: true
+       def index_parameters(index)
+         index = index.with_indifferent_access
+         keys = Array(index[:key]).map(&:to_sym)
+         params = [keys]
+         params << { unique: true } if index[:unique]
+         params
+       end
+
+       def retrieve_collection_indexes(collection)
+         psql_indexes = client.indexes(collection)
+         psql_indexes.values.map do |idx|
+           cols = idx[:columns].map(&:to_s)
+           index = { 'key' => cols }
+           index['unique'] = true if idx[:unique]
+           index
+         end.compact
+       end
+     end
+   end
+ end
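
To illustrate the where-clause format that transform_to_query accepts, a usage sketch (the adapter construction values and field names are hypothetical):

    adapter = Dataflow::Adapters::SqlAdapter.new(
      db_name: 'dataflow_test', dataset_name: 'users', adapter_type: 'postgresql'
    )

    # plain values become equality / IN clauses; nested hashes use the operators above
    a_week_ago = Time.now - 7 * 24 * 3600
    adapter.all(where: { 'status' => %w(active pending),            # status IN (...)
                         'updated_at' => { '>=' => a_week_ago } })  # updated_at >= ?

    # process the whole dataset in batches without OFFSET:
    adapter.ordered_system_id_queries(batch_size: 1000).each do |query|
      batch = adapter.all(where: query)
      # ... process the batch ...
    end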
@@ -0,0 +1,7 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Errors
+     class InvalidConfigurationError < StandardError
+     end
+   end
+ end
@@ -0,0 +1,7 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Errors
+     class NotImplementedError < StandardError
+     end
+   end
+ end
@@ -0,0 +1,77 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module EventMixin
+     extend ActiveSupport::Concern
+
+     module ClassMethods
+       def event(event_name)
+         # re-open the base class
+         handlers_var_name = "@#{event_name}_handlers"
+
+         # Defines a class method called "event_name".
+         # It will serve as a class-level (global) event handler for this class.
+         # @yield (optional) the event handler to add
+         # @return Array the list of event handlers
+         define_singleton_method(event_name) do |&block|
+           handlers = instance_variable_get(handlers_var_name)
+
+           unless handlers
+             handlers = []
+             instance_variable_set(handlers_var_name, [])
+           end
+
+           if block.present?
+             handlers << block
+             instance_variable_set(handlers_var_name, handlers)
+           end
+
+           # return all events from the hierarchy
+           superclass_handlers = []
+           superclass = self.superclass
+           while superclass
+             superclass_handlers += superclass.instance_variable_get(
+               :"@#{event_name}_handlers"
+             ) || []
+             superclass = superclass.superclass
+           end
+
+           handlers + superclass_handlers
+         end
+
+         # Defines a method called "event_name".
+         # It will serve as an instance-level event handler.
+         # @yield (optional) the event handler to add
+         # @return Array the list of event handlers
+         define_method(event_name) do |&block|
+           handlers = instance_variable_get(handlers_var_name)
+
+           unless handlers
+             handlers = []
+             instance_variable_set(handlers_var_name, [])
+           end
+
+           if block.present?
+             handlers << block
+             instance_variable_set(handlers_var_name, handlers)
+           end
+
+           handlers
+         end
+
+         # Defines a way to fire the event: "on_event_name(evt)"
+         # @param *args a variable list of arguments passed to the handlers
+         define_method("on_#{event_name}") do |*args|
+           handlers = send(event_name) + self.class.send(event_name)
+           handlers.each do |handler|
+             begin
+               handler.call(self, *args)
+             rescue StandardError => e
+               @logger&.log("ERROR IN HANDLER [on_#{event_name}]: #{e}")
+               # ignore errors raised by handlers
+             end
+           end
+         end
+       end
+     end
+   end
+ end
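
A sketch of how this mixin is typically wired up (the class and event names below are hypothetical):

    class Job
      include Dataflow::EventMixin
      event :computing_finished
    end

    # class-level handler, shared by every instance
    Job.computing_finished do |job, status|
      puts "#{job.class} finished: #{status}"
    end

    job = Job.new
    job.computing_finished { |_job, status| puts "instance handler: #{status}" }
    job.on_computing_finished('ok')  # calls each handler with (job, 'ok')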
@@ -0,0 +1,21 @@
+ # frozen_string_literal: true
+ module Mongo
+   class Collection
+     class View
+       attr_reader :cursor
+
+       def initial_query
+         @cursor = nil
+         result = nil
+
+         read_with_retry do
+           server = read.select_server(cluster, false)
+           result = send_initial_query(server)
+           @cursor = Cursor.new(view, result, server)
+         end
+
+         result
+       end
+     end
+   end
+ end
@@ -0,0 +1,19 @@
+ # frozen_string_literal: true
+
+ Time.class_eval do
+   def to_msgpack(out = '')
+     iso8601.to_msgpack(out)
+   end
+ end
+
+ DateTime.class_eval do
+   def to_msgpack(out = '')
+     iso8601.to_msgpack(out)
+   end
+ end
+
+ Date.class_eval do
+   def to_msgpack(out = '')
+     iso8601.to_msgpack(out)
+   end
+ end
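
With these patches, time-like values pack as ISO 8601 strings instead of failing as unsupported types; a quick round-trip sketch (the timestamp shown is hypothetical):

    require 'time'
    require 'msgpack'

    packed = Time.now.to_msgpack
    MessagePack.unpack(packed)  # => "2017-03-05T12:34:56+00:00" (a plain string)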
@@ -0,0 +1,27 @@
+ # frozen_string_literal: true
+ module Dataflow
+   class Logger
+     attr_accessor :prefix
+     attr_accessor :use_notifications
+
+     def initialize(prefix:, use_notifications: false)
+       @prefix = prefix
+       @use_notifications = use_notifications
+       @@impl = LoggerImpl.new
+     end
+
+     def log(str)
+       return if ENV['RACK_ENV'] == 'test'
+       now = DateTime.now.strftime('%y-%m-%d %H:%M:%S')
+       message = "[#{now}] #{prefix} :: #{str}"
+       logger_impl = @@impl
+       logger_impl.log(message)
+     end
+
+     class LoggerImpl
+       def log(message)
+         puts message
+       end
+     end
+   end
+ end
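
A minimal usage sketch (the prefix and message are hypothetical; the output format follows the strftime pattern above):

    logger = Dataflow::Logger.new(prefix: 'Dataflow')
    logger.log('computing node users...')
    # => [17-03-05 12:34:56] Dataflow :: computing node users...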
@@ -0,0 +1,37 @@
+ # frozen_string_literal: true
+ module Dataflow
+   # Define a (default) common interface for nodes.
+   # These may be overridden by specific implementations.
+   module Node
+     # Returns either a DataNode or a ComputeNode that matches the given id
+     def self.find(id)
+       begin
+         return Dataflow::Nodes::DataNode.find(id)
+       rescue Mongoid::Errors::DocumentNotFound
+         # try again against a compute node
+       end
+
+       Dataflow::Nodes::ComputeNode.find(id)
+     end
+
+     def updated?
+       true
+     end
+
+     def recompute(*args)
+       # Interface only, for recursion purposes
+     end
+
+     # Overridden in compute nodes
+     def valid_for_computation?
+       true
+     end
+
+     def validate!
+       # raise if the normal model validations do not pass.
+       valid = valid_for_computation?
+       raise Dataflow::Errors::InvalidConfigurationError, errors.messages unless valid
+       true
+     end
+   end
+ end
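
A sketch of the lookup in use (some_id stands in for an actual node id):

    node = Dataflow::Node.find(some_id)  # tries DataNode first, then ComputeNode
    node.validate!   # raises Dataflow::Errors::InvalidConfigurationError if invalid
    node.recompute   # no-op by default; compute nodes override this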