ductr-postgres 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module Postgres
5
+ #
6
+ # A lookup control that executes the query for a bunch of rows, registered as `:match`.
7
+ #
8
+ # Accept the `:buffer_size` option, default value is 10 000.
9
+ # Accept the mandatory `:merge` option, an array with two entries:
10
+ # - The first one is the buffer row key to match.
11
+ # - The second one is the looked up row key to match.
12
+ #
13
+ # Unlike the `:buffered` lookup, this one abstracts the row matching logic by assuming that
14
+ # you want to merge rows based on a key couple e.g. primary / foreign keys:
15
+ #
16
+ # lookup :some_postgres_database, :match, merge: [:id, :item], buffer_size: 42
17
+ # def merge_with_stuff(db, ids)
18
+ # db[:items_bis].where(item: ids)
19
+ # end
20
+ #
21
class MatchLookup < Ductr::ETL::BufferedTransform
  Adapter.lookup_registry.add(self, as: :match)

  #
  # The key read from each buffer row when matching,
  # i.e. the first entry of the `:merge` option.
  #
  # @return [Symbol] The column name
  #
  def from_key = @options[:merge].first

  #
  # The key read from each looked up row when matching,
  # i.e. the last entry of the `:merge` option.
  #
  # @return [Symbol] The column name
  #
  def to_key = @options[:merge].last

  #
  # Calls the job's method with the adapter's database and the buffered keys,
  # then yields each returned row, merged with its matching buffer row when one exists.
  # A returned row without any match in the buffer is yielded untouched.
  #
  # @yield [row] The each block
  # @yieldparam [Hash<Symbol, Object>] row The looked up row, merged with its buffer match if any
  #
  # @return [void]
  #
  def on_flush(&)
    call_method(adapter.db, buffer_keys).each do |looked_up|
      buffered = buffer_find(looked_up)
      # On key collisions the buffer row's values win, as it is the merge argument.
      yield(buffered ? looked_up.merge(buffered) : looked_up)
    end
  end

  private

  #
  # Searches the buffer for the first row whose `from_key` value
  # equals the `to_key` value of the given looked up row.
  #
  # @param [Hash<Symbol, Object>] row The looked up row
  #
  # @return [Hash<Symbol, Object>, nil] The matching buffer row, nil when there is none
  #
  def buffer_find(row)
    wanted = row[to_key]
    buffer.find { |candidate| candidate[from_key] == wanted }
  end

  #
  # Collects the `from_key` value of every buffered row.
  #
  # @return [Array<Integer, String>] The keys array
  #
  def buffer_keys
    buffer.map { _1[from_key] }
  end
end
82
+ end
83
+ end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module Postgres
5
+ #
6
+ # The rufus-scheduler handler class.
7
+ # @see https://github.com/jmettraux/rufus-scheduler#scheduling-handler-instances
8
+ # For further information
9
+ #
10
class PollingHandler
  #
  # Creates the handler based on the scheduler's method and the trigger's adapter instance.
  #
  # @param [Method] method The scheduler's method
  # @param [Ductr::Adapter] adapter The trigger's adapter
  #
  def initialize(method, adapter)
    @method = method
    @adapter = adapter
    @last_triggering_key = nil
  end

  #
  # The callable method used by the trigger, actually calls the scheduler's method.
  #
  # The scheduler's method receives the database and a block. Yielding a triggering key
  # to that block returns false when the key equals the previously yielded one,
  # otherwise the key is memorized and true is returned.
  #
  # @return [void]
  #
  def call
    @adapter.open do |db|
      @method.call(db) do |triggering_key|
        # `next` makes the yield itself evaluate to false. A `return` here would be
        # non-local: it would exit #call and abort the scheduler's method mid-way.
        next false if triggering_key == @last_triggering_key

        @last_triggering_key = triggering_key
        true
      end
    end
  end
end
39
+ end
40
+ end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module Postgres
5
+ #
6
+ # A trigger based on the RufusTrigger, runs the PollingHandler at the given timing.
7
+ # The handler calls the scheduler's method with a block which compares the yield result with the previous one.
8
+ # If they are different, yield returns true:
9
+ #
10
+ # trigger :my_database, :polling, interval: "1min"
11
+ # def check_timestamp(db) # will perform MyJob if the name has changed
12
+ # return unless yield(db[:items].select(:name).first)
13
+ #
14
+ # MyJob.perform_later
15
+ # end
16
+ #
17
class PollingTrigger < Ductr::RufusTrigger
  Adapter.trigger_registry.add(self, as: :polling)

  #
  # Runs the parent's stop logic, then closes the adapter's connection.
  #
  # @return [void]
  #
  def stop
    super
    adapter.close!
  end

  private

  #
  # Builds the PollingHandler which rufus-scheduler will call,
  # from the scheduler's method and this trigger's adapter.
  #
  # @param [Method] method The scheduler's method
  # @param [Hash] ** The options passed to the trigger annotation, ignored here
  #
  # @return [#call] A callable object
  #
  def callable(method, **) = PollingHandler.new(method, adapter)
end
45
+ end
46
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module Postgres
5
+ #
6
+ # A source control that yields rows using the PostgreSQL streaming feature, registered as `:streamed`:
7
+ #
8
+ # source :some_postgres_database, :streamed
9
+ # def select_some_stuff(db)
10
+ # db[:items].limit(42)
11
+ # end
12
+ #
13
+ # You can select a large number of rows, without worrying about pagination handling or memory usage.
14
+ #
15
class StreamedSource < Ductr::ETL::Source
  Adapter.source_registry.add(self, as: :streamed)

  #
  # Calls the job's method with the adapter's database and forwards
  # the given block to `#each` on the returned object.
  #
  # @yield The each block
  #
  # @return [void]
  #
  def each(&)
    dataset = call_method(adapter.db)
    dataset.each(&)
  end
end
29
+ end
30
+ end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module Postgres
5
+ # @return [String] VERSION Gem's version
6
+ VERSION = "0.1.0"
7
+ end
8
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ductr"
4
+ require "sequel"
5
+
6
+ Dir[File.join(__dir__, "postgres", "*.rb")].each { |file| require file }
7
+
8
+ # :nodoc:
9
+ module Ductr
10
+ #
11
+ # ## PostgreSQL adapter for Ductr ETL
12
+ # This gem provides useful controls to operate Ductr ETL with PostgreSQL databases.
13
+ #
14
+ # To get details about the database connection handling, check out the {Ductr::Postgres::Adapter} class.
15
+ #
16
+ # ### Sources
17
+ # - {Ductr::Postgres::BasicSource} Yields rows one by one.
18
+ # - {Ductr::Postgres::PaginatedSource} Allows to select a big number of rows by relying on pagination.
19
+ #
20
+ # ### Lookups
21
+ # - {Ductr::Postgres::BasicLookup} Executes one query per row and merge the looked up row with the current row.
22
+ # - {Ductr::Postgres::BufferedLookup} Executes one query for a bunch of rows and let you implement the matching logic.
23
+ # - {Ductr::Postgres::MatchLookup} Executes one query for a bunch of rows and abstracts the matching logic.
24
+ #
25
+ # ### Destinations
26
+ # - {Ductr::Postgres::BasicDestination} Writes rows one by one.
27
+ # - {Ductr::Postgres::BufferedDestination} Accumulates rows in a buffer to write them by batch.
28
+ # - {Ductr::Postgres::BufferedUpsertDestination} Accumulates rows in a buffer to upsert them by batch.
29
+ #
30
+ module Postgres; end
31
+ end
@@ -0,0 +1,244 @@
1
+ # :nodoc:
2
+ module Ductr
3
+ #
4
+ # ## PostgreSQL adapter for Ductr ETL
5
+ # This gem provides useful controls to operate Ductr ETL with PostgreSQL databases.
6
+ #
7
+ # To get details about the database connection handling, checkout the {Ductr::Postgres::Adapter} class.
8
+ #
9
+ # ### Sources
10
+ # - {Ductr::Postgres::BasicSource} Yields rows one by one.
11
+ # - {Ductr::Postgres::PaginatedSource} Allows to select a big number of rows by relying on pagination.
12
+ #
13
+ # ### Lookups
14
+ # - {Ductr::Postgres::BasicLookup} Executes one query per row and merge the looked up row with the current row.
15
+ # - {Ductr::Postgres::BufferedLookup} Executes one query for a bunch of rows and let you implement the matching logic.
16
+ # - {Ductr::Postgres::MatchLookup} Executes one query for a bunch of rows and abstracts the matching logic.
17
+ #
18
+ # ### Destinations
19
+ # - {Ductr::Postgres::BasicDestination} Writes rows one by one.
20
+ # - {Ductr::Postgres::BufferedDestination} Accumulates rows in a buffer to write them by batch.
21
+ # - {Ductr::Postgres::BufferedUpsertDestination} Accumulates rows in a buffer to upsert them by batch.
22
+ module Postgres
23
+ VERSION: String
24
+
25
+ #
26
+ # The PostgreSQL adapter implement the required #open! and #close! methods to handle the database connection.
27
+ # The adapter is registered as `:postgres` to use it, add `adapter: postgres` to the YAML configuration e.g.:
28
+ #
29
+ # ```yml
30
+ # # config/development.yml
31
+ # adapters:
32
+ # some_postgres_database:
33
+ # adapter: postgres
34
+ # host: localhost
35
+ # user: postgres
36
+ # password: s3cr3t
37
+ # database: example
38
+ # ```
39
+ #
40
+ # @see https://sequel.jeremyevans.net/rdoc/files/doc/opening_databases_rdoc.html#label-General+connection+options
41
+ # General sequel options
42
+ # @see https://sequel.jeremyevans.net/rdoc/files/doc/opening_databases_rdoc.html#label-postgres
43
+ # PostgreSQL specific options
44
+ class Adapter < Ductr::Adapter
45
+ # sord warn - Sequel::Database wasn't able to be resolved to a constant in this project
46
+ # Opens the database connection with the adapter's configuration.
47
+ #
48
+ # _@return_ — The database connection instance
49
+ def open!: () -> Sequel::Database
50
+
51
+ # Closes the database connection.
52
+ def close!: () -> void
53
+
54
+ # sord warn - Sequel::Database wasn't able to be resolved to a constant in this project
55
+ # _@return_ — The database connection instance
56
+ attr_reader db: Sequel::Database?
57
+ end
58
+
59
+ #
60
+ # A lookup control that execute one query per row, registered as `:basic`.
61
+ # The job's method must return a row which will be merged with the current row:
62
+ #
63
+ # lookup :some_postgres_database, :basic
64
+ # def my_lookup(db, row)
65
+ # db[:items_bis].where(item: row[:id]).limit(1)
66
+ # end
67
+ #
68
+ # As the control merge the looked up row with the current row,
69
+ # ensure that column names are different or they will be overwritten.
70
+ #
71
+ # If the lookup returns a falsy value, nothing will be merged with the current row.
72
+ class BasicLookup < Ductr::ETL::Transform
73
+ # Calls the job's method to merge its result with the current row.
74
+ #
75
+ # _@param_ `row` — The current row, preferably a Hash
76
+ #
77
+ # _@return_ — The row merged with looked up row or the untouched row if nothing was found
78
+ def process: (::Hash[Symbol, Object] row) -> ::Hash[Symbol, Object]
79
+ end
80
+
81
+ #
82
+ # A lookup control that executes the query for a bunch of rows, registered as `:match`.
83
+ #
84
+ # Accept the `:buffer_size` option, default value is 10 000.
85
+ # Accept the mandatory `:merge` option, an array with two entries:
86
+ # - The first one is the buffer row key to match.
87
+ # - The second one is the looked up row key to match.
88
+ #
89
+ # Unlike the `:buffered` lookup, this one abstracts the row matching logic by assuming that
90
+ # you want to merge rows based on a key couple e.g. primary / foreign keys:
91
+ #
92
+ # lookup :some_postgres_database, :match, merge: [:id, :item], buffer_size: 42
93
+ # def merge_with_stuff(db, ids)
94
+ # db[:items_bis].where(item: ids)
95
+ # end
96
+ class MatchLookup < Ductr::ETL::BufferedTransform
97
+ # The buffer row key to match.
98
+ #
99
+ # _@return_ — The column name
100
+ def from_key: () -> Symbol
101
+
102
+ # The looked up row key to match.
103
+ #
104
+ # _@return_ — The column name
105
+ def to_key: () -> Symbol
106
+
107
+ # Opens the database if needed, calls the job's method and merges
108
+ # the looked up rows with corresponding buffer rows.
109
+ def on_flush: () ?{ (::Hash[Symbol, Object] row) -> void } -> void
110
+
111
+ # Find the corresponding row into the buffer.
112
+ #
113
+ # _@param_ `row` — The looked up row
114
+ #
115
+ # _@return_ — the matching row if exists
116
+ def buffer_find: (::Hash[Symbol, Object] row) -> ::Hash[Symbol, Object]?
117
+
118
+ # Maps the buffer keys into an array.
119
+ #
120
+ # _@return_ — The keys array
121
+ def buffer_keys: () -> ::Array[(Integer | String)]
122
+ end
123
+
124
+ #
125
+ # A lookup control that execute the query for a bunch of rows, registered as `:buffered`.
126
+ # Accept the `:buffer_size` option, default value is 10 000.
127
+ # You have to implement your own row matching logic:
128
+ #
129
+ # lookup :some_postgres_database, :buffered, buffer_size: 42
130
+ # def my_lookup(db, buffer, &)
131
+ # ids = buffer.map {|row| row[:id]}
132
+ # db[:items].where(item: ids).each do |row|
133
+ # match = buffer.find { |r| r[:id] == row[:item] }
134
+ #
135
+ # next yield(row) unless match
136
+ #
137
+ # yield(row.merge match)
138
+ # end
139
+ # end
140
+ class BufferedLookup < Ductr::ETL::BufferedTransform
141
+ # Opens the database if needed, calls the job's method and pass the each block to it.
142
+ def on_flush: () -> void
143
+ end
144
+
145
+ #
146
+ # The rufus-scheduler handler class.
147
+ # @see https://github.com/jmettraux/rufus-scheduler#scheduling-handler-instances
148
+ # For further information
149
+ class PollingHandler
150
+ # sord warn - Ductr::Adapter wasn't able to be resolved to a constant in this project
151
+ # Creates the handler based on the given scheduler, its method name and the trigger's adapter instance.
152
+ #
153
+ # _@param_ `method` — The scheduler's method
154
+ #
155
+ # _@param_ `adapter` — The trigger's adapter
156
+ def initialize: (Method method, Ductr::Adapter adapter) -> void
157
+
158
+ # The callable method used by the trigger, actually calls the scheduler's method.
159
+ def call: () -> void
160
+ end
161
+
162
+ #
163
+ # A trigger based on the RufusTrigger, runs the PollingHandler at the given timing.
164
+ # The handler calls the scheduler's method with a block which compares the yield result with the previous one.
165
+ # If they are different, yield returns true:
166
+ #
167
+ # trigger :my_database, :polling, interval: "1min"
168
+ # def check_timestamp(db) # will perform MyJob if the name has changed
169
+ # return unless yield(db[:items].select(:name).first)
170
+ #
171
+ # MyJob.perform_later
172
+ # end
173
+ class PollingTrigger < Ductr::RufusTrigger
174
+ # Closes the connection if the scheduler is stopped.
175
+ def stop: () -> void
176
+
177
+ # sord duck - #call looks like a duck type, replacing with untyped
178
+ # Returns a callable object, allowing rufus-scheduler to call it.
179
+ #
180
+ # _@param_ `scheduler` — The scheduler instance
181
+ #
182
+ # _@param_ `method` — The scheduler's method
183
+ #
184
+ # _@param_ `**` — The option passed to the trigger annotation
185
+ #
186
+ # _@return_ — A callable object
187
+ def callable: (Method method) -> untyped
188
+ end
189
+
190
+ #
191
+ # A source control that yields rows using the PostgreSQL streaming feature, registered as `:streamed`:
192
+ #
193
+ # source :some_postgres_database, :streamed
194
+ # def select_some_stuff(db)
195
+ # db[:items].limit(42)
196
+ # end
197
+ #
198
+ # You can select a large number of rows, without worrying about pagination handling or memory usage.
199
+ class StreamedSource < Ductr::ETL::Source
200
+ # Opens the database, calls the job's method and iterate over the query results.
201
+ def each: () -> void
202
+ end
203
+
204
+ #
205
+ # A destination control that accumulates rows in a buffer to write them by batch, registered as `:buffered`.
206
+ # Accept the `:buffer_size` option, default value is 10 000:
207
+ #
208
+ # destination :some_postgres_database, :buffered, buffer_size: 42
209
+ # def my_destination(db, buffer)
210
+ # db[:items].multi_insert(buffer)
211
+ # end
212
+ #
213
+ # @see more Ductr::ETL::BufferedDestination
214
+ class BufferedDestination < Ductr::ETL::BufferedDestination
215
+ # Open the database if needed and call the job's method to run the query.
216
+ def on_flush: () -> void
217
+ end
218
+
219
+ #
220
+ # A destination control that accumulates rows in a buffer to upsert them by batch, registered as `:buffered_upsert`.
221
+ # Accept the `:buffer_size` option, default value is 10 000:
222
+ #
223
+ # destination :some_postgres_database, :buffered_upsert, buffer_size: 42
224
+ # def my_destination(buffer, excluded, db)
225
+ # db[:items].insert_conflict(target: :id, update: excluded).multi_insert(buffer)
226
+ # end
227
+ #
228
+ # @see more Ductr::ETL::BufferedDestination
229
+ class BufferedUpsertDestination < Ductr::ETL::BufferedDestination
230
+ # Open the database if needed and call the job's method to run the query.
231
+ def on_flush: () -> void
232
+
233
+ # sord warn - Sequel::SQL::QualifiedIdentifier wasn't able to be resolved to a constant in this project
234
+ # Generate the excluded keys hash e.g.
235
+ #
236
+ # ```ruby
237
+ # {a: Sequel[:excluded][:a]}
238
+ # ```
239
+ #
240
+ # _@return_ — The excluded keys hash
241
+ def excluded: () -> ::Hash[Symbol, Sequel::SQL::QualifiedIdentifier]
242
+ end
243
+ end
244
+ end