ductr-postgres 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
+ # frozen_string_literal: true
+
+ module Ductr
+   module Postgres
+     #
+     # A lookup control that executes the query for a bunch of rows, registered as `:match`.
+     #
+     # Accepts the `:buffer_size` option, the default value is 10 000.
+     # Accepts the mandatory `:merge` option, an array with two entries:
+     # - The first one is the looked up row key to match.
+     # - The second one is the buffer row key to match.
+     #
+     # Unlike the `:buffered` lookup, this one abstracts the row matching logic by assuming that
+     # you want to merge rows based on a key pair, e.g. primary / foreign keys:
+     #
+     #   lookup :some_postgres_database, :match, merge: [:id, :item], buffer_size: 42
+     #   def merge_with_stuff(db, ids)
+     #     db[:items_bis].where(item: ids)
+     #   end
+     #
+     class MatchLookup < Ductr::ETL::BufferedTransform
+       Adapter.lookup_registry.add(self, as: :match)
+
+       #
+       # The looked up row key to match.
+       #
+       # @return [Symbol] The column name
+       #
+       def from_key
+         @options[:merge].first
+       end
+
+       #
+       # The buffer row key to match.
+       #
+       # @return [Symbol] The column name
+       #
+       def to_key
+         @options[:merge].last
+       end
+
+       #
+       # Opens the database if needed, calls the job's method and merges
+       # the looked up rows with the corresponding buffer rows.
+       #
+       # @yield [row] The each block
+       # @yieldparam [Hash<Symbol, Object>] row The merged row
+       #
+       # @return [void]
+       #
+       def on_flush(&)
+         call_method(adapter.db, buffer_keys).each do |row|
+           match = buffer_find(row)
+           next yield(row) unless match
+
+           yield(row.merge match)
+         end
+       end
+
+       private
+
+       #
+       # Finds the corresponding row in the buffer.
+       #
+       # @param [Hash<Symbol, Object>] row The looked up row
+       #
+       # @return [Hash<Symbol, Object>, nil] The matching row if it exists
+       #
+       def buffer_find(row)
+         buffer.find { |r| r[from_key] == row[to_key] }
+       end
+
+       #
+       # Maps the buffer keys into an array.
+       #
+       # @return [Array<Integer, String>] The keys array
+       #
+       def buffer_keys
+         buffer.map { |row| row[from_key] }
+       end
+     end
+   end
+ end
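
To make the merge behaviour of `MatchLookup#on_flush` concrete, here is a standalone sketch in plain Ruby (no Ductr required) that reproduces the matching logic with `merge: [:id, :item]`; the sample rows are invented for illustration only:

```ruby
# Buffered rows coming from the source (keyed by :id) and the looked up rows
# returned by the job's query (keyed by :item), as in the doc example above.
buffer    = [{ id: 1, name: "a" }, { id: 2, name: "b" }]
looked_up = [{ item: 1, price: 10 }, { item: 3, price: 30 }]

from_key, to_key = :id, :item

looked_up.each do |row|
  match = buffer.find { |r| r[from_key] == row[to_key] }
  # Rows without a buffered counterpart are yielded untouched; matched rows are merged.
  p(match ? row.merge(match) : row)
end
# => {:item=>1, :price=>10, :id=>1, :name=>"a"}
# => {:item=>3, :price=>30}
```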
@@ -0,0 +1,40 @@
+ # frozen_string_literal: true
+
+ module Ductr
+   module Postgres
+     #
+     # The rufus-scheduler handler class.
+     # @see https://github.com/jmettraux/rufus-scheduler#scheduling-handler-instances
+     #   For further information
+     #
+     class PollingHandler
+       #
+       # Creates the handler based on the given scheduler method and the trigger's adapter instance.
+       #
+       # @param [Method] method The scheduler's method
+       # @param [Ductr::Adapter] adapter The trigger's adapter
+       #
+       def initialize(method, adapter)
+         @method = method
+         @adapter = adapter
+         @last_triggering_key = nil
+       end
+
+       #
+       # The callable method used by the trigger, actually calls the scheduler's method.
+       #
+       # @return [void]
+       #
+       def call
+         @adapter.open do |db|
+           @method.call(db) do |triggering_key|
+             return false if triggering_key == @last_triggering_key
+
+             @last_triggering_key = triggering_key
+             true
+           end
+         end
+       end
+     end
+   end
+ end
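
The point of this handler is that it only lets the trigger fire when the polled key changes between two runs. A minimal sketch of that behaviour, assuming the gem is installed; the fake adapter, the polling proc and the require path are stand-ins invented for illustration:

```ruby
require "ductr/postgres" # assumed require path for the ductr-postgres gem

# Stand-in for the real adapter: the real one opens a Sequel connection.
fake_adapter = Object.new
def fake_adapter.open
  yield :fake_db
end

keys = [1, 1, 2] # pretend the polled value only changes on the third run
poll = ->(_db, &block) { puts "triggered!" if block.call(keys.shift) }

handler = Ductr::Postgres::PollingHandler.new(poll, fake_adapter)
3.times { handler.call }
# Prints "triggered!" for the first and third calls only: the second call
# yields the same key as the first, so the handler returns false early.
```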
@@ -0,0 +1,46 @@
+ # frozen_string_literal: true
+
+ module Ductr
+   module Postgres
+     #
+     # A trigger based on the RufusTrigger, runs the PollingHandler at the given timing.
+     # The handler calls the scheduler's method with a block which compares the yielded result with the previous one.
+     # If they are different, yield returns true:
+     #
+     #   trigger :my_database, :polling, interval: "1min"
+     #   def check_timestamp(db) # will perform MyJob if the name has changed
+     #     return unless yield(db[:items].select(:name).first)
+     #
+     #     MyJob.perform_later
+     #   end
+     #
+     class PollingTrigger < Ductr::RufusTrigger
+       Adapter.trigger_registry.add(self, as: :polling)
+
+       #
+       # Closes the connection if the scheduler is stopped.
+       #
+       # @return [void]
+       #
+       def stop
+         super
+         adapter.close!
+       end
+
+       private
+
+       #
+       # Returns a callable object, allowing rufus-scheduler to call it.
+       #
+       # @param [Method] method The scheduler's method
+       # @param [Hash] ** The options passed to the trigger annotation
+       #
+       # @return [#call] A callable object
+       #
+       def callable(method, **)
+         PollingHandler.new(method, adapter)
+       end
+     end
+   end
+ end
@@ -0,0 +1,30 @@
+ # frozen_string_literal: true
+
+ module Ductr
+   module Postgres
+     #
+     # A source control that yields rows using the PostgreSQL streaming feature, registered as `:streamed`:
+     #
+     #   source :some_postgres_database, :streamed
+     #   def select_some_stuff(db)
+     #     db[:items].limit(42)
+     #   end
+     #
+     # You can select a large number of rows without worrying about pagination handling or memory usage.
+     #
+     class StreamedSource < Ductr::ETL::Source
+       Adapter.source_registry.add(self, as: :streamed)
+
+       #
+       # Opens the database, calls the job's method and iterates over the query results.
+       #
+       # @yield The each block
+       #
+       # @return [void]
+       #
+       def each(&)
+         call_method(adapter.db).each(&)
+       end
+     end
+   end
+ end
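
For reference, the streaming behaviour this source relies on is exposed by Sequel through the sequel_pg gem. A standalone sketch, independent of Ductr, assuming the `pg_streaming` extension is available; the connection URL and the `items` table are placeholders:

```ruby
require "sequel"

# With sequel_pg's pg_streaming extension, rows are yielded as they arrive
# from PostgreSQL instead of the whole result set being buffered in memory.
DB = Sequel.connect("postgres://postgres:s3cr3t@localhost/example")
DB.extension(:pg_streaming)

DB[:items].stream.each do |row|
  puts row[:name]
end
```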
@@ -0,0 +1,8 @@
+ # frozen_string_literal: true
+
+ module Ductr
+   module Postgres
+     # @return [String] VERSION Gem's version
+     VERSION = "0.1.0"
+   end
+ end
@@ -0,0 +1,31 @@
+ # frozen_string_literal: true
+
+ require "ductr"
+ require "sequel"
+
+ Dir[File.join(__dir__, "postgres", "*.rb")].each { |file| require file }
+
+ # :nodoc:
+ module Ductr
+   #
+   # ## PostgreSQL adapter for Ductr ETL
+   # This gem provides useful controls to operate Ductr ETL with PostgreSQL databases.
+   #
+   # To get details about the database connection handling, check out the {Ductr::Postgres::Adapter} class.
+   #
+   # ### Sources
+   # - {Ductr::Postgres::BasicSource} Yields rows one by one.
+   # - {Ductr::Postgres::PaginatedSource} Allows selecting a large number of rows by relying on pagination.
+   #
+   # ### Lookups
+   # - {Ductr::Postgres::BasicLookup} Executes one query per row and merges the looked up row with the current row.
+   # - {Ductr::Postgres::BufferedLookup} Executes one query for a bunch of rows and lets you implement the matching logic.
+   # - {Ductr::Postgres::MatchLookup} Executes one query for a bunch of rows and abstracts the matching logic.
+   #
+   # ### Destinations
+   # - {Ductr::Postgres::BasicDestination} Writes rows one by one.
+   # - {Ductr::Postgres::BufferedDestination} Accumulates rows in a buffer to write them by batch.
+   # - {Ductr::Postgres::BufferedUpsertDestination} Accumulates rows in a buffer to upsert them by batch.
+   #
+   module Postgres; end
+ end
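
Tying the controls listed above together, a hedged sketch of how they might be combined in a job; the `Ductr::ETLJob` base class, the adapter name and the table names are assumptions taken from the main ductr gem's conventions and the doc comments, not from this diff:

```ruby
# Sketch only: check the ductr documentation for the exact job DSL.
class SyncItemsJob < Ductr::ETLJob
  # Streamed source: rows are read without explicit pagination handling.
  source :some_postgres_database, :streamed
  def select_items(db)
    db[:items].select(:id, :name)
  end

  # Match lookup: one query per buffered batch, merged on items.id == prices.item.
  lookup :some_postgres_database, :match, merge: [:id, :item]
  def fetch_prices(db, ids)
    db[:prices].where(item: ids)
  end

  # Buffered upsert destination, matching the signature shown in the doc comments.
  destination :some_postgres_database, :buffered_upsert
  def upsert_items(buffer, excluded, db)
    db[:items_enriched].insert_conflict(target: :id, update: excluded).multi_insert(buffer)
  end
end
```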
@@ -0,0 +1,244 @@
+ # :nodoc:
+ module Ductr
+   #
+   # ## PostgreSQL adapter for Ductr ETL
+   # This gem provides useful controls to operate Ductr ETL with PostgreSQL databases.
+   #
+   # To get details about the database connection handling, check out the {Ductr::Postgres::Adapter} class.
+   #
+   # ### Sources
+   # - {Ductr::Postgres::BasicSource} Yields rows one by one.
+   # - {Ductr::Postgres::PaginatedSource} Allows selecting a large number of rows by relying on pagination.
+   #
+   # ### Lookups
+   # - {Ductr::Postgres::BasicLookup} Executes one query per row and merges the looked up row with the current row.
+   # - {Ductr::Postgres::BufferedLookup} Executes one query for a bunch of rows and lets you implement the matching logic.
+   # - {Ductr::Postgres::MatchLookup} Executes one query for a bunch of rows and abstracts the matching logic.
+   #
+   # ### Destinations
+   # - {Ductr::Postgres::BasicDestination} Writes rows one by one.
+   # - {Ductr::Postgres::BufferedDestination} Accumulates rows in a buffer to write them by batch.
+   # - {Ductr::Postgres::BufferedUpsertDestination} Accumulates rows in a buffer to upsert them by batch.
+   module Postgres
+     VERSION: String
+
+     #
+     # The PostgreSQL adapter implements the required #open! and #close! methods to handle the database connection.
+     # The adapter is registered as `:postgres`; to use it, add `adapter: postgres` to the YAML configuration e.g.:
+     #
+     # ```yml
+     # # config/development.yml
+     # adapters:
+     #   some_postgres_database:
+     #     adapter: postgres
+     #     host: localhost
+     #     user: postgres
+     #     password: s3cr3t
+     #     database: example
+     # ```
+     #
+     # @see https://sequel.jeremyevans.net/rdoc/files/doc/opening_databases_rdoc.html#label-General+connection+options
+     #   General Sequel options
+     # @see https://sequel.jeremyevans.net/rdoc/files/doc/opening_databases_rdoc.html#label-postgres
+     #   PostgreSQL specific options
+     class Adapter < Ductr::Adapter
+       # sord warn - Sequel::Database wasn't able to be resolved to a constant in this project
+       # Opens the database connection with the adapter's configuration.
+       #
+       # _@return_ — The database connection instance
+       def open!: () -> Sequel::Database
+
+       # Closes the database connection.
+       def close!: () -> void
+
+       # sord warn - Sequel::Database wasn't able to be resolved to a constant in this project
+       # _@return_ — The database connection instance
+       attr_reader db: Sequel::Database?
+     end
+
+     #
+     # A lookup control that executes one query per row, registered as `:basic`.
+     # The job's method must return a row which will be merged with the current row:
+     #
+     #   lookup :some_postgres_database, :basic
+     #   def my_lookup(db, row)
+     #     db[:items_bis].where(item: row[:id]).limit(1)
+     #   end
+     #
+     # As the control merges the looked up row with the current row,
+     # ensure that column names are different or they will be overwritten.
+     #
+     # If the lookup returns a falsy value, nothing will be merged with the current row.
+     class BasicLookup < Ductr::ETL::Transform
+       # Calls the job's method to merge its result with the current row.
+       #
+       # _@param_ `row` — The current row, preferably a Hash
+       #
+       # _@return_ — The row merged with the looked up row, or the untouched row if nothing was found
+       def process: (::Hash[Symbol, Object] row) -> ::Hash[Symbol, Object]
+     end
+
+     #
+     # A lookup control that executes the query for a bunch of rows, registered as `:match`.
+     #
+     # Accepts the `:buffer_size` option, the default value is 10 000.
+     # Accepts the mandatory `:merge` option, an array with two entries:
+     # - The first one is the looked up row key to match.
+     # - The second one is the buffer row key to match.
+     #
+     # Unlike the `:buffered` lookup, this one abstracts the row matching logic by assuming that
+     # you want to merge rows based on a key pair, e.g. primary / foreign keys:
+     #
+     #   lookup :some_postgres_database, :match, merge: [:id, :item], buffer_size: 42
+     #   def merge_with_stuff(db, ids)
+     #     db[:items_bis].where(item: ids)
+     #   end
+     class MatchLookup < Ductr::ETL::BufferedTransform
+       # The looked up row key to match.
+       #
+       # _@return_ — The column name
+       def from_key: () -> Symbol
+
+       # The buffer row key to match.
+       #
+       # _@return_ — The column name
+       def to_key: () -> Symbol
+
+       # Opens the database if needed, calls the job's method and merges
+       # the looked up rows with the corresponding buffer rows.
+       def on_flush: () ?{ (::Hash[Symbol, Object] row) -> void } -> void
+
+       # Finds the corresponding row in the buffer.
+       #
+       # _@param_ `row` — The looked up row
+       #
+       # _@return_ — The matching row if it exists
+       def buffer_find: (::Hash[Symbol, Object] row) -> ::Hash[Symbol, Object]?
+
+       # Maps the buffer keys into an array.
+       #
+       # _@return_ — The keys array
+       def buffer_keys: () -> ::Array[(Integer | String)]
+     end
+
+     #
+     # A lookup control that executes the query for a bunch of rows, registered as `:buffered`.
+     # Accepts the `:buffer_size` option, the default value is 10 000.
+     # You have to implement your own row matching logic:
+     #
+     #   lookup :some_postgres_database, :buffered, buffer_size: 42
+     #   def my_lookup(db, buffer, &)
+     #     ids = buffer.map { |row| row[:id] }
+     #     db[:items].where(item: ids).each do |row|
+     #       match = buffer.find { |r| r[:id] == row[:item] }
+     #
+     #       next yield(row) unless match
+     #
+     #       yield(row.merge match)
+     #     end
+     #   end
+     class BufferedLookup < Ductr::ETL::BufferedTransform
+       # Opens the database if needed, calls the job's method and passes the each block to it.
+       def on_flush: () -> void
+     end
+
+     #
+     # The rufus-scheduler handler class.
+     # @see https://github.com/jmettraux/rufus-scheduler#scheduling-handler-instances
+     #   For further information
+     class PollingHandler
+       # sord warn - Ductr::Adapter wasn't able to be resolved to a constant in this project
+       # Creates the handler based on the given scheduler method and the trigger's adapter instance.
+       #
+       # _@param_ `method` — The scheduler's method
+       #
+       # _@param_ `adapter` — The trigger's adapter
+       def initialize: (Method method, Ductr::Adapter adapter) -> void
+
+       # The callable method used by the trigger, actually calls the scheduler's method.
+       def call: () -> void
+     end
+
+     #
+     # A trigger based on the RufusTrigger, runs the PollingHandler at the given timing.
+     # The handler calls the scheduler's method with a block which compares the yielded result with the previous one.
+     # If they are different, yield returns true:
+     #
+     #   trigger :my_database, :polling, interval: "1min"
+     #   def check_timestamp(db) # will perform MyJob if the name has changed
+     #     return unless yield(db[:items].select(:name).first)
+     #
+     #     MyJob.perform_later
+     #   end
+     class PollingTrigger < Ductr::RufusTrigger
+       # Closes the connection if the scheduler is stopped.
+       def stop: () -> void
+
+       # sord duck - #call looks like a duck type, replacing with untyped
+       # Returns a callable object, allowing rufus-scheduler to call it.
+       #
+       # _@param_ `method` — The scheduler's method
+       #
+       # _@param_ `**` — The options passed to the trigger annotation
+       #
+       # _@return_ — A callable object
+       def callable: (Method method) -> untyped
+     end
+
+     #
+     # A source control that yields rows using the PostgreSQL streaming feature, registered as `:streamed`:
+     #
+     #   source :some_postgres_database, :streamed
+     #   def select_some_stuff(db)
+     #     db[:items].limit(42)
+     #   end
+     #
+     # You can select a large number of rows without worrying about pagination handling or memory usage.
+     class StreamedSource < Ductr::ETL::Source
+       # Opens the database, calls the job's method and iterates over the query results.
+       def each: () -> void
+     end
+
+     #
+     # A destination control that accumulates rows in a buffer to write them by batch, registered as `:buffered`.
+     # Accepts the `:buffer_size` option, the default value is 10 000:
+     #
+     #   destination :some_postgres_database, :buffered, buffer_size: 42
+     #   def my_destination(db, buffer)
+     #     db[:items].multi_insert(buffer)
+     #   end
+     #
+     # @see Ductr::ETL::BufferedDestination
+     class BufferedDestination < Ductr::ETL::BufferedDestination
+       # Opens the database if needed and calls the job's method to run the query.
+       def on_flush: () -> void
+     end
+
+     #
+     # A destination control that accumulates rows in a buffer to upsert them by batch, registered as `:buffered_upsert`.
+     # Accepts the `:buffer_size` option, the default value is 10 000:
+     #
+     #   destination :some_postgres_database, :buffered_upsert, buffer_size: 42
+     #   def my_destination(buffer, excluded, db)
+     #     db[:items].insert_conflict(target: :id, update: excluded).multi_insert(buffer)
+     #   end
+     #
+     # @see Ductr::ETL::BufferedDestination
+     class BufferedUpsertDestination < Ductr::ETL::BufferedDestination
+       # Opens the database if needed and calls the job's method to run the query.
+       def on_flush: () -> void
+
+       # sord warn - Sequel::SQL::QualifiedIdentifier wasn't able to be resolved to a constant in this project
+       # Generates the excluded keys hash e.g.
+       #
+       # ```ruby
+       # { a: Sequel[:excluded][:a] }
+       # ```
+       #
+       # _@return_ — The excluded keys hash
+       def excluded: () -> ::Hash[Symbol, Sequel::SQL::QualifiedIdentifier]
+     end
+   end
+ end