ductr-postgres 0.1.0
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +21 -0
- data/.vscode/settings.json +22 -0
- data/.yardopts +1 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/COPYING +674 -0
- data/COPYING.LESSER +165 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +138 -0
- data/README.md +67 -0
- data/Rakefile +38 -0
- data/ductr-postgres.gemspec +48 -0
- data/lib/ductr/postgres/adapter.rb +55 -0
- data/lib/ductr/postgres/basic_lookup.rb +37 -0
- data/lib/ductr/postgres/buffered_destination.rb +29 -0
- data/lib/ductr/postgres/buffered_lookup.rb +37 -0
- data/lib/ductr/postgres/buffered_upsert_destination.rb +50 -0
- data/lib/ductr/postgres/match_lookup.rb +83 -0
- data/lib/ductr/postgres/polling_handler.rb +40 -0
- data/lib/ductr/postgres/polling_trigger.rb +46 -0
- data/lib/ductr/postgres/streamed_source.rb +30 -0
- data/lib/ductr/postgres/version.rb +8 -0
- data/lib/ductr/postgres.rb +31 -0
- data/sig/ductr/postgres.rbs +244 -0
- metadata +240 -0
data/lib/ductr/postgres/match_lookup.rb
@@ -0,0 +1,83 @@
# frozen_string_literal: true

module Ductr
  module Postgres
    #
    # A lookup control that executes the query for a batch of rows, registered as `:match`.
    #
    # Accepts the `:buffer_size` option; the default value is 10,000.
    # Accepts the mandatory `:merge` option, an array with two entries:
    # - The first one is the looked up row key to match.
    # - The second one is the buffer row key to match.
    #
    # Unlike the `:buffered` lookup, this one abstracts the row matching logic by assuming that
    # you want to merge rows based on a key pair, e.g. primary / foreign keys:
    #
    #   lookup :some_postgres_database, :match, merge: [:id, :item], buffer_size: 42
    #   def merge_with_stuff(db, ids)
    #     db[:items_bis].where(item: ids)
    #   end
    #
    class MatchLookup < Ductr::ETL::BufferedTransform
      Adapter.lookup_registry.add(self, as: :match)

      #
      # The looked up row key to match.
      #
      # @return [Symbol] The column name
      #
      def from_key
        @options[:merge].first
      end

      #
      # The buffer row key to match.
      #
      # @return [Symbol] The column name
      #
      def to_key
        @options[:merge].last
      end

      #
      # Opens the database if needed, calls the job's method and merges
      # the looked up rows with the corresponding buffer rows.
      #
      # @yield [row] The each block
      # @yieldparam [Hash<Symbol, Object>] row The merged row
      #
      # @return [void]
      #
      def on_flush(&)
        call_method(adapter.db, buffer_keys).each do |row|
          match = buffer_find(row)
          next yield(row) unless match

          yield(row.merge match)
        end
      end

      private

      #
      # Finds the corresponding row in the buffer.
      #
      # @param [Hash<Symbol, Object>] row The looked up row
      #
      # @return [Hash<Symbol, Object>, nil] The matching row, if it exists
      #
      def buffer_find(row)
        buffer.find { |r| r[from_key] == row[to_key] }
      end

      #
      # Maps the buffer keys into an array.
      #
      # @return [Array<Integer, String>] The keys array
      #
      def buffer_keys
        buffer.map { |row| row[from_key] }
      end
    end
  end
end
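Concretely, the matching in `on_flush` reduces to plain Hash comparisons keyed on the `:merge` pair. Here is a standalone sketch of that behaviour with hypothetical `items` / `items_bis` shaped rows; no database or Ductr runtime is involved:

```ruby
# With merge: [:id, :item], buffer rows are keyed by :id and the rows returned
# by the job's method are matched on :item, then merged into the yielded output.
buffer = [
  { id: 1, name: "foo" },
  { id: 2, name: "bar" }
]

# What the job's method could return for buffer_keys == [1, 2]:
looked_up = [
  { item: 1, price: 10 }, # matches the buffer row with id: 1
  { item: 3, price: 30 }  # no match, yielded untouched
]

looked_up.each do |row|
  match = buffer.find { |r| r[:id] == row[:item] }
  p(match ? row.merge(match) : row)
end
# => prints the merged row ({item: 1, price: 10, id: 1, name: "foo"}),
#    then the unmatched row as-is ({item: 3, price: 30})
```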
data/lib/ductr/postgres/polling_handler.rb
@@ -0,0 +1,40 @@
# frozen_string_literal: true

module Ductr
  module Postgres
    #
    # The rufus-scheduler handler class.
    # @see https://github.com/jmettraux/rufus-scheduler#scheduling-handler-instances
    #   For further information
    #
    class PollingHandler
      #
      # Creates the handler from the given scheduler method and the trigger's adapter instance.
      #
      # @param [Method] method The scheduler's method
      # @param [Ductr::Adapter] adapter The trigger's adapter
      #
      def initialize(method, adapter)
        @method = method
        @adapter = adapter
        @last_triggering_key = nil
      end

      #
      # The callable method used by the trigger; actually calls the scheduler's method.
      #
      # @return [void]
      #
      def call
        @adapter.open do |db|
          @method.call(db) do |triggering_key|
            return false if triggering_key == @last_triggering_key

            @last_triggering_key = triggering_key
            true
          end
        end
      end
    end
  end
end
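To make the handler's change-detection contract concrete, the sketch below drives `PollingHandler` directly with a stub adapter and a lambda standing in for the scheduler's method; both stand-ins are hypothetical and exist only for illustration:

```ruby
require "ductr/postgres"

# Stub adapter: yields a placeholder where the real adapter would yield the
# Sequel::Database instance.
stub_adapter = Object.new
def stub_adapter.open
  yield :fake_db
end

# Simulated "latest value" returned by three successive polls.
keys = [1, 1, 2]

# Stands in for the scheduler's method: it yields the current key back to the
# handler's block and acts only when that block returns true.
poll = lambda do |_db, &changed|
  puts "change detected" if changed.call(keys.shift)
end

handler = Ductr::Postgres::PollingHandler.new(poll, stub_adapter)
handler.call # key 1, first poll -> prints "change detected"
handler.call # key 1, unchanged  -> silent
handler.call # key 2, changed    -> prints "change detected"
```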
data/lib/ductr/postgres/polling_trigger.rb
@@ -0,0 +1,46 @@
# frozen_string_literal: true

module Ductr
  module Postgres
    #
    # A trigger based on the RufusTrigger, runs the PollingHandler at the given timing.
    # The handler calls the scheduler's method with a block that compares the yielded result with the previous one.
    # If they are different, yield returns true:
    #
    #   trigger :my_database, :polling, interval: "1min"
    #   def check_timestamp(db) # will perform MyJob if the name has changed
    #     return unless yield(db[:items].select(:name).first)
    #
    #     MyJob.perform_later
    #   end
    #
    class PollingTrigger < Ductr::RufusTrigger
      Adapter.trigger_registry.add(self, as: :polling)

      #
      # Closes the connection if the scheduler is stopped.
      #
      # @return [void]
      #
      def stop
        super
        adapter.close!
      end

      private

      #
      # Returns a callable object, allowing rufus-scheduler to call it.
      #
      # @param [Method] method The scheduler's method
      # @param [Hash] ** The options passed to the trigger annotation
      #
      # @return [#call] A callable object
      #
      def callable(method, **)
        PollingHandler.new(method, adapter)
      end
    end
  end
end
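Wired into a scheduler, the trigger might be declared as in the sketch below. The annotation-before-method pattern comes from the example above; the `Ductr::Scheduler` base class (referenced in the gem's type signatures), the table and the `MyJob` class are assumptions:

```ruby
require "ductr/postgres"

# A minimal scheduler sketch, assuming schedulers subclass Ductr::Scheduler.
class MyScheduler < Ductr::Scheduler
  # Polls every minute; the PollingHandler yields the query result back and
  # returns true only when it differs from the previous poll.
  trigger :some_postgres_database, :polling, interval: "1min"
  def check_items(db)
    return unless yield(db[:items].max(:updated_at))

    MyJob.perform_later # hypothetical job enqueued when a change is detected
  end
end
```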
data/lib/ductr/postgres/streamed_source.rb
@@ -0,0 +1,30 @@
# frozen_string_literal: true

module Ductr
  module Postgres
    #
    # A source control that yields rows using the PostgreSQL streaming feature, registered as `:streamed`:
    #
    #   source :some_postgres_database, :streamed
    #   def select_some_stuff(db)
    #     db[:items].limit(42)
    #   end
    #
    # You can select a large number of rows without worrying about pagination handling or memory usage.
    #
    class StreamedSource < Ductr::ETL::Source
      Adapter.source_registry.add(self, as: :streamed)

      #
      # Opens the database, calls the job's method and iterates over the query results.
      #
      # @yield The each block
      #
      # @return [void]
      #
      def each(&)
        call_method(adapter.db).each(&)
      end
    end
  end
end
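An end-to-end pipeline built on this source could look like the sketch below. The `source` / `lookup` / `destination` annotations and method signatures follow the examples in this gem's docs; the `Ductr::Job` base class and the table names are assumptions to check against the ductr gem:

```ruby
require "ductr/postgres"

class SyncItemsJob < Ductr::Job # assumed base class
  # Streams every row of a potentially large table in a single query.
  source :some_postgres_database, :streamed
  def select_items(db)
    db[:items]
  end

  # Enriches each buffered batch with its items_bis counterpart (see MatchLookup).
  lookup :some_postgres_database, :match, merge: [:id, :item], buffer_size: 10_000
  def join_items_bis(db, ids)
    db[:items_bis].where(item: ids)
  end

  # Writes the merged rows by batch.
  destination :some_postgres_database, :buffered
  def write_items(db, buffer)
    db[:items_report].multi_insert(buffer)
  end
end
```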
data/lib/ductr/postgres.rb
@@ -0,0 +1,31 @@
# frozen_string_literal: true

require "ductr"
require "sequel"

Dir[File.join(__dir__, "postgres", "*.rb")].each { |file| require file }

# :nodoc:
module Ductr
  #
  # ## PostgreSQL adapter for Ductr ETL
  # This gem provides useful controls to operate Ductr ETL with PostgreSQL databases.
  #
  # To get details about the database connection handling, check out the {Ductr::Postgres::Adapter} class.
  #
  # ### Sources
  # - {Ductr::Postgres::BasicSource} Yields rows one by one.
  # - {Ductr::Postgres::PaginatedSource} Allows selecting a large number of rows by relying on pagination.
  #
  # ### Lookups
  # - {Ductr::Postgres::BasicLookup} Executes one query per row and merges the looked up row with the current row.
  # - {Ductr::Postgres::BufferedLookup} Executes one query for a batch of rows and lets you implement the matching logic.
  # - {Ductr::Postgres::MatchLookup} Executes one query for a batch of rows and abstracts the matching logic.
  #
  # ### Destinations
  # - {Ductr::Postgres::BasicDestination} Writes rows one by one.
  # - {Ductr::Postgres::BufferedDestination} Accumulates rows in a buffer to write them by batch.
  # - {Ductr::Postgres::BufferedUpsertDestination} Accumulates rows in a buffer to upsert them by batch.
  #
  module Postgres; end
end
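Since the gem's entry point is `lib/ductr/postgres.rb`, a single require pulls in `ductr`, `sequel` and every control file, which in turn registers the sources, lookups, destinations and the trigger listed above. A minimal bootstrap sketch, assuming the companion `ductr` gem provides the core classes:

```ruby
# Gemfile
# gem "ductr"
# gem "ductr-postgres"

require "ductr/postgres"

puts Ductr::Postgres::VERSION # => "0.1.0"
```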
data/sig/ductr/postgres.rbs
@@ -0,0 +1,244 @@
# :nodoc:
module Ductr
  #
  # ## PostgreSQL adapter for Ductr ETL
  # This gem provides useful controls to operate Ductr ETL with PostgreSQL databases.
  #
  # To get details about the database connection handling, check out the {Ductr::Postgres::Adapter} class.
  #
  # ### Sources
  # - {Ductr::Postgres::BasicSource} Yields rows one by one.
  # - {Ductr::Postgres::PaginatedSource} Allows selecting a large number of rows by relying on pagination.
  #
  # ### Lookups
  # - {Ductr::Postgres::BasicLookup} Executes one query per row and merges the looked up row with the current row.
  # - {Ductr::Postgres::BufferedLookup} Executes one query for a batch of rows and lets you implement the matching logic.
  # - {Ductr::Postgres::MatchLookup} Executes one query for a batch of rows and abstracts the matching logic.
  #
  # ### Destinations
  # - {Ductr::Postgres::BasicDestination} Writes rows one by one.
  # - {Ductr::Postgres::BufferedDestination} Accumulates rows in a buffer to write them by batch.
  # - {Ductr::Postgres::BufferedUpsertDestination} Accumulates rows in a buffer to upsert them by batch.
  module Postgres
    VERSION: String

    #
    # The PostgreSQL adapter implements the required #open! and #close! methods to handle the database connection.
    # The adapter is registered as `:postgres`; to use it, add `adapter: postgres` to the YAML configuration, e.g.:
    #
    # ```yml
    # # config/development.yml
    # adapters:
    #   some_postgres_database:
    #     adapter: postgres
    #     host: localhost
    #     user: postgres
    #     password: s3cr3t
    #     database: example
    # ```
    #
    # @see https://sequel.jeremyevans.net/rdoc/files/doc/opening_databases_rdoc.html#label-General+connection+options
    #   General Sequel options
    # @see https://sequel.jeremyevans.net/rdoc/files/doc/opening_databases_rdoc.html#label-postgres
    #   PostgreSQL specific options
    class Adapter < Ductr::Adapter
      # sord warn - Sequel::Database wasn't able to be resolved to a constant in this project
      # Opens the database connection with the adapter's configuration.
      #
      # _@return_ — The database connection instance
      def open!: () -> Sequel::Database

      # Closes the database connection.
      def close!: () -> void

      # sord warn - Sequel::Database wasn't able to be resolved to a constant in this project
      # _@return_ — The database connection instance
      attr_reader db: Sequel::Database?
    end

    #
    # A lookup control that executes one query per row, registered as `:basic`.
    # The job's method must return a row, which will be merged with the current row:
    #
    #   lookup :some_postgres_database, :basic
    #   def my_lookup(db, row)
    #     db[:items_bis].where(item: row[:id]).limit(1)
    #   end
    #
    # As the control merges the looked up row with the current row,
    # ensure that column names are different or they will be overwritten.
    #
    # If the lookup returns a falsy value, nothing will be merged with the current row.
    class BasicLookup < Ductr::ETL::Transform
      # Calls the job's method to merge its result with the current row.
      #
      # _@param_ `row` — The current row, preferably a Hash
      #
      # _@return_ — The row merged with the looked up row, or the untouched row if nothing was found
      def process: (::Hash[Symbol, Object] row) -> ::Hash[Symbol, Object]
    end

    #
    # A lookup control that executes the query for a batch of rows, registered as `:match`.
    #
    # Accepts the `:buffer_size` option; the default value is 10,000.
    # Accepts the mandatory `:merge` option, an array with two entries:
    # - The first one is the looked up row key to match.
    # - The second one is the buffer row key to match.
    #
    # Unlike the `:buffered` lookup, this one abstracts the row matching logic by assuming that
    # you want to merge rows based on a key pair, e.g. primary / foreign keys:
    #
    #   lookup :some_postgres_database, :match, merge: [:id, :item], buffer_size: 42
    #   def merge_with_stuff(db, ids)
    #     db[:items_bis].where(item: ids)
    #   end
    class MatchLookup < Ductr::ETL::BufferedTransform
      # The looked up row key to match.
      #
      # _@return_ — The column name
      def from_key: () -> Symbol

      # The buffer row key to match.
      #
      # _@return_ — The column name
      def to_key: () -> Symbol

      # Opens the database if needed, calls the job's method and merges
      # the looked up rows with the corresponding buffer rows.
      def on_flush: () ?{ (::Hash[Symbol, Object] row) -> void } -> void

      # Finds the corresponding row in the buffer.
      #
      # _@param_ `row` — The looked up row
      #
      # _@return_ — The matching row, if it exists
      def buffer_find: (::Hash[Symbol, Object] row) -> ::Hash[Symbol, Object]?

      # Maps the buffer keys into an array.
      #
      # _@return_ — The keys array
      def buffer_keys: () -> ::Array[(Integer | String)]
    end

    #
    # A lookup control that executes the query for a batch of rows, registered as `:buffered`.
    # Accepts the `:buffer_size` option; the default value is 10,000.
    # You have to implement your own row matching logic:
    #
    #   lookup :some_postgres_database, :buffered, buffer_size: 42
    #   def my_lookup(db, buffer, &)
    #     ids = buffer.map { |row| row[:id] }
    #     db[:items].where(item: ids).each do |row|
    #       match = buffer.find { |r| r[:id] == row[:item] }
    #
    #       next yield(row) unless match
    #
    #       yield(row.merge match)
    #     end
    #   end
    class BufferedLookup < Ductr::ETL::BufferedTransform
      # Opens the database if needed, calls the job's method and passes the each block to it.
      def on_flush: () -> void
    end

    #
    # The rufus-scheduler handler class.
    # @see https://github.com/jmettraux/rufus-scheduler#scheduling-handler-instances
    #   For further information
    class PollingHandler
      # sord warn - Ductr::Adapter wasn't able to be resolved to a constant in this project
      # Creates the handler from the given scheduler method and the trigger's adapter instance.
      #
      # _@param_ `method` — The scheduler's method
      #
      # _@param_ `adapter` — The trigger's adapter
      def initialize: (Method method, Ductr::Adapter adapter) -> void

      # The callable method used by the trigger; actually calls the scheduler's method.
      def call: () -> void
    end

    #
    # A trigger based on the RufusTrigger, runs the PollingHandler at the given timing.
    # The handler calls the scheduler's method with a block that compares the yielded result with the previous one.
    # If they are different, yield returns true:
    #
    #   trigger :my_database, :polling, interval: "1min"
    #   def check_timestamp(db) # will perform MyJob if the name has changed
    #     return unless yield(db[:items].select(:name).first)
    #
    #     MyJob.perform_later
    #   end
    class PollingTrigger < Ductr::RufusTrigger
      # Closes the connection if the scheduler is stopped.
      def stop: () -> void

      # sord duck - #call looks like a duck type, replacing with untyped
      # Returns a callable object, allowing rufus-scheduler to call it.
      #
      # _@param_ `method` — The scheduler's method
      #
      # _@param_ `**` — The options passed to the trigger annotation
      #
      # _@return_ — A callable object
      def callable: (Method method) -> untyped
    end

    #
    # A source control that yields rows using the PostgreSQL streaming feature, registered as `:streamed`:
    #
    #   source :some_postgres_database, :streamed
    #   def select_some_stuff(db)
    #     db[:items].limit(42)
    #   end
    #
    # You can select a large number of rows without worrying about pagination handling or memory usage.
    class StreamedSource < Ductr::ETL::Source
      # Opens the database, calls the job's method and iterates over the query results.
      def each: () -> void
    end

    #
    # A destination control that accumulates rows in a buffer to write them by batch, registered as `:buffered`.
    # Accepts the `:buffer_size` option; the default value is 10,000:
    #
    #   destination :some_postgres_database, :buffered, buffer_size: 42
    #   def my_destination(db, buffer)
    #     db[:items].multi_insert(buffer)
    #   end
    #
    # @see Ductr::ETL::BufferedDestination
    class BufferedDestination < Ductr::ETL::BufferedDestination
      # Opens the database if needed and calls the job's method to run the query.
      def on_flush: () -> void
    end

    #
    # A destination control that accumulates rows in a buffer to upsert them by batch, registered as `:buffered_upsert`.
    # Accepts the `:buffer_size` option; the default value is 10,000:
    #
    #   destination :some_postgres_database, :buffered_upsert, buffer_size: 42
    #   def my_destination(buffer, excluded, db)
    #     db[:items].insert_conflict(target: :id, update: excluded).multi_insert(buffer)
    #   end
    #
    # @see Ductr::ETL::BufferedDestination
    class BufferedUpsertDestination < Ductr::ETL::BufferedDestination
      # Opens the database if needed and calls the job's method to run the query.
      def on_flush: () -> void

      # sord warn - Sequel::SQL::QualifiedIdentifier wasn't able to be resolved to a constant in this project
      # Generates the excluded keys hash, e.g.
      #
      # ```ruby
      # {a: Sequel[:excluded][:a]}
      # ```
      #
      # _@return_ — The excluded keys hash
      def excluded: () -> ::Hash[Symbol, Sequel::SQL::QualifiedIdentifier]
    end
  end
end
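The `excluded` helper above maps each buffered column to its `excluded.<column>` counterpart so that `insert_conflict` reuses the incoming values on conflict. Here is what the resulting upsert looks like in plain Sequel, outside of Ductr; the connection string and the `items(id, name)` table are assumptions:

```ruby
require "sequel"

DB = Sequel.connect("postgres://postgres:s3cr3t@localhost/example") # assumed credentials

buffer = [{ id: 1, name: "a" }, { id: 2, name: "b" }]

# Mirrors what #excluded generates from the buffered row keys:
excluded = buffer.first.keys.to_h { |key| [key, Sequel[:excluded][key]] }
# => { id: Sequel[:excluded][:id], name: Sequel[:excluded][:name] }

# INSERT ... ON CONFLICT (id) DO UPDATE SET id = excluded.id, name = excluded.name
DB[:items].insert_conflict(target: :id, update: excluded).multi_insert(buffer)
```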