data_drain 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +40 -1
- data/CHANGELOG.md +55 -0
- data/CLAUDE.md +14 -0
- data/README.md +2 -0
- data/data_drain.gemspec +1 -1
- data/docs/IMPROVEMENT_PLAN.md +132 -26
- data/docs/execution/archive/v0.3.0-OBSERVACIONES.md +136 -0
- data/docs/execution/archive/v0.3.0.md +1111 -0
- data/docs/execution/v0.3.1-OBSERVACIONES.md +146 -0
- data/docs/execution/v0.3.1.md +842 -0
- data/lib/data_drain/configuration.rb +7 -1
- data/lib/data_drain/engine.rb +185 -74
- data/lib/data_drain/file_ingestor.rb +64 -47
- data/lib/data_drain/observability/timing.rb +23 -0
- data/lib/data_drain/observability.rb +2 -0
- data/lib/data_drain/record.rb +8 -15
- data/lib/data_drain/storage/base.rb +12 -0
- data/lib/data_drain/storage/local.rb +1 -3
- data/lib/data_drain/storage/s3.rb +35 -14
- data/lib/data_drain/types/json_type.rb +1 -0
- data/lib/data_drain/validations.rb +2 -0
- data/lib/data_drain/version.rb +2 -1
- data/lib/data_drain.rb +2 -0
- data/skill/references/antipatrones.md +10 -0
- data/skill/references/postgres-tuning.md +14 -0
- metadata +8 -3
|
@@ -9,7 +9,10 @@ module DataDrain
|
|
|
9
9
|
:aws_access_key_id, :aws_secret_access_key,
|
|
10
10
|
:db_host, :db_port, :db_user, :db_pass, :db_name,
|
|
11
11
|
:batch_size, :throttle_delay, :logger, :limit_ram, :tmp_directory,
|
|
12
|
-
:idle_in_transaction_session_timeout
|
|
12
|
+
:idle_in_transaction_session_timeout,
|
|
13
|
+
:vacuum_after_purge,
|
|
14
|
+
:slow_batch_threshold_s,
|
|
15
|
+
:slow_batch_alert_after
|
|
13
16
|
|
|
14
17
|
def initialize
|
|
15
18
|
@storage_mode = :local
|
|
@@ -20,6 +23,9 @@ module DataDrain
|
|
|
20
23
|
@limit_ram = nil # eg 2GB
|
|
21
24
|
@tmp_directory = nil # eg /tmp/duckdb_work
|
|
22
25
|
@idle_in_transaction_session_timeout = 0
|
|
26
|
+
@vacuum_after_purge = false
|
|
27
|
+
@slow_batch_threshold_s = 30
|
|
28
|
+
@slow_batch_alert_after = 5
|
|
23
29
|
@logger = Logger.new($stdout)
|
|
24
30
|
end
|
|
25
31
|
|
data/lib/data_drain/engine.rb
CHANGED
|
@@ -5,12 +5,12 @@ require "pg"
|
|
|
5
5
|
|
|
6
6
|
module DataDrain
|
|
7
7
|
# Motor principal de extracción y purga de datos (DataDrain).
|
|
8
|
-
# rubocop:disable Metrics/ClassLength, Metrics/AbcSize, Metrics/MethodLength, Naming/AccessorMethodName
|
|
9
8
|
#
|
|
10
9
|
# Orquesta el flujo ETL desde PostgreSQL hacia un Data Lake analítico
|
|
11
10
|
# delegando la interacción del almacenamiento al adaptador configurado.
|
|
12
11
|
class Engine
|
|
13
12
|
include Observability
|
|
13
|
+
include Observability::Timing
|
|
14
14
|
# Inicializa una nueva instancia del motor de extracción.
|
|
15
15
|
#
|
|
16
16
|
# @param options [Hash] Diccionario de configuración para la extracción.
|
|
@@ -50,70 +50,92 @@ module DataDrain
|
|
|
50
50
|
@duckdb = database.connect
|
|
51
51
|
end
|
|
52
52
|
|
|
53
|
-
#
|
|
54
|
-
#
|
|
55
|
-
# @return [Boolean] `true` si el proceso finalizó con éxito, `false` si falló la integridad.
|
|
53
|
+
# @return [Boolean] true si el flujo completó exitosamente, false si falló
|
|
56
54
|
def call
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
55
|
+
@durations = {}
|
|
56
|
+
start_time = monotonic
|
|
57
|
+
log_start
|
|
60
58
|
|
|
61
59
|
setup_duckdb
|
|
60
|
+
return skip_empty(start_time) if step_count.zero?
|
|
62
61
|
|
|
63
|
-
# 1. Conteo inicial en Postgres
|
|
64
|
-
step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
65
|
-
@pg_count = get_postgres_count
|
|
66
|
-
db_query_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
67
|
-
|
|
68
|
-
if @pg_count.zero?
|
|
69
|
-
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
70
|
-
safe_log(:info, "engine.skip_empty",
|
|
71
|
-
{ table: @table_name, duration_s: duration.round(2), db_query_duration_s: db_query_duration.round(2) })
|
|
72
|
-
return true
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
# 2. Exportación
|
|
76
|
-
export_duration = 0.0
|
|
77
62
|
if @skip_export
|
|
78
63
|
safe_log(:info, "engine.skip_export", { table: @table_name })
|
|
79
64
|
else
|
|
80
|
-
|
|
81
|
-
step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
82
|
-
export_to_parquet
|
|
83
|
-
export_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
65
|
+
step_export
|
|
84
66
|
end
|
|
67
|
+
return integrity_failed(start_time) unless step_verify
|
|
85
68
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
69
|
+
step_purge
|
|
70
|
+
log_complete(start_time)
|
|
71
|
+
true
|
|
72
|
+
end
|
|
90
73
|
|
|
91
|
-
|
|
92
|
-
# 4. Purga en Postgres
|
|
93
|
-
step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
94
|
-
purge_from_postgres
|
|
95
|
-
purge_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
74
|
+
private
|
|
96
75
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
db_query_duration_s: db_query_duration.round(2),
|
|
102
|
-
export_duration_s: export_duration.round(2),
|
|
103
|
-
integrity_duration_s: integrity_duration.round(2),
|
|
104
|
-
purge_duration_s: purge_duration.round(2),
|
|
105
|
-
count: @pg_count
|
|
106
|
-
})
|
|
107
|
-
true
|
|
108
|
-
else
|
|
109
|
-
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
110
|
-
safe_log(:error, "engine.integrity_error",
|
|
111
|
-
{ table: @table_name, duration_s: duration.round(2), count: @pg_count })
|
|
112
|
-
false
|
|
113
|
-
end
|
|
76
|
+
# @api private
|
|
77
|
+
def log_start
|
|
78
|
+
safe_log(:info, "engine.start",
|
|
79
|
+
{ table: @table_name, start_date: @start_date.to_date, end_date: @end_date.to_date })
|
|
114
80
|
end
|
|
115
81
|
|
|
116
|
-
private
|
|
82
|
+
# @api private
|
|
83
|
+
def step_count
|
|
84
|
+
@pg_count = timed(:db_query) { get_postgres_count }
|
|
85
|
+
@pg_count
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# @api private
|
|
89
|
+
def skip_empty(start_time)
|
|
90
|
+
duration = monotonic - start_time
|
|
91
|
+
safe_log(:info, "engine.skip_empty", {
|
|
92
|
+
table: @table_name,
|
|
93
|
+
duration_s: duration.round(2),
|
|
94
|
+
db_query_duration_s: @durations.fetch(:db_query, 0).round(2)
|
|
95
|
+
})
|
|
96
|
+
true
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
# @api private
|
|
100
|
+
def step_export
|
|
101
|
+
safe_log(:info, "engine.export_start", { table: @table_name, count: @pg_count })
|
|
102
|
+
timed(:export) { export_to_parquet }
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
# @api private
|
|
106
|
+
def step_verify
|
|
107
|
+
timed(:integrity) { verify_integrity }
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# @api private
|
|
111
|
+
def step_purge
|
|
112
|
+
timed(:purge) { purge_from_postgres }
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
# @api private
|
|
116
|
+
def log_complete(start_time)
|
|
117
|
+
duration = monotonic - start_time
|
|
118
|
+
safe_log(:info, "engine.complete", {
|
|
119
|
+
table: @table_name,
|
|
120
|
+
duration_s: duration.round(2),
|
|
121
|
+
db_query_duration_s: @durations.fetch(:db_query, 0).round(2),
|
|
122
|
+
export_duration_s: @durations.fetch(:export, 0).round(2),
|
|
123
|
+
integrity_duration_s: @durations.fetch(:integrity, 0).round(2),
|
|
124
|
+
purge_duration_s: @durations.fetch(:purge, 0).round(2),
|
|
125
|
+
count: @pg_count
|
|
126
|
+
})
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# @api private
|
|
130
|
+
def integrity_failed(start_time)
|
|
131
|
+
duration = monotonic - start_time
|
|
132
|
+
safe_log(:error, "engine.integrity_error", {
|
|
133
|
+
table: @table_name,
|
|
134
|
+
duration_s: duration.round(2),
|
|
135
|
+
count: @pg_count
|
|
136
|
+
})
|
|
137
|
+
false
|
|
138
|
+
end
|
|
117
139
|
|
|
118
140
|
# @api private
|
|
119
141
|
# @return [String]
|
|
@@ -137,7 +159,7 @@ module DataDrain
|
|
|
137
159
|
# @api private
|
|
138
160
|
# @return [Integer]
|
|
139
161
|
def get_postgres_count
|
|
140
|
-
pg_sql = "SELECT
|
|
162
|
+
pg_sql = "SELECT count() AS row_count FROM public.#{@table_name} WHERE #{base_where_sql}"
|
|
141
163
|
pg_sql = pg_sql.gsub("'", "''")
|
|
142
164
|
query = "SELECT row_count FROM postgres_query('pg_source', '#{pg_sql}')"
|
|
143
165
|
@duckdb.query(query).first.first
|
|
@@ -182,7 +204,7 @@ module DataDrain
|
|
|
182
204
|
|
|
183
205
|
begin
|
|
184
206
|
query = <<~SQL
|
|
185
|
-
SELECT
|
|
207
|
+
SELECT count()
|
|
186
208
|
FROM read_parquet('#{archive_path}')
|
|
187
209
|
WHERE #{base_where_sql}
|
|
188
210
|
SQL
|
|
@@ -213,40 +235,129 @@ module DataDrain
|
|
|
213
235
|
conn.exec("SET idle_in_transaction_session_timeout = #{@config.idle_in_transaction_session_timeout};")
|
|
214
236
|
end
|
|
215
237
|
|
|
238
|
+
total_deleted = purge_loop(conn)
|
|
239
|
+
|
|
240
|
+
vacuum_if_needed(conn, total_deleted)
|
|
241
|
+
ensure
|
|
242
|
+
conn&.close
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
# @api private
|
|
246
|
+
def vacuum_if_needed(conn, total_deleted)
|
|
247
|
+
return unless @config.vacuum_after_purge
|
|
248
|
+
return if total_deleted.zero?
|
|
249
|
+
|
|
250
|
+
vacuum_start = monotonic
|
|
251
|
+
dead_before = fetch_dead_tuple_count(conn)
|
|
252
|
+
|
|
253
|
+
begin
|
|
254
|
+
conn.exec("VACUUM ANALYZE #{@table_name};")
|
|
255
|
+
rescue PG::Error => e
|
|
256
|
+
safe_log(:warn, "engine.vacuum_error", {
|
|
257
|
+
table: @table_name,
|
|
258
|
+
dead_tuples_before: dead_before,
|
|
259
|
+
rows_deleted_count: total_deleted,
|
|
260
|
+
duration_s: (monotonic - vacuum_start).round(2)
|
|
261
|
+
}.merge(exception_metadata(e)))
|
|
262
|
+
return
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
dead_after = fetch_dead_tuple_count(conn)
|
|
266
|
+
vacuum_duration = monotonic - vacuum_start
|
|
267
|
+
|
|
268
|
+
safe_log(:info, "engine.vacuum_complete", {
|
|
269
|
+
table: @table_name,
|
|
270
|
+
duration_s: vacuum_duration.round(2),
|
|
271
|
+
dead_tuples_before: dead_before,
|
|
272
|
+
dead_tuples_after: dead_after,
|
|
273
|
+
rows_deleted_count: total_deleted
|
|
274
|
+
})
|
|
275
|
+
end
|
|
276
|
+
|
|
277
|
+
# @api private
|
|
278
|
+
def fetch_dead_tuple_count(conn)
|
|
279
|
+
result = conn.exec_params(
|
|
280
|
+
"SELECT n_dead_tup FROM pg_stat_user_tables WHERE relname = $1",
|
|
281
|
+
[@table_name]
|
|
282
|
+
)
|
|
283
|
+
result.first&.dig("n_dead_tup")&.to_i || 0
|
|
284
|
+
rescue PG::Error
|
|
285
|
+
-1
|
|
286
|
+
end
|
|
287
|
+
|
|
288
|
+
# @api private
|
|
289
|
+
# @param conn [PG::Connection]
|
|
290
|
+
# @return [Integer] total de filas borradas
|
|
291
|
+
def purge_loop(conn)
|
|
216
292
|
batches_processed = 0
|
|
217
293
|
total_deleted = 0
|
|
294
|
+
slow_batch_streak = 0
|
|
218
295
|
|
|
219
296
|
loop do
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
SELECT #{@primary_key} FROM #{@table_name}
|
|
224
|
-
WHERE #{base_where_sql}
|
|
225
|
-
LIMIT #{@config.batch_size}
|
|
226
|
-
)
|
|
227
|
-
SQL
|
|
228
|
-
|
|
229
|
-
result = conn.exec(sql)
|
|
297
|
+
batch_start = monotonic
|
|
298
|
+
result = conn.exec(build_delete_sql)
|
|
299
|
+
batch_duration = monotonic - batch_start
|
|
230
300
|
count = result.cmd_tuples
|
|
231
301
|
break if count.zero?
|
|
232
302
|
|
|
233
303
|
batches_processed += 1
|
|
234
304
|
total_deleted += count
|
|
235
305
|
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
306
|
+
slow_batch_streak = handle_batch_timing(batch_duration, count, slow_batch_streak)
|
|
307
|
+
emit_heartbeat_if_due(batches_processed, total_deleted)
|
|
308
|
+
|
|
309
|
+
sleep(@config.throttle_delay) if @config.throttle_delay.positive?
|
|
310
|
+
end
|
|
311
|
+
|
|
312
|
+
total_deleted
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
# @api private
|
|
316
|
+
def handle_batch_timing(batch_duration, count, streak)
|
|
317
|
+
if batch_duration > @config.slow_batch_threshold_s
|
|
318
|
+
streak += 1
|
|
319
|
+
safe_log(:warn, "engine.slow_batch", {
|
|
320
|
+
table: @table_name,
|
|
321
|
+
batch_duration_s: batch_duration.round(2),
|
|
322
|
+
batch_size: count,
|
|
323
|
+
streak: streak,
|
|
324
|
+
threshold_s: @config.slow_batch_threshold_s
|
|
325
|
+
})
|
|
326
|
+
|
|
327
|
+
if streak == @config.slow_batch_alert_after
|
|
328
|
+
safe_log(:warn, "engine.purge_degraded", {
|
|
239
329
|
table: @table_name,
|
|
240
|
-
|
|
241
|
-
|
|
330
|
+
consecutive_slow_batches: streak,
|
|
331
|
+
hint: "considerar índice composite o particionamiento (ver postgres-tuning.md)"
|
|
242
332
|
})
|
|
243
333
|
end
|
|
244
|
-
|
|
245
|
-
|
|
334
|
+
streak
|
|
335
|
+
else
|
|
336
|
+
0
|
|
246
337
|
end
|
|
247
|
-
|
|
248
|
-
|
|
338
|
+
end
|
|
339
|
+
|
|
340
|
+
# @api private
|
|
341
|
+
def emit_heartbeat_if_due(batches_processed, total_deleted)
|
|
342
|
+
return unless (batches_processed % 100).zero?
|
|
343
|
+
|
|
344
|
+
safe_log(:info, "engine.purge_heartbeat", {
|
|
345
|
+
table: @table_name,
|
|
346
|
+
batches_processed_count: batches_processed,
|
|
347
|
+
rows_deleted_count: total_deleted
|
|
348
|
+
})
|
|
349
|
+
end
|
|
350
|
+
|
|
351
|
+
# @api private
|
|
352
|
+
def build_delete_sql
|
|
353
|
+
<<~SQL
|
|
354
|
+
DELETE FROM #{@table_name}
|
|
355
|
+
WHERE #{@primary_key} IN (
|
|
356
|
+
SELECT #{@primary_key} FROM #{@table_name}
|
|
357
|
+
WHERE #{base_where_sql}
|
|
358
|
+
LIMIT #{@config.batch_size}
|
|
359
|
+
)
|
|
360
|
+
SQL
|
|
249
361
|
end
|
|
250
362
|
end
|
|
251
|
-
# rubocop:enable Metrics/ClassLength, Metrics/AbcSize, Metrics/MethodLength, Naming/AccessorMethodName
|
|
252
363
|
end
|
|
@@ -6,8 +6,7 @@ module DataDrain
|
|
|
6
6
|
# aplicando compresión ZSTD y particionamiento Hive.
|
|
7
7
|
class FileIngestor
|
|
8
8
|
include Observability
|
|
9
|
-
|
|
10
|
-
# Metrics/MethodLength
|
|
9
|
+
include Observability::Timing
|
|
11
10
|
|
|
12
11
|
# @param options [Hash] Opciones de ingestión.
|
|
13
12
|
# @option options [String] :source_path Ruta absoluta al archivo local.
|
|
@@ -36,46 +35,77 @@ module DataDrain
|
|
|
36
35
|
# Ejecuta el flujo de ingestión.
|
|
37
36
|
# @return [Boolean] true si el proceso fue exitoso.
|
|
38
37
|
def call
|
|
39
|
-
|
|
38
|
+
@durations = {}
|
|
39
|
+
start_time = monotonic
|
|
40
40
|
safe_log(:info, "file_ingestor.start", { source_path: @source_path })
|
|
41
41
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
42
|
+
return file_not_found(start_time) unless step_validate_file
|
|
43
|
+
|
|
44
|
+
step_setup_duckdb
|
|
45
|
+
@reader_function = determine_reader
|
|
46
|
+
@source_count = step_count_source
|
|
47
|
+
|
|
48
|
+
return skip_empty(start_time) if @source_count.zero?
|
|
49
|
+
|
|
50
|
+
step_export
|
|
51
|
+
log_complete(start_time)
|
|
52
|
+
cleanup_local_file
|
|
53
|
+
true
|
|
54
|
+
rescue DuckDB::Error => e
|
|
55
|
+
duration = monotonic - start_time
|
|
56
|
+
safe_log(:error, "file_ingestor.duckdb_error",
|
|
57
|
+
{ source_path: @source_path }.merge(exception_metadata(e)).merge(duration_s: duration.round(2)))
|
|
58
|
+
false
|
|
59
|
+
ensure
|
|
60
|
+
@duckdb&.close
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
private
|
|
64
|
+
|
|
65
|
+
# @api private
|
|
66
|
+
def file_not_found(_start_time)
|
|
67
|
+
safe_log(:error, "file_ingestor.file_not_found", { source_path: @source_path })
|
|
68
|
+
false
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# @api private
|
|
72
|
+
def step_validate_file
|
|
73
|
+
File.exist?(@source_path)
|
|
74
|
+
end
|
|
46
75
|
|
|
76
|
+
# @api private
|
|
77
|
+
def step_setup_duckdb
|
|
47
78
|
@duckdb.query("SET max_memory='#{@config.limit_ram}';") if @config.limit_ram.present?
|
|
48
79
|
@duckdb.query("SET temp_directory='#{@config.tmp_directory}'") if @config.tmp_directory.present?
|
|
49
|
-
|
|
50
80
|
@adapter.setup_duckdb(@duckdb)
|
|
81
|
+
end
|
|
51
82
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
# 1. Conteo de seguridad
|
|
56
|
-
step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
57
|
-
source_count = @duckdb.query("SELECT COUNT(*) FROM #{reader_function}").first.first
|
|
58
|
-
source_query_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
83
|
+
# @api private
|
|
84
|
+
def step_count_source
|
|
85
|
+
source_count = timed(:source_query) { @duckdb.query("SELECT count() FROM #{@reader_function}").first.first }
|
|
59
86
|
safe_log(:info, "file_ingestor.count", {
|
|
60
87
|
source_path: @source_path,
|
|
61
88
|
count: source_count,
|
|
62
|
-
source_query_duration_s:
|
|
89
|
+
source_query_duration_s: @durations.fetch(:source_query, 0).round(2)
|
|
63
90
|
})
|
|
91
|
+
source_count
|
|
92
|
+
end
|
|
64
93
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
94
|
+
# @api private
|
|
95
|
+
def skip_empty(start_time)
|
|
96
|
+
cleanup_local_file
|
|
97
|
+
duration = monotonic - start_time
|
|
98
|
+
safe_log(:info, "file_ingestor.skip_empty", { source_path: @source_path, duration_s: duration.round(2) })
|
|
99
|
+
true
|
|
100
|
+
end
|
|
71
101
|
|
|
72
|
-
|
|
102
|
+
# @api private
|
|
103
|
+
def step_export
|
|
73
104
|
@adapter.prepare_export_path(@bucket, @folder_name)
|
|
74
105
|
dest_path = if @config.storage_mode.to_sym == :s3
|
|
75
106
|
"s3://#{@bucket}/#{@folder_name}/"
|
|
76
107
|
else
|
|
77
|
-
File.join(@bucket,
|
|
78
|
-
@folder_name, "")
|
|
108
|
+
File.join(@bucket, @folder_name, "")
|
|
79
109
|
end
|
|
80
110
|
|
|
81
111
|
partition_clause = @partition_keys.any? ? "PARTITION_BY (#{@partition_keys.join(", ")})," : ""
|
|
@@ -83,7 +113,7 @@ module DataDrain
|
|
|
83
113
|
query = <<~SQL
|
|
84
114
|
COPY (
|
|
85
115
|
SELECT #{@select_sql}
|
|
86
|
-
FROM #{reader_function}
|
|
116
|
+
FROM #{@reader_function}
|
|
87
117
|
) TO '#{dest_path}'
|
|
88
118
|
(
|
|
89
119
|
FORMAT PARQUET,
|
|
@@ -94,32 +124,21 @@ module DataDrain
|
|
|
94
124
|
SQL
|
|
95
125
|
|
|
96
126
|
safe_log(:info, "file_ingestor.export_start", { dest_path: dest_path })
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
export_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
127
|
+
timed(:export) { @duckdb.query(query) }
|
|
128
|
+
end
|
|
100
129
|
|
|
101
|
-
|
|
130
|
+
# @api private
|
|
131
|
+
def log_complete(start_time)
|
|
132
|
+
duration = monotonic - start_time
|
|
102
133
|
safe_log(:info, "file_ingestor.complete", {
|
|
103
134
|
source_path: @source_path,
|
|
104
135
|
duration_s: duration.round(2),
|
|
105
|
-
source_query_duration_s:
|
|
106
|
-
export_duration_s:
|
|
107
|
-
count: source_count
|
|
136
|
+
source_query_duration_s: @durations.fetch(:source_query, 0).round(2),
|
|
137
|
+
export_duration_s: @durations.fetch(:export, 0).round(2),
|
|
138
|
+
count: @source_count
|
|
108
139
|
})
|
|
109
|
-
|
|
110
|
-
cleanup_local_file
|
|
111
|
-
true
|
|
112
|
-
rescue DuckDB::Error => e
|
|
113
|
-
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
114
|
-
safe_log(:error, "file_ingestor.duckdb_error",
|
|
115
|
-
{ source_path: @source_path }.merge(exception_metadata(e)).merge(duration_s: duration.round(2)))
|
|
116
|
-
false
|
|
117
|
-
ensure
|
|
118
|
-
@duckdb&.close
|
|
119
140
|
end
|
|
120
141
|
|
|
121
|
-
private
|
|
122
|
-
|
|
123
142
|
# @api private
|
|
124
143
|
def determine_reader
|
|
125
144
|
case File.extname(@source_path).downcase
|
|
@@ -142,6 +161,4 @@ module DataDrain
|
|
|
142
161
|
safe_log(:info, "file_ingestor.cleanup", { source_path: @source_path })
|
|
143
162
|
end
|
|
144
163
|
end
|
|
145
|
-
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity,
|
|
146
|
-
# Metrics/MethodLength
|
|
147
164
|
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module DataDrain
|
|
4
|
+
module Observability
|
|
5
|
+
# Helper para medición de duración de operaciones.
|
|
6
|
+
# @api private
|
|
7
|
+
module Timing
|
|
8
|
+
private
|
|
9
|
+
|
|
10
|
+
def monotonic
|
|
11
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def timed(step_name)
|
|
15
|
+
t = monotonic
|
|
16
|
+
result = yield
|
|
17
|
+
@durations ||= {}
|
|
18
|
+
@durations[step_name] = monotonic - t
|
|
19
|
+
result
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -7,6 +7,8 @@ module DataDrain
|
|
|
7
7
|
# Este módulo es genérico y puede ser utilizado en otras gemas.
|
|
8
8
|
# @api private
|
|
9
9
|
module Observability
|
|
10
|
+
# Regex para detectar claves sensibles en logs y enmascararlas preventivamente.
|
|
11
|
+
# @!visibility private
|
|
10
12
|
SENSITIVE_KEY_PATTERN = /password|passwd|pass|secret|token|api_key|apikey|auth|credential|private_key/i
|
|
11
13
|
|
|
12
14
|
private
|
data/lib/data_drain/record.rb
CHANGED
|
@@ -46,7 +46,6 @@ module DataDrain
|
|
|
46
46
|
# Esto previene tener que recargar extensiones (como httpfs) en cada consulta.
|
|
47
47
|
#
|
|
48
48
|
# @return [DuckDB::Connection] Conexión activa a DuckDB.
|
|
49
|
-
# rubocop:disable Metrics/AbcSize
|
|
50
49
|
def self.connection
|
|
51
50
|
Thread.current[:data_drain_duckdb] ||= begin
|
|
52
51
|
db = DuckDB::Database.open(":memory:")
|
|
@@ -57,11 +56,13 @@ module DataDrain
|
|
|
57
56
|
conn.query("SET temp_directory='#{config.tmp_directory}'") if config.tmp_directory.present?
|
|
58
57
|
|
|
59
58
|
DataDrain::Storage.adapter.setup_duckdb(conn)
|
|
59
|
+
|
|
60
|
+
conn.query("SET lock_configuration=true;")
|
|
61
|
+
|
|
60
62
|
{ db: db, conn: conn }
|
|
61
63
|
end
|
|
62
64
|
Thread.current[:data_drain_duckdb][:conn]
|
|
63
65
|
end
|
|
64
|
-
# rubocop:enable Metrics/AbcSize
|
|
65
66
|
|
|
66
67
|
# Consulta registros en el Data Lake filtrando por claves de partición.
|
|
67
68
|
#
|
|
@@ -138,22 +139,14 @@ module DataDrain
|
|
|
138
139
|
# @param sql [String]
|
|
139
140
|
# @param columns [Array<String>]
|
|
140
141
|
# @return [Array<DataDrain::Record>]
|
|
141
|
-
# rubocop:disable Metrics/MethodLength
|
|
142
142
|
def execute_and_instantiate(sql, columns)
|
|
143
143
|
@logger = DataDrain.configuration.logger
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
end
|
|
150
|
-
|
|
151
|
-
result.map do |row|
|
|
152
|
-
attributes_hash = columns.zip(row).to_h
|
|
153
|
-
new(attributes_hash)
|
|
154
|
-
end
|
|
144
|
+
result = connection.query(sql)
|
|
145
|
+
result.map { |row| new(columns.zip(row).to_h) }
|
|
146
|
+
rescue DuckDB::Error => e
|
|
147
|
+
safe_log(:warn, "record.parquet_not_found", exception_metadata(e))
|
|
148
|
+
[]
|
|
155
149
|
end
|
|
156
150
|
end
|
|
157
|
-
# rubocop:enable Metrics/MethodLength
|
|
158
151
|
end
|
|
159
152
|
end
|
|
@@ -54,6 +54,18 @@ module DataDrain
|
|
|
54
54
|
def destroy_partitions(bucket, folder_name, partition_keys, partitions)
|
|
55
55
|
raise NotImplementedError, "#{self.class} debe implementar #destroy_partitions"
|
|
56
56
|
end
|
|
57
|
+
|
|
58
|
+
protected
|
|
59
|
+
|
|
60
|
+
# @param bucket [String]
|
|
61
|
+
# @param folder_name [String]
|
|
62
|
+
# @param partition_path [String, nil]
|
|
63
|
+
# @return [String] path sin prefix de protocolo ni sufijo glob
|
|
64
|
+
def build_path_base(bucket, folder_name, partition_path)
|
|
65
|
+
base = File.join(bucket, folder_name)
|
|
66
|
+
base = File.join(base, partition_path) if partition_path && !partition_path.empty?
|
|
67
|
+
base
|
|
68
|
+
end
|
|
57
69
|
end
|
|
58
70
|
end
|
|
59
71
|
end
|
|
@@ -24,9 +24,7 @@ module DataDrain
|
|
|
24
24
|
# @param partition_path [String, nil]
|
|
25
25
|
# @return [String]
|
|
26
26
|
def build_path(bucket, folder_name, partition_path)
|
|
27
|
-
|
|
28
|
-
base = File.join(base, partition_path) if partition_path && !partition_path.empty?
|
|
29
|
-
"#{base}/**/*.parquet"
|
|
27
|
+
"#{build_path_base(bucket, folder_name, partition_path)}/**/*.parquet"
|
|
30
28
|
end
|
|
31
29
|
|
|
32
30
|
# @param bucket [String]
|