data_drain 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +46 -1
- data/CLAUDE.md +3 -1
- data/README.md +3 -0
- data/docs/IMPROVEMENT_PLAN.md +271 -11
- data/docs/execution/v0.2.2.md +891 -0
- data/lib/data_drain/configuration.rb +55 -5
- data/lib/data_drain/engine.rb +183 -72
- data/lib/data_drain/file_ingestor.rb +65 -47
- data/lib/data_drain/glue_runner.rb +22 -10
- data/lib/data_drain/observability/timing.rb +23 -0
- data/lib/data_drain/observability.rb +4 -2
- data/lib/data_drain/record.rb +10 -16
- data/lib/data_drain/storage/s3.rb +60 -45
- data/lib/data_drain/version.rb +1 -1
- data/lib/data_drain.rb +1 -0
- data/skill/SKILL.md +1 -0
- data/skill/references/antipatrones.md +20 -3
- data/skill/references/api-detallada.md +18 -5
- data/skill/references/eventos-telemetria.md +5 -0
- data/skill/references/postgres-tuning.md +129 -0
- metadata +5 -2
|
@@ -6,10 +6,13 @@ module DataDrain
|
|
|
6
6
|
# Contenedor para todas las opciones de configuración del motor DataDrain.
|
|
7
7
|
class Configuration
|
|
8
8
|
attr_accessor :storage_mode, :aws_region,
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
9
|
+
:aws_access_key_id, :aws_secret_access_key,
|
|
10
|
+
:db_host, :db_port, :db_user, :db_pass, :db_name,
|
|
11
|
+
:batch_size, :throttle_delay, :logger, :limit_ram, :tmp_directory,
|
|
12
|
+
:idle_in_transaction_session_timeout,
|
|
13
|
+
:vacuum_after_purge,
|
|
14
|
+
:slow_batch_threshold_s,
|
|
15
|
+
:slow_batch_alert_after
|
|
13
16
|
|
|
14
17
|
def initialize
|
|
15
18
|
@storage_mode = :local
|
|
@@ -20,12 +23,59 @@ module DataDrain
|
|
|
20
23
|
@limit_ram = nil # eg 2GB
|
|
21
24
|
@tmp_directory = nil # eg /tmp/duckdb_work
|
|
22
25
|
@idle_in_transaction_session_timeout = 0
|
|
23
|
-
@
|
|
26
|
+
@vacuum_after_purge = false
|
|
27
|
+
@slow_batch_threshold_s = 30
|
|
28
|
+
@slow_batch_alert_after = 5
|
|
29
|
+
@logger = Logger.new($stdout)
|
|
24
30
|
end
|
|
25
31
|
|
|
26
32
|
# @return [String] Cadena de conexión optimizada para DuckDB.
|
|
27
33
|
def duckdb_connection_string
|
|
28
34
|
"postgresql://#{@db_user}:#{@db_pass}@#{@db_host}:#{@db_port}/#{@db_name}?options=-c%20idle_in_transaction_session_timeout%3D#{@idle_in_transaction_session_timeout}"
|
|
29
35
|
end
|
|
36
|
+
|
|
37
|
+
# Valida invariantes generales (storage_mode + AWS si aplica).
|
|
38
|
+
# Llamado por FileIngestor#initialize y GlueRunner.run_and_wait.
|
|
39
|
+
#
|
|
40
|
+
# @raise [DataDrain::ConfigurationError]
|
|
41
|
+
def validate!
|
|
42
|
+
validate_storage_mode!
|
|
43
|
+
validate_aws_config! if storage_mode.to_sym == :s3
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
# Valida además las credenciales PostgreSQL.
|
|
47
|
+
# Llamado por Engine#initialize.
|
|
48
|
+
#
|
|
49
|
+
# @raise [DataDrain::ConfigurationError]
|
|
50
|
+
def validate_for_engine!
|
|
51
|
+
validate!
|
|
52
|
+
validate_db_config!
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
private
|
|
56
|
+
|
|
57
|
+
def validate_storage_mode!
|
|
58
|
+
return if %i[local s3].include?(storage_mode.to_sym)
|
|
59
|
+
|
|
60
|
+
raise DataDrain::ConfigurationError,
|
|
61
|
+
"storage_mode debe ser :local o :s3, recibido #{storage_mode.inspect}"
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def validate_aws_config!
|
|
65
|
+
return unless aws_region.nil? || aws_region.to_s.empty?
|
|
66
|
+
|
|
67
|
+
raise DataDrain::ConfigurationError,
|
|
68
|
+
"aws_region es obligatorio con storage_mode = :s3"
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def validate_db_config!
|
|
72
|
+
%i[db_host db_user db_name].each do |attr|
|
|
73
|
+
val = public_send(attr)
|
|
74
|
+
next unless val.nil? || val.to_s.empty?
|
|
75
|
+
|
|
76
|
+
raise DataDrain::ConfigurationError,
|
|
77
|
+
"config.#{attr} es obligatorio para Engine (storage_mode=#{storage_mode})"
|
|
78
|
+
end
|
|
79
|
+
end
|
|
30
80
|
end
|
|
31
81
|
end
|
data/lib/data_drain/engine.rb
CHANGED
|
@@ -5,12 +5,12 @@ require "pg"
|
|
|
5
5
|
|
|
6
6
|
module DataDrain
|
|
7
7
|
# Motor principal de extracción y purga de datos (DataDrain).
|
|
8
|
-
# rubocop:disable Metrics/ClassLength, Metrics/AbcSize, Metrics/MethodLength, Naming/AccessorMethodName
|
|
9
8
|
#
|
|
10
9
|
# Orquesta el flujo ETL desde PostgreSQL hacia un Data Lake analítico
|
|
11
10
|
# delegando la interacción del almacenamiento al adaptador configurado.
|
|
12
11
|
class Engine
|
|
13
12
|
include Observability
|
|
13
|
+
include Observability::Timing
|
|
14
14
|
# Inicializa una nueva instancia del motor de extracción.
|
|
15
15
|
#
|
|
16
16
|
# @param options [Hash] Diccionario de configuración para la extracción.
|
|
@@ -42,6 +42,7 @@ module DataDrain
|
|
|
42
42
|
@skip_export = options.fetch(:skip_export, false)
|
|
43
43
|
|
|
44
44
|
@config = DataDrain.configuration
|
|
45
|
+
@config.validate_for_engine!
|
|
45
46
|
@logger = @config.logger
|
|
46
47
|
@adapter = DataDrain::Storage.adapter
|
|
47
48
|
|
|
@@ -49,70 +50,91 @@ module DataDrain
|
|
|
49
50
|
@duckdb = database.connect
|
|
50
51
|
end
|
|
51
52
|
|
|
52
|
-
# Ejecuta el flujo completo del motor: Setup, Conteo, Exportación (opcional), Verificación y Purga.
|
|
53
|
-
#
|
|
54
|
-
# @return [Boolean] `true` si el proceso finalizó con éxito, `false` si falló la integridad.
|
|
55
53
|
def call
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
54
|
+
@durations = {}
|
|
55
|
+
start_time = monotonic
|
|
56
|
+
log_start
|
|
59
57
|
|
|
60
58
|
setup_duckdb
|
|
59
|
+
return skip_empty(start_time) if step_count.zero?
|
|
61
60
|
|
|
62
|
-
# 1. Conteo inicial en Postgres
|
|
63
|
-
step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
64
|
-
@pg_count = get_postgres_count
|
|
65
|
-
db_query_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
66
|
-
|
|
67
|
-
if @pg_count.zero?
|
|
68
|
-
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
69
|
-
safe_log(:info, "engine.skip_empty",
|
|
70
|
-
{ table: @table_name, duration_s: duration.round(2), db_query_duration_s: db_query_duration.round(2) })
|
|
71
|
-
return true
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
# 2. Exportación
|
|
75
|
-
export_duration = 0.0
|
|
76
61
|
if @skip_export
|
|
77
62
|
safe_log(:info, "engine.skip_export", { table: @table_name })
|
|
78
63
|
else
|
|
79
|
-
|
|
80
|
-
step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
81
|
-
export_to_parquet
|
|
82
|
-
export_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
64
|
+
step_export
|
|
83
65
|
end
|
|
66
|
+
return integrity_failed(start_time) unless step_verify
|
|
84
67
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
68
|
+
step_purge
|
|
69
|
+
log_complete(start_time)
|
|
70
|
+
true
|
|
71
|
+
end
|
|
89
72
|
|
|
90
|
-
|
|
91
|
-
# 4. Purga en Postgres
|
|
92
|
-
step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
93
|
-
purge_from_postgres
|
|
94
|
-
purge_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
73
|
+
private
|
|
95
74
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
db_query_duration_s: db_query_duration.round(2),
|
|
101
|
-
export_duration_s: export_duration.round(2),
|
|
102
|
-
integrity_duration_s: integrity_duration.round(2),
|
|
103
|
-
purge_duration_s: purge_duration.round(2),
|
|
104
|
-
count: @pg_count
|
|
105
|
-
})
|
|
106
|
-
true
|
|
107
|
-
else
|
|
108
|
-
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
109
|
-
safe_log(:error, "engine.integrity_error",
|
|
110
|
-
{ table: @table_name, duration_s: duration.round(2), count: @pg_count })
|
|
111
|
-
false
|
|
112
|
-
end
|
|
75
|
+
# @api private
|
|
76
|
+
def log_start
|
|
77
|
+
safe_log(:info, "engine.start",
|
|
78
|
+
{ table: @table_name, start_date: @start_date.to_date, end_date: @end_date.to_date })
|
|
113
79
|
end
|
|
114
80
|
|
|
115
|
-
private
|
|
81
|
+
# @api private
|
|
82
|
+
def step_count
|
|
83
|
+
@pg_count = timed(:db_query) { get_postgres_count }
|
|
84
|
+
@pg_count
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# @api private
|
|
88
|
+
def skip_empty(start_time)
|
|
89
|
+
duration = monotonic - start_time
|
|
90
|
+
safe_log(:info, "engine.skip_empty", {
|
|
91
|
+
table: @table_name,
|
|
92
|
+
duration_s: duration.round(2),
|
|
93
|
+
db_query_duration_s: @durations.fetch(:db_query, 0).round(2)
|
|
94
|
+
})
|
|
95
|
+
true
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
# @api private
|
|
99
|
+
def step_export
|
|
100
|
+
safe_log(:info, "engine.export_start", { table: @table_name, count: @pg_count })
|
|
101
|
+
timed(:export) { export_to_parquet }
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# @api private
|
|
105
|
+
def step_verify
|
|
106
|
+
timed(:integrity) { verify_integrity }
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
# @api private
|
|
110
|
+
def step_purge
|
|
111
|
+
timed(:purge) { purge_from_postgres }
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
# @api private
|
|
115
|
+
def log_complete(start_time)
|
|
116
|
+
duration = monotonic - start_time
|
|
117
|
+
safe_log(:info, "engine.complete", {
|
|
118
|
+
table: @table_name,
|
|
119
|
+
duration_s: duration.round(2),
|
|
120
|
+
db_query_duration_s: @durations.fetch(:db_query, 0).round(2),
|
|
121
|
+
export_duration_s: @durations.fetch(:export, 0).round(2),
|
|
122
|
+
integrity_duration_s: @durations.fetch(:integrity, 0).round(2),
|
|
123
|
+
purge_duration_s: @durations.fetch(:purge, 0).round(2),
|
|
124
|
+
count: @pg_count
|
|
125
|
+
})
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
# @api private
|
|
129
|
+
def integrity_failed(start_time)
|
|
130
|
+
duration = monotonic - start_time
|
|
131
|
+
safe_log(:error, "engine.integrity_error", {
|
|
132
|
+
table: @table_name,
|
|
133
|
+
duration_s: duration.round(2),
|
|
134
|
+
count: @pg_count
|
|
135
|
+
})
|
|
136
|
+
false
|
|
137
|
+
end
|
|
116
138
|
|
|
117
139
|
# @api private
|
|
118
140
|
# @return [String]
|
|
@@ -212,40 +234,129 @@ module DataDrain
|
|
|
212
234
|
conn.exec("SET idle_in_transaction_session_timeout = #{@config.idle_in_transaction_session_timeout};")
|
|
213
235
|
end
|
|
214
236
|
|
|
237
|
+
total_deleted = purge_loop(conn)
|
|
238
|
+
|
|
239
|
+
vacuum_if_needed(conn, total_deleted)
|
|
240
|
+
ensure
|
|
241
|
+
conn&.close
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
# @api private
|
|
245
|
+
def vacuum_if_needed(conn, total_deleted)
|
|
246
|
+
return unless @config.vacuum_after_purge
|
|
247
|
+
return if total_deleted.zero?
|
|
248
|
+
|
|
249
|
+
vacuum_start = monotonic
|
|
250
|
+
dead_before = fetch_dead_tuple_count(conn)
|
|
251
|
+
|
|
252
|
+
begin
|
|
253
|
+
conn.exec("VACUUM ANALYZE #{@table_name};")
|
|
254
|
+
rescue PG::Error => e
|
|
255
|
+
safe_log(:warn, "engine.vacuum_error", {
|
|
256
|
+
table: @table_name,
|
|
257
|
+
dead_tuples_before: dead_before,
|
|
258
|
+
rows_deleted_count: total_deleted,
|
|
259
|
+
duration_s: (monotonic - vacuum_start).round(2)
|
|
260
|
+
}.merge(exception_metadata(e)))
|
|
261
|
+
return
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
dead_after = fetch_dead_tuple_count(conn)
|
|
265
|
+
vacuum_duration = monotonic - vacuum_start
|
|
266
|
+
|
|
267
|
+
safe_log(:info, "engine.vacuum_complete", {
|
|
268
|
+
table: @table_name,
|
|
269
|
+
duration_s: vacuum_duration.round(2),
|
|
270
|
+
dead_tuples_before: dead_before,
|
|
271
|
+
dead_tuples_after: dead_after,
|
|
272
|
+
rows_deleted_count: total_deleted
|
|
273
|
+
})
|
|
274
|
+
end
|
|
275
|
+
|
|
276
|
+
# @api private
|
|
277
|
+
def fetch_dead_tuple_count(conn)
|
|
278
|
+
result = conn.exec_params(
|
|
279
|
+
"SELECT n_dead_tup FROM pg_stat_user_tables WHERE relname = $1",
|
|
280
|
+
[@table_name]
|
|
281
|
+
)
|
|
282
|
+
result.first&.dig("n_dead_tup")&.to_i || 0
|
|
283
|
+
rescue PG::Error
|
|
284
|
+
-1
|
|
285
|
+
end
|
|
286
|
+
|
|
287
|
+
# @api private
|
|
288
|
+
# @param conn [PG::Connection]
|
|
289
|
+
# @return [Integer] total de filas borradas
|
|
290
|
+
def purge_loop(conn)
|
|
215
291
|
batches_processed = 0
|
|
216
292
|
total_deleted = 0
|
|
293
|
+
slow_batch_streak = 0
|
|
217
294
|
|
|
218
295
|
loop do
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
SELECT #{@primary_key} FROM #{@table_name}
|
|
223
|
-
WHERE #{base_where_sql}
|
|
224
|
-
LIMIT #{@config.batch_size}
|
|
225
|
-
)
|
|
226
|
-
SQL
|
|
227
|
-
|
|
228
|
-
result = conn.exec(sql)
|
|
296
|
+
batch_start = monotonic
|
|
297
|
+
result = conn.exec(build_delete_sql)
|
|
298
|
+
batch_duration = monotonic - batch_start
|
|
229
299
|
count = result.cmd_tuples
|
|
230
300
|
break if count.zero?
|
|
231
301
|
|
|
232
302
|
batches_processed += 1
|
|
233
303
|
total_deleted += count
|
|
234
304
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
305
|
+
slow_batch_streak = handle_batch_timing(batch_duration, count, slow_batch_streak)
|
|
306
|
+
emit_heartbeat_if_due(batches_processed, total_deleted)
|
|
307
|
+
|
|
308
|
+
sleep(@config.throttle_delay) if @config.throttle_delay.positive?
|
|
309
|
+
end
|
|
310
|
+
|
|
311
|
+
total_deleted
|
|
312
|
+
end
|
|
313
|
+
|
|
314
|
+
# @api private
|
|
315
|
+
def handle_batch_timing(batch_duration, count, streak)
|
|
316
|
+
if batch_duration > @config.slow_batch_threshold_s
|
|
317
|
+
streak += 1
|
|
318
|
+
safe_log(:warn, "engine.slow_batch", {
|
|
319
|
+
table: @table_name,
|
|
320
|
+
batch_duration_s: batch_duration.round(2),
|
|
321
|
+
batch_size: count,
|
|
322
|
+
streak: streak,
|
|
323
|
+
threshold_s: @config.slow_batch_threshold_s
|
|
324
|
+
})
|
|
325
|
+
|
|
326
|
+
if streak == @config.slow_batch_alert_after
|
|
327
|
+
safe_log(:warn, "engine.purge_degraded", {
|
|
238
328
|
table: @table_name,
|
|
239
|
-
|
|
240
|
-
|
|
329
|
+
consecutive_slow_batches: streak,
|
|
330
|
+
hint: "considerar índice composite o particionamiento (ver postgres-tuning.md)"
|
|
241
331
|
})
|
|
242
332
|
end
|
|
243
|
-
|
|
244
|
-
|
|
333
|
+
streak
|
|
334
|
+
else
|
|
335
|
+
0
|
|
245
336
|
end
|
|
246
|
-
|
|
247
|
-
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
# @api private
|
|
340
|
+
def emit_heartbeat_if_due(batches_processed, total_deleted)
|
|
341
|
+
return unless (batches_processed % 100).zero?
|
|
342
|
+
|
|
343
|
+
safe_log(:info, "engine.purge_heartbeat", {
|
|
344
|
+
table: @table_name,
|
|
345
|
+
batches_processed_count: batches_processed,
|
|
346
|
+
rows_deleted_count: total_deleted
|
|
347
|
+
})
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
# @api private
|
|
351
|
+
def build_delete_sql
|
|
352
|
+
<<~SQL
|
|
353
|
+
DELETE FROM #{@table_name}
|
|
354
|
+
WHERE #{@primary_key} IN (
|
|
355
|
+
SELECT #{@primary_key} FROM #{@table_name}
|
|
356
|
+
WHERE #{base_where_sql}
|
|
357
|
+
LIMIT #{@config.batch_size}
|
|
358
|
+
)
|
|
359
|
+
SQL
|
|
248
360
|
end
|
|
249
361
|
end
|
|
250
|
-
# rubocop:enable Metrics/ClassLength, Metrics/AbcSize, Metrics/MethodLength, Naming/AccessorMethodName
|
|
251
362
|
end
|
|
@@ -6,8 +6,7 @@ module DataDrain
|
|
|
6
6
|
# aplicando compresión ZSTD y particionamiento Hive.
|
|
7
7
|
class FileIngestor
|
|
8
8
|
include Observability
|
|
9
|
-
|
|
10
|
-
# Metrics/MethodLength
|
|
9
|
+
include Observability::Timing
|
|
11
10
|
|
|
12
11
|
# @param options [Hash] Opciones de ingestión.
|
|
13
12
|
# @option options [String] :source_path Ruta absoluta al archivo local.
|
|
@@ -25,6 +24,7 @@ module DataDrain
|
|
|
25
24
|
@bucket = options[:bucket]
|
|
26
25
|
|
|
27
26
|
@config = DataDrain.configuration
|
|
27
|
+
@config.validate!
|
|
28
28
|
@logger = @config.logger
|
|
29
29
|
@adapter = DataDrain::Storage.adapter
|
|
30
30
|
|
|
@@ -35,46 +35,77 @@ module DataDrain
|
|
|
35
35
|
# Ejecuta el flujo de ingestión.
|
|
36
36
|
# @return [Boolean] true si el proceso fue exitoso.
|
|
37
37
|
def call
|
|
38
|
-
|
|
38
|
+
@durations = {}
|
|
39
|
+
start_time = monotonic
|
|
39
40
|
safe_log(:info, "file_ingestor.start", { source_path: @source_path })
|
|
40
41
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
42
|
+
return file_not_found(start_time) unless step_validate_file
|
|
43
|
+
|
|
44
|
+
step_setup_duckdb
|
|
45
|
+
@reader_function = determine_reader
|
|
46
|
+
@source_count = step_count_source
|
|
47
|
+
|
|
48
|
+
return skip_empty(start_time) if @source_count.zero?
|
|
49
|
+
|
|
50
|
+
step_export
|
|
51
|
+
log_complete(start_time)
|
|
52
|
+
cleanup_local_file
|
|
53
|
+
true
|
|
54
|
+
rescue DuckDB::Error => e
|
|
55
|
+
duration = monotonic - start_time
|
|
56
|
+
safe_log(:error, "file_ingestor.duckdb_error",
|
|
57
|
+
{ source_path: @source_path }.merge(exception_metadata(e)).merge(duration_s: duration.round(2)))
|
|
58
|
+
false
|
|
59
|
+
ensure
|
|
60
|
+
@duckdb&.close
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
private
|
|
64
|
+
|
|
65
|
+
# @api private
|
|
66
|
+
def file_not_found(_start_time)
|
|
67
|
+
safe_log(:error, "file_ingestor.file_not_found", { source_path: @source_path })
|
|
68
|
+
false
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# @api private
|
|
72
|
+
def step_validate_file
|
|
73
|
+
File.exist?(@source_path)
|
|
74
|
+
end
|
|
45
75
|
|
|
76
|
+
# @api private
|
|
77
|
+
def step_setup_duckdb
|
|
46
78
|
@duckdb.query("SET max_memory='#{@config.limit_ram}';") if @config.limit_ram.present?
|
|
47
79
|
@duckdb.query("SET temp_directory='#{@config.tmp_directory}'") if @config.tmp_directory.present?
|
|
48
|
-
|
|
49
80
|
@adapter.setup_duckdb(@duckdb)
|
|
81
|
+
end
|
|
50
82
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
# 1. Conteo de seguridad
|
|
55
|
-
step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
56
|
-
source_count = @duckdb.query("SELECT COUNT(*) FROM #{reader_function}").first.first
|
|
57
|
-
source_query_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
83
|
+
# @api private
|
|
84
|
+
def step_count_source
|
|
85
|
+
source_count = timed(:source_query) { @duckdb.query("SELECT COUNT(*) FROM #{@reader_function}").first.first }
|
|
58
86
|
safe_log(:info, "file_ingestor.count", {
|
|
59
87
|
source_path: @source_path,
|
|
60
88
|
count: source_count,
|
|
61
|
-
source_query_duration_s:
|
|
89
|
+
source_query_duration_s: @durations.fetch(:source_query, 0).round(2)
|
|
62
90
|
})
|
|
91
|
+
source_count
|
|
92
|
+
end
|
|
63
93
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
94
|
+
# @api private
|
|
95
|
+
def skip_empty(start_time)
|
|
96
|
+
cleanup_local_file
|
|
97
|
+
duration = monotonic - start_time
|
|
98
|
+
safe_log(:info, "file_ingestor.skip_empty", { source_path: @source_path, duration_s: duration.round(2) })
|
|
99
|
+
true
|
|
100
|
+
end
|
|
70
101
|
|
|
71
|
-
|
|
102
|
+
# @api private
|
|
103
|
+
def step_export
|
|
72
104
|
@adapter.prepare_export_path(@bucket, @folder_name)
|
|
73
105
|
dest_path = if @config.storage_mode.to_sym == :s3
|
|
74
106
|
"s3://#{@bucket}/#{@folder_name}/"
|
|
75
107
|
else
|
|
76
|
-
File.join(@bucket,
|
|
77
|
-
@folder_name, "")
|
|
108
|
+
File.join(@bucket, @folder_name, "")
|
|
78
109
|
end
|
|
79
110
|
|
|
80
111
|
partition_clause = @partition_keys.any? ? "PARTITION_BY (#{@partition_keys.join(", ")})," : ""
|
|
@@ -82,7 +113,7 @@ module DataDrain
|
|
|
82
113
|
query = <<~SQL
|
|
83
114
|
COPY (
|
|
84
115
|
SELECT #{@select_sql}
|
|
85
|
-
FROM #{reader_function}
|
|
116
|
+
FROM #{@reader_function}
|
|
86
117
|
) TO '#{dest_path}'
|
|
87
118
|
(
|
|
88
119
|
FORMAT PARQUET,
|
|
@@ -93,32 +124,21 @@ module DataDrain
|
|
|
93
124
|
SQL
|
|
94
125
|
|
|
95
126
|
safe_log(:info, "file_ingestor.export_start", { dest_path: dest_path })
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
export_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
127
|
+
timed(:export) { @duckdb.query(query) }
|
|
128
|
+
end
|
|
99
129
|
|
|
100
|
-
|
|
130
|
+
# @api private
|
|
131
|
+
def log_complete(start_time)
|
|
132
|
+
duration = monotonic - start_time
|
|
101
133
|
safe_log(:info, "file_ingestor.complete", {
|
|
102
134
|
source_path: @source_path,
|
|
103
135
|
duration_s: duration.round(2),
|
|
104
|
-
source_query_duration_s:
|
|
105
|
-
export_duration_s:
|
|
106
|
-
count: source_count
|
|
136
|
+
source_query_duration_s: @durations.fetch(:source_query, 0).round(2),
|
|
137
|
+
export_duration_s: @durations.fetch(:export, 0).round(2),
|
|
138
|
+
count: @source_count
|
|
107
139
|
})
|
|
108
|
-
|
|
109
|
-
cleanup_local_file
|
|
110
|
-
true
|
|
111
|
-
rescue DuckDB::Error => e
|
|
112
|
-
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
113
|
-
safe_log(:error, "file_ingestor.duckdb_error",
|
|
114
|
-
{ source_path: @source_path }.merge(exception_metadata(e)).merge(duration_s: duration.round(2)))
|
|
115
|
-
false
|
|
116
|
-
ensure
|
|
117
|
-
@duckdb&.close
|
|
118
140
|
end
|
|
119
141
|
|
|
120
|
-
private
|
|
121
|
-
|
|
122
142
|
# @api private
|
|
123
143
|
def determine_reader
|
|
124
144
|
case File.extname(@source_path).downcase
|
|
@@ -141,6 +161,4 @@ module DataDrain
|
|
|
141
161
|
safe_log(:info, "file_ingestor.cleanup", { source_path: @source_path })
|
|
142
162
|
end
|
|
143
163
|
end
|
|
144
|
-
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity,
|
|
145
|
-
# Metrics/MethodLength
|
|
146
164
|
end
|
|
@@ -14,22 +14,35 @@ module DataDrain
|
|
|
14
14
|
# @param job_name [String] Nombre del Job en la consola de AWS.
|
|
15
15
|
# @param arguments [Hash] Argumentos de ejecución (deben empezar con --).
|
|
16
16
|
# @param polling_interval [Integer] Segundos de espera entre cada chequeo de estado.
|
|
17
|
+
# @param max_wait_seconds [Integer, nil] Timeout máximo en segundos.
|
|
18
|
+
# nil = sin límite (comportamiento anterior).
|
|
17
19
|
# @return [Boolean] true si el Job terminó exitosamente (SUCCEEDED).
|
|
18
|
-
# @raise [
|
|
19
|
-
|
|
20
|
+
# @raise [DataDrain::Error] si max_wait_seconds excede antes de SUCCEEDED.
|
|
21
|
+
# @raise [RuntimeError] si el Job falla o se detiene.
|
|
22
|
+
def self.run_and_wait(job_name, arguments = {}, polling_interval: 30, max_wait_seconds: nil)
|
|
20
23
|
config = DataDrain.configuration
|
|
24
|
+
config.validate!
|
|
21
25
|
client = Aws::Glue::Client.new(region: config.aws_region)
|
|
22
26
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
23
27
|
|
|
24
|
-
|
|
25
|
-
# Pero como extendemos Observability, usamos safe_log directamente.
|
|
26
|
-
@logger = config.logger
|
|
28
|
+
@logger = config.logger
|
|
27
29
|
|
|
28
30
|
safe_log(:info, "glue_runner.start", { job: job_name })
|
|
29
31
|
resp = client.start_job_run(job_name: job_name, arguments: arguments)
|
|
30
32
|
run_id = resp.job_run_id
|
|
31
33
|
|
|
32
34
|
loop do
|
|
35
|
+
if max_wait_seconds &&
|
|
36
|
+
(Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) > max_wait_seconds
|
|
37
|
+
safe_log(:error, "glue_runner.timeout", {
|
|
38
|
+
job: job_name,
|
|
39
|
+
run_id: run_id,
|
|
40
|
+
max_wait_seconds: max_wait_seconds
|
|
41
|
+
})
|
|
42
|
+
raise DataDrain::Error,
|
|
43
|
+
"Glue Job #{job_name} (Run ID: #{run_id}) excedió max_wait_seconds=#{max_wait_seconds}"
|
|
44
|
+
end
|
|
45
|
+
|
|
33
46
|
run_info = client.get_job_run(job_name: job_name, run_id: run_id).job_run
|
|
34
47
|
status = run_info.job_run_state
|
|
35
48
|
|
|
@@ -41,15 +54,14 @@ module DataDrain
|
|
|
41
54
|
when "FAILED", "STOPPED", "TIMEOUT"
|
|
42
55
|
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
43
56
|
error_metadata = { job: job_name, run_id: run_id, status: status, duration_s: duration.round(2) }
|
|
44
|
-
|
|
45
|
-
if run_info.error_message
|
|
46
|
-
error_metadata[:error_message] = run_info.error_message.gsub("\"", "'")[0, 200]
|
|
47
|
-
end
|
|
57
|
+
|
|
58
|
+
error_metadata[:error_message] = run_info.error_message.gsub("\"", "'")[0, 200] if run_info.error_message
|
|
48
59
|
|
|
49
60
|
safe_log(:error, "glue_runner.failed", error_metadata)
|
|
50
61
|
raise "Glue Job #{job_name} (Run ID: #{run_id}) falló con estado #{status}."
|
|
51
62
|
else
|
|
52
|
-
safe_log(:info, "glue_runner.polling",
|
|
63
|
+
safe_log(:info, "glue_runner.polling",
|
|
64
|
+
{ job: job_name, run_id: run_id, status: status, next_check_in_s: polling_interval })
|
|
53
65
|
sleep polling_interval
|
|
54
66
|
end
|
|
55
67
|
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module DataDrain
|
|
4
|
+
module Observability
|
|
5
|
+
# Helper para medición de duración de operaciones.
|
|
6
|
+
# @api private
|
|
7
|
+
module Timing
|
|
8
|
+
private
|
|
9
|
+
|
|
10
|
+
def monotonic
|
|
11
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def timed(step_name)
|
|
15
|
+
t = monotonic
|
|
16
|
+
result = yield
|
|
17
|
+
@durations ||= {}
|
|
18
|
+
@durations[step_name] = monotonic - t
|
|
19
|
+
result
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module DataDrain
|
|
4
|
-
# Módulo interno para garantizar que la telemetría cumpla con los
|
|
4
|
+
# Módulo interno para garantizar que la telemetría cumpla con los
|
|
5
5
|
# Global-Observability-Standards: resiliencia, KV-structured y precisión.
|
|
6
6
|
#
|
|
7
7
|
# Este módulo es genérico y puede ser utilizado en otras gemas.
|
|
8
8
|
# @api private
|
|
9
9
|
module Observability
|
|
10
|
+
SENSITIVE_KEY_PATTERN = /password|passwd|pass|secret|token|api_key|apikey|auth|credential|private_key/i
|
|
11
|
+
|
|
10
12
|
private
|
|
11
13
|
|
|
12
14
|
# Emite un log estructurado de forma segura.
|
|
@@ -19,7 +21,7 @@ module DataDrain
|
|
|
19
21
|
|
|
20
22
|
# Enmascaramiento preventivo de secretos (Security)
|
|
21
23
|
log_line = fields.map do |k, v|
|
|
22
|
-
val =
|
|
24
|
+
val = SENSITIVE_KEY_PATTERN.match?(k.to_s) ? "[FILTERED]" : v
|
|
23
25
|
"#{k}=#{val}"
|
|
24
26
|
end.join(" ")
|
|
25
27
|
|