data_drain 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -0
- data/CHANGELOG.md +34 -0
- data/CLAUDE.md +3 -1
- data/README.md +3 -2
- data/docs/IMPROVEMENT_PLAN.md +1417 -0
- data/docs/execution/archive/v0.2.0.agente-review.md +125 -0
- data/docs/execution/archive/v0.2.0.md +812 -0
- data/docs/execution/v0.2.2.md +891 -0
- data/docs/glue_pyspark_example.py +60 -0
- data/lib/data_drain/configuration.rb +49 -5
- data/lib/data_drain/engine.rb +1 -0
- data/lib/data_drain/file_ingestor.rb +1 -0
- data/lib/data_drain/glue_runner.rb +22 -10
- data/lib/data_drain/observability.rb +4 -2
- data/lib/data_drain/record.rb +2 -1
- data/lib/data_drain/storage/s3.rb +33 -37
- data/lib/data_drain/version.rb +1 -1
- data/skill/SKILL.md +1 -0
- data/skill/references/antipatrones.md +21 -4
- data/skill/references/api-detallada.md +18 -5
- data/skill/references/eventos-telemetria.md +5 -0
- data/skill/references/postgres-tuning.md +129 -0
- metadata +7 -1
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Script de AWS Glue (PySpark) compatible con DataDrain::GlueRunner.
|
|
3
|
+
|
|
4
|
+
Crear el Job en la consola de AWS Glue (Spark 4.0+) y usar este script como base.
|
|
5
|
+
Argumentos requeridos: JOB_NAME, start_date, end_date, s3_bucket, s3_folder,
|
|
6
|
+
db_url, db_user, db_password, db_table, partition_by.
|
|
7
|
+
|
|
8
|
+
Personalizar la sección de columnas derivadas según las partition_keys de cada tabla.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import sys
|
|
12
|
+
from awsglue.utils import getResolvedOptions
|
|
13
|
+
from pyspark.context import SparkContext
|
|
14
|
+
from awsglue.context import GlueContext
|
|
15
|
+
from awsglue.job import Job
|
|
16
|
+
from pyspark.sql.functions import col, year, month
|
|
17
|
+
|
|
18
|
+
args = getResolvedOptions(sys.argv, [
|
|
19
|
+
'JOB_NAME', 'start_date', 'end_date', 's3_bucket', 's3_folder',
|
|
20
|
+
'db_url', 'db_user', 'db_password', 'db_table', 'partition_by'
|
|
21
|
+
])
|
|
22
|
+
|
|
23
|
+
sc = SparkContext()
|
|
24
|
+
glueContext = GlueContext(sc)
|
|
25
|
+
spark = glueContext.spark_session
|
|
26
|
+
job = Job(glueContext)
|
|
27
|
+
job.init(args['JOB_NAME'], args)
|
|
28
|
+
|
|
29
|
+
options = {
|
|
30
|
+
"url": args['db_url'],
|
|
31
|
+
"dbtable": args['db_table'],
|
|
32
|
+
"user": args['db_user'],
|
|
33
|
+
"password": args['db_password'],
|
|
34
|
+
"sampleQuery": (
|
|
35
|
+
f"SELECT * FROM {args['db_table']} "
|
|
36
|
+
f"WHERE created_at >= '{args['start_date']}' "
|
|
37
|
+
f"AND created_at < '{args['end_date']}'"
|
|
38
|
+
)
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
df = spark.read.format("jdbc").options(**options).load()
|
|
42
|
+
|
|
43
|
+
# Agregar columnas derivadas necesarias para las particiones.
|
|
44
|
+
# isp_id ya existe en la tabla fuente — solo agregar las que se calculan.
|
|
45
|
+
# Personalizar esta sección según las partition_keys de cada tabla.
|
|
46
|
+
df_final = (
|
|
47
|
+
df.withColumn("year", year(col("created_at")))
|
|
48
|
+
.withColumn("month", month(col("created_at")))
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
output_path = f"s3://{args['s3_bucket']}/{args['s3_folder']}/"
|
|
52
|
+
partitions = args['partition_by'].split(",")
|
|
53
|
+
|
|
54
|
+
(df_final.write.mode("overwrite")
|
|
55
|
+
.partitionBy(*partitions)
|
|
56
|
+
.format("parquet")
|
|
57
|
+
.option("compression", "zstd")
|
|
58
|
+
.save(output_path))
|
|
59
|
+
|
|
60
|
+
job.commit()
|
|
@@ -6,10 +6,10 @@ module DataDrain
|
|
|
6
6
|
# Contenedor para todas las opciones de configuración del motor DataDrain.
|
|
7
7
|
class Configuration
|
|
8
8
|
attr_accessor :storage_mode, :aws_region,
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
9
|
+
:aws_access_key_id, :aws_secret_access_key,
|
|
10
|
+
:db_host, :db_port, :db_user, :db_pass, :db_name,
|
|
11
|
+
:batch_size, :throttle_delay, :logger, :limit_ram, :tmp_directory,
|
|
12
|
+
:idle_in_transaction_session_timeout
|
|
13
13
|
|
|
14
14
|
def initialize
|
|
15
15
|
@storage_mode = :local
|
|
@@ -20,12 +20,56 @@ module DataDrain
|
|
|
20
20
|
@limit_ram = nil # eg 2GB
|
|
21
21
|
@tmp_directory = nil # eg /tmp/duckdb_work
|
|
22
22
|
@idle_in_transaction_session_timeout = 0
|
|
23
|
-
@logger
|
|
23
|
+
@logger = Logger.new($stdout)
|
|
24
24
|
end
|
|
25
25
|
|
|
26
26
|
# @return [String] Cadena de conexión optimizada para DuckDB.
|
|
27
27
|
def duckdb_connection_string
|
|
28
28
|
"postgresql://#{@db_user}:#{@db_pass}@#{@db_host}:#{@db_port}/#{@db_name}?options=-c%20idle_in_transaction_session_timeout%3D#{@idle_in_transaction_session_timeout}"
|
|
29
29
|
end
|
|
30
|
+
|
|
31
|
+
# Valida invariantes generales (storage_mode + AWS si aplica).
|
|
32
|
+
# Llamado por FileIngestor#initialize y GlueRunner.run_and_wait.
|
|
33
|
+
#
|
|
34
|
+
# @raise [DataDrain::ConfigurationError]
|
|
35
|
+
def validate!
|
|
36
|
+
validate_storage_mode!
|
|
37
|
+
validate_aws_config! if storage_mode.to_sym == :s3
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Valida además las credenciales PostgreSQL.
|
|
41
|
+
# Llamado por Engine#initialize.
|
|
42
|
+
#
|
|
43
|
+
# @raise [DataDrain::ConfigurationError]
|
|
44
|
+
def validate_for_engine!
|
|
45
|
+
validate!
|
|
46
|
+
validate_db_config!
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
private
|
|
50
|
+
|
|
51
|
+
def validate_storage_mode!
|
|
52
|
+
return if %i[local s3].include?(storage_mode.to_sym)
|
|
53
|
+
|
|
54
|
+
raise DataDrain::ConfigurationError,
|
|
55
|
+
"storage_mode debe ser :local o :s3, recibido #{storage_mode.inspect}"
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def validate_aws_config!
|
|
59
|
+
return unless aws_region.nil? || aws_region.to_s.empty?
|
|
60
|
+
|
|
61
|
+
raise DataDrain::ConfigurationError,
|
|
62
|
+
"aws_region es obligatorio con storage_mode = :s3"
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def validate_db_config!
|
|
66
|
+
%i[db_host db_user db_name].each do |attr|
|
|
67
|
+
val = public_send(attr)
|
|
68
|
+
next unless val.nil? || val.to_s.empty?
|
|
69
|
+
|
|
70
|
+
raise DataDrain::ConfigurationError,
|
|
71
|
+
"config.#{attr} es obligatorio para Engine (storage_mode=#{storage_mode})"
|
|
72
|
+
end
|
|
73
|
+
end
|
|
30
74
|
end
|
|
31
75
|
end
|
data/lib/data_drain/engine.rb
CHANGED
|
@@ -14,22 +14,35 @@ module DataDrain
|
|
|
14
14
|
# @param job_name [String] Nombre del Job en la consola de AWS.
|
|
15
15
|
# @param arguments [Hash] Argumentos de ejecución (deben empezar con --).
|
|
16
16
|
# @param polling_interval [Integer] Segundos de espera entre cada chequeo de estado.
|
|
17
|
+
# @param max_wait_seconds [Integer, nil] Timeout máximo en segundos.
|
|
18
|
+
# nil = sin límite (comportamiento anterior).
|
|
17
19
|
# @return [Boolean] true si el Job terminó exitosamente (SUCCEEDED).
|
|
18
|
-
# @raise [
|
|
19
|
-
|
|
20
|
+
# @raise [DataDrain::Error] si max_wait_seconds excede antes de SUCCEEDED.
|
|
21
|
+
# @raise [RuntimeError] si el Job falla o se detiene.
|
|
22
|
+
def self.run_and_wait(job_name, arguments = {}, polling_interval: 30, max_wait_seconds: nil)
|
|
20
23
|
config = DataDrain.configuration
|
|
24
|
+
config.validate!
|
|
21
25
|
client = Aws::Glue::Client.new(region: config.aws_region)
|
|
22
26
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
23
27
|
|
|
24
|
-
|
|
25
|
-
# Pero como extendemos Observability, usamos safe_log directamente.
|
|
26
|
-
@logger = config.logger
|
|
28
|
+
@logger = config.logger
|
|
27
29
|
|
|
28
30
|
safe_log(:info, "glue_runner.start", { job: job_name })
|
|
29
31
|
resp = client.start_job_run(job_name: job_name, arguments: arguments)
|
|
30
32
|
run_id = resp.job_run_id
|
|
31
33
|
|
|
32
34
|
loop do
|
|
35
|
+
if max_wait_seconds &&
|
|
36
|
+
(Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) > max_wait_seconds
|
|
37
|
+
safe_log(:error, "glue_runner.timeout", {
|
|
38
|
+
job: job_name,
|
|
39
|
+
run_id: run_id,
|
|
40
|
+
max_wait_seconds: max_wait_seconds
|
|
41
|
+
})
|
|
42
|
+
raise DataDrain::Error,
|
|
43
|
+
"Glue Job #{job_name} (Run ID: #{run_id}) excedió max_wait_seconds=#{max_wait_seconds}"
|
|
44
|
+
end
|
|
45
|
+
|
|
33
46
|
run_info = client.get_job_run(job_name: job_name, run_id: run_id).job_run
|
|
34
47
|
status = run_info.job_run_state
|
|
35
48
|
|
|
@@ -41,15 +54,14 @@ module DataDrain
|
|
|
41
54
|
when "FAILED", "STOPPED", "TIMEOUT"
|
|
42
55
|
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
43
56
|
error_metadata = { job: job_name, run_id: run_id, status: status, duration_s: duration.round(2) }
|
|
44
|
-
|
|
45
|
-
if run_info.error_message
|
|
46
|
-
error_metadata[:error_message] = run_info.error_message.gsub("\"", "'")[0, 200]
|
|
47
|
-
end
|
|
57
|
+
|
|
58
|
+
error_metadata[:error_message] = run_info.error_message.gsub("\"", "'")[0, 200] if run_info.error_message
|
|
48
59
|
|
|
49
60
|
safe_log(:error, "glue_runner.failed", error_metadata)
|
|
50
61
|
raise "Glue Job #{job_name} (Run ID: #{run_id}) falló con estado #{status}."
|
|
51
62
|
else
|
|
52
|
-
safe_log(:info, "glue_runner.polling",
|
|
63
|
+
safe_log(:info, "glue_runner.polling",
|
|
64
|
+
{ job: job_name, run_id: run_id, status: status, next_check_in_s: polling_interval })
|
|
53
65
|
sleep polling_interval
|
|
54
66
|
end
|
|
55
67
|
end
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module DataDrain
|
|
4
|
-
# Módulo interno para garantizar que la telemetría cumpla con los
|
|
4
|
+
# Módulo interno para garantizar que la telemetría cumpla con los
|
|
5
5
|
# Global-Observability-Standards: resiliencia, KV-structured y precisión.
|
|
6
6
|
#
|
|
7
7
|
# Este módulo es genérico y puede ser utilizado en otras gemas.
|
|
8
8
|
# @api private
|
|
9
9
|
module Observability
|
|
10
|
+
SENSITIVE_KEY_PATTERN = /password|passwd|pass|secret|token|api_key|apikey|auth|credential|private_key/i
|
|
11
|
+
|
|
10
12
|
private
|
|
11
13
|
|
|
12
14
|
# Emite un log estructurado de forma segura.
|
|
@@ -19,7 +21,7 @@ module DataDrain
|
|
|
19
21
|
|
|
20
22
|
# Enmascaramiento preventivo de secretos (Security)
|
|
21
23
|
log_line = fields.map do |k, v|
|
|
22
|
-
val =
|
|
24
|
+
val = SENSITIVE_KEY_PATTERN.match?(k.to_s) ? "[FILTERED]" : v
|
|
23
25
|
"#{k}=#{val}"
|
|
24
26
|
end.join(" ")
|
|
25
27
|
|
data/lib/data_drain/record.rb
CHANGED
|
@@ -38,7 +38,8 @@ module DataDrain
|
|
|
38
38
|
|
|
39
39
|
entry[:conn]&.close
|
|
40
40
|
entry[:db]&.close
|
|
41
|
-
rescue StandardError
|
|
41
|
+
rescue StandardError
|
|
42
|
+
nil
|
|
42
43
|
end
|
|
43
44
|
|
|
44
45
|
# Retorna la conexión persistente a DuckDB en memoria para el hilo (Thread) actual.
|
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
module DataDrain
|
|
4
4
|
module Storage
|
|
5
|
-
# Implementación del adaptador de almacenamiento para Amazon S3.
|
|
6
5
|
class S3 < Base
|
|
7
6
|
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
7
|
+
|
|
8
8
|
# Carga la extensión httpfs en DuckDB e inyecta las credenciales de AWS.
|
|
9
9
|
# Si aws_access_key_id y aws_secret_access_key están seteados, usa
|
|
10
10
|
# credenciales explícitas. Si no, usa credential_chain (IAM role, env vars,
|
|
@@ -16,42 +16,6 @@ module DataDrain
|
|
|
16
16
|
create_s3_secret(connection)
|
|
17
17
|
end
|
|
18
18
|
|
|
19
|
-
private
|
|
20
|
-
|
|
21
|
-
# @param connection [DuckDB::Connection]
|
|
22
|
-
# @raise [DataDrain::ConfigurationError]
|
|
23
|
-
def create_s3_secret(connection)
|
|
24
|
-
region = @config.aws_region
|
|
25
|
-
raise DataDrain::ConfigurationError, "aws_region es obligatorio para storage_mode=:s3" if region.nil?
|
|
26
|
-
|
|
27
|
-
if @config.aws_access_key_id && @config.aws_secret_access_key
|
|
28
|
-
connection.query(<<~SQL)
|
|
29
|
-
CREATE OR REPLACE SECRET data_drain_s3 (
|
|
30
|
-
TYPE S3,
|
|
31
|
-
KEY_ID '#{escape_sql(@config.aws_access_key_id)}',
|
|
32
|
-
SECRET '#{escape_sql(@config.aws_secret_access_key)}',
|
|
33
|
-
REGION '#{escape_sql(region)}'
|
|
34
|
-
);
|
|
35
|
-
SQL
|
|
36
|
-
else
|
|
37
|
-
connection.query(<<~SQL)
|
|
38
|
-
CREATE OR REPLACE SECRET data_drain_s3 (
|
|
39
|
-
TYPE S3,
|
|
40
|
-
PROVIDER credential_chain,
|
|
41
|
-
REGION '#{escape_sql(region)}'
|
|
42
|
-
);
|
|
43
|
-
SQL
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# @param value [String]
|
|
48
|
-
# @return [String]
|
|
49
|
-
def escape_sql(value)
|
|
50
|
-
value.to_s.gsub("'", "''")
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
public
|
|
54
|
-
|
|
55
19
|
# @param bucket [String]
|
|
56
20
|
# @param folder_name [String]
|
|
57
21
|
# @param partition_path [String, nil]
|
|
@@ -96,6 +60,38 @@ module DataDrain
|
|
|
96
60
|
|
|
97
61
|
private
|
|
98
62
|
|
|
63
|
+
# @param connection [DuckDB::Connection]
|
|
64
|
+
# @raise [DataDrain::ConfigurationError]
|
|
65
|
+
def create_s3_secret(connection)
|
|
66
|
+
region = @config.aws_region
|
|
67
|
+
raise DataDrain::ConfigurationError, "aws_region es obligatorio para storage_mode=:s3" if region.nil?
|
|
68
|
+
|
|
69
|
+
if @config.aws_access_key_id && @config.aws_secret_access_key
|
|
70
|
+
connection.query(<<~SQL)
|
|
71
|
+
CREATE OR REPLACE SECRET data_drain_s3 (
|
|
72
|
+
TYPE S3,
|
|
73
|
+
KEY_ID '#{escape_sql(@config.aws_access_key_id)}',
|
|
74
|
+
SECRET '#{escape_sql(@config.aws_secret_access_key)}',
|
|
75
|
+
REGION '#{escape_sql(region)}'
|
|
76
|
+
);
|
|
77
|
+
SQL
|
|
78
|
+
else
|
|
79
|
+
connection.query(<<~SQL)
|
|
80
|
+
CREATE OR REPLACE SECRET data_drain_s3 (
|
|
81
|
+
TYPE S3,
|
|
82
|
+
PROVIDER credential_chain,
|
|
83
|
+
REGION '#{escape_sql(region)}'
|
|
84
|
+
);
|
|
85
|
+
SQL
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
# @param value [String]
|
|
90
|
+
# @return [String]
|
|
91
|
+
def escape_sql(value)
|
|
92
|
+
value.to_s.gsub("'", "''")
|
|
93
|
+
end
|
|
94
|
+
|
|
99
95
|
# @param client [Aws::S3::Client]
|
|
100
96
|
# @param bucket [String]
|
|
101
97
|
# @param objects_to_delete [Array<Hash>]
|
data/lib/data_drain/version.rb
CHANGED
data/skill/SKILL.md
CHANGED
|
@@ -213,3 +213,4 @@ Catálogo completo en [Antipatrones](references/antipatrones.md). Resumen de los
|
|
|
213
213
|
- [API Detallada](references/api-detallada.md) — Firmas completas, parámetros, retornos y comportamientos de cada clase pública.
|
|
214
214
|
- [Eventos y Telemetría](references/eventos-telemetria.md) — Catálogo completo de eventos KV emitidos por la gema.
|
|
215
215
|
- [Antipatrones](references/antipatrones.md) — Qué NO hacer y alternativas correctas.
|
|
216
|
+
- [Postgres Tuning](references/postgres-tuning.md) — Índices, VACUUM, particionamiento y diagnóstico por tamaño de tabla.
|
|
@@ -207,7 +207,7 @@ ArchivedX.connection.close # Rompe la siguiente query del mismo thread
|
|
|
207
207
|
|
|
208
208
|
**Razón:** `Record.connection` es thread-local y persistente — diseñada para amortizar el costo de cargar `httpfs` y credenciales. Cerrarla obliga a reconectar todo en la próxima query y puede dejar el `Thread.current` apuntando a una conexión muerta (`Database` GC'd).
|
|
209
209
|
|
|
210
|
-
**Alternativa:** No
|
|
210
|
+
**Alternativa:** No usar `Record.connection.close` directamente. Si necesitás cerrar (Sidekiq/Puma middleware), usar `Record.disconnect!` que cierra `db` + `conn` y limpia `Thread.current` atómicamente. En threads de larga vida, esto previene memory leak.
|
|
211
211
|
|
|
212
212
|
---
|
|
213
213
|
|
|
@@ -234,9 +234,26 @@ DataDrain::Engine.new(
|
|
|
234
234
|
|
|
235
235
|
**Incorrecto:**
|
|
236
236
|
```ruby
|
|
237
|
-
DataDrain::GlueRunner.run_and_wait("job", args) #
|
|
237
|
+
DataDrain::GlueRunner.run_and_wait("job", args) # Sin timeout, puede bloquearse
|
|
238
238
|
```
|
|
239
239
|
|
|
240
|
-
**Razón:**
|
|
240
|
+
**Razón:** Si Glue queda colgado en `RUNNING`, bloquea indefinidamente.
|
|
241
241
|
|
|
242
|
-
**Alternativa:**
|
|
242
|
+
**Alternativa:** Usar `max_wait_seconds:` (desde v0.2.2):
|
|
243
|
+
```ruby
|
|
244
|
+
GlueRunner.run_and_wait("job", args, max_wait_seconds: 3600) # 1h max
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## 15. Llamar `Engine.new` con configuración incompleta
|
|
250
|
+
|
|
251
|
+
**Incorrecto:**
|
|
252
|
+
```ruby
|
|
253
|
+
DataDrain::Engine.new(table_name: "versions", start_date: ..., end_date: ...)
|
|
254
|
+
# donde DataDrain.configuration no tiene db_name seteado
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
**Razón:** Desde v0.2.2, `Engine#initialize` llama `config.validate_for_engine!` que verifica `db_host`, `db_user`, `db_name`. Si alguno falta, levanta `DataDrain::ConfigurationError`.
|
|
258
|
+
|
|
259
|
+
**Alternativa:** Asegurar que `db_name`, `db_user` y `db_host` estén seteados en `DataDrain.configure` antes de llamar `Engine.new`. Si se usa auth peer/trust, `db_pass` puede ser `nil`.
|
|
@@ -40,7 +40,20 @@ Atributos (`attr_accessor`):
|
|
|
40
40
|
### `#duckdb_connection_string`
|
|
41
41
|
Retorna URI: `postgresql://user:pass@host:port/db?options=-c%20idle_in_transaction_session_timeout%3D<val>`
|
|
42
42
|
|
|
43
|
-
|
|
43
|
+
### `#validate!`
|
|
44
|
+
Valida invariantes generales. Llamada automáticamente por `FileIngestor#initialize` y `GlueRunner.run_and_wait`.
|
|
45
|
+
|
|
46
|
+
Raises `DataDrain::ConfigurationError` si:
|
|
47
|
+
- `storage_mode` no es `:local` ni `:s3`
|
|
48
|
+
- `storage_mode == :s3` y `aws_region` es nil o vacío
|
|
49
|
+
|
|
50
|
+
### `#validate_for_engine!`
|
|
51
|
+
Valida invariantes de Engine. Además de `#validate!`, verifica `db_host`, `db_user`, `db_name` no nil ni vacíos.
|
|
52
|
+
|
|
53
|
+
Llamada automáticamente por `Engine#initialize`.
|
|
54
|
+
|
|
55
|
+
**No valida `db_pass`** — puede ser nil con auth peer/trust (sockets locales) o IAM (RDS).
|
|
56
|
+
**No valida `db_port`** — tiene default `5432`, nunca nil tras `Configuration#initialize`.
|
|
44
57
|
|
|
45
58
|
---
|
|
46
59
|
|
|
@@ -164,24 +177,24 @@ Formato: `#<Class attr1: val1, attr2: val2, ...>`.
|
|
|
164
177
|
|
|
165
178
|
## `DataDrain::GlueRunner`
|
|
166
179
|
|
|
167
|
-
### `.run_and_wait(job_name, arguments = {}, polling_interval: 30) → true`
|
|
180
|
+
### `.run_and_wait(job_name, arguments = {}, polling_interval: 30, max_wait_seconds: nil) → true`
|
|
168
181
|
|
|
169
182
|
| Parámetro | Tipo | Descripción |
|
|
170
183
|
|-----------|------|-------------|
|
|
171
184
|
| `job_name` | String | Nombre del Job en consola AWS |
|
|
172
185
|
| `arguments` | Hash | Args con prefijo `--` (ej. `"--start_date" => "..."`) |
|
|
173
186
|
| `polling_interval` | Integer | Segundos entre chequeos. Default `30` |
|
|
187
|
+
| `max_wait_seconds` | Integer, nil | Timeout máximo. nil = sin límite. Default `nil` |
|
|
174
188
|
|
|
175
189
|
Flujo:
|
|
176
190
|
1. `Aws::Glue::Client.new(region: config.aws_region)`
|
|
177
191
|
2. `start_job_run` → captura `run_id`
|
|
178
192
|
3. Loop: `get_job_run`, evalúa `job_run_state`:
|
|
193
|
+
- Si `max_wait_seconds` excede → log `glue_runner.timeout`, `raise DataDrain::Error`
|
|
179
194
|
- `SUCCEEDED` → log `glue_runner.complete`, retorna `true`
|
|
180
195
|
- `FAILED|STOPPED|TIMEOUT` → log `glue_runner.failed` (incluye `error_message` truncado a 200 chars), `raise RuntimeError`
|
|
181
196
|
- Otro → log `glue_runner.polling`, `sleep polling_interval`
|
|
182
197
|
|
|
183
|
-
No tiene timeout máximo. Si Glue queda colgado en `RUNNING`, esto bloquea indefinidamente.
|
|
184
|
-
|
|
185
198
|
---
|
|
186
199
|
|
|
187
200
|
## `DataDrain::Storage`
|
|
@@ -220,7 +233,7 @@ Diseñado para `include` (instance methods, requiere `@logger`) o `extend` (clas
|
|
|
220
233
|
### `#safe_log(level, event, metadata = {})` (privado)
|
|
221
234
|
- Si `@logger` es nil, no-op.
|
|
222
235
|
- Construye `fields = { component: observability_name, event: event }.merge(metadata)`.
|
|
223
|
-
- Filtra valores cuyas keys
|
|
236
|
+
- Filtra valores cuyas keys matcheen `SENSITIVE_KEY_PATTERN = /password|passwd|pass|secret|token|api_key|apikey|auth|credential|private_key/i` → `[FILTERED]`. Aplica a claves exactas (`password`) y variantes (`db_password`, `aws_secret_access_key`, `bearer_token`, etc.).
|
|
224
237
|
- Emite `@logger.send(level) { "k1=v1 k2=v2 ..." }`.
|
|
225
238
|
- `rescue StandardError` silencioso (resilience).
|
|
226
239
|
|
|
@@ -128,6 +128,11 @@ Catálogo completo de eventos KV emitidos por DataDrain. Formato Wispro-Observab
|
|
|
128
128
|
**Campos:** `job`, `run_id`, `status`, `duration_s`, `error_message` (si Glue lo provee, truncado a 200 chars).
|
|
129
129
|
**Consecuencia:** `raise RuntimeError`.
|
|
130
130
|
|
|
131
|
+
### `glue_runner.timeout`
|
|
132
|
+
**Nivel:** ERROR. Emite cuando `max_wait_seconds` excede antes de `SUCCEEDED`.
|
|
133
|
+
**Campos:** `job`, `run_id`, `max_wait_seconds`.
|
|
134
|
+
**Consecuencia:** `raise DataDrain::Error`.
|
|
135
|
+
|
|
131
136
|
---
|
|
132
137
|
|
|
133
138
|
## Ejemplos reales
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# Postgres Tuning para DataDrain
|
|
2
|
+
|
|
3
|
+
Guía operacional para tablas que DataDrain archiva y purga. Cubre índices,
|
|
4
|
+
VACUUM, particionamiento y diagnóstico.
|
|
5
|
+
|
|
6
|
+
## Tabla de decisión por tamaño
|
|
7
|
+
|
|
8
|
+
| Tamaño | Estrategia |
|
|
9
|
+
|--------|-----------|
|
|
10
|
+
| <10GB | Índice composite `(created_at, pk)` con `CREATE INDEX CONCURRENTLY` |
|
|
11
|
+
| 10-100GB | Mismo + `SET maintenance_work_mem='4GB'` + checklist |
|
|
12
|
+
| 100GB-1TB | Particionamiento declarativo por mes |
|
|
13
|
+
| >1TB | Particionamiento obligatorio + `DROP TABLE` de la partición (o `DETACH PARTITION`) reemplaza DELETE |
|
|
14
|
+
|
|
15
|
+
## Índice recomendado
|
|
16
|
+
|
|
17
|
+
Para tablas <100GB, DataDrain se beneficia de un índice composite:
|
|
18
|
+
|
|
19
|
+
CREATE INDEX CONCURRENTLY idx_versions_created_at_id
|
|
20
|
+
ON versions (created_at, id);
|
|
21
|
+
|
|
22
|
+
El DELETE en batches usa `WHERE created_at >= X AND created_at < Y` + `IN (SELECT id LIMIT N)`.
|
|
23
|
+
El índice composite lo convierte en index scan por rango + acceso directo al id.
|
|
24
|
+
|
|
25
|
+
### Checklist pre-`CREATE INDEX CONCURRENTLY`
|
|
26
|
+
|
|
27
|
+
- [ ] Tamaño actual: `SELECT pg_size_pretty(pg_total_relation_size('versions'));`
|
|
28
|
+
- [ ] Espacio libre disco (>2x tabla)
|
|
29
|
+
- [ ] `SET maintenance_work_mem = '4GB';` (sesión)
|
|
30
|
+
- [ ] `SET statement_timeout = 0;`
|
|
31
|
+
- [ ] Ventana de baja carga
|
|
32
|
+
- [ ] Plan rollback: `DROP INDEX CONCURRENTLY` si satura I/O
|
|
33
|
+
|
|
34
|
+
### Riesgos de `CONCURRENTLY`
|
|
35
|
+
|
|
36
|
+
1. **Dos pasadas** (puede tardar horas en 500GB)
|
|
37
|
+
2. **I/O sostenido** (satura IOPS en EBS gp3 sin provisioned)
|
|
38
|
+
3. **Puede fallar y dejar índice INVALID** → recuperar con `DROP INDEX CONCURRENTLY idx; CREATE INDEX CONCURRENTLY idx ...`
|
|
39
|
+
4. **Espacio en disco alto** durante build (sort externo si `maintenance_work_mem` bajo)
|
|
40
|
+
|
|
41
|
+
## VACUUM ANALYZE post-purga
|
|
42
|
+
|
|
43
|
+
En tablas no particionadas, purgar millones de rows deja dead tuples.
|
|
44
|
+
Sin VACUUM, el espacio no se libera y los seq scan recorren páginas vacías.
|
|
45
|
+
|
|
46
|
+
VACUUM ANALYZE versions;
|
|
47
|
+
|
|
48
|
+
Item 5 del roadmap agrega `config.vacuum_after_purge` para automatizar esto.
|
|
49
|
+
Hasta v0.3.0, correr manualmente después de cada `Engine#call` en tablas
|
|
50
|
+
grandes no particionadas.
|
|
51
|
+
|
|
52
|
+
**NO usar `VACUUM FULL`** — bloquea la tabla entera (ACCESS EXCLUSIVE lock).
|
|
53
|
+
|
|
54
|
+
## Diagnóstico de purga lenta
|
|
55
|
+
|
|
56
|
+
-- Plan del DELETE en lotes
|
|
57
|
+
EXPLAIN (ANALYZE, BUFFERS)
|
|
58
|
+
DELETE FROM versions
|
|
59
|
+
WHERE id IN (
|
|
60
|
+
SELECT id FROM versions
|
|
61
|
+
WHERE created_at >= '2026-01-01' AND created_at < '2026-02-01'
|
|
62
|
+
LIMIT 5000
|
|
63
|
+
);
|
|
64
|
+
|
|
65
|
+
-- Sesiones activas sobre la tabla
|
|
66
|
+
SELECT pid, state, wait_event, query_start, query
|
|
67
|
+
FROM pg_stat_activity
|
|
68
|
+
WHERE query LIKE '%versions%'
|
|
69
|
+
AND state != 'idle';
|
|
70
|
+
|
|
71
|
+
-- Estadísticas de la tabla
|
|
72
|
+
SELECT relname, n_live_tup, n_dead_tup, last_vacuum, last_autovacuum
|
|
73
|
+
FROM pg_stat_user_tables
|
|
74
|
+
WHERE relname = 'versions';
|
|
75
|
+
|
|
76
|
+
-- Top queries lentas (requiere pg_stat_statements)
|
|
77
|
+
SELECT substring(query, 1, 100) AS query, calls, mean_exec_time, rows
|
|
78
|
+
FROM pg_stat_statements
|
|
79
|
+
WHERE query LIKE '%versions%'
|
|
80
|
+
ORDER BY mean_exec_time DESC
|
|
81
|
+
LIMIT 10;
|
|
82
|
+
|
|
83
|
+
## Particionamiento declarativo (tablas > 100GB)
|
|
84
|
+
|
|
85
|
+
Migrar a tabla particionada cambia DataDrain de "DELETE masivo throttled" a
|
|
86
|
+
"DROP TABLE de la partición, instantáneo".
|
|
87
|
+
|
|
88
|
+
### Setup
|
|
89
|
+
|
|
90
|
+
-- 1. Crear tabla particionada (vacía, misma estructura que versions)
|
|
91
|
+
CREATE TABLE versions_new (
|
|
92
|
+
id UUID NOT NULL, -- la PK debe incluir la columna de partición: declarar PRIMARY KEY (id, created_at)
|
|
93
|
+
created_at TIMESTAMP NOT NULL,
|
|
94
|
+
... -- resto de columnas
|
|
95
|
+
) PARTITION BY RANGE (created_at);
|
|
96
|
+
|
|
97
|
+
-- 2. Crear partición por mes
|
|
98
|
+
CREATE TABLE versions_2026_03 PARTITION OF versions_new
|
|
99
|
+
FOR VALUES FROM ('2026-03-01') TO ('2026-04-01');
|
|
100
|
+
|
|
101
|
+
-- 3. Migrar datos (lotes, una partición por vez)
|
|
102
|
+
INSERT INTO versions_2026_03
|
|
103
|
+
SELECT * FROM versions
|
|
104
|
+
WHERE created_at >= '2026-03-01' AND created_at < '2026-04-01';
|
|
105
|
+
|
|
106
|
+
-- 4. Swap nombres (downtime mínimo)
|
|
107
|
+
BEGIN;
|
|
108
|
+
ALTER TABLE versions RENAME TO versions_old;
|
|
109
|
+
ALTER TABLE versions_new RENAME TO versions;
|
|
110
|
+
COMMIT;
|
|
111
|
+
|
|
112
|
+
### Beneficio para DataDrain
|
|
113
|
+
|
|
114
|
+
-- v0.2.x: DELETE en lotes, VACUUM después, horas en TB
|
|
115
|
+
DataDrain::Engine.new(...).call
|
|
116
|
+
|
|
117
|
+
-- Con particiones: DataDrain sigue funcionando pero si el rango
|
|
118
|
+
-- coincide con una partición, el operador puede hacer:
|
|
119
|
+
DROP TABLE versions_2026_03; -- instantáneo, sin bloat
|
|
120
|
+
|
|
121
|
+
DataDrain no detecta particiones automáticamente (futuro item). Hoy el
|
|
122
|
+
operador decide.
|
|
123
|
+
|
|
124
|
+
## Referencias
|
|
125
|
+
|
|
126
|
+
- Skill: `.agents/skills/postgresql-optimization/SKILL.md`
|
|
127
|
+
- PG docs: https://www.postgresql.org/docs/current/ddl-partitioning.html
|
|
128
|
+
- Item 5 roadmap (VACUUM automático): ../IMPROVEMENT_PLAN.md#item-5
|
|
129
|
+
- Item 11b roadmap (warning runtime): ../IMPROVEMENT_PLAN.md#item-11b
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_drain
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.
|
|
4
|
+
version: 0.2.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gabriel
|
|
@@ -97,6 +97,11 @@ files:
|
|
|
97
97
|
- README.md
|
|
98
98
|
- Rakefile
|
|
99
99
|
- data_drain.gemspec
|
|
100
|
+
- docs/IMPROVEMENT_PLAN.md
|
|
101
|
+
- docs/execution/archive/v0.2.0.agente-review.md
|
|
102
|
+
- docs/execution/archive/v0.2.0.md
|
|
103
|
+
- docs/execution/v0.2.2.md
|
|
104
|
+
- docs/glue_pyspark_example.py
|
|
100
105
|
- lib/data_drain.rb
|
|
101
106
|
- lib/data_drain/configuration.rb
|
|
102
107
|
- lib/data_drain/engine.rb
|
|
@@ -117,6 +122,7 @@ files:
|
|
|
117
122
|
- skill/references/antipatrones.md
|
|
118
123
|
- skill/references/api-detallada.md
|
|
119
124
|
- skill/references/eventos-telemetria.md
|
|
125
|
+
- skill/references/postgres-tuning.md
|
|
120
126
|
homepage: https://github.com/gedera/data_drain
|
|
121
127
|
licenses: []
|
|
122
128
|
metadata: {}
|