data_drain 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +46 -1
- data/CLAUDE.md +3 -1
- data/README.md +3 -0
- data/docs/IMPROVEMENT_PLAN.md +271 -11
- data/docs/execution/v0.2.2.md +891 -0
- data/lib/data_drain/configuration.rb +55 -5
- data/lib/data_drain/engine.rb +183 -72
- data/lib/data_drain/file_ingestor.rb +65 -47
- data/lib/data_drain/glue_runner.rb +22 -10
- data/lib/data_drain/observability/timing.rb +23 -0
- data/lib/data_drain/observability.rb +4 -2
- data/lib/data_drain/record.rb +10 -16
- data/lib/data_drain/storage/s3.rb +60 -45
- data/lib/data_drain/version.rb +1 -1
- data/lib/data_drain.rb +1 -0
- data/skill/SKILL.md +1 -0
- data/skill/references/antipatrones.md +20 -3
- data/skill/references/api-detallada.md +18 -5
- data/skill/references/eventos-telemetria.md +5 -0
- data/skill/references/postgres-tuning.md +129 -0
- metadata +5 -2
data/lib/data_drain/record.rb
CHANGED
|
@@ -38,14 +38,14 @@ module DataDrain
|
|
|
38
38
|
|
|
39
39
|
entry[:conn]&.close
|
|
40
40
|
entry[:db]&.close
|
|
41
|
-
rescue StandardError
|
|
41
|
+
rescue StandardError
|
|
42
|
+
nil
|
|
42
43
|
end
|
|
43
44
|
|
|
44
45
|
# Retorna la conexión persistente a DuckDB en memoria para el hilo (Thread) actual.
|
|
45
46
|
# Esto previene tener que recargar extensiones (como httpfs) en cada consulta.
|
|
46
47
|
#
|
|
47
48
|
# @return [DuckDB::Connection] Conexión activa a DuckDB.
|
|
48
|
-
# rubocop:disable Metrics/AbcSize
|
|
49
49
|
def self.connection
|
|
50
50
|
Thread.current[:data_drain_duckdb] ||= begin
|
|
51
51
|
db = DuckDB::Database.open(":memory:")
|
|
@@ -56,11 +56,13 @@ module DataDrain
|
|
|
56
56
|
conn.query("SET temp_directory='#{config.tmp_directory}'") if config.tmp_directory.present?
|
|
57
57
|
|
|
58
58
|
DataDrain::Storage.adapter.setup_duckdb(conn)
|
|
59
|
+
|
|
60
|
+
conn.query("SET lock_configuration=true;")
|
|
61
|
+
|
|
59
62
|
{ db: db, conn: conn }
|
|
60
63
|
end
|
|
61
64
|
Thread.current[:data_drain_duckdb][:conn]
|
|
62
65
|
end
|
|
63
|
-
# rubocop:enable Metrics/AbcSize
|
|
64
66
|
|
|
65
67
|
# Consulta registros en el Data Lake filtrando por claves de partición.
|
|
66
68
|
#
|
|
@@ -137,22 +139,14 @@ module DataDrain
|
|
|
137
139
|
# @param sql [String]
|
|
138
140
|
# @param columns [Array<String>]
|
|
139
141
|
# @return [Array<DataDrain::Record>]
|
|
140
|
-
# rubocop:disable Metrics/MethodLength
|
|
141
142
|
def execute_and_instantiate(sql, columns)
|
|
142
143
|
@logger = DataDrain.configuration.logger
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
end
|
|
149
|
-
|
|
150
|
-
result.map do |row|
|
|
151
|
-
attributes_hash = columns.zip(row).to_h
|
|
152
|
-
new(attributes_hash)
|
|
153
|
-
end
|
|
144
|
+
result = connection.query(sql)
|
|
145
|
+
result.map { |row| new(columns.zip(row).to_h) }
|
|
146
|
+
rescue DuckDB::Error => e
|
|
147
|
+
safe_log(:warn, "record.parquet_not_found", exception_metadata(e))
|
|
148
|
+
[]
|
|
154
149
|
end
|
|
155
150
|
end
|
|
156
|
-
# rubocop:enable Metrics/MethodLength
|
|
157
151
|
end
|
|
158
152
|
end
|
|
@@ -2,9 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
module DataDrain
|
|
4
4
|
module Storage
|
|
5
|
-
# Implementación del adaptador de almacenamiento para Amazon S3.
|
|
6
5
|
class S3 < Base
|
|
7
|
-
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
8
6
|
# Carga la extensión httpfs en DuckDB e inyecta las credenciales de AWS.
|
|
9
7
|
# Si aws_access_key_id y aws_secret_access_key están seteados, usa
|
|
10
8
|
# credenciales explícitas. Si no, usa credential_chain (IAM role, env vars,
|
|
@@ -16,42 +14,6 @@ module DataDrain
|
|
|
16
14
|
create_s3_secret(connection)
|
|
17
15
|
end
|
|
18
16
|
|
|
19
|
-
private
|
|
20
|
-
|
|
21
|
-
# @param connection [DuckDB::Connection]
|
|
22
|
-
# @raise [DataDrain::ConfigurationError]
|
|
23
|
-
def create_s3_secret(connection)
|
|
24
|
-
region = @config.aws_region
|
|
25
|
-
raise DataDrain::ConfigurationError, "aws_region es obligatorio para storage_mode=:s3" if region.nil?
|
|
26
|
-
|
|
27
|
-
if @config.aws_access_key_id && @config.aws_secret_access_key
|
|
28
|
-
connection.query(<<~SQL)
|
|
29
|
-
CREATE OR REPLACE SECRET data_drain_s3 (
|
|
30
|
-
TYPE S3,
|
|
31
|
-
KEY_ID '#{escape_sql(@config.aws_access_key_id)}',
|
|
32
|
-
SECRET '#{escape_sql(@config.aws_secret_access_key)}',
|
|
33
|
-
REGION '#{escape_sql(region)}'
|
|
34
|
-
);
|
|
35
|
-
SQL
|
|
36
|
-
else
|
|
37
|
-
connection.query(<<~SQL)
|
|
38
|
-
CREATE OR REPLACE SECRET data_drain_s3 (
|
|
39
|
-
TYPE S3,
|
|
40
|
-
PROVIDER credential_chain,
|
|
41
|
-
REGION '#{escape_sql(region)}'
|
|
42
|
-
);
|
|
43
|
-
SQL
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# @param value [String]
|
|
48
|
-
# @return [String]
|
|
49
|
-
def escape_sql(value)
|
|
50
|
-
value.to_s.gsub("'", "''")
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
public
|
|
54
|
-
|
|
55
17
|
# @param bucket [String]
|
|
56
18
|
# @param folder_name [String]
|
|
57
19
|
# @param partition_path [String, nil]
|
|
@@ -68,33 +30,87 @@ module DataDrain
|
|
|
68
30
|
# @param partitions [Hash]
|
|
69
31
|
# @return [Integer]
|
|
70
32
|
def destroy_partitions(bucket, folder_name, partition_keys, partitions)
|
|
71
|
-
client = Aws::S3::Client.new(
|
|
33
|
+
client = s3_client
|
|
34
|
+
prefix, pattern_regex = build_destroy_pattern(folder_name, partition_keys, partitions)
|
|
35
|
+
objects = collect_matching_objects(client, bucket, prefix, pattern_regex)
|
|
36
|
+
delete_in_batches(client, bucket, objects)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
private
|
|
40
|
+
|
|
41
|
+
# @return [Aws::S3::Client]
|
|
42
|
+
def s3_client
|
|
43
|
+
Aws::S3::Client.new(
|
|
72
44
|
region: @config.aws_region,
|
|
73
45
|
access_key_id: @config.aws_access_key_id,
|
|
74
46
|
secret_access_key: @config.aws_secret_access_key
|
|
75
47
|
)
|
|
48
|
+
end
|
|
76
49
|
|
|
50
|
+
# @param folder_name [String]
|
|
51
|
+
# @param partition_keys [Array<Symbol>]
|
|
52
|
+
# @param partitions [Hash]
|
|
53
|
+
# @return [Array(String, Regexp)] prefix y pattern_regex
|
|
54
|
+
def build_destroy_pattern(folder_name, partition_keys, partitions)
|
|
77
55
|
regex_parts = partition_keys.map do |key|
|
|
78
56
|
val = partitions[key]
|
|
79
57
|
val.nil? || val.to_s.empty? ? "#{key}=[^/]+" : "#{key}=#{val}"
|
|
80
58
|
end
|
|
81
|
-
|
|
59
|
+
pattern = Regexp.new("^#{folder_name}/#{regex_parts.join("/")}")
|
|
82
60
|
|
|
83
|
-
objects_to_delete = []
|
|
84
61
|
prefix = "#{folder_name}/"
|
|
85
62
|
first_key = partition_keys.first
|
|
86
63
|
prefix += "#{first_key}=#{partitions[first_key]}/" if partitions[first_key]
|
|
87
64
|
|
|
65
|
+
[prefix, pattern]
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
# @param client [Aws::S3::Client]
|
|
69
|
+
# @param bucket [String]
|
|
70
|
+
# @param prefix [String]
|
|
71
|
+
# @param pattern_regex [Regexp]
|
|
72
|
+
# @return [Array<Hash>]
|
|
73
|
+
def collect_matching_objects(client, bucket, prefix, pattern_regex)
|
|
74
|
+
objects = []
|
|
88
75
|
client.list_objects_v2(bucket: bucket, prefix: prefix).each do |response|
|
|
89
76
|
response.contents.each do |obj|
|
|
90
|
-
|
|
77
|
+
objects << { key: obj.key } if obj.key.match?(pattern_regex)
|
|
91
78
|
end
|
|
92
79
|
end
|
|
80
|
+
objects
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
# @param connection [DuckDB::Connection]
|
|
84
|
+
# @raise [DataDrain::ConfigurationError]
|
|
85
|
+
def create_s3_secret(connection)
|
|
86
|
+
region = @config.aws_region
|
|
87
|
+
raise DataDrain::ConfigurationError, "aws_region es obligatorio para storage_mode=:s3" if region.nil?
|
|
93
88
|
|
|
94
|
-
|
|
89
|
+
if @config.aws_access_key_id && @config.aws_secret_access_key
|
|
90
|
+
connection.query(<<~SQL)
|
|
91
|
+
CREATE OR REPLACE SECRET data_drain_s3 (
|
|
92
|
+
TYPE S3,
|
|
93
|
+
KEY_ID '#{escape_sql(@config.aws_access_key_id)}',
|
|
94
|
+
SECRET '#{escape_sql(@config.aws_secret_access_key)}',
|
|
95
|
+
REGION '#{escape_sql(region)}'
|
|
96
|
+
);
|
|
97
|
+
SQL
|
|
98
|
+
else
|
|
99
|
+
connection.query(<<~SQL)
|
|
100
|
+
CREATE OR REPLACE SECRET data_drain_s3 (
|
|
101
|
+
TYPE S3,
|
|
102
|
+
PROVIDER credential_chain,
|
|
103
|
+
REGION '#{escape_sql(region)}'
|
|
104
|
+
);
|
|
105
|
+
SQL
|
|
106
|
+
end
|
|
95
107
|
end
|
|
96
108
|
|
|
97
|
-
|
|
109
|
+
# @param value [String]
|
|
110
|
+
# @return [String]
|
|
111
|
+
def escape_sql(value)
|
|
112
|
+
value.to_s.gsub("'", "''")
|
|
113
|
+
end
|
|
98
114
|
|
|
99
115
|
# @param client [Aws::S3::Client]
|
|
100
116
|
# @param bucket [String]
|
|
@@ -111,6 +127,5 @@ module DataDrain
|
|
|
111
127
|
deleted_count
|
|
112
128
|
end
|
|
113
129
|
end
|
|
114
|
-
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
115
130
|
end
|
|
116
131
|
end
|
data/lib/data_drain/version.rb
CHANGED
data/lib/data_drain.rb
CHANGED
|
@@ -7,6 +7,7 @@ require_relative "data_drain/configuration"
|
|
|
7
7
|
require_relative "data_drain/validations"
|
|
8
8
|
require_relative "data_drain/storage"
|
|
9
9
|
require_relative "data_drain/observability"
|
|
10
|
+
require_relative "data_drain/observability/timing"
|
|
10
11
|
require_relative "data_drain/engine"
|
|
11
12
|
require_relative "data_drain/record"
|
|
12
13
|
require_relative "data_drain/file_ingestor"
|
data/skill/SKILL.md
CHANGED
|
@@ -213,3 +213,4 @@ Catálogo completo en [Antipatrones](references/antipatrones.md). Resumen de los
|
|
|
213
213
|
- [API Detallada](references/api-detallada.md) — Firmas completas, parámetros, retornos y comportamientos de cada clase pública.
|
|
214
214
|
- [Eventos y Telemetría](references/eventos-telemetria.md) — Catálogo completo de eventos KV emitidos por la gema.
|
|
215
215
|
- [Antipatrones](references/antipatrones.md) — Qué NO hacer y alternativas correctas.
|
|
216
|
+
- [Postgres Tuning](references/postgres-tuning.md) — Índices, VACUUM, particionamiento y diagnóstico por tamaño de tabla.
|
|
@@ -234,9 +234,26 @@ DataDrain::Engine.new(
|
|
|
234
234
|
|
|
235
235
|
**Incorrecto:**
|
|
236
236
|
```ruby
|
|
237
|
-
DataDrain::GlueRunner.run_and_wait("job", args) #
|
|
237
|
+
DataDrain::GlueRunner.run_and_wait("job", args) # Sin timeout, puede bloquearse
|
|
238
238
|
```
|
|
239
239
|
|
|
240
|
-
**Razón:**
|
|
240
|
+
**Razón:** Si Glue queda colgado en `RUNNING`, bloquea indefinidamente.
|
|
241
241
|
|
|
242
|
-
**Alternativa:**
|
|
242
|
+
**Alternativa:** Usar `max_wait_seconds:` (desde v0.2.2):
|
|
243
|
+
```ruby
|
|
244
|
+
GlueRunner.run_and_wait("job", args, max_wait_seconds: 3600) # 1h max
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## 15. Llamar `Engine.new` con configuración incompleta
|
|
250
|
+
|
|
251
|
+
**Incorrecto:**
|
|
252
|
+
```ruby
|
|
253
|
+
DataDrain::Engine.new(table_name: "versions", start_date: ..., end_date: ...)
|
|
254
|
+
# donde DataDrain.configuration no tiene db_name seteado
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
**Razón:** Desde v0.2.2, `Engine#initialize` llama `config.validate_for_engine!` que verifica `db_host`, `db_user`, `db_name`. Si alguno falta, levanta `DataDrain::ConfigurationError`.
|
|
258
|
+
|
|
259
|
+
**Alternativa:** Asegurar que `db_name`, `db_user` y `db_host` estén seteados en `DataDrain.configure` antes de llamar `Engine.new`. Si se usa auth peer/trust, `db_pass` puede ser `nil`.
|
|
@@ -40,7 +40,20 @@ Atributos (`attr_accessor`):
|
|
|
40
40
|
### `#duckdb_connection_string`
|
|
41
41
|
Retorna URI: `postgresql://user:pass@host:port/db?options=-c%20idle_in_transaction_session_timeout%3D<val>`
|
|
42
42
|
|
|
43
|
-
|
|
43
|
+
### `#validate!`
|
|
44
|
+
Valida invariantes generales. Llamada automáticamente por `FileIngestor#initialize` y `GlueRunner.run_and_wait`.
|
|
45
|
+
|
|
46
|
+
Raises `DataDrain::ConfigurationError` si:
|
|
47
|
+
- `storage_mode` no es `:local` ni `:s3`
|
|
48
|
+
- `storage_mode == :s3` y `aws_region` es nil o vacío
|
|
49
|
+
|
|
50
|
+
### `#validate_for_engine!`
|
|
51
|
+
Valida invariantes de Engine. Además de `#validate!`, verifica `db_host`, `db_user`, `db_name` no nil ni vacíos.
|
|
52
|
+
|
|
53
|
+
Llamada automáticamente por `Engine#initialize`.
|
|
54
|
+
|
|
55
|
+
**No valida `db_pass`** — puede ser nil con auth peer/trust (sockets locales) o IAM (RDS).
|
|
56
|
+
**No valida `db_port`** — tiene default `5432`, nunca nil tras `Configuration#initialize`.
|
|
44
57
|
|
|
45
58
|
---
|
|
46
59
|
|
|
@@ -164,24 +177,24 @@ Formato: `#<Class attr1: val1, attr2: val2, ...>`.
|
|
|
164
177
|
|
|
165
178
|
## `DataDrain::GlueRunner`
|
|
166
179
|
|
|
167
|
-
### `.run_and_wait(job_name, arguments = {}, polling_interval: 30) → true`
|
|
180
|
+
### `.run_and_wait(job_name, arguments = {}, polling_interval: 30, max_wait_seconds: nil) → true`
|
|
168
181
|
|
|
169
182
|
| Parámetro | Tipo | Descripción |
|
|
170
183
|
|-----------|------|-------------|
|
|
171
184
|
| `job_name` | String | Nombre del Job en consola AWS |
|
|
172
185
|
| `arguments` | Hash | Args con prefijo `--` (ej. `"--start_date" => "..."`) |
|
|
173
186
|
| `polling_interval` | Integer | Segundos entre chequeos. Default `30` |
|
|
187
|
+
| `max_wait_seconds` | Integer, nil | Timeout máximo. nil = sin límite. Default `nil` |
|
|
174
188
|
|
|
175
189
|
Flujo:
|
|
176
190
|
1. `Aws::Glue::Client.new(region: config.aws_region)`
|
|
177
191
|
2. `start_job_run` → captura `run_id`
|
|
178
192
|
3. Loop: `get_job_run`, evalúa `job_run_state`:
|
|
193
|
+
- Si `max_wait_seconds` excede → log `glue_runner.timeout`, `raise DataDrain::Error`
|
|
179
194
|
- `SUCCEEDED` → log `glue_runner.complete`, retorna `true`
|
|
180
195
|
- `FAILED|STOPPED|TIMEOUT` → log `glue_runner.failed` (incluye `error_message` truncado a 200 chars), `raise RuntimeError`
|
|
181
196
|
- Otro → log `glue_runner.polling`, `sleep polling_interval`
|
|
182
197
|
|
|
183
|
-
No tiene timeout máximo. Si Glue queda colgado en `RUNNING`, esto bloquea indefinidamente.
|
|
184
|
-
|
|
185
198
|
---
|
|
186
199
|
|
|
187
200
|
## `DataDrain::Storage`
|
|
@@ -220,7 +233,7 @@ Diseñado para `include` (instance methods, requiere `@logger`) o `extend` (clas
|
|
|
220
233
|
### `#safe_log(level, event, metadata = {})` (privado)
|
|
221
234
|
- Si `@logger` es nil, no-op.
|
|
222
235
|
- Construye `fields = { component: observability_name, event: event }.merge(metadata)`.
|
|
223
|
-
- Filtra valores cuyas keys
|
|
236
|
+
- Filtra valores cuyas keys matcheen `SENSITIVE_KEY_PATTERN = /password|passwd|pass|secret|token|api_key|apikey|auth|credential|private_key/i` → `[FILTERED]`. Aplica a claves exactas (`password`) y variantes (`db_password`, `aws_secret_access_key`, `bearer_token`, etc.).
|
|
224
237
|
- Emite `@logger.send(level) { "k1=v1 k2=v2 ..." }`.
|
|
225
238
|
- `rescue StandardError` silencioso (resilience).
|
|
226
239
|
|
|
@@ -128,6 +128,11 @@ Catálogo completo de eventos KV emitidos por DataDrain. Formato Wispro-Observab
|
|
|
128
128
|
**Campos:** `job`, `run_id`, `status`, `duration_s`, `error_message` (si Glue lo provee, truncado a 200 chars).
|
|
129
129
|
**Consecuencia:** `raise RuntimeError`.
|
|
130
130
|
|
|
131
|
+
### `glue_runner.timeout`
|
|
132
|
+
**Nivel:** ERROR. Emite cuando `max_wait_seconds` excede antes de `SUCCEEDED`.
|
|
133
|
+
**Campos:** `job`, `run_id`, `max_wait_seconds`.
|
|
134
|
+
**Consecuencia:** `raise DataDrain::Error`.
|
|
135
|
+
|
|
131
136
|
---
|
|
132
137
|
|
|
133
138
|
## Ejemplos reales
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# Postgres Tuning para DataDrain
|
|
2
|
+
|
|
3
|
+
Guía operacional para tablas que DataDrain archiva y purga. Cubre índices,
|
|
4
|
+
VACUUM, particionamiento y diagnóstico.
|
|
5
|
+
|
|
6
|
+
## Tabla de decisión por tamaño
|
|
7
|
+
|
|
8
|
+
| Tamaño | Estrategia |
|
|
9
|
+
|--------|-----------|
|
|
10
|
+
| <10GB | Índice composite `(created_at, pk)` con `CREATE INDEX CONCURRENTLY` |
|
|
11
|
+
| 10-100GB | Mismo + `SET maintenance_work_mem='4GB'` + checklist |
|
|
12
|
+
| 100GB-1TB | Particionamiento declarativo por mes |
|
|
13
|
+
| >1TB | Particionamiento obligatorio + `DROP PARTITION` reemplaza DELETE |
|
|
14
|
+
|
|
15
|
+
## Índice recomendado
|
|
16
|
+
|
|
17
|
+
Para tablas <100GB, DataDrain se beneficia de un índice composite:
|
|
18
|
+
|
|
19
|
+
CREATE INDEX CONCURRENTLY idx_versions_created_at_id
|
|
20
|
+
ON versions (created_at, id);
|
|
21
|
+
|
|
22
|
+
El DELETE en batches usa `WHERE created_at >= X AND created_at < Y` + `IN (SELECT id LIMIT N)`.
|
|
23
|
+
El índice composite lo convierte en index scan por rango + acceso directo al id.
|
|
24
|
+
|
|
25
|
+
### Checklist pre-`CREATE INDEX CONCURRENTLY`
|
|
26
|
+
|
|
27
|
+
- [ ] Tamaño actual: `SELECT pg_size_pretty(pg_total_relation_size('versions'));`
|
|
28
|
+
- [ ] Espacio libre disco (>2x tabla)
|
|
29
|
+
- [ ] `SET maintenance_work_mem = '4GB';` (sesión)
|
|
30
|
+
- [ ] `SET statement_timeout = 0;`
|
|
31
|
+
- [ ] Ventana de baja carga
|
|
32
|
+
- [ ] Plan rollback: `DROP INDEX CONCURRENTLY` si satura I/O
|
|
33
|
+
|
|
34
|
+
### Riesgos de `CONCURRENTLY`
|
|
35
|
+
|
|
36
|
+
1. **Dos pasadas** (puede tardar horas en 500GB)
|
|
37
|
+
2. **I/O sostenido** (satura IOPS en EBS gp3 sin provisioned)
|
|
38
|
+
3. **Puede fallar y dejar índice INVALID** → recuperar con `DROP INDEX CONCURRENTLY idx; CREATE INDEX CONCURRENTLY idx ...`
|
|
39
|
+
4. **Espacio en disco alto** durante build (sort externo si `maintenance_work_mem` bajo)
|
|
40
|
+
|
|
41
|
+
## VACUUM ANALYZE post-purga
|
|
42
|
+
|
|
43
|
+
En tablas no particionadas, purgar millones de rows deja dead tuples.
|
|
44
|
+
Sin VACUUM, el espacio no se libera y los seq scan recorren páginas vacías.
|
|
45
|
+
|
|
46
|
+
VACUUM ANALYZE versions;
|
|
47
|
+
|
|
48
|
+
Item 5 del roadmap agrega `config.vacuum_after_purge` para automatizar esto.
|
|
49
|
+
Hasta v0.3.0, correr manualmente después de cada `Engine#call` en tablas
|
|
50
|
+
grandes no particionadas.
|
|
51
|
+
|
|
52
|
+
**NO usar `VACUUM FULL`** — bloquea la tabla entera (ACCESS EXCLUSIVE lock).
|
|
53
|
+
|
|
54
|
+
## Diagnóstico de purga lenta
|
|
55
|
+
|
|
56
|
+
-- Plan del DELETE en lotes
|
|
57
|
+
EXPLAIN (ANALYZE, BUFFERS)
|
|
58
|
+
DELETE FROM versions
|
|
59
|
+
WHERE id IN (
|
|
60
|
+
SELECT id FROM versions
|
|
61
|
+
WHERE created_at >= '2026-01-01' AND created_at < '2026-02-01'
|
|
62
|
+
LIMIT 5000
|
|
63
|
+
);
|
|
64
|
+
|
|
65
|
+
-- Sesiones activas sobre la tabla
|
|
66
|
+
SELECT pid, state, wait_event, query_start, query
|
|
67
|
+
FROM pg_stat_activity
|
|
68
|
+
WHERE query LIKE '%versions%'
|
|
69
|
+
AND state != 'idle';
|
|
70
|
+
|
|
71
|
+
-- Estadísticas de la tabla
|
|
72
|
+
SELECT relname, n_live_tup, n_dead_tup, last_vacuum, last_autovacuum
|
|
73
|
+
FROM pg_stat_user_tables
|
|
74
|
+
WHERE relname = 'versions';
|
|
75
|
+
|
|
76
|
+
-- Top queries lentas (requiere pg_stat_statements)
|
|
77
|
+
SELECT substring(query, 1, 100) AS query, calls, mean_exec_time, rows
|
|
78
|
+
FROM pg_stat_statements
|
|
79
|
+
WHERE query LIKE '%versions%'
|
|
80
|
+
ORDER BY mean_exec_time DESC
|
|
81
|
+
LIMIT 10;
|
|
82
|
+
|
|
83
|
+
## Particionamiento declarativo (tablas > 100GB)
|
|
84
|
+
|
|
85
|
+
Migrar a tabla particionada cambia DataDrain de "DELETE masivo throttled" a
|
|
86
|
+
"DROP PARTITION instantáneo".
|
|
87
|
+
|
|
88
|
+
### Setup
|
|
89
|
+
|
|
90
|
+
-- 1. Crear tabla particionada (vacía, misma estructura que versions)
|
|
91
|
+
CREATE TABLE versions_new (
|
|
92
|
+
id UUID PRIMARY KEY,
|
|
93
|
+
created_at TIMESTAMP NOT NULL,
|
|
94
|
+
... -- resto de columnas
|
|
95
|
+
) PARTITION BY RANGE (created_at);
|
|
96
|
+
|
|
97
|
+
-- 2. Crear partición por mes
|
|
98
|
+
CREATE TABLE versions_2026_03 PARTITION OF versions_new
|
|
99
|
+
FOR VALUES FROM ('2026-03-01') TO ('2026-04-01');
|
|
100
|
+
|
|
101
|
+
-- 3. Migrar datos (lotes, una partición por vez)
|
|
102
|
+
INSERT INTO versions_2026_03
|
|
103
|
+
SELECT * FROM versions
|
|
104
|
+
WHERE created_at >= '2026-03-01' AND created_at < '2026-04-01';
|
|
105
|
+
|
|
106
|
+
-- 4. Swap nombres (downtime mínimo)
|
|
107
|
+
BEGIN;
|
|
108
|
+
ALTER TABLE versions RENAME TO versions_old;
|
|
109
|
+
ALTER TABLE versions_new RENAME TO versions;
|
|
110
|
+
COMMIT;
|
|
111
|
+
|
|
112
|
+
### Beneficio para DataDrain
|
|
113
|
+
|
|
114
|
+
-- v0.2.x: DELETE en lotes, VACUUM después, horas en TB
|
|
115
|
+
DataDrain::Engine.new(...).call
|
|
116
|
+
|
|
117
|
+
-- Con particiones: DataDrain sigue funcionando pero si el rango
|
|
118
|
+
-- coincide con una partición, el operador puede hacer:
|
|
119
|
+
DROP TABLE versions_2026_03; -- instantáneo, sin bloat
|
|
120
|
+
|
|
121
|
+
DataDrain no detecta particiones automáticamente (futuro item). Hoy el
|
|
122
|
+
operador decide.
|
|
123
|
+
|
|
124
|
+
## Referencias
|
|
125
|
+
|
|
126
|
+
- Skill: `.agents/skills/postgresql-optimization/SKILL.md`
|
|
127
|
+
- PG docs: https://www.postgresql.org/docs/current/ddl-partitioning.html
|
|
128
|
+
- Item 5 roadmap (VACUUM automático): ../IMPROVEMENT_PLAN.md#item-5
|
|
129
|
+
- Item 11b roadmap (warning runtime): ../IMPROVEMENT_PLAN.md#item-11b
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_drain
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.1
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gabriel
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activemodel
|
|
@@ -100,6 +100,7 @@ files:
|
|
|
100
100
|
- docs/IMPROVEMENT_PLAN.md
|
|
101
101
|
- docs/execution/archive/v0.2.0.agente-review.md
|
|
102
102
|
- docs/execution/archive/v0.2.0.md
|
|
103
|
+
- docs/execution/v0.2.2.md
|
|
103
104
|
- docs/glue_pyspark_example.py
|
|
104
105
|
- lib/data_drain.rb
|
|
105
106
|
- lib/data_drain/configuration.rb
|
|
@@ -108,6 +109,7 @@ files:
|
|
|
108
109
|
- lib/data_drain/file_ingestor.rb
|
|
109
110
|
- lib/data_drain/glue_runner.rb
|
|
110
111
|
- lib/data_drain/observability.rb
|
|
112
|
+
- lib/data_drain/observability/timing.rb
|
|
111
113
|
- lib/data_drain/record.rb
|
|
112
114
|
- lib/data_drain/storage.rb
|
|
113
115
|
- lib/data_drain/storage/base.rb
|
|
@@ -121,6 +123,7 @@ files:
|
|
|
121
123
|
- skill/references/antipatrones.md
|
|
122
124
|
- skill/references/api-detallada.md
|
|
123
125
|
- skill/references/eventos-telemetria.md
|
|
126
|
+
- skill/references/postgres-tuning.md
|
|
124
127
|
homepage: https://github.com/gedera/data_drain
|
|
125
128
|
licenses: []
|
|
126
129
|
metadata: {}
|