data_drain 0.5.0 → 0.5.2
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +10 -6
- data/data_drain.gemspec +1 -1
- data/lib/data_drain/record.rb +4 -1
- data/lib/data_drain/version.rb +1 -1
- data/skill/SKILL.md +2 -2
- data/skill/references/eventos-telemetria.md +9 -0
- metadata +5 -5
- /data/docs/execution/{v0.5.0-OBSERVACIONES.md → archives/v0.5.0-OBSERVACIONES.md} +0 -0
- /data/docs/execution/{v0.5.0.md → archives/v0.5.0.md} +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 414600ce1230908cb1eef7e092ebf9287774ddbe4985286d8aa83995d0e47d4b
|
|
4
|
+
data.tar.gz: d300b31686ccf09320abc070a018566510e3e9a2d8488cb2bc83209dc56a3b21
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f4a7177e6d412995216397de87e9e93806815c395dea206fed75b541f3dafb208a34ea2c1cb35d6226f0dd4d4bf118c9ba13b0e8dc522d73be16ab59487fc7c9
|
|
7
|
+
data.tar.gz: 40dd834ad6af6d0c291b35a4ddbd259ef635400b31f92852d50e9565ff9f14ce880c3d0b4ad04f8a1e924888e92327338356776d065bd07dabec5d450ea0d440
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.5.2] - 2026-04-16
|
|
4
|
+
|
|
5
|
+
### Correcciones
|
|
6
|
+
|
|
7
|
+
- `Record#where()` ahora usa wildcards (`key=*`) para partition keys no especificadas, en lugar de valores vacíos (`key=`). Consistente con `destroy_partitions`. Fixes #1.
|
|
8
|
+
|
|
9
|
+
## [0.5.1] - 2026-04-15
|
|
10
|
+
|
|
11
|
+
### Docs
|
|
12
|
+
|
|
13
|
+
- `skill/references/eventos-telemetria.md`: nuevos eventos `script_uploaded` y `script_upload_error`.
|
|
14
|
+
- `README.md`: ejemplos de `script_path` en GlueRunner y observabilidad.
|
|
15
|
+
|
|
3
16
|
## [0.5.0] - 2026-04-15
|
|
4
17
|
|
|
5
18
|
### Features
|
data/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# DataDrain
|
|
2
2
|
|
|
3
|
-
[](https://github.com/sequre/data_drain/actions/workflows/main.yml)
|
|
4
4
|
|
|
5
5
|
Micro-framework Ruby para extraer, archivar y purgar datos históricos de PostgreSQL hacia un Data Lake (S3 o disco local) en formato Parquet, usando DuckDB en memoria.
|
|
6
6
|
|
|
@@ -18,7 +18,7 @@ Micro-framework Ruby para extraer, archivar y purgar datos históricos de Postgr
|
|
|
18
18
|
|
|
19
19
|
```ruby
|
|
20
20
|
# Gemfile
|
|
21
|
-
gem 'data_drain', git: 'https://github.com/
|
|
21
|
+
gem 'data_drain', git: 'https://github.com/sequre/data_drain.git', branch: 'main'
|
|
22
22
|
```
|
|
23
23
|
|
|
24
24
|
```bash
|
|
@@ -115,11 +115,13 @@ DataDrain::GlueRunner.job_exists?("my-glue-export-job")
|
|
|
115
115
|
job = DataDrain::GlueRunner.get_job("my-glue-export-job")
|
|
116
116
|
# => Aws::Glue::Types::Job (Name, Command, DefaultArguments, etc.)
|
|
117
117
|
|
|
118
|
-
# Crear un job
|
|
118
|
+
# Crear un job con script local (v0.5.0+)
|
|
119
119
|
job = DataDrain::GlueRunner.create_job(
|
|
120
120
|
"my-glue-export-job",
|
|
121
121
|
role_arn: "arn:aws:iam::123:role/GlueServiceRole",
|
|
122
|
-
|
|
122
|
+
script_path: "scripts/glue/export.py", # local → S3 automático
|
|
123
|
+
script_bucket: "my-bucket",
|
|
124
|
+
script_folder: "scripts",
|
|
123
125
|
default_arguments: { "--extra-files" => "s3://my-bucket/scripts/udf.py" },
|
|
124
126
|
timeout: 1440,
|
|
125
127
|
max_retries: 2
|
|
@@ -129,8 +131,9 @@ job = DataDrain::GlueRunner.create_job(
|
|
|
129
131
|
job = DataDrain::GlueRunner.ensure_job(
|
|
130
132
|
"my-glue-export-job",
|
|
131
133
|
role_arn: "arn:aws:iam::123:role/GlueServiceRole",
|
|
132
|
-
|
|
133
|
-
|
|
134
|
+
script_path: "scripts/glue/export.py",
|
|
135
|
+
script_bucket: "my-bucket",
|
|
136
|
+
script_folder: "scripts"
|
|
134
137
|
)
|
|
135
138
|
|
|
136
139
|
# Eliminar un job
|
|
@@ -198,6 +201,7 @@ ArchivedVersion.destroy_all(year: 2024, month: 3) # un mes globalmente
|
|
|
198
201
|
```
|
|
199
202
|
component=data_drain event=engine.complete table=versions duration_s=12.4 export_duration_s=8.1 purge_duration_s=3.9 count=150000
|
|
200
203
|
component=data_drain event=engine.purge_heartbeat table=versions batches_processed_count=100 rows_deleted_count=500000
|
|
204
|
+
component=data_drain event=glue_runner.script_uploaded local_path=scripts/glue/export.py s3_path=s3://my-bucket/scripts/export.py bytes=4521
|
|
201
205
|
component=data_drain event=glue_runner.failed job=my-export-job run_id=jr_abc123 status=FAILED duration_s=301.0
|
|
202
206
|
```
|
|
203
207
|
|
data/data_drain.gemspec
CHANGED
|
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
|
|
|
11
11
|
spec.summary = "Micro-framework para drenar datos de PostgreSQL a Parquet vía DuckDB."
|
|
12
12
|
spec.description = "Extrae datos transaccionales, los archiva en un Data Lake (S3/Local) " \
|
|
13
13
|
"en formato Parquet usando Hive Partitioning, y purga el origen de forma segura."
|
|
14
|
-
spec.homepage = "https://github.com/
|
|
14
|
+
spec.homepage = "https://github.com/sequre/data_drain"
|
|
15
15
|
spec.required_ruby_version = ">= 3.2"
|
|
16
16
|
|
|
17
17
|
spec.files = Dir.chdir(__dir__) do
|
data/lib/data_drain/record.rb
CHANGED
|
@@ -131,7 +131,10 @@ module DataDrain
|
|
|
131
131
|
# @param partitions [Hash]
|
|
132
132
|
# @return [String]
|
|
133
133
|
def build_query_path(partitions)
|
|
134
|
-
partition_path = partition_keys.map
|
|
134
|
+
partition_path = partition_keys.map do |k|
|
|
135
|
+
val = partitions.key?(k.to_sym) ? partitions[k.to_sym] : partitions[k.to_s]
|
|
136
|
+
val.nil? || val.to_s.empty? ? "#{k}=*" : "#{k}=#{val}"
|
|
137
|
+
end.join("/")
|
|
135
138
|
DataDrain::Storage.adapter.build_path(bucket, folder_name, partition_path)
|
|
136
139
|
end
|
|
137
140
|
|
data/lib/data_drain/version.rb
CHANGED
data/skill/SKILL.md
CHANGED
|
@@ -70,7 +70,7 @@ DataDrain resuelve el ciclo de vida de datos históricos en bases relacionales c
|
|
|
70
70
|
|
|
71
71
|
- Ruby `>= 3.2.0`
|
|
72
72
|
- Runtime: `activemodel >= 6.0`, `duckdb ~> 1.4`, `pg >= 1.2`, `aws-sdk-s3 ~> 1.114`, `aws-sdk-glue ~> 1.0`
|
|
73
|
-
- Versión actual: `0.5.
|
|
73
|
+
- Versión actual: `0.5.1`
|
|
74
74
|
|
|
75
75
|
## API Pública (resumen)
|
|
76
76
|
|
|
@@ -271,7 +271,7 @@ Catálogo completo en [Antipatrones](references/antipatrones.md). Resumen de los
|
|
|
271
271
|
## Referencias
|
|
272
272
|
|
|
273
273
|
- [API Detallada](references/api-detallada.md) — Firmas completas, parámetros, retornos y comportamientos de cada clase pública.
|
|
274
|
-
- [Glue Jobs Lifecycle](https://github.com/
|
|
274
|
+
- [Glue Jobs Lifecycle](https://github.com/sequre/data_drain/blob/main/docs/glue-jobs-lifecycle.md) — Guía completa de gestión de AWS Glue Jobs: crear, actualizar, eliminar, verificar y ejecutar jobs idempotentemente.
|
|
275
275
|
- [Eventos y Telemetría](references/eventos-telemetria.md) — Catálogo completo de eventos KV emitidos por la gema.
|
|
276
276
|
- [Antipatrones](references/antipatrones.md) — Qué NO hacer y alternativas correctas.
|
|
277
277
|
- [Postgres Tuning](references/postgres-tuning.md) — Índices, VACUUM, particionamiento y diagnóstico por tamaño de tabla.
|
|
@@ -115,6 +115,15 @@ Catálogo completo de eventos KV emitidos por DataDrain. Formato Wispro-Observab
|
|
|
115
115
|
**Nivel:** INFO. Emite antes de `start_job_run`.
|
|
116
116
|
**Campos:** `job`.
|
|
117
117
|
|
|
118
|
+
### `glue_runner.script_uploaded`
|
|
119
|
+
**Nivel:** INFO. Emite tras subir un script a S3 (v0.5.0+).
|
|
120
|
+
**Campos:** `local_path`, `s3_path`, `bytes`.
|
|
121
|
+
|
|
122
|
+
### `glue_runner.script_upload_error`
|
|
123
|
+
**Nivel:** ERROR. Emite si el upload a S3 falla (v0.5.0+).
|
|
124
|
+
**Campos:** `local_path`, `bucket`, `error_class`, `error_message`.
|
|
125
|
+
**Consecuencia:** propaga el `Aws::S3::Errors::ServiceError`.
|
|
126
|
+
|
|
118
127
|
### `glue_runner.job_exists`
|
|
119
128
|
**Nivel:** INFO. Emite en `ensure_job` cuando el job ya existe y se actualiza.
|
|
120
129
|
**Campos:** `job`.
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_drain
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.
|
|
4
|
+
version: 0.5.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gabriel
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activemodel
|
|
@@ -104,11 +104,11 @@ files:
|
|
|
104
104
|
- docs/execution/archive/v0.3.0.md
|
|
105
105
|
- docs/execution/archive/v0.3.1-OBSERVACIONES.md
|
|
106
106
|
- docs/execution/archive/v0.3.1.md
|
|
107
|
+
- docs/execution/archives/v0.5.0-OBSERVACIONES.md
|
|
108
|
+
- docs/execution/archives/v0.5.0.md
|
|
107
109
|
- docs/execution/v0.2.2.md
|
|
108
110
|
- docs/execution/v0.4.0-OBSERVACIONES.md
|
|
109
111
|
- docs/execution/v0.4.0.md
|
|
110
|
-
- docs/execution/v0.5.0-OBSERVACIONES.md
|
|
111
|
-
- docs/execution/v0.5.0.md
|
|
112
112
|
- docs/glue-jobs-lifecycle.md
|
|
113
113
|
- docs/glue_pyspark_example.py
|
|
114
114
|
- lib/data_drain.rb
|
|
@@ -133,7 +133,7 @@ files:
|
|
|
133
133
|
- skill/references/api-detallada.md
|
|
134
134
|
- skill/references/eventos-telemetria.md
|
|
135
135
|
- skill/references/postgres-tuning.md
|
|
136
|
-
homepage: https://github.com/
|
|
136
|
+
homepage: https://github.com/sequre/data_drain
|
|
137
137
|
licenses: []
|
|
138
138
|
metadata: {}
|
|
139
139
|
post_install_message:
|
|
File without changes
|
|
File without changes
|