data_drain 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c92c85e6232344565dc090539d3d58aa47904ca1e454b696ba4eba12e2648881
4
- data.tar.gz: 1b3332ac50288dfd6793aed0c51ab8fa49d8dc309f7bc8dd4a642ecffd97e3cd
3
+ metadata.gz: 414600ce1230908cb1eef7e092ebf9287774ddbe4985286d8aa83995d0e47d4b
4
+ data.tar.gz: d300b31686ccf09320abc070a018566510e3e9a2d8488cb2bc83209dc56a3b21
5
5
  SHA512:
6
- metadata.gz: d0cecb3d168ad96943b9cc70eb936e9b95e92d03dc0bd08ed5996ac1967ef627191802892097abf3b094d5b59515c150ca8edc4ffcefe13be8b4a2d7721a180a
7
- data.tar.gz: 45018ca7e4287bf055cb7060e5720f83da318bd7c9ad44a30d8e832972feb1eb99d29286a9813128d4f84171d4c032e12f9a4f4dd7cc0022b36e44a3098d91c8
6
+ metadata.gz: f4a7177e6d412995216397de87e9e93806815c395dea206fed75b541f3dafb208a34ea2c1cb35d6226f0dd4d4bf118c9ba13b0e8dc522d73be16ab59487fc7c9
7
+ data.tar.gz: 40dd834ad6af6d0c291b35a4ddbd259ef635400b31f92852d50e9565ff9f14ce880c3d0b4ad04f8a1e924888e92327338356776d065bd07dabec5d450ea0d440
data/CHANGELOG.md CHANGED
@@ -1,5 +1,18 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.5.2] - 2026-04-16
4
+
5
+ ### Correcciones
6
+
7
+ - `Record#where()` ahora usa wildcards (`key=*`) para partition keys no especificadas, en lugar de valores vacíos (`key=`). Consistente con `destroy_partitions`. Fixes #1.
8
+
9
+ ## [0.5.1] - 2026-04-15
10
+
11
+ ### Docs
12
+
13
+ - `skill/references/eventos-telemetria.md`: nuevos eventos `script_uploaded` y `script_upload_error`.
14
+ - `README.md`: ejemplos de `script_path` en GlueRunner y observabilidad.
15
+
3
16
  ## [0.5.0] - 2026-04-15
4
17
 
5
18
  ### Features
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # DataDrain
2
2
 
3
- [![CI](https://github.com/gedera/data_drain/actions/workflows/main.yml/badge.svg)](https://github.com/gedera/data_drain/actions/workflows/main.yml)
3
+ [![CI](https://github.com/sequre/data_drain/actions/workflows/main.yml/badge.svg)](https://github.com/sequre/data_drain/actions/workflows/main.yml)
4
4
 
5
5
  Micro-framework Ruby para extraer, archivar y purgar datos históricos de PostgreSQL hacia un Data Lake (S3 o disco local) en formato Parquet, usando DuckDB en memoria.
6
6
 
@@ -18,7 +18,7 @@ Micro-framework Ruby para extraer, archivar y purgar datos históricos de Postgr
18
18
 
19
19
  ```ruby
20
20
  # Gemfile
21
- gem 'data_drain', git: 'https://github.com/gedera/data_drain.git', branch: 'main'
21
+ gem 'data_drain', git: 'https://github.com/sequre/data_drain.git', branch: 'main'
22
22
  ```
23
23
 
24
24
  ```bash
@@ -115,11 +115,13 @@ DataDrain::GlueRunner.job_exists?("my-glue-export-job")
115
115
  job = DataDrain::GlueRunner.get_job("my-glue-export-job")
116
116
  # => Aws::Glue::Types::Job (Name, Command, DefaultArguments, etc.)
117
117
 
118
- # Crear un job
118
+ # Crear un job con script local (v0.5.0+)
119
119
  job = DataDrain::GlueRunner.create_job(
120
120
  "my-glue-export-job",
121
121
  role_arn: "arn:aws:iam::123:role/GlueServiceRole",
122
- script_location: "s3://my-bucket/scripts/export.py",
122
+ script_path: "scripts/glue/export.py", # local → S3 automático
123
+ script_bucket: "my-bucket",
124
+ script_folder: "scripts",
123
125
  default_arguments: { "--extra-files" => "s3://my-bucket/scripts/udf.py" },
124
126
  timeout: 1440,
125
127
  max_retries: 2
@@ -129,8 +131,9 @@ job = DataDrain::GlueRunner.create_job(
129
131
  job = DataDrain::GlueRunner.ensure_job(
130
132
  "my-glue-export-job",
131
133
  role_arn: "arn:aws:iam::123:role/GlueServiceRole",
132
- script_location: "s3://my-bucket/scripts/export.py",
133
- timeout: 1440
134
+ script_path: "scripts/glue/export.py",
135
+ script_bucket: "my-bucket",
136
+ script_folder: "scripts"
134
137
  )
135
138
 
136
139
  # Eliminar un job
@@ -198,6 +201,7 @@ ArchivedVersion.destroy_all(year: 2024, month: 3) # un mes globalmente
198
201
  ```
199
202
  component=data_drain event=engine.complete table=versions duration_s=12.4 export_duration_s=8.1 purge_duration_s=3.9 count=150000
200
203
  component=data_drain event=engine.purge_heartbeat table=versions batches_processed_count=100 rows_deleted_count=500000
204
+ component=data_drain event=glue_runner.script_uploaded local_path=scripts/glue/export.py s3_path=s3://my-bucket/scripts/export.py bytes=4521
201
205
  component=data_drain event=glue_runner.failed job=my-export-job run_id=jr_abc123 status=FAILED duration_s=301.0
202
206
  ```
203
207
 
data/data_drain.gemspec CHANGED
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
11
11
  spec.summary = "Micro-framework para drenar datos de PostgreSQL a Parquet vía DuckDB."
12
12
  spec.description = "Extrae datos transaccionales, los archiva en un Data Lake (S3/Local) " \
13
13
  "en formato Parquet usando Hive Partitioning, y purga el origen de forma segura."
14
- spec.homepage = "https://github.com/gedera/data_drain"
14
+ spec.homepage = "https://github.com/sequre/data_drain"
15
15
  spec.required_ruby_version = ">= 3.2"
16
16
 
17
17
  spec.files = Dir.chdir(__dir__) do
@@ -131,7 +131,10 @@ module DataDrain
131
131
  # @param partitions [Hash]
132
132
  # @return [String]
133
133
  def build_query_path(partitions)
134
- partition_path = partition_keys.map { |k| "#{k}=#{partitions[k.to_sym] || partitions[k.to_s]}" }.join("/")
134
+ partition_path = partition_keys.map do |k|
135
+ val = partitions.key?(k.to_sym) ? partitions[k.to_sym] : partitions[k.to_s]
136
+ val.nil? || val.to_s.empty? ? "#{k}=*" : "#{k}=#{val}"
137
+ end.join("/")
135
138
  DataDrain::Storage.adapter.build_path(bucket, folder_name, partition_path)
136
139
  end
137
140
 
@@ -2,5 +2,5 @@
2
2
 
3
3
  module DataDrain
4
4
  # @return [String] versión semver de la gema
5
- VERSION = "0.5.0"
5
+ VERSION = "0.5.2"
6
6
  end
data/skill/SKILL.md CHANGED
@@ -70,7 +70,7 @@ DataDrain resuelve el ciclo de vida de datos históricos en bases relacionales c
70
70
 
71
71
  - Ruby `>= 3.2.0`
72
72
  - Runtime: `activemodel >= 6.0`, `duckdb ~> 1.4`, `pg >= 1.2`, `aws-sdk-s3 ~> 1.114`, `aws-sdk-glue ~> 1.0`
73
- - Versión actual: `0.5.0`
73
+ - Versión actual: `0.5.2`
74
74
 
75
75
  ## API Pública (resumen)
76
76
 
@@ -271,7 +271,7 @@ Catálogo completo en [Antipatrones](references/antipatrones.md). Resumen de los
271
271
  ## Referencias
272
272
 
273
273
  - [API Detallada](references/api-detallada.md) — Firmas completas, parámetros, retornos y comportamientos de cada clase pública.
274
- - [Glue Jobs Lifecycle](https://github.com/gedera/data_drain/blob/main/docs/glue-jobs-lifecycle.md) — Guía completa de gestión de AWS Glue Jobs: crear, actualizar, eliminar, verificar y ejecutar jobs idempotentemente.
274
+ - [Glue Jobs Lifecycle](https://github.com/sequre/data_drain/blob/main/docs/glue-jobs-lifecycle.md) — Guía completa de gestión de AWS Glue Jobs: crear, actualizar, eliminar, verificar y ejecutar jobs idempotentemente.
275
275
  - [Eventos y Telemetría](references/eventos-telemetria.md) — Catálogo completo de eventos KV emitidos por la gema.
276
276
  - [Antipatrones](references/antipatrones.md) — Qué NO hacer y alternativas correctas.
277
277
  - [Postgres Tuning](references/postgres-tuning.md) — Índices, VACUUM, particionamiento y diagnóstico por tamaño de tabla.
@@ -115,6 +115,15 @@ Catálogo completo de eventos KV emitidos por DataDrain. Formato Wispro-Observab
115
115
  **Nivel:** INFO. Emite antes de `start_job_run`.
116
116
  **Campos:** `job`.
117
117
 
118
+ ### `glue_runner.script_uploaded`
119
+ **Nivel:** INFO. Emite tras subir un script a S3 (v0.5.0+).
120
+ **Campos:** `local_path`, `s3_path`, `bytes`.
121
+
122
+ ### `glue_runner.script_upload_error`
123
+ **Nivel:** ERROR. Emite si el upload a S3 falla (v0.5.0+).
124
+ **Campos:** `local_path`, `bucket`, `error_class`, `error_message`.
125
+ **Consecuencia:** propaga el `Aws::S3::Errors::ServiceError`.
126
+
118
127
  ### `glue_runner.job_exists`
119
128
  **Nivel:** INFO. Emite en `ensure_job` cuando el job ya existe y se actualiza.
120
129
  **Campos:** `job`.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_drain
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gabriel
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-15 00:00:00.000000000 Z
11
+ date: 2026-04-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -104,11 +104,11 @@ files:
104
104
  - docs/execution/archive/v0.3.0.md
105
105
  - docs/execution/archive/v0.3.1-OBSERVACIONES.md
106
106
  - docs/execution/archive/v0.3.1.md
107
+ - docs/execution/archives/v0.5.0-OBSERVACIONES.md
108
+ - docs/execution/archives/v0.5.0.md
107
109
  - docs/execution/v0.2.2.md
108
110
  - docs/execution/v0.4.0-OBSERVACIONES.md
109
111
  - docs/execution/v0.4.0.md
110
- - docs/execution/v0.5.0-OBSERVACIONES.md
111
- - docs/execution/v0.5.0.md
112
112
  - docs/glue-jobs-lifecycle.md
113
113
  - docs/glue_pyspark_example.py
114
114
  - lib/data_drain.rb
@@ -133,7 +133,7 @@ files:
133
133
  - skill/references/api-detallada.md
134
134
  - skill/references/eventos-telemetria.md
135
135
  - skill/references/postgres-tuning.md
136
- homepage: https://github.com/gedera/data_drain
136
+ homepage: https://github.com/sequre/data_drain
137
137
  licenses: []
138
138
  metadata: {}
139
139
  post_install_message:
File without changes