data_drain 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +40 -1
- data/CHANGELOG.md +38 -0
- data/CLAUDE.md +14 -0
- data/README.md +2 -0
- data/data_drain.gemspec +1 -1
- data/docs/IMPROVEMENT_PLAN.md +122 -21
- data/docs/execution/archive/v0.3.0-OBSERVACIONES.md +136 -0
- data/docs/execution/archive/v0.3.0.md +1111 -0
- data/docs/execution/archive/v0.3.1-OBSERVACIONES.md +146 -0
- data/docs/execution/archive/v0.3.1.md +842 -0
- data/lib/data_drain/engine.rb +1 -0
- data/lib/data_drain/observability.rb +2 -0
- data/lib/data_drain/storage/base.rb +12 -0
- data/lib/data_drain/storage/local.rb +1 -3
- data/lib/data_drain/storage/s3.rb +5 -3
- data/lib/data_drain/types/json_type.rb +1 -0
- data/lib/data_drain/validations.rb +2 -0
- data/lib/data_drain/version.rb +2 -1
- data/lib/data_drain.rb +1 -0
- data/skill/references/antipatrones.md +10 -0
- data/skill/references/postgres-tuning.md +14 -0
- metadata +6 -2
data/lib/data_drain/engine.rb
CHANGED
|
@@ -7,6 +7,8 @@ module DataDrain
|
|
|
7
7
|
# Este módulo es genérico y puede ser utilizado en otras gemas.
|
|
8
8
|
# @api private
|
|
9
9
|
module Observability
|
|
10
|
+
# Regex para detectar claves sensibles en logs y enmascararlas preventivamente.
|
|
11
|
+
# @!visibility private
|
|
10
12
|
SENSITIVE_KEY_PATTERN = /password|passwd|pass|secret|token|api_key|apikey|auth|credential|private_key/i
|
|
11
13
|
|
|
12
14
|
private
|
|
@@ -54,6 +54,18 @@ module DataDrain
|
|
|
54
54
|
def destroy_partitions(bucket, folder_name, partition_keys, partitions)
|
|
55
55
|
raise NotImplementedError, "#{self.class} debe implementar #destroy_partitions"
|
|
56
56
|
end
|
|
57
|
+
|
|
58
|
+
protected
|
|
59
|
+
|
|
60
|
+
# @param bucket [String]
|
|
61
|
+
# @param folder_name [String]
|
|
62
|
+
# @param partition_path [String, nil]
|
|
63
|
+
# @return [String] path sin prefix de protocolo ni sufijo glob
|
|
64
|
+
def build_path_base(bucket, folder_name, partition_path)
|
|
65
|
+
base = File.join(bucket, folder_name)
|
|
66
|
+
base = File.join(base, partition_path) if partition_path && !partition_path.empty?
|
|
67
|
+
base
|
|
68
|
+
end
|
|
57
69
|
end
|
|
58
70
|
end
|
|
59
71
|
end
|
|
@@ -24,9 +24,7 @@ module DataDrain
|
|
|
24
24
|
# @param partition_path [String, nil]
|
|
25
25
|
# @return [String]
|
|
26
26
|
def build_path(bucket, folder_name, partition_path)
|
|
27
|
-
base = File.join(bucket, folder_name)
|
|
28
|
-
base = File.join(base, partition_path) if partition_path && !partition_path.empty?
|
|
29
|
-
"#{base}/**/*.parquet"
|
|
27
|
+
"#{build_path_base(bucket, folder_name, partition_path)}/**/*.parquet"
|
|
30
28
|
end
|
|
31
29
|
|
|
32
30
|
# @param bucket [String]
|
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "aws-sdk-s3"
|
|
4
|
+
|
|
3
5
|
module DataDrain
|
|
4
6
|
module Storage
|
|
7
|
+
# Adaptador de almacenamiento para Amazon S3.
|
|
8
|
+
# Configura credenciales en DuckDB y provee destrucción de particiones vía AWS SDK.
|
|
5
9
|
class S3 < Base
|
|
6
10
|
# Carga la extensión httpfs en DuckDB e inyecta las credenciales de AWS.
|
|
7
11
|
# Si aws_access_key_id y aws_secret_access_key están seteados, usa
|
|
@@ -19,9 +23,7 @@ module DataDrain
|
|
|
19
23
|
# @param partition_path [String, nil]
|
|
20
24
|
# @return [String]
|
|
21
25
|
def build_path(bucket, folder_name, partition_path)
|
|
22
|
-
base = File.join(bucket, folder_name)
|
|
23
|
-
base = File.join(base, partition_path) if partition_path && !partition_path.empty?
|
|
24
|
-
"s3://#{base}/**/*.parquet"
|
|
26
|
+
"s3://#{build_path_base(bucket, folder_name, partition_path)}/**/*.parquet"
|
|
25
27
|
end
|
|
26
28
|
|
|
27
29
|
# @param bucket [String]
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
module DataDrain
|
|
4
4
|
# Módulo de validación de configuración para prevenir errores de uso.
|
|
5
5
|
module Validations
|
|
6
|
+
# Regex que valida identificadores SQL (tablas, columnas, etc.).
|
|
7
|
+
# Permite letras, guiones bajos y números (no al inicio).
|
|
6
8
|
IDENTIFIER_REGEX = /\A[a-zA-Z_][a-zA-Z0-9_]*\z/
|
|
7
9
|
|
|
8
10
|
module_function
|
data/lib/data_drain/version.rb
CHANGED
data/lib/data_drain.rb
CHANGED
|
@@ -195,6 +195,16 @@ logger.debug("query=#{expensive_serialize(obj)}") # Siempre evalúa, incluso si
|
|
|
195
195
|
logger.debug { "query=#{expensive_serialize(obj)}" }
|
|
196
196
|
```
|
|
197
197
|
|
|
198
|
+
**Ejemplo real en DataDrain:**
|
|
199
|
+
|
|
200
|
+
```ruby
|
|
201
|
+
# Incorrecto — el query puede tener MB de datos en partition_by, se serializa siempre:
|
|
202
|
+
logger.debug("export_query=#{query}")
|
|
203
|
+
|
|
204
|
+
# Correcto — solo se serializa si DEBUG está activo:
|
|
205
|
+
logger.debug { "export_query=#{query}" }
|
|
206
|
+
```
|
|
207
|
+
|
|
198
208
|
---
|
|
199
209
|
|
|
200
210
|
## 12. Asumir que `Record.connection` se puede cerrar manualmente
|
|
@@ -121,6 +121,20 @@ Migrar a tabla particionada cambia DataDrain de "DELETE masivo throttled" a
|
|
|
121
121
|
DataDrain no detecta particiones automáticamente (futuro item). Hoy el
|
|
122
122
|
operador decide.
|
|
123
123
|
|
|
124
|
+
## Tuning de parámetros DataDrain por tamaño
|
|
125
|
+
|
|
126
|
+
| Filas tabla | `batch_size` | `throttle_delay` | `vacuum_after_purge` | `slow_batch_threshold_s` |
|
|
127
|
+
|------------|-------------|-----------------|---------------------|-------------------------|
|
|
128
|
+
| <1M | 5000 | 0.1 | false | 30 |
|
|
129
|
+
| 1M-100M | 5000 | 0.5 | true | 30 |
|
|
130
|
+
| 100M-1B | 10000 | 1.0 | true | 60 |
|
|
131
|
+
| >1B | migrar a particionamiento (ver arriba) | | | |
|
|
132
|
+
|
|
133
|
+
Contexto operacional:
|
|
134
|
+
- **OLTP concurrente**: `throttle_delay` alto (≥0.5s) para no saturar la DB.
|
|
135
|
+
- **Tablas frías** (sin queries de usuarios): `throttle_delay` 0 OK.
|
|
136
|
+
- **`slow_batch_threshold_s`** alto en tablas grandes porque cada batch tarda más legítimamente.
|
|
137
|
+
|
|
124
138
|
## Referencias
|
|
125
139
|
|
|
126
140
|
- Skill: `.agents/skills/postgresql-optimization/SKILL.md`
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_drain
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.0
|
|
4
|
+
version: 0.3.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gabriel
|
|
@@ -100,6 +100,10 @@ files:
|
|
|
100
100
|
- docs/IMPROVEMENT_PLAN.md
|
|
101
101
|
- docs/execution/archive/v0.2.0.agente-review.md
|
|
102
102
|
- docs/execution/archive/v0.2.0.md
|
|
103
|
+
- docs/execution/archive/v0.3.0-OBSERVACIONES.md
|
|
104
|
+
- docs/execution/archive/v0.3.0.md
|
|
105
|
+
- docs/execution/archive/v0.3.1-OBSERVACIONES.md
|
|
106
|
+
- docs/execution/archive/v0.3.1.md
|
|
103
107
|
- docs/execution/v0.2.2.md
|
|
104
108
|
- docs/glue_pyspark_example.py
|
|
105
109
|
- lib/data_drain.rb
|
|
@@ -135,7 +139,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
135
139
|
requirements:
|
|
136
140
|
- - ">="
|
|
137
141
|
- !ruby/object:Gem::Version
|
|
138
|
-
version: 3.
|
|
142
|
+
version: '3.2'
|
|
139
143
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
140
144
|
requirements:
|
|
141
145
|
- - ">="
|