data_drain 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +40 -1
- data/CHANGELOG.md +55 -0
- data/CLAUDE.md +14 -0
- data/README.md +2 -0
- data/data_drain.gemspec +1 -1
- data/docs/IMPROVEMENT_PLAN.md +132 -26
- data/docs/execution/archive/v0.3.0-OBSERVACIONES.md +136 -0
- data/docs/execution/archive/v0.3.0.md +1111 -0
- data/docs/execution/v0.3.1-OBSERVACIONES.md +146 -0
- data/docs/execution/v0.3.1.md +842 -0
- data/lib/data_drain/configuration.rb +7 -1
- data/lib/data_drain/engine.rb +185 -74
- data/lib/data_drain/file_ingestor.rb +64 -47
- data/lib/data_drain/observability/timing.rb +23 -0
- data/lib/data_drain/observability.rb +2 -0
- data/lib/data_drain/record.rb +8 -15
- data/lib/data_drain/storage/base.rb +12 -0
- data/lib/data_drain/storage/local.rb +1 -3
- data/lib/data_drain/storage/s3.rb +35 -14
- data/lib/data_drain/types/json_type.rb +1 -0
- data/lib/data_drain/validations.rb +2 -0
- data/lib/data_drain/version.rb +2 -1
- data/lib/data_drain.rb +2 -0
- data/skill/references/antipatrones.md +10 -0
- data/skill/references/postgres-tuning.md +14 -0
- metadata +8 -3
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "aws-sdk-s3"
|
|
4
|
+
|
|
3
5
|
module DataDrain
|
|
4
6
|
module Storage
|
|
7
|
+
# Adaptador de almacenamiento para Amazon S3.
|
|
8
|
+
# Configura credenciales en DuckDB y provee destrucción de particiones vía AWS SDK.
|
|
5
9
|
class S3 < Base
|
|
6
|
-
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
7
|
-
|
|
8
10
|
# Carga la extensión httpfs en DuckDB e inyecta las credenciales de AWS.
|
|
9
11
|
# Si aws_access_key_id y aws_secret_access_key están seteados, usa
|
|
10
12
|
# credenciales explícitas. Si no, usa credential_chain (IAM role, env vars,
|
|
@@ -21,9 +23,7 @@ module DataDrain
|
|
|
21
23
|
# @param partition_path [String, nil]
|
|
22
24
|
# @return [String]
|
|
23
25
|
def build_path(bucket, folder_name, partition_path)
|
|
24
|
-
|
|
25
|
-
base = File.join(base, partition_path) if partition_path && !partition_path.empty?
|
|
26
|
-
"s3://#{base}/**/*.parquet"
|
|
26
|
+
"s3://#{build_path_base(bucket, folder_name, partition_path)}/**/*.parquet"
|
|
27
27
|
end
|
|
28
28
|
|
|
29
29
|
# @param bucket [String]
|
|
@@ -32,34 +32,56 @@ module DataDrain
|
|
|
32
32
|
# @param partitions [Hash]
|
|
33
33
|
# @return [Integer]
|
|
34
34
|
def destroy_partitions(bucket, folder_name, partition_keys, partitions)
|
|
35
|
-
client = Aws::S3::Client.new(
|
|
35
|
+
client = s3_client
|
|
36
|
+
prefix, pattern_regex = build_destroy_pattern(folder_name, partition_keys, partitions)
|
|
37
|
+
objects = collect_matching_objects(client, bucket, prefix, pattern_regex)
|
|
38
|
+
delete_in_batches(client, bucket, objects)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
private
|
|
42
|
+
|
|
43
|
+
# @return [Aws::S3::Client]
|
|
44
|
+
def s3_client
|
|
45
|
+
Aws::S3::Client.new(
|
|
36
46
|
region: @config.aws_region,
|
|
37
47
|
access_key_id: @config.aws_access_key_id,
|
|
38
48
|
secret_access_key: @config.aws_secret_access_key
|
|
39
49
|
)
|
|
50
|
+
end
|
|
40
51
|
|
|
52
|
+
# @param folder_name [String]
|
|
53
|
+
# @param partition_keys [Array<Symbol>]
|
|
54
|
+
# @param partitions [Hash]
|
|
55
|
+
# @return [Array(String, Regexp)] prefix y pattern_regex
|
|
56
|
+
def build_destroy_pattern(folder_name, partition_keys, partitions)
|
|
41
57
|
regex_parts = partition_keys.map do |key|
|
|
42
58
|
val = partitions[key]
|
|
43
59
|
val.nil? || val.to_s.empty? ? "#{key}=[^/]+" : "#{key}=#{val}"
|
|
44
60
|
end
|
|
45
|
-
|
|
61
|
+
pattern = Regexp.new("^#{folder_name}/#{regex_parts.join("/")}")
|
|
46
62
|
|
|
47
|
-
objects_to_delete = []
|
|
48
63
|
prefix = "#{folder_name}/"
|
|
49
64
|
first_key = partition_keys.first
|
|
50
65
|
prefix += "#{first_key}=#{partitions[first_key]}/" if partitions[first_key]
|
|
51
66
|
|
|
67
|
+
[prefix, pattern]
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# @param client [Aws::S3::Client]
|
|
71
|
+
# @param bucket [String]
|
|
72
|
+
# @param prefix [String]
|
|
73
|
+
# @param pattern_regex [Regexp]
|
|
74
|
+
# @return [Array<Hash>]
|
|
75
|
+
def collect_matching_objects(client, bucket, prefix, pattern_regex)
|
|
76
|
+
objects = []
|
|
52
77
|
client.list_objects_v2(bucket: bucket, prefix: prefix).each do |response|
|
|
53
78
|
response.contents.each do |obj|
|
|
54
|
-
|
|
79
|
+
objects << { key: obj.key } if obj.key.match?(pattern_regex)
|
|
55
80
|
end
|
|
56
81
|
end
|
|
57
|
-
|
|
58
|
-
delete_in_batches(client, bucket, objects_to_delete)
|
|
82
|
+
objects
|
|
59
83
|
end
|
|
60
84
|
|
|
61
|
-
private
|
|
62
|
-
|
|
63
85
|
# @param connection [DuckDB::Connection]
|
|
64
86
|
# @raise [DataDrain::ConfigurationError]
|
|
65
87
|
def create_s3_secret(connection)
|
|
@@ -107,6 +129,5 @@ module DataDrain
|
|
|
107
129
|
deleted_count
|
|
108
130
|
end
|
|
109
131
|
end
|
|
110
|
-
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
111
132
|
end
|
|
112
133
|
end
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
module DataDrain
|
|
4
4
|
# Módulo de validación de configuración para prevenir errores de uso.
|
|
5
5
|
module Validations
|
|
6
|
+
# Regex que valida identificadores SQL (tablas, columnas, etc.).
|
|
7
|
+
# Permite letras, guiones bajos y números (no al inicio).
|
|
6
8
|
IDENTIFIER_REGEX = /\A[a-zA-Z_][a-zA-Z0-9_]*\z/
|
|
7
9
|
|
|
8
10
|
module_function
|
data/lib/data_drain/version.rb
CHANGED
data/lib/data_drain.rb
CHANGED
|
@@ -7,6 +7,7 @@ require_relative "data_drain/configuration"
|
|
|
7
7
|
require_relative "data_drain/validations"
|
|
8
8
|
require_relative "data_drain/storage"
|
|
9
9
|
require_relative "data_drain/observability"
|
|
10
|
+
require_relative "data_drain/observability/timing"
|
|
10
11
|
require_relative "data_drain/engine"
|
|
11
12
|
require_relative "data_drain/record"
|
|
12
13
|
require_relative "data_drain/file_ingestor"
|
|
@@ -30,6 +31,7 @@ module DataDrain
|
|
|
30
31
|
end
|
|
31
32
|
|
|
32
33
|
# @api private
|
|
34
|
+
# @return [void]
|
|
33
35
|
def reset_configuration!
|
|
34
36
|
@configuration = Configuration.new
|
|
35
37
|
DataDrain::Storage.reset_adapter!
|
|
@@ -195,6 +195,16 @@ logger.debug("query=#{expensive_serialize(obj)}") # Siempre evalúa, incluso si
|
|
|
195
195
|
logger.debug { "query=#{expensive_serialize(obj)}" }
|
|
196
196
|
```
|
|
197
197
|
|
|
198
|
+
**Ejemplo real en DataDrain:**
|
|
199
|
+
|
|
200
|
+
```ruby
|
|
201
|
+
# Incorrecto — el query puede tener MB de datos en partition_by, se serializa siempre:
|
|
202
|
+
logger.debug("export_query=#{query}")
|
|
203
|
+
|
|
204
|
+
# Correcto — solo se serializa si DEBUG está activo:
|
|
205
|
+
logger.debug { "export_query=#{query}" }
|
|
206
|
+
```
|
|
207
|
+
|
|
198
208
|
---
|
|
199
209
|
|
|
200
210
|
## 12. Asumir que `Record.connection` se puede cerrar manualmente
|
|
@@ -121,6 +121,20 @@ Migrar a tabla particionada cambia DataDrain de "DELETE masivo throttled" a
|
|
|
121
121
|
DataDrain no detecta particiones automáticamente (futuro item). Hoy el
|
|
122
122
|
operador decide.
|
|
123
123
|
|
|
124
|
+
## Tuning de parámetros DataDrain por tamaño
|
|
125
|
+
|
|
126
|
+
| Filas tabla | `batch_size` | `throttle_delay` | `vacuum_after_purge` | `slow_batch_threshold_s` |
|
|
127
|
+
|------------|-------------|-----------------|---------------------|-------------------------|
|
|
128
|
+
| <1M | 5000 | 0.1 | false | 30 |
|
|
129
|
+
| 1M-100M | 5000 | 0.5 | true | 30 |
|
|
130
|
+
| 100M-1B | 10000 | 1.0 | true | 60 |
|
|
131
|
+
| >1B | migrar a particionamiento (ver arriba) | | | |
|
|
132
|
+
|
|
133
|
+
Contexto operacional:
|
|
134
|
+
- **OLTP concurrente**: `throttle_delay` alto (≥0.5s) para no saturar la DB.
|
|
135
|
+
- **Tablas frías** (sin queries de usuarios): `throttle_delay` 0 OK.
|
|
136
|
+
- **`slow_batch_threshold_s`** alto en tablas grandes porque cada batch tarda más legítimamente.
|
|
137
|
+
|
|
124
138
|
## Referencias
|
|
125
139
|
|
|
126
140
|
- Skill: `.agents/skills/postgresql-optimization/SKILL.md`
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_drain
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.2
|
|
4
|
+
version: 0.3.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gabriel
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activemodel
|
|
@@ -100,7 +100,11 @@ files:
|
|
|
100
100
|
- docs/IMPROVEMENT_PLAN.md
|
|
101
101
|
- docs/execution/archive/v0.2.0.agente-review.md
|
|
102
102
|
- docs/execution/archive/v0.2.0.md
|
|
103
|
+
- docs/execution/archive/v0.3.0-OBSERVACIONES.md
|
|
104
|
+
- docs/execution/archive/v0.3.0.md
|
|
103
105
|
- docs/execution/v0.2.2.md
|
|
106
|
+
- docs/execution/v0.3.1-OBSERVACIONES.md
|
|
107
|
+
- docs/execution/v0.3.1.md
|
|
104
108
|
- docs/glue_pyspark_example.py
|
|
105
109
|
- lib/data_drain.rb
|
|
106
110
|
- lib/data_drain/configuration.rb
|
|
@@ -109,6 +113,7 @@ files:
|
|
|
109
113
|
- lib/data_drain/file_ingestor.rb
|
|
110
114
|
- lib/data_drain/glue_runner.rb
|
|
111
115
|
- lib/data_drain/observability.rb
|
|
116
|
+
- lib/data_drain/observability/timing.rb
|
|
112
117
|
- lib/data_drain/record.rb
|
|
113
118
|
- lib/data_drain/storage.rb
|
|
114
119
|
- lib/data_drain/storage/base.rb
|
|
@@ -134,7 +139,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
134
139
|
requirements:
|
|
135
140
|
- - ">="
|
|
136
141
|
- !ruby/object:Gem::Version
|
|
137
|
-
version: 3.
|
|
142
|
+
version: '3.2'
|
|
138
143
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
139
144
|
requirements:
|
|
140
145
|
- - ">="
|