data_drain 0.2.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "aws-sdk-s3"
4
+
3
5
  module DataDrain
4
6
  module Storage
7
+ # Adaptador de almacenamiento para Amazon S3.
8
+ # Configura credenciales en DuckDB y provee destrucción de particiones vía AWS SDK.
5
9
  class S3 < Base
6
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
7
-
8
10
  # Carga la extensión httpfs en DuckDB e inyecta las credenciales de AWS.
9
11
  # Si aws_access_key_id y aws_secret_access_key están seteados, usa
10
12
  # credenciales explícitas. Si no, usa credential_chain (IAM role, env vars,
@@ -21,9 +23,7 @@ module DataDrain
21
23
  # @param partition_path [String, nil]
22
24
  # @return [String]
23
25
  def build_path(bucket, folder_name, partition_path)
24
- base = File.join(bucket, folder_name)
25
- base = File.join(base, partition_path) if partition_path && !partition_path.empty?
26
- "s3://#{base}/**/*.parquet"
26
+ "s3://#{build_path_base(bucket, folder_name, partition_path)}/**/*.parquet"
27
27
  end
28
28
 
29
29
  # @param bucket [String]
@@ -32,34 +32,56 @@ module DataDrain
32
32
  # @param partitions [Hash]
33
33
  # @return [Integer]
34
34
  def destroy_partitions(bucket, folder_name, partition_keys, partitions)
35
- client = Aws::S3::Client.new(
35
+ client = s3_client
36
+ prefix, pattern_regex = build_destroy_pattern(folder_name, partition_keys, partitions)
37
+ objects = collect_matching_objects(client, bucket, prefix, pattern_regex)
38
+ delete_in_batches(client, bucket, objects)
39
+ end
40
+
41
+ private
42
+
43
+ # @return [Aws::S3::Client]
44
+ def s3_client
45
+ Aws::S3::Client.new(
36
46
  region: @config.aws_region,
37
47
  access_key_id: @config.aws_access_key_id,
38
48
  secret_access_key: @config.aws_secret_access_key
39
49
  )
50
+ end
40
51
 
52
+ # @param folder_name [String]
53
+ # @param partition_keys [Array<Symbol>]
54
+ # @param partitions [Hash]
55
+ # @return [Array(String, Regexp)] prefix y pattern_regex
56
+ def build_destroy_pattern(folder_name, partition_keys, partitions)
41
57
  regex_parts = partition_keys.map do |key|
42
58
  val = partitions[key]
43
59
  val.nil? || val.to_s.empty? ? "#{key}=[^/]+" : "#{key}=#{val}"
44
60
  end
45
- pattern_regex = Regexp.new("^#{folder_name}/#{regex_parts.join("/")}")
61
+ pattern = Regexp.new("^#{folder_name}/#{regex_parts.join("/")}")
46
62
 
47
- objects_to_delete = []
48
63
  prefix = "#{folder_name}/"
49
64
  first_key = partition_keys.first
50
65
  prefix += "#{first_key}=#{partitions[first_key]}/" if partitions[first_key]
51
66
 
67
+ [prefix, pattern]
68
+ end
69
+
70
+ # @param client [Aws::S3::Client]
71
+ # @param bucket [String]
72
+ # @param prefix [String]
73
+ # @param pattern_regex [Regexp]
74
+ # @return [Array<Hash>]
75
+ def collect_matching_objects(client, bucket, prefix, pattern_regex)
76
+ objects = []
52
77
  client.list_objects_v2(bucket: bucket, prefix: prefix).each do |response|
53
78
  response.contents.each do |obj|
54
- objects_to_delete << { key: obj.key } if obj.key.match?(pattern_regex)
79
+ objects << { key: obj.key } if obj.key.match?(pattern_regex)
55
80
  end
56
81
  end
57
-
58
- delete_in_batches(client, bucket, objects_to_delete)
82
+ objects
59
83
  end
60
84
 
61
- private
62
-
63
85
  # @param connection [DuckDB::Connection]
64
86
  # @raise [DataDrain::ConfigurationError]
65
87
  def create_s3_secret(connection)
@@ -107,6 +129,5 @@ module DataDrain
107
129
  deleted_count
108
130
  end
109
131
  end
110
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
111
132
  end
112
133
  end
@@ -3,6 +3,7 @@
3
3
  require "json"
4
4
 
5
5
  module DataDrain
6
+ # Tipos personalizados para ActiveModel registrados por DataDrain.
6
7
  module Types
7
8
  # Tipo personalizado para ActiveModel que maneja la conversión de
8
9
  # cadenas JSON de DuckDB hacia Hashes de Ruby.
@@ -3,6 +3,8 @@
3
3
  module DataDrain
4
4
  # Módulo de validación de configuración para prevenir errores de uso.
5
5
  module Validations
6
+ # Regex que valida identificadores SQL (tablas, columnas, etc.).
7
+ # Permite letras, guiones bajos y números (los números no pueden ir al inicio).
6
8
  IDENTIFIER_REGEX = /\A[a-zA-Z_][a-zA-Z0-9_]*\z/
7
9
 
8
10
  module_function
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DataDrain
4
- VERSION = "0.2.2"
4
+ # @return [String] versión semver de la gema
5
+ VERSION = "0.3.1"
5
6
  end
data/lib/data_drain.rb CHANGED
@@ -7,6 +7,7 @@ require_relative "data_drain/configuration"
7
7
  require_relative "data_drain/validations"
8
8
  require_relative "data_drain/storage"
9
9
  require_relative "data_drain/observability"
10
+ require_relative "data_drain/observability/timing"
10
11
  require_relative "data_drain/engine"
11
12
  require_relative "data_drain/record"
12
13
  require_relative "data_drain/file_ingestor"
@@ -30,6 +31,7 @@ module DataDrain
30
31
  end
31
32
 
32
33
  # @api private
34
+ # @return [void]
33
35
  def reset_configuration!
34
36
  @configuration = Configuration.new
35
37
  DataDrain::Storage.reset_adapter!
@@ -195,6 +195,16 @@ logger.debug("query=#{expensive_serialize(obj)}") # Siempre evalúa, incluso si
195
195
  logger.debug { "query=#{expensive_serialize(obj)}" }
196
196
  ```
197
197
 
198
+ **Ejemplo real en DataDrain:**
199
+
200
+ ```ruby
201
+ # Incorrecto — el query puede tener MB de datos en partition_by y se serializa siempre, aunque DEBUG esté desactivado:
202
+ logger.debug("export_query=#{query}")
203
+
204
+ # Correcto — solo se serializa si DEBUG está activo:
205
+ logger.debug { "export_query=#{query}" }
206
+ ```
207
+
198
208
  ---
199
209
 
200
210
  ## 12. Asumir que `Record.connection` se puede cerrar manualmente
@@ -121,6 +121,20 @@ Migrar a tabla particionada cambia DataDrain de "DELETE masivo throttled" a
121
121
  DataDrain no detecta particiones automáticamente (futuro item). Hoy el
122
122
  operador decide.
123
123
 
124
+ ## Tuning de parámetros DataDrain por tamaño
125
+
126
+ | Filas tabla | `batch_size` | `throttle_delay` | `vacuum_after_purge` | `slow_batch_threshold_s` |
127
+ |------------|-------------|-----------------|---------------------|-------------------------|
128
+ | <1M | 5000 | 0.1 | false | 30 |
129
+ | 1M-100M | 5000 | 0.5 | true | 30 |
130
+ | 100M-1B | 10000 | 1.0 | true | 60 |
131
+ | >1B | migrar a particionamiento (ver arriba) | | | |
132
+
133
+ Contexto operacional:
134
+ - **OLTP concurrente**: `throttle_delay` alto (≥0.5s) para no saturar la DB.
135
+ - **Tablas frías** (sin queries de usuarios): `throttle_delay` 0 OK.
136
+ - **`slow_batch_threshold_s`** alto en tablas grandes porque cada batch tarda más legítimamente.
137
+
124
138
  ## Referencias
125
139
 
126
140
  - Skill: `.agents/skills/postgresql-optimization/SKILL.md`
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_drain
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gabriel
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-14 00:00:00.000000000 Z
11
+ date: 2026-04-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -100,7 +100,11 @@ files:
100
100
  - docs/IMPROVEMENT_PLAN.md
101
101
  - docs/execution/archive/v0.2.0.agente-review.md
102
102
  - docs/execution/archive/v0.2.0.md
103
+ - docs/execution/archive/v0.3.0-OBSERVACIONES.md
104
+ - docs/execution/archive/v0.3.0.md
103
105
  - docs/execution/v0.2.2.md
106
+ - docs/execution/v0.3.1-OBSERVACIONES.md
107
+ - docs/execution/v0.3.1.md
104
108
  - docs/glue_pyspark_example.py
105
109
  - lib/data_drain.rb
106
110
  - lib/data_drain/configuration.rb
@@ -109,6 +113,7 @@ files:
109
113
  - lib/data_drain/file_ingestor.rb
110
114
  - lib/data_drain/glue_runner.rb
111
115
  - lib/data_drain/observability.rb
116
+ - lib/data_drain/observability/timing.rb
112
117
  - lib/data_drain/record.rb
113
118
  - lib/data_drain/storage.rb
114
119
  - lib/data_drain/storage/base.rb
@@ -134,7 +139,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
134
139
  requirements:
135
140
  - - ">="
136
141
  - !ruby/object:Gem::Version
137
- version: 3.0.0
142
+ version: '3.2'
138
143
  required_rubygems_version: !ruby/object:Gem::Requirement
139
144
  requirements:
140
145
  - - ">="