data_drain 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -50,6 +50,7 @@ module DataDrain
50
50
  @duckdb = database.connect
51
51
  end
52
52
 
53
+ # @return [Boolean] true si el flujo completó exitosamente, false si falló
53
54
  def call
54
55
  @durations = {}
55
56
  start_time = monotonic
@@ -158,7 +159,7 @@ module DataDrain
158
159
  # @api private
159
160
  # @return [Integer]
160
161
  def get_postgres_count
161
- pg_sql = "SELECT COUNT(*) AS row_count FROM public.#{@table_name} WHERE #{base_where_sql}"
162
+ # This statement is executed by PostgreSQL (via postgres_query), not DuckDB: PostgreSQL requires COUNT(*) and rejects count().
+ pg_sql = "SELECT COUNT(*) AS row_count FROM public.#{@table_name} WHERE #{base_where_sql}"
162
163
  pg_sql = pg_sql.gsub("'", "''")
163
164
  query = "SELECT row_count FROM postgres_query('pg_source', '#{pg_sql}')"
164
165
  @duckdb.query(query).first.first
@@ -203,7 +204,7 @@ module DataDrain
203
204
 
204
205
  begin
205
206
  query = <<~SQL
206
- SELECT COUNT(*)
207
+ SELECT count()
207
208
  FROM read_parquet('#{archive_path}')
208
209
  WHERE #{base_where_sql}
209
210
  SQL
@@ -82,7 +82,7 @@ module DataDrain
82
82
 
83
83
  # @api private
84
84
  def step_count_source
85
- source_count = timed(:source_query) { @duckdb.query("SELECT COUNT(*) FROM #{@reader_function}").first.first }
85
+ source_count = timed(:source_query) { @duckdb.query("SELECT count() FROM #{@reader_function}").first.first }
86
86
  safe_log(:info, "file_ingestor.count", {
87
87
  source_path: @source_path,
88
88
  count: source_count,
@@ -7,6 +7,8 @@ module DataDrain
7
7
  # Este módulo es genérico y puede ser utilizado en otras gemas.
8
8
  # @api private
9
9
  module Observability
10
+ # Regex para detectar claves sensibles en logs y enmascararlas preventivamente.
11
+ # @!visibility private
10
12
  SENSITIVE_KEY_PATTERN = /password|passwd|pass|secret|token|api_key|apikey|auth|credential|private_key/i
11
13
 
12
14
  private
@@ -54,6 +54,18 @@ module DataDrain
54
54
  def destroy_partitions(bucket, folder_name, partition_keys, partitions)
55
55
  raise NotImplementedError, "#{self.class} debe implementar #destroy_partitions"
56
56
  end
57
+
58
+ protected
59
+
60
+ # @param bucket [String]
61
+ # @param folder_name [String]
62
+ # @param partition_path [String, nil]
63
+ # @return [String] path sin prefix de protocolo ni sufijo glob
64
+ def build_path_base(bucket, folder_name, partition_path)
65
+ base = File.join(bucket, folder_name)
66
+ base = File.join(base, partition_path) if partition_path && !partition_path.empty?
67
+ base
68
+ end
57
69
  end
58
70
  end
59
71
  end
@@ -24,9 +24,7 @@ module DataDrain
24
24
  # @param partition_path [String, nil]
25
25
  # @return [String]
26
26
  def build_path(bucket, folder_name, partition_path)
27
- base = File.join(bucket, folder_name)
28
- base = File.join(base, partition_path) if partition_path && !partition_path.empty?
29
- "#{base}/**/*.parquet"
27
+ "#{build_path_base(bucket, folder_name, partition_path)}/**/*.parquet"
30
28
  end
31
29
 
32
30
  # @param bucket [String]
@@ -1,7 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "aws-sdk-s3"
4
+
3
5
  module DataDrain
4
6
  module Storage
7
+ # Adaptador de almacenamiento para Amazon S3.
8
+ # Configura credenciales en DuckDB y provee destrucción de particiones vía AWS SDK.
5
9
  class S3 < Base
6
10
  # Carga la extensión httpfs en DuckDB e inyecta las credenciales de AWS.
7
11
  # Si aws_access_key_id y aws_secret_access_key están seteados, usa
@@ -19,9 +23,7 @@ module DataDrain
19
23
  # @param partition_path [String, nil]
20
24
  # @return [String]
21
25
  def build_path(bucket, folder_name, partition_path)
22
- base = File.join(bucket, folder_name)
23
- base = File.join(base, partition_path) if partition_path && !partition_path.empty?
24
- "s3://#{base}/**/*.parquet"
26
+ "s3://#{build_path_base(bucket, folder_name, partition_path)}/**/*.parquet"
25
27
  end
26
28
 
27
29
  # @param bucket [String]
@@ -3,6 +3,7 @@
3
3
  require "json"
4
4
 
5
5
  module DataDrain
6
+ # Tipos personalizados para ActiveModel registrados por DataDrain.
6
7
  module Types
7
8
  # Tipo personalizado para ActiveModel que maneja la conversión de
8
9
  # cadenas JSON de DuckDB hacia Hashes de Ruby.
@@ -3,6 +3,8 @@
3
3
  module DataDrain
4
4
  # Módulo de validación de configuración para prevenir errores de uso.
5
5
  module Validations
6
+ # Regex que valida identificadores SQL (tablas, columnas, etc.).
7
+ # Permite letras ASCII, dígitos y guiones bajos; el primer carácter no puede ser un dígito.
6
8
  IDENTIFIER_REGEX = /\A[a-zA-Z_][a-zA-Z0-9_]*\z/
7
9
 
8
10
  module_function
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DataDrain
4
- VERSION = "0.3.0"
4
+ # @return [String] versión semver de la gema
5
+ VERSION = "0.3.1"
5
6
  end
data/lib/data_drain.rb CHANGED
@@ -31,6 +31,7 @@ module DataDrain
31
31
  end
32
32
 
33
33
  # @api private
34
+ # @return [void]
34
35
  def reset_configuration!
35
36
  @configuration = Configuration.new
36
37
  DataDrain::Storage.reset_adapter!
@@ -195,6 +195,16 @@ logger.debug("query=#{expensive_serialize(obj)}") # Siempre evalúa, incluso si
195
195
  logger.debug { "query=#{expensive_serialize(obj)}" }
196
196
  ```
197
197
 
198
+ **Ejemplo real en DataDrain:**
199
+
200
+ ```ruby
201
+ # Incorrecto — el query puede tener MB de datos en partition_by, se serializa siempre:
202
+ logger.debug("export_query=#{query}")
203
+
204
+ # Correcto — solo se serializa si DEBUG está activo:
205
+ logger.debug { "export_query=#{query}" }
206
+ ```
207
+
198
208
  ---
199
209
 
200
210
  ## 12. Asumir que `Record.connection` se puede cerrar manualmente
@@ -121,6 +121,20 @@ Migrar a tabla particionada cambia DataDrain de "DELETE masivo throttled" a
121
121
  DataDrain no detecta particiones automáticamente (futuro item). Hoy el
122
122
  operador decide.
123
123
 
124
+ ## Tuning de parámetros DataDrain por tamaño
125
+
126
+ | Filas tabla | `batch_size` | `throttle_delay` | `vacuum_after_purge` | `slow_batch_threshold_s` |
127
+ |------------|-------------|-----------------|---------------------|-------------------------|
128
+ | <1M | 5000 | 0.1 | false | 30 |
129
+ | 1M-100M | 5000 | 0.5 | true | 30 |
130
+ | 100M-1B | 10000 | 1.0 | true | 60 |
131
+ | >1B | migrar a particionamiento (ver arriba) | | | |
132
+
133
+ Contexto operacional:
134
+ - **OLTP concurrente**: `throttle_delay` alto (≥0.5s) para no saturar la DB.
135
+ - **Tablas frías** (sin queries de usuarios): `throttle_delay` 0 OK.
136
+ - **`slow_batch_threshold_s`** alto en tablas grandes porque cada batch tarda más legítimamente.
137
+
124
138
  ## Referencias
125
139
 
126
140
  - Skill: `.agents/skills/postgresql-optimization/SKILL.md`
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_drain
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gabriel
@@ -100,7 +100,11 @@ files:
100
100
  - docs/IMPROVEMENT_PLAN.md
101
101
  - docs/execution/archive/v0.2.0.agente-review.md
102
102
  - docs/execution/archive/v0.2.0.md
103
+ - docs/execution/archive/v0.3.0-OBSERVACIONES.md
104
+ - docs/execution/archive/v0.3.0.md
103
105
  - docs/execution/v0.2.2.md
106
+ - docs/execution/v0.3.1-OBSERVACIONES.md
107
+ - docs/execution/v0.3.1.md
104
108
  - docs/glue_pyspark_example.py
105
109
  - lib/data_drain.rb
106
110
  - lib/data_drain/configuration.rb
@@ -135,7 +139,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
135
139
  requirements:
136
140
  - - ">="
137
141
  - !ruby/object:Gem::Version
138
- version: 3.0.0
142
+ version: '3.2'
139
143
  required_rubygems_version: !ruby/object:Gem::Requirement
140
144
  requirements:
141
145
  - - ">="