data_drain 0.1.18 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/CLAUDE.md +22 -0
- data/README.md +69 -169
- data/lib/data_drain/engine.rb +53 -40
- data/lib/data_drain/file_ingestor.rb +40 -25
- data/lib/data_drain/record.rb +26 -5
- data/lib/data_drain/storage/s3.rb +48 -6
- data/lib/data_drain/validations.rb +17 -0
- data/lib/data_drain/version.rb +1 -1
- data/lib/data_drain.rb +2 -0
- data/skill/SKILL.md +215 -0
- data/skill/references/antipatrones.md +242 -0
- data/skill/references/api-detallada.md +257 -0
- data/skill/references/eventos-telemetria.md +154 -0
- metadata +7 -2
|
@@ -6,6 +6,8 @@ module DataDrain
|
|
|
6
6
|
# aplicando compresión ZSTD y particionamiento Hive.
|
|
7
7
|
class FileIngestor
|
|
8
8
|
include Observability
|
|
9
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity,
|
|
10
|
+
# Metrics/MethodLength
|
|
9
11
|
|
|
10
12
|
# @param options [Hash] Opciones de ingestión.
|
|
11
13
|
# @option options [String] :source_path Ruta absoluta al archivo local.
|
|
@@ -14,19 +16,20 @@ module DataDrain
|
|
|
14
16
|
# @option options [String] :select_sql (Opcional) Sentencia SELECT para transformar datos al vuelo.
|
|
15
17
|
# @option options [Boolean] :delete_after_upload (Opcional) Borra el archivo local al terminar. Por defecto true.
|
|
16
18
|
def initialize(options)
|
|
17
|
-
@source_path
|
|
18
|
-
@folder_name
|
|
19
|
-
|
|
20
|
-
@
|
|
19
|
+
@source_path = options.fetch(:source_path)
|
|
20
|
+
@folder_name = options.fetch(:folder_name)
|
|
21
|
+
Validations.validate_identifier!(:folder_name, @folder_name)
|
|
22
|
+
@partition_keys = options.fetch(:partition_keys, [])
|
|
23
|
+
@select_sql = options.fetch(:select_sql, "*")
|
|
21
24
|
@delete_after_upload = options.fetch(:delete_after_upload, true)
|
|
22
|
-
@bucket
|
|
25
|
+
@bucket = options[:bucket]
|
|
23
26
|
|
|
24
|
-
@config
|
|
25
|
-
@logger
|
|
27
|
+
@config = DataDrain.configuration
|
|
28
|
+
@logger = @config.logger
|
|
26
29
|
@adapter = DataDrain::Storage.adapter
|
|
27
30
|
|
|
28
31
|
database = DuckDB::Database.open(":memory:")
|
|
29
|
-
@duckdb
|
|
32
|
+
@duckdb = database.connect
|
|
30
33
|
end
|
|
31
34
|
|
|
32
35
|
# Ejecuta el flujo de ingestión.
|
|
@@ -52,7 +55,11 @@ module DataDrain
|
|
|
52
55
|
step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
53
56
|
source_count = @duckdb.query("SELECT COUNT(*) FROM #{reader_function}").first.first
|
|
54
57
|
source_query_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
55
|
-
safe_log(:info, "file_ingestor.count", {
|
|
58
|
+
safe_log(:info, "file_ingestor.count", {
|
|
59
|
+
source_path: @source_path,
|
|
60
|
+
count: source_count,
|
|
61
|
+
source_query_duration_s: source_query_duration.round(2)
|
|
62
|
+
})
|
|
56
63
|
|
|
57
64
|
if source_count.zero?
|
|
58
65
|
cleanup_local_file
|
|
@@ -63,9 +70,14 @@ module DataDrain
|
|
|
63
70
|
|
|
64
71
|
# 2. Exportación / Subida
|
|
65
72
|
@adapter.prepare_export_path(@bucket, @folder_name)
|
|
66
|
-
dest_path = @config.storage_mode.to_sym == :s3
|
|
73
|
+
dest_path = if @config.storage_mode.to_sym == :s3
|
|
74
|
+
"s3://#{@bucket}/#{@folder_name}/"
|
|
75
|
+
else
|
|
76
|
+
File.join(@bucket,
|
|
77
|
+
@folder_name, "")
|
|
78
|
+
end
|
|
67
79
|
|
|
68
|
-
partition_clause = @partition_keys.any? ? "PARTITION_BY (#{@partition_keys.join(
|
|
80
|
+
partition_clause = @partition_keys.any? ? "PARTITION_BY (#{@partition_keys.join(", ")})," : ""
|
|
69
81
|
|
|
70
82
|
query = <<~SQL
|
|
71
83
|
COPY (
|
|
@@ -87,18 +99,19 @@ module DataDrain
|
|
|
87
99
|
|
|
88
100
|
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
89
101
|
safe_log(:info, "file_ingestor.complete", {
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
102
|
+
source_path: @source_path,
|
|
103
|
+
duration_s: duration.round(2),
|
|
104
|
+
source_query_duration_s: source_query_duration.round(2),
|
|
105
|
+
export_duration_s: export_duration.round(2),
|
|
106
|
+
count: source_count
|
|
107
|
+
})
|
|
96
108
|
|
|
97
109
|
cleanup_local_file
|
|
98
110
|
true
|
|
99
111
|
rescue DuckDB::Error => e
|
|
100
112
|
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
101
|
-
safe_log(:error, "file_ingestor.duckdb_error",
|
|
113
|
+
safe_log(:error, "file_ingestor.duckdb_error",
|
|
114
|
+
{ source_path: @source_path }.merge(exception_metadata(e)).merge(duration_s: duration.round(2)))
|
|
102
115
|
false
|
|
103
116
|
ensure
|
|
104
117
|
@duckdb&.close
|
|
@@ -109,11 +122,11 @@ module DataDrain
|
|
|
109
122
|
# @api private
|
|
110
123
|
def determine_reader
|
|
111
124
|
case File.extname(@source_path).downcase
|
|
112
|
-
when
|
|
125
|
+
when ".csv"
|
|
113
126
|
"read_csv_auto('#{@source_path}')"
|
|
114
|
-
when
|
|
127
|
+
when ".json"
|
|
115
128
|
"read_json_auto('#{@source_path}')"
|
|
116
|
-
when
|
|
129
|
+
when ".parquet"
|
|
117
130
|
"read_parquet('#{@source_path}')"
|
|
118
131
|
else
|
|
119
132
|
raise DataDrain::Error, "Formato de archivo no soportado para ingestión: #{@source_path}"
|
|
@@ -122,10 +135,12 @@ module DataDrain
|
|
|
122
135
|
|
|
123
136
|
# @api private
|
|
124
137
|
def cleanup_local_file
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
138
|
+
return unless @delete_after_upload && File.exist?(@source_path)
|
|
139
|
+
|
|
140
|
+
File.delete(@source_path)
|
|
141
|
+
safe_log(:info, "file_ingestor.cleanup", { source_path: @source_path })
|
|
129
142
|
end
|
|
130
143
|
end
|
|
144
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity,
|
|
145
|
+
# Metrics/MethodLength
|
|
131
146
|
end
|
data/lib/data_drain/record.rb
CHANGED
|
@@ -11,7 +11,7 @@ module DataDrain
|
|
|
11
11
|
# @example
|
|
12
12
|
# class ArchivedVersion < DataDrain::Record
|
|
13
13
|
# self.folder_name = 'versions'
|
|
14
|
-
# self.partition_keys = [:
|
|
14
|
+
# self.partition_keys = [:isp_id, :year, :month]
|
|
15
15
|
# attribute :event, :string
|
|
16
16
|
# end
|
|
17
17
|
class Record
|
|
@@ -24,10 +24,28 @@ module DataDrain
|
|
|
24
24
|
class_attribute :folder_name
|
|
25
25
|
class_attribute :partition_keys
|
|
26
26
|
|
|
27
|
+
# Cierra la conexión DuckDB del thread actual y limpia Thread.current.
|
|
28
|
+
# Idempotente: llamarlo varias veces no levanta.
|
|
29
|
+
#
|
|
30
|
+
# Útil en middlewares de Sidekiq/Puma para evitar memory leak en threads
|
|
31
|
+
# de larga vida.
|
|
32
|
+
#
|
|
33
|
+
# @return [void]
|
|
34
|
+
def self.disconnect!
|
|
35
|
+
entry = Thread.current[:data_drain_duckdb]
|
|
36
|
+
Thread.current[:data_drain_duckdb] = nil
|
|
37
|
+
return unless entry
|
|
38
|
+
|
|
39
|
+
entry[:conn]&.close
|
|
40
|
+
entry[:db]&.close
|
|
41
|
+
rescue StandardError # rubocop:disable Lint/SuppressedException
|
|
42
|
+
end
|
|
43
|
+
|
|
27
44
|
# Retorna la conexión persistente a DuckDB en memoria para el hilo (Thread) actual.
|
|
28
45
|
# Esto previene tener que recargar extensiones (como httpfs) en cada consulta.
|
|
29
46
|
#
|
|
30
47
|
# @return [DuckDB::Connection] Conexión activa a DuckDB.
|
|
48
|
+
# rubocop:disable Metrics/AbcSize
|
|
31
49
|
def self.connection
|
|
32
50
|
Thread.current[:data_drain_duckdb] ||= begin
|
|
33
51
|
db = DuckDB::Database.open(":memory:")
|
|
@@ -42,6 +60,7 @@ module DataDrain
|
|
|
42
60
|
end
|
|
43
61
|
Thread.current[:data_drain_duckdb][:conn]
|
|
44
62
|
end
|
|
63
|
+
# rubocop:enable Metrics/AbcSize
|
|
45
64
|
|
|
46
65
|
# Consulta registros en el Data Lake filtrando por claves de partición.
|
|
47
66
|
#
|
|
@@ -52,7 +71,7 @@ module DataDrain
|
|
|
52
71
|
path = build_query_path(partitions)
|
|
53
72
|
|
|
54
73
|
sql = <<~SQL
|
|
55
|
-
SELECT #{attribute_names.join(
|
|
74
|
+
SELECT #{attribute_names.join(", ")}
|
|
56
75
|
FROM read_parquet('#{path}')
|
|
57
76
|
ORDER BY created_at DESC
|
|
58
77
|
LIMIT #{limit}
|
|
@@ -73,7 +92,7 @@ module DataDrain
|
|
|
73
92
|
safe_id = id.to_s.gsub("'", "''")
|
|
74
93
|
|
|
75
94
|
sql = <<~SQL
|
|
76
|
-
SELECT #{attribute_names.join(
|
|
95
|
+
SELECT #{attribute_names.join(", ")}
|
|
77
96
|
FROM read_parquet('#{path}')
|
|
78
97
|
WHERE id = '#{safe_id}'
|
|
79
98
|
LIMIT 1
|
|
@@ -97,7 +116,7 @@ module DataDrain
|
|
|
97
116
|
# @return [String] Representación legible en consola.
|
|
98
117
|
def inspect
|
|
99
118
|
inspection = attributes.map do |name, value|
|
|
100
|
-
"#{name}: #{value.nil? ?
|
|
119
|
+
"#{name}: #{value.nil? ? "nil" : value.inspect}"
|
|
101
120
|
end.compact.join(", ")
|
|
102
121
|
|
|
103
122
|
"#<#{self.class} #{inspection}>"
|
|
@@ -110,7 +129,7 @@ module DataDrain
|
|
|
110
129
|
# @param partitions [Hash]
|
|
111
130
|
# @return [String]
|
|
112
131
|
def build_query_path(partitions)
|
|
113
|
-
partition_path =
|
|
132
|
+
partition_path = partition_keys.map { |k| "#{k}=#{partitions[k.to_sym] || partitions[k.to_s]}" }.join("/")
|
|
114
133
|
DataDrain::Storage.adapter.build_path(bucket, folder_name, partition_path)
|
|
115
134
|
end
|
|
116
135
|
|
|
@@ -118,6 +137,7 @@ module DataDrain
|
|
|
118
137
|
# @param sql [String]
|
|
119
138
|
# @param columns [Array<String>]
|
|
120
139
|
# @return [Array<DataDrain::Record>]
|
|
140
|
+
# rubocop:disable Metrics/MethodLength
|
|
121
141
|
def execute_and_instantiate(sql, columns)
|
|
122
142
|
@logger = DataDrain.configuration.logger
|
|
123
143
|
begin
|
|
@@ -133,5 +153,6 @@ module DataDrain
|
|
|
133
153
|
end
|
|
134
154
|
end
|
|
135
155
|
end
|
|
156
|
+
# rubocop:enable Metrics/MethodLength
|
|
136
157
|
end
|
|
137
158
|
end
|
|
@@ -4,21 +4,59 @@ module DataDrain
|
|
|
4
4
|
module Storage
|
|
5
5
|
# Implementación del adaptador de almacenamiento para Amazon S3.
|
|
6
6
|
class S3 < Base
|
|
7
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
7
8
|
# Carga la extensión httpfs en DuckDB e inyecta las credenciales de AWS.
|
|
9
|
+
# Si aws_access_key_id y aws_secret_access_key están seteados, usa
|
|
10
|
+
# credenciales explícitas. Si no, usa credential_chain (IAM role, env vars,
|
|
11
|
+
# ~/.aws/credentials).
|
|
8
12
|
# @param connection [DuckDB::Connection]
|
|
13
|
+
# @raise [DataDrain::ConfigurationError] si aws_region no está configurado
|
|
9
14
|
def setup_duckdb(connection)
|
|
10
15
|
connection.query("INSTALL httpfs; LOAD httpfs;")
|
|
11
|
-
connection
|
|
12
|
-
connection.query("SET s3_access_key_id='#{@config.aws_access_key_id}';")
|
|
13
|
-
connection.query("SET s3_secret_access_key='#{@config.aws_secret_access_key}';")
|
|
16
|
+
create_s3_secret(connection)
|
|
14
17
|
end
|
|
15
18
|
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
# @param connection [DuckDB::Connection]
|
|
22
|
+
# @raise [DataDrain::ConfigurationError]
|
|
23
|
+
def create_s3_secret(connection)
|
|
24
|
+
region = @config.aws_region
|
|
25
|
+
raise DataDrain::ConfigurationError, "aws_region es obligatorio para storage_mode=:s3" if region.nil?
|
|
26
|
+
|
|
27
|
+
if @config.aws_access_key_id && @config.aws_secret_access_key
|
|
28
|
+
connection.query(<<~SQL)
|
|
29
|
+
CREATE OR REPLACE SECRET data_drain_s3 (
|
|
30
|
+
TYPE S3,
|
|
31
|
+
KEY_ID '#{escape_sql(@config.aws_access_key_id)}',
|
|
32
|
+
SECRET '#{escape_sql(@config.aws_secret_access_key)}',
|
|
33
|
+
REGION '#{escape_sql(region)}'
|
|
34
|
+
);
|
|
35
|
+
SQL
|
|
36
|
+
else
|
|
37
|
+
connection.query(<<~SQL)
|
|
38
|
+
CREATE OR REPLACE SECRET data_drain_s3 (
|
|
39
|
+
TYPE S3,
|
|
40
|
+
PROVIDER credential_chain,
|
|
41
|
+
REGION '#{escape_sql(region)}'
|
|
42
|
+
);
|
|
43
|
+
SQL
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# @param value [String]
|
|
48
|
+
# @return [String]
|
|
49
|
+
def escape_sql(value)
|
|
50
|
+
value.to_s.gsub("'", "''")
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
public
|
|
54
|
+
|
|
16
55
|
# @param bucket [String]
|
|
17
56
|
# @param folder_name [String]
|
|
18
57
|
# @param partition_path [String, nil]
|
|
19
58
|
# @return [String]
|
|
20
59
|
def build_path(bucket, folder_name, partition_path)
|
|
21
|
-
# En S3, el base_path actúa como el nombre del bucket
|
|
22
60
|
base = File.join(bucket, folder_name)
|
|
23
61
|
base = File.join(base, partition_path) if partition_path && !partition_path.empty?
|
|
24
62
|
"s3://#{base}/**/*.parquet"
|
|
@@ -40,7 +78,7 @@ module DataDrain
|
|
|
40
78
|
val = partitions[key]
|
|
41
79
|
val.nil? || val.to_s.empty? ? "#{key}=[^/]+" : "#{key}=#{val}"
|
|
42
80
|
end
|
|
43
|
-
pattern_regex = Regexp.new("^#{folder_name}/#{regex_parts.join(
|
|
81
|
+
pattern_regex = Regexp.new("^#{folder_name}/#{regex_parts.join("/")}")
|
|
44
82
|
|
|
45
83
|
objects_to_delete = []
|
|
46
84
|
prefix = "#{folder_name}/"
|
|
@@ -58,7 +96,10 @@ module DataDrain
|
|
|
58
96
|
|
|
59
97
|
private
|
|
60
98
|
|
|
61
|
-
# @
|
|
99
|
+
# @param client [Aws::S3::Client]
|
|
100
|
+
# @param bucket [String]
|
|
101
|
+
# @param objects_to_delete [Array<Hash>]
|
|
102
|
+
# @return [Integer]
|
|
62
103
|
def delete_in_batches(client, bucket, objects_to_delete)
|
|
63
104
|
return 0 if objects_to_delete.empty?
|
|
64
105
|
|
|
@@ -70,5 +111,6 @@ module DataDrain
|
|
|
70
111
|
deleted_count
|
|
71
112
|
end
|
|
72
113
|
end
|
|
114
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
73
115
|
end
|
|
74
116
|
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module DataDrain
|
|
4
|
+
# Módulo de validación de configuración para prevenir errores de uso.
|
|
5
|
+
module Validations
|
|
6
|
+
IDENTIFIER_REGEX = /\A[a-zA-Z_][a-zA-Z0-9_]*\z/
|
|
7
|
+
|
|
8
|
+
module_function
|
|
9
|
+
|
|
10
|
+
def validate_identifier!(name, value)
|
|
11
|
+
return if IDENTIFIER_REGEX.match?(value.to_s)
|
|
12
|
+
|
|
13
|
+
raise DataDrain::ConfigurationError,
|
|
14
|
+
"#{name} '#{value}' no es un identificador SQL válido"
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
data/lib/data_drain/version.rb
CHANGED
data/lib/data_drain.rb
CHANGED
|
@@ -4,6 +4,7 @@ require "active_model"
|
|
|
4
4
|
require_relative "data_drain/version"
|
|
5
5
|
require_relative "data_drain/errors"
|
|
6
6
|
require_relative "data_drain/configuration"
|
|
7
|
+
require_relative "data_drain/validations"
|
|
7
8
|
require_relative "data_drain/storage"
|
|
8
9
|
require_relative "data_drain/observability"
|
|
9
10
|
require_relative "data_drain/engine"
|
|
@@ -15,6 +16,7 @@ require_relative "data_drain/glue_runner"
|
|
|
15
16
|
require_relative "data_drain/types/json_type"
|
|
16
17
|
ActiveModel::Type.register(:json, DataDrain::Types::JsonType)
|
|
17
18
|
|
|
19
|
+
# DSL para extraer, archivar y purgar datos entre PostgreSQL y un Data Lake en Parquet.
|
|
18
20
|
module DataDrain
|
|
19
21
|
class << self
|
|
20
22
|
# @return [DataDrain::Configuration]
|
data/skill/SKILL.md
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# DataDrain Expert
|
|
2
|
+
|
|
3
|
+
Skill de conocimiento completo sobre DataDrain. Consultame para cualquier pregunta sobre integración, arquitectura, API, errores y antipatrones.
|
|
4
|
+
|
|
5
|
+
## Glosario
|
|
6
|
+
|
|
7
|
+
- **DataDrain** — Micro-framework Ruby para ETL: extraer datos históricos de PostgreSQL → Parquet (S3/Local) → verificar integridad → purgar origen.
|
|
8
|
+
- **Engine** — Motor principal que orquesta el flujo Conteo → Export → Verify → Purge.
|
|
9
|
+
- **FileIngestor** — Convierte archivos crudos (CSV/JSON/Parquet) a Parquet particionado en el Data Lake.
|
|
10
|
+
- **Record** — Clase base ORM analítico (tipo ActiveRecord) read-only sobre Parquet vía DuckDB.
|
|
11
|
+
- **GlueRunner** — Orquestador de AWS Glue Jobs para tablas de gran volumen (>500GB-1TB).
|
|
12
|
+
- **Storage Adapter** — Patrón Strategy con dos implementaciones: `Storage::Local` y `Storage::S3`. Cacheado en `Storage.adapter`.
|
|
13
|
+
- **Observability** — Módulo mixín (`include`/`extend`) con `safe_log` resiliente y logging KV estructurado.
|
|
14
|
+
- **Hive Partitioning** — Estructura de carpetas `key1=val1/key2=val2/...` que DuckDB genera y consume nativamente para prefix scans eficientes.
|
|
15
|
+
- **Semi-abierto** — Convención de rangos `[start, end)` con `<` (no `<=`) para evitar pérdida de microsegundos en límites de fecha.
|
|
16
|
+
- **skip_export** — Modo del Engine donde delega export a herramienta externa (Glue/EMR) y solo verifica + purga.
|
|
17
|
+
- **Heartbeat** — Log de progreso emitido cada 100 lotes en purgas masivas (tablas 1TB).
|
|
18
|
+
- **Wispro-Observability-Spec v1** — Estándar de logs KV: `component=` y `event=` primero, sufijo `_s` para tiempos float, `_count` para enteros, sin unidades en valores.
|
|
19
|
+
|
|
20
|
+
## Arquitectura
|
|
21
|
+
|
|
22
|
+
### Responsabilidad core
|
|
23
|
+
|
|
24
|
+
DataDrain resuelve el ciclo de vida de datos históricos en bases relacionales calientes: archivar a Data Lake con garantía matemática de integridad antes de purgar el origen.
|
|
25
|
+
|
|
26
|
+
### Componentes
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
|
30
|
+
│ PostgreSQL │───>│ Engine │───>│ Data Lake │
|
|
31
|
+
└──────────────┘ │ (DuckDB) │ │ (S3 / Local) │
|
|
32
|
+
▲ └──────────────┘ └──────────────┘
|
|
33
|
+
│ │ ▲
|
|
34
|
+
│ ▼ │
|
|
35
|
+
│ ┌──────────────┐ │
|
|
36
|
+
└────purga───│ Verify OK? │ │
|
|
37
|
+
└──────────────┘ │
|
|
38
|
+
│
|
|
39
|
+
┌──────────────┐ │
|
|
40
|
+
│ FileIngestor │────┘
|
|
41
|
+
└──────────────┘
|
|
42
|
+
│
|
|
43
|
+
┌──────────────┐ │
|
|
44
|
+
│ Record │<───┘
|
|
45
|
+
│ (consultas) │
|
|
46
|
+
└──────────────┘
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Flujo runtime de Engine
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
1. setup_duckdb → ATTACH Postgres + setup adapter (httpfs si S3)
|
|
53
|
+
2. get_postgres_count → si 0, return true (skip)
|
|
54
|
+
3. export_to_parquet → COPY ... TO ... PARTITION_BY (...) ZSTD [omitido si skip_export]
|
|
55
|
+
4. verify_integrity → COUNT(*) Parquet == COUNT(*) Postgres
|
|
56
|
+
5. purge_from_postgres → DELETE en lotes throttled + heartbeat
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Decisiones de diseño
|
|
60
|
+
|
|
61
|
+
- **DuckDB en memoria** procesa millones de registros sin cargar objetos en RAM Ruby. Usa `ATTACH POSTGRES READ_ONLY` para leer origen y `COPY ... TO` para escribir Parquet.
|
|
62
|
+
- **Conexión DuckDB thread-local** en `Record`: cada thread inicializa una conexión persistente que se cachea en `Thread.current[:data_drain_duckdb] = { db:, conn: }`. El hash retiene la `Database` para evitar GC prematuro de la conexión.
|
|
63
|
+
- **Verify es la única puerta de seguridad** antes de purgar. Si retorna `false` (incluyendo `DuckDB::Error` al leer Parquet), la purga se aborta.
|
|
64
|
+
- **Storage Adapter cacheado**: `Storage.adapter` memoiza la instancia. Si se cambia `storage_mode` en runtime, llamar `Storage.reset_adapter!`.
|
|
65
|
+
- **Rangos semi-abiertos**: `created_at >= start AND created_at < end_boundary` donde `end_boundary = end_date.next_day.beginning_of_day`. Nunca `<= end_of_day`.
|
|
66
|
+
|
|
67
|
+
### Stack y dependencias
|
|
68
|
+
|
|
69
|
+
- Ruby `>= 3.0.0`
|
|
70
|
+
- Runtime: `activemodel >= 6.0`, `duckdb ~> 1.4`, `pg >= 1.2`, `aws-sdk-s3 ~> 1.114`, `aws-sdk-glue ~> 1.0`
|
|
71
|
+
- Versión actual: `0.2.0`
|
|
72
|
+
|
|
73
|
+
## API Pública (resumen)
|
|
74
|
+
|
|
75
|
+
### Configuración global
|
|
76
|
+
|
|
77
|
+
```ruby
|
|
78
|
+
DataDrain.configure do |config|
|
|
79
|
+
config.storage_mode = :local | :s3
|
|
80
|
+
config.aws_region, .aws_access_key_id, .aws_secret_access_key
|
|
81
|
+
config.db_host, .db_port, .db_user, .db_pass, .db_name
|
|
82
|
+
config.batch_size = 5000
|
|
83
|
+
config.throttle_delay = 0.5
|
|
84
|
+
config.idle_in_transaction_session_timeout = 0 # 0 = DESACTIVADO
|
|
85
|
+
config.limit_ram = "2GB"
|
|
86
|
+
config.tmp_directory = "/tmp/duckdb_work"
|
|
87
|
+
config.logger = Rails.logger
|
|
88
|
+
end
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Operaciones principales
|
|
92
|
+
|
|
93
|
+
```ruby
|
|
94
|
+
# 1. ETL completo (Engine)
|
|
95
|
+
DataDrain::Engine.new(
|
|
96
|
+
bucket:, start_date:, end_date:, table_name:,
|
|
97
|
+
partition_keys: %w[isp_id year month],
|
|
98
|
+
primary_key: "id", # opcional
|
|
99
|
+
where_clause: nil, # opcional, SQL extra
|
|
100
|
+
skip_export: false, # true delega export a Glue
|
|
101
|
+
folder_name: nil, # default = table_name
|
|
102
|
+
select_sql: "*" # default
|
|
103
|
+
).call # => true (ok) | false (integrity fail)
|
|
104
|
+
|
|
105
|
+
# 2. Ingesta de archivos crudos
|
|
106
|
+
DataDrain::FileIngestor.new(
|
|
107
|
+
bucket:, source_path:, folder_name:,
|
|
108
|
+
partition_keys: [], # opcional
|
|
109
|
+
select_sql: "*", # opcional
|
|
110
|
+
delete_after_upload: true # opcional
|
|
111
|
+
).call
|
|
112
|
+
|
|
113
|
+
# 3. ORM analítico
|
|
114
|
+
class ArchivedX < DataDrain::Record
|
|
115
|
+
self.bucket = "..."
|
|
116
|
+
self.folder_name = "..."
|
|
117
|
+
self.partition_keys = [:isp_id, :year, :month] # ORDEN = jerarquía Hive
|
|
118
|
+
attribute :id, :string
|
|
119
|
+
end
|
|
120
|
+
ArchivedX.where(limit: 10, isp_id: 42, year: 2026, month: 3) # => Array
|
|
121
|
+
ArchivedX.find("uuid", isp_id: 42, year: 2026, month: 3) # => instance | nil
|
|
122
|
+
ArchivedX.destroy_all(isp_id: 42) # => Integer (particiones borradas)
|
|
123
|
+
|
|
124
|
+
# 4. Glue para tablas 1TB+
|
|
125
|
+
DataDrain::GlueRunner.run_and_wait("job-name", { "--key" => "val" }, polling_interval: 30)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Detalle completo de firmas, parámetros, retornos y comportamientos en [API Detallada](references/api-detallada.md).
|
|
129
|
+
|
|
130
|
+
## FAQ
|
|
131
|
+
|
|
132
|
+
### ¿Cuándo usar `Engine` directo vs `GlueRunner` + `Engine(skip_export: true)`?
|
|
133
|
+
|
|
134
|
+
`Engine` directo soporta hasta ~10-50GB cómodamente. Para tablas >500GB-1TB delegar el export a AWS Glue (Apache Spark distribuido) y usar `Engine(skip_export: true)` solo para verificar integridad y purgar Postgres. DataDrain en este modo solo lee Parquet (no exporta) y borra origen una vez confirmados los conteos.
|
|
135
|
+
|
|
136
|
+
### ¿Qué pasa si `verify_integrity` falla?
|
|
137
|
+
|
|
138
|
+
`Engine#call` retorna `false` y **no ejecuta la purga**. Emite log `engine.integrity_error`. Si la falla viene de no poder leer el Parquet (`DuckDB::Error`), emite `engine.parquet_read_error` y también retorna `false`. Es la única salvaguarda matemática del sistema.
|
|
139
|
+
|
|
140
|
+
### ¿Cómo cambiar `storage_mode` en runtime?
|
|
141
|
+
|
|
142
|
+
```ruby
|
|
143
|
+
DataDrain.configure { |c| c.storage_mode = :s3 }
|
|
144
|
+
DataDrain::Storage.reset_adapter! # OBLIGATORIO, sino se sigue usando el adapter cacheado
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### ¿Por qué `idle_in_transaction_session_timeout = 0`?
|
|
148
|
+
|
|
149
|
+
`0` **desactiva** el timeout (sin límite de tiempo). Es mandatorio para purgas de gran volumen donde un lote puede tardar segundos. Internamente se valida con `!nil?` (no `.present?`) porque `0.present?` es `false` en Rails.
|
|
150
|
+
|
|
151
|
+
### ¿El orden de `partition_keys` importa?
|
|
152
|
+
|
|
153
|
+
Sí, **crítico**. Determina la jerarquía Hive en disco. El orden al **escribir** (Engine/FileIngestor) debe ser idéntico al declarado en el modelo `Record` que lee. Mismatch → DuckDB retorna vacío sin error. Convención canónica: `[dimension_principal, year, month]` (mayor cardinalidad o filtro más usado primero).
|
|
154
|
+
|
|
155
|
+
### ¿La conexión DuckDB es thread-safe?
|
|
156
|
+
|
|
157
|
+
Sí. `Record.connection` mantiene una conexión por thread vía `Thread.current`. En Puma/Sidekiq cada worker thread tiene la suya. La conexión persiste mientras vive el thread, salvo que se cierre explícitamente con `Record.disconnect!` (idempotente; útil en middlewares de Sidekiq/Puma para evitar memory leaks en threads de larga vida). `Engine` y `FileIngestor` crean su propia conexión efímera por instancia y la cierran en `ensure`.
|
|
158
|
+
|
|
159
|
+
### ¿DataDrain valida los nombres de tabla?
|
|
160
|
+
|
|
161
|
+
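No. `table_name`, `select_sql` y `where_clause` se interpolan directamente en SQL. La gema asume que estos valores vienen de código de aplicación (no de input de usuario). Excepciones: `FileIngestor` valida `folder_name` como identificador SQL mediante `Validations.validate_identifier!` (levanta `DataDrain::ConfigurationError` si no lo es), y en `Record.find` el `id` sí se sanitiza (escape de comillas simples).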
|
|
162
|
+
|
|
163
|
+
### ¿Cómo evito OOM con tablas grandes?
|
|
164
|
+
|
|
165
|
+
Setear `limit_ram` (ej. `"2GB"`) y `tmp_directory` (en SSD). DuckDB hará spill-to-disk automáticamente. Para tablas >500GB delegar a Glue.
|
|
166
|
+
|
|
167
|
+
### ¿Los logs incluyen `source=`?
|
|
168
|
+
|
|
169
|
+
No. La gema NO emite `source=` manualmente — lo inyecta automáticamente `exis_ray` (logger middleware externo) cuando está presente. Si no usás `exis_ray`, agregalo con un wrapper de logger.
|
|
170
|
+
|
|
171
|
+
### ¿Qué formato tienen los logs?
|
|
172
|
+
|
|
173
|
+
`component=data_drain event=<clase>.<suceso> [campos KV]`. Tiempos con sufijo `_s` y valor float. Contadores con `_count` y valor integer. Sin unidades en los valores. Detalle en [Eventos y Telemetría](references/eventos-telemetria.md).
|
|
174
|
+
|
|
175
|
+
## Errores
|
|
176
|
+
|
|
177
|
+
Catálogo top. Detalle completo y resolución en [API Detallada](references/api-detallada.md).
|
|
178
|
+
|
|
179
|
+
### `DataDrain::Error`
|
|
180
|
+
Clase base. Toda excepción del framework hereda de acá.
|
|
181
|
+
|
|
182
|
+
### `DataDrain::ConfigurationError`
|
|
183
|
+
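Levantado cuando falta configuración obligatoria o un valor configurado no es válido. **Causas típicas:** olvidar `aws_region` con `storage_mode = :s3` (las credenciales `aws_access_key_id`/`aws_secret_access_key` son opcionales: si faltan se usa `credential_chain`), o pasar un `folder_name` que no es un identificador SQL válido. **Resolución:** completar el bloque `DataDrain.configure` o corregir el valor rechazado.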
|
|
184
|
+
|
|
185
|
+
### `DataDrain::IntegrityError`
|
|
186
|
+
Reservado para fallos matemáticos en verificación. Actualmente `Engine#call` retorna `false` en lugar de levantarlo. **Resolución:** investigar mismatch entre conteo Postgres y conteo Parquet.
|
|
187
|
+
|
|
188
|
+
### `DataDrain::StorageError`
|
|
189
|
+
Problemas interactuando con disco local, S3 o DuckDB. **Causa típica:** credenciales AWS inválidas, bucket inexistente, permisos S3 insuficientes.
|
|
190
|
+
|
|
191
|
+
### `DataDrain::Storage::InvalidAdapterError`
|
|
192
|
+
`storage_mode` no reconocido. **Causa:** valor distinto de `:local` o `:s3`. **Resolución:** corregir configuración.
|
|
193
|
+
|
|
194
|
+
### `DuckDB::Error` (no envuelto)
|
|
195
|
+
Errores de query DuckDB. En `Engine#verify_integrity` se captura y se loguea como `engine.parquet_read_error` retornando `false`. En `FileIngestor#call` se captura y se loguea como `file_ingestor.duckdb_error` retornando `false`. En `Record` se captura en `execute_and_instantiate` y retorna `[]`.
|
|
196
|
+
|
|
197
|
+
### `RuntimeError` desde `GlueRunner`
|
|
198
|
+
Levantado cuando un Job de Glue termina con estado `FAILED`, `STOPPED` o `TIMEOUT`. **Mensaje:** `"Glue Job <name> (Run ID: <id>) falló con estado <status>."`
|
|
199
|
+
|
|
200
|
+
## Antipatrones
|
|
201
|
+
|
|
202
|
+
Catálogo completo en [Antipatrones](references/antipatrones.md). Resumen de los más críticos:
|
|
203
|
+
|
|
204
|
+
1. **Bypassear `verify_integrity`** llamando `purge_from_postgres` directo — rompe la única garantía de seguridad.
|
|
205
|
+
2. **Mismatch en orden de `partition_keys`** entre escritura y lectura — DuckDB devuelve vacío sin error.
|
|
206
|
+
3. **`storage_mode` cambiado sin `reset_adapter!`** — sigue usando el adapter viejo cacheado.
|
|
207
|
+
4. **Validar `idle_in_transaction_session_timeout` con `.present?`** — `0.present?` es `false`, ignora la config.
|
|
208
|
+
5. **Usar `<= end_of_day`** en rangos de fecha — pierde registros con microsegundos.
|
|
209
|
+
6. **Loguear `source=`** manualmente — duplica el campo que inyecta `exis_ray`.
|
|
210
|
+
|
|
211
|
+
## Referencias
|
|
212
|
+
|
|
213
|
+
- [API Detallada](references/api-detallada.md) — Firmas completas, parámetros, retornos y comportamientos de cada clase pública.
|
|
214
|
+
- [Eventos y Telemetría](references/eventos-telemetria.md) — Catálogo completo de eventos KV emitidos por la gema.
|
|
215
|
+
- [Antipatrones](references/antipatrones.md) — Qué NO hacer y alternativas correctas.
|