data_drain 0.1.19 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -0
- data/CHANGELOG.md +25 -0
- data/CLAUDE.md +4 -0
- data/README.md +66 -171
- data/docs/IMPROVEMENT_PLAN.md +1162 -0
- data/docs/execution/archive/v0.2.0.agente-review.md +125 -0
- data/docs/execution/archive/v0.2.0.md +812 -0
- data/docs/glue_pyspark_example.py +60 -0
- data/lib/data_drain/engine.rb +53 -40
- data/lib/data_drain/file_ingestor.rb +40 -25
- data/lib/data_drain/record.rb +24 -3
- data/lib/data_drain/storage/s3.rb +48 -6
- data/lib/data_drain/validations.rb +17 -0
- data/lib/data_drain/version.rb +1 -1
- data/lib/data_drain.rb +2 -0
- data/skill/SKILL.md +215 -0
- data/skill/references/antipatrones.md +242 -0
- data/skill/references/api-detallada.md +257 -0
- data/skill/references/eventos-telemetria.md +154 -0
- metadata +11 -2
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Script de AWS Glue (PySpark) compatible con DataDrain::GlueRunner.
|
|
3
|
+
|
|
4
|
+
Crear el Job en la consola de AWS Glue (Spark 4.0+) y usar este script como base.
|
|
5
|
+
Argumentos requeridos: JOB_NAME, start_date, end_date, s3_bucket, s3_folder,
|
|
6
|
+
db_url, db_user, db_password, db_table, partition_by.
|
|
7
|
+
|
|
8
|
+
Personalizar la sección de columnas derivadas según las partition_keys de cada tabla.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import sys
|
|
12
|
+
from awsglue.utils import getResolvedOptions
|
|
13
|
+
from pyspark.context import SparkContext
|
|
14
|
+
from awsglue.context import GlueContext
|
|
15
|
+
from awsglue.job import Job
|
|
16
|
+
from pyspark.sql.functions import col, year, month
|
|
17
|
+
|
|
18
|
+
# Resolve the job arguments passed on the Glue job command line.
args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'start_date', 'end_date', 's3_bucket', 's3_folder',
    'db_url', 'db_user', 'db_password', 'db_table', 'partition_by'
])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# With Spark's default (static) overwrite mode, mode("overwrite") clears the
# whole output path before writing. "dynamic" replaces only the partitions
# that are present in this run's DataFrame, so partitions written by earlier
# runs are left in place.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# Half-open window [start_date, end_date) over created_at.
# NOTE(review): the values are interpolated directly into SQL; this assumes
# the job arguments come from a trusted caller — confirm before exposing the
# job to other callers.
source_query = (
    f"SELECT * FROM {args['db_table']} "
    f"WHERE created_at >= '{args['start_date']}' "
    f"AND created_at < '{args['end_date']}'"
)

# The Spark JDBC reader takes the filtering statement through "query".
# "sampleQuery" is an option of Glue DynamicFrame connections and is not
# applied by spark.read.format("jdbc"), and "query" cannot be combined with
# "dbtable", so only "query" is passed here.
options = {
    "url": args['db_url'],
    "query": source_query,
    "user": args['db_user'],
    "password": args['db_password'],
}

df = spark.read.format("jdbc").options(**options).load()

# Add the derived columns required by the partitions.
# isp_id already exists in the source table — only add the computed ones.
# Customize this section according to each table's partition_keys.
df_final = (
    df.withColumn("year", year(col("created_at")))
      .withColumn("month", month(col("created_at")))
)

output_path = f"s3://{args['s3_bucket']}/{args['s3_folder']}/"
# Tolerate spaces and stray commas in the comma-separated partition list.
partitions = [name.strip() for name in args['partition_by'].split(",") if name.strip()]

(df_final.write.mode("overwrite")
    .partitionBy(*partitions)
    .format("parquet")
    .option("compression", "zstd")
    .save(output_path))

job.commit()
|
data/lib/data_drain/engine.rb
CHANGED
|
@@ -5,6 +5,7 @@ require "pg"
|
|
|
5
5
|
|
|
6
6
|
module DataDrain
|
|
7
7
|
# Motor principal de extracción y purga de datos (DataDrain).
|
|
8
|
+
# rubocop:disable Metrics/ClassLength, Metrics/AbcSize, Metrics/MethodLength, Naming/AccessorMethodName
|
|
8
9
|
#
|
|
9
10
|
# Orquesta el flujo ETL desde PostgreSQL hacia un Data Lake analítico
|
|
10
11
|
# delegando la interacción del almacenamiento al adaptador configurado.
|
|
@@ -21,29 +22,31 @@ module DataDrain
|
|
|
21
22
|
# @option options [Array<String, Symbol>] :partition_keys Columnas para particionar.
|
|
22
23
|
# @option options [String] :primary_key (Opcional) Clave primaria para borrado. Por defecto 'id'.
|
|
23
24
|
# @option options [String] :where_clause (Opcional) Condición SQL extra.
|
|
24
|
-
# @option options [Boolean] :skip_export (Opcional) Si
|
|
25
|
+
# @option options [Boolean] :skip_export (Opcional) Si true, no exporta
|
|
26
|
+
# a Parquet — solo valida y purga (para uso con GlueRunner).
|
|
25
27
|
def initialize(options)
|
|
26
|
-
@start_date
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
# Esto evita problemas de precisión con los microsegundos al usar end_of_day
|
|
30
|
-
@end_date = options.fetch(:end_date).to_date.next_day.beginning_of_day
|
|
31
|
-
|
|
32
|
-
@table_name = options.fetch(:table_name)
|
|
33
|
-
@folder_name = options.fetch(:folder_name, @table_name)
|
|
34
|
-
@select_sql = options.fetch(:select_sql, "*")
|
|
35
|
-
@partition_keys = options.fetch(:partition_keys)
|
|
36
|
-
@primary_key = options.fetch(:primary_key, "id")
|
|
37
|
-
@where_clause = options[:where_clause]
|
|
38
|
-
@bucket = options[:bucket]
|
|
39
|
-
@skip_export = options.fetch(:skip_export, false)
|
|
28
|
+
@start_date = options.fetch(:start_date).beginning_of_day
|
|
29
|
+
|
|
30
|
+
@end_date = options.fetch(:end_date).to_date.next_day.beginning_of_day
|
|
40
31
|
|
|
41
|
-
@
|
|
42
|
-
|
|
32
|
+
@table_name = options.fetch(:table_name)
|
|
33
|
+
Validations.validate_identifier!(:table_name, @table_name)
|
|
34
|
+
|
|
35
|
+
@folder_name = options.fetch(:folder_name, @table_name)
|
|
36
|
+
@select_sql = options.fetch(:select_sql, "*")
|
|
37
|
+
@partition_keys = options.fetch(:partition_keys)
|
|
38
|
+
@primary_key = options.fetch(:primary_key, "id")
|
|
39
|
+
Validations.validate_identifier!(:primary_key, @primary_key)
|
|
40
|
+
@where_clause = options[:where_clause]
|
|
41
|
+
@bucket = options[:bucket]
|
|
42
|
+
@skip_export = options.fetch(:skip_export, false)
|
|
43
|
+
|
|
44
|
+
@config = DataDrain.configuration
|
|
45
|
+
@logger = @config.logger
|
|
43
46
|
@adapter = DataDrain::Storage.adapter
|
|
44
47
|
|
|
45
48
|
database = DuckDB::Database.open(":memory:")
|
|
46
|
-
@duckdb
|
|
49
|
+
@duckdb = database.connect
|
|
47
50
|
end
|
|
48
51
|
|
|
49
52
|
# Ejecuta el flujo completo del motor: Setup, Conteo, Exportación (opcional), Verificación y Purga.
|
|
@@ -51,7 +54,8 @@ module DataDrain
|
|
|
51
54
|
# @return [Boolean] `true` si el proceso finalizó con éxito, `false` si falló la integridad.
|
|
52
55
|
def call
|
|
53
56
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
54
|
-
safe_log(:info, "engine.start",
|
|
57
|
+
safe_log(:info, "engine.start",
|
|
58
|
+
{ table: @table_name, start_date: @start_date.to_date, end_date: @end_date.to_date })
|
|
55
59
|
|
|
56
60
|
setup_duckdb
|
|
57
61
|
|
|
@@ -62,7 +66,8 @@ module DataDrain
|
|
|
62
66
|
|
|
63
67
|
if @pg_count.zero?
|
|
64
68
|
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
65
|
-
safe_log(:info, "engine.skip_empty",
|
|
69
|
+
safe_log(:info, "engine.skip_empty",
|
|
70
|
+
{ table: @table_name, duration_s: duration.round(2), db_query_duration_s: db_query_duration.round(2) })
|
|
66
71
|
return true
|
|
67
72
|
end
|
|
68
73
|
|
|
@@ -90,18 +95,19 @@ module DataDrain
|
|
|
90
95
|
|
|
91
96
|
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
92
97
|
safe_log(:info, "engine.complete", {
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
98
|
+
table: @table_name,
|
|
99
|
+
duration_s: duration.round(2),
|
|
100
|
+
db_query_duration_s: db_query_duration.round(2),
|
|
101
|
+
export_duration_s: export_duration.round(2),
|
|
102
|
+
integrity_duration_s: integrity_duration.round(2),
|
|
103
|
+
purge_duration_s: purge_duration.round(2),
|
|
104
|
+
count: @pg_count
|
|
105
|
+
})
|
|
101
106
|
true
|
|
102
107
|
else
|
|
103
108
|
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
104
|
-
safe_log(:error, "engine.integrity_error",
|
|
109
|
+
safe_log(:error, "engine.integrity_error",
|
|
110
|
+
{ table: @table_name, duration_s: duration.round(2), count: @pg_count })
|
|
105
111
|
false
|
|
106
112
|
end
|
|
107
113
|
end
|
|
@@ -142,7 +148,12 @@ module DataDrain
|
|
|
142
148
|
@adapter.prepare_export_path(@bucket, @folder_name)
|
|
143
149
|
|
|
144
150
|
# Determinamos el path base de destino según el adaptador
|
|
145
|
-
dest_path = @config.storage_mode.to_sym == :s3
|
|
151
|
+
dest_path = if @config.storage_mode.to_sym == :s3
|
|
152
|
+
"s3://#{@bucket}/#{@folder_name}/"
|
|
153
|
+
else
|
|
154
|
+
File.join(@bucket,
|
|
155
|
+
@folder_name, "")
|
|
156
|
+
end
|
|
146
157
|
|
|
147
158
|
pg_sql = "SELECT #{@select_sql} FROM public.#{@table_name} WHERE #{base_where_sql}"
|
|
148
159
|
pg_sql = pg_sql.gsub("'", "''")
|
|
@@ -154,7 +165,7 @@ module DataDrain
|
|
|
154
165
|
) TO '#{dest_path}'
|
|
155
166
|
(
|
|
156
167
|
FORMAT PARQUET,
|
|
157
|
-
PARTITION_BY (#{@partition_keys.join(
|
|
168
|
+
PARTITION_BY (#{@partition_keys.join(", ")}),
|
|
158
169
|
COMPRESSION 'ZSTD',
|
|
159
170
|
OVERWRITE_OR_IGNORE 1
|
|
160
171
|
);
|
|
@@ -180,7 +191,8 @@ module DataDrain
|
|
|
180
191
|
return false
|
|
181
192
|
end
|
|
182
193
|
|
|
183
|
-
safe_log(:info, "engine.integrity_check",
|
|
194
|
+
safe_log(:info, "engine.integrity_check",
|
|
195
|
+
{ table: @table_name, pg_count: @pg_count, parquet_count: parquet_result })
|
|
184
196
|
@pg_count == parquet_result
|
|
185
197
|
end
|
|
186
198
|
|
|
@@ -189,11 +201,11 @@ module DataDrain
|
|
|
189
201
|
safe_log(:info, "engine.purge_start", { table: @table_name, batch_size: @config.batch_size })
|
|
190
202
|
|
|
191
203
|
conn = PG.connect(
|
|
192
|
-
host:
|
|
193
|
-
port:
|
|
194
|
-
user:
|
|
204
|
+
host: @config.db_host,
|
|
205
|
+
port: @config.db_port,
|
|
206
|
+
user: @config.db_user,
|
|
195
207
|
password: @config.db_pass,
|
|
196
|
-
dbname:
|
|
208
|
+
dbname: @config.db_name
|
|
197
209
|
)
|
|
198
210
|
|
|
199
211
|
unless @config.idle_in_transaction_session_timeout.nil?
|
|
@@ -223,10 +235,10 @@ module DataDrain
|
|
|
223
235
|
# Heartbeat cada 100 lotes para monitorear procesos largos de 1TB
|
|
224
236
|
if (batches_processed % 100).zero?
|
|
225
237
|
safe_log(:info, "engine.purge_heartbeat", {
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
238
|
+
table: @table_name,
|
|
239
|
+
batches_processed_count: batches_processed,
|
|
240
|
+
rows_deleted_count: total_deleted
|
|
241
|
+
})
|
|
230
242
|
end
|
|
231
243
|
|
|
232
244
|
sleep(@config.throttle_delay) if @config.throttle_delay.positive?
|
|
@@ -235,4 +247,5 @@ module DataDrain
|
|
|
235
247
|
conn&.close
|
|
236
248
|
end
|
|
237
249
|
end
|
|
250
|
+
# rubocop:enable Metrics/ClassLength, Metrics/AbcSize, Metrics/MethodLength, Naming/AccessorMethodName
|
|
238
251
|
end
|
|
@@ -6,6 +6,8 @@ module DataDrain
|
|
|
6
6
|
# aplicando compresión ZSTD y particionamiento Hive.
|
|
7
7
|
class FileIngestor
|
|
8
8
|
include Observability
|
|
9
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity,
|
|
10
|
+
# Metrics/MethodLength
|
|
9
11
|
|
|
10
12
|
# @param options [Hash] Opciones de ingestión.
|
|
11
13
|
# @option options [String] :source_path Ruta absoluta al archivo local.
|
|
@@ -14,19 +16,20 @@ module DataDrain
|
|
|
14
16
|
# @option options [String] :select_sql (Opcional) Sentencia SELECT para transformar datos al vuelo.
|
|
15
17
|
# @option options [Boolean] :delete_after_upload (Opcional) Borra el archivo local al terminar. Por defecto true.
|
|
16
18
|
def initialize(options)
|
|
17
|
-
@source_path
|
|
18
|
-
@folder_name
|
|
19
|
-
|
|
20
|
-
@
|
|
19
|
+
@source_path = options.fetch(:source_path)
|
|
20
|
+
@folder_name = options.fetch(:folder_name)
|
|
21
|
+
Validations.validate_identifier!(:folder_name, @folder_name)
|
|
22
|
+
@partition_keys = options.fetch(:partition_keys, [])
|
|
23
|
+
@select_sql = options.fetch(:select_sql, "*")
|
|
21
24
|
@delete_after_upload = options.fetch(:delete_after_upload, true)
|
|
22
|
-
@bucket
|
|
25
|
+
@bucket = options[:bucket]
|
|
23
26
|
|
|
24
|
-
@config
|
|
25
|
-
@logger
|
|
27
|
+
@config = DataDrain.configuration
|
|
28
|
+
@logger = @config.logger
|
|
26
29
|
@adapter = DataDrain::Storage.adapter
|
|
27
30
|
|
|
28
31
|
database = DuckDB::Database.open(":memory:")
|
|
29
|
-
@duckdb
|
|
32
|
+
@duckdb = database.connect
|
|
30
33
|
end
|
|
31
34
|
|
|
32
35
|
# Ejecuta el flujo de ingestión.
|
|
@@ -52,7 +55,11 @@ module DataDrain
|
|
|
52
55
|
step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
53
56
|
source_count = @duckdb.query("SELECT COUNT(*) FROM #{reader_function}").first.first
|
|
54
57
|
source_query_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
|
|
55
|
-
safe_log(:info, "file_ingestor.count", {
|
|
58
|
+
safe_log(:info, "file_ingestor.count", {
|
|
59
|
+
source_path: @source_path,
|
|
60
|
+
count: source_count,
|
|
61
|
+
source_query_duration_s: source_query_duration.round(2)
|
|
62
|
+
})
|
|
56
63
|
|
|
57
64
|
if source_count.zero?
|
|
58
65
|
cleanup_local_file
|
|
@@ -63,9 +70,14 @@ module DataDrain
|
|
|
63
70
|
|
|
64
71
|
# 2. Exportación / Subida
|
|
65
72
|
@adapter.prepare_export_path(@bucket, @folder_name)
|
|
66
|
-
dest_path = @config.storage_mode.to_sym == :s3
|
|
73
|
+
dest_path = if @config.storage_mode.to_sym == :s3
|
|
74
|
+
"s3://#{@bucket}/#{@folder_name}/"
|
|
75
|
+
else
|
|
76
|
+
File.join(@bucket,
|
|
77
|
+
@folder_name, "")
|
|
78
|
+
end
|
|
67
79
|
|
|
68
|
-
partition_clause = @partition_keys.any? ? "PARTITION_BY (#{@partition_keys.join(
|
|
80
|
+
partition_clause = @partition_keys.any? ? "PARTITION_BY (#{@partition_keys.join(", ")})," : ""
|
|
69
81
|
|
|
70
82
|
query = <<~SQL
|
|
71
83
|
COPY (
|
|
@@ -87,18 +99,19 @@ module DataDrain
|
|
|
87
99
|
|
|
88
100
|
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
89
101
|
safe_log(:info, "file_ingestor.complete", {
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
102
|
+
source_path: @source_path,
|
|
103
|
+
duration_s: duration.round(2),
|
|
104
|
+
source_query_duration_s: source_query_duration.round(2),
|
|
105
|
+
export_duration_s: export_duration.round(2),
|
|
106
|
+
count: source_count
|
|
107
|
+
})
|
|
96
108
|
|
|
97
109
|
cleanup_local_file
|
|
98
110
|
true
|
|
99
111
|
rescue DuckDB::Error => e
|
|
100
112
|
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
101
|
-
safe_log(:error, "file_ingestor.duckdb_error",
|
|
113
|
+
safe_log(:error, "file_ingestor.duckdb_error",
|
|
114
|
+
{ source_path: @source_path }.merge(exception_metadata(e)).merge(duration_s: duration.round(2)))
|
|
102
115
|
false
|
|
103
116
|
ensure
|
|
104
117
|
@duckdb&.close
|
|
@@ -109,11 +122,11 @@ module DataDrain
|
|
|
109
122
|
# @api private
|
|
110
123
|
def determine_reader
|
|
111
124
|
case File.extname(@source_path).downcase
|
|
112
|
-
when
|
|
125
|
+
when ".csv"
|
|
113
126
|
"read_csv_auto('#{@source_path}')"
|
|
114
|
-
when
|
|
127
|
+
when ".json"
|
|
115
128
|
"read_json_auto('#{@source_path}')"
|
|
116
|
-
when
|
|
129
|
+
when ".parquet"
|
|
117
130
|
"read_parquet('#{@source_path}')"
|
|
118
131
|
else
|
|
119
132
|
raise DataDrain::Error, "Formato de archivo no soportado para ingestión: #{@source_path}"
|
|
@@ -122,10 +135,12 @@ module DataDrain
|
|
|
122
135
|
|
|
123
136
|
# @api private
|
|
124
137
|
def cleanup_local_file
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
138
|
+
return unless @delete_after_upload && File.exist?(@source_path)
|
|
139
|
+
|
|
140
|
+
File.delete(@source_path)
|
|
141
|
+
safe_log(:info, "file_ingestor.cleanup", { source_path: @source_path })
|
|
129
142
|
end
|
|
130
143
|
end
|
|
144
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity,
|
|
145
|
+
# Metrics/MethodLength
|
|
131
146
|
end
|
data/lib/data_drain/record.rb
CHANGED
|
@@ -24,10 +24,28 @@ module DataDrain
|
|
|
24
24
|
class_attribute :folder_name
|
|
25
25
|
class_attribute :partition_keys
|
|
26
26
|
|
|
27
|
+
# Closes the current thread's DuckDB connection and clears the
# Thread.current[:data_drain_duckdb] slot.
# Idempotent: calling it several times does not raise.
#
# Useful in Sidekiq/Puma middlewares to avoid memory leaks in long-lived
# threads.
#
# @return [void]
def self.disconnect!
  entry = Thread.current[:data_drain_duckdb]
  # Clear the slot first so the thread never keeps a reference to a
  # half-closed entry, whatever happens below.
  Thread.current[:data_drain_duckdb] = nil
  return unless entry

  # Close the connection before the database, each one independently:
  # an error while closing the connection must not skip closing the database.
  %i[conn db].each do |handle|
    begin
      entry[handle]&.close
    rescue StandardError # rubocop:disable Lint/SuppressedException
      # Best-effort teardown: close errors are deliberately ignored.
    end
  end

  nil
end
|
|
43
|
+
|
|
27
44
|
# Retorna la conexión persistente a DuckDB en memoria para el hilo (Thread) actual.
|
|
28
45
|
# Esto previene tener que recargar extensiones (como httpfs) en cada consulta.
|
|
29
46
|
#
|
|
30
47
|
# @return [DuckDB::Connection] Conexión activa a DuckDB.
|
|
48
|
+
# rubocop:disable Metrics/AbcSize
|
|
31
49
|
def self.connection
|
|
32
50
|
Thread.current[:data_drain_duckdb] ||= begin
|
|
33
51
|
db = DuckDB::Database.open(":memory:")
|
|
@@ -42,6 +60,7 @@ module DataDrain
|
|
|
42
60
|
end
|
|
43
61
|
Thread.current[:data_drain_duckdb][:conn]
|
|
44
62
|
end
|
|
63
|
+
# rubocop:enable Metrics/AbcSize
|
|
45
64
|
|
|
46
65
|
# Consulta registros en el Data Lake filtrando por claves de partición.
|
|
47
66
|
#
|
|
@@ -52,7 +71,7 @@ module DataDrain
|
|
|
52
71
|
path = build_query_path(partitions)
|
|
53
72
|
|
|
54
73
|
sql = <<~SQL
|
|
55
|
-
SELECT #{attribute_names.join(
|
|
74
|
+
SELECT #{attribute_names.join(", ")}
|
|
56
75
|
FROM read_parquet('#{path}')
|
|
57
76
|
ORDER BY created_at DESC
|
|
58
77
|
LIMIT #{limit}
|
|
@@ -73,7 +92,7 @@ module DataDrain
|
|
|
73
92
|
safe_id = id.to_s.gsub("'", "''")
|
|
74
93
|
|
|
75
94
|
sql = <<~SQL
|
|
76
|
-
SELECT #{attribute_names.join(
|
|
95
|
+
SELECT #{attribute_names.join(", ")}
|
|
77
96
|
FROM read_parquet('#{path}')
|
|
78
97
|
WHERE id = '#{safe_id}'
|
|
79
98
|
LIMIT 1
|
|
@@ -97,7 +116,7 @@ module DataDrain
|
|
|
97
116
|
# @return [String] Representación legible en consola.
|
|
98
117
|
def inspect
|
|
99
118
|
inspection = attributes.map do |name, value|
|
|
100
|
-
"#{name}: #{value.nil? ?
|
|
119
|
+
"#{name}: #{value.nil? ? "nil" : value.inspect}"
|
|
101
120
|
end.compact.join(", ")
|
|
102
121
|
|
|
103
122
|
"#<#{self.class} #{inspection}>"
|
|
@@ -118,6 +137,7 @@ module DataDrain
|
|
|
118
137
|
# @param sql [String]
|
|
119
138
|
# @param columns [Array<String>]
|
|
120
139
|
# @return [Array<DataDrain::Record>]
|
|
140
|
+
# rubocop:disable Metrics/MethodLength
|
|
121
141
|
def execute_and_instantiate(sql, columns)
|
|
122
142
|
@logger = DataDrain.configuration.logger
|
|
123
143
|
begin
|
|
@@ -133,5 +153,6 @@ module DataDrain
|
|
|
133
153
|
end
|
|
134
154
|
end
|
|
135
155
|
end
|
|
156
|
+
# rubocop:enable Metrics/MethodLength
|
|
136
157
|
end
|
|
137
158
|
end
|
|
@@ -4,21 +4,59 @@ module DataDrain
|
|
|
4
4
|
module Storage
|
|
5
5
|
# Implementación del adaptador de almacenamiento para Amazon S3.
|
|
6
6
|
class S3 < Base
|
|
7
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
7
8
|
# Carga la extensión httpfs en DuckDB e inyecta las credenciales de AWS.
|
|
9
|
+
# Si aws_access_key_id y aws_secret_access_key están seteados, usa
|
|
10
|
+
# credenciales explícitas. Si no, usa credential_chain (IAM role, env vars,
|
|
11
|
+
# ~/.aws/credentials).
|
|
8
12
|
# @param connection [DuckDB::Connection]
|
|
13
|
+
# @raise [DataDrain::ConfigurationError] si aws_region no está configurado
|
|
9
14
|
def setup_duckdb(connection)
|
|
10
15
|
connection.query("INSTALL httpfs; LOAD httpfs;")
|
|
11
|
-
connection
|
|
12
|
-
connection.query("SET s3_access_key_id='#{@config.aws_access_key_id}';")
|
|
13
|
-
connection.query("SET s3_secret_access_key='#{@config.aws_secret_access_key}';")
|
|
16
|
+
create_s3_secret(connection)
|
|
14
17
|
end
|
|
15
18
|
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
# Registers the DuckDB secret named data_drain_s3 used for S3 access.
#
# When both aws_access_key_id and aws_secret_access_key are configured with
# non-blank values, they are used as explicit credentials. Otherwise the
# secret is created with the credential_chain provider. Blank (empty or
# whitespace-only) values are treated the same as missing ones.
#
# @param connection [DuckDB::Connection]
# @raise [DataDrain::ConfigurationError] if aws_region is nil or blank
# @return [Object] whatever connection.query returns
def create_s3_secret(connection)
  region = @config.aws_region
  # An empty string is truthy in Ruby, so check for blank, not only nil.
  if region.to_s.strip.empty?
    raise DataDrain::ConfigurationError, "aws_region es obligatorio para storage_mode=:s3"
  end

  key_id = @config.aws_access_key_id
  secret_key = @config.aws_secret_access_key
  explicit_credentials = [key_id, secret_key].none? { |value| value.to_s.strip.empty? }

  if explicit_credentials
    connection.query(<<~SQL)
      CREATE OR REPLACE SECRET data_drain_s3 (
        TYPE S3,
        KEY_ID '#{escape_sql(key_id)}',
        SECRET '#{escape_sql(secret_key)}',
        REGION '#{escape_sql(region)}'
      );
    SQL
  else
    connection.query(<<~SQL)
      CREATE OR REPLACE SECRET data_drain_s3 (
        TYPE S3,
        PROVIDER credential_chain,
        REGION '#{escape_sql(region)}'
      );
    SQL
  end
end
|
|
46
|
+
|
|
47
|
+
# Doubles every single quote so the value can be embedded inside a
# single-quoted SQL string literal. The value is converted with #to_s first.
#
# @param value [String]
# @return [String]
def escape_sql(value)
  text = value.to_s
  text.gsub("'") { "''" }
end
|
|
52
|
+
|
|
53
|
+
public
|
|
54
|
+
|
|
16
55
|
# @param bucket [String]
|
|
17
56
|
# @param folder_name [String]
|
|
18
57
|
# @param partition_path [String, nil]
|
|
19
58
|
# @return [String]
|
|
20
59
|
def build_path(bucket, folder_name, partition_path)
|
|
21
|
-
# En S3, el base_path actúa como el nombre del bucket
|
|
22
60
|
base = File.join(bucket, folder_name)
|
|
23
61
|
base = File.join(base, partition_path) if partition_path && !partition_path.empty?
|
|
24
62
|
"s3://#{base}/**/*.parquet"
|
|
@@ -40,7 +78,7 @@ module DataDrain
|
|
|
40
78
|
val = partitions[key]
|
|
41
79
|
val.nil? || val.to_s.empty? ? "#{key}=[^/]+" : "#{key}=#{val}"
|
|
42
80
|
end
|
|
43
|
-
pattern_regex = Regexp.new("^#{folder_name}/#{regex_parts.join(
|
|
81
|
+
pattern_regex = Regexp.new("^#{folder_name}/#{regex_parts.join("/")}")
|
|
44
82
|
|
|
45
83
|
objects_to_delete = []
|
|
46
84
|
prefix = "#{folder_name}/"
|
|
@@ -58,7 +96,10 @@ module DataDrain
|
|
|
58
96
|
|
|
59
97
|
private
|
|
60
98
|
|
|
61
|
-
# @
|
|
99
|
+
# @param client [Aws::S3::Client]
|
|
100
|
+
# @param bucket [String]
|
|
101
|
+
# @param objects_to_delete [Array<Hash>]
|
|
102
|
+
# @return [Integer]
|
|
62
103
|
def delete_in_batches(client, bucket, objects_to_delete)
|
|
63
104
|
return 0 if objects_to_delete.empty?
|
|
64
105
|
|
|
@@ -70,5 +111,6 @@ module DataDrain
|
|
|
70
111
|
deleted_count
|
|
71
112
|
end
|
|
72
113
|
end
|
|
114
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
73
115
|
end
|
|
74
116
|
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module DataDrain
  # Configuration validation helpers that guard against usage errors.
  module Validations
    # A letter or underscore followed by letters, digits or underscores,
    # anchored to the whole string.
    IDENTIFIER_REGEX = /\A[a-zA-Z_][a-zA-Z0-9_]*\z/

    module_function

    # Checks that +value+ (converted with #to_s) matches IDENTIFIER_REGEX.
    #
    # @param name [Symbol, String] option name, used only in the error message
    # @param value [#to_s] candidate identifier
    # @return [nil] when the value is valid
    # @raise [DataDrain::ConfigurationError] when the value does not match
    def validate_identifier!(name, value)
      candidate = value.to_s
      unless IDENTIFIER_REGEX.match?(candidate)
        message = "#{name} '#{value}' no es un identificador SQL válido"
        raise DataDrain::ConfigurationError, message
      end

      nil
    end
  end
end
|
data/lib/data_drain/version.rb
CHANGED
data/lib/data_drain.rb
CHANGED
|
@@ -4,6 +4,7 @@ require "active_model"
|
|
|
4
4
|
require_relative "data_drain/version"
|
|
5
5
|
require_relative "data_drain/errors"
|
|
6
6
|
require_relative "data_drain/configuration"
|
|
7
|
+
require_relative "data_drain/validations"
|
|
7
8
|
require_relative "data_drain/storage"
|
|
8
9
|
require_relative "data_drain/observability"
|
|
9
10
|
require_relative "data_drain/engine"
|
|
@@ -15,6 +16,7 @@ require_relative "data_drain/glue_runner"
|
|
|
15
16
|
require_relative "data_drain/types/json_type"
|
|
16
17
|
ActiveModel::Type.register(:json, DataDrain::Types::JsonType)
|
|
17
18
|
|
|
19
|
+
# DSL para extraer, archivar y purgar datos entre PostgreSQL y un Data Lake en Parquet.
|
|
18
20
|
module DataDrain
|
|
19
21
|
class << self
|
|
20
22
|
# @return [DataDrain::Configuration]
|