data_drain 0.1.19 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,60 @@
1
+ """
2
+ Script de AWS Glue (PySpark) compatible con DataDrain::GlueRunner.
3
+
4
+ Crear el Job en la consola de AWS Glue (Spark 4.0+) y usar este script como base.
5
+ Argumentos requeridos: JOB_NAME, start_date, end_date, s3_bucket, s3_folder,
6
+ db_url, db_user, db_password, db_table, partition_by.
7
+
8
+ Personalizar la sección de columnas derivadas según las partition_keys de cada tabla.
9
+ """
10
+
11
+ import sys
12
+ from awsglue.utils import getResolvedOptions
13
+ from pyspark.context import SparkContext
14
+ from awsglue.context import GlueContext
15
+ from awsglue.job import Job
16
+ from pyspark.sql.functions import col, year, month
17
+
18
+ args = getResolvedOptions(sys.argv, [
19
+ 'JOB_NAME', 'start_date', 'end_date', 's3_bucket', 's3_folder',
20
+ 'db_url', 'db_user', 'db_password', 'db_table', 'partition_by'
21
+ ])
22
+
23
+ sc = SparkContext()
24
+ glueContext = GlueContext(sc)
25
+ spark = glueContext.spark_session
26
+ job = Job(glueContext)
27
+ job.init(args['JOB_NAME'], args)
28
+
29
+ options = {
30
+ "url": args['db_url'],
31
+ "dbtable": args['db_table'],
32
+ "user": args['db_user'],
33
+ "password": args['db_password'],
34
+ "sampleQuery": (
35
+ f"SELECT * FROM {args['db_table']} "
36
+ f"WHERE created_at >= '{args['start_date']}' "
37
+ f"AND created_at < '{args['end_date']}'"
38
+ )
39
+ }
40
+
41
+ df = spark.read.format("jdbc").options(**options).load()
42
+
43
+ # Agregar columnas derivadas necesarias para las particiones.
44
+ # isp_id ya existe en la tabla fuente — solo agregar las que se calculan.
45
+ # Personalizar esta seccion segun las partition_keys de cada tabla.
46
+ df_final = (
47
+ df.withColumn("year", year(col("created_at")))
48
+ .withColumn("month", month(col("created_at")))
49
+ )
50
+
51
+ output_path = f"s3://{args['s3_bucket']}/{args['s3_folder']}/"
52
+ partitions = args['partition_by'].split(",")
53
+
54
+ (df_final.write.mode("overwrite")
55
+ .partitionBy(*partitions)
56
+ .format("parquet")
57
+ .option("compression", "zstd")
58
+ .save(output_path))
59
+
60
+ job.commit()
@@ -5,6 +5,7 @@ require "pg"
5
5
 
6
6
  module DataDrain
7
7
  # Motor principal de extracción y purga de datos (DataDrain).
8
+ # rubocop:disable Metrics/ClassLength, Metrics/AbcSize, Metrics/MethodLength, Naming/AccessorMethodName
8
9
  #
9
10
  # Orquesta el flujo ETL desde PostgreSQL hacia un Data Lake analítico
10
11
  # delegando la interacción del almacenamiento al adaptador configurado.
@@ -21,29 +22,31 @@ module DataDrain
21
22
  # @option options [Array<String, Symbol>] :partition_keys Columnas para particionar.
22
23
  # @option options [String] :primary_key (Opcional) Clave primaria para borrado. Por defecto 'id'.
23
24
  # @option options [String] :where_clause (Opcional) Condición SQL extra.
24
- # @option options [Boolean] :skip_export (Opcional) Si es true, no realiza el export a Parquet, solo validación y purga.
25
+ # @option options [Boolean] :skip_export (Opcional) Si true, no exporta
26
+ # a Parquet — solo valida y purga (para uso con GlueRunner).
25
27
  def initialize(options)
26
- @start_date = options.fetch(:start_date).beginning_of_day
27
-
28
- # Usamos el inicio del día siguiente como límite superior estricto (<)
29
- # Esto evita problemas de precisión con los microsegundos al usar end_of_day
30
- @end_date = options.fetch(:end_date).to_date.next_day.beginning_of_day
31
-
32
- @table_name = options.fetch(:table_name)
33
- @folder_name = options.fetch(:folder_name, @table_name)
34
- @select_sql = options.fetch(:select_sql, "*")
35
- @partition_keys = options.fetch(:partition_keys)
36
- @primary_key = options.fetch(:primary_key, "id")
37
- @where_clause = options[:where_clause]
38
- @bucket = options[:bucket]
39
- @skip_export = options.fetch(:skip_export, false)
28
+ @start_date = options.fetch(:start_date).beginning_of_day
29
+
30
+ @end_date = options.fetch(:end_date).to_date.next_day.beginning_of_day
40
31
 
41
- @config = DataDrain.configuration
42
- @logger = @config.logger
32
+ @table_name = options.fetch(:table_name)
33
+ Validations.validate_identifier!(:table_name, @table_name)
34
+
35
+ @folder_name = options.fetch(:folder_name, @table_name)
36
+ @select_sql = options.fetch(:select_sql, "*")
37
+ @partition_keys = options.fetch(:partition_keys)
38
+ @primary_key = options.fetch(:primary_key, "id")
39
+ Validations.validate_identifier!(:primary_key, @primary_key)
40
+ @where_clause = options[:where_clause]
41
+ @bucket = options[:bucket]
42
+ @skip_export = options.fetch(:skip_export, false)
43
+
44
+ @config = DataDrain.configuration
45
+ @logger = @config.logger
43
46
  @adapter = DataDrain::Storage.adapter
44
47
 
45
48
  database = DuckDB::Database.open(":memory:")
46
- @duckdb = database.connect
49
+ @duckdb = database.connect
47
50
  end
48
51
 
49
52
  # Ejecuta el flujo completo del motor: Setup, Conteo, Exportación (opcional), Verificación y Purga.
@@ -51,7 +54,8 @@ module DataDrain
51
54
  # @return [Boolean] `true` si el proceso finalizó con éxito, `false` si falló la integridad.
52
55
  def call
53
56
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
54
- safe_log(:info, "engine.start", { table: @table_name, start_date: @start_date.to_date, end_date: @end_date.to_date })
57
+ safe_log(:info, "engine.start",
58
+ { table: @table_name, start_date: @start_date.to_date, end_date: @end_date.to_date })
55
59
 
56
60
  setup_duckdb
57
61
 
@@ -62,7 +66,8 @@ module DataDrain
62
66
 
63
67
  if @pg_count.zero?
64
68
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
65
- safe_log(:info, "engine.skip_empty", { table: @table_name, duration_s: duration.round(2), db_query_duration_s: db_query_duration.round(2) })
69
+ safe_log(:info, "engine.skip_empty",
70
+ { table: @table_name, duration_s: duration.round(2), db_query_duration_s: db_query_duration.round(2) })
66
71
  return true
67
72
  end
68
73
 
@@ -90,18 +95,19 @@ module DataDrain
90
95
 
91
96
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
92
97
  safe_log(:info, "engine.complete", {
93
- table: @table_name,
94
- duration_s: duration.round(2),
95
- db_query_duration_s: db_query_duration.round(2),
96
- export_duration_s: export_duration.round(2),
97
- integrity_duration_s: integrity_duration.round(2),
98
- purge_duration_s: purge_duration.round(2),
99
- count: @pg_count
100
- })
98
+ table: @table_name,
99
+ duration_s: duration.round(2),
100
+ db_query_duration_s: db_query_duration.round(2),
101
+ export_duration_s: export_duration.round(2),
102
+ integrity_duration_s: integrity_duration.round(2),
103
+ purge_duration_s: purge_duration.round(2),
104
+ count: @pg_count
105
+ })
101
106
  true
102
107
  else
103
108
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
104
- safe_log(:error, "engine.integrity_error", { table: @table_name, duration_s: duration.round(2), count: @pg_count })
109
+ safe_log(:error, "engine.integrity_error",
110
+ { table: @table_name, duration_s: duration.round(2), count: @pg_count })
105
111
  false
106
112
  end
107
113
  end
@@ -142,7 +148,12 @@ module DataDrain
142
148
  @adapter.prepare_export_path(@bucket, @folder_name)
143
149
 
144
150
  # Determinamos el path base de destino según el adaptador
145
- dest_path = @config.storage_mode.to_sym == :s3 ? "s3://#{@bucket}/#{@folder_name}/" : File.join(@bucket, @folder_name, "")
151
+ dest_path = if @config.storage_mode.to_sym == :s3
152
+ "s3://#{@bucket}/#{@folder_name}/"
153
+ else
154
+ File.join(@bucket,
155
+ @folder_name, "")
156
+ end
146
157
 
147
158
  pg_sql = "SELECT #{@select_sql} FROM public.#{@table_name} WHERE #{base_where_sql}"
148
159
  pg_sql = pg_sql.gsub("'", "''")
@@ -154,7 +165,7 @@ module DataDrain
154
165
  ) TO '#{dest_path}'
155
166
  (
156
167
  FORMAT PARQUET,
157
- PARTITION_BY (#{@partition_keys.join(', ')}),
168
+ PARTITION_BY (#{@partition_keys.join(", ")}),
158
169
  COMPRESSION 'ZSTD',
159
170
  OVERWRITE_OR_IGNORE 1
160
171
  );
@@ -180,7 +191,8 @@ module DataDrain
180
191
  return false
181
192
  end
182
193
 
183
- safe_log(:info, "engine.integrity_check", { table: @table_name, pg_count: @pg_count, parquet_count: parquet_result })
194
+ safe_log(:info, "engine.integrity_check",
195
+ { table: @table_name, pg_count: @pg_count, parquet_count: parquet_result })
184
196
  @pg_count == parquet_result
185
197
  end
186
198
 
@@ -189,11 +201,11 @@ module DataDrain
189
201
  safe_log(:info, "engine.purge_start", { table: @table_name, batch_size: @config.batch_size })
190
202
 
191
203
  conn = PG.connect(
192
- host: @config.db_host,
193
- port: @config.db_port,
194
- user: @config.db_user,
204
+ host: @config.db_host,
205
+ port: @config.db_port,
206
+ user: @config.db_user,
195
207
  password: @config.db_pass,
196
- dbname: @config.db_name
208
+ dbname: @config.db_name
197
209
  )
198
210
 
199
211
  unless @config.idle_in_transaction_session_timeout.nil?
@@ -223,10 +235,10 @@ module DataDrain
223
235
  # Heartbeat cada 100 lotes para monitorear procesos largos de 1TB
224
236
  if (batches_processed % 100).zero?
225
237
  safe_log(:info, "engine.purge_heartbeat", {
226
- table: @table_name,
227
- batches_processed_count: batches_processed,
228
- rows_deleted_count: total_deleted
229
- })
238
+ table: @table_name,
239
+ batches_processed_count: batches_processed,
240
+ rows_deleted_count: total_deleted
241
+ })
230
242
  end
231
243
 
232
244
  sleep(@config.throttle_delay) if @config.throttle_delay.positive?
@@ -235,4 +247,5 @@ module DataDrain
235
247
  conn&.close
236
248
  end
237
249
  end
250
+ # rubocop:enable Metrics/ClassLength, Metrics/AbcSize, Metrics/MethodLength, Naming/AccessorMethodName
238
251
  end
@@ -6,6 +6,8 @@ module DataDrain
6
6
  # aplicando compresión ZSTD y particionamiento Hive.
7
7
  class FileIngestor
8
8
  include Observability
9
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity,
10
+ # Metrics/MethodLength
9
11
 
10
12
  # @param options [Hash] Opciones de ingestión.
11
13
  # @option options [String] :source_path Ruta absoluta al archivo local.
@@ -14,19 +16,20 @@ module DataDrain
14
16
  # @option options [String] :select_sql (Opcional) Sentencia SELECT para transformar datos al vuelo.
15
17
  # @option options [Boolean] :delete_after_upload (Opcional) Borra el archivo local al terminar. Por defecto true.
16
18
  def initialize(options)
17
- @source_path = options.fetch(:source_path)
18
- @folder_name = options.fetch(:folder_name)
19
- @partition_keys = options.fetch(:partition_keys, [])
20
- @select_sql = options.fetch(:select_sql, "*")
19
+ @source_path = options.fetch(:source_path)
20
+ @folder_name = options.fetch(:folder_name)
21
+ Validations.validate_identifier!(:folder_name, @folder_name)
22
+ @partition_keys = options.fetch(:partition_keys, [])
23
+ @select_sql = options.fetch(:select_sql, "*")
21
24
  @delete_after_upload = options.fetch(:delete_after_upload, true)
22
- @bucket = options[:bucket]
25
+ @bucket = options[:bucket]
23
26
 
24
- @config = DataDrain.configuration
25
- @logger = @config.logger
27
+ @config = DataDrain.configuration
28
+ @logger = @config.logger
26
29
  @adapter = DataDrain::Storage.adapter
27
30
 
28
31
  database = DuckDB::Database.open(":memory:")
29
- @duckdb = database.connect
32
+ @duckdb = database.connect
30
33
  end
31
34
 
32
35
  # Ejecuta el flujo de ingestión.
@@ -52,7 +55,11 @@ module DataDrain
52
55
  step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
53
56
  source_count = @duckdb.query("SELECT COUNT(*) FROM #{reader_function}").first.first
54
57
  source_query_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
55
- safe_log(:info, "file_ingestor.count", { source_path: @source_path, count: source_count, source_query_duration_s: source_query_duration.round(2) })
58
+ safe_log(:info, "file_ingestor.count", {
59
+ source_path: @source_path,
60
+ count: source_count,
61
+ source_query_duration_s: source_query_duration.round(2)
62
+ })
56
63
 
57
64
  if source_count.zero?
58
65
  cleanup_local_file
@@ -63,9 +70,14 @@ module DataDrain
63
70
 
64
71
  # 2. Exportación / Subida
65
72
  @adapter.prepare_export_path(@bucket, @folder_name)
66
- dest_path = @config.storage_mode.to_sym == :s3 ? "s3://#{@bucket}/#{@folder_name}/" : File.join(@bucket, @folder_name, "")
73
+ dest_path = if @config.storage_mode.to_sym == :s3
74
+ "s3://#{@bucket}/#{@folder_name}/"
75
+ else
76
+ File.join(@bucket,
77
+ @folder_name, "")
78
+ end
67
79
 
68
- partition_clause = @partition_keys.any? ? "PARTITION_BY (#{@partition_keys.join(', ')})," : ""
80
+ partition_clause = @partition_keys.any? ? "PARTITION_BY (#{@partition_keys.join(", ")})," : ""
69
81
 
70
82
  query = <<~SQL
71
83
  COPY (
@@ -87,18 +99,19 @@ module DataDrain
87
99
 
88
100
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
89
101
  safe_log(:info, "file_ingestor.complete", {
90
- source_path: @source_path,
91
- duration_s: duration.round(2),
92
- source_query_duration_s: source_query_duration.round(2),
93
- export_duration_s: export_duration.round(2),
94
- count: source_count
95
- })
102
+ source_path: @source_path,
103
+ duration_s: duration.round(2),
104
+ source_query_duration_s: source_query_duration.round(2),
105
+ export_duration_s: export_duration.round(2),
106
+ count: source_count
107
+ })
96
108
 
97
109
  cleanup_local_file
98
110
  true
99
111
  rescue DuckDB::Error => e
100
112
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
101
- safe_log(:error, "file_ingestor.duckdb_error", { source_path: @source_path }.merge(exception_metadata(e)).merge(duration_s: duration.round(2)))
113
+ safe_log(:error, "file_ingestor.duckdb_error",
114
+ { source_path: @source_path }.merge(exception_metadata(e)).merge(duration_s: duration.round(2)))
102
115
  false
103
116
  ensure
104
117
  @duckdb&.close
@@ -109,11 +122,11 @@ module DataDrain
109
122
  # @api private
110
123
  def determine_reader
111
124
  case File.extname(@source_path).downcase
112
- when '.csv'
125
+ when ".csv"
113
126
  "read_csv_auto('#{@source_path}')"
114
- when '.json'
127
+ when ".json"
115
128
  "read_json_auto('#{@source_path}')"
116
- when '.parquet'
129
+ when ".parquet"
117
130
  "read_parquet('#{@source_path}')"
118
131
  else
119
132
  raise DataDrain::Error, "Formato de archivo no soportado para ingestión: #{@source_path}"
@@ -122,10 +135,12 @@ module DataDrain
122
135
 
123
136
  # @api private
124
137
  def cleanup_local_file # deletes @source_path and logs "file_ingestor.cleanup", only when @delete_after_upload is set and the file exists
125
- if @delete_after_upload && File.exist?(@source_path)
126
- File.delete(@source_path)
127
- safe_log(:info, "file_ingestor.cleanup", { source_path: @source_path })
128
- end
138
+ return unless @delete_after_upload && File.exist?(@source_path)
139
+
140
+ File.delete(@source_path)
141
+ safe_log(:info, "file_ingestor.cleanup", { source_path: @source_path })
129
142
  end
130
143
  end
144
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity,
145
+ # Metrics/MethodLength
131
146
  end
@@ -24,10 +24,28 @@ module DataDrain
24
24
  class_attribute :folder_name
25
25
  class_attribute :partition_keys
26
26
 
27
+ # Closes the DuckDB connection and database stored for the current thread and clears the Thread.current[:data_drain_duckdb] slot.
28
+ # Safe to call repeatedly: returns early when nothing is stored and rescues any StandardError raised while closing.
29
+ #
30
+ # Useful in Sidekiq/Puma middlewares to avoid memory leaks in long-lived
31
+ # threads.
32
+ #
33
+ # @return [void]
34
+ def self.disconnect!
35
+ entry = Thread.current[:data_drain_duckdb]
36
+ Thread.current[:data_drain_duckdb] = nil # reset the slot before closing anything
37
+ return unless entry
38
+
39
+ entry[:conn]&.close # NOTE(review): if this raises, entry[:db] is never closed (the rescue below swallows the error)
40
+ entry[:db]&.close
41
+ rescue StandardError # rubocop:disable Lint/SuppressedException
42
+ end
43
+
27
44
  # Retorna la conexión persistente a DuckDB en memoria para el hilo (Thread) actual.
28
45
  # Esto previene tener que recargar extensiones (como httpfs) en cada consulta.
29
46
  #
30
47
  # @return [DuckDB::Connection] Conexión activa a DuckDB.
48
+ # rubocop:disable Metrics/AbcSize
31
49
  def self.connection
32
50
  Thread.current[:data_drain_duckdb] ||= begin
33
51
  db = DuckDB::Database.open(":memory:")
@@ -42,6 +60,7 @@ module DataDrain
42
60
  end
43
61
  Thread.current[:data_drain_duckdb][:conn]
44
62
  end
63
+ # rubocop:enable Metrics/AbcSize
45
64
 
46
65
  # Consulta registros en el Data Lake filtrando por claves de partición.
47
66
  #
@@ -52,7 +71,7 @@ module DataDrain
52
71
  path = build_query_path(partitions)
53
72
 
54
73
  sql = <<~SQL
55
- SELECT #{attribute_names.join(', ')}
74
+ SELECT #{attribute_names.join(", ")}
56
75
  FROM read_parquet('#{path}')
57
76
  ORDER BY created_at DESC
58
77
  LIMIT #{limit}
@@ -73,7 +92,7 @@ module DataDrain
73
92
  safe_id = id.to_s.gsub("'", "''")
74
93
 
75
94
  sql = <<~SQL
76
- SELECT #{attribute_names.join(', ')}
95
+ SELECT #{attribute_names.join(", ")}
77
96
  FROM read_parquet('#{path}')
78
97
  WHERE id = '#{safe_id}'
79
98
  LIMIT 1
@@ -97,7 +116,7 @@ module DataDrain
97
116
  # @return [String] Representación legible en consola.
98
117
  def inspect
99
118
  inspection = attributes.map do |name, value|
100
- "#{name}: #{value.nil? ? 'nil' : value.inspect}"
119
+ "#{name}: #{value.nil? ? "nil" : value.inspect}"
101
120
  end.compact.join(", ")
102
121
 
103
122
  "#<#{self.class} #{inspection}>"
@@ -118,6 +137,7 @@ module DataDrain
118
137
  # @param sql [String]
119
138
  # @param columns [Array<String>]
120
139
  # @return [Array<DataDrain::Record>]
140
+ # rubocop:disable Metrics/MethodLength
121
141
  def execute_and_instantiate(sql, columns)
122
142
  @logger = DataDrain.configuration.logger
123
143
  begin
@@ -133,5 +153,6 @@ module DataDrain
133
153
  end
134
154
  end
135
155
  end
156
+ # rubocop:enable Metrics/MethodLength
136
157
  end
137
158
  end
@@ -4,21 +4,59 @@ module DataDrain
4
4
  module Storage
5
5
  # Implementación del adaptador de almacenamiento para Amazon S3.
6
6
  class S3 < Base
7
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
7
8
  # Carga la extensión httpfs en DuckDB e inyecta las credenciales de AWS.
9
+ # Si aws_access_key_id y aws_secret_access_key están seteados, usa
10
+ # credenciales explícitas. Si no, usa credential_chain (IAM role, env vars,
11
+ # ~/.aws/credentials).
8
12
  # @param connection [DuckDB::Connection]
13
+ # @raise [DataDrain::ConfigurationError] si aws_region no está configurado
9
14
  def setup_duckdb(connection)
10
15
  connection.query("INSTALL httpfs; LOAD httpfs;")
11
- connection.query("SET s3_region='#{@config.aws_region}';")
12
- connection.query("SET s3_access_key_id='#{@config.aws_access_key_id}';")
13
- connection.query("SET s3_secret_access_key='#{@config.aws_secret_access_key}';")
16
+ create_s3_secret(connection)
14
17
  end
15
18
 
19
+ private
20
+
21
+ # @param connection [DuckDB::Connection] connection on which the data_drain_s3 secret (TYPE S3) is created or replaced
22
+ # @raise [DataDrain::ConfigurationError] when @config.aws_region is nil
23
+ def create_s3_secret(connection)
24
+ region = @config.aws_region
25
+ raise DataDrain::ConfigurationError, "aws_region es obligatorio para storage_mode=:s3" if region.nil? # NOTE(review): only nil is rejected; an empty-string region passes
26
+
27
+ if @config.aws_access_key_id && @config.aws_secret_access_key # explicit credentials only when both keys are set
28
+ connection.query(<<~SQL)
29
+ CREATE OR REPLACE SECRET data_drain_s3 (
30
+ TYPE S3,
31
+ KEY_ID '#{escape_sql(@config.aws_access_key_id)}',
32
+ SECRET '#{escape_sql(@config.aws_secret_access_key)}',
33
+ REGION '#{escape_sql(region)}'
34
+ );
35
+ SQL
36
+ else # otherwise the secret uses PROVIDER credential_chain with the region only
37
+ connection.query(<<~SQL)
38
+ CREATE OR REPLACE SECRET data_drain_s3 (
39
+ TYPE S3,
40
+ PROVIDER credential_chain,
41
+ REGION '#{escape_sql(region)}'
42
+ );
43
+ SQL
44
+ end
45
+ end
46
+
47
+ # @param value [String] text to embed in a single-quoted SQL literal (converted with to_s)
48
+ # @return [String] the text with every single quote doubled
49
+ def escape_sql(value)
50
+ value.to_s.gsub("'", "''")
51
+ end
52
+
53
+ public
54
+
16
55
  # @param bucket [String]
17
56
  # @param folder_name [String]
18
57
  # @param partition_path [String, nil]
19
58
  # @return [String]
20
59
  def build_path(bucket, folder_name, partition_path)
21
- # En S3, el base_path actúa como el nombre del bucket
22
60
  base = File.join(bucket, folder_name)
23
61
  base = File.join(base, partition_path) if partition_path && !partition_path.empty?
24
62
  "s3://#{base}/**/*.parquet"
@@ -40,7 +78,7 @@ module DataDrain
40
78
  val = partitions[key]
41
79
  val.nil? || val.to_s.empty? ? "#{key}=[^/]+" : "#{key}=#{val}"
42
80
  end
43
- pattern_regex = Regexp.new("^#{folder_name}/#{regex_parts.join('/')}")
81
+ pattern_regex = Regexp.new("^#{folder_name}/#{regex_parts.join("/")}")
44
82
 
45
83
  objects_to_delete = []
46
84
  prefix = "#{folder_name}/"
@@ -58,7 +96,10 @@ module DataDrain
58
96
 
59
97
  private
60
98
 
61
- # @api private
99
+ # @param client [Aws::S3::Client]
100
+ # @param bucket [String]
101
+ # @param objects_to_delete [Array<Hash>]
102
+ # @return [Integer]
62
103
  def delete_in_batches(client, bucket, objects_to_delete)
63
104
  return 0 if objects_to_delete.empty?
64
105
 
@@ -70,5 +111,6 @@ module DataDrain
70
111
  deleted_count
71
112
  end
72
113
  end
114
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
73
115
  end
74
116
  end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DataDrain
4
+ # Configuration validation helpers that guard against misuse.
5
+ module Validations
6
+ IDENTIFIER_REGEX = /\A[a-zA-Z_][a-zA-Z0-9_]*\z/ # whole string: a letter or underscore, then letters, digits or underscores
7
+
8
+ module_function
9
+
10
+ def validate_identifier!(name, value) # returns nil when value.to_s matches IDENTIFIER_REGEX; otherwise raises DataDrain::ConfigurationError naming the option
11
+ return if IDENTIFIER_REGEX.match?(value.to_s)
12
+
13
+ raise DataDrain::ConfigurationError,
14
+ "#{name} '#{value}' no es un identificador SQL válido"
15
+ end
16
+ end
17
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DataDrain
4
- VERSION = "0.1.19"
4
+ VERSION = "0.2.1"
5
5
  end
data/lib/data_drain.rb CHANGED
@@ -4,6 +4,7 @@ require "active_model"
4
4
  require_relative "data_drain/version"
5
5
  require_relative "data_drain/errors"
6
6
  require_relative "data_drain/configuration"
7
+ require_relative "data_drain/validations"
7
8
  require_relative "data_drain/storage"
8
9
  require_relative "data_drain/observability"
9
10
  require_relative "data_drain/engine"
@@ -15,6 +16,7 @@ require_relative "data_drain/glue_runner"
15
16
  require_relative "data_drain/types/json_type"
16
17
  ActiveModel::Type.register(:json, DataDrain::Types::JsonType)
17
18
 
19
+ # DSL para extraer, archivar y purgar datos entre PostgreSQL y un Data Lake en Parquet.
18
20
  module DataDrain
19
21
  class << self
20
22
  # @return [DataDrain::Configuration]