data_drain 0.1.13 → 0.1.14

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9c484ac47a5f767491fa8d8e48dbdb53ccdd55d756a6a0eb90d7bbeb0d28f68a
4
- data.tar.gz: 18526e071ac821f7c19127cb53dad875108ede9ab9b7bfe40a1d17bde877a6cc
3
+ metadata.gz: 97d660cb624931d75d6f39e51527c58faf180b7ab727d9c85a7fa44079dc76a0
4
+ data.tar.gz: 932c85dcf3542e52b0f3981281e6a93a757ac194153c8b0b7080a79857613ed5
5
5
  SHA512:
6
- metadata.gz: abf18e9f987f24cb2e58fb71be8a15f92f867f6e671b174e6414b7a44a5626a316235c091bd59708c1ddc93c755db87ec92af117573c68213d0f2238165728be
7
- data.tar.gz: '00124804ef7f7c9dc2c67d47a1a2304d4dc996b0caff24548acbe913e85f5ae43d410eac6725f264f9b9648d49c7dc8bdc0baed77e5ea958bfa3fc8cea08ee9d'
6
+ metadata.gz: d30e7aaf152e576821b2b2c9a3a68cba01a4c3db6941209e0d0ad0ffb7f69f763e5cf93bd90ac0964a4a2b9b5a5582e348c6f9f5599a5c3ddb24df45168e6418
7
+ data.tar.gz: f71de76a5075e99eea50a83d0c0d1831091c011a2a64e17b4f3ea206fe8f50ec4bcd2309dfb3096478995c75b4bbfc384431af0d5a5bf3ff446522fa06857891
@@ -0,0 +1,24 @@
1
+ {
2
+ "hooks": {
3
+ "Notification": [
4
+ {
5
+ "hooks": [
6
+ {
7
+ "type": "command",
8
+ "command": "curl -sf -X POST -H \"Content-Type: application/json\" -H \"X-Emdash-Token: $EMDASH_HOOK_TOKEN\" -H \"X-Emdash-Pty-Id: $EMDASH_PTY_ID\" -H \"X-Emdash-Event-Type: notification\" -d @- \"http://127.0.0.1:$EMDASH_HOOK_PORT/hook\" || true"
9
+ }
10
+ ]
11
+ }
12
+ ],
13
+ "Stop": [
14
+ {
15
+ "hooks": [
16
+ {
17
+ "type": "command",
18
+ "command": "curl -sf -X POST -H \"Content-Type: application/json\" -H \"X-Emdash-Token: $EMDASH_HOOK_TOKEN\" -H \"X-Emdash-Pty-Id: $EMDASH_PTY_ID\" -H \"X-Emdash-Event-Type: stop\" -d @- \"http://127.0.0.1:$EMDASH_HOOK_PORT/hook\" || true"
19
+ }
20
+ ]
21
+ }
22
+ ]
23
+ }
24
+ }
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.1.14] - 2026-03-17
4
+
5
+ - Feature: Implementación de **Logging Estructurado** en toda la gema (\`key=value\`) para mejor observabilidad en producción.
6
+ - Optimization: Caching automático de adaptadores de almacenamiento para mejorar el rendimiento de consultas repetidas.
7
+ - Testing: Mejora en la robustez de los tests de \`Engine\` desacoplándolos de cambios menores en el setup de DuckDB.
8
+
3
9
  ## [0.1.13] - 2026-03-17
4
10
 
5
11
  - Feature: Parametrización total en la orquestación con Glue. Se añadieron \`s3_bucket\`, \`s3_folder\` y \`partition_by\` como argumentos dinámicos, permitiendo que el mismo Job de Glue sirva para múltiples tablas y destinos.
@@ -49,30 +49,30 @@ module DataDrain
49
49
  #
50
50
  # @return [Boolean] `true` si el proceso finalizó con éxito, `false` si falló la integridad.
51
51
  def call
52
- @logger.info "[DataDrain Engine] 🚀 Preparando '#{@table_name}' (#{@start_date.to_date} a #{@end_date.to_date})..."
52
+ @logger.info "component=data_drain event=engine.start table=#{@table_name} start_date=#{@start_date.to_date} end_date=#{@end_date.to_date}"
53
53
 
54
54
  setup_duckdb
55
55
 
56
56
  @pg_count = get_postgres_count
57
57
 
58
58
  if @pg_count.zero?
59
- @logger.info "[DataDrain Engine] ⏭️ No hay registros que cumplan las condiciones."
59
+ @logger.info "component=data_drain event=engine.skip_empty table=#{@table_name}"
60
60
  return true
61
61
  end
62
62
 
63
63
  if @skip_export
64
- @logger.info "[DataDrain Engine] ⏭️ Modo 'Skip Export' activo. Saltando paso de exportación..."
64
+ @logger.info "component=data_drain event=engine.skip_export table=#{@table_name}"
65
65
  else
66
- @logger.info "[DataDrain Engine] 📦 Exportando #{@pg_count} registros a Parquet..."
66
+ @logger.info "component=data_drain event=engine.export_start table=#{@table_name} count=#{@pg_count}"
67
67
  export_to_parquet
68
68
  end
69
69
 
70
70
  if verify_integrity
71
71
  purge_from_postgres
72
- @logger.info "[DataDrain Engine] ✅ Proceso completado exitosamente para '#{@table_name}'."
72
+ @logger.info "component=data_drain event=engine.complete table=#{@table_name}"
73
73
  true
74
74
  else
75
- @logger.error "[DataDrain Engine] ❌ ERROR de integridad en '#{@table_name}'. Abortando purga."
75
+ @logger.error "component=data_drain event=engine.integrity_error table=#{@table_name}"
76
76
  false
77
77
  end
78
78
  end
@@ -147,17 +147,17 @@ module DataDrain
147
147
  SQL
148
148
  parquet_result = @duckdb.query(query).first.first
149
149
  rescue DuckDB::Error => e
150
- @logger.error "[DataDrain Engine] Error leyendo Parquet: #{e.message}"
150
+ @logger.error "component=data_drain event=engine.parquet_read_error table=#{@table_name} error=#{e.message}"
151
151
  return false
152
152
  end
153
153
 
154
- @logger.info "[DataDrain Engine] 📊 Verificación -> Postgres: #{@pg_count} | Parquet: #{parquet_result}"
154
+ @logger.info "component=data_drain event=engine.integrity_check table=#{@table_name} pg_count=#{@pg_count} parquet_count=#{parquet_result}"
155
155
  @pg_count == parquet_result
156
156
  end
157
157
 
158
158
  # @api private
159
159
  def purge_from_postgres
160
- @logger.info "[DataDrain Engine] 🗑️ Purgando en base de datos (Lotes de #{@config.batch_size})..."
160
+ @logger.info "component=data_drain event=engine.purge_start table=#{@table_name} batch_size=#{@config.batch_size}"
161
161
 
162
162
  conn = PG.connect(
163
163
  host: @config.db_host,
@@ -30,10 +30,10 @@ module DataDrain
30
30
  # Ejecuta el flujo de ingestión.
31
31
  # @return [Boolean] true si el proceso fue exitoso.
32
32
  def call
33
- @logger.info "[DataDrain FileIngestor] 🚀 Iniciando ingestión de '#{@source_path}'..."
33
+ @logger.info "component=data_drain event=file_ingestor.start source_path=#{@source_path}"
34
34
 
35
35
  unless File.exist?(@source_path)
36
- @logger.error "[DataDrain FileIngestor] ❌ El archivo origen no existe: #{@source_path}"
36
+ @logger.error "component=data_drain event=file_ingestor.file_not_found source_path=#{@source_path}"
37
37
  return false
38
38
  end
39
39
 
@@ -47,7 +47,7 @@ module DataDrain
47
47
 
48
48
  # 1. Conteo de seguridad
49
49
  source_count = @duckdb.query("SELECT COUNT(*) FROM #{reader_function}").first.first
50
- @logger.info "[DataDrain FileIngestor] 📊 Encontrados #{source_count} registros para procesar."
50
+ @logger.info "component=data_drain event=file_ingestor.count source_path=#{@source_path} count=#{source_count}"
51
51
 
52
52
  if source_count.zero?
53
53
  cleanup_local_file
@@ -73,15 +73,15 @@ module DataDrain
73
73
  );
74
74
  SQL
75
75
 
76
- @logger.info "[DataDrain FileIngestor] ☁️ Escribiendo en el Data Lake..."
76
+ @logger.info "component=data_drain event=file_ingestor.export_start dest_path=#{dest_path}"
77
77
  @duckdb.query(query)
78
78
 
79
- @logger.info "[DataDrain FileIngestor] ✅ Archivo ingerido y comprimido exitosamente."
79
+ @logger.info "component=data_drain event=file_ingestor.complete source_path=#{@source_path}"
80
80
 
81
81
  cleanup_local_file
82
82
  true
83
83
  rescue DuckDB::Error => e
84
- @logger.error "[DataDrain FileIngestor] Error de DuckDB durante la ingestión: #{e.message}"
84
+ @logger.error "component=data_drain event=file_ingestor.duckdb_error source_path=#{@source_path} error=#{e.message}"
85
85
  false
86
86
  ensure
87
87
  @duckdb&.close
@@ -107,7 +107,7 @@ module DataDrain
107
107
  def cleanup_local_file
108
108
  if @delete_after_upload && File.exist?(@source_path)
109
109
  File.delete(@source_path)
110
- @logger.info "[DataDrain FileIngestor] 🗑️ Archivo temporal local eliminado."
110
+ @logger.info "component=data_drain event=file_ingestor.cleanup source_path=#{@source_path}"
111
111
  end
112
112
  end
113
113
  end
@@ -17,7 +17,7 @@ module DataDrain
17
17
  config = DataDrain.configuration
18
18
  client = Aws::Glue::Client.new(region: config.aws_region)
19
19
 
20
- config.logger.info "[DataDrain GlueRunner] 🚀 Disparando Job: #{job_name}..."
20
+ config.logger.info "component=data_drain event=glue_runner.start job=#{job_name}"
21
21
  resp = client.start_job_run(job_name: job_name, arguments: arguments)
22
22
  run_id = resp.job_run_id
23
23
 
@@ -27,14 +27,14 @@ module DataDrain
27
27
 
28
28
  case status
29
29
  when "SUCCEEDED"
30
- config.logger.info "[DataDrain GlueRunner] Job completado con éxito."
30
+ config.logger.info "component=data_drain event=glue_runner.complete job=#{job_name} run_id=#{run_id}"
31
31
  return true
32
32
  when "FAILED", "STOPPED", "TIMEOUT"
33
33
  error_msg = run_info.error_message || "Sin mensaje de error disponible."
34
- config.logger.error "[DataDrain GlueRunner] ERROR: El Job terminó con estado #{status}: #{error_msg}"
34
+ config.logger.error "component=data_drain event=glue_runner.failed job=#{job_name} run_id=#{run_id} status=#{status} error=#{error_msg}"
35
35
  raise "Glue Job #{job_name} (Run ID: #{run_id}) falló con estado #{status}."
36
36
  else
37
- config.logger.info "[DataDrain GlueRunner] Estado: #{status}. Esperando #{polling_interval}s..."
37
+ config.logger.info "component=data_drain event=glue_runner.polling job=#{job_name} run_id=#{run_id} status=#{status} next_check_in=#{polling_interval}s"
38
38
  sleep polling_interval
39
39
  end
40
40
  end
@@ -85,7 +85,7 @@ module DataDrain
85
85
  # @return [Integer] Cantidad de particiones físicas eliminadas.
86
86
  def self.destroy_all(**partitions)
87
87
  adapter = DataDrain::Storage.adapter
88
- DataDrain.configuration.logger.info "[DataDrain] 🗑️ Ejecutando destroy_all en #{folder_name} con: #{partitions.inspect}"
88
+ DataDrain.configuration.logger.info "component=data_drain event=record.destroy_all folder=#{folder_name} partitions=#{partitions.inspect}"
89
89
 
90
90
  adapter.destroy_partitions(bucket, folder_name, partition_keys, partitions)
91
91
  end
@@ -118,7 +118,7 @@ module DataDrain
118
118
  begin
119
119
  result = connection.query(sql)
120
120
  rescue DuckDB::Error => e
121
- DataDrain.configuration.logger.warn "[DataDrain] ⚠️ Ruta o archivo no encontrado: #{e.message}"
121
+ DataDrain.configuration.logger.warn "component=data_drain event=record.parquet_not_found error=#{e.message}"
122
122
  return []
123
123
  end
124
124
 
@@ -11,20 +11,28 @@ module DataDrain
11
11
  class InvalidAdapterError < DataDrain::Error; end
12
12
 
13
13
  # Resuelve e instancia el adaptador de almacenamiento correspondiente
14
- # basándose en la configuración actual del framework.
14
+ # basándose en la configuración actual del framework. La instancia se
15
+ # cachea para evitar allocations innecesarias entre queries.
15
16
  #
16
17
  # @return [DataDrain::Storage::Base] Una instancia de Local o S3.
17
18
  # @raise [InvalidAdapterError] Si el storage_mode no es válido.
18
19
  def self.adapter
19
- mode = DataDrain.configuration.storage_mode
20
- case mode.to_sym
21
- when :local
22
- Local.new(DataDrain.configuration)
23
- when :s3
24
- S3.new(DataDrain.configuration)
25
- else
26
- raise InvalidAdapterError, "Storage mode '#{mode}' no está soportado."
20
+ @adapter ||= begin
21
+ mode = DataDrain.configuration.storage_mode
22
+ case mode.to_sym
23
+ when :local
24
+ Local.new(DataDrain.configuration)
25
+ when :s3
26
+ S3.new(DataDrain.configuration)
27
+ else
28
+ raise InvalidAdapterError, "Storage mode '#{mode}' no está soportado."
29
+ end
27
30
  end
28
31
  end
32
+
33
+ # Descarta el adaptador cacheado. Llamar cuando cambia storage_mode.
34
+ def self.reset_adapter!
35
+ @adapter = nil
36
+ end
29
37
  end
30
38
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DataDrain
4
- VERSION = "0.1.13"
4
+ VERSION = "0.1.14"
5
5
  end
data/lib/data_drain.rb CHANGED
@@ -29,6 +29,7 @@ module DataDrain
29
29
  # @api private
30
30
  def reset_configuration!
31
31
  @configuration = Configuration.new
32
+ DataDrain::Storage.reset_adapter!
32
33
  end
33
34
  end
34
35
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_drain
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.13
4
+ version: 0.1.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gabriel
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-03-20 00:00:00.000000000 Z
11
+ date: 2026-03-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -88,6 +88,7 @@ executables: []
88
88
  extensions: []
89
89
  extra_rdoc_files: []
90
90
  files:
91
+ - ".claude/settings.local.json"
91
92
  - ".rspec"
92
93
  - ".rubocop.yml"
93
94
  - CHANGELOG.md