data_drain 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +24 -0
- data/CHANGELOG.md +6 -0
- data/lib/data_drain/engine.rb +9 -9
- data/lib/data_drain/file_ingestor.rb +7 -7
- data/lib/data_drain/glue_runner.rb +4 -4
- data/lib/data_drain/record.rb +2 -2
- data/lib/data_drain/storage.rb +17 -9
- data/lib/data_drain/version.rb +1 -1
- data/lib/data_drain.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 97d660cb624931d75d6f39e51527c58faf180b7ab727d9c85a7fa44079dc76a0
|
|
4
|
+
data.tar.gz: 932c85dcf3542e52b0f3981281e6a93a757ac194153c8b0b7080a79857613ed5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d30e7aaf152e576821b2b2c9a3a68cba01a4c3db6941209e0d0ad0ffb7f69f763e5cf93bd90ac0964a4a2b9b5a5582e348c6f9f5599a5c3ddb24df45168e6418
|
|
7
|
+
data.tar.gz: f71de76a5075e99eea50a83d0c0d1831091c011a2a64e17b4f3ea206fe8f50ec4bcd2309dfb3096478995c75b4bbfc384431af0d5a5bf3ff446522fa06857891
|
data/.claude/settings.local.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"hooks": {
|
|
3
|
+
"Notification": [
|
|
4
|
+
{
|
|
5
|
+
"hooks": [
|
|
6
|
+
{
|
|
7
|
+
"type": "command",
|
|
8
|
+
"command": "curl -sf -X POST -H \"Content-Type: application/json\" -H \"X-Emdash-Token: $EMDASH_HOOK_TOKEN\" -H \"X-Emdash-Pty-Id: $EMDASH_PTY_ID\" -H \"X-Emdash-Event-Type: notification\" -d @- \"http://127.0.0.1:$EMDASH_HOOK_PORT/hook\" || true"
|
|
9
|
+
}
|
|
10
|
+
]
|
|
11
|
+
}
|
|
12
|
+
],
|
|
13
|
+
"Stop": [
|
|
14
|
+
{
|
|
15
|
+
"hooks": [
|
|
16
|
+
{
|
|
17
|
+
"type": "command",
|
|
18
|
+
"command": "curl -sf -X POST -H \"Content-Type: application/json\" -H \"X-Emdash-Token: $EMDASH_HOOK_TOKEN\" -H \"X-Emdash-Pty-Id: $EMDASH_PTY_ID\" -H \"X-Emdash-Event-Type: stop\" -d @- \"http://127.0.0.1:$EMDASH_HOOK_PORT/hook\" || true"
|
|
19
|
+
}
|
|
20
|
+
]
|
|
21
|
+
}
|
|
22
|
+
]
|
|
23
|
+
}
|
|
24
|
+
}
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.1.14] - 2026-03-17
|
|
4
|
+
|
|
5
|
+
- Feature: Implementación de **Logging Estructurado** en toda la gema (\`key=value\`) para mejor observabilidad en producción.
|
|
6
|
+
- Optimization: Caching automático de adaptadores de almacenamiento para mejorar el rendimiento de consultas repetidas.
|
|
7
|
+
- Testing: Mejora en la robustez de los tests de \`Engine\` desacoplándolos de cambios menores en el setup de DuckDB.
|
|
8
|
+
|
|
3
9
|
## [0.1.13] - 2026-03-17
|
|
4
10
|
|
|
5
11
|
- Feature: Parametrización total en la orquestación con Glue. Se añadieron \`s3_bucket\`, \`s3_folder\` y \`partition_by\` como argumentos dinámicos, permitiendo que el mismo Job de Glue sirva para múltiples tablas y destinos.
|
data/lib/data_drain/engine.rb
CHANGED
|
@@ -49,30 +49,30 @@ module DataDrain
|
|
|
49
49
|
#
|
|
50
50
|
# @return [Boolean] `true` si el proceso finalizó con éxito, `false` si falló la integridad.
|
|
51
51
|
def call
|
|
52
|
-
@logger.info "
|
|
52
|
+
@logger.info "component=data_drain event=engine.start table=#{@table_name} start_date=#{@start_date.to_date} end_date=#{@end_date.to_date}"
|
|
53
53
|
|
|
54
54
|
setup_duckdb
|
|
55
55
|
|
|
56
56
|
@pg_count = get_postgres_count
|
|
57
57
|
|
|
58
58
|
if @pg_count.zero?
|
|
59
|
-
@logger.info "
|
|
59
|
+
@logger.info "component=data_drain event=engine.skip_empty table=#{@table_name}"
|
|
60
60
|
return true
|
|
61
61
|
end
|
|
62
62
|
|
|
63
63
|
if @skip_export
|
|
64
|
-
@logger.info "
|
|
64
|
+
@logger.info "component=data_drain event=engine.skip_export table=#{@table_name}"
|
|
65
65
|
else
|
|
66
|
-
@logger.info "
|
|
66
|
+
@logger.info "component=data_drain event=engine.export_start table=#{@table_name} count=#{@pg_count}"
|
|
67
67
|
export_to_parquet
|
|
68
68
|
end
|
|
69
69
|
|
|
70
70
|
if verify_integrity
|
|
71
71
|
purge_from_postgres
|
|
72
|
-
@logger.info "
|
|
72
|
+
@logger.info "component=data_drain event=engine.complete table=#{@table_name}"
|
|
73
73
|
true
|
|
74
74
|
else
|
|
75
|
-
@logger.error "
|
|
75
|
+
@logger.error "component=data_drain event=engine.integrity_error table=#{@table_name}"
|
|
76
76
|
false
|
|
77
77
|
end
|
|
78
78
|
end
|
|
@@ -147,17 +147,17 @@ module DataDrain
|
|
|
147
147
|
SQL
|
|
148
148
|
parquet_result = @duckdb.query(query).first.first
|
|
149
149
|
rescue DuckDB::Error => e
|
|
150
|
-
@logger.error "
|
|
150
|
+
@logger.error "component=data_drain event=engine.parquet_read_error table=#{@table_name} error=#{e.message}"
|
|
151
151
|
return false
|
|
152
152
|
end
|
|
153
153
|
|
|
154
|
-
@logger.info "
|
|
154
|
+
@logger.info "component=data_drain event=engine.integrity_check table=#{@table_name} pg_count=#{@pg_count} parquet_count=#{parquet_result}"
|
|
155
155
|
@pg_count == parquet_result
|
|
156
156
|
end
|
|
157
157
|
|
|
158
158
|
# @api private
|
|
159
159
|
def purge_from_postgres
|
|
160
|
-
@logger.info "
|
|
160
|
+
@logger.info "component=data_drain event=engine.purge_start table=#{@table_name} batch_size=#{@config.batch_size}"
|
|
161
161
|
|
|
162
162
|
conn = PG.connect(
|
|
163
163
|
host: @config.db_host,
|
data/lib/data_drain/file_ingestor.rb
CHANGED
|
@@ -30,10 +30,10 @@ module DataDrain
|
|
|
30
30
|
# Ejecuta el flujo de ingestión.
|
|
31
31
|
# @return [Boolean] true si el proceso fue exitoso.
|
|
32
32
|
def call
|
|
33
|
-
@logger.info "
|
|
33
|
+
@logger.info "component=data_drain event=file_ingestor.start source_path=#{@source_path}"
|
|
34
34
|
|
|
35
35
|
unless File.exist?(@source_path)
|
|
36
|
-
@logger.error "
|
|
36
|
+
@logger.error "component=data_drain event=file_ingestor.file_not_found source_path=#{@source_path}"
|
|
37
37
|
return false
|
|
38
38
|
end
|
|
39
39
|
|
|
@@ -47,7 +47,7 @@ module DataDrain
|
|
|
47
47
|
|
|
48
48
|
# 1. Conteo de seguridad
|
|
49
49
|
source_count = @duckdb.query("SELECT COUNT(*) FROM #{reader_function}").first.first
|
|
50
|
-
@logger.info "
|
|
50
|
+
@logger.info "component=data_drain event=file_ingestor.count source_path=#{@source_path} count=#{source_count}"
|
|
51
51
|
|
|
52
52
|
if source_count.zero?
|
|
53
53
|
cleanup_local_file
|
|
@@ -73,15 +73,15 @@ module DataDrain
|
|
|
73
73
|
);
|
|
74
74
|
SQL
|
|
75
75
|
|
|
76
|
-
@logger.info "
|
|
76
|
+
@logger.info "component=data_drain event=file_ingestor.export_start dest_path=#{dest_path}"
|
|
77
77
|
@duckdb.query(query)
|
|
78
78
|
|
|
79
|
-
@logger.info "
|
|
79
|
+
@logger.info "component=data_drain event=file_ingestor.complete source_path=#{@source_path}"
|
|
80
80
|
|
|
81
81
|
cleanup_local_file
|
|
82
82
|
true
|
|
83
83
|
rescue DuckDB::Error => e
|
|
84
|
-
@logger.error "
|
|
84
|
+
@logger.error "component=data_drain event=file_ingestor.duckdb_error source_path=#{@source_path} error=#{e.message}"
|
|
85
85
|
false
|
|
86
86
|
ensure
|
|
87
87
|
@duckdb&.close
|
|
@@ -107,7 +107,7 @@ module DataDrain
|
|
|
107
107
|
def cleanup_local_file
|
|
108
108
|
if @delete_after_upload && File.exist?(@source_path)
|
|
109
109
|
File.delete(@source_path)
|
|
110
|
-
@logger.info "
|
|
110
|
+
@logger.info "component=data_drain event=file_ingestor.cleanup source_path=#{@source_path}"
|
|
111
111
|
end
|
|
112
112
|
end
|
|
113
113
|
end
|
data/lib/data_drain/glue_runner.rb
CHANGED
|
@@ -17,7 +17,7 @@ module DataDrain
|
|
|
17
17
|
config = DataDrain.configuration
|
|
18
18
|
client = Aws::Glue::Client.new(region: config.aws_region)
|
|
19
19
|
|
|
20
|
-
config.logger.info "
|
|
20
|
+
config.logger.info "component=data_drain event=glue_runner.start job=#{job_name}"
|
|
21
21
|
resp = client.start_job_run(job_name: job_name, arguments: arguments)
|
|
22
22
|
run_id = resp.job_run_id
|
|
23
23
|
|
|
@@ -27,14 +27,14 @@ module DataDrain
|
|
|
27
27
|
|
|
28
28
|
case status
|
|
29
29
|
when "SUCCEEDED"
|
|
30
|
-
config.logger.info "
|
|
30
|
+
config.logger.info "component=data_drain event=glue_runner.complete job=#{job_name} run_id=#{run_id}"
|
|
31
31
|
return true
|
|
32
32
|
when "FAILED", "STOPPED", "TIMEOUT"
|
|
33
33
|
error_msg = run_info.error_message || "Sin mensaje de error disponible."
|
|
34
|
-
config.logger.error "
|
|
34
|
+
config.logger.error "component=data_drain event=glue_runner.failed job=#{job_name} run_id=#{run_id} status=#{status} error=#{error_msg}"
|
|
35
35
|
raise "Glue Job #{job_name} (Run ID: #{run_id}) falló con estado #{status}."
|
|
36
36
|
else
|
|
37
|
-
config.logger.info "
|
|
37
|
+
config.logger.info "component=data_drain event=glue_runner.polling job=#{job_name} run_id=#{run_id} status=#{status} next_check_in=#{polling_interval}s"
|
|
38
38
|
sleep polling_interval
|
|
39
39
|
end
|
|
40
40
|
end
|
data/lib/data_drain/record.rb
CHANGED
|
@@ -85,7 +85,7 @@ module DataDrain
|
|
|
85
85
|
# @return [Integer] Cantidad de particiones físicas eliminadas.
|
|
86
86
|
def self.destroy_all(**partitions)
|
|
87
87
|
adapter = DataDrain::Storage.adapter
|
|
88
|
-
DataDrain.configuration.logger.info "
|
|
88
|
+
DataDrain.configuration.logger.info "component=data_drain event=record.destroy_all folder=#{folder_name} partitions=#{partitions.inspect}"
|
|
89
89
|
|
|
90
90
|
adapter.destroy_partitions(bucket, folder_name, partition_keys, partitions)
|
|
91
91
|
end
|
|
@@ -118,7 +118,7 @@ module DataDrain
|
|
|
118
118
|
begin
|
|
119
119
|
result = connection.query(sql)
|
|
120
120
|
rescue DuckDB::Error => e
|
|
121
|
-
DataDrain.configuration.logger.warn "
|
|
121
|
+
DataDrain.configuration.logger.warn "component=data_drain event=record.parquet_not_found error=#{e.message}"
|
|
122
122
|
return []
|
|
123
123
|
end
|
|
124
124
|
|
data/lib/data_drain/storage.rb
CHANGED
|
@@ -11,20 +11,28 @@ module DataDrain
|
|
|
11
11
|
class InvalidAdapterError < DataDrain::Error; end
|
|
12
12
|
|
|
13
13
|
# Resuelve e instancia el adaptador de almacenamiento correspondiente
|
|
14
|
-
# basándose en la configuración actual del framework.
|
|
14
|
+
# basándose en la configuración actual del framework. La instancia se
|
|
15
|
+
# cachea para evitar allocations innecesarias entre queries.
|
|
15
16
|
#
|
|
16
17
|
# @return [DataDrain::Storage::Base] Una instancia de Local o S3.
|
|
17
18
|
# @raise [InvalidAdapterError] Si el storage_mode no es válido.
|
|
18
19
|
def self.adapter
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
20
|
+
@adapter ||= begin
|
|
21
|
+
mode = DataDrain.configuration.storage_mode
|
|
22
|
+
case mode.to_sym
|
|
23
|
+
when :local
|
|
24
|
+
Local.new(DataDrain.configuration)
|
|
25
|
+
when :s3
|
|
26
|
+
S3.new(DataDrain.configuration)
|
|
27
|
+
else
|
|
28
|
+
raise InvalidAdapterError, "Storage mode '#{mode}' no está soportado."
|
|
29
|
+
end
|
|
27
30
|
end
|
|
28
31
|
end
|
|
32
|
+
|
|
33
|
+
# Descarta el adaptador cacheado. Llamar cuando cambia storage_mode.
|
|
34
|
+
def self.reset_adapter!
|
|
35
|
+
@adapter = nil
|
|
36
|
+
end
|
|
29
37
|
end
|
|
30
38
|
end
|
data/lib/data_drain/version.rb
CHANGED
data/lib/data_drain.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_drain
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.13
|
|
4
|
+
version: 0.1.14
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gabriel
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-22 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activemodel
|
|
@@ -88,6 +88,7 @@ executables: []
|
|
|
88
88
|
extensions: []
|
|
89
89
|
extra_rdoc_files: []
|
|
90
90
|
files:
|
|
91
|
+
- ".claude/settings.local.json"
|
|
91
92
|
- ".rspec"
|
|
92
93
|
- ".rubocop.yml"
|
|
93
94
|
- CHANGELOG.md
|