data_drain 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/README.md +15 -1
- data/lib/data_drain/configuration.rb +4 -1
- data/lib/data_drain/engine.rb +5 -0
- data/lib/data_drain/file_ingestor.rb +3 -0
- data/lib/data_drain/record.rb +5 -0
- data/lib/data_drain/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a5a4767539f7a1e03be113cbdb218eba708228e2d4255734bd6150a913e414a0
|
|
4
|
+
data.tar.gz: d7a5d26b5ce4cb545a5e2132e09d72b65569aad6c2716b681f9c07dcfea80747
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5b2c335c98d509d951d6fdea54ad3068c2fd539a88033ef0d4be8e6c12127d019b24086b02eb60c31e63d65ba99bd51daa2d10e595b46c21a01ebdc1516082dc
|
|
7
|
+
data.tar.gz: 95a6fe6c0b97862c624668a8123383f0206744bfecf09f04e57bc17cc95eba27ea86f0f5d083dd8e948f9490eb640189f8bfcc7b9a39a2c924a419cbe54edda4
|
data/CHANGELOG.md
CHANGED
|
@@ -1,8 +1,16 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.1.7] - 2026-03-16
|
|
4
|
+
|
|
5
|
+
- Se agregó soporte para idle_in_transaction_session_timeout.
|
|
6
|
+
|
|
7
|
+
## [0.1.6] - 2026-03-16
|
|
8
|
+
|
|
9
|
+
- Se agregó el tmp_directory para duckdb.
|
|
10
|
+
|
|
3
11
|
## [0.1.5] - 2026-03-16
|
|
4
12
|
|
|
5
|
-
- Se agrego el
|
|
13
|
+
- Se agregó el attach para duckdb.
|
|
6
14
|
|
|
7
15
|
## [0.1.4] - 2026-03-16
|
|
8
16
|
|
data/README.md
CHANGED
|
@@ -47,12 +47,26 @@ DataDrain.configure do |config|
|
|
|
47
47
|
config.db_pass = ENV.fetch('DB_PASS', '')
|
|
48
48
|
config.db_name = ENV.fetch('DB_NAME', 'core_production')
|
|
49
49
|
|
|
50
|
-
# Rendimiento y Tuning
|
|
50
|
+
# Rendimiento y Tuning de Postgres
|
|
51
51
|
config.batch_size = 5000 # Registros a borrar por transacción
|
|
52
52
|
config.throttle_delay = 0.5 # Segundos de pausa entre borrados
|
|
53
|
+
|
|
54
|
+
# Timeout de inactividad de transacciones en PostgreSQL (en milisegundos).
|
|
55
|
+
# Es útil establecerlo en 0 (deshabilita el timeout) para evitar que la conexión se cierre prematuramente
|
|
56
|
+
# durante el borrado de grandes volúmenes de datos.
|
|
57
|
+
config.idle_in_transaction_session_timeout = 0
|
|
58
|
+
|
|
53
59
|
config.logger = Rails.logger
|
|
54
60
|
|
|
61
|
+
# Tuning de DuckDB
|
|
62
|
+
# Límite máximo de RAM para las consultas en memoria de DuckDB (ej. '2GB', '512MB').
|
|
63
|
+
# Evita que el proceso muera por OOM (Out Of Memory) si el contenedor o servidor tiene memoria limitada.
|
|
55
64
|
config.limit_ram = '2GB'
|
|
65
|
+
|
|
66
|
+
# Directorio temporal de DuckDB para desbordar memoria (spill to disk) durante
|
|
67
|
+
# transformaciones pesadas o creación de archivos Parquet masivos.
|
|
68
|
+
# Es muy recomendable que este directorio resida en un disco SSD/NVMe rápido.
|
|
69
|
+
config.tmp_directory = '/tmp/duckdb_work'
|
|
56
70
|
end
|
|
57
71
|
```
|
|
58
72
|
|
data/lib/data_drain/configuration.rb
CHANGED
|
@@ -8,7 +8,8 @@ module DataDrain
|
|
|
8
8
|
attr_accessor :storage_mode, :aws_region,
|
|
9
9
|
:aws_access_key_id, :aws_secret_access_key,
|
|
10
10
|
:db_host, :db_port, :db_user, :db_pass, :db_name,
|
|
11
|
-
:batch_size, :throttle_delay, :logger, :limit_ram
|
|
11
|
+
:batch_size, :throttle_delay, :logger, :limit_ram, :tmp_directory,
|
|
12
|
+
:idle_in_transaction_session_timeout
|
|
12
13
|
|
|
13
14
|
def initialize
|
|
14
15
|
@storage_mode = :local
|
|
@@ -17,6 +18,8 @@ module DataDrain
|
|
|
17
18
|
@batch_size = 5000
|
|
18
19
|
@throttle_delay = 0.5
|
|
19
20
|
@limit_ram = nil # eg 2GB
|
|
21
|
+
@tmp_directory = nil # eg /tmp/duckdb_work
|
|
22
|
+
@idle_in_transaction_session_timeout = 0
|
|
20
23
|
@logger = Logger.new($stdout)
|
|
21
24
|
end
|
|
22
25
|
|
data/lib/data_drain/engine.rb
CHANGED
|
@@ -81,6 +81,7 @@ module DataDrain
|
|
|
81
81
|
def setup_duckdb
|
|
82
82
|
@duckdb.query("INSTALL postgres; LOAD postgres;")
|
|
83
83
|
@duckdb.query("SET max_memory='#{@config.limit_ram}';") if @config.limit_ram.present?
|
|
84
|
+
@duckdb.query("SET temp_directory='#{@config.tmp_directory}'") if @config.tmp_directory.present?
|
|
84
85
|
@duckdb.query("ATTACH '#{@config.duckdb_connection_string}' AS pg_source (TYPE POSTGRES, READ_ONLY)")
|
|
85
86
|
|
|
86
87
|
# 💡 Magia del Adapter: Él sabe si cargar httpfs y setear credenciales o no hacer nada
|
|
@@ -156,6 +157,10 @@ module DataDrain
|
|
|
156
157
|
dbname: @config.db_name
|
|
157
158
|
)
|
|
158
159
|
|
|
160
|
+
if @config.idle_in_transaction_session_timeout.present?
|
|
161
|
+
conn.exec("SET idle_in_transaction_session_timeout = #{@config.idle_in_transaction_session_timeout};")
|
|
162
|
+
end
|
|
163
|
+
|
|
159
164
|
loop do
|
|
160
165
|
sql = <<~SQL
|
|
161
166
|
DELETE FROM #{@table_name}
|
data/lib/data_drain/file_ingestor.rb
CHANGED
|
@@ -37,6 +37,9 @@ module DataDrain
|
|
|
37
37
|
return false
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
+
@duckdb.query("SET max_memory='#{@config.limit_ram}';") if @config.limit_ram.present?
|
|
41
|
+
@duckdb.query("SET temp_directory='#{@config.tmp_directory}'") if @config.tmp_directory.present?
|
|
42
|
+
|
|
40
43
|
@adapter.setup_duckdb(@duckdb)
|
|
41
44
|
|
|
42
45
|
# Determinamos la función lectora de DuckDB según la extensión del archivo
|
data/lib/data_drain/record.rb
CHANGED
|
@@ -30,6 +30,11 @@ module DataDrain
|
|
|
30
30
|
Thread.current[:data_drain_duckdb_conn] ||= begin
|
|
31
31
|
db = DuckDB::Database.open(":memory:")
|
|
32
32
|
conn = db.connect
|
|
33
|
+
|
|
34
|
+
config = DataDrain.configuration
|
|
35
|
+
conn.query("SET max_memory='#{config.limit_ram}';") if config.limit_ram.present?
|
|
36
|
+
conn.query("SET temp_directory='#{config.tmp_directory}'") if config.tmp_directory.present?
|
|
37
|
+
|
|
33
38
|
DataDrain::Storage.adapter.setup_duckdb(conn)
|
|
34
39
|
conn
|
|
35
40
|
end
|
data/lib/data_drain/version.rb
CHANGED