data_drain 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +15 -1
- data/lib/data_drain/configuration.rb +4 -2
- data/lib/data_drain/engine.rb +4 -0
- data/lib/data_drain/file_ingestor.rb +3 -0
- data/lib/data_drain/record.rb +5 -0
- data/lib/data_drain/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7b52a43c3e135f0ef36762bd0453e5ef39b4ba64872415cfff86a3407edadd1f
|
|
4
|
+
data.tar.gz: f2c77c7173d8a554f3c5aad5fdd4116ce52f91e5c286277be19fab9dfc178153
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3ae238656bb38fdc270654ee7939bb8713b13c97237921302e1568f201c51aaab3cc5489d13e85b3a24f3642083759713e6b581336b3d848c9c4fff7ef84bf1f
|
|
7
|
+
data.tar.gz: a91df374b1ad62de9c2aa4d091da45c852cfecab7a3784ed2e1946fe8d82fdcdbde4800825d569774727a5e50cd6acc4c527f1302ee6ddb1572b68780b553115
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.1.8] - 2026-03-16
|
|
4
|
+
|
|
5
|
+
- Fix: Se cambió la cadena de conexión de DuckDB a formato URI para propagar el timeout de sesión en el ATTACH.
|
|
6
|
+
|
|
7
|
+
## [0.1.7] - 2026-03-16
|
|
8
|
+
|
|
9
|
+
- Se agregó soporte para idle_in_transaction_session_timeout.
|
|
10
|
+
|
|
3
11
|
## [0.1.6] - 2026-03-16
|
|
4
12
|
|
|
5
13
|
- Se agregó el tmp_directory para DuckDB.
|
data/README.md
CHANGED
|
@@ -47,12 +47,26 @@ DataDrain.configure do |config|
|
|
|
47
47
|
config.db_pass = ENV.fetch('DB_PASS', '')
|
|
48
48
|
config.db_name = ENV.fetch('DB_NAME', 'core_production')
|
|
49
49
|
|
|
50
|
-
# Rendimiento y Tuning
|
|
50
|
+
# Rendimiento y Tuning de Postgres
|
|
51
51
|
config.batch_size = 5000 # Registros a borrar por transacción
|
|
52
52
|
config.throttle_delay = 0.5 # Segundos de pausa entre borrados
|
|
53
|
+
|
|
54
|
+
# Timeout de inactividad de transacciones en PostgreSQL (en milisegundos).
|
|
55
|
+
# Útil establecerlo en 0 para evitar que la conexión se cierre prematuramente
|
|
56
|
+
# durante el borrado de grandes volúmenes de datos.
|
|
57
|
+
config.idle_in_transaction_session_timeout = 0
|
|
58
|
+
|
|
53
59
|
config.logger = Rails.logger
|
|
54
60
|
|
|
61
|
+
# Tuning de DuckDB
|
|
62
|
+
# Límite máximo de RAM para las consultas en memoria de DuckDB (ej. '2GB', '512MB').
|
|
63
|
+
# Evita que el proceso falle por OOM (Out Of Memory) si el contenedor o servidor tiene memoria limitada.
|
|
55
64
|
config.limit_ram = '2GB'
|
|
65
|
+
|
|
66
|
+
# Directorio temporal de DuckDB para desbordar memoria (spill to disk) durante
|
|
67
|
+
# transformaciones pesadas o creación de archivos Parquet masivos.
|
|
68
|
+
# Es muy recomendable que este directorio resida en un disco SSD/NVMe rápido.
|
|
69
|
+
config.tmp_directory = '/tmp/duckdb_work'
|
|
56
70
|
end
|
|
57
71
|
```
|
|
58
72
|
|
|
@@ -8,7 +8,8 @@ module DataDrain
|
|
|
8
8
|
attr_accessor :storage_mode, :aws_region,
|
|
9
9
|
:aws_access_key_id, :aws_secret_access_key,
|
|
10
10
|
:db_host, :db_port, :db_user, :db_pass, :db_name,
|
|
11
|
-
:batch_size, :throttle_delay, :logger, :limit_ram, :tmp_directory
|
|
11
|
+
:batch_size, :throttle_delay, :logger, :limit_ram, :tmp_directory,
|
|
12
|
+
:idle_in_transaction_session_timeout
|
|
12
13
|
|
|
13
14
|
def initialize
|
|
14
15
|
@storage_mode = :local
|
|
@@ -18,12 +19,13 @@ module DataDrain
|
|
|
18
19
|
@throttle_delay = 0.5
|
|
19
20
|
@limit_ram = nil # eg 2GB
|
|
20
21
|
@tmp_directory = nil # eg /tmp/duckdb_work
|
|
22
|
+
@idle_in_transaction_session_timeout = 0
|
|
21
23
|
@logger = Logger.new($stdout)
|
|
22
24
|
end
|
|
23
25
|
|
|
24
26
|
# @return [String] Cadena de conexión optimizada para DuckDB.
|
|
25
27
|
def duckdb_connection_string
|
|
26
|
-
"
|
|
28
|
+
"postgresql://#{@db_user}:#{@db_pass}@#{@db_host}:#{@db_port}/#{@db_name}?options=-c%20idle_in_transaction_session_timeout%3D#{@idle_in_transaction_session_timeout}"
|
|
27
29
|
end
|
|
28
30
|
end
|
|
29
31
|
end
|
data/lib/data_drain/engine.rb
CHANGED
|
@@ -157,6 +157,10 @@ module DataDrain
|
|
|
157
157
|
dbname: @config.db_name
|
|
158
158
|
)
|
|
159
159
|
|
|
160
|
+
if @config.idle_in_transaction_session_timeout.present?
|
|
161
|
+
conn.exec("SET idle_in_transaction_session_timeout = #{@config.idle_in_transaction_session_timeout};")
|
|
162
|
+
end
|
|
163
|
+
|
|
160
164
|
loop do
|
|
161
165
|
sql = <<~SQL
|
|
162
166
|
DELETE FROM #{@table_name}
|
|
@@ -37,6 +37,9 @@ module DataDrain
|
|
|
37
37
|
return false
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
+
@duckdb.query("SET max_memory='#{@config.limit_ram}';") if @config.limit_ram.present?
|
|
41
|
+
@duckdb.query("SET temp_directory='#{@config.tmp_directory}'") if @config.tmp_directory.present?
|
|
42
|
+
|
|
40
43
|
@adapter.setup_duckdb(@duckdb)
|
|
41
44
|
|
|
42
45
|
# Determinamos la función lectora de DuckDB según la extensión del archivo
|
data/lib/data_drain/record.rb
CHANGED
|
@@ -30,6 +30,11 @@ module DataDrain
|
|
|
30
30
|
Thread.current[:data_drain_duckdb_conn] ||= begin
|
|
31
31
|
db = DuckDB::Database.open(":memory:")
|
|
32
32
|
conn = db.connect
|
|
33
|
+
|
|
34
|
+
config = DataDrain.configuration
|
|
35
|
+
conn.query("SET max_memory='#{config.limit_ram}';") if config.limit_ram.present?
|
|
36
|
+
conn.query("SET temp_directory='#{config.tmp_directory}'") if config.tmp_directory.present?
|
|
37
|
+
|
|
33
38
|
DataDrain::Storage.adapter.setup_duckdb(conn)
|
|
34
39
|
conn
|
|
35
40
|
end
|
data/lib/data_drain/version.rb
CHANGED