data_drain 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -0
- data/CHANGELOG.md +12 -0
- data/README.md +0 -2
- data/docs/IMPROVEMENT_PLAN.md +1162 -0
- data/docs/execution/archive/v0.2.0.agente-review.md +125 -0
- data/docs/execution/archive/v0.2.0.md +812 -0
- data/docs/glue_pyspark_example.py +60 -0
- data/lib/data_drain/version.rb +1 -1
- data/skill/references/antipatrones.md +1 -1
- metadata +5 -1
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Script de AWS Glue (PySpark) compatible con DataDrain::GlueRunner.
|
|
3
|
+
|
|
4
|
+
Crear el Job en la consola de AWS Glue (Spark 4.0+) y usar este script como base.
|
|
5
|
+
Argumentos requeridos: JOB_NAME, start_date, end_date, s3_bucket, s3_folder,
|
|
6
|
+
db_url, db_user, db_password, db_table, partition_by.
|
|
7
|
+
|
|
8
|
+
Personalizar la sección de columnas derivadas según las partition_keys de cada tabla.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import sys
|
|
12
|
+
from awsglue.utils import getResolvedOptions
|
|
13
|
+
from pyspark.context import SparkContext
|
|
14
|
+
from awsglue.context import GlueContext
|
|
15
|
+
from awsglue.job import Job
|
|
16
|
+
from pyspark.sql.functions import col, year, month
|
|
17
|
+
|
|
18
|
+
args = getResolvedOptions(sys.argv, [
|
|
19
|
+
'JOB_NAME', 'start_date', 'end_date', 's3_bucket', 's3_folder',
|
|
20
|
+
'db_url', 'db_user', 'db_password', 'db_table', 'partition_by'
|
|
21
|
+
])
|
|
22
|
+
|
|
23
|
+
sc = SparkContext()
|
|
24
|
+
glueContext = GlueContext(sc)
|
|
25
|
+
spark = glueContext.spark_session
|
|
26
|
+
job = Job(glueContext)
|
|
27
|
+
job.init(args['JOB_NAME'], args)
|
|
28
|
+
|
|
29
|
+
options = {
|
|
30
|
+
"url": args['db_url'],
|
|
31
|
+
"dbtable": args['db_table'],
|
|
32
|
+
"user": args['db_user'],
|
|
33
|
+
"password": args['db_password'],
|
|
34
|
+
"sampleQuery": (
|
|
35
|
+
f"SELECT * FROM {args['db_table']} "
|
|
36
|
+
f"WHERE created_at >= '{args['start_date']}' "
|
|
37
|
+
f"AND created_at < '{args['end_date']}'"
|
|
38
|
+
)
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
df = spark.read.format("jdbc").options(**options).load()
|
|
42
|
+
|
|
43
|
+
# Agregar columnas derivadas necesarias para las particiones.
|
|
44
|
+
# isp_id ya existe en la tabla fuente — solo agregar las que se calculan.
|
|
45
|
+
# Personalizar esta seccion segun las partition_keys de cada tabla.
|
|
46
|
+
df_final = (
|
|
47
|
+
df.withColumn("year", year(col("created_at")))
|
|
48
|
+
.withColumn("month", month(col("created_at")))
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
output_path = f"s3://{args['s3_bucket']}/{args['s3_folder']}/"
|
|
52
|
+
partitions = args['partition_by'].split(",")
|
|
53
|
+
|
|
54
|
+
(df_final.write.mode("overwrite")
|
|
55
|
+
.partitionBy(*partitions)
|
|
56
|
+
.format("parquet")
|
|
57
|
+
.option("compression", "zstd")
|
|
58
|
+
.save(output_path))
|
|
59
|
+
|
|
60
|
+
job.commit()
|
data/lib/data_drain/version.rb
CHANGED
data/skill/references/antipatrones.md
CHANGED
|
@@ -207,7 +207,7 @@ ArchivedX.connection.close # Rompe la siguiente query del mismo thread
|
|
|
207
207
|
|
|
208
208
|
**Razón:** `Record.connection` es thread-local y persistente — diseñada para amortizar el costo de cargar `httpfs` y credenciales. Cerrarla obliga a reconectar todo en la próxima query y puede dejar el `Thread.current` apuntando a una conexión muerta (`Database` GC'd).
|
|
209
209
|
|
|
210
|
-
**Alternativa:** No
|
|
210
|
+
**Alternativa:** No usar `Record.connection.close` directamente. Si necesitás cerrar (Sidekiq/Puma middleware), usar `Record.disconnect!` que cierra `db` + `conn` y limpia `Thread.current` atómicamente. En threads de larga vida, esto previene memory leak.
|
|
211
211
|
|
|
212
212
|
---
|
|
213
213
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_drain
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.2.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gabriel
|
|
@@ -97,6 +97,10 @@ files:
|
|
|
97
97
|
- README.md
|
|
98
98
|
- Rakefile
|
|
99
99
|
- data_drain.gemspec
|
|
100
|
+
- docs/IMPROVEMENT_PLAN.md
|
|
101
|
+
- docs/execution/archive/v0.2.0.agente-review.md
|
|
102
|
+
- docs/execution/archive/v0.2.0.md
|
|
103
|
+
- docs/glue_pyspark_example.py
|
|
100
104
|
- lib/data_drain.rb
|
|
101
105
|
- lib/data_drain/configuration.rb
|
|
102
106
|
- lib/data_drain/engine.rb
|