sws-spark-dissemination-helper 0.0.128__tar.gz → 0.0.130__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/PKG-INFO +1 -1
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/pyproject.toml +1 -1
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +103 -3
- sws_spark_dissemination_helper-0.0.130/tests/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/.gitignore +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/LICENSE +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/README.md +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/old_requirements.txt +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/requirements.txt +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +0 -0
- /sws_spark_dissemination_helper-0.0.128/tests/__init__.py → /sws_spark_dissemination_helper-0.0.130/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/constants.py +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/utils.py +0 -0
- {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/tests/test.py +0 -0
{sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.128
+Version: 0.0.130
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
{sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py
@@ -159,7 +159,7 @@ class SWSEasyIcebergSparkHelper:
             left join reference_data.dim_element_fao d1 on d1.id = oc.dim_element_fao
             left join reference_data.dim_item_cpc d2 on d2.id = oc.dim_item_cpc
             left join reference_data.dim_time_series_years d3 on d3.id = oc.dim_time_series_years
-            where o.replaced_on is null
+            where o.replaced_on is null
             """
 
         # ----------------
@@ -229,6 +229,45 @@ class SWSEasyIcebergSparkHelper:
 
         return df_obs_denorm
 
+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            select o.id,
+                o.value,
+                u.email,
+                o.created_on,
+                o.replaced_on,
+                o.version,
+                o.flag_obs_status,
+                o.flag_method
+        """
+
+        from_statement = f"""
+            from {self.dataset_tables.OBSERVATION.iceberg_id} o
+            join {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+            left join {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} as oc on oc.id = o.observation_coordinates
+        """
+
+        for i, (dim, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f',\nd{i}.code as "{dim}"'
+            from_statement += f"\nleft join {cl.iceberg_id} d{i} on d{i}.id = oc.{dim}"
+
+        final_query = select_statement + from_statement
+        if not self.keep_history:
+            final_query += "where o.replaced_on is null"
+
+        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based
 
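To make the query assembly above concrete, here is a minimal standalone sketch of the same pattern; the dimension names and table identifiers are hypothetical stand-ins for self.dim_columns_w_time and self.dataset_tables, which this diff does not show.

# Sketch only: hypothetical dimension columns and codelist table identifiers.
dim_columns_w_time = ["dim_element_fao", "dim_item_cpc"]
codelist_ids = ["lake.reference_data.dim_element_fao", "lake.reference_data.dim_item_cpc"]

select_statement = """
select o.id,
    o.value,
    u.email
"""
from_statement = """
from lake.dataset.observation o
join lake.dataset.user u ON u.id = o.created_by
left join lake.dataset.observation_coordinate as oc on oc.id = o.observation_coordinates"""

# One projected code column and one codelist join are appended per dimension,
# mirroring the loop added by this diff.
for i, (dim, cl) in enumerate(zip(dim_columns_w_time, codelist_ids)):
    select_statement += f',\nd{i}.code as "{dim}"'
    from_statement += f"\nleft join {cl} d{i} on d{i}.id = oc.{dim}"

print(select_statement + from_statement)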
@@ -292,6 +331,32 @@ class SWSEasyIcebergSparkHelper:
 
         return df_meta_denorm
 
+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized observation table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} reference_data.language l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()
@@ -312,6 +377,26 @@ class SWSEasyIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )
 
+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_denormalied_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
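The grouping step relies on F.create_map plus F.collect_list: each metadata row becomes a string-to-string map, and the maps are collected into one array per observation. A self-contained sketch with made-up rows, assuming only a local Spark session, shows the resulting shape.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Hypothetical denormalized metadata rows.
df = spark.createDataFrame(
    [
        (1, "GENERAL", "COMMENT", "en", "Estimated value"),
        (1, "GENERAL", "SOURCE", "en", "National survey"),
        (2, "GENERAL", "COMMENT", "en", "Official figure"),
    ],
    ["observation_id", "type", "element_type", "language", "value"],
)

grouped = (
    df.select(
        col("observation_id"),
        F.create_map(
            lit("type"), col("type"),
            lit("element_type"), col("element_type"),
            lit("language"), col("language"),
            lit("value"), col("value"),
        ).alias("metadata"),
    )
    .groupby("observation_id")
    .agg(F.collect_list("metadata").alias("metadata"))
)

# observation_id 1 yields an array of two maps, observation_id 2 an array of one.
grouped.show(truncate=False)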
@@ -324,8 +409,23 @@ class SWSEasyIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
-    def
-
+    def _gen_denormalied_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def write_data_to_iceberg_and_csv(self, sql=False) -> DataFrame:
+        if sql:
+            self.df_denorm = self._gen_denormalied_data_sql()
+        else:
+            self.df_denorm = self._gen_denormalied_data()
 
         self.df_denorm.writeTo(self.iceberg_tables.TABLE.iceberg_id).createOrReplace()
 
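For reference, a hypothetical caller of the new keyword argument; the constructor arguments are omitted because they are not part of this diff.

# Sketch only: switch between the SQL-string pipeline and the original
# DataFrame-API pipeline via the new sql flag.
helper = SWSEasyIcebergSparkHelper(...)  # constructor arguments omitted

helper.write_data_to_iceberg_and_csv(sql=True)  # uses _gen_denormalied_data_sql()
helper.write_data_to_iceberg_and_csv()          # default: _gen_denormalied_data()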