sws-spark-dissemination-helper 0.0.141__py3-none-any.whl → 0.0.171__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to its supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +293 -31
- sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +167 -32
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +154 -2
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py +46 -27
- sws_spark_dissemination_helper/constants.py +62 -17
- {sws_spark_dissemination_helper-0.0.141.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/METADATA +11 -11
- {sws_spark_dissemination_helper-0.0.141.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/RECORD +9 -9
- {sws_spark_dissemination_helper-0.0.141.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/WHEEL +0 -0
- {sws_spark_dissemination_helper-0.0.141.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/licenses/LICENSE +0 -0

sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py

@@ -1,7 +1,7 @@
 import logging
 import time
 from copy import copy
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
         domain_code: str,
         dataset_details: dict = None,
         dataset_tables: DatasetTables = None,
+        keep_history: bool = False,
+        write_csv: bool = True,
+        source_tag: Union[str, None] = None,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details
@@ -36,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
         self.dataset_tables: DatasetTables = dataset_tables
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.domain_code = domain_code
+        self.keep_history: bool = keep_history
+        self.write_csv: bool = write_csv
+        self.source_tag: Union[str, None] = source_tag
 
         if dataset_details is not None:
             (
@@ -83,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
             self.df_obs_coord,
             self.df_metadata,
             self.df_meta_elem,
+            self.df_tag_observation,
         ) = self.raw_data
 
         (
@@ -92,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
             self.df_meta_elem_type,
             self.df_language,
             self.df_unit_of_measure,
+            self.df_dataset,
             self.dfs_dimension,
         ) = self.raw_reference_data
 
-        self.df_user = self.raw_operational_data
+        (self.df_user, self.df_tag) = self.raw_operational_data
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -150,7 +158,7 @@ class SWSBronzeIcebergSparkHelper:
 
         return dfs_dimension
 
-    def _prepare_element_uom(self) -> DataFrame:
+    def _prepare_element_uom(self) -> Union[DataFrame, None]:
         """Prepare the element and unit of measure join."""
 
         # Get the element DataFrame
@@ -162,23 +170,24 @@
             if dimension_column == self.element_column
         )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if any("unit_of_measure" == column.lower() for column in df_element.columns):
+            # Join the element and the unit_of_measure
+            df_element_uom = (
+                df_element.alias("e")
+                .join(
+                    self.df_unit_of_measure.alias("u"),
+                    col("e.unit_of_measure") == col("u.id"),
+                )
+                .select(
+                    col("e.code").alias("element_code"),
+                    col("u.code").alias("unit_of_measure"),
+                    col("u.symbol").alias("unit_of_measure_symbol"),
+                    col("u.base_unit").alias("unit_of_measure_base_unit"),
+                    col("u.multiplier").alias("unit_of_measure_multiplier"),
+                )
             )
-            )
 
-
+        return df_element_uom
 
     def _gen_denormalized_observation(self) -> DataFrame:
         """Original query upon which the below computation is based
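
The guard above is why `_prepare_element_uom` now returns `Union[DataFrame, None]`: the join is only built when the element codelist actually exposes a `unit_of_measure` column, and callers check for `None` before joining. A minimal, self-contained sketch of the same pattern (sample data and column names are hypothetical):

```python
from typing import Optional

import pyspark.sql.functions as F
from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.builder.getOrCreate()


def prepare_element_uom(df_element: DataFrame, df_uom: DataFrame) -> Optional[DataFrame]:
    # Only build the join when the element codelist carries a unit_of_measure
    # column; otherwise signal "nothing to join" with None, as the helper does.
    if not any(c.lower() == "unit_of_measure" for c in df_element.columns):
        return None
    return (
        df_element.alias("e")
        .join(df_uom.alias("u"), F.col("e.unit_of_measure") == F.col("u.id"))
        .select(F.col("e.code").alias("element_code"), F.col("u.code").alias("unit_of_measure"))
    )


df_element = spark.createDataFrame([("5510", 1)], ["code", "unit_of_measure"])  # hypothetical
df_uom = spark.createDataFrame([(1, "t")], ["id", "code"])                      # hypothetical

df_element_uom = prepare_element_uom(df_element, df_uom)
if df_element_uom is not None:  # the same None check the helper applies before its broadcast join
    df_element_uom.show()
```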
@@ -270,20 +279,170 @@
             .withColumnRenamed("code", dimension_column)
         )
 
-
-        df_intermediate
-
-
-
-
+        if df_element_uom is not None:
+            df_intermediate = (
+                df_intermediate.alias("d")
+                .join(
+                    F.broadcast(df_element_uom).alias("e"),
+                    col(f"d.{self.element_column}") == col("e.element_code"),
+                    "left",
+                )
+                .drop("element_code")
             )
-            .drop("element_code")
-        )
 
         df_obs_denorm = df_intermediate
 
         return df_obs_denorm
 
+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        final_query = "SELECT " + hint_statement + select_statement + from_statement
+        if not self.keep_history:
+            final_query += "\nWHERE o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        df_element_uom = self._prepare_element_uom()
+
+        dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
+
+        # Join all the dimension codelists
+        for dimension_column, df_dimension in zip(
+            self.dim_columns_w_time, dfs_dimension_w_validity
+        ):
+            logging.debug(f"Joining dimension column: {dimension_column}")
+            logging.debug(f"df_obs_denorm columns: {df_obs_denorm.columns}")
+            logging.debug(
+                f"Is dimension {dimension_column} in the dataframe? {dimension_column in df_obs_denorm.columns}"
+            )
+            df_obs_denorm = (
+                df_obs_denorm.alias("o")
+                .join(
+                    F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
+                        "d"
+                    ),
+                    col(f"{dimension_column}") == col("d.code"),
+                )
+                .drop("code", "join_id")
+            )
+            logging.debug(f"After join count: {df_obs_denorm.count()}")
+
+        if df_element_uom is not None:
+            df_obs_denorm = (
+                df_obs_denorm.alias("d")
+                .join(
+                    F.broadcast(df_element_uom).alias("e"),
+                    col(f"d.{self.element_column}") == col("e.element_code"),
+                    "left",
+                )
+                .drop("element_code")
+            )
+            logging.debug(f"After uom count: {df_obs_denorm.count()}")
+
+        return df_obs_denorm
+
+    def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+        INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+        INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+        LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        # TODO Add tag name as a parameter
+        where_statement = (
+            f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+        )
+
+        final_query = (
+            "SELECT "
+            + hint_statement
+            + select_statement
+            + from_statement
+            + where_statement
+        )
+        if not self.keep_history:
+            final_query += "\n AND o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based
 
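
Both SQL builders assemble one SELECT whose `/*+ BROADCAST(...) */` hint names every codelist joined, so Spark ships the small dimension tables to the executors instead of shuffling the large observation table. The assembly is plain string concatenation; for a hypothetical dataset with one flag column and two dimensions it reduces to:

```python
# Mirrors the hint/select/from assembly above; all table and column names are hypothetical.
flag_cols = {"flag_obs_status": "flag_1"}           # output column -> observation column
dim_cols = ["geographic_area", "time_period"]
codelists = ["staging.cl_area", "staging.cl_time"]  # Iceberg ids of the dimension codelists

select_statement = "\no.id,\no.value"
from_statement = (
    "\nFROM staging.observation o"
    "\nLEFT JOIN staging.observation_coordinate oc ON oc.id = o.observation_coordinates"
)
hint_statement = ""

for out_col, obs_col in flag_cols.items():
    select_statement += f",\no.{obs_col} AS {out_col}"

for i, (dim_col, cl) in enumerate(zip(dim_cols, codelists)):
    select_statement += f",\nd{i}.code AS {dim_col}"
    from_statement += f"\nLEFT JOIN {cl} d{i} ON d{i}.id = oc.{dim_col}"
    hint_statement = hint_statement + f", BROADCAST({cl})" if hint_statement else f"BROADCAST({cl})"

final_query = "SELECT /*+ " + hint_statement + " */" + select_statement + from_statement
print(final_query)
# SELECT /*+ BROADCAST(staging.cl_area), BROADCAST(staging.cl_time) */
# o.id,
# o.value,
# o.flag_1 AS flag_obs_status,
# d0.code AS geographic_area,
# d1.code AS time_period
# FROM staging.observation o
# LEFT JOIN staging.observation_coordinate oc ON oc.id = o.observation_coordinates
# LEFT JOIN staging.cl_area d0 ON d0.id = oc.geographic_area
# LEFT JOIN staging.cl_time d1 ON d1.id = oc.time_period
```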
@@ -347,6 +506,32 @@ class SWSBronzeIcebergSparkHelper:
 
         return df_meta_denorm
 
+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized metadata table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()
@@ -367,6 +552,26 @@ class SWSBronzeIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )
 
+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_bronze_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
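
`_gen_grouped_metadata_sql` mirrors its DataFrame-API twin: each metadata row is packed into a map and `collect_list` folds the maps into one array per observation, so the later join adds a single `metadata` column. A self-contained sketch with toy data:

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(1, "NOTES", "COMMENT", "en", "estimated"), (1, "NOTES", "SOURCE", "en", "survey")],
    ["observation_id", "type", "element_type", "language", "value"],
)

grouped = (
    df.select(
        col("observation_id"),
        F.create_map(
            lit("type"), col("type"),
            lit("element_type"), col("element_type"),
            lit("language"), col("language"),
            lit("value"), col("value"),
        ).alias("metadata"),
    )
    .groupby("observation_id")
    .agg(F.collect_list("metadata").alias("metadata"))
)

grouped.show(truncate=False)  # one row for observation_id 1 with an array of two maps
```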
@@ -379,9 +584,37 @@ class SWSBronzeIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
+    def _gen_bronze_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql_from_tag()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
     # TODO decouple data generation and data writing
-    def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
-
+    def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
+
+        if sql:
+            self.df_bronze = self._gen_bronze_data_sql()
+        else:
+            self.df_bronze = self._gen_bronze_data()
 
         self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()
 
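
A usage sketch of the reworked writer (the wiring below is an assumption, not the package's documented API): `sql=True`, the new default, selects the Spark-SQL path, while `sql=False` keeps the original DataFrame-API pipeline.

```python
# Hypothetical wiring; argument names follow the diff, everything else is illustrative.
helper = SWSBronzeIcebergSparkHelper(
    spark,
    domain_code="QCL",              # hypothetical
    dataset_tables=dataset_tables,  # built elsewhere
    keep_history=False,             # new: False appends "WHERE o.replaced_on IS NULL"
    write_csv=True,                 # new
    source_tag="2024-12-release",   # new: consumed by the *_from_tag builders
)

df_bronze = helper.write_bronze_data_to_iceberg_and_csv(sql=True)   # Spark-SQL path (default)
df_bronze = helper.write_bronze_data_to_iceberg_and_csv(sql=False)  # DataFrame-API path
```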
@@ -449,12 +682,15 @@ class SWSBronzeIcebergSparkHelper:
         logging.info("Bronze Dissemination tags successfully written")
 
     def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = {}, from_tag=False
     ) -> DataFrame:
 
-
+        if from_tag:
+            self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
+        else:
+            self.disseminated_tag_df = self.df_bronze
 
-        if
+        if not from_tag and dimensions is not None and len(dimensions) != 0:
             for dimension_name, codes in dimensions.items():
                 logging.info(f"dimension_name: {dimension_name}")
                 logging.info(f"codes: {codes}")
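
Continuing the sketch, dissemination now has two modes: the default filters the in-memory bronze frame by dimension codes, while `from_tag=True` rebuilds the frame from the observations behind `source_tag` and, per the `if not from_tag ...` guard above, skips the dimension filter entirely.

```python
# Filter mode: keep only the listed codes per dimension (names/codes hypothetical).
helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(
    dimensions={"geographic_area": ["004", "008"]},
)

# Tag mode: rebuild from the tagged observations; `dimensions` is ignored.
helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(from_tag=True)
```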
@@ -533,3 +769,29 @@ class SWSBronzeIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Bronze Disseminated tag with selection successfully written")
+
+

sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py

@@ -1,6 +1,6 @@
 import logging
 from copy import copy
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,7 @@ class SWSEasyIcebergSparkHelper:
         dataset_tables: DatasetTables = None,
         keep_history: bool = False,
         write_csv: bool = True,
+        source_tag: Union[str, None] = None,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details
@@ -37,6 +38,7 @@ class SWSEasyIcebergSparkHelper:
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.keep_history: bool = keep_history
         self.write_csv: bool = write_csv
+        self.source_tag: Union[str, None] = source_tag
 
         if dataset_details is not None:
             (
@@ -69,8 +71,10 @@ class SWSEasyIcebergSparkHelper:
             self.df_obs_coord,
             self.df_metadata,
             self.df_meta_elem,
+            self.df_tag_observation,
         ) = self.raw_data
 
+        logging.info(self.raw_reference_data)
         (
             self.df_flag_method,
             self.df_flag_obs_status,
@@ -78,10 +82,11 @@ class SWSEasyIcebergSparkHelper:
             self.df_meta_elem_type,
             self.df_language,
             self.df_unit_of_measure,
+            self.df_dataset,
             self.dfs_dimension,
         ) = self.raw_reference_data
 
-        self.df_user = self.raw_operational_data
+        (self.df_user, self.df_tag) = self.raw_operational_data
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -271,6 +276,75 @@ class SWSEasyIcebergSparkHelper:
         if not self.keep_history:
             final_query += "\nWHERE o.replaced_on IS NULL"
 
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        df_obs_denorm.writeTo(
+            self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id
+        ).createOrReplace()
+
+        logging.info(f"{self.iceberg_tables.DENORMALIZED_OBSERVATION.table} write")
+
+        return df_obs_denorm
+
+    def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+        INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+        INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+        LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        # TODO Add tag name as a parameter
+        where_statement = (
+            f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+        )
+
+        final_query = (
+            "SELECT "
+            + hint_statement
+            + select_statement
+            + from_statement
+            + where_statement
+        )
+        if not self.keep_history:
+            final_query += "\n AND o.replaced_on IS NULL"
+
         logging.info("Final query for merging observation and observation_coordinates")
         logging.info(final_query)
 
@@ -350,7 +424,13 @@ class SWSEasyIcebergSparkHelper:
 
         df_meta_denorm = self.spark.sql(
             f"""
-            select
+            select
+            /*+
+                BROADCAST({self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id}),
+                BROADCAST({self.dataset_tables.METADATA_TYPE.iceberg_id}),
+                BROADCAST({self.dataset_tables.LANGUAGE.iceberg_id})
+            */
+                m.observation as observation_id,
                 mt.code as type,
                 met.code as element_type,
                 l.country_code as language,
@@ -363,7 +443,11 @@ class SWSEasyIcebergSparkHelper:
             """
         )
 
-
+        df_meta_denorm.writeTo(
+            self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id
+        ).createOrReplace()
+
+        logging.info(f"{self.iceberg_tables.DENORMALIZED_METADATA.table} write")
 
         return df_meta_denorm
 
@@ -388,25 +472,31 @@ class SWSEasyIcebergSparkHelper:
         )
 
     def _gen_grouped_metadata_sql(self) -> DataFrame:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        .groupby("observation_id")
-        .agg(F.collect_list("metadata").alias("metadata"))
+        df_meta_grouped = self.spark.sql(
+            f"""
+            SELECT
+                observation_id,
+                collect_list(
+                    map(
+                        'type', type,
+                        'element_type', element_type,
+                        'language', language,
+                        'value', value
+                    )
+                ) AS metadata
+            FROM {self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id}
+            GROUP BY observation_id
+            """
         )
 
+        df_meta_grouped.writeTo(
+            self.iceberg_tables.GROUPED_METADATA.iceberg_id
+        ).createOrReplace()
+
+        logging.info(f"{self.iceberg_tables.GROUPED_METADATA.table} write")
+
+        return df_meta_grouped
+
     def _gen_denormalied_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
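
Unlike the Bronze helper, which chains DataFrames in memory, the Easy helper now materializes every intermediate with `writeTo(...).createOrReplace()` and lets the next SQL step read it back by table id. A minimal round-trip, assuming a Spark session already configured with an Iceberg catalog (the table name is hypothetical):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([(1, "x")], ["observation_id", "value"])

# Persist the intermediate once as an Iceberg table...
df.writeTo("staging.demo_denormalized_metadata").createOrReplace()

# ...then reference it by id from follow-up SQL, as _gen_grouped_metadata_sql does.
spark.sql("SELECT count(*) FROM staging.demo_denormalized_metadata").show()
```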
@@ -420,8 +510,24 @@ class SWSEasyIcebergSparkHelper:
         )
 
     def _gen_denormalied_data_sql(self) -> DataFrame:
+        self._gen_denormalized_observation_sql()
+        self._gen_denormalized_metadata_sql()
+        self._gen_grouped_metadata_sql()
+
+        return self.spark.sql(
+            f"""
+            SELECT
+                o.*,
+                m.metadata
+            FROM {self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id} AS o
+            LEFT JOIN {self.iceberg_tables.GROUPED_METADATA.iceberg_id} AS m
+                ON o.id = m.observation_id
+            """
+        )
+
+    def _gen_denormalied_data_sql_from_tag(self) -> DataFrame:
         return (
-            self.
+            self._gen_denormalized_observation_sql_from_tag()
             .alias("o")
             .join(
                 self._gen_grouped_metadata_sql().alias("m"),
@@ -431,7 +537,7 @@ class SWSEasyIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
-    def write_data_to_iceberg_and_csv(self, sql=
+    def write_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
         if sql:
             self.df_denorm = self._gen_denormalied_data_sql()
         else:
@@ -504,18 +610,21 @@ class SWSEasyIcebergSparkHelper:
         logging.info("Unfiltered data tags successfully written")
 
     def write_filtered_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = None, from_tag=False
     ) -> DataFrame:
 
-
+        if from_tag:
+            self.filtered_df = self._gen_denormalied_data_sql_from_tag()
+        else:
+            self.filtered_df = self.df_denorm
 
-
-
-
-
-
-
-
+        for dimension_name, codes in dimensions.items():
+            logging.info(f"dimension_name: {dimension_name}")
+            logging.info(f"codes: {codes}")
+            if len(codes) != 0:
+                self.filtered_df = self.filtered_df.filter(
+                    col(dimension_name).isin(codes)
+                )
 
         self.filtered_df.writeTo(
             self.iceberg_tables.TABLE_FILTERED.iceberg_id
@@ -586,3 +695,29 @@ class SWSEasyIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Filtered data tags successfully written")
+
+

sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py

@@ -271,6 +271,60 @@ class SWSGoldIcebergSparkHelper:
 
         return df
 
+    def write_gold_faostat_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
+        """The expected input to this function is the output of the sws disseminated function"""
+        df.writeTo(self.iceberg_tables.GOLD_FAOSTAT.iceberg_id).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
+    def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
+        self, df: DataFrame
+    ) -> DataFrame:
+        """The expected input to this function is the output of the sws disseminated function"""
+        df.writeTo(
+            self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id
+        ).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT unfiltered table written to {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
     def write_gold_sws_validated_sws_dissemination_tag(
         self, df: DataFrame, tags: Tags
     ) -> DataFrame:
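
`ALTER TABLE ... CREATE OR REPLACE TAG` is Iceberg's snapshot-tagging DDL and requires the Iceberg Spark SQL extensions; the tag pins the snapshot that was just written, so a release stays readable even after the table is overwritten by a later run. A sketch with hypothetical names:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # assumes Iceberg extensions + catalog configured

spark.sql("ALTER TABLE gold.qcl_faostat CREATE OR REPLACE TAG `2024-12-release`")

# Read back the exact snapshot pinned by the tag, independent of later overwrites.
spark.sql("SELECT count(*) FROM gold.qcl_faostat VERSION AS OF '2024-12-release'").show()
```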
@@ -496,8 +550,8 @@ class SWSGoldIcebergSparkHelper:
         logging.debug(f"Tag with Added Iceberg Table: {tag}")
 
         new_diss_table = BaseDisseminatedTagTable(
-            id=f"{self.domain_code.lower()}
-            name=f"{self.domain_code} gold
+            id=f"{self.domain_code.lower()}_gold_sws_csv",
+            name=f"{self.domain_code} gold SWS csv",
             description="Gold table containing the tag data without any processing cached in csv",
             layer=TableLayer.GOLD,
             private=True,
@@ -515,3 +569,101 @@ class SWSGoldIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         return df
+
+    def write_gold_faostat_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT Iceberg",
+            description="Gold table containing the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_csv",
+            name=f"{self.domain_code} gold FAOSTAT csv",
+            description="Gold table containing the tag data in FAOSTAT format in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
+
+    def write_gold_faostat_unfiltered_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered Iceberg",
+            description="Gold table containing all the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_csv",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered csv",
+            description="Gold table containing the tag data in FAOSTAT format in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df

sws_spark_dissemination_helper/SWSPostgresSparkReader.py

@@ -94,25 +94,37 @@ class SWSPostgresSparkReader:
 
         logging.info(f"{pg_table} read start")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if min_id is None or max_id is None:
+            df = (
+                self.spark.read.format("jdbc")
+                .option("customSchema", custom_schema)
+                .option("dbtable", pg_table)
+                .option("fetchsize", "1000")
+                .option("url", self.jdbc_url)
+                .option("user", self.jdbc_conn_properties["user"])
+                .option("password", self.jdbc_conn_properties["password"])
+                .option("driver", SPARK_POSTGRES_DRIVER)
+                .load()
+            )
+        else:
+            df = (
+                self.spark.read.format("jdbc")
+                .option("customSchema", custom_schema)
+                .option("dbtable", pg_table)
+                .option("partitionColumn", partition_column)
+                .option("lowerBound", min_id)
+                .option("upperBound", max_id)
+                .option("numPartitions", num_partitions)
+                .option("fetchsize", "1000")
+                .option("url", self.jdbc_url)
+                .option("user", self.jdbc_conn_properties["user"])
+                .option("password", self.jdbc_conn_properties["password"])
+                .option("driver", SPARK_POSTGRES_DRIVER)
+                .load()
+                # .repartition(1024, partition_column)
+                # .sortWithinPartitions(partition_column)
+                # .cache()
+            )
         else:
             df = (
                 self.spark.read.format("jdbc")
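
The reader now degrades gracefully: without id bounds it opens a single JDBC connection, and with bounds it issues `numPartitions` parallel range scans on the partition column. A condensed sketch of the two paths (connection details hypothetical):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

reader = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://db:5432/sws")  # hypothetical
    .option("dbtable", "operational_data.tag")       # hypothetical
    .option("user", "reader")
    .option("password", "secret")
    .option("driver", "org.postgresql.Driver")
    .option("fetchsize", "1000")
)

# Unpartitioned: one executor connection streams the whole table.
df_small = reader.load()

# Partitioned: Spark splits [lowerBound, upperBound] into numPartitions range
# queries on the id column, which is why this path needs known min/max ids.
df_large = (
    reader.option("partitionColumn", "id")
    .option("lowerBound", 1)
    .option("upperBound", 10_000_000)
    .option("numPartitions", 10)
    .load()
)
```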
@@ -195,6 +207,7 @@ class SWSPostgresSparkReader:
             (dataset_tables.OBSERVATION_COORDINATE, "id", 10),
             (dataset_tables.METADATA, "id", 10),
             (dataset_tables.METADATA_ELEMENT, "metadata", 10),
+            (dataset_tables.TAG_OBSERVATION, "tag", 10),
         ]
         return self._import_tables(data_tables)
 
@@ -209,25 +222,30 @@ class SWSPostgresSparkReader:
             dataset_tables.METADATA_ELEMENT_TYPE,
             dataset_tables.LANGUAGE,
             dataset_tables.UNIT_OF_MEASURE,
+            dataset_tables.DATASET,
             *dataset_tables.CODELISTS,
         ]
+        logging.info(
+            f"Importing reference data tables: {[(table.postgres_id, table.iceberg_id) for table in reference_data_tables]}"
+        )
         return self._import_tables(
             [(table, None, 1) for table in reference_data_tables]
         )
 
     def import_operational_data_tables(
         self, dataset_tables: DatasetTables
-    ) -> DataFrame:
+    ) -> List[DataFrame]:
         # Define and import operational data table without partitioning
         operational_data_tables = [
             (dataset_tables.USER, None, 1),
+            (dataset_tables.TAG, None, 1),
         ]
-        return self._import_tables(operational_data_tables)
+        return self._import_tables(operational_data_tables)
 
     def import_data_reference_data_operational_data(
         self, dataset_tables: DatasetTables
     ) -> Tuple[
-        Tuple[DataFrame, DataFrame, DataFrame, DataFrame],
+        Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame],
         Tuple[
             DataFrame,
             DataFrame,
@@ -235,22 +253,23 @@ class SWSPostgresSparkReader:
             DataFrame,
             DataFrame,
             DataFrame,
+            DataFrame,
             List[DataFrame],
         ],
-        DataFrame,
+        Tuple[DataFrame, DataFrame],
     ]:
         # Import and organize DataFrames into the desired output structure
         data_dfs = self.import_data_tables(dataset_tables)
         reference_data_dfs = self.import_reference_data_tables(dataset_tables)
-
+        operational_data_dfs = self.import_operational_data_tables(dataset_tables)
 
         return (
             tuple(data_dfs),
             (
-                *reference_data_dfs[:
-                reference_data_dfs[
+                *reference_data_dfs[:7],
+                reference_data_dfs[7:],
             ),
-
+            tuple(operational_data_dfs),
         )
 
     def get_codelist_type_mapping(

sws_spark_dissemination_helper/constants.py

@@ -1,3 +1,5 @@
+from typing import List
+
 from pyspark.sql.functions import col, lit
 
 SPARK_POSTGRES_DRIVER = "org.postgresql.Driver"
@@ -34,10 +36,14 @@ class DomainFilters:
 class DatasetDatatables:
 
     class __SWSDatatable:
-        def __init__(
+        def __init__(
+            self, id: str, name: str, schema: str, join_columns: List[str] = []
+        ):
             self.id = id
+            self.iceberg_id = f"{IcebergDatabases.BRONZE_DATABASE}.{id.split('.')[1]}"
             self.name = name
             self.schema = schema
+            self.join_columns = join_columns
 
     # Aggregation Tables
     AGGREGATES_COMPOSITION = __SWSDatatable(
@@ -50,22 +56,37 @@ class DatasetDatatables:
         name="Aggregation - Aggregates per elements",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, element STRING, aggregation_type STRING, code STRING",
     )
-
+
     # Dissemination Tables
     DISSEMINATION_TYPE_LIST = __SWSDatatable(
         id="datatables.dissemination_{type}_list",
         name="Dissemination - {type} list",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
+        join_columns=["domain", "code"],
     )
     DISSEMINATION_EXCEPTIONS = __SWSDatatable(
        id="datatables.dissemination_exception",
         name="Dissemination - Exceptions",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, dim1_code STRING, dim2_code STRING, dim3_code STRING, dim4_code STRING, dim5_code STRING, dim6_code STRING, dim7_code STRING, status_flag STRING, method_flag STRING, dissemination BOOLEAN, aggregation BOOLEAN, note STRING",
+        join_columns=[
+            "domain",
+            " dim1_code",
+            " dim2_code",
+            " dim3_code",
+            " dim4_code",
+            " dim5_code",
+            " dim6_code",
+            " dim7_code",
+            " status_flag",
+            " method_flag",
+        ],
     )
+    # TODO Deprecate
     DISSEMINATION_ITEM_LIST_FAOSTAT = __SWSDatatable(
         id="datatables.dissemination_item_list_faostat",
         name="Dissemination - Item list - FAOSTAT",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
+        join_columns=["domain", "code"],
     )
 
     # Mapping Tables
@@ -73,19 +94,23 @@ class DatasetDatatables:
         id="datatables.aggregates_mapping_domains_id",
         name="Mapping - Domains ID",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, domain_name STRING, sws_source_id STRING, sws_destination_id STRING",
+        join_columns=["domain", "sws_source_id"],
     )
     MAPPING_CODELIST_TYPE = __SWSDatatable(
         id="datatables.mapping_codelist_type",
         name="Mapping Codelist type",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, col_name STRING, col_type STRING",
+        join_columns=["domain", "col_name"],
     )
     MAPPING_CODE_CORRECTION = __SWSDatatable(
         id="datatables.aggregates_mapping_code_correction",
         name="Mapping - Code correction",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, old_code STRING, new_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
+        join_columns=["domain", "old_code", "var_type", "mapping_type"],
     )
 
     # Non-SWS Sources Tables
+    # TODO To deprecate
     FAOSTAT_CODE_MAPPING = __SWSDatatable(
         id="datatables.faostat_code_mapping",
         name="FAOSTAT Code Mapping",
@@ -147,6 +172,11 @@ class DatasetTables:
             iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_metadata_element",
             schema="id BIGINT, metadata INT, metadata_element_type INT, value STRING",
         )
+        self.TAG_OBSERVATION = self.__SWSTable(
+            postgres_id=f"{self.__dataset_id}.tag_observation",
+            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_tag_observation",
+            schema="tag BIGINT, observation INT",
+        )
 
         # Reference data
         self.CODELISTS = [
@@ -178,18 +208,21 @@ class DatasetTables:
             iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.metadata_element_type",
             schema="id INT, metadata_type INT, code STRING, description STRING, mandatory BOOLEAN, repeatable BOOLEAN, private BOOLEAN",
         )
-
         LANGUAGE = __SWSTable(
             postgres_id="reference_data.language",
             iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.language",
             schema="id INT, country_code STRING, description STRING",
         )
-
         UNIT_OF_MEASURE = __SWSTable(
             postgres_id="reference_data.unit_of_measure",
             iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.unit_of_measure",
             schema="id INT, code STRING, sdmx_code STRING, metric BOOLEAN, description STRING, symbol STRING, base_unit STRING, multiplier DECIMAL",
         )
+        DATASET = __SWSTable(
+            postgres_id="reference_data.dataset",
+            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.dataset",
+            schema="id INT, xml_name STRING",
+        )
 
         # Operational data
         USER = __SWSTable(
@@ -197,6 +230,11 @@ class DatasetTables:
             iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.user",
             schema="id INT, username STRING, preferences INT, email STRING, active BOOLEAN, settings STRING",
         )
+        TAG = __SWSTable(
+            postgres_id="operational_data.tag",
+            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.tag",
+            schema="id INT, name STRING, reference_date DATE, dataset INT, type STRING, released_ON DATE, released_by INT, properties STRING",
+        )
 
 
 class IcebergTable:
@@ -216,30 +254,37 @@ class IcebergTables:
         self.__tag_name = tag_name
 
         # TODO Fix later with a more appropriate DATABASE
-        self.
-        self.
-        self.
-        self.
-        self.
+        self.DENORMALIZED_OBSERVATION = self.create_iceberg_table("BRONZE", suffix="denormalized_observation")
+        self.DENORMALIZED_METADATA = self.create_iceberg_table("BRONZE", suffix="denormalized_metadata")
+        self.GROUPED_METADATA = self.create_iceberg_table("BRONZE", suffix="grouped_metadata")
+        self.TABLE = self.create_iceberg_table("BRONZE")
+        self.TABLE_FILTERED = self.create_iceberg_table("BRONZE", suffix="filtered")
+        self.BRONZE = self.create_iceberg_table("BRONZE")
+        self.BRONZE_DISS_TAG = self.create_iceberg_table("BRONZE", suffix="diss_tag")
+        self.SILVER = self.create_iceberg_table("SILVER", prefix=domain)
 
         # GOLD tables with specific suffixes
-        self.GOLD_SWS = self.
-
-        )
-        self.GOLD_SDMX = self._create_iceberg_table(
+        self.GOLD_SWS = self.create_iceberg_table("GOLD", prefix=domain, suffix="sws")
+        self.GOLD_SDMX = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="sdmx_disseminated"
         )
-        self.GOLD_SWS_VALIDATED = self.
+        self.GOLD_SWS_VALIDATED = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="sws_validated"
         )
-        self.GOLD_SWS_DISSEMINATED = self.
+        self.GOLD_SWS_DISSEMINATED = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="sws_disseminated"
         )
-        self.GOLD_PRE_SDMX = self.
+        self.GOLD_PRE_SDMX = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="pre_sdmx"
         )
+        self.GOLD_FAOSTAT = self.create_iceberg_table(
+            "GOLD", prefix=domain, suffix="faostat"
+        )
+        self.GOLD_FAOSTAT_UNFILTERED = self.create_iceberg_table(
+            "GOLD", prefix=domain, suffix="faostat_unfiltered"
+        )
 
-    def
+    def create_iceberg_table(
         self, level: str, prefix: str = "", suffix: str = ""
     ) -> IcebergTable:
         database = getattr(IcebergDatabases, f"{level}_DATABASE")
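
Only the `getattr` line of the renamed `create_iceberg_table` is visible in this hunk, so the following is purely an illustrative guess at how level, prefix, and suffix might compose a table id; the real implementation lives in constants.py.

```python
# Illustrative only -- NOT the package's implementation.
class IcebergDatabases:
    BRONZE_DATABASE = "bronze"
    SILVER_DATABASE = "silver"
    GOLD_DATABASE = "gold"


def create_iceberg_table(level: str, dataset_id: str, prefix: str = "", suffix: str = "") -> str:
    database = getattr(IcebergDatabases, f"{level}_DATABASE")  # the one line the diff does show
    parts = [p for p in (prefix, dataset_id, suffix) if p]
    return f"{database}.{'_'.join(parts)}"


print(create_iceberg_table("BRONZE", "agriculture", suffix="denormalized_observation"))
# bronze.agriculture_denormalized_observation  (hypothetical naming)
```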

{sws_spark_dissemination_helper-0.0.141.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/METADATA

@@ -1,8 +1,8 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.141
+Version: 0.0.171
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
-Project-URL: Repository, https://
+Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
 License: MIT License
 
@@ -31,27 +31,27 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.9
 Requires-Dist: annotated-types==0.7.0
-Requires-Dist: boto3
-Requires-Dist: botocore
+Requires-Dist: boto3>=1.40.75
+Requires-Dist: botocore>=1.40.75
 Requires-Dist: certifi==2025.1.31
 Requires-Dist: charset-normalizer==3.4.1
-Requires-Dist: idna
+Requires-Dist: idna>=3.10
 Requires-Dist: jmespath==1.0.1
 Requires-Dist: numpy==2.0.2
-Requires-Dist: pandas==2.
+Requires-Dist: pandas==2.3.3
 Requires-Dist: py4j==0.10.9.7
 Requires-Dist: pydantic-core==2.27.2
 Requires-Dist: pydantic==2.10.6
 Requires-Dist: pyspark==3.5.4
 Requires-Dist: python-dateutil==2.9.0.post0
 Requires-Dist: python-dotenv==0.19.2
-Requires-Dist: pytz==2025.
+Requires-Dist: pytz==2025.2
 Requires-Dist: requests==2.32.3
-Requires-Dist: s3transfer
+Requires-Dist: s3transfer>=0.11.2
 Requires-Dist: six==1.17.0
-Requires-Dist: sws-api-client
-Requires-Dist: typing-extensions
-Requires-Dist: tzdata==2025.
+Requires-Dist: sws-api-client==2.3.0
+Requires-Dist: typing-extensions>=4.12.2
+Requires-Dist: tzdata==2025.2
 Requires-Dist: urllib3==1.26.20
 Description-Content-Type: text/markdown
 

{sws_spark_dissemination_helper-0.0.141.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/RECORD

@@ -1,13 +1,13 @@
-sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=
+sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=N0eQ2LXtpPeZQCWYi85sMLmpXRzLA2erECiba8tqOAY,29595
 sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=
-sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=
-sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=
+sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
+sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=0dxbVkrhdaASapEffF5PFcgKwAMyJoWBxzgymjZ4JyY,25049
+sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=KpG8gp8Ai9pHDiKhUOTcXWxxmFGeKEE3XKlI_Y-SveU,18453
 sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=qioLv3SlJEfk0LzTiwfXRtZXVImPOJUeh9k1XwHC-pA,26225
 sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
-sws_spark_dissemination_helper/constants.py,sha256=
+sws_spark_dissemination_helper/constants.py,sha256=vQmalAqInwPAybgJOfYx99jn47KsKp8jeD8eqmjw-Rs,13471
 sws_spark_dissemination_helper/utils.py,sha256=G7lQqNRrvqZpgm9WmddD7fWsI8IVn09x1p3cV3458EA,21963
-sws_spark_dissemination_helper-0.0.141.dist-info/METADATA,sha256=
-sws_spark_dissemination_helper-0.0.141.dist-info/WHEEL,sha256=
-sws_spark_dissemination_helper-0.0.141.dist-info/licenses/LICENSE,sha256=
-sws_spark_dissemination_helper-0.0.141.dist-info/RECORD,,
+sws_spark_dissemination_helper-0.0.171.dist-info/METADATA,sha256=W4qkQISSzekzXhpmNhlNMfJEmaQlscu3hQTs4Vavawg,2824
+sws_spark_dissemination_helper-0.0.171.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+sws_spark_dissemination_helper-0.0.171.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
+sws_spark_dissemination_helper-0.0.171.dist-info/RECORD,,

WHEEL and licenses/LICENSE: file without changes