sws-spark-dissemination-helper 0.0.149__tar.gz → 0.0.168__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sws-spark-dissemination-helper might be problematic.
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/PKG-INFO +1 -1
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/pyproject.toml +1 -1
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +293 -31
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +95 -41
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +154 -2
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +33 -21
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/constants.py +9 -0
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/.gitignore +0 -0
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/LICENSE +0 -0
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/README.md +0 -0
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/utils.py +0 -0
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/tests/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/tests/test.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.149
+Version: 0.0.168
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>

src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py

@@ -1,7 +1,7 @@
 import logging
 import time
 from copy import copy
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession

@@ -26,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
         domain_code: str,
         dataset_details: dict = None,
         dataset_tables: DatasetTables = None,
+        keep_history: bool = False,
+        write_csv: bool = True,
+        source_tag: Union[str, None] = None,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details

@@ -36,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
         self.dataset_tables: DatasetTables = dataset_tables
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.domain_code = domain_code
+        self.keep_history: bool = keep_history
+        self.write_csv: bool = write_csv
+        self.source_tag: Union[str, None] = source_tag
 
         if dataset_details is not None:
             (

@@ -83,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
                 self.df_obs_coord,
                 self.df_metadata,
                 self.df_meta_elem,
+                self.df_tag_observation,
             ) = self.raw_data
 
             (

@@ -92,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
                 self.df_meta_elem_type,
                 self.df_language,
                 self.df_unit_of_measure,
+                self.df_dataset,
                 self.dfs_dimension,
             ) = self.raw_reference_data
 
-            self.df_user = self.raw_operational_data
+            (self.df_user, self.df_tag) = self.raw_operational_data
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""

@@ -150,7 +158,7 @@ class SWSBronzeIcebergSparkHelper:
 
         return dfs_dimension
 
-    def _prepare_element_uom(self) -> DataFrame:
+    def _prepare_element_uom(self) -> Union[DataFrame, None]:
         """Prepare the element and unit of measure join."""
 
         # Get the element DataFrame

@@ -162,23 +170,24 @@ class SWSBronzeIcebergSparkHelper:
             if dimension_column == self.element_column
         )
 
-   … (13 removed lines not preserved in this extract)
+        if any("unit_of_measure" == column.lower() for column in df_element.columns):
+            # Join the element and the unit_of_measure
+            df_element_uom = (
+                df_element.alias("e")
+                .join(
+                    self.df_unit_of_measure.alias("u"),
+                    col("e.unit_of_measure") == col("u.id"),
+                )
+                .select(
+                    col("e.code").alias("element_code"),
+                    col("u.code").alias("unit_of_measure"),
+                    col("u.symbol").alias("unit_of_measure_symbol"),
+                    col("u.base_unit").alias("unit_of_measure_base_unit"),
+                    col("u.multiplier").alias("unit_of_measure_multiplier"),
+                )
             )
-        )
 
-
+        return df_element_uom
 
     def _gen_denormalized_observation(self) -> DataFrame:
         """Original query upon which the below computation is based

@@ -270,20 +279,170 @@ class SWSBronzeIcebergSparkHelper:
                 .withColumnRenamed("code", dimension_column)
             )
 
-
-            df_intermediate
-   … (4 removed lines not preserved in this extract)
+        if df_element_uom is not None:
+            df_intermediate = (
+                df_intermediate.alias("d")
+                .join(
+                    F.broadcast(df_element_uom).alias("e"),
+                    col(f"d.{self.element_column}") == col("e.element_code"),
+                    "left",
+                )
+                .drop("element_code")
             )
-            .drop("element_code")
-        )
 
         df_obs_denorm = df_intermediate
 
         return df_obs_denorm
 
+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        final_query = "SELECT " + hint_statement + select_statement + from_statement
+        if not self.keep_history:
+            final_query += "\nWHERE o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        df_element_uom = self._prepare_element_uom()
+
+        dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
+
+        # Join all the dimension codelists
+        for dimension_column, df_dimension in zip(
+            self.dim_columns_w_time, dfs_dimension_w_validity
+        ):
+            logging.debug(f"Joining dimension column: {dimension_column}")
+            logging.debug(f"df_obs_denorm columns: {df_obs_denorm.columns}")
+            logging.debug(
+                f"Is dimension {dimension_column} in the dataframe? {dimension_column in df_obs_denorm.columns}"
+            )
+            df_obs_denorm = (
+                df_obs_denorm.alias("o")
+                .join(
+                    F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
+                        "d"
+                    ),
+                    col(f"{dimension_column}") == col("d.code"),
+                )
+                .drop("code", "join_id")
+            )
+            logging.debug(f"After join count: {df_obs_denorm.count()}")
+
+        if df_element_uom is not None:
+            df_obs_denorm = (
+                df_obs_denorm.alias("d")
+                .join(
+                    F.broadcast(df_element_uom).alias("e"),
+                    col(f"d.{self.element_column}") == col("e.element_code"),
+                    "left",
+                )
+                .drop("element_code")
+            )
+            logging.debug(f"After uom count: {df_obs_denorm.count()}")
+
+        return df_obs_denorm
+
+    def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+        INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+        INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+        LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        # TODO Add tag name as a parameter
+        where_statement = (
+            f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+        )
+
+        final_query = (
+            "SELECT "
+            + hint_statement
+            + select_statement
+            + from_statement
+            + where_statement
+        )
+        if not self.keep_history:
+            final_query += "\n AND o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based

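A note on the new SQL path above: it assembles one large SELECT by growing three strings in lockstep, a broadcast-hint list, the projected columns, and one LEFT JOIN per dimension codelist. The following minimal, self-contained sketch reproduces that string-assembly pattern; the table and column names are invented for illustration and are not the package's real mappings.

    # Toy stand-ins for self.flag_col_to_id_mapping and the codelist tables
    # (hypothetical names, not from the package).
    flag_cols = {"flag_obs_status": "flagobsstatus"}
    dim_cols = [("geographic_area_m49", "bronze.db.cl_area")]

    select_statement = "\n    o.id,\n    o.value"
    from_statement = "\nFROM bronze.db.observation o"
    hint_statement = ""

    for col_name, col_id in flag_cols.items():
        select_statement += f",\n    o.{col_id} AS {col_name}"

    for i, (dim_col, cl) in enumerate(dim_cols):
        select_statement += f",\n    d{i}.code AS {dim_col}"
        from_statement += f"\nLEFT JOIN {cl} d{i} ON d{i}.id = oc.{dim_col}_id"
        # Comma-separate hints after the first one, as the helper does.
        hint_statement = (
            f"{hint_statement}, BROADCAST({cl})" if hint_statement else f"BROADCAST({cl})"
        )

    final_query = "SELECT /*+ " + hint_statement + " */" + select_statement + from_statement
    print(final_query)
    # SELECT /*+ BROADCAST(bronze.db.cl_area) */
    #     o.id,
    #     o.value,
    #     o.flagobsstatus AS flag_obs_status,
    #     d0.code AS geographic_area_m49
    # FROM bronze.db.observation o
    # LEFT JOIN bronze.db.cl_area d0 ON d0.id = oc.geographic_area_m49_id
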
@@ -347,6 +506,32 @@ class SWSBronzeIcebergSparkHelper:
 
         return df_meta_denorm
 
+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized observation table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()

@@ -367,6 +552,26 @@ class SWSBronzeIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )
 
+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_bronze_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()

@@ -379,9 +584,37 @@ class SWSBronzeIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
+    def _gen_bronze_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql_from_tag()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
     # TODO decouple data generation and data writing
-    def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
-
+    def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
+
+        if sql:
+            self.df_bronze = self._gen_bronze_data_sql()
+        else:
+            self.df_bronze = self._gen_bronze_data()
 
         self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()
 

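For orientation, a hedged usage sketch of the new constructor flags and the sql switch added above; the SparkSession and the DatasetTables/IcebergTables objects are placeholders a real pipeline would already have built, and any constructor arguments not visible in this diff are omitted.

    from sws_spark_dissemination_helper.SWSBronzeIcebergSparkHelper import (
        SWSBronzeIcebergSparkHelper,
    )

    helper = SWSBronzeIcebergSparkHelper(
        spark=spark,                    # existing SparkSession
        iceberg_tables=iceberg_tables,  # prepared IcebergTables instance
        dataset_tables=dataset_tables,  # prepared DatasetTables instance
        domain_code="QCL",              # hypothetical domain code
        keep_history=False,             # skip rows whose replaced_on is set
        write_csv=True,
        source_tag="2024_release",      # hypothetical tag, used by the *_from_tag paths
    )

    # sql=True (the new default) builds the bronze table via _gen_bronze_data_sql(),
    # the Spark SQL implementation; sql=False keeps the DataFrame-API path.
    df_bronze = helper.write_bronze_data_to_iceberg_and_csv(sql=True)
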
@@ -449,12 +682,15 @@ class SWSBronzeIcebergSparkHelper:
         logging.info("Bronze Dissemination tags successfully written")
 
     def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = {}, from_tag=False
     ) -> DataFrame:
 
-
+        if from_tag:
+            self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
+        else:
+            self.disseminated_tag_df = self.df_bronze
 
-        if
+        if not from_tag and dimensions is not None and len(dimensions) != 0:
             for dimension_name, codes in dimensions.items():
                 logging.info(f"dimension_name: {dimension_name}")
                 logging.info(f"codes: {codes}")

@@ -533,3 +769,29 @@ class SWSBronzeIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Bronze Disseminated tag with selection successfully written")
+   … (26 added lines rendered as viewer debris in this extract; content not recoverable)

src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py

@@ -74,6 +74,7 @@ class SWSEasyIcebergSparkHelper:
             self.df_tag_observation,
         ) = self.raw_data
 
+        logging.info(self.raw_reference_data)
         (
             self.df_flag_method,
             self.df_flag_obs_status,

@@ -275,11 +276,17 @@ class SWSEasyIcebergSparkHelper:
         if not self.keep_history:
             final_query += "\nWHERE o.replaced_on IS NULL"
 
-        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info("Final query for merging observation and observation_coordinates")
         logging.info(final_query)
 
         df_obs_denorm = self.spark.sql(final_query)
 
+        df_obs_denorm.writeTo(
+            self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id
+        ).createOrReplace()
+
+        logging.info(f"{self.iceberg_tables.DENORMALIZED_OBSERVATION.table} write")
+
         return df_obs_denorm
 
     def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:

@@ -417,7 +424,13 @@ class SWSEasyIcebergSparkHelper:
 
         df_meta_denorm = self.spark.sql(
             f"""
-            select m.observation as observation_id,
+            select
+            /*+
+                BROADCAST({self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id}),
+                BROADCAST({self.dataset_tables.METADATA_TYPE.iceberg_id}),
+                BROADCAST({self.dataset_tables.LANGUAGE.iceberg_id})
+            */
+                m.observation as observation_id,
                 mt.code as type,
                 met.code as element_type,
                 l.country_code as language,

@@ -430,7 +443,11 @@ class SWSEasyIcebergSparkHelper:
             """
         )
 
-
+        df_meta_denorm.writeTo(
+            self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id
+        ).createOrReplace()
+
+        logging.info(f"{self.iceberg_tables.DENORMALIZED_METADATA.table} write")
 
         return df_meta_denorm
 

@@ -455,25 +472,31 @@ class SWSEasyIcebergSparkHelper:
         )
 
     def _gen_grouped_metadata_sql(self) -> DataFrame:
-   … (15 removed lines not preserved in this extract)
-            .groupby("observation_id")
-            .agg(F.collect_list("metadata").alias("metadata"))
+        df_meta_grouped = self.spark.sql(
+            f"""
+            SELECT
+                observation_id,
+                collect_list(
+                    map(
+                        'type', type,
+                        'element_type', element_type,
+                        'language', language,
+                        'value', value
+                    )
+                ) AS metadata
+            FROM {self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id}
+            GROUP BY observation_id
+            """
         )
 
+        df_meta_grouped.writeTo(
+            self.iceberg_tables.GROUPED_METADATA.iceberg_id
+        ).createOrReplace()
+
+        logging.info(f"{self.iceberg_tables.GROUPED_METADATA.table} write")
+
+        return df_meta_grouped
+
     def _gen_denormalied_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()

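For intuition about the collect_list(map(...)) aggregation used above: it packs each observation's metadata rows into an array of string-to-string maps, one array element per row. A tiny self-contained demo with toy data (not the package's schema):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    spark.createDataFrame(
        [(42, "GENERAL", "en", "note A"), (42, "GENERAL", "en", "note B")],
        ["observation_id", "type", "language", "value"],
    ).createOrReplaceTempView("meta")

    spark.sql(
        """
        SELECT observation_id,
               collect_list(map('type', type, 'language', language, 'value', value)) AS metadata
        FROM meta
        GROUP BY observation_id
        """
    ).show(truncate=False)
    # observation_id 42 -> [{type -> GENERAL, language -> en, value -> note A},
    #                       {type -> GENERAL, language -> en, value -> note B}]
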
@@ -487,15 +510,19 @@ class SWSEasyIcebergSparkHelper:
         )
 
     def _gen_denormalied_data_sql(self) -> DataFrame:
-   … (9 removed lines not preserved in this extract)
+        self._gen_denormalized_observation_sql()
+        self._gen_denormalized_metadata_sql()
+        self._gen_grouped_metadata_sql()
+
+        return self.spark.sql(
+            f"""
+            SELECT
+                o.*,
+                m.metadata
+            FROM {self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id} AS o
+            LEFT JOIN {self.iceberg_tables.GROUPED_METADATA.iceberg_id} AS m
+            ON o.id = m.observation_id
+            """
         )
 
     def _gen_denormalied_data_sql_from_tag(self) -> DataFrame:

@@ -510,11 +537,9 @@ class SWSEasyIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
-    def write_data_to_iceberg_and_csv(self, sql=
+    def write_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
         if sql:
             self.df_denorm = self._gen_denormalied_data_sql()
-        elif from_tag:
-            self.df_denorm = self._gen_denormalied_data_sql_from_tag()
         else:
             self.df_denorm = self._gen_denormalied_data()
 

@@ -585,18 +610,21 @@ class SWSEasyIcebergSparkHelper:
         logging.info("Unfiltered data tags successfully written")
 
     def write_filtered_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = None, from_tag=False
     ) -> DataFrame:
 
-
+        if from_tag:
+            self.filtered_df = self._gen_denormalied_data_sql_from_tag()
+        else:
+            self.filtered_df = self.df_denorm
 
-   … (7 removed lines not preserved in this extract)
+        for dimension_name, codes in dimensions.items():
+            logging.info(f"dimension_name: {dimension_name}")
+            logging.info(f"codes: {codes}")
+            if len(codes) != 0:
+                self.filtered_df = self.filtered_df.filter(
+                    col(dimension_name).isin(codes)
+                )
 
         self.filtered_df.writeTo(
             self.iceberg_tables.TABLE_FILTERED.iceberg_id

@@ -667,3 +695,29 @@ class SWSEasyIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Filtered data tags successfully written")
+   … (26 added lines rendered as viewer debris in this extract; content not recoverable)

src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py

@@ -271,6 +271,60 @@ class SWSGoldIcebergSparkHelper:
 
         return df
 
+    def write_gold_faostat_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
+        """The expected input to this function is the output of the sws disseminated function"""
+        df.writeTo(self.iceberg_tables.GOLD_FAOSTAT.iceberg_id).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
+    def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
+        self, df: DataFrame
+    ) -> DataFrame:
+        """The expected input to this function is the output of the sws disseminated function"""
+        df.writeTo(
+            self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id
+        ).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT unfiltered table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
     def write_gold_sws_validated_sws_dissemination_tag(
         self, df: DataFrame, tags: Tags
     ) -> DataFrame:

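Both new writers finish with ALTER TABLE ... CREATE OR REPLACE TAG, which pins the snapshot they just wrote under an Iceberg tag named after self.tag_name. Assuming a recent Spark/Iceberg pairing, a consumer can later read back exactly that version; the table and tag names below are placeholders, not values from this package.

    # Time-travel to the tagged snapshot via SQL (recent Iceberg releases accept
    # tag names in VERSION AS OF):
    df_tagged = spark.sql(
        "SELECT * FROM gold_db.qcl_faostat VERSION AS OF '2024_release'"
    )

    # Equivalent DataFrame form using Iceberg's 'tag' read option:
    df_tagged = spark.read.option("tag", "2024_release").table("gold_db.qcl_faostat")
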
@@ -496,8 +550,8 @@ class SWSGoldIcebergSparkHelper:
         logging.debug(f"Tag with Added Iceberg Table: {tag}")
 
         new_diss_table = BaseDisseminatedTagTable(
-            id=f"{self.domain_code.lower()}
-            name=f"{self.domain_code} gold
+            id=f"{self.domain_code.lower()}_gold_sws_csv",
+            name=f"{self.domain_code} gold SWS csv",
             description="Gold table containing the tag data without any processing cached in csv",
             layer=TableLayer.GOLD,
             private=True,

@@ -515,3 +569,101 @@ class SWSGoldIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         return df
+
+    def write_gold_faostat_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT Iceberg",
+            description="Gold table containing the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_csv",
+            name=f"{self.domain_code} gold FAOSTAT csv",
+            description="Gold table containing the tag data in FAOSTAT format in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
+
+    def write_gold_faostat_unfiltered_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered Iceberg",
+            description="Gold table containing all the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_csv",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered csv",
+            description="Gold table containing the tag data in FAOSTAT format in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df

src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py

@@ -94,25 +94,37 @@ class SWSPostgresSparkReader:
 
         logging.info(f"{pg_table} read start")
 
-   … (19 removed lines not preserved in this extract)
+            if min_id is None or max_id is None:
+                df = (
+                    self.spark.read.format("jdbc")
+                    .option("customSchema", custom_schema)
+                    .option("dbtable", pg_table)
+                    .option("fetchsize", "1000")
+                    .option("url", self.jdbc_url)
+                    .option("user", self.jdbc_conn_properties["user"])
+                    .option("password", self.jdbc_conn_properties["password"])
+                    .option("driver", SPARK_POSTGRES_DRIVER)
+                    .load()
+                )
+            else:
+                df = (
+                    self.spark.read.format("jdbc")
+                    .option("customSchema", custom_schema)
+                    .option("dbtable", pg_table)
+                    .option("partitionColumn", partition_column)
+                    .option("lowerBound", min_id)
+                    .option("upperBound", max_id)
+                    .option("numPartitions", num_partitions)
+                    .option("fetchsize", "1000")
+                    .option("url", self.jdbc_url)
+                    .option("user", self.jdbc_conn_properties["user"])
+                    .option("password", self.jdbc_conn_properties["password"])
+                    .option("driver", SPARK_POSTGRES_DRIVER)
+                    .load()
+                    # .repartition(1024, partition_column)
+                    # .sortWithinPartitions(partition_column)
+                    # .cache()
+                )
         else:
             df = (
                 self.spark.read.format("jdbc")

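The partitioned branch above needs min_id/max_id before the read. A hedged sketch of how those bounds might be computed with a preliminary JDBC query; the table, column, and connection variables are placeholders, not this class's actual call sites.

    bounds = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver")
        .option("query", "SELECT min(id) AS min_id, max(id) AS max_id FROM observation")
        .load()
        .first()
    )
    min_id, max_id = bounds["min_id"], bounds["max_id"]
    # With partitionColumn/lowerBound/upperBound/numPartitions set, Spark splits
    # [min_id, max_id] into numPartitions ranges and issues one JDBC query per
    # range, reading the Postgres table in parallel instead of on a single cursor.
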
@@ -254,8 +266,8 @@ class SWSPostgresSparkReader:
         return (
             tuple(data_dfs),
             (
-                *reference_data_dfs[:
-                reference_data_dfs[
+                *reference_data_dfs[:7],
+                reference_data_dfs[7:],
             ),
             tuple(operational_data_dfs),
         )

src/sws_spark_dissemination_helper/constants.py

@@ -254,6 +254,9 @@ class IcebergTables:
         self.__tag_name = tag_name
 
         # TODO Fix later with a more appropriate DATABASE
+        self.DENORMALIZED_OBSERVATION = self._create_iceberg_table("BRONZE", suffix="denormalized_observation")
+        self.DENORMALIZED_METADATA = self._create_iceberg_table("BRONZE", suffix="denormalized_metadata")
+        self.GROUPED_METADATA = self._create_iceberg_table("BRONZE", suffix="grouped_metadata")
         self.TABLE = self._create_iceberg_table("BRONZE")
         self.TABLE_FILTERED = self._create_iceberg_table("BRONZE", suffix="filtered")
         self.BRONZE = self._create_iceberg_table("BRONZE")

@@ -274,6 +277,12 @@ class IcebergTables:
         self.GOLD_PRE_SDMX = self._create_iceberg_table(
             "GOLD", prefix=domain, suffix="pre_sdmx"
         )
+        self.GOLD_FAOSTAT = self._create_iceberg_table(
+            "GOLD", prefix=domain, suffix="faostat"
+        )
+        self.GOLD_FAOSTAT_UNFILTERED = self._create_iceberg_table(
+            "GOLD", prefix=domain, suffix="faostat_unfiltered"
+        )
 
     def _create_iceberg_table(
         self, level: str, prefix: str = "", suffix: str = ""