dataforge-sdk 10.0.dev115__tar.gz → 10.0.0rc3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/PKG-INFO +3 -1
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/__init__.py +1 -1
- dataforge_sdk-10.0.0rc3/dataforge/_session.py +40 -0
- dataforge_sdk-10.0.0rc3/dataforge/databricks/__init__.py +0 -0
- dataforge_sdk-10.0.dev115/dataforge/_databricks_session.py → dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_base_session.py +13 -66
- dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_ingestion_session.py +70 -0
- dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_parsing_session.py +46 -0
- dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_pg.py +94 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/ingestion_session.py +2 -2
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/parsing_session.py +4 -37
- dataforge_sdk-10.0.0rc3/dataforge/pg.py +10 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/post_output_session.py +0 -1
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/postgres_connection.py +6 -3
- dataforge_sdk-10.0.0rc3/dataforge/snowflake/__init__.py +0 -0
- dataforge_sdk-10.0.0rc3/dataforge/snowflake/_snowflake_base_session.py +66 -0
- dataforge_sdk-10.0.0rc3/dataforge/snowflake/_snowflake_ingestion_session.py +39 -0
- dataforge_sdk-10.0.0rc3/dataforge/snowflake/_snowflake_parsing_session.py +30 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge_sdk.egg-info/PKG-INFO +3 -1
- dataforge_sdk-10.0.0rc3/dataforge_sdk.egg-info/SOURCES.txt +28 -0
- dataforge_sdk-10.0.0rc3/dataforge_sdk.egg-info/requires.txt +3 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/pyproject.toml +3 -3
- dataforge_sdk-10.0.dev115/dataforge/_session.py +0 -7
- dataforge_sdk-10.0.dev115/dataforge/_snowflake_session.py +0 -82
- dataforge_sdk-10.0.dev115/dataforge/pg.py +0 -91
- dataforge_sdk-10.0.dev115/dataforge_sdk.egg-info/SOURCES.txt +0 -20
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/README.md +0 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/_base_session.py +0 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/process_record.py +0 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/system_configuration.py +0 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/utils.py +0 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge_sdk.egg-info/dependency_links.txt +0 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge_sdk.egg-info/top_level.txt +0 -0
- {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/setup.cfg +0 -0
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/PKG-INFO
@@ -1,12 +1,14 @@
 Metadata-Version: 2.4
 Name: dataforge-sdk
-Version: 10.0.dev115
+Version: 10.0.0rc3
 Summary: SDK for creating DataForge extensions
 Author-email: Vadim Orlov <vorlov@dataforgelabs.com>
 Project-URL: Homepage, https://docs.dataforgelabs.com
 Project-URL: Issues, https://docs.dataforgelabs.com/hc/en-us/requests/new
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+Provides-Extra: psycopg2
+Requires-Dist: psycopg2-binary>=2.9; extra == "psycopg2"

 # dataforge-sdk
 SDK for creating DataForge extensions.
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/__init__.py
@@ -2,5 +2,5 @@ from .post_output_session import PostOutputSession
 from .ingestion_session import IngestionSession
 from .parsing_session import ParsingSession

-__version__ = "10.0.
+__version__ = "10.0.0-rc.3"
 __all__ = ['PostOutputSession','IngestionSession', 'ParsingSession']
dataforge_sdk-10.0.0rc3/dataforge/_session.py
@@ -0,0 +1,40 @@
+import os
+
+
+def _is_databricks_environment() -> bool:
+    """Detect whether the current runtime is Databricks."""
+    spark_obj = globals().get("spark")
+    if spark_obj is not None:
+        spark_class_name = getattr(getattr(spark_obj, "__class__", None), "__name__", None)
+        if spark_class_name == "SparkSession":
+            return True
+    if os.environ.get("DATABRICKS_RUNTIME_VERSION"):
+        return True
+    try:
+        from pyspark.sql import SparkSession  # type: ignore
+    except ImportError:
+        return False
+    try:
+        return SparkSession.getActiveSession() is not None
+    except Exception:
+        return False
+
+
+if _is_databricks_environment():
+    from dataforge.databricks._databricks_base_session import _Databricks_Base_Session
+    from dataforge.databricks._databricks_ingestion_session import _Databricks_Ingestion_Session
+    from dataforge.databricks._databricks_parsing_session import _Databricks_Parsing_Session
+
+    _Session = _Databricks_Base_Session
+    _Ingestion_Session = _Databricks_Ingestion_Session
+    _Parsing_Session = _Databricks_Parsing_Session
+    _platform = "databricks"
+else:
+    from dataforge.snowflake._snowflake_base_session import _Snowflake_Base_Session
+    from dataforge.snowflake._snowflake_ingestion_session import _Snowflake_Ingestion_Session
+    from dataforge.snowflake._snowflake_parsing_session import _Snowflake_Parsing_Session
+
+    _Session = _Snowflake_Base_Session
+    _Ingestion_Session = _Snowflake_Ingestion_Session
+    _Parsing_Session = _Snowflake_Parsing_Session
+    _platform = "snowflake"
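The new dataforge/_session.py picks the platform-specific session classes once, at import time. As a hedged illustration (not part of the diff, and assuming one of the two runtimes is importable), downstream code can rely on the aliases it exports:

    # minimal sketch: the aliases resolve to whichever platform was detected above
    from dataforge._session import _platform, _Ingestion_Session, _Parsing_Session

    if _platform == "databricks":
        # Spark/DBUtils-backed implementations were selected
        assert _Ingestion_Session.__name__ == "_Databricks_Ingestion_Session"
    else:
        # Snowpark-backed implementations were selected
        assert _Parsing_Session.__name__ == "_Snowflake_Parsing_Session"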
dataforge_sdk-10.0.0rc3/dataforge/databricks/__init__.py
File without changes
dataforge_sdk-10.0.dev115/dataforge/_databricks_session.py → dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_base_session.py
@@ -1,24 +1,22 @@
+from __future__ import annotations
 import json
 import re
-from typing import Callable

-from pyspark.dbutils import DBUtils
-from pyspark.sql import SparkSession, DataFrame
-from pyspark.sql.functions import monotonically_increasing_id, lit
-from pyspark.sql.types import LongType

 from dataforge._base_session import _Base_Session


-class _Databricks_Session(_Base_Session):
+class _Databricks_Base_Session(_Base_Session):
     """Base session class for Databricks platform.
     Class should not be instantiated by user directly: use process-specific Session classes instead
     Adds Spark session, DBUtilsto Base_Session
     """
-
-    dbutils: DBUtils
+
 
     def __init__(self):
+        from pyspark.sql import SparkSession, DataFrame
+        self.spark = SparkSession.builder.getOrCreate()
+        self.dbutils = self._get_dbutils()
         pg_connection_string_read = self.dbutils.secrets.get("sparky", "pg_read")
         core_jwt_token = self.dbutils.secrets.get("sparky", "coreJWT")
         try:
@@ -27,69 +25,17 @@ class _Databricks_Session(_Base_Session):
             process_id = None

         super().__init__(pg_connection_string_read, core_jwt_token, process_id)
-        self.spark = SparkSession.builder.getOrCreate()
-        self.dbutils = self._get_dbutils()
         self.process_parameters["start_process_flag"] = True
         self.logger.info(f"Initialized databricks base session for {self.__class__.__name__} with parameters {self.process_parameters}")


     def _get_dbutils(self):
+        from pyspark.dbutils import DBUtils
         return DBUtils(self.spark)


-    def ingest(self,df: DataFrame | Callable[[], DataFrame] | None = None):
-        """Ingest the provided DataFrame into the DataForge and update input record.
-
-        Writes the DataFrame to raw Parquet file,
-        updates the input record with status, file size, record count, and notifies
-        the Core API of process completion. On failure, updates logs and flags the input and process
-        records as failed.

-
-            df (Callable[[], DataFrame] | DataFrame): parameterless def that you defined, returning the Spark DataFrame to ingest (recommended),
-            or spark DataFrame
-        """
-        try:
-            if not self._is_open:
-                raise Exception("Session is closed")
-            if df is None:
-                status = "Z"
-                row_count = 0
-                file_size = 0
-            else:
-                if callable(df):
-                    result_df = df()  # call it to get the DataFrame
-                else:
-                    result_df = df
-                dest_file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
-                file_size, row_count = self._write_parsed_data(result_df, dest_file_path)
-                status = "P" if row_count > 0 else "Z"
-            input_update_json = {
-                "ingestion_status_code": status,
-                "extract_datetime": datetime.now().isoformat(),
-                "file_size": file_size,
-                "process_id": self.process.processId,
-                "input_id": self.process.inputId,
-                "record_counts": {"Total": row_count}
-            }
-
-            self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
-                         (json.dumps(input_update_json),), fetch=False)
-            self.logger.info("Ingestion completed successfully")
-
-        except Exception as e:
-            self._log_fail(e)
-            failure_update_json = {
-                "process_id": self.process.processId,
-                "ingestion_status_code": "F"
-            }
-            self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
-                         (json.dumps(failure_update_json),), fetch=False)
-        finally:
-            self._core_api_call(f"process-complete/{self.process.processId}")
-            self.close()
-
-    def _write_parsed_data(self, in_df: DataFrame, dest_file_path: str) -> tuple[int, int]:
+    def _write_parsed_data(self, in_df: pyspark.sql.DataFrame, dest_file_path: str) -> tuple[int, int]:
         """Process input DataFrame, write to Parquet, and update metadata.

         Args:
@@ -102,6 +48,10 @@ class _Databricks_Session(_Base_Session):
         Raises:
             Exception: If duplicate columns are detected or metadata update fails.
         """
+        from pyspark.sql.functions import monotonically_increasing_id, lit
+        from pyspark.sql.types import LongType
+
+
         self.log("Data read successfully. Checking schema.")

         select_list = self._pg.sql("SELECT sparky.get_select_list(%s)", (self.process.sourceId,))
@@ -126,8 +76,6 @@ class _Databricks_Session(_Base_Session):
         schema = []
         for f in df.schema.fields:
             field_name = f.name.lower() if self.process.forceCaseInsensitive else f.name
-            name_normalized = re.sub(r'\W+', '_', field_name)
-            column_normalized = ("_" if field_name[0].isdigit() else "") + name_normalized  # add leading underscore

             if f.dataType.simpleString().startswith("struct"):
                 spark_type = "StructType"
@@ -139,10 +87,9 @@ class _Databricks_Session(_Base_Session):
                 spark_type = type(f.dataType).__name__

             attr_schema = json.loads(f.dataType.json())
-            self.logger.info(f"Column `{
+            self.logger.info(f"Column `{field_name}` schema: {attr_schema}")
             schema.append({
                 "name": field_name,
-                "column_normalized": column_normalized,
                 "spark_type": spark_type,
                 "schema": attr_schema
             })
dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_ingestion_session.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+import json
+import re
+from datetime import datetime
+from typing import Callable
+from dataforge.databricks._databricks_base_session import _Databricks_Base_Session
+
+
+class _Databricks_Ingestion_Session(_Databricks_Base_Session):
+    """Base session class for Databricks platform.
+    Class should not be instantiated by user directly: use process-specific Session classes instead
+    Adds Spark session, DBUtilsto Base_Session
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def ingest(self,df: pyspark.sql.DataFrame | Callable[[], pyspark.sql.DataFrame] | None = None):
+        """Ingest the provided DataFrame into the DataForge and update input record.
+
+        Writes the DataFrame to raw Parquet file,
+        updates the input record with status, file size, record count, and notifies
+        the Core API of process completion. On failure, updates logs and flags the input and process
+        records as failed.
+
+        Args:
+            df (Callable[[], DataFrame] | DataFrame): parameterless def that you defined, returning the Spark DataFrame to ingest (recommended),
+            or spark DataFrame
+        """
+        try:
+            if not self._is_open:
+                raise Exception("Session is closed")
+            if df is None:
+                status = "Z"
+                row_count = 0
+                file_size = 0
+            else:
+                if callable(df):
+                    result_df = df()  # call it to get the DataFrame
+                else:
+                    result_df = df
+                dest_file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
+                file_size, row_count = self._write_parsed_data(result_df, dest_file_path)
+                status = "P" if row_count > 0 else "Z"
+            input_update_json = {
+                "ingestion_status_code": status,
+                "extract_datetime": datetime.now().isoformat(),
+                "file_size": file_size,
+                "process_id": self.process.processId,
+                "input_id": self.process.inputId,
+                "record_counts": {"Total": row_count}
+            }
+
+            self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
+                         (json.dumps(input_update_json),), fetch=False)
+            self.logger.info("Ingestion completed successfully")
+
+        except Exception as e:
+            self._log_fail(e)
+            failure_update_json = {
+                "process_id": self.process.processId,
+                "ingestion_status_code": "F"
+            }
+            self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
+                         (json.dumps(failure_update_json),), fetch=False)
+        finally:
+            self._core_api_call(f"process-complete/{self.process.processId}")
+            self.close()
+
+
dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_parsing_session.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+from typing import Callable
+from ._databricks_base_session import _Databricks_Base_Session
+
+
+class _Databricks_Parsing_Session(_Databricks_Base_Session):
+    """Implements run method for Databricks.
+    """
+
+
+    def run(self,df: pyspark.sql.DataFrame | Callable[[], pyspark.sql.DataFrame] | None = None):
+        """Save parsed file from the provided DataFrame, and upload it into the DataForge data lake.
+
+        Writes the DataFrame to parsed Parquet file,
+        updates the input record with status, file size, record count, and notifies
+        the Core API of process completion. On failure, updates logs and flags the input and process
+        records as failed.
+
+        Args:
+            df (DataFrame): parameterless def that you defined, returning the Spark DataFrame containing parsed file data (recommended),
+            or spark DataFrame
+        """
+        try:
+            if not self._is_open:
+                raise Exception("Session is closed")
+            if callable(df):
+                result_df = df()  # call it to get the DataFrame
+            else:
+                result_df = df
+
+            if result_df is None or result_df.isEmpty():
+                file_size, row_count = (0, 0)
+            else:
+                dest_file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
+                file_size, row_count = self._write_parsed_data(result_df, dest_file_path)
+            input_update_json = {
+                "file_size": file_size,
+                "input_id": self.process.inputId,
+                "record_counts": {"Total": row_count}
+            }
+            self._end_process('P' if row_count > 0 else 'Z', input_update_json)
+
+        except Exception as e:
+            self._log_fail(e)
+            self._end_process("F")
+
dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_pg.py
@@ -0,0 +1,94 @@
+"""Postgres utilities for data operations.
+
+This module provides functions to execute SQL queries against a Postgres database
+using Spark JDBC for reads and a direct write connection for write operations.
+"""
+from dataforge.postgres_connection import PostgresConnection
+
+
+class DataBricksPg:
+
+
+    def __init__(self):
+        from pyspark.dbutils import DBUtils
+        from pyspark.sql import SparkSession, DataFrame
+        from .postgres_connection import PostgresConnection
+
+        self.spark = SparkSession.builder.getOrCreate()
+        self.dbutils = DBUtils(spark)
+        self.pg_connection_string_read = dbutils.secrets.get("sparky", "pg_read")
+
+    def update(self,query: str):
+        """Execute an update SQL query on the DataForge metastore Postgres database.
+
+        Args:
+            query (str): SQL query string to execute.
+
+        Returns:
+            None
+
+        Raises:
+            Exception: If write connection cannot be established or SQL execution fails.
+        """
+        pg = self._get_pg_write_connection()
+        pg.sql(query, fetch=False)
+
+    def execute(self,query: str):
+        """Alias for update() to execute write SQL queries.
+
+        Args:
+            query (str): SQL query string to execute.
+
+        Returns:
+            None
+        """
+        self.update(query)
+
+    def select(self,query: str) -> DataFrame:
+        """Execute a SELECT SQL query on the DataForge metastore Postgres database and return a DataFrame with results.
+
+        Args:
+            query (str): SQL SELECT query string.
+
+        Returns:
+            DataFrame: Spark DataFrame containing query results.
+
+        Raises:
+            Exception: If Spark fails to load data or connection issues.
+        """
+        return self.spark.read.format("jdbc") \
+            .option("url", self.pg_connection_string_read) \
+            .option("query", query) \
+            .load()
+
+    def pull(self,source_id: int):
+        """Trigger new ingestion (pull data) on DataForge source for a given source ID.
+
+        Args:
+            source_id (int): Identifier for the source to pull.
+
+        Returns:
+            None
+
+        Raises:
+            Exception: If write connection cannot be established or SQL execution fails.
+        """
+        pg = self._get_pg_write_connection()
+        pg.sql("SELECT meta.svc_pull_source(%s, %s)", (source_id,'sdk'), fetch=False)
+
+
+    def _get_pg_write_connection(self) -> PostgresConnection:
+        """Internal method to retrieve a PostgresConnection for write operations using secured secrets.
+
+        Returns:
+            PostgresConnection: Connection object for executing write queries.
+
+        Raises:
+            Exception: If the 'pg_write' secret is not defined in the 'sparky' scope.
+        """
+        secrets = self.dbutils.secrets.list("sparky")
+        if any(secret.key == "pg_write" for secret in secrets):
+            conn_string = self.dbutils.secrets.get("sparky", "pg_write")
+            return PostgresConnection(conn_string + "&application_name=sdk-pg")
+        else:
+            raise Exception("pg_write secret is not defined in sparky scope")
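For context, the ingest() entry point added in _Databricks_Ingestion_Session above is reached through the public IngestionSession wrapper shown next. A hedged notebook sketch, assuming a Databricks runtime where the "sparky" secret scope exists and where the session constructor needs no extra arguments (its signature is not shown in this diff):

    from dataforge import IngestionSession

    session = IngestionSession()          # assumed zero-argument construction

    def load_orders():
        # parameterless callable returning the Spark DataFrame to ingest (the recommended form)
        return spark.read.table("samples.demo.orders")   # hypothetical source table

    session.ingest(load_orders)   # passing None instead records a zero-row ("Z") ingestion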
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/ingestion_session.py
@@ -9,11 +9,11 @@ Classes:
 """
 import json
 from typing import Optional
-from ._session import
+from ._session import _Ingestion_Session
 from .process_record import ProcessRecord


-class IngestionSession(
+class IngestionSession(_Ingestion_Session):

     """Session class to manage custom ingestion process lifecycle.

{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/parsing_session.py
@@ -1,12 +1,13 @@
+from __future__ import annotations

 import json
 from typing import Optional, Callable
-from pyspark.sql import DataFrame
 from .process_record import ProcessRecord
-from ._session import
+from ._session import _Parsing_Session


-class ParsingSession(_Session):
+class ParsingSession(_Parsing_Session):
+

 """Session class to manage custom parse process lifecycle.

@@ -44,39 +45,5 @@ class ParsingSession(_Session):
         """
         return self._parsing_parameters.get('custom_parameters')

-    def run(self,df: DataFrame | Callable[[], DataFrame] | None = None):
-        """Save parsed file from the provided DataFrame, and upload it into the DataForge data lake.
-
-        Writes the DataFrame to parsed Parquet file,
-        updates the input record with status, file size, record count, and notifies
-        the Core API of process completion. On failure, updates logs and flags the input and process
-        records as failed.
-
-        Args:
-            df (DataFrame): parameterless def that you defined, returning the Spark DataFrame containing parsed file data (recommended),
-            or spark DataFrame
-        """
-        try:
-            if not self._is_open:
-                raise Exception("Session is closed")
-            if callable(df):
-                result_df = df()  # call it to get the DataFrame
-            else:
-                result_df = df
-
-            if result_df is None or result_df.isEmpty():
-                file_size, row_count = (0, 0)
-            else:
-                dest_file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
-                file_size, row_count = self._write_parsed_data(result_df, dest_file_path)
-            input_update_json = {
-                "file_size": file_size,
-                "input_id": self.process.inputId,
-                "record_counts": {"Total": row_count}
-            }
-            self._end_process('P' if row_count > 0 else 'Z', input_update_json)

-        except Exception as e:
-            self._log_fail(e)
-            self._end_process("F")

dataforge_sdk-10.0.0rc3/dataforge/pg.py
@@ -0,0 +1,10 @@
+"""Postgres utilities for data operations.
+
+This module provides functions to execute SQL queries against a Postgres database
+using Spark JDBC for reads and a direct write connection for write operations.
+"""
+from dataforge._session import _platform
+
+if _platform=='databricks':
+    from dataforge.databricks._databricks_pg import DataBricksPg
+    pg = DataBricksPg()
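A hedged sketch of the module-level pg helper defined above (Databricks only: pg is bound solely when _platform == "databricks", and select() reads through Spark JDBC on the cluster). The query text and source_id below are placeholders:

    from dataforge.pg import pg

    meta_df = pg.select("SELECT source_id, source_name FROM meta.source")   # placeholder query
    meta_df.show()

    pg.pull(123)   # placeholder source_id; issues meta.svc_pull_source(source_id, 'sdk')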
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/post_output_session.py
@@ -16,7 +16,6 @@ class PostOutputSession(_Session):
         """Initialize custom post-output session and start a new post-output process.

         Args:
-            input_id (Optional[int]): Optional input_id of the batch for interactive testing.
             Leave blank for production use.
         """
         super().__init__()
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/postgres_connection.py
@@ -16,7 +16,8 @@ class PostgresConnection:

         except Exception as e:
             logger.error(f"Error connecting to Postgres: {e}")
-
+            raise
+            # sys.exit(1)

     def sql(self, query: str, params=None, fetch=True):
         try:
@@ -29,7 +30,8 @@ class PostgresConnection:
                 return res[0]
         except Exception as e:
             self.logger.error(f"Error executing query {query}({params}) on Postgres: {e}")
-            sys.exit(1)
+            # sys.exit(1)
+            raise

     def connect(self, connection_string: str):
         # Execute a query
@@ -40,7 +42,8 @@ class PostgresConnection:
             # Change connection
         except Exception as e:
             self.logger.error(f"Error connecting to Postgres database or insufficient permissions. Details: {e}")
-            sys.exit(1)
+            # sys.exit(1)
+            raise

     def close(self ):
         self.conn.close()
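With sys.exit(1) replaced by raise, PostgresConnection failures now surface as ordinary exceptions that calling code can catch rather than terminating the interpreter. A hedged sketch (the connection string is a placeholder; the exact URL format expected by PostgresConnection is not shown in this diff):

    from dataforge.postgres_connection import PostgresConnection

    try:
        conn = PostgresConnection("postgresql://user:pass@host:5432/dataforge?sslmode=require")  # placeholder
        row = conn.sql("SELECT 1")
    except Exception as exc:
        # previously this path would have exited the process; now the caller decides what to do
        print(f"Postgres call failed: {exc}")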
dataforge_sdk-10.0.0rc3/dataforge/snowflake/__init__.py
File without changes
dataforge_sdk-10.0.0rc3/dataforge/snowflake/_snowflake_base_session.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+from typing import Callable
+from dataforge._base_session import _Base_Session
+import sys
+
+class _Snowflake_Base_Session(_Base_Session):
+    """Base session class for Snowflake platform.
+    Class should not be instantiated by user directly: use process-specific Session classes instead
+    Adds Snowpark session
+    """
+
+    def __init__(self):
+        from snowflake.snowpark.context import get_active_session
+        self.snowpark_session = get_active_session()
+        pg_connection_string_read = self.snowpark_session.sql("SELECT get_secret('DATAFORGE_PG_READ')").first(1)[0][0]
+        core_jwt_token = self.snowpark_session.sql("SELECT get_secret('DATAFORGE_CORE_JWT')").first(1)[0][0]
+        params = self.parse_key_value_args()
+        process_id = params.get('process_id')
+        self.input_id = params.get('input_id')
+
+        super().__init__(pg_connection_string_read, core_jwt_token, process_id)
+
+        self.process_parameters["start_process_flag"] = process_id is None
+
+        self.logger.info(f"Initialized Snowflake base session for {self.__class__.__name__} with parameters {self.process_parameters}")
+
+    def _write_input_table(self,df: snowflake.snowpark.dataframe.DataFrame | Callable[[],
+        snowflake.snowpark.dataframeDataFrame] | None = None) -> snowflake.snowpark.dataframe.DataFrame:
+        from snowflake.snowpark.types import StructType, StructField, StringType
+        if not self._is_open:
+            raise Exception("Session is closed")
+        if df is None:
+            # create empty df
+            result_df = df = self.snowpark_session.create_dataframe([], StructType([StructField("id", StringType())]))
+        else:
+            if callable(df):
+                result_df = df()  # call it to get the DataFrame
+            else:
+                result_df = df
+        table = f"{self._systemConfiguration.dataLakeDbName}.{self._systemConfiguration.dataLakeSchemaName}.RAW_INPUT_{self.process.inputId}"
+        self.log(f"Writing dataframe to table {table}")
+        result_df.write.save_as_table(
+            table_name=table,
+            mode="overwrite",
+            table_type="transient"
+        )
+        self.log(f"Table {table} written")
+        if self.process.startProcessFlag:
+            # process started by IngestionSession, tell Core to continue and not run Notebook
+            self._pg.sql("SELECT sparky.sdk_complete_manual_process(%s)", [self.process.processId], fetch=False)
+
+    @staticmethod
+    def parse_key_value_args():
+        """
+        Parse command line arguments formatted as key=value into a dict.
+        Example: python script.py foo=123 bar=hello
+
+        Returns: {'foo': '123', 'bar': 'hello'}
+        """
+        argv = sys.argv
+        params: dict[str,str] = {}
+        for arg in argv:
+            if "=" in arg:
+                key, value = arg.split("=", 1)  # split only on first '='
+                params[key] = value
+        return params
dataforge_sdk-10.0.0rc3/dataforge/snowflake/_snowflake_ingestion_session.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+import json
+from typing import Callable
+from dataforge.snowflake._snowflake_base_session import _Snowflake_Base_Session
+
+
+class _Snowflake_Ingestion_Session(_Snowflake_Base_Session):
+    """Base ingestion session class for Snowflake platform.
+    Class should not be instantiated by user directly: use process-specific Session classes instead
+    Adds Snowpark session
+    """
+
+    def __init__(self):
+        super().__init__()
+
+
+    def ingest(self,df: snowflake.snowpark.dataframe.DataFrame | Callable[[], snowflake.snowpark.dataframeDataFrame] | None = None):
+        """Ingest the provided DataFrame into the DataForge and update input record.
+
+        Writes the DataFrame to raw Snowflake table
+
+        Args:
+            df (Callable[[], DataFrame] | DataFrame): parameterless def that you defined, returning the Snowpark DataFrame to ingest (recommended),
+            or spark DataFrame
+        """
+        try:
+            self._write_input_table(df)
+        except Exception as e:
+            self._log_fail(e)
+            if self.process.startProcessFlag:
+                # Fail input and process to prevent core from executing it
+                failure_update_json = {
+                    "process_id": self.process.processId,
+                    "ingestion_status_code": "F"
+                }
+                self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
+                             (json.dumps(failure_update_json),), fetch=False)
+        finally:
+            self.close()
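A hedged sketch of the Snowflake-side ingestion path above, assuming the code runs where snowflake.snowpark has an active session and the DataForge get_secret() function is available; the constructor is assumed to take no arguments and the query is a placeholder:

    from dataforge import IngestionSession   # resolves to the Snowpark-backed class on Snowflake

    session = IngestionSession()              # assumed zero-argument construction

    def load_rows():
        # parameterless callable returning the Snowpark DataFrame to ingest
        return session.snowpark_session.sql("SELECT * FROM demo_db.public.orders")   # placeholder query

    session.ingest(load_rows)   # writes a transient RAW_INPUT_<input_id> table and notifies Core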
dataforge_sdk-10.0.0rc3/dataforge/snowflake/_snowflake_parsing_session.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+from typing import Callable
+from ._snowflake_base_session import _Snowflake_Base_Session
+
+
+class _Snowflake_Parsing_Session(_Snowflake_Base_Session):
+    """Implements run method for Databricks.
+    """
+
+
+    def run(self,df: snowflake.snowpark.dataframe.DataFrame | Callable[[], snowflake.snowpark.dataframeDataFrame] | None = None):
+        """Save parsed file from the provided DataFrame, and upload it into the DataForge data lake.
+
+        Writes the DataFrame to parsed Parquet file,
+        updates the input record with status, file size, record count, and notifies
+        the Core API of process completion. On failure, updates logs and flags the input and process
+        records as failed.
+
+        Args:
+            df (DataFrame): parameterless def that you defined, returning the Spark DataFrame containing parsed file data (recommended),
+            or spark DataFrame
+        """
+        try:
+            self._write_input_table(df)
+
+        except Exception as e:
+            self._log_fail(e)
+            if self.process.startProcessFlag:
+                self._end_process("F")
+
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge_sdk.egg-info/PKG-INFO
@@ -1,12 +1,14 @@
 Metadata-Version: 2.4
 Name: dataforge-sdk
-Version: 10.0.dev115
+Version: 10.0.0rc3
 Summary: SDK for creating DataForge extensions
 Author-email: Vadim Orlov <vorlov@dataforgelabs.com>
 Project-URL: Homepage, https://docs.dataforgelabs.com
 Project-URL: Issues, https://docs.dataforgelabs.com/hc/en-us/requests/new
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+Provides-Extra: psycopg2
+Requires-Dist: psycopg2-binary>=2.9; extra == "psycopg2"

 # dataforge-sdk
 SDK for creating DataForge extensions.
dataforge_sdk-10.0.0rc3/dataforge_sdk.egg-info/SOURCES.txt
@@ -0,0 +1,28 @@
+README.md
+pyproject.toml
+setup.cfg
+dataforge/__init__.py
+dataforge/_base_session.py
+dataforge/_session.py
+dataforge/ingestion_session.py
+dataforge/parsing_session.py
+dataforge/pg.py
+dataforge/post_output_session.py
+dataforge/postgres_connection.py
+dataforge/process_record.py
+dataforge/system_configuration.py
+dataforge/utils.py
+dataforge/databricks/__init__.py
+dataforge/databricks/_databricks_base_session.py
+dataforge/databricks/_databricks_ingestion_session.py
+dataforge/databricks/_databricks_parsing_session.py
+dataforge/databricks/_databricks_pg.py
+dataforge/snowflake/__init__.py
+dataforge/snowflake/_snowflake_base_session.py
+dataforge/snowflake/_snowflake_ingestion_session.py
+dataforge/snowflake/_snowflake_parsing_session.py
+dataforge_sdk.egg-info/PKG-INFO
+dataforge_sdk.egg-info/SOURCES.txt
+dataforge_sdk.egg-info/dependency_links.txt
+dataforge_sdk.egg-info/requires.txt
+dataforge_sdk.egg-info/top_level.txt
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/pyproject.toml
@@ -6,15 +6,15 @@ requires = [
 build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-sdk"
-version = "10.0.
+version = "10.0.0-rc.3"
 authors = [
     {name="Vadim Orlov", email="vorlov@dataforgelabs.com"}
 ]
 description = "SDK for creating DataForge extensions"
 readme = "README.md"
 requires-python = ">=3.10"
-dependencies
-
+[project.optional-dependencies]
+psycopg2 = ["psycopg2-binary>=2.9"]
 [project.urls]
 Homepage = "https://docs.dataforgelabs.com"
 Issues = "https://docs.dataforgelabs.com/hc/en-us/requests/new"
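The rc3 metadata declares psycopg2 as an optional extra, so installs that need the direct Postgres write connection would request it explicitly. A hedged example of the expected install command (standard pip extras syntax; the extra name psycopg2 comes from the metadata above):

    pip install "dataforge-sdk[psycopg2]"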
dataforge_sdk-10.0.dev115/dataforge/_session.py
@@ -1,7 +0,0 @@
-# Pick base class at import time
-if "spark" in globals() and type(spark).__name__ == 'SparkSession':
-    from dataforge._databricks_session import _Databricks_Session
-    _Session = _Databricks_Session
-else:
-    from dataforge._snowflake_session import _Snowflake_Session
-    _Session = _Snowflake_Session
dataforge_sdk-10.0.dev115/dataforge/_snowflake_session.py
@@ -1,82 +0,0 @@
-import json
-from typing import Callable
-
-from dataforge._base_session import _Base_Session
-import sys
-from snowflake.snowpark.context import get_active_session
-import streamlit as st
-from snowflake.snowpark.dataframe import DataFrame
-
-class _Snowflake_Session(_Base_Session):
-    """Base session class for Snowflake platform.
-    Class should not be instantiated by user directly: use process-specific Session classes instead
-    Adds Snowpark session
-    """
-
-    def __init__(self):
-        pg_connection_string_read = st.secrets['DATAFORGE_PG_READ']
-        core_jwt_token = st.secrets['DATAFORGE_CORE_JWT']
-        params = self.parse_key_value_args()
-        process_id = params.get('process_id')
-        self.input_id = params.get('input_id')
-
-        super().__init__(pg_connection_string_read, core_jwt_token, process_id)
-        self.snowpark_session = get_active_session()
-        self.process_parameters["start_process_flag"] = process_id is None
-
-        self.logger.info(f"Initialized Snowflake base session for {self.__class__.__name__} with parameters {self.process_parameters}")
-
-
-    @staticmethod
-    def parse_key_value_args():
-        """
-        Parse command line arguments formatted as key=value into a dict.
-        Example: python script.py foo=123 bar=hello
-
-        Returns: {'foo': '123', 'bar': 'hello'}
-        """
-        argv = sys.argv
-        params: dict[str,str] = {}
-        for arg in argv:
-            if "=" in arg:
-                key, value = arg.split("=", 1)  # split only on first '='
-                params[key] = value
-            else:
-                raise ValueError(f"Invalid argument format (expected key=value): {arg}")
-        return params
-
-    def ingest(self,df: DataFrame | Callable[[], DataFrame] | None = None):
-        """Ingest the provided DataFrame into the DataForge and update input record.
-
-        Writes the DataFrame to raw Snowflake table
-
-        Args:
-            df (Callable[[], DataFrame] | DataFrame): parameterless def that you defined, returning the Spark DataFrame to ingest (recommended),
-            or spark DataFrame
-        """
-        try:
-            if not self._is_open:
-                raise Exception("Session is closed")
-            table = f"{self._systemConfiguration.dataLakeDbName}.{self._systemConfiguration.dataLakeSchemaName}.INPUT_{self.process.inputId}"
-            self.log(f"Writing dataframe to table {table}")
-            df.write.save_as_table(
-                name=table,
-                mode="overwrite",
-                table_type="transient"
-            )
-            self.log(f"Table {table} written")
-            if self.process.startProcessFlag:
-                # process started by IngestionSession, tell Core to continue and not run Notebook
-                self._pg.sql("SELECT sparky.sdk_complete_manual_process(%s)", [self.process.processId], fetch=False)
-        except Exception as e:
-            self._log_fail(e)
-            if self.process.startProcessFlag:
-                # Fail input and process to prevent core from executing it
-                failure_update_json = {
-                    "process_id": self.process.processId,
-                    "ingestion_status_code": "F"
-                }
-                self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
-                             (json.dumps(failure_update_json),), fetch=False)
-        finally:
-            self.close()
dataforge_sdk-10.0.dev115/dataforge/pg.py
@@ -1,91 +0,0 @@
-"""Postgres utilities for data operations.
-
-This module provides functions to execute SQL queries against a Postgres database
-using Spark JDBC for reads and a direct write connection for write operations.
-"""
-from pyspark.dbutils import DBUtils
-from pyspark.sql import SparkSession, DataFrame
-from .postgres_connection import PostgresConnection
-
-spark = SparkSession.builder.getOrCreate()
-dbutils = DBUtils(spark)
-pg_connection_string_read = dbutils.secrets.get("sparky", "pg_read")
-
-
-def update(query: str):
-    """Execute an update SQL query on the DataForge metastore Postgres database.
-
-    Args:
-        query (str): SQL query string to execute.
-
-    Returns:
-        None
-
-    Raises:
-        Exception: If write connection cannot be established or SQL execution fails.
-    """
-    pg = _get_pg_write_connection()
-    pg.sql(query, fetch=False)
-
-
-def execute(query: str):
-    """Alias for update() to execute write SQL queries.
-
-    Args:
-        query (str): SQL query string to execute.
-
-    Returns:
-        None
-    """
-    update(query)
-
-
-def select(query: str) -> DataFrame:
-    """Execute a SELECT SQL query on the DataForge metastore Postgres database and return a DataFrame with results.
-
-    Args:
-        query (str): SQL SELECT query string.
-
-    Returns:
-        DataFrame: Spark DataFrame containing query results.
-
-    Raises:
-        Exception: If Spark fails to load data or connection issues.
-    """
-    return spark.read.format("jdbc") \
-        .option("url", pg_connection_string_read) \
-        .option("query", query) \
-        .load()
-
-
-def pull(source_id: int):
-    """Trigger new ingestion (pull data) on DataForge source for a given source ID.
-
-    Args:
-        source_id (int): Identifier for the source to pull.
-
-    Returns:
-        None
-
-    Raises:
-        Exception: If write connection cannot be established or SQL execution fails.
-    """
-    pg = _get_pg_write_connection()
-    pg.sql("SELECT meta.svc_pull_source(%s, %s)", (source_id,'sdk'), fetch=False)
-
-
-def _get_pg_write_connection() -> PostgresConnection:
-    """Internal method to retrieve a PostgresConnection for write operations using secured secrets.
-
-    Returns:
-        PostgresConnection: Connection object for executing write queries.
-
-    Raises:
-        Exception: If the 'pg_write' secret is not defined in the 'sparky' scope.
-    """
-    secrets = dbutils.secrets.list("sparky")
-    if any(secret.key == "pg_write" for secret in secrets):
-        conn_string = dbutils.secrets.get("sparky", "pg_write")
-        return PostgresConnection(conn_string + "&application_name=sdk-pg")
-    else:
-        raise Exception("pg_write secret is not defined in sparky scope")
dataforge_sdk-10.0.dev115/dataforge_sdk.egg-info/SOURCES.txt
@@ -1,20 +0,0 @@
-README.md
-pyproject.toml
-setup.cfg
-dataforge/__init__.py
-dataforge/_base_session.py
-dataforge/_databricks_session.py
-dataforge/_session.py
-dataforge/_snowflake_session.py
-dataforge/ingestion_session.py
-dataforge/parsing_session.py
-dataforge/pg.py
-dataforge/post_output_session.py
-dataforge/postgres_connection.py
-dataforge/process_record.py
-dataforge/system_configuration.py
-dataforge/utils.py
-dataforge_sdk.egg-info/PKG-INFO
-dataforge_sdk.egg-info/SOURCES.txt
-dataforge_sdk.egg-info/dependency_links.txt
-dataforge_sdk.egg-info/top_level.txt
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/README.md RENAMED
File without changes
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/_base_session.py RENAMED
File without changes
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/process_record.py RENAMED
File without changes
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/system_configuration.py RENAMED
File without changes
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/utils.py RENAMED
File without changes
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge_sdk.egg-info/dependency_links.txt RENAMED
File without changes
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge_sdk.egg-info/top_level.txt RENAMED
File without changes
{dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/setup.cfg RENAMED
File without changes