dataforge-sdk 9.2.0rc3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_sdk-9.2.0rc3/PKG-INFO +125 -0
- dataforge_sdk-9.2.0rc3/README.md +115 -0
- dataforge_sdk-9.2.0rc3/dataforge/__init__.py +6 -0
- dataforge_sdk-9.2.0rc3/dataforge/_session.py +302 -0
- dataforge_sdk-9.2.0rc3/dataforge/ingestion_session.py +119 -0
- dataforge_sdk-9.2.0rc3/dataforge/parsing_session.py +82 -0
- dataforge_sdk-9.2.0rc3/dataforge/pg.py +91 -0
- dataforge_sdk-9.2.0rc3/dataforge/post_output_session.py +60 -0
- dataforge_sdk-9.2.0rc3/dataforge/postgres_connection.py +49 -0
- dataforge_sdk-9.2.0rc3/dataforge/process_record.py +21 -0
- dataforge_sdk-9.2.0rc3/dataforge/system_configuration.py +22 -0
- dataforge_sdk-9.2.0rc3/dataforge/utils.py +25 -0
- dataforge_sdk-9.2.0rc3/dataforge_sdk.egg-info/PKG-INFO +125 -0
- dataforge_sdk-9.2.0rc3/dataforge_sdk.egg-info/SOURCES.txt +17 -0
- dataforge_sdk-9.2.0rc3/dataforge_sdk.egg-info/dependency_links.txt +1 -0
- dataforge_sdk-9.2.0rc3/dataforge_sdk.egg-info/top_level.txt +1 -0
- dataforge_sdk-9.2.0rc3/pyproject.toml +20 -0
- dataforge_sdk-9.2.0rc3/setup.cfg +22 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataforge-sdk
|
|
3
|
+
Version: 9.2.0rc3
|
|
4
|
+
Summary: SDK for creating DataForge extensions
|
|
5
|
+
Author-email: Vadim Orlov <vorlov@dataforgelabs.com>
|
|
6
|
+
Project-URL: Homepage, https://docs.dataforgelabs.com
|
|
7
|
+
Project-URL: Issues, https://docs.dataforgelabs.com/hc/en-us/requests/new
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# dataforge-sdk
|
|
12
|
+
SDK for creating DataForge extensions.
|
|
13
|
+
|
|
14
|
+
## Postgres Utilities
|
|
15
|
+
|
|
16
|
+
The `dataforge.pg` module provides helper functions to execute SQL operations against the DataForge Postgres metastore:
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
from dataforge.pg import select, update, pull
|
|
20
|
+
|
|
21
|
+
# Execute a SELECT query and return a Spark DataFrame
|
|
22
|
+
df = select("SELECT * FROM my_table")
|
|
23
|
+
|
|
24
|
+
# Execute an UPDATE/INSERT/DELETE query
|
|
25
|
+
update("UPDATE my_table SET col = 'value'")
|
|
26
|
+
|
|
27
|
+
# Trigger a new data pull for source_id 123
|
|
28
|
+
pull(123)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## IngestionSession
|
|
32
|
+
|
|
33
|
+
The `IngestionSession` class manages a custom data ingestion process lifecycle.
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from dataforge import IngestionSession
|
|
37
|
+
|
|
38
|
+
# Initialize a session (production use)
|
|
39
|
+
session = IngestionSession()
|
|
40
|
+
|
|
41
|
+
# Initialize a session (optional source_name/project_name for testing)
|
|
42
|
+
session = IngestionSession(source_name="my_source", project_name="my_project")
|
|
43
|
+
|
|
44
|
+
# Ingest data
|
|
45
|
+
# pass a function returning a DataFrame (recommended to integrate logging with DataForge)
|
|
46
|
+
session.ingest(lambda: spark.read.csv("s3://bucket/path/input.csv"))
|
|
47
|
+
|
|
48
|
+
# pass a DataFrame (can be used for testing, not recommended for production deployment)
|
|
49
|
+
df = spark.read.csv("s3://bucket/path/input.csv")
|
|
50
|
+
session.ingest(df)
|
|
51
|
+
|
|
52
|
+
# ingest empty dataframe to create 0-record input
|
|
53
|
+
session.ingest()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Fail the process with error message
|
|
57
|
+
session.fail("Error message")
|
|
58
|
+
|
|
59
|
+
# Retrieve latest tracking fields
|
|
60
|
+
tracking = session.latest_tracking_fields()
|
|
61
|
+
|
|
62
|
+
# Retrieve connection parameters for the current source
|
|
63
|
+
connection_parameters = session.connection_parameters()
|
|
64
|
+
|
|
65
|
+
# Retrieve custom parameters for the current source
|
|
66
|
+
custom_parameters = session.custom_parameters()
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## ParsingSession
|
|
71
|
+
|
|
72
|
+
The `ParsingSession` class manages a custom parse process lifecycle.
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from dataforge import ParsingSession
|
|
76
|
+
|
|
77
|
+
# Initialize a session (production use)
|
|
78
|
+
session = ParsingSession()
|
|
79
|
+
|
|
80
|
+
# Initialize a session (optional input_id for testing)
|
|
81
|
+
session = ParsingSession(input_id=123)
|
|
82
|
+
|
|
83
|
+
# Retrieve custom parameters
|
|
84
|
+
params = session.custom_parameters()
|
|
85
|
+
|
|
86
|
+
# Get the path of file to be parsed
|
|
87
|
+
path = session.file_path
|
|
88
|
+
|
|
89
|
+
# Run parsing: pass a DataFrame, a function returning a DataFrame or None (0-record file)
|
|
90
|
+
session.run(lambda: spark.read.json(session.file_path))
|
|
91
|
+
|
|
92
|
+
# Fail the process with error message
|
|
93
|
+
session.fail("Error message")
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## PostOutputSession
|
|
98
|
+
|
|
99
|
+
The `PostOutputSession` class manages a custom post-output process lifecycle.
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from dataforge import PostOutputSession
|
|
103
|
+
|
|
104
|
+
# Initialize a session (production use)
|
|
105
|
+
session = PostOutputSession()
|
|
106
|
+
|
|
107
|
+
# Initialize a session (optional names for testing)
|
|
108
|
+
session = PostOutputSession(output_name="report", output_source_name="my_source", project_name="my_project")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# Get the path of file generated by preceding output process
|
|
112
|
+
path = session.file_path()
|
|
113
|
+
|
|
114
|
+
# Retrieve connection parameters for the current output
|
|
115
|
+
connection_parameters = session.connection_parameters()
|
|
116
|
+
|
|
117
|
+
# Retrieve custom parameters for the current output
|
|
118
|
+
custom_parameters = session.custom_parameters()
|
|
119
|
+
|
|
120
|
+
# Run post-output logic: pass a function encapsulating custom code
|
|
121
|
+
session.run(lambda: print(f"Uploading file from {path}"))
|
|
122
|
+
|
|
123
|
+
# Fail the process with error message
|
|
124
|
+
session.fail("Error message")
|
|
125
|
+
```
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# dataforge-sdk
|
|
2
|
+
SDK for creating DataForge extensions.
|
|
3
|
+
|
|
4
|
+
## Postgres Utilities
|
|
5
|
+
|
|
6
|
+
The `dataforge.pg` module provides helper functions to execute SQL operations against the DataForge Postgres metastore:
|
|
7
|
+
|
|
8
|
+
```python
|
|
9
|
+
from dataforge.pg import select, update, pull
|
|
10
|
+
|
|
11
|
+
# Execute a SELECT query and return a Spark DataFrame
|
|
12
|
+
df = select("SELECT * FROM my_table")
|
|
13
|
+
|
|
14
|
+
# Execute an UPDATE/INSERT/DELETE query
|
|
15
|
+
update("UPDATE my_table SET col = 'value'")
|
|
16
|
+
|
|
17
|
+
# Trigger a new data pull for source_id 123
|
|
18
|
+
pull(123)
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## IngestionSession
|
|
22
|
+
|
|
23
|
+
The `IngestionSession` class manages a custom data ingestion process lifecycle.
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from dataforge import IngestionSession
|
|
27
|
+
|
|
28
|
+
# Initialize a session (production use)
|
|
29
|
+
session = IngestionSession()
|
|
30
|
+
|
|
31
|
+
# Initialize a session (optional source_name/project_name for testing)
|
|
32
|
+
session = IngestionSession(source_name="my_source", project_name="my_project")
|
|
33
|
+
|
|
34
|
+
# Ingest data
|
|
35
|
+
# pass a function returning a DataFrame (recommended to integrate logging with DataForge)
|
|
36
|
+
session.ingest(lambda: spark.read.csv("s3://bucket/path/input.csv"))
|
|
37
|
+
|
|
38
|
+
# pass a DataFrame (can be used for testing, not recommended for production deployment)
|
|
39
|
+
df = spark.read.csv("s3://bucket/path/input.csv")
|
|
40
|
+
session.ingest(df)
|
|
41
|
+
|
|
42
|
+
# ingest empty dataframe to create 0-record input
|
|
43
|
+
session.ingest()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Fail the process with error message
|
|
47
|
+
session.fail("Error message")
|
|
48
|
+
|
|
49
|
+
# Retrieve latest tracking fields
|
|
50
|
+
tracking = session.latest_tracking_fields()
|
|
51
|
+
|
|
52
|
+
# Retrieve connection parameters for the current source
|
|
53
|
+
connection_parameters = session.connection_parameters()
|
|
54
|
+
|
|
55
|
+
# Retrieve custom parameters for the current source
|
|
56
|
+
custom_parameters = session.custom_parameters()
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## ParsingSession
|
|
61
|
+
|
|
62
|
+
The `ParsingSession` class manages a custom parse process lifecycle.
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from dataforge import ParsingSession
|
|
66
|
+
|
|
67
|
+
# Initialize a session (production use)
|
|
68
|
+
session = ParsingSession()
|
|
69
|
+
|
|
70
|
+
# Initialize a session (optional input_id for testing)
|
|
71
|
+
session = ParsingSession(input_id=123)
|
|
72
|
+
|
|
73
|
+
# Retrieve custom parameters
|
|
74
|
+
params = session.custom_parameters()
|
|
75
|
+
|
|
76
|
+
# Get the path of file to be parsed
|
|
77
|
+
path = session.file_path
|
|
78
|
+
|
|
79
|
+
# Run parsing: pass a DataFrame, a function returning a DataFrame or None (0-record file)
|
|
80
|
+
session.run(lambda: spark.read.json(session.file_path))
|
|
81
|
+
|
|
82
|
+
# Fail the process with error message
|
|
83
|
+
session.fail("Error message")
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## PostOutputSession
|
|
88
|
+
|
|
89
|
+
The `PostOutputSession` class manages a custom post-output process lifecycle.
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from dataforge import PostOutputSession
|
|
93
|
+
|
|
94
|
+
# Initialize a session (production use)
|
|
95
|
+
session = PostOutputSession()
|
|
96
|
+
|
|
97
|
+
# Initialize a session (optional names for testing)
|
|
98
|
+
session = PostOutputSession(output_name="report", output_source_name="my_source", project_name="my_project")
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Get the path of file generated by preceding output process
|
|
102
|
+
path = session.file_path()
|
|
103
|
+
|
|
104
|
+
# Retrieve connection parameters for the current output
|
|
105
|
+
connection_parameters = session.connection_parameters()
|
|
106
|
+
|
|
107
|
+
# Retrieve custom parameters for the current output
|
|
108
|
+
custom_parameters = session.custom_parameters()
|
|
109
|
+
|
|
110
|
+
# Run post-output logic: pass a function encapsulating custom code
|
|
111
|
+
session.run(lambda: print(f"Uploading file from {path}"))
|
|
112
|
+
|
|
113
|
+
# Fail the process with error message
|
|
114
|
+
session.fail("Error message")
|
|
115
|
+
```
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
"""Session management for DataForge SDK.
|
|
2
|
+
|
|
3
|
+
This module provides the Session class for managing DataForge SDK sessions,
|
|
4
|
+
|
|
5
|
+
Classes:
|
|
6
|
+
CoreAPIError: Raised when a Core API call fails.
|
|
7
|
+
VersionMismatchError: Raised when the SDK version does not match server expectation.
|
|
8
|
+
Session: Main class for session management.
|
|
9
|
+
"""
|
|
10
|
+
import json
|
|
11
|
+
import re
|
|
12
|
+
import traceback
|
|
13
|
+
from typing import Optional, Literal
|
|
14
|
+
from pyspark.sql import SparkSession, DataFrame
|
|
15
|
+
import requests
|
|
16
|
+
import logging
|
|
17
|
+
from pyspark.dbutils import DBUtils
|
|
18
|
+
from pyspark.sql.functions import monotonically_increasing_id, lit
|
|
19
|
+
from pyspark.sql.types import LongType
|
|
20
|
+
|
|
21
|
+
from dataforge.process_record import ProcessRecord
|
|
22
|
+
from dataforge.system_configuration import SystemConfiguration
|
|
23
|
+
from dataforge.postgres_connection import PostgresConnection
|
|
24
|
+
from importlib.metadata import version
|
|
25
|
+
|
|
26
|
+
from .utils import _setup_logger
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class CoreAPIError(Exception):
    """Raised when a call to the Core API does not succeed."""
|
|
32
|
+
|
|
33
|
+
class VersionMismatchError(Exception):
    """Raised when the installed SDK version is rejected by the server."""
|
|
36
|
+
|
|
37
|
+
class _Session:
|
|
38
|
+
"""Base session class for DataForge SDK operations.
|
|
39
|
+
Class should not be instantiated by user directly: use process-specific Session classes instead
|
|
40
|
+
Manages Spark session, DBUtils, system configuration, Core API calls,
|
|
41
|
+
logging, and process lifecycle.
|
|
42
|
+
|
|
43
|
+
Attributes:
|
|
44
|
+
spark (SparkSession): Spark session for data processing.
|
|
45
|
+
dbutils (DBUtils): Databricks DBUtils instance.
|
|
46
|
+
logger (logging.Logger): Logger instance for session logs.
|
|
47
|
+
_systemConfiguration (SystemConfiguration): System configuration loaded from Postgres.
|
|
48
|
+
_pg (PostgresConnection): Postgres connection to core API database.
|
|
49
|
+
process (ProcessRecord): The current process record.
|
|
50
|
+
version (str): DataForge SDK version.
|
|
51
|
+
"""
|
|
52
|
+
_systemConfiguration: SystemConfiguration
|
|
53
|
+
_pg: PostgresConnection
|
|
54
|
+
spark: SparkSession
|
|
55
|
+
dbutils: DBUtils
|
|
56
|
+
logger: logging.Logger
|
|
57
|
+
process: ProcessRecord
|
|
58
|
+
version = version("dataforge-sdk")
|
|
59
|
+
process_parameters: dict
|
|
60
|
+
_is_open: bool = False
|
|
61
|
+
|
|
62
|
+
def __init__(self):
|
|
63
|
+
self.logger = _setup_logger(self.__class__.__name__)
|
|
64
|
+
self.spark = SparkSession.builder.getOrCreate()
|
|
65
|
+
self.dbutils = self._get_dbutils()
|
|
66
|
+
pg_connection_string_read = self.dbutils.secrets.get("sparky", "pg_read")
|
|
67
|
+
pg_read = PostgresConnection(f"{pg_connection_string_read}&application_name=sdk", self.logger)
|
|
68
|
+
self._systemConfiguration = SystemConfiguration(pg_read)
|
|
69
|
+
pg_connection_string: str = self._core_api_call("core/sparky-db-connection")["connection"]
|
|
70
|
+
self._pg = PostgresConnection(f"{pg_connection_string}&application_name=sdk", self.logger)
|
|
71
|
+
self._is_open = True
|
|
72
|
+
self._check_version()
|
|
73
|
+
self.process_parameters = {
|
|
74
|
+
"version": self.version,
|
|
75
|
+
"packageName": "dataforge-sdk"
|
|
76
|
+
}
|
|
77
|
+
try:
|
|
78
|
+
self.process_parameters['process_id'] = int(self.dbutils.widgets.get("process_id"))
|
|
79
|
+
except Exception as e:
|
|
80
|
+
pass
|
|
81
|
+
self.logger.info(f"Initialized base session for {self.__class__.__name__}")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _get_dbutils(self):
|
|
85
|
+
return DBUtils(self.spark)
|
|
86
|
+
|
|
87
|
+
def _core_api_call(self, route: str):
|
|
88
|
+
"""Make a GET request to the Core API.
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
route (str): API route to call, appended to core URI.
|
|
92
|
+
|
|
93
|
+
Returns:
|
|
94
|
+
dict: Parsed JSON response from Core API.
|
|
95
|
+
|
|
96
|
+
Raises:
|
|
97
|
+
CoreAPIError: If response status is not 200.
|
|
98
|
+
"""
|
|
99
|
+
add_core = "core/" if route.startswith("process-complete") and self._systemConfiguration.saas_flag else ""
|
|
100
|
+
end_point = f"{self._systemConfiguration.coreUri}/{add_core}{route}"
|
|
101
|
+
core_jwt_token = self.dbutils.secrets.get("sparky", "coreJWT")
|
|
102
|
+
headers = {
|
|
103
|
+
"Authorization": f"Bearer {core_jwt_token}",
|
|
104
|
+
"Content-Type": "application/json",
|
|
105
|
+
"Accept": "application/json"
|
|
106
|
+
}
|
|
107
|
+
self.logger.debug(f"Executing core API call to {end_point}")
|
|
108
|
+
response = requests.get(end_point, headers=headers)
|
|
109
|
+
if response.status_code == 200:
|
|
110
|
+
data = response.json() if response.content else None
|
|
111
|
+
return data
|
|
112
|
+
else:
|
|
113
|
+
raise CoreAPIError(f"Core API call to {end_point} failed with status code {response.status_code}")
|
|
114
|
+
|
|
115
|
+
def _check_version(self):
|
|
116
|
+
"""Checks that the SDK version is compatible with the server.
|
|
117
|
+
|
|
118
|
+
Calls the database function to verify the SDK version. Raises
|
|
119
|
+
VersionMismatchError if there is a version mismatch.
|
|
120
|
+
"""
|
|
121
|
+
self.logger.info(f"Executing package dataforge-sdk version {self.version}")
|
|
122
|
+
check_res = self._pg.sql("select sparky.sdk_check_version(%s,%s)", [self.version, "dataforge-sdk"])
|
|
123
|
+
if check_res.get("error"):
|
|
124
|
+
raise VersionMismatchError(check_res.get("error"))
|
|
125
|
+
|
|
126
|
+
def log(self, message:str, severity: str = "I"):
|
|
127
|
+
"""Log a message to both the local logger and the process log in the database.
|
|
128
|
+
|
|
129
|
+
Args:
|
|
130
|
+
message (str): The log message.
|
|
131
|
+
severity (str): Severity level, "I" for info, "E" for error. Defaults to "I".
|
|
132
|
+
"""
|
|
133
|
+
match severity:
|
|
134
|
+
case "I":
|
|
135
|
+
self.logger.info(message)
|
|
136
|
+
case "E":
|
|
137
|
+
self.logger.error(message)
|
|
138
|
+
payload = {"log_id": self.process.logId, "operation_type": self.process.operationType,
|
|
139
|
+
"severity": severity, "message": message}
|
|
140
|
+
self._pg.sql("SELECT sparky.write_log(%s)", [json.dumps(payload)], fetch=False)
|
|
141
|
+
|
|
142
|
+
def connection_parameters(self):
|
|
143
|
+
"""Retrieve connection parameters for the current process.
|
|
144
|
+
|
|
145
|
+
Returns:
|
|
146
|
+
dict: Dictionary containing private and public connection parameters.
|
|
147
|
+
"""
|
|
148
|
+
connection_id = self.process.connectionId
|
|
149
|
+
self.log(f"Reading connection parameters of connection id {connection_id}")
|
|
150
|
+
params = self._core_api_call(f"core/connection/{connection_id}").get("parameters")
|
|
151
|
+
return {"private_connection_parameters": params.get("private_connection_parameters"),
|
|
152
|
+
"public_connection_parameters": params.get("public_connection_parameters")}
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _end_process(self, status_code: Literal['P', 'F', 'Z'] = 'P', parameters: Optional[dict] = None):
|
|
156
|
+
"""Private method. End the current process
|
|
157
|
+
"""
|
|
158
|
+
payload = parameters if parameters else {}
|
|
159
|
+
payload["process_id"] = self.process.processId
|
|
160
|
+
payload["status_code"] = status_code
|
|
161
|
+
self._pg.sql("select meta.prc_process_end(%s)", [json.dumps(payload)], fetch=False)
|
|
162
|
+
self._core_api_call(f"process-complete/{self.process.processId}")
|
|
163
|
+
if status_code in ('P','Z'):
|
|
164
|
+
self.logger.info("Session completed successfully")
|
|
165
|
+
else:
|
|
166
|
+
self.logger.error("Process failed")
|
|
167
|
+
self.close()
|
|
168
|
+
|
|
169
|
+
def fail(self,message: str):
|
|
170
|
+
"""Fail current session and log the message.
|
|
171
|
+
|
|
172
|
+
Args:
|
|
173
|
+
message (str): The log message.
|
|
174
|
+
"""
|
|
175
|
+
self.log(f"Process failed with error: {message}", "E")
|
|
176
|
+
self._end_process("F")
|
|
177
|
+
|
|
178
|
+
def _log_fail(self, e: Exception):
|
|
179
|
+
"""Log failure for the given exception.
|
|
180
|
+
|
|
181
|
+
Args:
|
|
182
|
+
e (Exception): The exception that caused the failure.
|
|
183
|
+
"""
|
|
184
|
+
traceback.print_exception(e)
|
|
185
|
+
self.log(f"Process failed with error: {e}", "E")
|
|
186
|
+
|
|
187
|
+
def close(self):
|
|
188
|
+
"""Close the session."""
|
|
189
|
+
self._pg.close()
|
|
190
|
+
self.logger.info("Session closed")
|
|
191
|
+
self._is_open = False
|
|
192
|
+
|
|
193
|
+
def custom_parameters(self):
|
|
194
|
+
"""Retrieve custom parameters from the process.
|
|
195
|
+
|
|
196
|
+
Returns:
|
|
197
|
+
dict: The custom parameters dictionary from the process.
|
|
198
|
+
"""
|
|
199
|
+
return self.process.parameters.get('custom_parameters')
|
|
200
|
+
|
|
201
|
+
def _write_parsed_data(self, in_df: DataFrame, dest_file_path: str) -> tuple[int, int]:
|
|
202
|
+
"""Process input DataFrame, write to Parquet, and update metadata.
|
|
203
|
+
|
|
204
|
+
Args:
|
|
205
|
+
in_df (DataFrame): Input Spark DataFrame to process and write.
|
|
206
|
+
dest_file_path (str): Destination path for saving Parquet file.
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
tuple[int, int]: A tuple containing the total file size in bytes and the number of records written.
|
|
210
|
+
|
|
211
|
+
Raises:
|
|
212
|
+
Exception: If duplicate columns are detected or metadata update fails.
|
|
213
|
+
"""
|
|
214
|
+
self.log("Data read successfully. Checking schema.")
|
|
215
|
+
|
|
216
|
+
select_list = self._pg.sql("SELECT sparky.get_select_list(%s)", (self.process.sourceId,))
|
|
217
|
+
df_sel = in_df.selectExpr(*select_list)
|
|
218
|
+
self.log(f"Applied select list {select_list}")
|
|
219
|
+
|
|
220
|
+
# Duplicate column check
|
|
221
|
+
cols = df_sel.columns
|
|
222
|
+
dup_columns = [col for col in set(cols) if cols.count(col) > 1]
|
|
223
|
+
if dup_columns:
|
|
224
|
+
raise Exception(f"Duplicate columns detected: {', '.join(dup_columns)}")
|
|
225
|
+
|
|
226
|
+
# Cast binary/void to string
|
|
227
|
+
binary_casts = [
|
|
228
|
+
f"CAST(`{f.name}` AS STRING) `{f.name}`" if f.dataType.typeName() in ("binary", "void")
|
|
229
|
+
else f"`{f.name}`"
|
|
230
|
+
for f in df_sel.schema.fields
|
|
231
|
+
]
|
|
232
|
+
df = df_sel.selectExpr(*binary_casts)
|
|
233
|
+
|
|
234
|
+
# Schema as JSON array
|
|
235
|
+
schema = []
|
|
236
|
+
for f in df.schema.fields:
|
|
237
|
+
field_name = f.name.lower() if self.process.forceCaseInsensitive else f.name
|
|
238
|
+
name_normalized = re.sub(r'\W+', '_', field_name)
|
|
239
|
+
column_normalized = ("_" if field_name[0].isdigit() else "") + name_normalized # add leading underscore
|
|
240
|
+
|
|
241
|
+
if f.dataType.simpleString().startswith("struct"):
|
|
242
|
+
spark_type = "StructType"
|
|
243
|
+
elif f.dataType.simpleString().startswith("array"):
|
|
244
|
+
spark_type = "ArrayType"
|
|
245
|
+
elif f.dataType.simpleString().startswith("decimal"):
|
|
246
|
+
spark_type = "DecimalType"
|
|
247
|
+
else:
|
|
248
|
+
spark_type = f.dataType.simpleString()
|
|
249
|
+
|
|
250
|
+
attr_schema = json.loads(f.dataType.json())
|
|
251
|
+
self.logger.info(f"Column `{column_normalized}` schema: {attr_schema}")
|
|
252
|
+
schema.append({
|
|
253
|
+
"name": field_name,
|
|
254
|
+
"column_normalized": column_normalized,
|
|
255
|
+
"spark_type": spark_type,
|
|
256
|
+
"schema": attr_schema
|
|
257
|
+
})
|
|
258
|
+
|
|
259
|
+
self.log("Schema read successfully. Updating source raw metadata.")
|
|
260
|
+
|
|
261
|
+
metadata_update_json = {
|
|
262
|
+
"source_id": self.process.sourceId,
|
|
263
|
+
"input_id": self.process.inputId,
|
|
264
|
+
"raw_attributes": schema,
|
|
265
|
+
"ingestion_type": "sparky"
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
result = self._pg.sql("SELECT meta.prc_n_normalize_raw_attribute(%s)", (json.dumps(metadata_update_json),))
|
|
269
|
+
if "error" in result:
|
|
270
|
+
raise Exception(result["error"])
|
|
271
|
+
|
|
272
|
+
normalize_attributes = result["normalized_metadata"]
|
|
273
|
+
|
|
274
|
+
self.log("Source metadata updated. Renaming and upcasting attributes")
|
|
275
|
+
|
|
276
|
+
cast_rename_expr = []
|
|
277
|
+
for att in normalize_attributes:
|
|
278
|
+
base_expr = att.get("upcastExpr") or att["raw_attribute_name"]
|
|
279
|
+
if att["raw_attribute_name"] != att["column_alias"] or self.process.forceCaseInsensitive:
|
|
280
|
+
base_expr += f" AS {att['column_alias']}"
|
|
281
|
+
cast_rename_expr.append(base_expr)
|
|
282
|
+
|
|
283
|
+
self.logger.info("Normalized SQL: " + ", ".join(cast_rename_expr))
|
|
284
|
+
df_update = df.selectExpr(*cast_rename_expr)
|
|
285
|
+
|
|
286
|
+
if self.process.parameters.get("generate_row_id", False):
|
|
287
|
+
self.logger.info("Added s_row_id to data.")
|
|
288
|
+
df_final = df_update.withColumn("s_row_id", monotonically_increasing_id())
|
|
289
|
+
else:
|
|
290
|
+
self.logger.info("generate_row_id = false, added null s_row_id.")
|
|
291
|
+
df_final = df_update.withColumn("s_row_id", lit(None).cast(LongType()))
|
|
292
|
+
|
|
293
|
+
self.log("Writing file")
|
|
294
|
+
df_final.write.format("parquet").mode("overwrite").save(dest_file_path)
|
|
295
|
+
self.log(f"Wrote file {dest_file_path}")
|
|
296
|
+
|
|
297
|
+
row_count = self.spark.read.format("parquet").load(dest_file_path).count()
|
|
298
|
+
self.log(f"{row_count} records counted")
|
|
299
|
+
|
|
300
|
+
file_size = sum(f.size for f in self.dbutils.fs.ls(dest_file_path))
|
|
301
|
+
return file_size, row_count
|
|
302
|
+
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""IngestionSession management for DataForge SDK.
|
|
2
|
+
|
|
3
|
+
This module provides the IngestionSession class
|
|
4
|
+
that manages custom data ingestion process, including initialization, data writing,
|
|
5
|
+
metadata updates, and failure handling.
|
|
6
|
+
|
|
7
|
+
Classes:
|
|
8
|
+
IngestionSession: Manages the custom ingestion process lifecycle.
|
|
9
|
+
"""
|
|
10
|
+
import json
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import Optional, Callable
|
|
13
|
+
from pyspark.sql import DataFrame
|
|
14
|
+
from .process_record import ProcessRecord
|
|
15
|
+
from ._session import _Session
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class IngestionSession(_Session):

    """Session class to manage custom ingestion process lifecycle.

    Creates an ingestion process record on construction and exposes methods to
    ingest data frames and report failures, keeping input metadata in sync and
    notifying the Core API when the process finishes.
    """
    def __init__(self, source_name: Optional[str] = None, project_name: Optional[str] = None):
        """Initialize ingestion session and start a new ingestion process.

        Args:
            source_name (Optional[str]): Optional name of the data source being ingested.
                Used for interactive testing. Leave blank for production use.
            project_name (Optional[str]): Optional project context name. Used for
                interactive testing. Leave blank for production use.
        """
        super().__init__()
        # Collect process-start options, then create the process record in one DB call.
        overrides = {"start_process_flag": True}
        if source_name is not None:
            overrides["source_name"] = source_name
        if project_name is not None:
            overrides["project_name"] = project_name
        self.process_parameters.update(overrides)

        record = self._pg.sql("select sparky.sdk_new_ingestion(%s)", (json.dumps(self.process_parameters),))
        self.process = ProcessRecord(record)

    def latest_tracking_fields(self):
        """Get the latest tracking fields from the ingestion process record.

        Returns:
            dict: The latest tracking data from the process record, or None if not available.
        """
        return self.process.process.get("tracking")

    def ingest(self, df: DataFrame | Callable[[], DataFrame] | None = None):
        """Ingest the provided DataFrame into DataForge and update the input record.

        Writes the DataFrame to a raw Parquet file, updates the input record with
        status, file size and record count, and notifies the Core API of process
        completion. On failure, logs the error and flags the input record as failed.

        Args:
            df (Callable[[], DataFrame] | DataFrame | None): parameterless def that
                you defined, returning the Spark DataFrame to ingest (recommended),
                a Spark DataFrame, or None to register a 0-record input.
        """
        try:
            if not self._is_open:
                raise Exception("Session is closed")
            if df is None:
                # No data supplied: register a zero-record input.
                status, row_count, file_size = "Z", 0, 0
            else:
                frame = df() if callable(df) else df
                dest_file_path = (
                    f"{self._systemConfiguration.datalakePath}"
                    f"/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
                )
                file_size, row_count = self._write_parsed_data(frame, dest_file_path)
                status = "P" if row_count > 0 else "Z"
            input_update_json = {
                "ingestion_status_code": status,
                "extract_datetime": datetime.now().isoformat(),
                "file_size": file_size,
                "process_id": self.process.processId,
                "input_id": self.process.inputId,
                "record_counts": {"Total": row_count}
            }

            self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
                         (json.dumps(input_update_json),), fetch=False)
            self.logger.info("Ingestion completed successfully")

        except Exception as e:
            self._log_fail(e)
            failure_update_json = {
                "process_id": self.process.processId,
                "ingestion_status_code": "F"
            }
            self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
                         (json.dumps(failure_update_json),), fetch=False)
        finally:
            # Always notify the Core API and release the connection.
            self._core_api_call(f"process-complete/{self.process.processId}")
            self.close()

    def fail(self, message: str):
        """Mark the ingestion as failed with a custom message.

        Logs the provided error message, updates the input and process record to
        failure status, and notifies the Core API of process completion.

        Args:
            message (str): Custom error message explaining the failure.
        """
        self.log(f"Custom Ingestion failed with error: {message}", "E")
        payload = {"process_id": self.process.processId, "ingestion_status_code" : "F"}
        self._pg.sql("select meta.prc_iw_in_update_input_record(%s)",
                     [json.dumps(payload)], fetch=False)
        self._core_api_call(f"process-complete/{self.process.processId}")
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
|
|
2
|
+
import json
|
|
3
|
+
from typing import Optional, Callable
|
|
4
|
+
from pyspark.sql import DataFrame
|
|
5
|
+
from .process_record import ProcessRecord
|
|
6
|
+
from ._session import _Session
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ParsingSession(_Session):

    """Session class to manage custom parse process lifecycle.

    Initializes a parse process record and provides methods to parse data
    frames and handle failures, updating metadata and notifying the Core API.
    """
    # Parameters returned by meta.prc_n_start_parse for the current input.
    _parsing_parameters: dict

    def __init__(self, input_id: Optional[int] = None):
        """Initialize custom parse session and start a new custom parse process.

        Args:
            input_id (Optional[int]): Optional input_id of the batch for interactive
                testing. Leave blank for production use.
        """
        super().__init__()
        # initialize process
        if input_id is not None:
            self.process_parameters["input_id"] = input_id

        record = self._pg.sql("select sparky.sdk_new_parse(%s)", (json.dumps(self.process_parameters),))
        self.process = ProcessRecord(record)
        self._parsing_parameters = self._pg.sql("select meta.prc_n_start_parse(%s)",
                                                (json.dumps({"input_id": self.process.inputId}),))
        # The raw file keeps the source file's extension; derive it from the name.
        source_file = self._parsing_parameters["source_file_name"]
        extension = source_file.split(".")[-1]
        self.file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/raw/raw_input_{self.process.inputId}.{extension}"

    def custom_parameters(self):
        """Retrieve custom parsing parameters.

        Returns:
            dict: The custom parsing parameters dictionary.
        """
        return self._parsing_parameters.get('custom_parameters')

    def run(self, df: DataFrame | Callable[[], DataFrame] | None = None):
        """Save the parsed file from the provided DataFrame and upload it into the
        DataForge data lake.

        Writes the DataFrame to a parsed Parquet file, updates the input record
        with status, file size and record count, and notifies the Core API of
        process completion. On failure, logs the error and flags the input and
        process records as failed.

        Args:
            df (DataFrame | Callable[[], DataFrame] | None): parameterless def that
                you defined, returning the Spark DataFrame containing parsed file
                data (recommended), a Spark DataFrame, or None for a 0-record file.
        """
        try:
            if not self._is_open:
                raise Exception("Session is closed")
            frame = df() if callable(df) else df

            if frame is None or frame.isEmpty():
                file_size, row_count = (0, 0)
            else:
                dest_file_path = (
                    f"{self._systemConfiguration.datalakePath}"
                    f"/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
                )
                file_size, row_count = self._write_parsed_data(frame, dest_file_path)
            input_update_json = {
                "file_size": file_size,
                "input_id": self.process.inputId,
                "record_counts": {"Total": row_count}
            }
            self._end_process('P' if row_count > 0 else 'Z', input_update_json)

        except Exception as e:
            self._log_fail(e)
            self._end_process("F")
|
|
82
|
+
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Postgres utilities for data operations.
|
|
2
|
+
|
|
3
|
+
This module provides functions to execute SQL queries against a Postgres database
|
|
4
|
+
using Spark JDBC for reads and a direct write connection for write operations.
|
|
5
|
+
"""
|
|
6
|
+
from pyspark.dbutils import DBUtils
|
|
7
|
+
from pyspark.sql import SparkSession, DataFrame
|
|
8
|
+
from .postgres_connection import PostgresConnection
|
|
9
|
+
|
|
10
|
+
# Shared Spark session and Databricks utilities used by all functions in this module.
spark = SparkSession.builder.getOrCreate()
dbutils = DBUtils(spark)
# JDBC connection string for read-only metastore access, stored in the "sparky" secret scope.
pg_connection_string_read = dbutils.secrets.get("sparky", "pg_read")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def update(query: str):
    """Execute an update SQL query on the DataForge metastore Postgres database.

    Args:
        query (str): SQL query string to execute.

    Returns:
        None

    Raises:
        Exception: If write connection cannot be established or SQL execution fails.
    """
    # Obtain a write-capable connection and run the statement without fetching rows.
    _get_pg_write_connection().sql(query, fetch=False)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def execute(query: str):
    """Alias for update() to execute write SQL queries.

    Args:
        query (str): SQL query string to execute.

    Returns:
        None
    """
    # Delegate directly to update(); both names are kept for API convenience.
    return update(query)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def select(query: str) -> DataFrame:
    """Execute a SELECT SQL query on the DataForge metastore Postgres database and return a DataFrame with results.

    Args:
        query (str): SQL SELECT query string.

    Returns:
        DataFrame: Spark DataFrame containing query results.

    Raises:
        Exception: If Spark fails to load data or connection issues.
    """
    # Read through Spark's JDBC source using the read-only connection string.
    reader = spark.read.format("jdbc")
    reader = reader.options(url=pg_connection_string_read, query=query)
    return reader.load()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def pull(source_id: int):
    """Trigger new ingestion (pull data) on DataForge source for a given source ID.

    Args:
        source_id (int): Identifier for the source to pull.

    Returns:
        None

    Raises:
        Exception: If write connection cannot be established or SQL execution fails.
    """
    # The 'sdk' literal identifies this SDK as the caller to the metastore procedure.
    connection = _get_pg_write_connection()
    connection.sql("SELECT meta.svc_pull_source(%s, %s)", (source_id, 'sdk'), fetch=False)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _get_pg_write_connection() -> PostgresConnection:
    """Internal method to retrieve a PostgresConnection for write operations using secured secrets.

    Returns:
        PostgresConnection: Connection object for executing write queries.

    Raises:
        Exception: If the 'pg_write' secret is not defined in the 'sparky' scope.
    """
    # Guard clause: fail fast when the write secret is missing from the scope.
    available_keys = {secret.key for secret in dbutils.secrets.list("sparky")}
    if "pg_write" not in available_keys:
        raise Exception("pg_write secret is not defined in sparky scope")
    conn_string = dbutils.secrets.get("sparky", "pg_write")
    # Tag the connection so it is identifiable in pg_stat_activity.
    return PostgresConnection(conn_string + "&application_name=sdk-pg")
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
|
|
2
|
+
import json
|
|
3
|
+
from typing import Optional, Callable
|
|
4
|
+
from .process_record import ProcessRecord
|
|
5
|
+
from ._session import _Session
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PostOutputSession(_Session):

    """Session class to manage custom post-output process lifecycle.

    Initializes post-output process record and provides run() method to execute custom logic post-output completion,
    handle failures and update metadata.
    """
    def __init__(self, output_name: Optional[str] = None, output_source_name: Optional[str] = None, project_name: Optional[str] = None):
        """Initialize custom post-output session and start a new post-output process.

        Args:
            output_name (Optional[str]): Optional output name for interactive testing.
                Leave blank for production use.
            output_source_name (Optional[str]): Optional output source name for interactive testing.
            project_name (Optional[str]): Optional project name for interactive testing.
        """
        super().__init__()
        # initialize process
        # NOTE(review): output_source_name/project_name are only recorded when
        # output_name is also provided — confirm this is intentional.
        if output_name is not None:
            self.process_parameters["output_name"] = output_name
            self.process_parameters["output_source_name"] = output_source_name
            self.process_parameters["project_name"] = project_name

        # Register a new post-output process in the metastore and wrap the returned row.
        process = self._pg.sql("select sparky.sdk_new_post_output(%s)", (json.dumps(self.process_parameters),))
        self.process = ProcessRecord(process)

    def file_path(self):
        """Returns file path for the file generated by output process"""
        return self.process.parameters.get("file_path")

    def run(self, po_commands: Callable[[],None]):

        """Execute the user-supplied post-output commands and close the process.

        Runs the provided parameterless callable, then marks the process record as
        passed ('P'). On failure, logs the error and flags the process record as
        failed ('F').

        Args:
            po_commands (Callable[[], None]): The custom-defined, parameterless def containing post-output logic.
        Returns None
        """
        try:
            if not self._is_open:
                raise Exception("Session is closed")
            self.log(f"Running post output def {po_commands}")
            po_commands()
            self.log(f"Post output execution completed successfully")
            self._end_process("P")

        except Exception as e:
            self._log_fail(e)
            self._end_process("F")
|
|
60
|
+
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
import psycopg2
|
|
4
|
+
from .utils import _setup_logger
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PostgresConnection:
    """Thin wrapper around a psycopg2 autocommit connection.

    Provides single-value query execution with error logging; any connection
    or query failure logs the error and terminates the process via sys.exit(1)
    (the surrounding job runner treats a nonzero exit as process failure).
    """
    logger: logging.Logger

    def __init__(self, connection_string: str, logger: logging.Logger | None = None):
        """Open a connection and verify it with a test query.

        Args:
            connection_string (str): Postgres connection string; a leading
                'jdbc:' prefix is tolerated and stripped.
            logger (logging.Logger | None): Logger to use; a default "PG"
                logger is created when omitted.
        """
        # Resolve the logger before anything that can raise, so the except
        # block always has a usable logger.
        self.logger = logger if logger else _setup_logger("PG")
        try:
            self.connect(connection_string)
        except Exception as e:
            # Bug fix: original called logger.error on the parameter, which is
            # None by default and would raise AttributeError here.
            self.logger.error(f"Error connecting to Postgres: {e}")
            sys.exit(1)

    def sql(self, query: str, params=None, fetch=True):
        """Execute a query and return the first column of the first row.

        Args:
            query (str): SQL text with %s placeholders.
            params: Optional parameter tuple for the placeholders.
            fetch (bool): When False, execute without fetching and return None.

        Returns:
            The first column of the first result row, or None when fetch=False.
        """
        try:
            # Execute a query
            cur = self.conn.cursor()
            cur.execute(query, params)
            # Retrieve query results
            res = cur.fetchone() if fetch else [None]
            cur.close()
            return res[0]
        except Exception as e:
            self.logger.error(f"Error executing query {query}({params}) on Postgres: {e}")
            sys.exit(1)

    def connect(self, connection_string: str):
        """Establish an autocommit connection and run a probe query."""
        try:
            # psycopg2 does not understand JDBC-style URLs; drop the prefix.
            self.conn = psycopg2.connect(connection_string.removeprefix('jdbc:'))
            self.conn.set_session(autocommit=True)
            self.sql("select 1")  # execute test query
        except Exception as e:
            self.logger.error(f"Error connecting to Postgres database or insufficient permissions. Details: {e}")
            sys.exit(1)

    def close(self):
        """Close the underlying connection."""
        self.conn.close()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
class ProcessRecord:
    """Typed attribute view over a process row returned by the metastore.

    Wraps the raw process dict and exposes its fields as attributes; the
    original dict remains available via the `process` attribute.
    """
    processId: int
    logId: int
    operationType: str
    sourceId: int
    inputId: int
    process: dict
    parameters: dict
    forceCaseInsensitive: bool
    connectionId: int

    def __init__(self, process):
        """Populate attributes from the given process dict.

        Args:
            process (dict): Process row as returned by the metastore.
        """
        self.process = process
        self.parameters = process['parameters']
        self.processId = process['process_id']
        self.logId = process['log_id']
        self.operationType = process['operation_type']
        self.sourceId = process['source_id']
        self.inputId = process['input_id']
        # connection_id may be absent for some operation types; default to None.
        self.connectionId = process.get('connection_id')
        # Case-insensitivity flag defaults to False when not supplied.
        self.forceCaseInsensitive = self.parameters.get('force_case_insensitive', False)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from .postgres_connection import PostgresConnection
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SystemConfiguration:
    """Snapshot of DataForge system configuration loaded from the metastore."""
    datalakePath: str
    databricksDbName: str
    targetParquetFileSize: int
    architecture: str
    coreUri: str
    saas_flag: bool

    def __init__(self, pg: PostgresConnection):
        """Read the 'sparky' system configuration and populate attributes.

        Args:
            pg (PostgresConnection): Open metastore connection used for the read.
        """
        conf = pg.sql("SELECT meta.prc_get_system_configuration('sparky')")
        # Spark requires the s3a:// scheme instead of plain s3://.
        self.datalakePath = conf['data-lake-path'].replace("s3://", "s3a://")
        self.databricksDbName = conf['databricks-db-name']
        self.targetParquetFileSize = conf['target-parquet-file-size']
        self.architecture = conf['architecture']
        self.saas_flag = self.architecture == "saas"
        # SaaS deployments reach Core over HTTPS via the API URL; self-hosted
        # deployments call the ETL host directly on port 7131.
        if self.saas_flag:
            self.coreUri = f"https://{conf['api-url']}"
        else:
            self.coreUri = f"http://{conf['etl-url']}:7131"
|
|
22
|
+
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
def _setup_logger(name: str = "SDK"):
|
|
4
|
+
"""Set up and return a configured logger for the SDK.
|
|
5
|
+
|
|
6
|
+
Returns:
|
|
7
|
+
logging.Logger: Configured logger for SDK outputs.
|
|
8
|
+
"""
|
|
9
|
+
logger = logging.getLogger(name)
|
|
10
|
+
logger.setLevel(logging.INFO)
|
|
11
|
+
logger.propagate = False
|
|
12
|
+
|
|
13
|
+
# Create handler
|
|
14
|
+
console_handler = logging.StreamHandler()
|
|
15
|
+
console_handler.setLevel(logging.INFO)
|
|
16
|
+
|
|
17
|
+
# Create formatter with timestamp
|
|
18
|
+
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')
|
|
19
|
+
console_handler.setFormatter(formatter)
|
|
20
|
+
|
|
21
|
+
if not logger.hasHandlers():
|
|
22
|
+
# Add handler to the logger
|
|
23
|
+
logger.addHandler(console_handler)
|
|
24
|
+
|
|
25
|
+
return logger
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataforge-sdk
|
|
3
|
+
Version: 9.2.0rc3
|
|
4
|
+
Summary: SDK for creating DataForge extensions
|
|
5
|
+
Author-email: Vadim Orlov <vorlov@dataforgelabs.com>
|
|
6
|
+
Project-URL: Homepage, https://docs.dataforgelabs.com
|
|
7
|
+
Project-URL: Issues, https://docs.dataforgelabs.com/hc/en-us/requests/new
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# dataforge-sdk
|
|
12
|
+
SDK for creating DataForge extensions.
|
|
13
|
+
|
|
14
|
+
## Postgres Utilities
|
|
15
|
+
|
|
16
|
+
The `dataforge.pg` module provides helper functions to execute SQL operations against the DataForge Postgres metastore:
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
from dataforge.pg import select, update, pull
|
|
20
|
+
|
|
21
|
+
# Execute a SELECT query and return a Spark DataFrame
|
|
22
|
+
df = select("SELECT * FROM my_table")
|
|
23
|
+
|
|
24
|
+
# Execute an UPDATE/INSERT/DELETE query
|
|
25
|
+
update("UPDATE my_table SET col = 'value'")
|
|
26
|
+
|
|
27
|
+
# Trigger a new data pull for source_id 123
|
|
28
|
+
pull(123)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## IngestionSession
|
|
32
|
+
|
|
33
|
+
The `IngestionSession` class manages a custom data ingestion process lifecycle.
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from dataforge import IngestionSession
|
|
37
|
+
|
|
38
|
+
# Initialize a session (production use)
|
|
39
|
+
session = IngestionSession()
|
|
40
|
+
|
|
41
|
+
# Initialize a session (optional source_name/project_name for testing)
|
|
42
|
+
session = IngestionSession(source_name="my_source", project_name="my_project")
|
|
43
|
+
|
|
44
|
+
# Ingest data
|
|
45
|
+
# pass a function returning a DataFrame (recommended to integrate logging with DataForge)
|
|
46
|
+
session.ingest(lambda: spark.read.csv("s3://bucket/path/input.csv"))
|
|
47
|
+
|
|
48
|
+
# pass a DataFrame (can be used for testing, not recommended for production deployment)
|
|
49
|
+
df = spark.read.csv("s3://bucket/path/input.csv")
|
|
50
|
+
session.ingest(df)
|
|
51
|
+
|
|
52
|
+
# ingest empty dataframe to create 0-record input
|
|
53
|
+
session.ingest()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Fail the process with error message
|
|
57
|
+
session.fail("Error message")
|
|
58
|
+
|
|
59
|
+
# Retrieve latest tracking fields
|
|
60
|
+
tracking = session.latest_tracking_fields()
|
|
61
|
+
|
|
62
|
+
# Retrieve connection parameters for the current source
|
|
63
|
+
connection_parameters = session.connection_parameters()
|
|
64
|
+
|
|
65
|
+
# Retrieve custom parameters for the current source
|
|
66
|
+
custom_parameters = session.custom_parameters()
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## ParsingSession
|
|
71
|
+
|
|
72
|
+
The `ParsingSession` class manages a custom parse process lifecycle.
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from dataforge import ParsingSession
|
|
76
|
+
|
|
77
|
+
# Initialize a session (production use)
|
|
78
|
+
session = ParsingSession()
|
|
79
|
+
|
|
80
|
+
# Initialize a session (optional input_id for testing)
|
|
81
|
+
session = ParsingSession(input_id=123)
|
|
82
|
+
|
|
83
|
+
# Retrieve custom parameters
|
|
84
|
+
params = session.custom_parameters()
|
|
85
|
+
|
|
86
|
+
# Get the path of file to be parsed
|
|
87
|
+
path = session.file_path
|
|
88
|
+
|
|
89
|
+
# Run parsing: pass a DataFrame, a function returning a DataFrame or None (0-record file)
|
|
90
|
+
session.run(lambda: spark.read.json(session.file_path))
|
|
91
|
+
|
|
92
|
+
# Fail the process with error message
|
|
93
|
+
session.fail("Error message")
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## PostOutputSession
|
|
98
|
+
|
|
99
|
+
The `PostOutputSession` class manages a custom post-output process lifecycle.
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from dataforge import PostOutputSession
|
|
103
|
+
|
|
104
|
+
# Initialize a session (production use)
|
|
105
|
+
session = PostOutputSession()
|
|
106
|
+
|
|
107
|
+
# Initialize a session (optional names for testing)
|
|
108
|
+
session = PostOutputSession(output_name="report", output_source_name="my_source", project_name="my_project")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# Get the path of file generated by preceding output process
|
|
112
|
+
path = session.file_path()
|
|
113
|
+
|
|
114
|
+
# Retrieve connection parameters for the current output
|
|
115
|
+
connection_parameters = session.connection_parameters()
|
|
116
|
+
|
|
117
|
+
# Retrieve custom parameters for the current output
|
|
118
|
+
custom_parameters = session.custom_parameters()
|
|
119
|
+
|
|
120
|
+
# Run post-output logic: pass a function encapsulating custom code
|
|
121
|
+
session.run(lambda: print(f"Uploading file from {path}"))
|
|
122
|
+
|
|
123
|
+
# Fail the process with error message
|
|
124
|
+
session.fail("Error message")
|
|
125
|
+
```
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
setup.cfg
|
|
4
|
+
dataforge/__init__.py
|
|
5
|
+
dataforge/_session.py
|
|
6
|
+
dataforge/ingestion_session.py
|
|
7
|
+
dataforge/parsing_session.py
|
|
8
|
+
dataforge/pg.py
|
|
9
|
+
dataforge/post_output_session.py
|
|
10
|
+
dataforge/postgres_connection.py
|
|
11
|
+
dataforge/process_record.py
|
|
12
|
+
dataforge/system_configuration.py
|
|
13
|
+
dataforge/utils.py
|
|
14
|
+
dataforge_sdk.egg-info/PKG-INFO
|
|
15
|
+
dataforge_sdk.egg-info/SOURCES.txt
|
|
16
|
+
dataforge_sdk.egg-info/dependency_links.txt
|
|
17
|
+
dataforge_sdk.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dataforge
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = [
|
|
3
|
+
"setuptools>=61.0",
|
|
4
|
+
"wheel"
|
|
5
|
+
]
|
|
6
|
+
build-backend = "setuptools.build_meta"
|
|
7
|
+
[project]
|
|
8
|
+
name = "dataforge-sdk"
|
|
9
|
+
version = "9.2.0-rc.3"
|
|
10
|
+
authors = [
|
|
11
|
+
{name="Vadim Orlov", email="vorlov@dataforgelabs.com"}
|
|
12
|
+
]
|
|
13
|
+
description = "SDK for creating DataForge extensions"
|
|
14
|
+
readme = "README.md"
|
|
15
|
+
requires-python = ">=3.11"
|
|
16
|
+
dependencies = [
|
|
17
|
+
]
|
|
18
|
+
[project.urls]
|
|
19
|
+
Homepage = "https://docs.dataforgelabs.com"
|
|
20
|
+
Issues = "https://docs.dataforgelabs.com/hc/en-us/requests/new"
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[metadata]
|
|
2
|
+
name = dataforge-sdk
|
|
3
|
+
version = attr: dataforge.__version__
|
|
4
|
+
description = SDK for creating DataForge extensions
|
|
5
|
+
long_description = file: README.md
|
|
6
|
+
long_description_content_type = text/markdown
|
|
7
|
+
classifiers =
|
|
8
|
+
Programming Language :: Python :: 3
|
|
9
|
+
Programming Language :: Python :: 3.11
|
|
10
|
+
Operating System :: OS Independent
|
|
11
|
+
|
|
12
|
+
[options]
|
|
13
|
+
packages = find:
|
|
14
|
+
install_requires =
|
|
15
|
+
pyspark>=3.0
|
|
16
|
+
psycopg2-binary>=2.9
|
|
17
|
+
python_requires = >=3.11
|
|
18
|
+
|
|
19
|
+
[egg_info]
|
|
20
|
+
tag_build =
|
|
21
|
+
tag_date = 0
|
|
22
|
+
|