dataforge-sdk 9.2.0rc3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,125 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataforge-sdk
3
+ Version: 9.2.0rc3
4
+ Summary: SDK for creating DataForge extensions
5
+ Author-email: Vadim Orlov <vorlov@dataforgelabs.com>
6
+ Project-URL: Homepage, https://docs.dataforgelabs.com
7
+ Project-URL: Issues, https://docs.dataforgelabs.com/hc/en-us/requests/new
8
+ Requires-Python: >=3.11
9
+ Description-Content-Type: text/markdown
10
+
11
+ # dataforge-sdk
12
+ SDK for creating DataForge extensions.
13
+
14
+ ## Postgres Utilities
15
+
16
+ The `dataforge.pg` module provides helper functions to execute SQL operations against the DataForge Postgres metastore:
17
+
18
+ ```python
19
+ from dataforge.pg import select, update, pull
20
+
21
+ # Execute a SELECT query and return a Spark DataFrame
22
+ df = select("SELECT * FROM my_table")
23
+
24
+ # Execute an UPDATE/INSERT/DELETE query
25
+ update("UPDATE my_table SET col = 'value'")
26
+
27
+ # Trigger a new data pull for source_id 123
28
+ pull(123)
29
+ ```
30
+
31
+ ## IngestionSession
32
+
33
+ The `IngestionSession` class manages a custom data ingestion process lifecycle.
34
+
35
+ ```python
36
+ from dataforge import IngestionSession
37
+
38
+ # Initialize a session (production use)
39
+ session = IngestionSession()
40
+
41
+ # Initialize a session (optional source_name/project_name for testing)
42
+ session = IngestionSession(source_name="my_source", project_name="my_project")
43
+
44
+ # Ingest data
45
+ # pass a function returning a DataFrame (recommended to integrate logging with DataForge)
46
+ session.ingest(lambda: spark.read.csv("s3://bucket/path/input.csv"))
47
+
48
+ # pass a DataFrame (can be used for testing, not recommended for production deployment)
49
+ df = spark.read.csv("s3://bucket/path/input.csv")
50
+ session.ingest(df)
51
+
52
+ # ingest empty dataframe to create 0-record input
53
+ session.ingest()
54
+
55
+
56
+ # Fail the process with error message
57
+ session.fail("Error message")
58
+
59
+ # Retrieve latest tracking fields
60
+ tracking = session.latest_tracking_fields()
61
+
62
+ # Retrieve connection parameters for the current source
63
+ connection_parameters = session.connection_parameters()
64
+
65
+ # Retrieve custom parameters for the current source
66
+ custom_parameters = session.custom_parameters()
67
+
68
+ ```
69
+
70
+ ## ParsingSession
71
+
72
+ The `ParsingSession` class manages a custom parse process lifecycle.
73
+
74
+ ```python
75
+ from dataforge import ParsingSession
76
+
77
+ # Initialize a session (production use)
78
+ session = ParsingSession()
79
+
80
+ # Initialize a session (optional input_id for testing)
81
+ session = ParsingSession(input_id=123)
82
+
83
+ # Retrieve custom parameters
84
+ params = session.custom_parameters()
85
+
86
+ # Get the path of file to be parsed
87
+ path = session.file_path
88
+
89
+ # Run parsing: pass a DataFrame, a function returning a DataFrame or None (0-record file)
90
+ session.run(lambda: spark.read.json(session.file_path))
91
+
92
+ # Fail the process with error message
93
+ session.fail("Error message")
94
+
95
+ ```
96
+
97
+ ## PostOutputSession
98
+
99
+ The `PostOutputSession` class manages a custom post-output process lifecycle.
100
+
101
+ ```python
102
+ from dataforge import PostOutputSession
103
+
104
+ # Initialize a session (production use)
105
+ session = PostOutputSession()
106
+
107
+ # Initialize a session (optional names for testing)
108
+ session = PostOutputSession(output_name="report", output_source_name="my_source", project_name="my_project")
109
+
110
+
111
+ # Get the path of file generated by preceding output process
112
+ path = session.file_path()
113
+
114
+ # Retrieve connection parameters for the current output
115
+ connection_parameters = session.connection_parameters()
116
+
117
+ # Retrieve custom parameters for the current output
118
+ custom_parameters = session.custom_parameters()
119
+
120
+ # Run post-output logic: pass a function encapsulating custom code
121
+ session.run(lambda: print(f"Uploading file from {path}"))
122
+
123
+ # Fail the process with error message
124
+ session.fail("Error message")
125
+ ```
@@ -0,0 +1,115 @@
1
+ # dataforge-sdk
2
+ SDK for creating DataForge extensions.
3
+
4
+ ## Postgres Utilities
5
+
6
+ The `dataforge.pg` module provides helper functions to execute SQL operations against the DataForge Postgres metastore:
7
+
8
+ ```python
9
+ from dataforge.pg import select, update, pull
10
+
11
+ # Execute a SELECT query and return a Spark DataFrame
12
+ df = select("SELECT * FROM my_table")
13
+
14
+ # Execute an UPDATE/INSERT/DELETE query
15
+ update("UPDATE my_table SET col = 'value'")
16
+
17
+ # Trigger a new data pull for source_id 123
18
+ pull(123)
19
+ ```
20
+
21
+ ## IngestionSession
22
+
23
+ The `IngestionSession` class manages a custom data ingestion process lifecycle.
24
+
25
+ ```python
26
+ from dataforge import IngestionSession
27
+
28
+ # Initialize a session (production use)
29
+ session = IngestionSession()
30
+
31
+ # Initialize a session (optional source_name/project_name for testing)
32
+ session = IngestionSession(source_name="my_source", project_name="my_project")
33
+
34
+ # Ingest data
35
+ # pass a function returning a DataFrame (recommended to integrate logging with DataForge)
36
+ session.ingest(lambda: spark.read.csv("s3://bucket/path/input.csv"))
37
+
38
+ # pass a DataFrame (can be used for testing, not recommended for production deployment)
39
+ df = spark.read.csv("s3://bucket/path/input.csv")
40
+ session.ingest(df)
41
+
42
+ # ingest empty dataframe to create 0-record input
43
+ session.ingest()
44
+
45
+
46
+ # Fail the process with error message
47
+ session.fail("Error message")
48
+
49
+ # Retrieve latest tracking fields
50
+ tracking = session.latest_tracking_fields()
51
+
52
+ # Retrieve connection parameters for the current source
53
+ connection_parameters = session.connection_parameters()
54
+
55
+ # Retrieve custom parameters for the current source
56
+ custom_parameters = session.custom_parameters()
57
+
58
+ ```
59
+
60
+ ## ParsingSession
61
+
62
+ The `ParsingSession` class manages a custom parse process lifecycle.
63
+
64
+ ```python
65
+ from dataforge import ParsingSession
66
+
67
+ # Initialize a session (production use)
68
+ session = ParsingSession()
69
+
70
+ # Initialize a session (optional input_id for testing)
71
+ session = ParsingSession(input_id=123)
72
+
73
+ # Retrieve custom parameters
74
+ params = session.custom_parameters()
75
+
76
+ # Get the path of file to be parsed
77
+ path = session.file_path
78
+
79
+ # Run parsing: pass a DataFrame, a function returning a DataFrame or None (0-record file)
80
+ session.run(lambda: spark.read.json(session.file_path))
81
+
82
+ # Fail the process with error message
83
+ session.fail("Error message")
84
+
85
+ ```
86
+
87
+ ## PostOutputSession
88
+
89
+ The `PostOutputSession` class manages a custom post-output process lifecycle.
90
+
91
+ ```python
92
+ from dataforge import PostOutputSession
93
+
94
+ # Initialize a session (production use)
95
+ session = PostOutputSession()
96
+
97
+ # Initialize a session (optional names for testing)
98
+ session = PostOutputSession(output_name="report", output_source_name="my_source", project_name="my_project")
99
+
100
+
101
+ # Get the path of file generated by preceding output process
102
+ path = session.file_path()
103
+
104
+ # Retrieve connection parameters for the current output
105
+ connection_parameters = session.connection_parameters()
106
+
107
+ # Retrieve custom parameters for the current output
108
+ custom_parameters = session.custom_parameters()
109
+
110
+ # Run post-output logic: pass a function encapsulating custom code
111
+ session.run(lambda: print(f"Uploading file from {path}"))
112
+
113
+ # Fail the process with error message
114
+ session.fail("Error message")
115
+ ```
@@ -0,0 +1,6 @@
1
"""Package entry point: re-export the public DataForge SDK session classes."""
from .post_output_session import PostOutputSession
from .ingestion_session import IngestionSession
from .parsing_session import ParsingSession

__version__ = "9.2.0-rc.3"
__all__ = ["PostOutputSession", "IngestionSession", "ParsingSession"]
@@ -0,0 +1,302 @@
1
+ """Session management for DataForge SDK.
2
+
3
+ This module provides the Session class for managing DataForge SDK sessions,
4
+
5
+ Classes:
6
+ CoreAPIError: Raised when a Core API call fails.
7
+ VersionMismatchError: Raised when the SDK version does not match server expectation.
8
+ Session: Main class for session management.
9
+ """
10
+ import json
11
+ import re
12
+ import traceback
13
+ from typing import Optional, Literal
14
+ from pyspark.sql import SparkSession, DataFrame
15
+ import requests
16
+ import logging
17
+ from pyspark.dbutils import DBUtils
18
+ from pyspark.sql.functions import monotonically_increasing_id, lit
19
+ from pyspark.sql.types import LongType
20
+
21
+ from dataforge.process_record import ProcessRecord
22
+ from dataforge.system_configuration import SystemConfiguration
23
+ from dataforge.postgres_connection import PostgresConnection
24
+ from importlib.metadata import version
25
+
26
+ from .utils import _setup_logger
27
+
28
+
29
class CoreAPIError(Exception):
    """Raised when a call to the DataForge Core API returns a non-200 status."""
32
+
33
class VersionMismatchError(Exception):
    """Raised when the SDK version does not match what the server expects."""
36
+
37
class _Session:
    """Base session class for DataForge SDK operations.

    Class should not be instantiated by user directly: use process-specific
    Session classes instead.
    Manages Spark session, DBUtils, system configuration, Core API calls,
    logging, and process lifecycle.

    Attributes:
        spark (SparkSession): Spark session for data processing.
        dbutils (DBUtils): Databricks DBUtils instance.
        logger (logging.Logger): Logger instance for session logs.
        _systemConfiguration (SystemConfiguration): System configuration loaded from Postgres.
        _pg (PostgresConnection): Postgres connection to core API database.
        process (ProcessRecord): The current process record.
        version (str): DataForge SDK version.
        process_parameters (dict): Parameters sent when starting a process.
        _is_open (bool): True while the session holds an open Postgres connection.
    """
    _systemConfiguration: SystemConfiguration
    _pg: PostgresConnection
    spark: SparkSession
    dbutils: DBUtils
    logger: logging.Logger
    process: ProcessRecord
    # Resolved once, at class-definition time, from the installed distribution metadata.
    version = version("dataforge-sdk")
    process_parameters: dict
    _is_open: bool = False
61
+
62
+ def __init__(self):
63
+ self.logger = _setup_logger(self.__class__.__name__)
64
+ self.spark = SparkSession.builder.getOrCreate()
65
+ self.dbutils = self._get_dbutils()
66
+ pg_connection_string_read = self.dbutils.secrets.get("sparky", "pg_read")
67
+ pg_read = PostgresConnection(f"{pg_connection_string_read}&application_name=sdk", self.logger)
68
+ self._systemConfiguration = SystemConfiguration(pg_read)
69
+ pg_connection_string: str = self._core_api_call("core/sparky-db-connection")["connection"]
70
+ self._pg = PostgresConnection(f"{pg_connection_string}&application_name=sdk", self.logger)
71
+ self._is_open = True
72
+ self._check_version()
73
+ self.process_parameters = {
74
+ "version": self.version,
75
+ "packageName": "dataforge-sdk"
76
+ }
77
+ try:
78
+ self.process_parameters['process_id'] = int(self.dbutils.widgets.get("process_id"))
79
+ except Exception as e:
80
+ pass
81
+ self.logger.info(f"Initialized base session for {self.__class__.__name__}")
82
+
83
+
84
+ def _get_dbutils(self):
85
+ return DBUtils(self.spark)
86
+
87
+ def _core_api_call(self, route: str):
88
+ """Make a GET request to the Core API.
89
+
90
+ Args:
91
+ route (str): API route to call, appended to core URI.
92
+
93
+ Returns:
94
+ dict: Parsed JSON response from Core API.
95
+
96
+ Raises:
97
+ CoreAPIError: If response status is not 200.
98
+ """
99
+ add_core = "core/" if route.startswith("process-complete") and self._systemConfiguration.saas_flag else ""
100
+ end_point = f"{self._systemConfiguration.coreUri}/{add_core}{route}"
101
+ core_jwt_token = self.dbutils.secrets.get("sparky", "coreJWT")
102
+ headers = {
103
+ "Authorization": f"Bearer {core_jwt_token}",
104
+ "Content-Type": "application/json",
105
+ "Accept": "application/json"
106
+ }
107
+ self.logger.debug(f"Executing core API call to {end_point}")
108
+ response = requests.get(end_point, headers=headers)
109
+ if response.status_code == 200:
110
+ data = response.json() if response.content else None
111
+ return data
112
+ else:
113
+ raise CoreAPIError(f"Core API call to {end_point} failed with status code {response.status_code}")
114
+
115
+ def _check_version(self):
116
+ """Checks that the SDK version is compatible with the server.
117
+
118
+ Calls the database function to verify the SDK version. Raises
119
+ VersionMismatchError if there is a version mismatch.
120
+ """
121
+ self.logger.info(f"Executing package dataforge-sdk version {self.version}")
122
+ check_res = self._pg.sql("select sparky.sdk_check_version(%s,%s)", [self.version, "dataforge-sdk"])
123
+ if check_res.get("error"):
124
+ raise VersionMismatchError(check_res.get("error"))
125
+
126
+ def log(self, message:str, severity: str = "I"):
127
+ """Log a message to both the local logger and the process log in the database.
128
+
129
+ Args:
130
+ message (str): The log message.
131
+ severity (str): Severity level, "I" for info, "E" for error. Defaults to "I".
132
+ """
133
+ match severity:
134
+ case "I":
135
+ self.logger.info(message)
136
+ case "E":
137
+ self.logger.error(message)
138
+ payload = {"log_id": self.process.logId, "operation_type": self.process.operationType,
139
+ "severity": severity, "message": message}
140
+ self._pg.sql("SELECT sparky.write_log(%s)", [json.dumps(payload)], fetch=False)
141
+
142
+ def connection_parameters(self):
143
+ """Retrieve connection parameters for the current process.
144
+
145
+ Returns:
146
+ dict: Dictionary containing private and public connection parameters.
147
+ """
148
+ connection_id = self.process.connectionId
149
+ self.log(f"Reading connection parameters of connection id {connection_id}")
150
+ params = self._core_api_call(f"core/connection/{connection_id}").get("parameters")
151
+ return {"private_connection_parameters": params.get("private_connection_parameters"),
152
+ "public_connection_parameters": params.get("public_connection_parameters")}
153
+
154
+
155
+ def _end_process(self, status_code: Literal['P', 'F', 'Z'] = 'P', parameters: Optional[dict] = None):
156
+ """Private method. End the current process
157
+ """
158
+ payload = parameters if parameters else {}
159
+ payload["process_id"] = self.process.processId
160
+ payload["status_code"] = status_code
161
+ self._pg.sql("select meta.prc_process_end(%s)", [json.dumps(payload)], fetch=False)
162
+ self._core_api_call(f"process-complete/{self.process.processId}")
163
+ if status_code in ('P','Z'):
164
+ self.logger.info("Session completed successfully")
165
+ else:
166
+ self.logger.error("Process failed")
167
+ self.close()
168
+
169
+ def fail(self,message: str):
170
+ """Fail current session and log the message.
171
+
172
+ Args:
173
+ message (str): The log message.
174
+ """
175
+ self.log(f"Process failed with error: {message}", "E")
176
+ self._end_process("F")
177
+
178
+ def _log_fail(self, e: Exception):
179
+ """Log failure for the given exception.
180
+
181
+ Args:
182
+ e (Exception): The exception that caused the failure.
183
+ """
184
+ traceback.print_exception(e)
185
+ self.log(f"Process failed with error: {e}", "E")
186
+
187
    def close(self):
        """Close the session: release the Postgres connection and mark the session closed."""
        self._pg.close()
        self.logger.info("Session closed")
        # Subsequent ingest()/run() calls check this flag and refuse to proceed.
        self._is_open = False
192
+
193
+ def custom_parameters(self):
194
+ """Retrieve custom parameters from the process.
195
+
196
+ Returns:
197
+ dict: The custom parameters dictionary from the process.
198
+ """
199
+ return self.process.parameters.get('custom_parameters')
200
+
201
    def _write_parsed_data(self, in_df: DataFrame, dest_file_path: str) -> tuple[int, int]:
        """Process input DataFrame, write to Parquet, and update metadata.

        Args:
            in_df (DataFrame): Input Spark DataFrame to process and write.
            dest_file_path (str): Destination path for saving Parquet file.

        Returns:
            tuple[int, int]: Total file size in bytes and the number of records written.

        Raises:
            Exception: If duplicate columns are detected or metadata update fails.
        """
        self.log("Data read successfully. Checking schema.")

        # Column selection expressions are defined server-side per source.
        select_list = self._pg.sql("SELECT sparky.get_select_list(%s)", (self.process.sourceId,))
        df_sel = in_df.selectExpr(*select_list)
        self.log(f"Applied select list {select_list}")

        # Duplicate column check
        cols = df_sel.columns
        dup_columns = [col for col in set(cols) if cols.count(col) > 1]
        if dup_columns:
            raise Exception(f"Duplicate columns detected: {', '.join(dup_columns)}")

        # Cast binary/void columns to string so they survive the Parquet round trip.
        binary_casts = [
            f"CAST(`{f.name}` AS STRING) `{f.name}`" if f.dataType.typeName() in ("binary", "void")
            else f"`{f.name}`"
            for f in df_sel.schema.fields
        ]
        df = df_sel.selectExpr(*binary_casts)

        # Build the raw-attribute schema as a JSON array for the metadata update.
        schema = []
        for f in df.schema.fields:
            field_name = f.name.lower() if self.process.forceCaseInsensitive else f.name
            # Collapse any run of non-word characters into a single underscore.
            name_normalized = re.sub(r'\W+', '_', field_name)
            column_normalized = ("_" if field_name[0].isdigit() else "") + name_normalized  # add leading underscore

            # Complex/parameterized types are reported by family name; scalars verbatim.
            if f.dataType.simpleString().startswith("struct"):
                spark_type = "StructType"
            elif f.dataType.simpleString().startswith("array"):
                spark_type = "ArrayType"
            elif f.dataType.simpleString().startswith("decimal"):
                spark_type = "DecimalType"
            else:
                spark_type = f.dataType.simpleString()

            attr_schema = json.loads(f.dataType.json())
            self.logger.info(f"Column `{column_normalized}` schema: {attr_schema}")
            schema.append({
                "name": field_name,
                "column_normalized": column_normalized,
                "spark_type": spark_type,
                "schema": attr_schema
            })

        self.log("Schema read successfully. Updating source raw metadata.")

        metadata_update_json = {
            "source_id": self.process.sourceId,
            "input_id": self.process.inputId,
            "raw_attributes": schema,
            "ingestion_type": "sparky"
        }

        result = self._pg.sql("SELECT meta.prc_n_normalize_raw_attribute(%s)", (json.dumps(metadata_update_json),))
        if "error" in result:
            raise Exception(result["error"])

        normalize_attributes = result["normalized_metadata"]

        self.log("Source metadata updated. Renaming and upcasting attributes")

        # Build SQL expressions that apply server-side upcasts and column aliases.
        cast_rename_expr = []
        for att in normalize_attributes:
            base_expr = att.get("upcastExpr") or att["raw_attribute_name"]
            if att["raw_attribute_name"] != att["column_alias"] or self.process.forceCaseInsensitive:
                base_expr += f" AS {att['column_alias']}"
            cast_rename_expr.append(base_expr)

        self.logger.info("Normalized SQL: " + ", ".join(cast_rename_expr))
        df_update = df.selectExpr(*cast_rename_expr)

        # s_row_id is always present; it is either a generated id or a typed null.
        if self.process.parameters.get("generate_row_id", False):
            self.logger.info("Added s_row_id to data.")
            df_final = df_update.withColumn("s_row_id", monotonically_increasing_id())
        else:
            self.logger.info("generate_row_id = false, added null s_row_id.")
            df_final = df_update.withColumn("s_row_id", lit(None).cast(LongType()))

        self.log("Writing file")
        df_final.write.format("parquet").mode("overwrite").save(dest_file_path)
        self.log(f"Wrote file {dest_file_path}")

        # Re-read the written file so the count reflects what actually landed on disk.
        row_count = self.spark.read.format("parquet").load(dest_file_path).count()
        self.log(f"{row_count} records counted")

        file_size = sum(f.size for f in self.dbutils.fs.ls(dest_file_path))
        return file_size, row_count
302
+
@@ -0,0 +1,119 @@
1
+ """IngestionSession management for DataForge SDK.
2
+
3
+ This module provides the IngestionSession class
4
+ that manages custom data ingestion process, including initialization, data writing,
5
+ metadata updates, and failure handling.
6
+
7
+ Classes:
8
+ IngestionSession: Manages the custom ingestion process lifecycle.
9
+ """
10
+ import json
11
+ from datetime import datetime
12
+ from typing import Optional, Callable
13
+ from pyspark.sql import DataFrame
14
+ from .process_record import ProcessRecord
15
+ from ._session import _Session
16
+
17
+
18
class IngestionSession(_Session):

    """Session class to manage custom ingestion process lifecycle.

    Initializes an ingestion process record and provides methods to ingest
    data frames and handle failures, updating metadata and notifying the Core API.
    """
    def __init__(self, source_name: Optional[str] = None, project_name: Optional[str] = None):
        """Initialize ingestion session and start a new ingestion process.

        Args:
            source_name (Optional[str]): Optional name of the data source being ingested.
                Used for interactive testing. Leave blank for production use.
            project_name (Optional[str]): Optional project context name. Used for interactive testing.
                Leave blank for production use.
        """
        super().__init__()
        # initialize process
        self.process_parameters["start_process_flag"] = True
        if source_name is not None:
            self.process_parameters["source_name"] = source_name
        if project_name is not None:
            self.process_parameters["project_name"] = project_name

        # Server-side function creates the process/input records and returns the process row.
        process = self._pg.sql("select sparky.sdk_new_ingestion(%s)", (json.dumps(self.process_parameters),))
        self.process = ProcessRecord(process)
44
+
45
+
46
+ def latest_tracking_fields(self):
47
+ """Get the latest tracking fields from the ingestion process record.
48
+
49
+ Returns:
50
+ dict: The latest tracking data from the process record, or None if not available.
51
+ """
52
+ return self.process.process.get("tracking")
53
+
54
    def ingest(self, df: DataFrame | Callable[[], DataFrame] | None = None):
        """Ingest the provided DataFrame into the DataForge and update input record.

        Writes the DataFrame to raw Parquet file,
        updates the input record with status, file size, record count, and notifies
        the Core API of process completion. On failure, updates logs and flags the input and process
        records as failed.

        Args:
            df (Callable[[], DataFrame] | DataFrame | None): parameterless def that you defined,
                returning the Spark DataFrame to ingest (recommended), or a Spark DataFrame.
                Pass nothing/None to register a 0-record input.
        """
        try:
            if not self._is_open:
                raise Exception("Session is closed")
            if df is None:
                # No data supplied: record a zero-record ("Z") input.
                status = "Z"
                row_count = 0
                file_size = 0
            else:
                if callable(df):
                    result_df = df()  # call it to get the DataFrame
                else:
                    result_df = df
                dest_file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
                file_size, row_count = self._write_parsed_data(result_df, dest_file_path)
                status = "P" if row_count > 0 else "Z"
            input_update_json = {
                "ingestion_status_code": status,
                "extract_datetime": datetime.now().isoformat(),
                "file_size": file_size,
                "process_id": self.process.processId,
                "input_id": self.process.inputId,
                "record_counts": {"Total": row_count}
            }

            self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
                         (json.dumps(input_update_json),), fetch=False)
            self.logger.info("Ingestion completed successfully")

        except Exception as e:
            self._log_fail(e)
            failure_update_json = {
                "process_id": self.process.processId,
                "ingestion_status_code": "F"
            }
            self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
                         (json.dumps(failure_update_json),), fetch=False)
        finally:
            # Always notify the Core API and release the Postgres connection,
            # on both the success and the failure path.
            self._core_api_call(f"process-complete/{self.process.processId}")
            self.close()
105
+
106
+
107
    def fail(self, message: str):
        """Mark the ingestion as failed with a custom message.

        Logs the provided error message, updates the input and process record to failure status,
        and notifies the Core API of process completion.

        Args:
            message (str): Custom error message explaining the failure.
        """
        self.log(f"Custom Ingestion failed with error: {message}", "E")
        self._pg.sql("select meta.prc_iw_in_update_input_record(%s)",
                     [json.dumps({"process_id": self.process.processId, "ingestion_status_code" : "F"})], fetch=False)
        # NOTE(review): unlike _Session.fail/_end_process, this override does not call
        # meta.prc_process_end or self.close() — confirm whether the input-record update
        # finalizes the process server-side and whether the Postgres connection should
        # be released here.
        self._core_api_call(f"process-complete/{self.process.processId}")
@@ -0,0 +1,82 @@
1
+
2
+ import json
3
+ from typing import Optional, Callable
4
+ from pyspark.sql import DataFrame
5
+ from .process_record import ProcessRecord
6
+ from ._session import _Session
7
+
8
+
9
class ParsingSession(_Session):

    """Session class to manage custom parse process lifecycle.

    Initializes parse process record and provides methods to parse
    data frames and handle failures, updating metadata and notifying the Core API.
    """
    # Result of meta.prc_n_start_parse; holds the source file name and custom parameters.
    _parsing_parameters: dict
    def __init__(self, input_id: Optional[int] = None):
        """Initialize custom parse session and start a new custom parse process.

        Args:
            input_id (Optional[int]): Optional input_id of the batch for interactive testing.
                Leave blank for production use.
        """
        super().__init__()
        # initialize process
        if input_id is not None:
            self.process_parameters["input_id"] = input_id

        process = self._pg.sql("select sparky.sdk_new_parse(%s)", (json.dumps(self.process_parameters),))
        self.process = ProcessRecord(process)
        self._parsing_parameters = self._pg.sql("select meta.prc_n_start_parse(%s)",
                                                (json.dumps({"input_id": self.process.inputId}),))
        # Extract the file extension
        file_name = self._parsing_parameters["source_file_name"]
        file_extension = file_name.split(".")[-1]
        # Construct the path of the raw file produced by the preceding ingestion step
        self.file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/raw/raw_input_{self.process.inputId}.{file_extension}"
38
+
39
+ def custom_parameters(self):
40
+ """Retrieve custom parsing parameters.
41
+
42
+ Returns:
43
+ dict: The custom parsing parameters dictionary.
44
+ """
45
+ return self._parsing_parameters.get('custom_parameters')
46
+
47
    def run(self, df: DataFrame | Callable[[], DataFrame] | None = None):
        """Save parsed file from the provided DataFrame, and upload it into the DataForge data lake.

        Writes the DataFrame to parsed Parquet file,
        updates the input record with status, file size, record count, and notifies
        the Core API of process completion. On failure, updates logs and flags the input and process
        records as failed.

        Args:
            df (DataFrame | Callable[[], DataFrame] | None): parameterless def that you defined,
                returning the Spark DataFrame containing parsed file data (recommended),
                a Spark DataFrame, or None to record a 0-record file.
        """
        try:
            if not self._is_open:
                raise Exception("Session is closed")
            if callable(df):
                result_df = df()  # call it to get the DataFrame
            else:
                result_df = df

            if result_df is None or result_df.isEmpty():
                # Nothing to write: report a zero-byte, zero-record parse.
                file_size, row_count = (0, 0)
            else:
                dest_file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
                file_size, row_count = self._write_parsed_data(result_df, dest_file_path)
            input_update_json = {
                "file_size": file_size,
                "input_id": self.process.inputId,
                "record_counts": {"Total": row_count}
            }
            # 'Z' marks a zero-record parse; 'P' a successful parse with data.
            self._end_process('P' if row_count > 0 else 'Z', input_update_json)

        except Exception as e:
            self._log_fail(e)
            self._end_process("F")
82
+
@@ -0,0 +1,91 @@
1
"""Postgres utilities for data operations.

This module provides functions to execute SQL queries against a Postgres database
using Spark JDBC for reads and a direct write connection for write operations.
"""
from pyspark.dbutils import DBUtils
from pyspark.sql import SparkSession, DataFrame
from .postgres_connection import PostgresConnection

# NOTE: importing this module has side effects — it creates (or attaches to) a
# Spark session and reads the read-only connection string from the "sparky"
# Databricks secret scope.
spark = SparkSession.builder.getOrCreate()
dbutils = DBUtils(spark)
pg_connection_string_read = dbutils.secrets.get("sparky", "pg_read")
13
+
14
+
15
def update(query: str):
    """Execute an update SQL query on the DataForge metastore Postgres database.

    Args:
        query (str): SQL query string to execute.

    Returns:
        None

    Raises:
        Exception: If write connection cannot be established or SQL execution fails.
    """
    _get_pg_write_connection().sql(query, fetch=False)
29
+
30
+
31
def execute(query: str):
    """Alias for update() to execute write SQL queries.

    Args:
        query (str): SQL query string to execute.

    Returns:
        None
    """
    update(query)
41
+
42
+
43
def select(query: str) -> DataFrame:
    """Execute a SELECT SQL query on the DataForge metastore Postgres database and return a DataFrame with results.

    Args:
        query (str): SQL SELECT query string.

    Returns:
        DataFrame: Spark DataFrame containing query results.

    Raises:
        Exception: If Spark fails to load data or connection issues.
    """
    reader = (
        spark.read.format("jdbc")
        .option("url", pg_connection_string_read)
        .option("query", query)
    )
    return reader.load()
59
+
60
+
61
def pull(source_id: int):
    """Trigger new ingestion (pull data) on DataForge source for a given source ID.

    Args:
        source_id (int): Identifier for the source to pull.

    Returns:
        None

    Raises:
        Exception: If write connection cannot be established or SQL execution fails.
    """
    connection = _get_pg_write_connection()
    connection.sql("SELECT meta.svc_pull_source(%s, %s)", (source_id, 'sdk'), fetch=False)
75
+
76
+
77
def _get_pg_write_connection() -> PostgresConnection:
    """Internal helper to retrieve a PostgresConnection for write operations using secured secrets.

    Returns:
        PostgresConnection: Connection object for executing write queries.

    Raises:
        Exception: If the 'pg_write' secret is not defined in the 'sparky' scope.
    """
    secret_keys = {secret.key for secret in dbutils.secrets.list("sparky")}
    if "pg_write" not in secret_keys:
        raise Exception("pg_write secret is not defined in sparky scope")
    conn_string = dbutils.secrets.get("sparky", "pg_write")
    return PostgresConnection(conn_string + "&application_name=sdk-pg")
@@ -0,0 +1,60 @@
1
+
2
+ import json
3
+ from typing import Optional, Callable
4
+ from .process_record import ProcessRecord
5
+ from ._session import _Session
6
+
7
+
8
class PostOutputSession(_Session):
    """Session class to manage a custom post-output process lifecycle.

    Registers a new post-output process record on construction and provides
    run() to execute custom logic after output completion, handle failures,
    and update process metadata.
    """
    def __init__(self, output_name: Optional[str] = None, output_source_name: Optional[str] = None, project_name: Optional[str] = None):
        """Initialize a custom post-output session and start a new post-output process.

        Args:
            output_name (Optional[str]): Optional output name for interactive testing.
                Leave blank for production use.
            output_source_name (Optional[str]): Optional output source name for interactive testing.
            project_name (Optional[str]): Optional project name for interactive testing.
        """
        super().__init__()
        # initialize process
        # NOTE(review): output_source_name/project_name are only forwarded when
        # output_name is provided — confirm they are meaningless on their own.
        if output_name is not None:
            self.process_parameters["output_name"] = output_name
            self.process_parameters["output_source_name"] = output_source_name
            self.process_parameters["project_name"] = project_name

        process = self._pg.sql("select sparky.sdk_new_post_output(%s)", (json.dumps(self.process_parameters),))
        self.process = ProcessRecord(process)

    def file_path(self):
        """Return the file path for the file generated by the preceding output process."""
        return self.process.parameters.get("file_path")

    def run(self, po_commands: Callable[[],None]):
        """Execute custom post-output logic and finalize the process record.

        Runs the supplied callable, logging progress, and marks the process
        as passed ("P") on success. On failure, logs the error and marks the
        process as failed ("F").

        Args:
            po_commands (Callable[[], None]): The custom-defined, parameterless
                function containing post-output logic.

        Returns:
            None
        """
        try:
            if not self._is_open:
                raise Exception("Session is closed")
            self.log(f"Running post output def {po_commands}")
            po_commands()
            # f-string prefix removed: message has no placeholders
            self.log("Post output execution completed successfully")
            self._end_process("P")

        except Exception as e:
            self._log_fail(e)
            self._end_process("F")
60
+
@@ -0,0 +1,49 @@
1
+ import logging
2
+ import sys
3
+ import psycopg2
4
+ from .utils import _setup_logger
5
+
6
+
7
+ class PostgresConnection:
8
+ logger: logging.Logger
9
+ def __init__(self, connection_string: str, logger: logging.Logger | None = None):
10
+ try:
11
+ if logger:
12
+ self.logger = logger
13
+ else:
14
+ self.logger = _setup_logger("PG")
15
+ self.connect(connection_string)
16
+
17
+ except Exception as e:
18
+ logger.error(f"Error connecting to Postgres: {e}")
19
+ sys.exit(1)
20
+
21
+ def sql(self, query: str, params=None, fetch=True):
22
+ try:
23
+ # Execute a query
24
+ cur = self.conn.cursor()
25
+ cur.execute(query, params)
26
+ # Retrieve query results
27
+ res = cur.fetchone() if fetch else [None]
28
+ cur.close()
29
+ return res[0]
30
+ except Exception as e:
31
+ self.logger.error(f"Error executing query {query}({params}) on Postgres: {e}")
32
+ sys.exit(1)
33
+
34
+ def connect(self, connection_string: str):
35
+ # Execute a query
36
+ try:
37
+ self.conn = psycopg2.connect(connection_string.removeprefix('jdbc:'))
38
+ self.conn.set_session(autocommit=True)
39
+ self.sql("select 1") # execute test query
40
+ # Change connection
41
+ except Exception as e:
42
+ self.logger.error(f"Error connecting to Postgres database or insufficient permissions. Details: {e}")
43
+ sys.exit(1)
44
+
45
+ def close(self ):
46
+ self.conn.close()
47
+
48
+
49
+
@@ -0,0 +1,21 @@
1
class ProcessRecord:
    """Typed view over a raw process-metadata dict returned by the metastore."""

    processId: int
    logId: int
    operationType: str
    sourceId: int
    inputId: int
    process: dict
    parameters: dict
    forceCaseInsensitive: bool
    connectionId: int

    def __init__(self, process):
        """Unpack the raw process dict into typed attributes.

        Args:
            process (dict): Raw process record from the metastore.
        """
        self.process = process
        self.parameters = process['parameters']
        self.processId = process['process_id']
        self.logId = process['log_id']
        self.operationType = process['operation_type']
        self.sourceId = process['source_id']
        self.inputId = process['input_id']
        # connection_id may be absent from the raw record
        self.connectionId = process.get('connection_id')
        self.forceCaseInsensitive = self.parameters.get('force_case_insensitive', False)
@@ -0,0 +1,22 @@
1
+ from .postgres_connection import PostgresConnection
2
+
3
+
4
class SystemConfiguration:
    """System-wide DataForge configuration loaded from the metastore."""

    datalakePath: str
    databricksDbName: str
    targetParquetFileSize: int
    architecture: str
    coreUri: str
    saas_flag: bool

    def __init__(self, pg: PostgresConnection):
        """Load system configuration from the metastore 'sparky' profile.

        Args:
            pg (PostgresConnection): Open metastore connection.
        """
        conf = pg.sql("SELECT meta.prc_get_system_configuration('sparky')")
        # Spark needs the s3a:// scheme rather than plain s3://
        self.datalakePath = conf['data-lake-path'].replace("s3://", "s3a://")
        self.databricksDbName = conf['databricks-db-name']
        self.targetParquetFileSize = conf['target-parquet-file-size']
        self.architecture = conf['architecture']
        self.saas_flag = self.architecture == "saas"
        if self.saas_flag:
            self.coreUri = f"https://{conf['api-url']}"
        else:
            self.coreUri = f"http://{conf['etl-url']}:7131"
22
+
@@ -0,0 +1,25 @@
1
+ import logging
2
+
3
+ def _setup_logger(name: str = "SDK"):
4
+ """Set up and return a configured logger for the SDK.
5
+
6
+ Returns:
7
+ logging.Logger: Configured logger for SDK outputs.
8
+ """
9
+ logger = logging.getLogger(name)
10
+ logger.setLevel(logging.INFO)
11
+ logger.propagate = False
12
+
13
+ # Create handler
14
+ console_handler = logging.StreamHandler()
15
+ console_handler.setLevel(logging.INFO)
16
+
17
+ # Create formatter with timestamp
18
+ formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')
19
+ console_handler.setFormatter(formatter)
20
+
21
+ if not logger.hasHandlers():
22
+ # Add handler to the logger
23
+ logger.addHandler(console_handler)
24
+
25
+ return logger
@@ -0,0 +1,125 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataforge-sdk
3
+ Version: 9.2.0rc3
4
+ Summary: SDK for creating DataForge extensions
5
+ Author-email: Vadim Orlov <vorlov@dataforgelabs.com>
6
+ Project-URL: Homepage, https://docs.dataforgelabs.com
7
+ Project-URL: Issues, https://docs.dataforgelabs.com/hc/en-us/requests/new
8
+ Requires-Python: >=3.11
9
+ Description-Content-Type: text/markdown
10
+
11
+ # dataforge-sdk
12
+ SDK for creating DataForge extensions.
13
+
14
+ ## Postgres Utilities
15
+
16
+ The `dataforge.pg` module provides helper functions to execute SQL operations against the DataForge Postgres metastore:
17
+
18
+ ```python
19
+ from dataforge.pg import select, update, pull
20
+
21
+ # Execute a SELECT query and return a Spark DataFrame
22
+ df = select("SELECT * FROM my_table")
23
+
24
+ # Execute an UPDATE/INSERT/DELETE query
25
+ update("UPDATE my_table SET col = 'value'")
26
+
27
+ # Trigger a new data pull for source_id 123
28
+ pull(123)
29
+ ```
30
+
31
+ ## IngestionSession
32
+
33
+ The `IngestionSession` class manages a custom data ingestion process lifecycle.
34
+
35
+ ```python
36
+ from dataforge import IngestionSession
37
+
38
+ # Initialize a session (production use)
39
+ session = IngestionSession()
40
+
41
+ # Initialize a session (optional source_name/project_name for testing)
42
+ session = IngestionSession(source_name="my_source", project_name="my_project")
43
+
44
+ # Ingest data
45
+ # pass a function returning a DataFrame (recommended to integrate logging with DataForge)
46
+ session.ingest(lambda: spark.read.csv("s3://bucket/path/input.csv"))
47
+
48
+ # pass a DataFrame (can be used for testing, not recommended for production deployment)
49
+ df = spark.read.csv("s3://bucket/path/input.csv")
50
+ session.ingest(df)
51
+
52
+ # ingest empty dataframe to create 0-record input
53
+ session.ingest()
54
+
55
+
56
+ # Fail the process with error message
57
+ session.fail("Error message")
58
+
59
+ # Retrieve latest tracking fields
60
+ tracking = session.latest_tracking_fields()
61
+
62
+ # Retrieve connection parameters for the current source
63
+ connection_parameters = session.connection_parameters()
64
+
65
+ # Retrieve custom parameters for the current source
66
+ custom_parameters = session.custom_parameters()
67
+
68
+ ```
69
+
70
+ ## ParsingSession
71
+
72
+ The `ParsingSession` class manages a custom parse process lifecycle.
73
+
74
+ ```python
75
+ from dataforge import ParsingSession
76
+
77
+ # Initialize a session (production use)
78
+ session = ParsingSession()
79
+
80
+ # Initialize a session (optional input_id for testing)
81
+ session = ParsingSession(input_id=123)
82
+
83
+ # Retrieve custom parameters
84
+ params = session.custom_parameters()
85
+
86
+ # Get the path of file to be parsed
87
+ path = session.file_path
88
+
89
+ # Run parsing: pass a DataFrame, a function returning a DataFrame or None (0-record file)
90
+ session.run(lambda: spark.read.json(session.file_path))
91
+
92
+ # Fail the process with error message
93
+ session.fail("Error message")
94
+
95
+ ```
96
+
97
+ ## PostOutputSession
98
+
99
+ The `PostOutputSession` class manages a custom post-output process lifecycle.
100
+
101
+ ```python
102
+ from dataforge import PostOutputSession
103
+
104
+ # Initialize a session (production use)
105
+ session = PostOutputSession()
106
+
107
+ # Initialize a session (optional names for testing)
108
+ session = PostOutputSession(output_name="report", output_source_name="my_source", project_name="my_project")
109
+
110
+
111
+ # Get the path of file generated by preceding output process
112
+ path = session.file_path()
113
+
114
+ # Retrieve connection parameters for the current output
115
+ connection_parameters = session.connection_parameters()
116
+
117
+ # Retrieve custom parameters for the current source
118
+ custom_parameters = session.custom_parameters()
119
+
120
+ # Run post-output logic: pass a function encapsulating custom code
121
+ session.run(lambda: print(f"Uploading file from {path}"))
122
+
123
+ # Fail the process with error message
124
+ session.fail("Error message")
125
+ ```
@@ -0,0 +1,17 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.cfg
4
+ dataforge/__init__.py
5
+ dataforge/_session.py
6
+ dataforge/ingestion_session.py
7
+ dataforge/parsing_session.py
8
+ dataforge/pg.py
9
+ dataforge/post_output_session.py
10
+ dataforge/postgres_connection.py
11
+ dataforge/process_record.py
12
+ dataforge/system_configuration.py
13
+ dataforge/utils.py
14
+ dataforge_sdk.egg-info/PKG-INFO
15
+ dataforge_sdk.egg-info/SOURCES.txt
16
+ dataforge_sdk.egg-info/dependency_links.txt
17
+ dataforge_sdk.egg-info/top_level.txt
@@ -0,0 +1,20 @@
1
+ [build-system]
2
+ requires = [
3
+ "setuptools>=61.0",
4
+ "wheel"
5
+ ]
6
+ build-backend = "setuptools.build_meta"
7
+ [project]
8
+ name = "dataforge-sdk"
9
+ version = "9.2.0-rc.3"
10
+ authors = [
11
+ {name="Vadim Orlov", email="vorlov@dataforgelabs.com"}
12
+ ]
13
+ description = "SDK for creating DataForge extensions"
14
+ readme = "README.md"
15
+ requires-python = ">=3.11"
16
+ dependencies = [
17
+ ]
18
+ [project.urls]
19
+ Homepage = "https://docs.dataforgelabs.com"
20
+ Issues = "https://docs.dataforgelabs.com/hc/en-us/requests/new"
@@ -0,0 +1,22 @@
1
+ [metadata]
2
+ name = sdk-python
3
+ version = attr: dataforge.__version__
4
+ description = SDK for creating DataForge extensions
5
+ long_description = file: README.md
6
+ long_description_content_type = text/markdown
7
+ classifiers =
8
+ Programming Language :: Python :: 3
9
+ Programming Language :: Python :: 3.11
10
+ Operating System :: OS Independent
11
+
12
+ [options]
13
+ packages = find:
14
+ install_requires =
15
+ pyspark>=3.0
16
+ psycopg2-binary>=2.9
17
+ python_requires = >=3.11
18
+
19
+ [egg_info]
20
+ tag_build =
21
+ tag_date = 0
22
+