dataforge-sdk 10.0.dev115__tar.gz → 10.0.0rc3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/PKG-INFO +3 -1
  2. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/__init__.py +1 -1
  3. dataforge_sdk-10.0.0rc3/dataforge/_session.py +40 -0
  4. dataforge_sdk-10.0.0rc3/dataforge/databricks/__init__.py +0 -0
  5. dataforge_sdk-10.0.dev115/dataforge/_databricks_session.py → dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_base_session.py +13 -66
  6. dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_ingestion_session.py +70 -0
  7. dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_parsing_session.py +46 -0
  8. dataforge_sdk-10.0.0rc3/dataforge/databricks/_databricks_pg.py +94 -0
  9. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/ingestion_session.py +2 -2
  10. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/parsing_session.py +4 -37
  11. dataforge_sdk-10.0.0rc3/dataforge/pg.py +10 -0
  12. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/post_output_session.py +0 -1
  13. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/postgres_connection.py +6 -3
  14. dataforge_sdk-10.0.0rc3/dataforge/snowflake/__init__.py +0 -0
  15. dataforge_sdk-10.0.0rc3/dataforge/snowflake/_snowflake_base_session.py +66 -0
  16. dataforge_sdk-10.0.0rc3/dataforge/snowflake/_snowflake_ingestion_session.py +39 -0
  17. dataforge_sdk-10.0.0rc3/dataforge/snowflake/_snowflake_parsing_session.py +30 -0
  18. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge_sdk.egg-info/PKG-INFO +3 -1
  19. dataforge_sdk-10.0.0rc3/dataforge_sdk.egg-info/SOURCES.txt +28 -0
  20. dataforge_sdk-10.0.0rc3/dataforge_sdk.egg-info/requires.txt +3 -0
  21. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/pyproject.toml +3 -3
  22. dataforge_sdk-10.0.dev115/dataforge/_session.py +0 -7
  23. dataforge_sdk-10.0.dev115/dataforge/_snowflake_session.py +0 -82
  24. dataforge_sdk-10.0.dev115/dataforge/pg.py +0 -91
  25. dataforge_sdk-10.0.dev115/dataforge_sdk.egg-info/SOURCES.txt +0 -20
  26. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/README.md +0 -0
  27. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/_base_session.py +0 -0
  28. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/process_record.py +0 -0
  29. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/system_configuration.py +0 -0
  30. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge/utils.py +0 -0
  31. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge_sdk.egg-info/dependency_links.txt +0 -0
  32. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/dataforge_sdk.egg-info/top_level.txt +0 -0
  33. {dataforge_sdk-10.0.dev115 → dataforge_sdk-10.0.0rc3}/setup.cfg +0 -0
PKG-INFO
@@ -1,12 +1,14 @@
  Metadata-Version: 2.4
  Name: dataforge-sdk
- Version: 10.0.dev115
+ Version: 10.0.0rc3
  Summary: SDK for creating DataForge extensions
  Author-email: Vadim Orlov <vorlov@dataforgelabs.com>
  Project-URL: Homepage, https://docs.dataforgelabs.com
  Project-URL: Issues, https://docs.dataforgelabs.com/hc/en-us/requests/new
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
+ Provides-Extra: psycopg2
+ Requires-Dist: psycopg2-binary>=2.9; extra == "psycopg2"

  # dataforge-sdk
  SDK for creating DataForge extensions.
dataforge/__init__.py
@@ -2,5 +2,5 @@ from .post_output_session import PostOutputSession
  from .ingestion_session import IngestionSession
  from .parsing_session import ParsingSession

- __version__ = "10.0.dev115"
+ __version__ = "10.0.0-rc.3"
  __all__ = ['PostOutputSession','IngestionSession', 'ParsingSession']
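Note on the two version spellings above: under PEP 440, "10.0.0-rc.3" normalizes to "10.0.0rc3", so the `__version__` string, the pyproject value, and the tarball name all refer to the same release. A quick check, assuming the `packaging` library is installed:

    # PEP 440 normalization: "-rc.3" and "rc3" are the same pre-release.
    from packaging.version import Version

    assert Version("10.0.0-rc.3") == Version("10.0.0rc3")
    print(Version("10.0.0-rc.3"))  # prints the normalized form: 10.0.0rc3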
dataforge/_session.py (new file)
@@ -0,0 +1,40 @@
+ import os
+
+
+ def _is_databricks_environment() -> bool:
+     """Detect whether the current runtime is Databricks."""
+     spark_obj = globals().get("spark")
+     if spark_obj is not None:
+         spark_class_name = getattr(getattr(spark_obj, "__class__", None), "__name__", None)
+         if spark_class_name == "SparkSession":
+             return True
+     if os.environ.get("DATABRICKS_RUNTIME_VERSION"):
+         return True
+     try:
+         from pyspark.sql import SparkSession  # type: ignore
+     except ImportError:
+         return False
+     try:
+         return SparkSession.getActiveSession() is not None
+     except Exception:
+         return False
+
+
+ if _is_databricks_environment():
+     from dataforge.databricks._databricks_base_session import _Databricks_Base_Session
+     from dataforge.databricks._databricks_ingestion_session import _Databricks_Ingestion_Session
+     from dataforge.databricks._databricks_parsing_session import _Databricks_Parsing_Session
+
+     _Session = _Databricks_Base_Session
+     _Ingestion_Session = _Databricks_Ingestion_Session
+     _Parsing_Session = _Databricks_Parsing_Session
+     _platform = "databricks"
+ else:
+     from dataforge.snowflake._snowflake_base_session import _Snowflake_Base_Session
+     from dataforge.snowflake._snowflake_ingestion_session import _Snowflake_Ingestion_Session
+     from dataforge.snowflake._snowflake_parsing_session import _Snowflake_Parsing_Session
+
+     _Session = _Snowflake_Base_Session
+     _Ingestion_Session = _Snowflake_Ingestion_Session
+     _Parsing_Session = _Snowflake_Parsing_Session
+     _platform = "snowflake"
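The new `_session.py` turns platform selection into an import-time decision. A minimal sketch of how downstream code can observe the choice; these are private module names taken from the hunk above, so treat it as illustrative:

    # Illustrative only: _platform and _Ingestion_Session are private names
    # assigned by the import-time branch shown above.
    from dataforge._session import _platform, _Ingestion_Session

    print(f"dataforge-sdk resolved platform: {_platform}")
    print(f"IngestionSession will inherit from: {_Ingestion_Session.__name__}")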
dataforge/_databricks_session.py → dataforge/databricks/_databricks_base_session.py
@@ -1,24 +1,22 @@
+ from __future__ import annotations
  import json
  import re
- from typing import Callable

- from pyspark.dbutils import DBUtils
- from pyspark.sql import SparkSession, DataFrame
- from pyspark.sql.functions import monotonically_increasing_id, lit
- from pyspark.sql.types import LongType

  from dataforge._base_session import _Base_Session


- class _Databricks_Session(_Base_Session):
+ class _Databricks_Base_Session(_Base_Session):
      """Base session class for Databricks platform.
      Class should not be instantiated by user directly: use process-specific Session classes instead
      Adds Spark session and DBUtils to Base_Session
      """
-     spark: SparkSession
-     dbutils: DBUtils
+

      def __init__(self):
+         from pyspark.sql import SparkSession, DataFrame
+         self.spark = SparkSession.builder.getOrCreate()
+         self.dbutils = self._get_dbutils()
          pg_connection_string_read = self.dbutils.secrets.get("sparky", "pg_read")
          core_jwt_token = self.dbutils.secrets.get("sparky", "coreJWT")
          try:
@@ -27,69 +25,17 @@ class _Databricks_Session(_Base_Session):
              process_id = None

          super().__init__(pg_connection_string_read, core_jwt_token, process_id)
-         self.spark = SparkSession.builder.getOrCreate()
-         self.dbutils = self._get_dbutils()
          self.process_parameters["start_process_flag"] = True
          self.logger.info(f"Initialized databricks base session for {self.__class__.__name__} with parameters {self.process_parameters}")


      def _get_dbutils(self):
+         from pyspark.dbutils import DBUtils
          return DBUtils(self.spark)


-     def ingest(self,df: DataFrame | Callable[[], DataFrame] | None = None):
-         """Ingest the provided DataFrame into the DataForge and update input record.
-
-         Writes the DataFrame to raw Parquet file,
-         updates the input record with status, file size, record count, and notifies
-         the Core API of process completion. On failure, updates logs and flags the input and process
-         records as failed.

-         Args:
-             df (Callable[[], DataFrame] | DataFrame): parameterless def that you defined, returning the Spark DataFrame to ingest (recommended),
-                 or spark DataFrame
-         """
-         try:
-             if not self._is_open:
-                 raise Exception("Session is closed")
-             if df is None:
-                 status = "Z"
-                 row_count = 0
-                 file_size = 0
-             else:
-                 if callable(df):
-                     result_df = df()  # call it to get the DataFrame
-                 else:
-                     result_df = df
-                 dest_file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
-                 file_size, row_count = self._write_parsed_data(result_df, dest_file_path)
-                 status = "P" if row_count > 0 else "Z"
-             input_update_json = {
-                 "ingestion_status_code": status,
-                 "extract_datetime": datetime.now().isoformat(),
-                 "file_size": file_size,
-                 "process_id": self.process.processId,
-                 "input_id": self.process.inputId,
-                 "record_counts": {"Total": row_count}
-             }
-
-             self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
-                          (json.dumps(input_update_json),), fetch=False)
-             self.logger.info("Ingestion completed successfully")
-
-         except Exception as e:
-             self._log_fail(e)
-             failure_update_json = {
-                 "process_id": self.process.processId,
-                 "ingestion_status_code": "F"
-             }
-             self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
-                          (json.dumps(failure_update_json),), fetch=False)
-         finally:
-             self._core_api_call(f"process-complete/{self.process.processId}")
-             self.close()
-
-     def _write_parsed_data(self, in_df: DataFrame, dest_file_path: str) -> tuple[int, int]:
+     def _write_parsed_data(self, in_df: pyspark.sql.DataFrame, dest_file_path: str) -> tuple[int, int]:
          """Process input DataFrame, write to Parquet, and update metadata.

          Args:
@@ -102,6 +48,10 @@ class _Databricks_Session(_Base_Session):
          Raises:
              Exception: If duplicate columns are detected or metadata update fails.
          """
+         from pyspark.sql.functions import monotonically_increasing_id, lit
+         from pyspark.sql.types import LongType
+
+
          self.log("Data read successfully. Checking schema.")

          select_list = self._pg.sql("SELECT sparky.get_select_list(%s)", (self.process.sourceId,))
@@ -126,8 +76,6 @@ class _Databricks_Session(_Base_Session):
          schema = []
          for f in df.schema.fields:
              field_name = f.name.lower() if self.process.forceCaseInsensitive else f.name
-             name_normalized = re.sub(r'\W+', '_', field_name)
-             column_normalized = ("_" if field_name[0].isdigit() else "") + name_normalized  # add leading underscore

              if f.dataType.simpleString().startswith("struct"):
                  spark_type = "StructType"
@@ -139,10 +87,9 @@ class _Databricks_Session(_Base_Session):
                  spark_type = type(f.dataType).__name__

              attr_schema = json.loads(f.dataType.json())
-             self.logger.info(f"Column `{column_normalized}` schema: {attr_schema}")
+             self.logger.info(f"Column `{field_name}` schema: {attr_schema}")
              schema.append({
                  "name": field_name,
-                 "column_normalized": column_normalized,
                  "spark_type": spark_type,
                  "schema": attr_schema
              })
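Annotations such as `in_df: pyspark.sql.DataFrame` work without any top-level pyspark import only because of the new `from __future__ import annotations` line: under PEP 563, annotations are stored as strings and never evaluated at definition time. A self-contained demonstration:

    # PEP 563 demo: the annotation below references a module that is never
    # imported, yet defining and calling the function raises no NameError.
    from __future__ import annotations

    def takes_df(df: pyspark.sql.DataFrame) -> int:
        return 1

    print(takes_df.__annotations__)  # {'df': 'pyspark.sql.DataFrame', 'return': 'int'}
    print(takes_df(object()))        # 1; the annotation is inert metadata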
dataforge/databricks/_databricks_ingestion_session.py (new file)
@@ -0,0 +1,70 @@
+ from __future__ import annotations
+ import json
+ import re
+ from datetime import datetime
+ from typing import Callable
+ from dataforge.databricks._databricks_base_session import _Databricks_Base_Session
+
+
+ class _Databricks_Ingestion_Session(_Databricks_Base_Session):
+     """Ingestion session class for Databricks platform.
+     Class should not be instantiated by user directly: use process-specific Session classes instead
+     """
+
+     def __init__(self):
+         super().__init__()
+
+     def ingest(self, df: pyspark.sql.DataFrame | Callable[[], pyspark.sql.DataFrame] | None = None):
+         """Ingest the provided DataFrame into DataForge and update the input record.
+
+         Writes the DataFrame to a raw Parquet file,
+         updates the input record with status, file size, and record count, and notifies
+         the Core API of process completion. On failure, updates logs and flags the input and process
+         records as failed.
+
+         Args:
+             df (Callable[[], DataFrame] | DataFrame): parameterless function that you defined, returning the Spark DataFrame to ingest (recommended),
+                 or a Spark DataFrame
+         """
+         try:
+             if not self._is_open:
+                 raise Exception("Session is closed")
+             if df is None:
+                 status = "Z"
+                 row_count = 0
+                 file_size = 0
+             else:
+                 if callable(df):
+                     result_df = df()  # call it to get the DataFrame
+                 else:
+                     result_df = df
+                 dest_file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
+                 file_size, row_count = self._write_parsed_data(result_df, dest_file_path)
+                 status = "P" if row_count > 0 else "Z"
+             input_update_json = {
+                 "ingestion_status_code": status,
+                 "extract_datetime": datetime.now().isoformat(),
+                 "file_size": file_size,
+                 "process_id": self.process.processId,
+                 "input_id": self.process.inputId,
+                 "record_counts": {"Total": row_count}
+             }
+
+             self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
+                          (json.dumps(input_update_json),), fetch=False)
+             self.logger.info("Ingestion completed successfully")
+
+         except Exception as e:
+             self._log_fail(e)
+             failure_update_json = {
+                 "process_id": self.process.processId,
+                 "ingestion_status_code": "F"
+             }
+             self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
+                          (json.dumps(failure_update_json),), fetch=False)
+         finally:
+             self._core_api_call(f"process-complete/{self.process.processId}")
+             self.close()
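A hedged usage sketch for the public wrapper (per the `ingestion_session.py` hunk further below, `IngestionSession` now inherits from this class); the source table name is illustrative, not part of the SDK:

    from pyspark.sql import SparkSession
    from dataforge import IngestionSession

    spark = SparkSession.builder.getOrCreate()
    session = IngestionSession()

    def load_orders():
        # Deferred read: ingest() invokes this callable itself, so a read
        # failure lands in the failure path shown in the hunk above.
        return spark.read.table("examples.orders")  # illustrative table name

    session.ingest(load_orders)  # the callable form is the documented recommendation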
dataforge/databricks/_databricks_parsing_session.py (new file)
@@ -0,0 +1,46 @@
+ from __future__ import annotations
+ from typing import Callable
+ from ._databricks_base_session import _Databricks_Base_Session
+
+
+ class _Databricks_Parsing_Session(_Databricks_Base_Session):
+     """Implements the run method for Databricks."""
+
+     def run(self, df: pyspark.sql.DataFrame | Callable[[], pyspark.sql.DataFrame] | None = None):
+         """Save a parsed file from the provided DataFrame and upload it into the DataForge data lake.
+
+         Writes the DataFrame to a parsed Parquet file,
+         updates the input record with status, file size, and record count, and notifies
+         the Core API of process completion. On failure, updates logs and flags the input and process
+         records as failed.
+
+         Args:
+             df (DataFrame): parameterless function that you defined, returning the Spark DataFrame containing parsed file data (recommended),
+                 or a Spark DataFrame
+         """
+         try:
+             if not self._is_open:
+                 raise Exception("Session is closed")
+             if callable(df):
+                 result_df = df()  # call it to get the DataFrame
+             else:
+                 result_df = df
+
+             if result_df is None or result_df.isEmpty():
+                 file_size, row_count = (0, 0)
+             else:
+                 dest_file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
+                 file_size, row_count = self._write_parsed_data(result_df, dest_file_path)
+             input_update_json = {
+                 "file_size": file_size,
+                 "input_id": self.process.inputId,
+                 "record_counts": {"Total": row_count}
+             }
+             self._end_process('P' if row_count > 0 else 'Z', input_update_json)
+
+         except Exception as e:
+             self._log_fail(e)
+             self._end_process("F")
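A similar hedged sketch for a custom parser via the public `ParsingSession` wrapper; the input path is illustrative:

    from pyspark.sql import SparkSession
    from dataforge import ParsingSession

    spark = SparkSession.builder.getOrCreate()
    session = ParsingSession()

    def parse_file():
        return spark.read.json("/tmp/raw_input.json")  # illustrative source path

    session.run(parse_file)  # writes parsed Parquet, ends the process with P/Z/F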
dataforge/databricks/_databricks_pg.py (new file)
@@ -0,0 +1,94 @@
+ """Postgres utilities for data operations.
+
+ This module provides a helper class to execute SQL queries against a Postgres
+ database using Spark JDBC for reads and a direct write connection for writes.
+ """
+ from __future__ import annotations
+ from dataforge.postgres_connection import PostgresConnection
+
+
+ class DataBricksPg:
+
+     def __init__(self):
+         from pyspark.dbutils import DBUtils
+         from pyspark.sql import SparkSession
+
+         self.spark = SparkSession.builder.getOrCreate()
+         self.dbutils = DBUtils(self.spark)
+         self.pg_connection_string_read = self.dbutils.secrets.get("sparky", "pg_read")
+
+     def update(self, query: str):
+         """Execute an update SQL query on the DataForge metastore Postgres database.
+
+         Args:
+             query (str): SQL query string to execute.
+
+         Returns:
+             None
+
+         Raises:
+             Exception: If the write connection cannot be established or SQL execution fails.
+         """
+         pg = self._get_pg_write_connection()
+         pg.sql(query, fetch=False)
+
+     def execute(self, query: str):
+         """Alias for update() to execute write SQL queries.
+
+         Args:
+             query (str): SQL query string to execute.
+
+         Returns:
+             None
+         """
+         self.update(query)
+
+     def select(self, query: str) -> pyspark.sql.DataFrame:
+         """Execute a SELECT SQL query on the DataForge metastore Postgres database and return a DataFrame with results.
+
+         Args:
+             query (str): SQL SELECT query string.
+
+         Returns:
+             DataFrame: Spark DataFrame containing query results.
+
+         Raises:
+             Exception: If Spark fails to load the data or there are connection issues.
+         """
+         return self.spark.read.format("jdbc") \
+             .option("url", self.pg_connection_string_read) \
+             .option("query", query) \
+             .load()
+
+     def pull(self, source_id: int):
+         """Trigger a new ingestion (pull data) on a DataForge source for a given source ID.
+
+         Args:
+             source_id (int): Identifier for the source to pull.
+
+         Returns:
+             None
+
+         Raises:
+             Exception: If the write connection cannot be established or SQL execution fails.
+         """
+         pg = self._get_pg_write_connection()
+         pg.sql("SELECT meta.svc_pull_source(%s, %s)", (source_id, 'sdk'), fetch=False)
+
+     def _get_pg_write_connection(self) -> PostgresConnection:
+         """Internal method to retrieve a PostgresConnection for write operations using secured secrets.
+
+         Returns:
+             PostgresConnection: Connection object for executing write queries.
+
+         Raises:
+             Exception: If the 'pg_write' secret is not defined in the 'sparky' scope.
+         """
+         secrets = self.dbutils.secrets.list("sparky")
+         if any(secret.key == "pg_write" for secret in secrets):
+             conn_string = self.dbutils.secrets.get("sparky", "pg_write")
+             return PostgresConnection(conn_string + "&application_name=sdk-pg")
+         else:
+             raise Exception("pg_write secret is not defined in sparky scope")
dataforge/ingestion_session.py
@@ -9,11 +9,11 @@ Classes:
  """
  import json
  from typing import Optional
- from ._session import _Session
+ from ._session import _Ingestion_Session
  from .process_record import ProcessRecord


- class IngestionSession(_Session):
+ class IngestionSession(_Ingestion_Session):

      """Session class to manage custom ingestion process lifecycle.

dataforge/parsing_session.py
@@ -1,12 +1,13 @@
+ from __future__ import annotations

  import json
  from typing import Optional, Callable
- from pyspark.sql import DataFrame
  from .process_record import ProcessRecord
- from ._session import _Session
+ from ._session import _Parsing_Session


- class ParsingSession(_Session):
+ class ParsingSession(_Parsing_Session):
+

      """Session class to manage custom parse process lifecycle.

@@ -44,39 +45,5 @@ class ParsingSession(_Session):
          """
          return self._parsing_parameters.get('custom_parameters')

-     def run(self,df: DataFrame | Callable[[], DataFrame] | None = None):
-         """Save parsed file from the provided DataFrame, and upload it into the DataForge data lake.
-
-         Writes the DataFrame to parsed Parquet file,
-         updates the input record with status, file size, record count, and notifies
-         the Core API of process completion. On failure, updates logs and flags the input and process
-         records as failed.
-
-         Args:
-             df (DataFrame): parameterless def that you defined, returning the Spark DataFrame containing parsed file data (recommended),
-                 or spark DataFrame
-         """
-         try:
-             if not self._is_open:
-                 raise Exception("Session is closed")
-             if callable(df):
-                 result_df = df()  # call it to get the DataFrame
-             else:
-                 result_df = df
-
-             if result_df is None or result_df.isEmpty():
-                 file_size, row_count = (0, 0)
-             else:
-                 dest_file_path = f"{self._systemConfiguration.datalakePath}/source_{self.process.sourceId}/parsed/parsed_input_{self.process.inputId}"
-                 file_size, row_count = self._write_parsed_data(result_df, dest_file_path)
-             input_update_json = {
-                 "file_size": file_size,
-                 "input_id": self.process.inputId,
-                 "record_counts": {"Total": row_count}
-             }
-             self._end_process('P' if row_count > 0 else 'Z', input_update_json)

-         except Exception as e:
-             self._log_fail(e)
-             self._end_process("F")

dataforge/pg.py (new file)
@@ -0,0 +1,10 @@
+ """Postgres utilities for data operations.
+
+ This module provides functions to execute SQL queries against a Postgres database
+ using Spark JDBC for reads and a direct write connection for write operations.
+ """
+ from dataforge._session import _platform
+
+ if _platform == 'databricks':
+     from dataforge.databricks._databricks_pg import DataBricksPg
+     pg = DataBricksPg()
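On Databricks, `dataforge.pg` now exposes a ready-made `DataBricksPg` instance. A hedged usage sketch; the queries and source ID are illustrative:

    from dataforge.pg import pg  # only defined when _platform == 'databricks'

    df = pg.select("SELECT source_id, source_name FROM meta.source")  # Spark DataFrame via JDBC
    df.show()

    pg.pull(42)  # trigger ingestion for source 42; requires the pg_write secret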
dataforge/post_output_session.py
@@ -16,7 +16,6 @@ class PostOutputSession(_Session):
      """Initialize custom post-output session and start a new post-output process.

      Args:
-         input_id (Optional[int]): Optional input_id of the batch for interactive testing.
              Leave blank for production use.
      """
      super().__init__()
dataforge/postgres_connection.py
@@ -16,7 +16,8 @@ class PostgresConnection:

      except Exception as e:
          logger.error(f"Error connecting to Postgres: {e}")
-         sys.exit(1)
+         raise
+         # sys.exit(1)

  def sql(self, query: str, params=None, fetch=True):
      try:
@@ -29,7 +30,8 @@ class PostgresConnection:
              return res[0]
      except Exception as e:
          self.logger.error(f"Error executing query {query}({params}) on Postgres: {e}")
-         sys.exit(1)
+         # sys.exit(1)
+         raise

  def connect(self, connection_string: str):
      # Execute a query
@@ -40,7 +42,8 @@ class PostgresConnection:
          # Change connection
      except Exception as e:
          self.logger.error(f"Error connecting to Postgres database or insufficient permissions. Details: {e}")
-         sys.exit(1)
+         # sys.exit(1)
+         raise

  def close(self):
      self.conn.close()
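Replacing `sys.exit(1)` with `raise` means callers can now catch database errors instead of having the interpreter terminated. A hedged sketch; the connection string is illustrative:

    from dataforge.postgres_connection import PostgresConnection

    try:
        pg = PostgresConnection("postgresql://user:pass@host/db")  # illustrative
        row = pg.sql("SELECT 1")
    except Exception as exc:
        # Previously unreachable: the SDK exited the process on any failure.
        print(f"query failed, handling gracefully: {exc}")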
dataforge/snowflake/_snowflake_base_session.py (new file)
@@ -0,0 +1,66 @@
+ from __future__ import annotations
+ from typing import Callable
+ from dataforge._base_session import _Base_Session
+ import sys
+
+ class _Snowflake_Base_Session(_Base_Session):
+     """Base session class for Snowflake platform.
+     Class should not be instantiated by user directly: use process-specific Session classes instead
+     Adds Snowpark session
+     """
+
+     def __init__(self):
+         from snowflake.snowpark.context import get_active_session
+         self.snowpark_session = get_active_session()
+         pg_connection_string_read = self.snowpark_session.sql("SELECT get_secret('DATAFORGE_PG_READ')").first(1)[0][0]
+         core_jwt_token = self.snowpark_session.sql("SELECT get_secret('DATAFORGE_CORE_JWT')").first(1)[0][0]
+         params = self.parse_key_value_args()
+         process_id = params.get('process_id')
+         self.input_id = params.get('input_id')
+
+         super().__init__(pg_connection_string_read, core_jwt_token, process_id)
+
+         self.process_parameters["start_process_flag"] = process_id is None
+
+         self.logger.info(f"Initialized Snowflake base session for {self.__class__.__name__} with parameters {self.process_parameters}")
+
+     def _write_input_table(self, df: snowflake.snowpark.dataframe.DataFrame | Callable[[], snowflake.snowpark.dataframe.DataFrame] | None = None):
+         from snowflake.snowpark.types import StructType, StructField, StringType
+         if not self._is_open:
+             raise Exception("Session is closed")
+         if df is None:
+             # create empty df
+             result_df = self.snowpark_session.create_dataframe([], StructType([StructField("id", StringType())]))
+         else:
+             if callable(df):
+                 result_df = df()  # call it to get the DataFrame
+             else:
+                 result_df = df
+         table = f"{self._systemConfiguration.dataLakeDbName}.{self._systemConfiguration.dataLakeSchemaName}.RAW_INPUT_{self.process.inputId}"
+         self.log(f"Writing dataframe to table {table}")
+         result_df.write.save_as_table(
+             table_name=table,
+             mode="overwrite",
+             table_type="transient"
+         )
+         self.log(f"Table {table} written")
+         if self.process.startProcessFlag:
+             # process started by IngestionSession, tell Core to continue and not run Notebook
+             self._pg.sql("SELECT sparky.sdk_complete_manual_process(%s)", [self.process.processId], fetch=False)
+
+     @staticmethod
+     def parse_key_value_args():
+         """
+         Parse command line arguments formatted as key=value into a dict.
+         Example: python script.py foo=123 bar=hello
+
+         Returns: {'foo': '123', 'bar': 'hello'}
+         """
+         argv = sys.argv
+         params: dict[str, str] = {}
+         for arg in argv:
+             if "=" in arg:
+                 key, value = arg.split("=", 1)  # split only on first '='
+                 params[key] = value
+         return params
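The key=value convention in `parse_key_value_args` can be exercised standalone. Note that, unlike the removed `_snowflake_session.py` version further below, arguments without an "=" are now silently ignored rather than raising:

    import sys

    sys.argv = ["job.py", "process_id=123", "input_id=456", "stray-arg"]  # simulated args

    params = {}
    for arg in sys.argv:
        if "=" in arg:
            key, value = arg.split("=", 1)  # split only on the first '='
            params[key] = value
    print(params)  # {'process_id': '123', 'input_id': '456'}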
dataforge/snowflake/_snowflake_ingestion_session.py (new file)
@@ -0,0 +1,39 @@
+ from __future__ import annotations
+ import json
+ from typing import Callable
+ from dataforge.snowflake._snowflake_base_session import _Snowflake_Base_Session
+
+
+ class _Snowflake_Ingestion_Session(_Snowflake_Base_Session):
+     """Base ingestion session class for Snowflake platform.
+     Class should not be instantiated by user directly: use process-specific Session classes instead
+     Adds Snowpark session
+     """
+
+     def __init__(self):
+         super().__init__()
+
+     def ingest(self, df: snowflake.snowpark.dataframe.DataFrame | Callable[[], snowflake.snowpark.dataframe.DataFrame] | None = None):
+         """Ingest the provided DataFrame into DataForge and update the input record.
+
+         Writes the DataFrame to a raw Snowflake table.
+
+         Args:
+             df (Callable[[], DataFrame] | DataFrame): parameterless function that you defined, returning the Snowpark DataFrame to ingest (recommended),
+                 or a Snowpark DataFrame
+         """
+         try:
+             self._write_input_table(df)
+         except Exception as e:
+             self._log_fail(e)
+             if self.process.startProcessFlag:
+                 # Fail input and process to prevent core from executing it
+                 failure_update_json = {
+                     "process_id": self.process.processId,
+                     "ingestion_status_code": "F"
+                 }
+                 self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
+                              (json.dumps(failure_update_json),), fetch=False)
+         finally:
+             self.close()
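A hedged Snowflake counterpart to the Databricks ingestion sketch above; inside a Snowflake stored procedure or worksheet an active Snowpark session already exists, and the table name is illustrative:

    from snowflake.snowpark.context import get_active_session
    from dataforge import IngestionSession

    session = IngestionSession()  # resolves to the Snowflake session off-Databricks

    def load_rows():
        return get_active_session().table("EXAMPLES.PUBLIC.ORDERS")  # illustrative

    session.ingest(load_rows)  # writes RAW_INPUT_<input_id> as a transient table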
dataforge/snowflake/_snowflake_parsing_session.py (new file)
@@ -0,0 +1,30 @@
+ from __future__ import annotations
+ from typing import Callable
+ from ._snowflake_base_session import _Snowflake_Base_Session
+
+
+ class _Snowflake_Parsing_Session(_Snowflake_Base_Session):
+     """Implements the run method for Snowflake."""
+
+     def run(self, df: snowflake.snowpark.dataframe.DataFrame | Callable[[], snowflake.snowpark.dataframe.DataFrame] | None = None):
+         """Save the parsed data from the provided DataFrame and upload it into the DataForge data lake.
+
+         Writes the DataFrame to the raw input table,
+         updates the input record with status, file size, and record count, and notifies
+         the Core API of process completion. On failure, updates logs and flags the input and process
+         records as failed.
+
+         Args:
+             df (DataFrame): parameterless function that you defined, returning the Snowpark DataFrame containing parsed file data (recommended),
+                 or a Snowpark DataFrame
+         """
+         try:
+             self._write_input_table(df)
+
+         except Exception as e:
+             self._log_fail(e)
+             if self.process.startProcessFlag:
+                 self._end_process("F")
dataforge_sdk.egg-info/PKG-INFO
@@ -1,12 +1,14 @@
  Metadata-Version: 2.4
  Name: dataforge-sdk
- Version: 10.0.dev115
+ Version: 10.0.0rc3
  Summary: SDK for creating DataForge extensions
  Author-email: Vadim Orlov <vorlov@dataforgelabs.com>
  Project-URL: Homepage, https://docs.dataforgelabs.com
  Project-URL: Issues, https://docs.dataforgelabs.com/hc/en-us/requests/new
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
+ Provides-Extra: psycopg2
+ Requires-Dist: psycopg2-binary>=2.9; extra == "psycopg2"

  # dataforge-sdk
  SDK for creating DataForge extensions.
dataforge_sdk.egg-info/SOURCES.txt (new file)
@@ -0,0 +1,28 @@
+ README.md
+ pyproject.toml
+ setup.cfg
+ dataforge/__init__.py
+ dataforge/_base_session.py
+ dataforge/_session.py
+ dataforge/ingestion_session.py
+ dataforge/parsing_session.py
+ dataforge/pg.py
+ dataforge/post_output_session.py
+ dataforge/postgres_connection.py
+ dataforge/process_record.py
+ dataforge/system_configuration.py
+ dataforge/utils.py
+ dataforge/databricks/__init__.py
+ dataforge/databricks/_databricks_base_session.py
+ dataforge/databricks/_databricks_ingestion_session.py
+ dataforge/databricks/_databricks_parsing_session.py
+ dataforge/databricks/_databricks_pg.py
+ dataforge/snowflake/__init__.py
+ dataforge/snowflake/_snowflake_base_session.py
+ dataforge/snowflake/_snowflake_ingestion_session.py
+ dataforge/snowflake/_snowflake_parsing_session.py
+ dataforge_sdk.egg-info/PKG-INFO
+ dataforge_sdk.egg-info/SOURCES.txt
+ dataforge_sdk.egg-info/dependency_links.txt
+ dataforge_sdk.egg-info/requires.txt
+ dataforge_sdk.egg-info/top_level.txt
dataforge_sdk.egg-info/requires.txt (new file)
@@ -0,0 +1,3 @@
+
+ [psycopg2]
+ psycopg2-binary>=2.9
pyproject.toml
@@ -6,15 +6,15 @@ requires = [
  build-backend = "setuptools.build_meta"
  [project]
  name = "dataforge-sdk"
- version = "10.0.dev115"
+ version = "10.0.0-rc.3"
  authors = [
      {name="Vadim Orlov", email="vorlov@dataforgelabs.com"}
  ]
  description = "SDK for creating DataForge extensions"
  readme = "README.md"
  requires-python = ">=3.10"
- dependencies = [
- ]
+ [project.optional-dependencies]
+ psycopg2 = ["psycopg2-binary>=2.9"]
  [project.urls]
  Homepage = "https://docs.dataforgelabs.com"
  Issues = "https://docs.dataforgelabs.com/hc/en-us/requests/new"
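The empty `dependencies` list is gone, and psycopg2 moves behind an optional extra, installable as `pip install "dataforge-sdk[psycopg2]"`. Consumers can degrade gracefully when the driver is absent; a minimal sketch:

    try:
        import psycopg2  # provided by the new optional [psycopg2] extra
    except ImportError:
        psycopg2 = None
        print("psycopg2 extra not installed; direct Postgres access unavailable")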
dataforge_sdk-10.0.dev115/dataforge/_session.py (removed)
@@ -1,7 +0,0 @@
- # Pick base class at import time
- if "spark" in globals() and type(spark).__name__ == 'SparkSession':
-     from dataforge._databricks_session import _Databricks_Session
-     _Session = _Databricks_Session
- else:
-     from dataforge._snowflake_session import _Snowflake_Session
-     _Session = _Snowflake_Session
dataforge_sdk-10.0.dev115/dataforge/_snowflake_session.py (removed)
@@ -1,82 +0,0 @@
- import json
- from typing import Callable
-
- from dataforge._base_session import _Base_Session
- import sys
- from snowflake.snowpark.context import get_active_session
- import streamlit as st
- from snowflake.snowpark.dataframe import DataFrame
-
- class _Snowflake_Session(_Base_Session):
-     """Base session class for Snowflake platform.
-     Class should not be instantiated by user directly: use process-specific Session classes instead
-     Adds Snowpark session
-     """
-
-     def __init__(self):
-         pg_connection_string_read = st.secrets['DATAFORGE_PG_READ']
-         core_jwt_token = st.secrets['DATAFORGE_CORE_JWT']
-         params = self.parse_key_value_args()
-         process_id = params.get('process_id')
-         self.input_id = params.get('input_id')
-
-         super().__init__(pg_connection_string_read, core_jwt_token, process_id)
-         self.snowpark_session = get_active_session()
-         self.process_parameters["start_process_flag"] = process_id is None
-
-         self.logger.info(f"Initialized Snowflake base session for {self.__class__.__name__} with parameters {self.process_parameters}")
-
-
-     @staticmethod
-     def parse_key_value_args():
-         """
-         Parse command line arguments formatted as key=value into a dict.
-         Example: python script.py foo=123 bar=hello
-
-         Returns: {'foo': '123', 'bar': 'hello'}
-         """
-         argv = sys.argv
-         params: dict[str, str] = {}
-         for arg in argv:
-             if "=" in arg:
-                 key, value = arg.split("=", 1)  # split only on first '='
-                 params[key] = value
-             else:
-                 raise ValueError(f"Invalid argument format (expected key=value): {arg}")
-         return params
-
-     def ingest(self, df: DataFrame | Callable[[], DataFrame] | None = None):
-         """Ingest the provided DataFrame into the DataForge and update input record.
-
-         Writes the DataFrame to raw Snowflake table
-
-         Args:
-             df (Callable[[], DataFrame] | DataFrame): parameterless def that you defined, returning the Spark DataFrame to ingest (recommended),
-                 or spark DataFrame
-         """
-         try:
-             if not self._is_open:
-                 raise Exception("Session is closed")
-             table = f"{self._systemConfiguration.dataLakeDbName}.{self._systemConfiguration.dataLakeSchemaName}.INPUT_{self.process.inputId}"
-             self.log(f"Writing dataframe to table {table}")
-             df.write.save_as_table(
-                 name=table,
-                 mode="overwrite",
-                 table_type="transient"
-             )
-             self.log(f"Table {table} written")
-             if self.process.startProcessFlag:
-                 # process started by IngestionSession, tell Core to continue and not run Notebook
-                 self._pg.sql("SELECT sparky.sdk_complete_manual_process(%s)", [self.process.processId], fetch=False)
-         except Exception as e:
-             self._log_fail(e)
-             if self.process.startProcessFlag:
-                 # Fail input and process to prevent core from executing it
-                 failure_update_json = {
-                     "process_id": self.process.processId,
-                     "ingestion_status_code": "F"
-                 }
-                 self._pg.sql("SELECT meta.prc_iw_in_update_input_record(%s)",
-                              (json.dumps(failure_update_json),), fetch=False)
-         finally:
-             self.close()
dataforge_sdk-10.0.dev115/dataforge/pg.py (removed)
@@ -1,91 +0,0 @@
- """Postgres utilities for data operations.
-
- This module provides functions to execute SQL queries against a Postgres database
- using Spark JDBC for reads and a direct write connection for write operations.
- """
- from pyspark.dbutils import DBUtils
- from pyspark.sql import SparkSession, DataFrame
- from .postgres_connection import PostgresConnection
-
- spark = SparkSession.builder.getOrCreate()
- dbutils = DBUtils(spark)
- pg_connection_string_read = dbutils.secrets.get("sparky", "pg_read")
-
-
- def update(query: str):
-     """Execute an update SQL query on the DataForge metastore Postgres database.
-
-     Args:
-         query (str): SQL query string to execute.
-
-     Returns:
-         None
-
-     Raises:
-         Exception: If write connection cannot be established or SQL execution fails.
-     """
-     pg = _get_pg_write_connection()
-     pg.sql(query, fetch=False)
-
-
- def execute(query: str):
-     """Alias for update() to execute write SQL queries.
-
-     Args:
-         query (str): SQL query string to execute.
-
-     Returns:
-         None
-     """
-     update(query)
-
-
- def select(query: str) -> DataFrame:
-     """Execute a SELECT SQL query on the DataForge metastore Postgres database and return a DataFrame with results.
-
-     Args:
-         query (str): SQL SELECT query string.
-
-     Returns:
-         DataFrame: Spark DataFrame containing query results.
-
-     Raises:
-         Exception: If Spark fails to load data or connection issues.
-     """
-     return spark.read.format("jdbc") \
-         .option("url", pg_connection_string_read) \
-         .option("query", query) \
-         .load()
-
-
- def pull(source_id: int):
-     """Trigger new ingestion (pull data) on DataForge source for a given source ID.
-
-     Args:
-         source_id (int): Identifier for the source to pull.
-
-     Returns:
-         None
-
-     Raises:
-         Exception: If write connection cannot be established or SQL execution fails.
-     """
-     pg = _get_pg_write_connection()
-     pg.sql("SELECT meta.svc_pull_source(%s, %s)", (source_id, 'sdk'), fetch=False)
-
-
- def _get_pg_write_connection() -> PostgresConnection:
-     """Internal method to retrieve a PostgresConnection for write operations using secured secrets.
-
-     Returns:
-         PostgresConnection: Connection object for executing write queries.
-
-     Raises:
-         Exception: If the 'pg_write' secret is not defined in the 'sparky' scope.
-     """
-     secrets = dbutils.secrets.list("sparky")
-     if any(secret.key == "pg_write" for secret in secrets):
-         conn_string = dbutils.secrets.get("sparky", "pg_write")
-         return PostgresConnection(conn_string + "&application_name=sdk-pg")
-     else:
-         raise Exception("pg_write secret is not defined in sparky scope")
dataforge_sdk-10.0.dev115/dataforge_sdk.egg-info/SOURCES.txt (removed)
@@ -1,20 +0,0 @@
- README.md
- pyproject.toml
- setup.cfg
- dataforge/__init__.py
- dataforge/_base_session.py
- dataforge/_databricks_session.py
- dataforge/_session.py
- dataforge/_snowflake_session.py
- dataforge/ingestion_session.py
- dataforge/parsing_session.py
- dataforge/pg.py
- dataforge/post_output_session.py
- dataforge/postgres_connection.py
- dataforge/process_record.py
- dataforge/system_configuration.py
- dataforge/utils.py
- dataforge_sdk.egg-info/PKG-INFO
- dataforge_sdk.egg-info/SOURCES.txt
- dataforge_sdk.egg-info/dependency_links.txt
- dataforge_sdk.egg-info/top_level.txt