salesforce-data-customcode 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. datacustomcode/__init__.py +20 -0
  2. datacustomcode/cli.py +142 -0
  3. datacustomcode/client.py +227 -0
  4. datacustomcode/cmd.py +105 -0
  5. datacustomcode/config.py +149 -0
  6. datacustomcode/config.yaml +15 -0
  7. datacustomcode/credentials.py +97 -0
  8. datacustomcode/deploy.py +380 -0
  9. datacustomcode/io/__init__.py +14 -0
  10. datacustomcode/io/base.py +28 -0
  11. datacustomcode/io/reader/__init__.py +14 -0
  12. datacustomcode/io/reader/base.py +34 -0
  13. datacustomcode/io/reader/query_api.py +116 -0
  14. datacustomcode/io/writer/__init__.py +14 -0
  15. datacustomcode/io/writer/base.py +49 -0
  16. datacustomcode/io/writer/csv.py +41 -0
  17. datacustomcode/io/writer/print.py +33 -0
  18. datacustomcode/mixin.py +94 -0
  19. datacustomcode/py.typed +0 -0
  20. datacustomcode/run.py +47 -0
  21. datacustomcode/scan.py +153 -0
  22. datacustomcode/template.py +36 -0
  23. datacustomcode/templates/.devcontainer/devcontainer.json +10 -0
  24. datacustomcode/templates/Dockerfile +20 -0
  25. datacustomcode/templates/README.md +0 -0
  26. datacustomcode/templates/jupyterlab.sh +97 -0
  27. datacustomcode/templates/payload/config.json +1 -0
  28. datacustomcode/templates/payload/entrypoint.py +10 -0
  29. datacustomcode/templates/requirements-dev.txt +10 -0
  30. datacustomcode/templates/requirements.txt +1 -0
  31. salesforce_data_customcode-0.1.0.dist-info/LICENSE.txt +206 -0
  32. salesforce_data_customcode-0.1.0.dist-info/METADATA +159 -0
  33. salesforce_data_customcode-0.1.0.dist-info/RECORD +35 -0
  34. salesforce_data_customcode-0.1.0.dist-info/WHEEL +4 -0
  35. salesforce_data_customcode-0.1.0.dist-info/entry_points.txt +5 -0
datacustomcode/credentials.py
@@ -0,0 +1,97 @@
+ # Copyright (c) 2025, Salesforce, Inc.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from __future__ import annotations
+
+ import configparser
+ from dataclasses import dataclass
+ import os
+
+ from loguru import logger
+
+ ENV_CREDENTIALS = {
+     "username": "SFDC_USERNAME",
+     "password": "SFDC_PASSWORD",
+     "client_id": "SFDC_CLIENT_ID",
+     "client_secret": "SFDC_CLIENT_SECRET",
+     "login_url": "SFDC_LOGIN_URL",
+ }
+ INI_FILE = os.path.expanduser("~/.datacustomcode/credentials.ini")
+
+
+ @dataclass
+ class Credentials:
+     username: str
+     password: str
+     client_id: str
+     client_secret: str
+     login_url: str
+
+     @classmethod
+     def from_ini(
+         cls,
+         profile: str = "default",
+         ini_file: str = INI_FILE,
+     ) -> Credentials:
+         config = configparser.ConfigParser()
+         logger.debug(f"Reading {ini_file} for profile {profile}")
+         config.read(ini_file)
+         return cls(
+             username=config[profile]["username"],
+             password=config[profile]["password"],
+             client_id=config[profile]["client_id"],
+             client_secret=config[profile]["client_secret"],
+             login_url=config[profile]["login_url"],
+         )
+
+     @classmethod
+     def from_env(cls) -> Credentials:
+         try:
+             return cls(**{k: os.environ[v] for k, v in ENV_CREDENTIALS.items()})
+         except KeyError as exc:
+             raise ValueError(
+                 f"All of {ENV_CREDENTIALS.values()} must be set in environment."
+             ) from exc
+
+     @classmethod
+     def from_available(cls) -> Credentials:
+         if os.environ.get("SFDC_USERNAME"):
+             return cls.from_env()
+         if os.path.exists(INI_FILE):
+             return cls.from_ini()
+         raise ValueError(
+             "Credentials not found in env or ini file. "
+             "Run `datacustomcode configure` to create a credentials file."
+         )
+
+     def update_ini(self, profile: str = "default", ini_file: str = INI_FILE) -> None:
+         config = configparser.ConfigParser()
+
+         expanded_ini_file = os.path.expanduser(ini_file)
+         os.makedirs(os.path.dirname(expanded_ini_file), exist_ok=True)
+
+         if os.path.exists(expanded_ini_file):
+             config.read(expanded_ini_file)
+
+         if profile not in config:
+             config[profile] = {}
+
+         config[profile]["username"] = self.username
+         config[profile]["password"] = self.password
+         config[profile]["client_id"] = self.client_id
+         config[profile]["client_secret"] = self.client_secret
+         config[profile]["login_url"] = self.login_url
+
+         with open(expanded_ini_file, "w") as f:
+             config.write(f)
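
The credential resolution above is worth noting: from_available() prefers the SFDC_* environment variables and falls back to the "default" profile in ~/.datacustomcode/credentials.ini. A minimal usage sketch against exactly this API, nothing here goes beyond what the file defines:

    from datacustomcode.credentials import Credentials

    # Reads SFDC_USERNAME etc. from the environment if set, otherwise the
    # [default] profile of ~/.datacustomcode/credentials.ini.
    creds = Credentials.from_available()

    # Persist (or update) the credentials under a named ini profile.
    creds.update_ini(profile="default")
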
datacustomcode/deploy.py
@@ -0,0 +1,380 @@
+ # Copyright (c) 2025, Salesforce, Inc.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from __future__ import annotations
+
+ import copy
+ from html import unescape
+ import json
+ import os
+ import shutil
+ import tarfile
+ import tempfile
+ import time
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Callable,
+     Dict,
+     List,
+     Union,
+ )
+
+ from loguru import logger
+ from pydantic import BaseModel
+ import requests
+
+ from datacustomcode.cmd import cmd_output
+ from datacustomcode.scan import scan_file
+
+ if TYPE_CHECKING:
+     from datacustomcode.credentials import Credentials
+
+ DATA_CUSTOM_CODE_PATH = "services/data/v63.0/ssot/data-custom-code"
+ DATA_TRANSFORMS_PATH = "services/data/v63.0/ssot/data-transforms"
+ AUTH_PATH = "services/oauth2/token"
+ WAIT_FOR_DEPLOYMENT_TIMEOUT = 3000  # seconds
+
+
+ class TransformationJobMetadata(BaseModel):
+     name: str
+     version: str
+     description: str
+
+
+ def _join_strip_url(*args: str) -> str:
+     return "/".join(arg.strip("/") for arg in args)
+
+
+ JSONValue = Union[
+     Dict[str, "JSONValue"], List["JSONValue"], str, int, float, bool, None
+ ]
+
+
+ def _make_api_call(
+     url: str,
+     method: str,
+     headers: Union[dict, None] = None,
+     token: Union[str, None] = None,
+     **kwargs,
+ ) -> dict[str, JSONValue]:
+     """Make a request to the Data Cloud Custom Code API."""
+     headers = headers or {}
+     if token:
+         headers["Authorization"] = f"Bearer {token}"
+
+     logger.debug(f"Making API call: {method} {url}")
+     logger.debug(f"Headers: {headers}")
+     logger.debug(f"Request params: {kwargs}")
+
+     response = requests.request(method=method, url=url, headers=headers, **kwargs)
+     response.raise_for_status()
+     json_response = response.json()
+     assert isinstance(
+         json_response, dict
+     ), f"Unexpected response type: {type(json_response)}"
+     return json_response
+
+
+ class AccessTokenResponse(BaseModel):
+     access_token: str
+     instance_url: str
+
+
+ def _retrieve_access_token(credentials: Credentials) -> AccessTokenResponse:
+     """Get a token for the Salesforce API."""
+     logger.debug("Getting oauth token...")
+
+     url = f"{credentials.login_url.rstrip('/')}/{AUTH_PATH.lstrip('/')}"
+
+     data = {
+         "grant_type": "password",
+         "username": credentials.username,
+         "password": credentials.password,
+         "client_id": credentials.client_id,
+         "client_secret": credentials.client_secret,
+     }
+     response = _make_api_call(url, "POST", data=data)
+     return AccessTokenResponse(**response)
+
+
+ class CreateDeploymentResponse(BaseModel):
+     fileUploadUrl: str
+
+
+ def create_deployment(
+     access_token: AccessTokenResponse, metadata: TransformationJobMetadata
+ ) -> CreateDeploymentResponse:
+     """Create a custom code deployment in Data Cloud."""
+     url = _join_strip_url(access_token.instance_url, DATA_CUSTOM_CODE_PATH)
+     body = {
+         "label": metadata.name,
+         "name": metadata.name,
+         "description": metadata.description,
+         "version": metadata.version,
+         "computeType": "CPU_M",
+     }
+     logger.debug(f"Creating deployment {metadata.name}...")
+     try:
+         response = _make_api_call(
+             url, "POST", token=access_token.access_token, json=body
+         )
+         return CreateDeploymentResponse(**response)
+     except requests.HTTPError as exc:
+         if exc.response.status_code == 409:
+             raise ValueError(
+                 f"Deployment {metadata.name} exists. Please use a different name."
+             ) from exc
+         raise
+
+
+ DOCKER_IMAGE_NAME = "datacloud-custom-code"
+ DEPENDENCIES_ARCHIVE_NAME = "dependencies.tar.gz"
+ ZIP_FILE_NAME = "deployment.zip"
+
+
+ def prepare_dependency_archive(directory: str) -> None:
+     cmd = f"docker images -q {DOCKER_IMAGE_NAME}"
+     image_exists = cmd_output(cmd)
+
+     if not image_exists:
+         logger.debug("Building docker image...")
+         cmd = f"docker build -t {DOCKER_IMAGE_NAME} ."
+         cmd_output(cmd)
+
+     with tempfile.TemporaryDirectory() as temp_dir:
+         shutil.copy("requirements.txt", temp_dir)
+         cmd = (
+             f"docker run --rm "
+             f"-v {temp_dir}:/dependencies "
+             f"{DOCKER_IMAGE_NAME} "
+             '/bin/bash -c "cd /dependencies && pip download -r requirements.txt"'
+         )
+         cmd_output(cmd)
+
+         archives_dir = os.path.join(directory, "archives")
+         os.makedirs(archives_dir, exist_ok=True)
+         archive_file = os.path.join(archives_dir, DEPENDENCIES_ARCHIVE_NAME)
+         with tarfile.open(archive_file, "w:gz") as tar:
+             for file in os.listdir(temp_dir):
+                 tar.add(os.path.join(temp_dir, file), arcname=file)
+
+         logger.debug(f"Dependencies downloaded and archived to {archive_file}")
+
+
+ def zip_and_upload_directory(directory: str, file_upload_url: str) -> None:
+     file_upload_url = unescape(file_upload_url)
+
+     logger.debug(f"Zipping directory... {directory}")
+     shutil.make_archive(ZIP_FILE_NAME.removesuffix(".zip"), "zip", directory)
+
+     logger.debug(f"Uploading deployment to {file_upload_url}")
+     with open(ZIP_FILE_NAME, "rb") as zip_file:
+         response = requests.put(
+             file_upload_url, data=zip_file, headers={"Content-Type": "application/zip"}
+         )
+         response.raise_for_status()
+
+
+ class DeploymentsResponse(BaseModel):
+     deploymentStatus: str
+
+
+ def get_deployments(
+     access_token: AccessTokenResponse, metadata: TransformationJobMetadata
+ ) -> DeploymentsResponse:
+     """Get the status of a custom code deployment from Data Cloud."""
+     url = _join_strip_url(
+         access_token.instance_url, DATA_CUSTOM_CODE_PATH, metadata.name
+     )
+     response = _make_api_call(url, "GET", token=access_token.access_token)
+     return DeploymentsResponse(**response)
+
+
+ def wait_for_deployment(
+     access_token: AccessTokenResponse,
+     metadata: TransformationJobMetadata,
+     callback: Union[Callable[[str], None], None] = None,
+ ) -> None:
+     """Wait for deployment to complete.
+
+     Args:
+         callback: Optional callback function that receives the deployment status
+     """
+     start_time = time.time()
+     logger.debug("Waiting for deployment to complete")
+
+     while True:
+         deployment_status = get_deployments(access_token, metadata)
+         status = deployment_status.deploymentStatus
+         if (time.time() - start_time) > WAIT_FOR_DEPLOYMENT_TIMEOUT:
+             raise TimeoutError("Deployment timed out.")
+
+         if callback:
+             callback(status)
+         if status == "Deployed":
+             logger.debug(
+                 f"Deployment completed, elapsed time: {time.time() - start_time}"
+             )
+             break
+         time.sleep(1)
+
+
+ DATA_TRANSFORM_REQUEST_TEMPLATE: dict[str, Any] = {
+     "metadata": {
+         "dbt_schema_version": "https://schemas.getdbt.com/dbt/manifest/v8.json",
+         "dbt_version": "1.4.6",
+         "generated_at": "2023-04-25T18:54:11.375589Z",
+         "invocation_id": "d6c68c69-533a-4d54-861e-1493d6cd8092",
+         "env": {},
+         "project_id": "jaffle_shop",
+         "user_id": "1ca8403c-a1a5-43af-8b88-9265e948b9d2",
+         "send_anonymous_usage_stats": True,
+         "adapter_type": "spark",
+     },
+     "nodes": {
+         "model.dcexample.dim_listings_w_hosts": {
+             "name": "dim_listings_w_hosts",
+             "resource_type": "model",
+             "relation_name": "{OUTPUT_DLO}",
+             "config": {"materialized": "table"},
+             "compiled_code": "",
+             "depends_on": {"nodes": []},
+         }
+     },
+     "sources": {
+         "source.dcexample.listings": {
+             "name": "listings",
+             "resource_type": "source",
+             "relation_name": "{INPUT_DLO}",
+             "identifier": "{INPUT_DLO}",
+         }
+     },
+     "macros": {
+         "macro.dcexample.byoc": {
+             "name": "byoc_example",
+             "resource_type": "macro",
+             "path": "",
+             "original_file_path": "",
+             "unique_id": "unique id",
+             "macro_sql": "",
+             "supported_languages": None,
+             "arguments": [{"name": "{SCRIPT_NAME}", "type": "BYOC_SCRIPT"}],
+         }
+     },
+ }
+
+
+ class DataTransformConfig(BaseModel):
+     input: Union[str, list[str]]
+     output: Union[str, list[str]]
+
+
+ DATA_TRANSFORM_CONFIG_TEMPLATE: dict[str, Any] = {
+     "entryPoint": "entrypoint.py",
+     "dataspace": "default",
+     "permissions": {"read": {"dlo": ""}, "write": {"dlo": ""}},
+ }
+
+
+ def get_data_transform_config(directory: str) -> DataTransformConfig:
+     """Get the data transform config from the entrypoint.py file."""
+     entrypoint_file = os.path.join(directory, "entrypoint.py")
+     data_access_layer_calls = scan_file(entrypoint_file)
+     input_ = data_access_layer_calls.input_str
+     output = data_access_layer_calls.output_str
+     return DataTransformConfig(input=input_, output=output)
+
+
+ def create_data_transform_config(directory: str) -> None:
+     """Create a data transform config.json file in the directory."""
+     data_transform_config = get_data_transform_config(directory)
+     # deepcopy so the nested "permissions" dict in the template is not mutated
+     request_hydrated = copy.deepcopy(DATA_TRANSFORM_CONFIG_TEMPLATE)
+     request_hydrated["permissions"]["read"]["dlo"] = data_transform_config.input
+     request_hydrated["permissions"]["write"]["dlo"] = data_transform_config.output
+     logger.debug(f"Creating data transform config in {directory}")
+     with open(os.path.join(directory, "config.json"), "w") as config_file:
+         json.dump(request_hydrated, config_file, indent=4)
+
+
+ def create_data_transform(
+     directory: str,
+     access_token: AccessTokenResponse,
+     metadata: TransformationJobMetadata,
+ ) -> dict:
+     """Create a data transform in Data Cloud."""
+     script_name = metadata.name
+     data_transform_config = get_data_transform_config(directory)
+     request_hydrated = copy.deepcopy(DATA_TRANSFORM_REQUEST_TEMPLATE)
+     # the model node is the transform's output DLO; the source is its input DLO
+     request_hydrated["nodes"]["model.dcexample.dim_listings_w_hosts"][
+         "relation_name"
+     ] = data_transform_config.output
+     request_hydrated["sources"]["source.dcexample.listings"][
+         "relation_name"
+     ] = data_transform_config.input
+     request_hydrated["sources"]["source.dcexample.listings"][
+         "identifier"
+     ] = data_transform_config.input
+     request_hydrated["macros"]["macro.dcexample.byoc"]["arguments"][0][
+         "name"
+     ] = script_name
+
+     body = {
+         "definition": {
+             "type": "DBT",
+             "manifest": request_hydrated,
+             "version": "56.0",
+         },
+         "label": f"{metadata.name}",
+         "name": f"{metadata.name}",
+         "type": "BATCH",
+     }
+
+     url = _join_strip_url(access_token.instance_url, DATA_TRANSFORMS_PATH)
+     response = _make_api_call(url, "POST", token=access_token.access_token, json=body)
+     return response
+
+
+ def deploy_full(
+     directory: str,
+     metadata: TransformationJobMetadata,
+     credentials: Credentials,
+     callback: Union[Callable[[str], None], None] = None,
+ ) -> AccessTokenResponse:
+     """Deploy a data transform in Data Cloud."""
+     access_token = _retrieve_access_token(credentials)
+
+     # prepare payload
+     prepare_dependency_archive(directory)
+     create_data_transform_config(directory)
+
+     # create deployment and upload payload
+     deployment = create_deployment(access_token, metadata)
+     zip_and_upload_directory(directory, deployment.fileUploadUrl)
+     wait_for_deployment(access_token, metadata, callback)
+
+     # create data transform
+     create_data_transform(directory, access_token, metadata)
+     return access_token
+
+
+ def run_data_transform(
+     access_token: AccessTokenResponse, metadata: TransformationJobMetadata
+ ) -> dict:
+     logger.debug(f"Triggering data transform {metadata.name}")
+     url = _join_strip_url(
+         access_token.instance_url, DATA_TRANSFORMS_PATH, metadata.name, "actions", "run"
+     )
+     return _make_api_call(url, "POST", token=access_token.access_token)
datacustomcode/io/__init__.py
@@ -0,0 +1,14 @@
+ # Copyright (c) 2025, Salesforce, Inc.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
datacustomcode/io/base.py
@@ -0,0 +1,28 @@
+ # Copyright (c) 2025, Salesforce, Inc.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from __future__ import annotations
+
+ from abc import ABC
+ from typing import TYPE_CHECKING
+
+ from datacustomcode.mixin import UserExtendableNamedConfigMixin
+
+ if TYPE_CHECKING:
+     from pyspark.sql import SparkSession
+
+
+ class BaseDataAccessLayer(ABC, UserExtendableNamedConfigMixin):
+     def __init__(self, spark: SparkSession):
+         self.spark = spark
datacustomcode/io/reader/__init__.py
@@ -0,0 +1,14 @@
+ # Copyright (c) 2025, Salesforce, Inc.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
datacustomcode/io/reader/base.py
@@ -0,0 +1,34 @@
+ # Copyright (c) 2025, Salesforce, Inc.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from __future__ import annotations
+
+ from abc import abstractmethod
+ from typing import TYPE_CHECKING
+
+ from datacustomcode.io.base import BaseDataAccessLayer
+
+ if TYPE_CHECKING:
+     from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession
+
+
+ class BaseDataCloudReader(BaseDataAccessLayer):
+     def __init__(self, spark: SparkSession):
+         self.spark = spark
+
+     @abstractmethod
+     def read_dlo(self, name: str) -> PySparkDataFrame: ...
+
+     @abstractmethod
+     def read_dmo(self, name: str) -> PySparkDataFrame: ...
1
+ # Copyright (c) 2025, Salesforce, Inc.
2
+ # SPDX-License-Identifier: Apache-2
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ from typing import (
19
+ TYPE_CHECKING,
20
+ Final,
21
+ Union,
22
+ )
23
+
24
+ from pyspark.sql.types import (
25
+ BooleanType,
26
+ DoubleType,
27
+ LongType,
28
+ StringType,
29
+ StructField,
30
+ StructType,
31
+ TimestampType,
32
+ )
33
+ from salesforcecdpconnector.connection import SalesforceCDPConnection
34
+
35
+ from datacustomcode.credentials import Credentials
36
+ from datacustomcode.io.reader.base import BaseDataCloudReader
37
+
38
+ if TYPE_CHECKING:
39
+ import pandas
40
+ from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession
41
+ from pyspark.sql.types import AtomicType
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+
46
+ SQL_QUERY_TEMPLATE: Final = "SELECT * FROM {}"
47
+ PANDAS_TYPE_MAPPING = {
48
+ "object": StringType(),
49
+ "int64": LongType(),
50
+ "float64": DoubleType(),
51
+ "datetime64[ns]": TimestampType(),
52
+ "datetime64[ns, UTC]": TimestampType(),
53
+ "bool": BooleanType(),
54
+ }
55
+
56
+
57
+ def _pandas_to_spark_schema(
58
+ pandas_df: pandas.DataFrame, nullable: bool = True
59
+ ) -> StructType:
60
+ fields = []
61
+ for column, dtype in pandas_df.dtypes.items():
62
+ spark_type = PANDAS_TYPE_MAPPING.get(str(dtype), StringType())
63
+ fields.append(StructField(column, spark_type, nullable))
64
+ return StructType(fields)
65
+
66
+
67
+ class QueryAPIDataCloudReader(BaseDataCloudReader):
68
+ """DataCloud reader using Query API.
69
+
70
+ This reader emulates data access within Data Cloud by calling the Query API.
71
+ """
72
+
73
+ CONFIG_NAME = "QueryAPIDataCloudReader"
74
+
75
+ def __init__(self, spark: SparkSession) -> None:
76
+ self.spark = spark
77
+ credentials = Credentials.from_available()
78
+
79
+ self._conn = SalesforceCDPConnection(
80
+ credentials.login_url,
81
+ credentials.username,
82
+ credentials.password,
83
+ credentials.client_id,
84
+ credentials.client_secret,
85
+ )
86
+
87
+ def read_dlo(
88
+ self, name: str, schema: Union[AtomicType, StructType, str, None] = None
89
+ ) -> PySparkDataFrame:
90
+ """
91
+ Read a Data Lake Object (DLO) from the Data Cloud.
92
+
93
+ Args:
94
+ name (str): The name of the DLO.
95
+ schema (Optional[Union[AtomicType, StructType, str]]): Schema of the DLO.
96
+
97
+ Returns:
98
+ PySparkDataFrame: The PySpark DataFrame.
99
+ """
100
+ pandas_df = self._conn.get_pandas_dataframe(SQL_QUERY_TEMPLATE.format(name))
101
+ if not schema:
102
+ # auto infer schema
103
+ schema = _pandas_to_spark_schema(pandas_df)
104
+ spark_dataframe = self.spark.createDataFrame(pandas_df, schema)
105
+ return spark_dataframe
106
+
107
+ def read_dmo(
108
+ self, name: str, schema: Union[AtomicType, StructType, str, None] = None
109
+ ) -> PySparkDataFrame:
110
+ pandas_df = self._conn.get_pandas_dataframe(SQL_QUERY_TEMPLATE.format(name))
111
+ if not schema:
112
+ # auto infer schema
113
+ schema = _pandas_to_spark_schema(pandas_df)
114
+ spark_dataframe = self.spark.createDataFrame(pandas_df, schema)
115
+ return spark_dataframe
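
A short usage sketch for the reader above, assuming a SparkSession and valid credentials are available (the DLO name is a placeholder):

    from pyspark.sql import SparkSession

    from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader

    spark = SparkSession.builder.appName("dlo-preview").getOrCreate()
    reader = QueryAPIDataCloudReader(spark)  # resolves Credentials.from_available()

    # Runs "SELECT * FROM <name>" via the Query API; the Spark schema is
    # inferred from pandas dtypes unless one is passed explicitly.
    df = reader.read_dlo("My_DLO__dll")
    df.printSchema()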