salesforce-data-customcode 4.0.1__tar.gz → 5.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/PKG-INFO +1 -1
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/pyproject.toml +1 -1
- salesforce_data_customcode-5.0.0/src/datacustomcode/__init__.py +47 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/client.py +2 -2
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function/feature_types/chunking.py +89 -51
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function_utils.py +25 -3
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/reader/sf_cli.py +3 -1
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/reader/utils.py +19 -19
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/writer/csv.py +5 -1
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/writer/print.py +8 -3
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/mixin.py +29 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/spark/base.py +1 -1
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/spark/default.py +1 -1
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/chunking/payload/entrypoint.py +1 -3
- salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_llm/config.json +3 -0
- salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_llm/entrypoint.py +99 -0
- salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_llm/files/chunking_prompt.txt +19 -0
- salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_llm/tests/test.json +51 -0
- salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_prediction/config.json +3 -0
- salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_prediction/entrypoint.py +251 -0
- salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_prediction/tests/test.json +40 -0
- salesforce_data_customcode-4.0.1/src/datacustomcode/__init__.py +0 -27
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/LICENSE.txt +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/README.md +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/auth.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/cli.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/cmd.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/common_config.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/config.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/config.yaml +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/constants.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/credentials.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/deploy.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_platform_client.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_platform_config.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_predictions/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_predictions/base.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_predictions/impl/default.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_predictions/types.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_predictions_config.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/file/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/file/base.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/file/path/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/file/path/default.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function/base.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function/feature_types/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function/runtime.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/base.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/reader/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/reader/base.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/reader/query_api.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/writer/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/writer/base.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/base.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/default.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/types/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/types/generate_text_request.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/types/generate_text_request_builder.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/types/generate_text_response.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/types/generate_text_response_builder.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway_config.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/proxy/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/proxy/base.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/proxy/client/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/proxy/client/base.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/py.typed +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/run.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/scan.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/spark/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/template.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/__init__.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/.devcontainer/devcontainer.json +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/Dockerfile.dependencies +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/README.md +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/build_native_dependencies.sh +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/chunking/payload/config.json +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/chunking/requirements.txt +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/payload/config.json +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/payload/entrypoint.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/requirements-dev.txt +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/requirements.txt +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/.devcontainer/devcontainer.json +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/Dockerfile +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/Dockerfile.dependencies +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/README.md +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/account.ipynb +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/build_native_dependencies.sh +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/examples/employee_hierarchy/employee_data.csv +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/examples/employee_hierarchy/entrypoint.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/jupyterlab.sh +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/payload/config.json +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/payload/entrypoint.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/requirements-dev.txt +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/requirements.txt +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/token_provider.py +0 -0
- {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/version.py +0 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Copyright (c) 2025, Salesforce, Inc.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"AuthType",
|
|
18
|
+
"Client",
|
|
19
|
+
"Credentials",
|
|
20
|
+
"PrintDataCloudWriter",
|
|
21
|
+
"QueryAPIDataCloudReader",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Public names resolved lazily on first access, mapped to the submodule that
# defines each one. Keys mirror the names exported through ``__all__``.
_LAZY_IMPORTS = {
    "AuthType": "datacustomcode.credentials",
    "Client": "datacustomcode.client",
    "Credentials": "datacustomcode.credentials",
    "PrintDataCloudWriter": "datacustomcode.io.writer.print",
    "QueryAPIDataCloudReader": "datacustomcode.io.reader.query_api",
}


def __getattr__(name: str):
    """Resolve a lazily exported package attribute (PEP 562 module hook).

    Python calls this hook only when normal module attribute lookup fails, so
    the submodules listed in ``_LAZY_IMPORTS`` are imported on first use
    rather than when the package itself is imported.

    Args:
        name: Attribute being looked up on the package.

    Returns:
        The object called ``name`` taken from the submodule that defines it.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    import importlib

    module_path = _LAZY_IMPORTS.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    value = getattr(importlib.import_module(module_path), name)
    # Cache the resolved object in the module namespace so later lookups
    # succeed through normal attribute access and never reach this hook again.
    globals()[name] = value
    return value
|
{salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/client.py
RENAMED
|
@@ -112,8 +112,8 @@ class Client:
|
|
|
112
112
|
def __new__(
|
|
113
113
|
cls,
|
|
114
114
|
reader: Optional[BaseDataCloudReader] = None,
|
|
115
|
-
writer: Optional[
|
|
116
|
-
spark_provider: Optional[
|
|
115
|
+
writer: Optional[BaseDataCloudWriter] = None,
|
|
116
|
+
spark_provider: Optional[BaseSparkSessionProvider] = None,
|
|
117
117
|
code_type: str = "script",
|
|
118
118
|
) -> Client:
|
|
119
119
|
|
|
@@ -31,8 +31,8 @@ from pydantic import (
|
|
|
31
31
|
)
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
class
|
|
35
|
-
"""
|
|
34
|
+
class ElementType(str, Enum):
|
|
35
|
+
"""Element type enumeration"""
|
|
36
36
|
|
|
37
37
|
TEXT = "text"
|
|
38
38
|
TITLE = "title"
|
|
@@ -50,16 +50,16 @@ class ChunkType(str, Enum):
|
|
|
50
50
|
class SearchIndexChunkingV1PrependField(BaseModel):
|
|
51
51
|
"""Field to prepend to chunk content"""
|
|
52
52
|
|
|
53
|
-
dmo_name: str = Field(
|
|
54
|
-
default=
|
|
53
|
+
dmo_name: Optional[str] = Field(
|
|
54
|
+
default=None, description="Data Model Object name", examples=["udmo_1__dlm"]
|
|
55
55
|
)
|
|
56
|
-
field_name: str = Field(
|
|
57
|
-
default=
|
|
56
|
+
field_name: Optional[str] = Field(
|
|
57
|
+
default=None,
|
|
58
58
|
description="Field name to prepend",
|
|
59
59
|
examples=["ResolvedFilePath__c"],
|
|
60
60
|
)
|
|
61
|
-
value: str = Field(
|
|
62
|
-
default=
|
|
61
|
+
value: Optional[str] = Field(
|
|
62
|
+
default=None,
|
|
63
63
|
description="Field value to prepend",
|
|
64
64
|
examples=["udlo_1__dll:quarterly_report.pdf"],
|
|
65
65
|
)
|
|
@@ -67,65 +67,97 @@ class SearchIndexChunkingV1PrependField(BaseModel):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
class SearchIndexChunkingV1TranscriptField(BaseModel):
|
|
70
|
-
"""
|
|
70
|
+
"""Transcript timing and speaker metadata for audio/video documents"""
|
|
71
71
|
|
|
72
|
-
speaker: str = Field(
|
|
73
|
-
default=
|
|
72
|
+
speaker: Optional[str] = Field(
|
|
73
|
+
default=None,
|
|
74
74
|
description="Speaker name for audio/video transcripts",
|
|
75
75
|
examples=["Agent"],
|
|
76
76
|
)
|
|
77
|
-
start_timestamp:
|
|
78
|
-
default=
|
|
79
|
-
description="Start timestamp
|
|
80
|
-
examples=["
|
|
77
|
+
start_timestamp: Optional[float] = Field(
|
|
78
|
+
default=None,
|
|
79
|
+
description="Start timestamp of the audio/video clip",
|
|
80
|
+
examples=["1.0"],
|
|
81
81
|
)
|
|
82
|
-
end_timestamp:
|
|
83
|
-
default=
|
|
84
|
-
description="End timestamp
|
|
85
|
-
examples=["
|
|
82
|
+
end_timestamp: Optional[float] = Field(
|
|
83
|
+
default=None,
|
|
84
|
+
description="End timestamp of the audio/video clip",
|
|
85
|
+
examples=["8.75"],
|
|
86
86
|
)
|
|
87
87
|
model_config = ConfigDict(extra="ignore")
|
|
88
88
|
|
|
89
89
|
|
|
90
90
|
class SearchIndexChunkingV1Metadata(BaseModel):
|
|
91
|
-
"""Metadata for input documents"""
|
|
91
|
+
"""Metadata for input documents."""
|
|
92
92
|
|
|
93
|
-
type:
|
|
94
|
-
default=
|
|
95
|
-
)
|
|
96
|
-
transcript_fields: SearchIndexChunkingV1TranscriptField = Field(
|
|
97
|
-
default_factory=SearchIndexChunkingV1TranscriptField,
|
|
93
|
+
type: Optional[ElementType] = Field(
|
|
94
|
+
default=ElementType.TEXT,
|
|
98
95
|
description=(
|
|
99
|
-
"
|
|
96
|
+
"Element type of the chunk input. Currently only 'text' is supported."
|
|
100
97
|
),
|
|
98
|
+
examples=["text"],
|
|
101
99
|
)
|
|
102
|
-
page_number: int = Field(
|
|
103
|
-
default=
|
|
104
|
-
description="Page number in the source document
|
|
100
|
+
page_number: Optional[int] = Field(
|
|
101
|
+
default=None,
|
|
102
|
+
description=("Page number in the source document."),
|
|
105
103
|
examples=[1],
|
|
106
104
|
)
|
|
105
|
+
transcript_fields: Optional[SearchIndexChunkingV1TranscriptField] = Field(
|
|
106
|
+
default=None,
|
|
107
|
+
description=(
|
|
108
|
+
"Speaker and timestamp metadata for audio/video transcripts. "
|
|
109
|
+
"Optional — only present when the source document is a transcript."
|
|
110
|
+
),
|
|
111
|
+
)
|
|
107
112
|
text_as_html: Optional[str] = Field(
|
|
108
113
|
default=None,
|
|
109
|
-
description="
|
|
114
|
+
description=("Table represented as HTML"),
|
|
110
115
|
examples=["<p>Online Remittance Instructions</p>"],
|
|
111
116
|
)
|
|
112
|
-
source_dmo_fields: Dict[str, Union[str, int]] = Field(
|
|
113
|
-
|
|
117
|
+
source_dmo_fields: Optional[Dict[str, Union[str, int, float]]] = Field(
|
|
118
|
+
default=None,
|
|
114
119
|
description=(
|
|
115
|
-
"Source Data Model Object fields as key-value pairs "
|
|
116
|
-
"
|
|
120
|
+
"Source Data Model Object fields as key-value pairs. "
|
|
121
|
+
"Values can be string, int, or float."
|
|
117
122
|
),
|
|
118
123
|
examples=[
|
|
119
124
|
{
|
|
120
125
|
"FilePath__c": "quarterly_report.pdf",
|
|
121
|
-
"Size__c": 1377454,
|
|
126
|
+
"Size__c": 1377454.0,
|
|
122
127
|
"ContentType__c": "pdf",
|
|
123
128
|
"LastModified__c": "2026-03-25T02:01:24.918000",
|
|
124
129
|
}
|
|
125
130
|
],
|
|
126
131
|
)
|
|
127
|
-
prepend: List[SearchIndexChunkingV1PrependField] = Field(
|
|
128
|
-
|
|
132
|
+
prepend: Optional[List[SearchIndexChunkingV1PrependField]] = Field(
|
|
133
|
+
default=None,
|
|
134
|
+
description=(
|
|
135
|
+
"List of DMO fields whose values are prepended to the chunk "
|
|
136
|
+
"text before indexing"
|
|
137
|
+
),
|
|
138
|
+
)
|
|
139
|
+
image_base64: Optional[str] = Field(
|
|
140
|
+
default=None,
|
|
141
|
+
description=(
|
|
142
|
+
"Base64-encoded image data associated with this chunk. "
|
|
143
|
+
"Optional — only applicable for image-type document elements."
|
|
144
|
+
),
|
|
145
|
+
)
|
|
146
|
+
image_mime_type: Optional[str] = Field(
|
|
147
|
+
default=None,
|
|
148
|
+
description=(
|
|
149
|
+
"MIME type of the associated image (e.g., 'image/png', 'image/jpeg'). "
|
|
150
|
+
"Optional — should be provided alongside image_base64 when present."
|
|
151
|
+
),
|
|
152
|
+
examples=["image/png", "image/jpeg"],
|
|
153
|
+
)
|
|
154
|
+
image_type: Optional[str] = Field(
|
|
155
|
+
default=None,
|
|
156
|
+
description=(
|
|
157
|
+
"Semantic category of the image content"
|
|
158
|
+
"(e.g., 'diagram', 'screenshot', 'chart'). Optional."
|
|
159
|
+
),
|
|
160
|
+
examples=["diagram", "screenshot"],
|
|
129
161
|
)
|
|
130
162
|
model_config = ConfigDict(extra="ignore")
|
|
131
163
|
|
|
@@ -143,9 +175,12 @@ class SearchIndexChunkingV1DocElement(BaseModel):
|
|
|
143
175
|
)
|
|
144
176
|
],
|
|
145
177
|
)
|
|
146
|
-
metadata: SearchIndexChunkingV1Metadata = Field(
|
|
147
|
-
|
|
148
|
-
description=
|
|
178
|
+
metadata: Optional[SearchIndexChunkingV1Metadata] = Field(
|
|
179
|
+
default=None,
|
|
180
|
+
description=(
|
|
181
|
+
"Source document metadata. Optional — may be absent if no "
|
|
182
|
+
"metadata is available for the document element."
|
|
183
|
+
),
|
|
149
184
|
)
|
|
150
185
|
model_config = ConfigDict(extra="ignore")
|
|
151
186
|
|
|
@@ -159,21 +194,25 @@ class SearchIndexChunkingV1Output(BaseModel):
|
|
|
159
194
|
examples=["Online Remittance Instructions"],
|
|
160
195
|
)
|
|
161
196
|
seq_no: int = Field(
|
|
162
|
-
default=
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
197
|
+
default=1,
|
|
198
|
+
description=(
|
|
199
|
+
"Sequential order of this chunk within the output "
|
|
200
|
+
"Represents chunk ordering within the source document (1-based)."
|
|
201
|
+
),
|
|
202
|
+
ge=1,
|
|
203
|
+
examples=[1],
|
|
168
204
|
)
|
|
169
205
|
chunk_type: ChunkType = Field(
|
|
170
206
|
default=ChunkType.TEXT,
|
|
171
|
-
description="Type of chunk
|
|
207
|
+
description="Type of chunk. Fixed value — always 'text'.",
|
|
172
208
|
examples=["text"],
|
|
173
209
|
)
|
|
174
|
-
citations: Dict[str, str] = Field(
|
|
175
|
-
|
|
176
|
-
description=
|
|
210
|
+
citations: Optional[Dict[str, str]] = Field(
|
|
211
|
+
default=None,
|
|
212
|
+
description=(
|
|
213
|
+
"Citation metadata associated with this chunk as key-value "
|
|
214
|
+
"pairs. Optional — defaults to None if no citations are present."
|
|
215
|
+
),
|
|
177
216
|
examples=[{"source": "quarterly_report.pdf"}],
|
|
178
217
|
)
|
|
179
218
|
model_config = ConfigDict(extra="ignore")
|
|
@@ -194,4 +233,3 @@ class SearchIndexChunkingV1Response(BaseModel):
|
|
|
194
233
|
output: List[SearchIndexChunkingV1Output] = Field(
|
|
195
234
|
default_factory=list, description="Flat list of chunks from all docs"
|
|
196
235
|
)
|
|
197
|
-
model_config = ConfigDict(extra="ignore")
|
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
"""Utilities for inspecting and working with function entrypoints."""
|
|
17
17
|
|
|
18
18
|
import ast
|
|
19
|
+
from enum import Enum
|
|
19
20
|
import importlib.util
|
|
20
21
|
import inspect
|
|
21
22
|
import json
|
|
@@ -278,11 +279,17 @@ def _generate_model_sample_data(model_type):
|
|
|
278
279
|
# Use examples if available
|
|
279
280
|
if field_info.examples and len(field_info.examples) > 0:
|
|
280
281
|
sample_data[field_name] = field_info.examples[0]
|
|
281
|
-
#
|
|
282
|
-
elif
|
|
282
|
+
# If field has a non-None, non-empty default value, use it
|
|
283
|
+
elif (
|
|
284
|
+
field_info.default is not PydanticUndefined
|
|
285
|
+
and field_info.default is not None
|
|
286
|
+
and field_info.default != []
|
|
287
|
+
and field_info.default != {}
|
|
288
|
+
):
|
|
283
289
|
sample_data[field_name] = field_info.default
|
|
290
|
+
# For all other fields (including default_factory, None defaults,
|
|
291
|
+
# empty defaults), generate sample data
|
|
284
292
|
else:
|
|
285
|
-
# Required field or field without default - generate sample
|
|
286
293
|
sample_data[field_name] = generate_sample_value(
|
|
287
294
|
field_info.annotation, field_name
|
|
288
295
|
)
|
|
@@ -301,6 +308,17 @@ def generate_sample_value(field_type, field_name: str):
|
|
|
301
308
|
"""
|
|
302
309
|
origin = typing.get_origin(field_type)
|
|
303
310
|
|
|
311
|
+
# Handle Optional[T] (Union[T, None]) by unwrapping to T
|
|
312
|
+
if origin is typing.Union:
|
|
313
|
+
non_none_args = [
|
|
314
|
+
arg for arg in typing.get_args(field_type) if arg is not type(None)
|
|
315
|
+
]
|
|
316
|
+
return (
|
|
317
|
+
generate_sample_value(non_none_args[0], field_name)
|
|
318
|
+
if non_none_args
|
|
319
|
+
else None
|
|
320
|
+
)
|
|
321
|
+
|
|
304
322
|
if origin is list or field_type is list:
|
|
305
323
|
args = typing.get_args(field_type)
|
|
306
324
|
if args:
|
|
@@ -320,6 +338,10 @@ def generate_sample_value(field_type, field_name: str):
|
|
|
320
338
|
return 1.0
|
|
321
339
|
elif field_type is bool:
|
|
322
340
|
return True
|
|
341
|
+
# Handle Enum types
|
|
342
|
+
elif isinstance(field_type, type) and issubclass(field_type, Enum):
|
|
343
|
+
# Return the first enum value
|
|
344
|
+
return next(iter(field_type)).value
|
|
323
345
|
elif hasattr(field_type, "model_fields"):
|
|
324
346
|
# Nested Pydantic model - use shared helper
|
|
325
347
|
return _generate_model_sample_data(field_type)
|
|
@@ -23,7 +23,6 @@ from typing import (
|
|
|
23
23
|
Union,
|
|
24
24
|
)
|
|
25
25
|
|
|
26
|
-
import pandas as pd
|
|
27
26
|
import requests
|
|
28
27
|
|
|
29
28
|
from datacustomcode.io.reader.base import BaseDataCloudReader
|
|
@@ -31,6 +30,7 @@ from datacustomcode.io.reader.utils import _pandas_to_spark_schema
|
|
|
31
30
|
from datacustomcode.token_provider import SFCLITokenProvider
|
|
32
31
|
|
|
33
32
|
if TYPE_CHECKING:
|
|
33
|
+
import pandas as pd
|
|
34
34
|
from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession
|
|
35
35
|
from pyspark.sql.types import AtomicType, StructType
|
|
36
36
|
|
|
@@ -97,6 +97,8 @@ class SFCLIDataCloudReader(BaseDataCloudReader):
|
|
|
97
97
|
Raises:
|
|
98
98
|
RuntimeError: On HTTP errors or unexpected response shapes.
|
|
99
99
|
"""
|
|
100
|
+
import pandas as pd
|
|
101
|
+
|
|
100
102
|
access_token, instance_url = self._get_token()
|
|
101
103
|
|
|
102
104
|
url = f"{instance_url}/services/data/{API_VERSION}/ssot/query-sql"
|
|
@@ -16,32 +16,32 @@ from __future__ import annotations
|
|
|
16
16
|
|
|
17
17
|
from typing import TYPE_CHECKING
|
|
18
18
|
|
|
19
|
-
import pandas.api.types as pd_types
|
|
20
|
-
from pyspark.sql.types import (
|
|
21
|
-
BooleanType,
|
|
22
|
-
DoubleType,
|
|
23
|
-
LongType,
|
|
24
|
-
StringType,
|
|
25
|
-
StructField,
|
|
26
|
-
StructType,
|
|
27
|
-
TimestampType,
|
|
28
|
-
)
|
|
29
|
-
|
|
30
19
|
if TYPE_CHECKING:
|
|
31
20
|
import pandas
|
|
32
|
-
from pyspark.sql.types import AtomicType
|
|
33
|
-
|
|
34
|
-
PANDAS_TYPE_MAPPING = {
|
|
35
|
-
"object": StringType(),
|
|
36
|
-
"int64": LongType(),
|
|
37
|
-
"float64": DoubleType(),
|
|
38
|
-
"bool": BooleanType(),
|
|
39
|
-
}
|
|
21
|
+
from pyspark.sql.types import AtomicType, StructType
|
|
40
22
|
|
|
41
23
|
|
|
42
24
|
def _pandas_to_spark_schema(
|
|
43
25
|
pandas_df: pandas.DataFrame, nullable: bool = True
|
|
44
26
|
) -> StructType:
|
|
27
|
+
import pandas.api.types as pd_types
|
|
28
|
+
from pyspark.sql.types import (
|
|
29
|
+
BooleanType,
|
|
30
|
+
DoubleType,
|
|
31
|
+
LongType,
|
|
32
|
+
StringType,
|
|
33
|
+
StructField,
|
|
34
|
+
StructType,
|
|
35
|
+
TimestampType,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
PANDAS_TYPE_MAPPING = {
|
|
39
|
+
"object": StringType(),
|
|
40
|
+
"int64": LongType(),
|
|
41
|
+
"float64": DoubleType(),
|
|
42
|
+
"bool": BooleanType(),
|
|
43
|
+
}
|
|
44
|
+
|
|
45
45
|
fields = []
|
|
46
46
|
for column, dtype in pandas_df.dtypes.items():
|
|
47
47
|
spark_type: AtomicType
|
|
@@ -13,8 +13,12 @@
|
|
|
13
13
|
# See the License for the specific language governing permissions and
|
|
14
14
|
# limitations under the License.
|
|
15
15
|
|
|
16
|
+
from __future__ import annotations
|
|
16
17
|
|
|
17
|
-
from
|
|
18
|
+
from typing import TYPE_CHECKING
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from pyspark.sql import DataFrame as PySparkDataFrame
|
|
18
22
|
|
|
19
23
|
from datacustomcode.io.writer.base import BaseDataCloudWriter, WriteMode
|
|
20
24
|
|
|
@@ -13,12 +13,15 @@
|
|
|
13
13
|
# See the License for the specific language governing permissions and
|
|
14
14
|
# limitations under the License.
|
|
15
15
|
|
|
16
|
+
from __future__ import annotations
|
|
16
17
|
|
|
17
|
-
from typing import Optional
|
|
18
|
+
from typing import TYPE_CHECKING, Optional
|
|
18
19
|
|
|
19
|
-
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession
|
|
22
|
+
|
|
23
|
+
from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader
|
|
20
24
|
|
|
21
|
-
from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader
|
|
22
25
|
from datacustomcode.io.writer.base import BaseDataCloudWriter, WriteMode
|
|
23
26
|
|
|
24
27
|
|
|
@@ -61,6 +64,8 @@ class PrintDataCloudWriter(BaseDataCloudWriter):
|
|
|
61
64
|
sf_cli_org: Optional SF CLI org alias or username. If provided,
|
|
62
65
|
credentials are fetched via `sf org display`.
|
|
63
66
|
"""
|
|
67
|
+
from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader
|
|
68
|
+
|
|
64
69
|
super().__init__(spark)
|
|
65
70
|
if reader is None:
|
|
66
71
|
self.reader = QueryAPIDataCloudReader(
|
{salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/mixin.py
RENAMED
|
@@ -72,6 +72,35 @@ class UserExtendableNamedConfigMixin:
|
|
|
72
72
|
Args:
|
|
73
73
|
config_name: should match a subclass's ``CONFIG_NAME``.
|
|
74
74
|
"""
|
|
75
|
+
# First, check if already registered (from __init_subclass__)
|
|
76
|
+
if config_name in UserExtendableNamedConfigMixin._registered_config_names:
|
|
77
|
+
candidate = UserExtendableNamedConfigMixin._registered_config_names[
|
|
78
|
+
config_name
|
|
79
|
+
]
|
|
80
|
+
# Verify it's actually a subclass of cls (respects hierarchy)
|
|
81
|
+
if candidate is cls or issubclass(candidate, cls):
|
|
82
|
+
return candidate
|
|
83
|
+
|
|
84
|
+
# If not found, try to trigger lazy import via __getattr__
|
|
85
|
+
# This handles the case where subclasses use lazy loading
|
|
86
|
+
try:
|
|
87
|
+
import datacustomcode
|
|
88
|
+
|
|
89
|
+
# Attempt to trigger __getattr__ by accessing the name
|
|
90
|
+
getattr(datacustomcode, config_name, None)
|
|
91
|
+
except (ImportError, AttributeError):
|
|
92
|
+
pass
|
|
93
|
+
|
|
94
|
+
# Check again after potential lazy import
|
|
95
|
+
if config_name in UserExtendableNamedConfigMixin._registered_config_names:
|
|
96
|
+
candidate = UserExtendableNamedConfigMixin._registered_config_names[
|
|
97
|
+
config_name
|
|
98
|
+
]
|
|
99
|
+
# Verify it's actually a subclass of cls (respects hierarchy)
|
|
100
|
+
if candidate is cls or issubclass(candidate, cls):
|
|
101
|
+
return candidate
|
|
102
|
+
|
|
103
|
+
# Fallback to dynamic lookup (for user-added subclasses)
|
|
75
104
|
subclass_config_name_map = {}
|
|
76
105
|
for type_ in _get_all_subclass_descendants(cls):
|
|
77
106
|
if name := getattr(type_, "CONFIG_NAME", ""):
|
|
@@ -27,7 +27,7 @@ if TYPE_CHECKING:
|
|
|
27
27
|
class DefaultSparkSessionProvider(BaseSparkSessionProvider):
|
|
28
28
|
CONFIG_NAME = "DefaultSparkSessionProvider"
|
|
29
29
|
|
|
30
|
-
def get_session(self, spark_config: SparkConfig) ->
|
|
30
|
+
def get_session(self, spark_config: SparkConfig) -> SparkSession:
|
|
31
31
|
from pyspark.sql import SparkSession
|
|
32
32
|
|
|
33
33
|
builder = SparkSession.builder
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import uuid
|
|
3
2
|
|
|
4
3
|
from datacustomcode.function import Runtime
|
|
5
4
|
from datacustomcode.function.feature_types.chunking import (
|
|
@@ -124,12 +123,11 @@ def function(
|
|
|
124
123
|
for chunk_text in text_chunks:
|
|
125
124
|
# Create citations from source_dmo_fields if available
|
|
126
125
|
citations = {}
|
|
127
|
-
if metadata.source_dmo_fields:
|
|
126
|
+
if metadata and metadata.source_dmo_fields:
|
|
128
127
|
for key, value in metadata.source_dmo_fields.items():
|
|
129
128
|
citations[key] = str(value)
|
|
130
129
|
|
|
131
130
|
chunk_output = SearchIndexChunkingV1Output(
|
|
132
|
-
chunk_id=str(uuid.uuid4()),
|
|
133
131
|
chunk_type=ChunkType.TEXT,
|
|
134
132
|
text=chunk_text.strip(),
|
|
135
133
|
seq_no=seq_no,
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Sample Search Index Chunking Customer Function
|
|
4
|
+
|
|
5
|
+
This function demonstrates the new signature-based invocation with Pydantic models:
|
|
6
|
+
- Uses SearchIndexChunkingV1Request/Response (Pydantic models)
|
|
7
|
+
- Requires Runtime parameter (for agentic capabilities)
|
|
8
|
+
- Type-safe with direct field access (no wrappers)
|
|
9
|
+
- Automatic validation and conversion
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
|
|
14
|
+
from datacustomcode.function.feature_types.chunking import (
|
|
15
|
+
ChunkType,
|
|
16
|
+
SearchIndexChunkingV1Output,
|
|
17
|
+
SearchIndexChunkingV1Request,
|
|
18
|
+
SearchIndexChunkingV1Response,
|
|
19
|
+
)
|
|
20
|
+
from datacustomcode.function.runtime import Runtime
|
|
21
|
+
from datacustomcode.llm_gateway.types.generate_text_request_builder import (
|
|
22
|
+
GenerateTextRequestBuilder,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
26
|
+
# Apply basic root-logger configuration at INFO level when the module is imported.
logging.basicConfig(level=logging.INFO)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Text of the chunking prompt template; None until the first successful load.
_prompt_template_cache: "str | None" = None


def _load_prompt_template(runtime: Runtime) -> str:
    """Return the chunking prompt template, reading it from disk only once.

    The first call resolves ``chunking_prompt.txt`` through
    ``runtime.file.find_file_path``, reads it as UTF-8 and stores the text in
    the module-level ``_prompt_template_cache``. Later calls return the stored
    text without touching ``runtime`` or the filesystem again.

    Args:
        runtime: Runtime whose ``file.find_file_path`` locates the template.
            Only consulted while the cache is still empty.

    Returns:
        The full contents of the prompt template file.
    """
    global _prompt_template_cache

    # Compare against None (not truthiness) so an empty template is cached too.
    if _prompt_template_cache is None:
        prompt_file = runtime.file.find_file_path("chunking_prompt.txt")
        # Explicit encoding keeps decoding independent of the host locale.
        with open(prompt_file, "r", encoding="utf-8") as f:
            _prompt_template_cache = f.read()
        logger.info(f"Loaded prompt template from {prompt_file}")

    return _prompt_template_cache
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def function(
    request: SearchIndexChunkingV1Request, runtime: Runtime
) -> SearchIndexChunkingV1Response:
    """
    Split every input document into chunks with the help of the LLM gateway.

    Each document's text is substituted into the chunking prompt template and
    sent to the LLM. The reply is split on the ``---CHUNK---`` delimiter and
    every non-empty piece becomes one TEXT chunk with empty citations.
    Sequence numbers start at 1 and keep increasing across documents.

    Args:
        request: SearchIndexChunkingV1Request with input documents
        runtime: Runtime instance used to locate the prompt template and to
            call the LLM gateway

    Returns:
        SearchIndexChunkingV1Response holding all chunks in document order

    Raises:
        RuntimeError: If the LLM call for any document is not successful
    """
    logger.info(f"Received {len(request.input)} documents to chunk")

    # Fetch the template before the loop; it is the same for every document.
    template = _load_prompt_template(runtime)

    collected = []

    for position, document in enumerate(request.input, start=1):
        # Fields are read straight off the Pydantic model.
        prompt = template.format(text=document.text)

        # Ask the LLM for semantic chunks that keep context and meaning intact.
        request_builder = GenerateTextRequestBuilder()
        request_builder = request_builder.set_model("sfdc_ai__DefaultGPT4Turbo")
        request_builder = request_builder.set_prompt(prompt)
        llm_request = request_builder.build()
        result = runtime.llm_gateway.generate_text(llm_request)

        if not result.is_success:
            # A failed LLM call aborts the whole request: log, then raise.
            failure = f"LLM chunking failed for document {position}: {result.error_code}"
            logger.error(failure)
            raise RuntimeError(failure)

        # Split the reply on the delimiter and drop whitespace-only pieces.
        trimmed = [piece.strip() for piece in result.text.split("---CHUNK---")]
        for piece in trimmed:
            if not piece:
                continue
            collected.append(
                SearchIndexChunkingV1Output(
                    text=piece,
                    # One-based running number across all documents.
                    seq_no=len(collected) + 1,
                    chunk_type=ChunkType.TEXT,
                    citations={},
                )
            )

    # Wrap the collected chunks in the Pydantic response model.
    return SearchIndexChunkingV1Response(output=collected)
|