salesforce-data-customcode 4.0.1__tar.gz → 5.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/PKG-INFO +1 -1
  2. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/pyproject.toml +1 -1
  3. salesforce_data_customcode-5.0.0/src/datacustomcode/__init__.py +47 -0
  4. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/client.py +2 -2
  5. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function/feature_types/chunking.py +89 -51
  6. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function_utils.py +25 -3
  7. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/reader/sf_cli.py +3 -1
  8. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/reader/utils.py +19 -19
  9. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/writer/csv.py +5 -1
  10. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/writer/print.py +8 -3
  11. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/mixin.py +29 -0
  12. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/spark/base.py +1 -1
  13. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/spark/default.py +1 -1
  14. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/chunking/payload/entrypoint.py +1 -3
  15. salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_llm/config.json +3 -0
  16. salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_llm/entrypoint.py +99 -0
  17. salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_llm/files/chunking_prompt.txt +19 -0
  18. salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_llm/tests/test.json +51 -0
  19. salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_prediction/config.json +3 -0
  20. salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_prediction/entrypoint.py +251 -0
  21. salesforce_data_customcode-5.0.0/src/datacustomcode/templates/function/example/chunking_with_prediction/tests/test.json +40 -0
  22. salesforce_data_customcode-4.0.1/src/datacustomcode/__init__.py +0 -27
  23. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/LICENSE.txt +0 -0
  24. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/README.md +0 -0
  25. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/auth.py +0 -0
  26. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/cli.py +0 -0
  27. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/cmd.py +0 -0
  28. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/common_config.py +0 -0
  29. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/config.py +0 -0
  30. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/config.yaml +0 -0
  31. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/constants.py +0 -0
  32. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/credentials.py +0 -0
  33. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/deploy.py +0 -0
  34. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_platform_client.py +0 -0
  35. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_platform_config.py +0 -0
  36. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_predictions/__init__.py +0 -0
  37. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_predictions/base.py +0 -0
  38. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_predictions/impl/default.py +0 -0
  39. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_predictions/types.py +0 -0
  40. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/einstein_predictions_config.py +0 -0
  41. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/file/__init__.py +0 -0
  42. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/file/base.py +0 -0
  43. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/file/path/__init__.py +0 -0
  44. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/file/path/default.py +0 -0
  45. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function/__init__.py +0 -0
  46. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function/base.py +0 -0
  47. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function/feature_types/__init__.py +0 -0
  48. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/function/runtime.py +0 -0
  49. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/__init__.py +0 -0
  50. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/base.py +0 -0
  51. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/reader/__init__.py +0 -0
  52. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/reader/base.py +0 -0
  53. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/reader/query_api.py +0 -0
  54. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/writer/__init__.py +0 -0
  55. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/io/writer/base.py +0 -0
  56. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/__init__.py +0 -0
  57. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/base.py +0 -0
  58. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/default.py +0 -0
  59. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/types/__init__.py +0 -0
  60. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/types/generate_text_request.py +0 -0
  61. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/types/generate_text_request_builder.py +0 -0
  62. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/types/generate_text_response.py +0 -0
  63. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway/types/generate_text_response_builder.py +0 -0
  64. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/llm_gateway_config.py +0 -0
  65. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/proxy/__init__.py +0 -0
  66. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/proxy/base.py +0 -0
  67. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/proxy/client/__init__.py +0 -0
  68. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/proxy/client/base.py +0 -0
  69. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/py.typed +0 -0
  70. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/run.py +0 -0
  71. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/scan.py +0 -0
  72. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/spark/__init__.py +0 -0
  73. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/template.py +0 -0
  74. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/__init__.py +0 -0
  75. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/.devcontainer/devcontainer.json +0 -0
  76. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/Dockerfile.dependencies +0 -0
  77. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/README.md +0 -0
  78. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/build_native_dependencies.sh +0 -0
  79. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/chunking/payload/config.json +0 -0
  80. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/chunking/requirements.txt +0 -0
  81. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/payload/config.json +0 -0
  82. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/payload/entrypoint.py +0 -0
  83. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/requirements-dev.txt +0 -0
  84. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/function/requirements.txt +0 -0
  85. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/.devcontainer/devcontainer.json +0 -0
  86. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/Dockerfile +0 -0
  87. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/Dockerfile.dependencies +0 -0
  88. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/README.md +0 -0
  89. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/account.ipynb +0 -0
  90. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/build_native_dependencies.sh +0 -0
  91. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/examples/employee_hierarchy/employee_data.csv +0 -0
  92. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/examples/employee_hierarchy/entrypoint.py +0 -0
  93. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/jupyterlab.sh +0 -0
  94. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/payload/config.json +0 -0
  95. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/payload/entrypoint.py +0 -0
  96. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/requirements-dev.txt +0 -0
  97. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/templates/script/requirements.txt +0 -0
  98. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/token_provider.py +0 -0
  99. {salesforce_data_customcode-4.0.1 → salesforce_data_customcode-5.0.0}/src/datacustomcode/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: salesforce-data-customcode
3
- Version: 4.0.1
3
+ Version: 5.0.0
4
4
  Summary: Data Cloud Custom Code SDK
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE.txt
@@ -18,7 +18,7 @@ license = "Apache-2.0"
18
18
  name = "salesforce-data-customcode"
19
19
  readme = "README.md"
20
20
  requires-python = ">=3.10,<3.12"
21
- version = "4.0.1"
21
+ version = "5.0.0"
22
22
 
23
23
  [tool.black]
24
24
  exclude = '''
@@ -0,0 +1,47 @@
1
+ # Copyright (c) 2025, Salesforce, Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __all__ = [
17
+ "AuthType",
18
+ "Client",
19
+ "Credentials",
20
+ "PrintDataCloudWriter",
21
+ "QueryAPIDataCloudReader",
22
+ ]
23
+
24
+
25
def __getattr__(name: str):
    """Lazily resolve a public package attribute on first access (PEP 562).

    The heavy submodules are imported only when one of the exported names
    is actually requested, which keeps ``import datacustomcode`` cheap.

    Args:
        name: Attribute being looked up on the package.

    Returns:
        The object called ``name`` taken from the submodule that defines it.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    # Function-scope import so the package module gains no new eager
    # top-level dependency.
    from importlib import import_module

    # Exported name -> dotted path of the module that defines it.
    lazy_exports = {
        "AuthType": "datacustomcode.credentials",
        "Client": "datacustomcode.client",
        "Credentials": "datacustomcode.credentials",
        "PrintDataCloudWriter": "datacustomcode.io.writer.print",
        "QueryAPIDataCloudReader": "datacustomcode.io.reader.query_api",
    }

    module_path = lazy_exports.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    value = getattr(import_module(module_path), name)
    # Store the resolved object in the module namespace: later lookups find
    # it through normal attribute access and never reach this hook again.
    globals()[name] = value
    return value
@@ -112,8 +112,8 @@ class Client:
112
112
  def __new__(
113
113
  cls,
114
114
  reader: Optional[BaseDataCloudReader] = None,
115
- writer: Optional["BaseDataCloudWriter"] = None,
116
- spark_provider: Optional["BaseSparkSessionProvider"] = None,
115
+ writer: Optional[BaseDataCloudWriter] = None,
116
+ spark_provider: Optional[BaseSparkSessionProvider] = None,
117
117
  code_type: str = "script",
118
118
  ) -> Client:
119
119
 
@@ -31,8 +31,8 @@ from pydantic import (
31
31
  )
32
32
 
33
33
 
34
- class DocumentType(str, Enum):
35
- """Document type enumeration"""
34
+ class ElementType(str, Enum):
35
+ """Element type enumeration"""
36
36
 
37
37
  TEXT = "text"
38
38
  TITLE = "title"
@@ -50,16 +50,16 @@ class ChunkType(str, Enum):
50
50
  class SearchIndexChunkingV1PrependField(BaseModel):
51
51
  """Field to prepend to chunk content"""
52
52
 
53
- dmo_name: str = Field(
54
- default="", description="Data Model Object name", examples=["udmo_1__dlm"]
53
+ dmo_name: Optional[str] = Field(
54
+ default=None, description="Data Model Object name", examples=["udmo_1__dlm"]
55
55
  )
56
- field_name: str = Field(
57
- default="",
56
+ field_name: Optional[str] = Field(
57
+ default=None,
58
58
  description="Field name to prepend",
59
59
  examples=["ResolvedFilePath__c"],
60
60
  )
61
- value: str = Field(
62
- default="",
61
+ value: Optional[str] = Field(
62
+ default=None,
63
63
  description="Field value to prepend",
64
64
  examples=["udlo_1__dll:quarterly_report.pdf"],
65
65
  )
@@ -67,65 +67,97 @@ class SearchIndexChunkingV1PrependField(BaseModel):
67
67
 
68
68
 
69
69
  class SearchIndexChunkingV1TranscriptField(BaseModel):
70
- """Field to prepend to chunk content"""
70
+ """Transcript timing and speaker metadata for audio/video documents"""
71
71
 
72
- speaker: str = Field(
73
- default="",
72
+ speaker: Optional[str] = Field(
73
+ default=None,
74
74
  description="Speaker name for audio/video transcripts",
75
75
  examples=["Agent"],
76
76
  )
77
- start_timestamp: str = Field(
78
- default="",
79
- description="Start timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
80
- examples=["2026-03-25T02:01:24.918000"],
77
+ start_timestamp: Optional[float] = Field(
78
+ default=None,
79
+ description="Start timestamp of the audio/video clip",
80
+ examples=["1.0"],
81
81
  )
82
- end_timestamp: str = Field(
83
- default="",
84
- description="End timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
85
- examples=["2026-03-25T02:01:30.500000"],
82
+ end_timestamp: Optional[float] = Field(
83
+ default=None,
84
+ description="End timestamp of the audio/video clip",
85
+ examples=["8.75"],
86
86
  )
87
87
  model_config = ConfigDict(extra="ignore")
88
88
 
89
89
 
90
90
  class SearchIndexChunkingV1Metadata(BaseModel):
91
- """Metadata for input documents"""
91
+ """Metadata for input documents."""
92
92
 
93
- type: DocumentType = Field(
94
- default=DocumentType.TEXT, description="Document type (text)", examples=["text"]
95
- )
96
- transcript_fields: SearchIndexChunkingV1TranscriptField = Field(
97
- default_factory=SearchIndexChunkingV1TranscriptField,
93
+ type: Optional[ElementType] = Field(
94
+ default=ElementType.TEXT,
98
95
  description=(
99
- "Transcript information. Will only be there in case of audio-video files"
96
+ "Element type of the chunk input. Currently only 'text' is supported."
100
97
  ),
98
+ examples=["text"],
101
99
  )
102
- page_number: int = Field(
103
- default=0,
104
- description="Page number in the source document (0-based)",
100
+ page_number: Optional[int] = Field(
101
+ default=None,
102
+ description=("Page number in the source document."),
105
103
  examples=[1],
106
104
  )
105
+ transcript_fields: Optional[SearchIndexChunkingV1TranscriptField] = Field(
106
+ default=None,
107
+ description=(
108
+ "Speaker and timestamp metadata for audio/video transcripts. "
109
+ "Optional — only present when the source document is a transcript."
110
+ ),
111
+ )
107
112
  text_as_html: Optional[str] = Field(
108
113
  default=None,
109
- description="HTML representation of the document text",
114
+ description=("Table represented as HTML"),
110
115
  examples=["<p>Online Remittance Instructions</p>"],
111
116
  )
112
- source_dmo_fields: Dict[str, Union[str, int]] = Field(
113
- default_factory=dict,
117
+ source_dmo_fields: Optional[Dict[str, Union[str, int, float]]] = Field(
118
+ default=None,
114
119
  description=(
115
- "Source Data Model Object fields as key-value pairs "
116
- "(values can be string or int)"
120
+ "Source Data Model Object fields as key-value pairs. "
121
+ "Values can be string, int, or float."
117
122
  ),
118
123
  examples=[
119
124
  {
120
125
  "FilePath__c": "quarterly_report.pdf",
121
- "Size__c": 1377454,
126
+ "Size__c": 1377454.0,
122
127
  "ContentType__c": "pdf",
123
128
  "LastModified__c": "2026-03-25T02:01:24.918000",
124
129
  }
125
130
  ],
126
131
  )
127
- prepend: List[SearchIndexChunkingV1PrependField] = Field(
128
- default_factory=list, description="List of fields to prepend to each chunk"
132
+ prepend: Optional[List[SearchIndexChunkingV1PrependField]] = Field(
133
+ default=None,
134
+ description=(
135
+ "List of DMO fields whose values are prepended to the chunk "
136
+ "text before indexing"
137
+ ),
138
+ )
139
+ image_base64: Optional[str] = Field(
140
+ default=None,
141
+ description=(
142
+ "Base64-encoded image data associated with this chunk. "
143
+ "Optional — only applicable for image-type document elements."
144
+ ),
145
+ )
146
+ image_mime_type: Optional[str] = Field(
147
+ default=None,
148
+ description=(
149
+ "MIME type of the associated image (e.g., 'image/png', 'image/jpeg'). "
150
+ "Optional — should be provided alongside image_base64 when present."
151
+ ),
152
+ examples=["image/png", "image/jpeg"],
153
+ )
154
+ image_type: Optional[str] = Field(
155
+ default=None,
156
+ description=(
157
+ "Semantic category of the image content"
158
+ "(e.g., 'diagram', 'screenshot', 'chart'). Optional."
159
+ ),
160
+ examples=["diagram", "screenshot"],
129
161
  )
130
162
  model_config = ConfigDict(extra="ignore")
131
163
 
@@ -143,9 +175,12 @@ class SearchIndexChunkingV1DocElement(BaseModel):
143
175
  )
144
176
  ],
145
177
  )
146
- metadata: SearchIndexChunkingV1Metadata = Field(
147
- default_factory=SearchIndexChunkingV1Metadata,
148
- description="Source document metadata",
178
+ metadata: Optional[SearchIndexChunkingV1Metadata] = Field(
179
+ default=None,
180
+ description=(
181
+ "Source document metadata. Optional — may be absent if no "
182
+ "metadata is available for the document element."
183
+ ),
149
184
  )
150
185
  model_config = ConfigDict(extra="ignore")
151
186
 
@@ -159,21 +194,25 @@ class SearchIndexChunkingV1Output(BaseModel):
159
194
  examples=["Online Remittance Instructions"],
160
195
  )
161
196
  seq_no: int = Field(
162
- default=0, description="Sequential chunk number (1-based)", ge=1, examples=[1]
163
- )
164
- chunk_id: str = Field(
165
- default="",
166
- description="Unique identifier for this chunk (UUID format)",
167
- examples=["550e8400-e29b-41d4-a716-446655440000"],
197
+ default=1,
198
+ description=(
199
+ "Sequential order of this chunk within the output "
200
+ "Represents chunk ordering within the source document (1-based)."
201
+ ),
202
+ ge=1,
203
+ examples=[1],
168
204
  )
169
205
  chunk_type: ChunkType = Field(
170
206
  default=ChunkType.TEXT,
171
- description="Type of chunk (e.g., 'text')",
207
+ description="Type of chunk. Fixed value — always 'text'.",
172
208
  examples=["text"],
173
209
  )
174
- citations: Dict[str, str] = Field(
175
- default_factory=dict,
176
- description="Citation information as key-value pairs",
210
+ citations: Optional[Dict[str, str]] = Field(
211
+ default=None,
212
+ description=(
213
+ "Citation metadata associated with this chunk as key-value "
214
+ "pairs. Optional — defaults to None if no citations are present."
215
+ ),
177
216
  examples=[{"source": "quarterly_report.pdf"}],
178
217
  )
179
218
  model_config = ConfigDict(extra="ignore")
@@ -194,4 +233,3 @@ class SearchIndexChunkingV1Response(BaseModel):
194
233
  output: List[SearchIndexChunkingV1Output] = Field(
195
234
  default_factory=list, description="Flat list of chunks from all docs"
196
235
  )
197
- model_config = ConfigDict(extra="ignore")
@@ -16,6 +16,7 @@
16
16
  """Utilities for inspecting and working with function entrypoints."""
17
17
 
18
18
  import ast
19
+ from enum import Enum
19
20
  import importlib.util
20
21
  import inspect
21
22
  import json
@@ -278,11 +279,17 @@ def _generate_model_sample_data(model_type):
278
279
  # Use examples if available
279
280
  if field_info.examples and len(field_info.examples) > 0:
280
281
  sample_data[field_name] = field_info.examples[0]
281
- # Check if field has a real default value
282
- elif field_info.default is not PydanticUndefined:
282
+ # If field has a non-None, non-empty default value, use it
283
+ elif (
284
+ field_info.default is not PydanticUndefined
285
+ and field_info.default is not None
286
+ and field_info.default != []
287
+ and field_info.default != {}
288
+ ):
283
289
  sample_data[field_name] = field_info.default
290
+ # For all other fields (including default_factory, None defaults,
291
+ # empty defaults), generate sample data
284
292
  else:
285
- # Required field or field without default - generate sample
286
293
  sample_data[field_name] = generate_sample_value(
287
294
  field_info.annotation, field_name
288
295
  )
@@ -301,6 +308,17 @@ def generate_sample_value(field_type, field_name: str):
301
308
  """
302
309
  origin = typing.get_origin(field_type)
303
310
 
311
+ # Handle Optional[T] (Union[T, None]) by unwrapping to T
312
+ if origin is typing.Union:
313
+ non_none_args = [
314
+ arg for arg in typing.get_args(field_type) if arg is not type(None)
315
+ ]
316
+ return (
317
+ generate_sample_value(non_none_args[0], field_name)
318
+ if non_none_args
319
+ else None
320
+ )
321
+
304
322
  if origin is list or field_type is list:
305
323
  args = typing.get_args(field_type)
306
324
  if args:
@@ -320,6 +338,10 @@ def generate_sample_value(field_type, field_name: str):
320
338
  return 1.0
321
339
  elif field_type is bool:
322
340
  return True
341
+ # Handle Enum types
342
+ elif isinstance(field_type, type) and issubclass(field_type, Enum):
343
+ # Return the first enum value
344
+ return next(iter(field_type)).value
323
345
  elif hasattr(field_type, "model_fields"):
324
346
  # Nested Pydantic model - use shared helper
325
347
  return _generate_model_sample_data(field_type)
@@ -23,7 +23,6 @@ from typing import (
23
23
  Union,
24
24
  )
25
25
 
26
- import pandas as pd
27
26
  import requests
28
27
 
29
28
  from datacustomcode.io.reader.base import BaseDataCloudReader
@@ -31,6 +30,7 @@ from datacustomcode.io.reader.utils import _pandas_to_spark_schema
31
30
  from datacustomcode.token_provider import SFCLITokenProvider
32
31
 
33
32
  if TYPE_CHECKING:
33
+ import pandas as pd
34
34
  from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession
35
35
  from pyspark.sql.types import AtomicType, StructType
36
36
 
@@ -97,6 +97,8 @@ class SFCLIDataCloudReader(BaseDataCloudReader):
97
97
  Raises:
98
98
  RuntimeError: On HTTP errors or unexpected response shapes.
99
99
  """
100
+ import pandas as pd
101
+
100
102
  access_token, instance_url = self._get_token()
101
103
 
102
104
  url = f"{instance_url}/services/data/{API_VERSION}/ssot/query-sql"
@@ -16,32 +16,32 @@ from __future__ import annotations
16
16
 
17
17
  from typing import TYPE_CHECKING
18
18
 
19
- import pandas.api.types as pd_types
20
- from pyspark.sql.types import (
21
- BooleanType,
22
- DoubleType,
23
- LongType,
24
- StringType,
25
- StructField,
26
- StructType,
27
- TimestampType,
28
- )
29
-
30
19
  if TYPE_CHECKING:
31
20
  import pandas
32
- from pyspark.sql.types import AtomicType
33
-
34
- PANDAS_TYPE_MAPPING = {
35
- "object": StringType(),
36
- "int64": LongType(),
37
- "float64": DoubleType(),
38
- "bool": BooleanType(),
39
- }
21
+ from pyspark.sql.types import AtomicType, StructType
40
22
 
41
23
 
42
24
  def _pandas_to_spark_schema(
43
25
  pandas_df: pandas.DataFrame, nullable: bool = True
44
26
  ) -> StructType:
27
+ import pandas.api.types as pd_types
28
+ from pyspark.sql.types import (
29
+ BooleanType,
30
+ DoubleType,
31
+ LongType,
32
+ StringType,
33
+ StructField,
34
+ StructType,
35
+ TimestampType,
36
+ )
37
+
38
+ PANDAS_TYPE_MAPPING = {
39
+ "object": StringType(),
40
+ "int64": LongType(),
41
+ "float64": DoubleType(),
42
+ "bool": BooleanType(),
43
+ }
44
+
45
45
  fields = []
46
46
  for column, dtype in pandas_df.dtypes.items():
47
47
  spark_type: AtomicType
@@ -13,8 +13,12 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
+ from __future__ import annotations
16
17
 
17
- from pyspark.sql import DataFrame as PySparkDataFrame
18
+ from typing import TYPE_CHECKING
19
+
20
+ if TYPE_CHECKING:
21
+ from pyspark.sql import DataFrame as PySparkDataFrame
18
22
 
19
23
  from datacustomcode.io.writer.base import BaseDataCloudWriter, WriteMode
20
24
 
@@ -13,12 +13,15 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
+ from __future__ import annotations
16
17
 
17
- from typing import Optional
18
+ from typing import TYPE_CHECKING, Optional
18
19
 
19
- from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession
20
+ if TYPE_CHECKING:
21
+ from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession
22
+
23
+ from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader
20
24
 
21
- from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader
22
25
  from datacustomcode.io.writer.base import BaseDataCloudWriter, WriteMode
23
26
 
24
27
 
@@ -61,6 +64,8 @@ class PrintDataCloudWriter(BaseDataCloudWriter):
61
64
  sf_cli_org: Optional SF CLI org alias or username. If provided,
62
65
  credentials are fetched via `sf org display`.
63
66
  """
67
+ from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader
68
+
64
69
  super().__init__(spark)
65
70
  if reader is None:
66
71
  self.reader = QueryAPIDataCloudReader(
@@ -72,6 +72,35 @@ class UserExtendableNamedConfigMixin:
72
72
  Args:
73
73
  config_name: should match a subclass's ``CONFIG_NAME``.
74
74
  """
75
+ # First, check if already registered (from __init_subclass__)
76
+ if config_name in UserExtendableNamedConfigMixin._registered_config_names:
77
+ candidate = UserExtendableNamedConfigMixin._registered_config_names[
78
+ config_name
79
+ ]
80
+ # Verify it's actually a subclass of cls (respects hierarchy)
81
+ if candidate is cls or issubclass(candidate, cls):
82
+ return candidate
83
+
84
+ # If not found, try to trigger lazy import via __getattr__
85
+ # This handles the case where subclasses use lazy loading
86
+ try:
87
+ import datacustomcode
88
+
89
+ # Attempt to trigger __getattr__ by accessing the name
90
+ getattr(datacustomcode, config_name, None)
91
+ except (ImportError, AttributeError):
92
+ pass
93
+
94
+ # Check again after potential lazy import
95
+ if config_name in UserExtendableNamedConfigMixin._registered_config_names:
96
+ candidate = UserExtendableNamedConfigMixin._registered_config_names[
97
+ config_name
98
+ ]
99
+ # Verify it's actually a subclass of cls (respects hierarchy)
100
+ if candidate is cls or issubclass(candidate, cls):
101
+ return candidate
102
+
103
+ # Fallback to dynamic lookup (for user-added subclasses)
75
104
  subclass_config_name_map = {}
76
105
  for type_ in _get_all_subclass_descendants(cls):
77
106
  if name := getattr(type_, "CONFIG_NAME", ""):
@@ -25,5 +25,5 @@ if TYPE_CHECKING:
25
25
 
26
26
 
27
27
  class BaseSparkSessionProvider(UserExtendableNamedConfigMixin):
28
- def get_session(self, spark_config: SparkConfig) -> "SparkSession":
28
+ def get_session(self, spark_config: SparkConfig) -> SparkSession:
29
29
  raise NotImplementedError
@@ -27,7 +27,7 @@ if TYPE_CHECKING:
27
27
  class DefaultSparkSessionProvider(BaseSparkSessionProvider):
28
28
  CONFIG_NAME = "DefaultSparkSessionProvider"
29
29
 
30
- def get_session(self, spark_config: SparkConfig) -> "SparkSession":
30
+ def get_session(self, spark_config: SparkConfig) -> SparkSession:
31
31
  from pyspark.sql import SparkSession
32
32
 
33
33
  builder = SparkSession.builder
@@ -1,5 +1,4 @@
1
1
  import logging
2
- import uuid
3
2
 
4
3
  from datacustomcode.function import Runtime
5
4
  from datacustomcode.function.feature_types.chunking import (
@@ -124,12 +123,11 @@ def function(
124
123
  for chunk_text in text_chunks:
125
124
  # Create citations from source_dmo_fields if available
126
125
  citations = {}
127
- if metadata.source_dmo_fields:
126
+ if metadata and metadata.source_dmo_fields:
128
127
  for key, value in metadata.source_dmo_fields.items():
129
128
  citations[key] = str(value)
130
129
 
131
130
  chunk_output = SearchIndexChunkingV1Output(
132
- chunk_id=str(uuid.uuid4()),
133
131
  chunk_type=ChunkType.TEXT,
134
132
  text=chunk_text.strip(),
135
133
  seq_no=seq_no,
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Sample Search Index Chunking Customer Function
4
+
5
+ This function demonstrates the new signature-based invocation with Pydantic models:
6
+ - Uses SearchIndexChunkingV1Request/Response (Pydantic models)
7
+ - Requires Runtime parameter (for agentic capabilities)
8
+ - Type-safe with direct field access (no wrappers)
9
+ - Automatic validation and conversion
10
+ """
11
+
12
+ import logging
13
+
14
+ from datacustomcode.function.feature_types.chunking import (
15
+ ChunkType,
16
+ SearchIndexChunkingV1Output,
17
+ SearchIndexChunkingV1Request,
18
+ SearchIndexChunkingV1Response,
19
+ )
20
+ from datacustomcode.function.runtime import Runtime
21
+ from datacustomcode.llm_gateway.types.generate_text_request_builder import (
22
+ GenerateTextRequestBuilder,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+ logging.basicConfig(level=logging.INFO)
27
+
28
+
29
# Prompt templates already read from disk, keyed by resolved file path.
_PROMPT_TEMPLATE_CACHE: dict[str, str] = {}


def _load_prompt_template(runtime: Runtime) -> str:
    """Return the chunking prompt template, reading each file only once.

    The path is resolved through ``runtime.file.find_file_path`` on every
    call, but the file contents are read from disk only the first time a
    given path is seen; later calls return the cached text.

    Args:
        runtime: Runtime whose ``file`` helper locates ``chunking_prompt.txt``.

    Returns:
        The full text of the prompt template.
    """
    prompt_file = runtime.file.find_file_path("chunking_prompt.txt")
    cache_key = str(prompt_file)

    cached_template = _PROMPT_TEMPLATE_CACHE.get(cache_key)
    if cached_template is not None:
        return cached_template

    # Explicit encoding: without it open() falls back to the platform locale.
    with open(prompt_file, "r", encoding="utf-8") as f:
        template = f.read()

    _PROMPT_TEMPLATE_CACHE[cache_key] = template
    logger.info(f"Loaded prompt template from {prompt_file}")
    return template
36
+
37
+
38
def function(
    request: SearchIndexChunkingV1Request, runtime: Runtime
) -> SearchIndexChunkingV1Response:
    """
    Split each input document into chunks produced by the LLM gateway.

    For every document the prompt template is filled with the document text
    and sent to the LLM. The reply is split on the ``---CHUNK---`` delimiter
    and every non-blank piece becomes one output chunk. Sequence numbers
    start at 1 and keep counting across documents.

    Args:
        request: SearchIndexChunkingV1Request with input documents
        runtime: Runtime instance, used to locate the prompt file and to
            call the LLM gateway

    Returns:
        SearchIndexChunkingV1Response with chunked output

    Raises:
        RuntimeError: If the LLM gateway reports a failure for a document
    """
    logger.info(f"Received {len(request.input)} documents to chunk")

    template = _load_prompt_template(runtime)
    outputs = []

    for doc_number, document in enumerate(request.input, start=1):
        # Ask the LLM for semantic chunks of this document's text.
        prompt = template.format(text=document.text)
        llm_request = (
            GenerateTextRequestBuilder()
            .set_model("sfdc_ai__DefaultGPT4Turbo")
            .set_prompt(prompt)
            .build()
        )
        response = runtime.llm_gateway.generate_text(llm_request)

        if not response.is_success:
            # A failed LLM call aborts the whole request.
            error_msg = (
                f"LLM chunking failed for document {doc_number}: {response.error_code}"
            )
            logger.error(error_msg)
            raise RuntimeError(error_msg)

        for piece in response.text.split("---CHUNK---"):
            chunk_text = piece.strip()
            if not chunk_text:
                continue
            outputs.append(
                SearchIndexChunkingV1Output(
                    text=chunk_text,
                    # Chunks are only ever appended, so the next 1-based
                    # sequence number is the current count plus one.
                    seq_no=len(outputs) + 1,
                    chunk_type=ChunkType.TEXT,
                    citations={},
                )
            )

    return SearchIndexChunkingV1Response(output=outputs)