acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. (The link to further details was not preserved in this text copy.)

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,7 @@ import time
4
4
  from dataclasses import dataclass, field as dataclass_field
5
5
  from datetime import datetime
6
6
  from enum import Enum
7
- from typing import Any, Dict, Iterable, List, Optional
7
+ from typing import Any, Dict, Iterable, List, Literal, Optional, TypedDict
8
8
 
9
9
  import requests
10
10
  from pydantic import Field, validator
@@ -17,7 +17,9 @@ from datahub.configuration.common import (
17
17
  ConfigModel,
18
18
  ConfigurationError,
19
19
  )
20
- from datahub.configuration.source_common import DatasetSourceConfigMixin
20
+ from datahub.configuration.source_common import (
21
+ DatasetSourceConfigMixin,
22
+ )
21
23
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
22
24
  from datahub.emitter.mcp_builder import add_domain_to_entity_wu
23
25
  from datahub.ingestion.api.common import PipelineContext
@@ -29,9 +31,17 @@ from datahub.ingestion.api.decorators import (
29
31
  platform_name,
30
32
  support_status,
31
33
  )
32
- from datahub.ingestion.api.source import Source, SourceReport
34
+ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
33
35
  from datahub.ingestion.api.workunit import MetadataWorkUnit
34
36
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
37
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
38
+ StaleEntityRemovalHandler,
39
+ StaleEntityRemovalSourceReport,
40
+ )
41
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
42
+ StatefulIngestionConfigBase,
43
+ StatefulIngestionSourceBase,
44
+ )
35
45
  from datahub.ingestion.source_config.operation_config import (
36
46
  OperationConfig,
37
47
  is_profiling_enabled,
@@ -41,6 +51,7 @@ from datahub.metadata.schema_classes import (
41
51
  BooleanTypeClass,
42
52
  BytesTypeClass,
43
53
  DataPlatformInstanceClass,
54
+ DatasetLineageTypeClass,
44
55
  DatasetProfileClass,
45
56
  DatasetPropertiesClass,
46
57
  DateTypeClass,
@@ -59,6 +70,8 @@ from datahub.metadata.schema_classes import (
59
70
  StringTypeClass,
60
71
  SubTypesClass,
61
72
  TagAssociationClass,
73
+ UpstreamClass,
74
+ UpstreamLineageClass,
62
75
  )
63
76
  from datahub.utilities import config_clean
64
77
  from datahub.utilities.lossy_collections import LossyList
@@ -85,7 +98,10 @@ class SalesforceProfilingConfig(ConfigModel):
85
98
  # TODO - support field level profiling
86
99
 
87
100
 
88
- class SalesforceConfig(DatasetSourceConfigMixin):
101
+ class SalesforceConfig(
102
+ StatefulIngestionConfigBase,
103
+ DatasetSourceConfigMixin,
104
+ ):
89
105
  platform: str = "salesforce"
90
106
 
91
107
  auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD
@@ -138,6 +154,12 @@ class SalesforceConfig(DatasetSourceConfigMixin):
138
154
  description="Regex patterns for profiles to filter in ingestion, allowed by the `object_pattern`.",
139
155
  )
140
156
 
157
+ # Given lack of ERD visual graph view support, this alternate is useful.
158
+ use_referenced_entities_as_upstreams: bool = Field(
159
+ default=False,
160
+ description="(Experimental) If enabled, referenced entities will be treated as upstream entities.",
161
+ )
162
+
141
163
  def is_profiling_enabled(self) -> bool:
142
164
  return self.profiling.enabled and is_profiling_enabled(
143
165
  self.profiling.operation_config
@@ -149,9 +171,15 @@ class SalesforceConfig(DatasetSourceConfigMixin):
149
171
 
150
172
 
151
173
  @dataclass
152
- class SalesforceSourceReport(SourceReport):
174
+ class SalesforceSourceReport(StaleEntityRemovalSourceReport):
153
175
  filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
154
176
 
177
+ objects_with_calculated_field: LossyList[str] = dataclass_field(
178
+ default_factory=LossyList
179
+ )
180
+
181
+ num_objects_missing_formula: int = 0
182
+
155
183
  def report_dropped(self, ent_name: str) -> None:
156
184
  self.filtered.append(ent_name)
157
185
 
@@ -186,6 +214,310 @@ FIELD_TYPE_MAPPING = {
186
214
  }
187
215
 
188
216
 
217
+ class EntityDefinition(TypedDict):
218
+ DurableId: str
219
+ QualifiedApiName: str
220
+ DeveloperName: str
221
+ Label: str
222
+ PluralLabel: str
223
+ InternalSharingModel: str
224
+ ExternalSharingModel: str
225
+ DeploymentStatus: Literal[
226
+ "Deployed", "InDevelopment"
227
+ ] # Common values for DeploymentStatus
228
+
229
+
230
+ class UserInfo(TypedDict):
231
+ Username: str
232
+
233
+
234
+ class FieldDefinition(TypedDict):
235
+ DataType: str
236
+ LastModifiedDate: str
237
+ LastModifiedBy: UserInfo
238
+ IsIndexed: bool
239
+ ComplianceGroup: Optional[str]
240
+ Description: Optional[str]
241
+
242
+
243
+ class ReferenceTo(TypedDict):
244
+ referenceTo: List[str]
245
+
246
+
247
+ class EntityParticle(TypedDict):
248
+ QualifiedApiName: str
249
+ DeveloperName: str
250
+ Label: str
251
+ DataType: str
252
+ Precision: Optional[int]
253
+ Scale: Optional[int]
254
+ Length: Optional[int]
255
+ Digits: Optional[int]
256
+ IsUnique: bool
257
+ IsCompound: bool
258
+ IsComponent: bool
259
+ ReferenceTo: Optional[ReferenceTo]
260
+ RelationshipName: Optional[str]
261
+ IsNillable: bool
262
+ InlineHelpText: Optional[str]
263
+ IsCalculated: bool
264
+ FieldDefinition: FieldDefinition
265
+
266
+
267
+ class CustomObject(TypedDict):
268
+ Description: Optional[str]
269
+ Language: str
270
+ ManageableState: Literal["unmanaged", "installed", "beta", "released"]
271
+ CreatedDate: str
272
+ CreatedBy: UserInfo
273
+ LastModifiedDate: str
274
+ LastModifiedBy: UserInfo
275
+
276
+
277
+ class CustomField(TypedDict):
278
+ DeveloperName: str
279
+ CreatedDate: str
280
+ CreatedBy: UserInfo
281
+ InlineHelpText: Optional[str]
282
+ LastModifiedDate: str
283
+ LastModifiedBy: UserInfo
284
+
285
+
286
+ class SObjectRecordCount(TypedDict):
287
+ count: int
288
+ name: str
289
+
290
+
291
+ class SObjectField(TypedDict):
292
+ name: str
293
+ calculatedFormula: Optional[str]
294
+
295
+
296
+ class SObjectDescribe(TypedDict):
297
+ fields: List[SObjectField]
298
+
299
+
300
+ class SalesforceApi:
301
+ def __init__(
302
+ self, sf: Salesforce, config: SalesforceConfig, report: SalesforceSourceReport
303
+ ) -> None:
304
+ self.config = config
305
+ self.report = report
306
+ self.sf = sf
307
+ self.base_url = "https://{instance}/services/data/v{sf_version}/".format(
308
+ instance=self.sf.sf_instance, sf_version=self.sf.sf_version
309
+ )
310
+
311
+ @staticmethod
312
+ def create_salesforce_client(config: SalesforceConfig) -> Salesforce:
313
+ common_args: Dict[str, Any] = {
314
+ "domain": "test" if config.is_sandbox else None,
315
+ "session": requests.Session(),
316
+ }
317
+ if config.api_version:
318
+ common_args["version"] = config.api_version
319
+
320
+ if config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN:
321
+ logger.debug("Access Token Provided in Config")
322
+ assert config.access_token is not None, (
323
+ "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
324
+ )
325
+ assert config.instance_url is not None, (
326
+ "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
327
+ )
328
+
329
+ sf = Salesforce(
330
+ instance_url=config.instance_url,
331
+ session_id=config.access_token,
332
+ **common_args,
333
+ )
334
+ elif config.auth is SalesforceAuthType.USERNAME_PASSWORD:
335
+ logger.debug("Username/Password Provided in Config")
336
+ assert config.username is not None, (
337
+ "Config username is required for USERNAME_PASSWORD auth"
338
+ )
339
+ assert config.password is not None, (
340
+ "Config password is required for USERNAME_PASSWORD auth"
341
+ )
342
+ assert config.security_token is not None, (
343
+ "Config security_token is required for USERNAME_PASSWORD auth"
344
+ )
345
+
346
+ sf = Salesforce(
347
+ username=config.username,
348
+ password=config.password,
349
+ security_token=config.security_token,
350
+ **common_args,
351
+ )
352
+
353
+ elif config.auth is SalesforceAuthType.JSON_WEB_TOKEN:
354
+ logger.debug("Json Web Token provided in the config")
355
+ assert config.username is not None, (
356
+ "Config username is required for JSON_WEB_TOKEN auth"
357
+ )
358
+ assert config.consumer_key is not None, (
359
+ "Config consumer_key is required for JSON_WEB_TOKEN auth"
360
+ )
361
+ assert config.private_key is not None, (
362
+ "Config private_key is required for JSON_WEB_TOKEN auth"
363
+ )
364
+
365
+ sf = Salesforce(
366
+ username=config.username,
367
+ consumer_key=config.consumer_key,
368
+ privatekey=config.private_key,
369
+ **common_args,
370
+ )
371
+
372
+ SalesforceApi.update_salesforce_api_version(config, sf)
373
+
374
+ return sf
375
+
376
+ @staticmethod
377
+ def update_salesforce_api_version(config: SalesforceConfig, sf: Salesforce) -> None:
378
+ if not config.api_version:
379
+ # List all REST API versions and use latest one
380
+ versions_url = "https://{instance}/services/data/".format(
381
+ instance=sf.sf_instance,
382
+ )
383
+ versions_response = sf._call_salesforce("GET", versions_url).json()
384
+ latest_version = versions_response[-1]
385
+ version = latest_version["version"]
386
+ # we could avoid setting the version like below (after the Salesforce object has been already initiated
387
+ # above), since, according to the docs:
388
+ # https://developer.salesforce.com/docs/atlas.en-us.api_rest.meta/api_rest/dome_versions.htm
389
+ # we don't need to be authenticated to list the versions (so we could perform this call before even
390
+ # authenticating)
391
+ sf.sf_version = version
392
+ logger.debug(
393
+ "Using Salesforce REST API version: {version}".format(version=sf.sf_version)
394
+ )
395
+
396
+ def list_objects(self) -> List[EntityDefinition]:
397
+ # Using Describe Global REST API returns many more objects than required.
398
+ # Response does not have the attribute ("customizable") that can be used
399
+ # to filter out entities not on ObjectManager UI. Hence SOQL on EntityDefinition
400
+ # object is used instead, as suggested by salesforce support.
401
+
402
+ query_url = (
403
+ self.base_url
404
+ + "tooling/query/?q=SELECT DurableId,QualifiedApiName,DeveloperName,"
405
+ + "Label,PluralLabel,InternalSharingModel,ExternalSharingModel,DeploymentStatus "
406
+ + "FROM EntityDefinition WHERE IsCustomizable = true"
407
+ )
408
+ entities_response = self.sf._call_salesforce("GET", query_url).json()
409
+ logger.debug(
410
+ "Salesforce EntityDefinition query returned {count} sObjects".format(
411
+ count=len(entities_response["records"])
412
+ )
413
+ )
414
+ return entities_response["records"]
415
+
416
+ def describe_object(self, sObjectName: str) -> SObjectDescribe:
417
+ logger.debug(f"Querying Salesforce {sObjectName} describe REST API")
418
+
419
+ describe_endpoint = f"{self.base_url}sobjects/{sObjectName}/describe/"
420
+ response = self.sf._call_salesforce("GET", describe_endpoint)
421
+
422
+ logger.debug(f"Received Salesforce {sObjectName} describe respone")
423
+ return {"fields": response.json()["fields"]}
424
+
425
+ def get_custom_object_details(
426
+ self, sObjectDeveloperName: str
427
+ ) -> Optional[CustomObject]:
428
+ query_url = (
429
+ self.base_url
430
+ + "tooling/query/?q=SELECT Description, Language, ManageableState, "
431
+ + "CreatedDate, CreatedBy.Username, LastModifiedDate, LastModifiedBy.Username "
432
+ + f"FROM CustomObject where DeveloperName='{sObjectDeveloperName}'"
433
+ )
434
+ custom_objects_response = self.sf._call_salesforce("GET", query_url).json()
435
+ if len(custom_objects_response["records"]) > 0:
436
+ logger.debug("Salesforce CustomObject query returned with details")
437
+ return custom_objects_response["records"][0]
438
+ return None
439
+
440
+ def get_fields_for_object(
441
+ self, sObjectName: str, sObjectDurableId: str
442
+ ) -> List[EntityParticle]:
443
+ sObject_fields_query_url = (
444
+ self.base_url
445
+ + "tooling/query?q=SELECT "
446
+ + "QualifiedApiName,DeveloperName,Label, FieldDefinition.DataType, DataType,"
447
+ + "FieldDefinition.LastModifiedDate, FieldDefinition.LastModifiedBy.Username,"
448
+ + "Precision, Scale, Length, Digits ,FieldDefinition.IsIndexed, IsUnique,"
449
+ + "IsCompound, IsComponent, ReferenceTo, FieldDefinition.ComplianceGroup,"
450
+ + "RelationshipName, IsNillable, FieldDefinition.Description, InlineHelpText, "
451
+ + "IsCalculated FROM EntityParticle WHERE EntityDefinitionId='{}'".format(
452
+ sObjectDurableId
453
+ )
454
+ )
455
+
456
+ sObject_fields_response = self.sf._call_salesforce(
457
+ "GET", sObject_fields_query_url
458
+ ).json()
459
+
460
+ logger.debug(f"Received Salesforce {sObjectName} fields response")
461
+
462
+ all_fields = sObject_fields_response["records"]
463
+ return all_fields
464
+
465
+ def get_custom_fields_for_object(
466
+ self, sObjectName: str, sObjectDurableId: str
467
+ ) -> Dict[str, CustomField]:
468
+ sObject_custom_fields_query_url = (
469
+ self.base_url
470
+ + "tooling/query?q=SELECT "
471
+ + "DeveloperName,CreatedDate,CreatedBy.Username,InlineHelpText,"
472
+ + "LastModifiedDate,LastModifiedBy.Username "
473
+ + "FROM CustomField WHERE EntityDefinitionId='{}'".format(sObjectDurableId)
474
+ )
475
+
476
+ customFields: Dict[str, CustomField] = {}
477
+ try:
478
+ sObject_custom_fields_response = self.sf._call_salesforce(
479
+ "GET", sObject_custom_fields_query_url
480
+ ).json()
481
+
482
+ logger.debug(
483
+ "Received Salesforce {sObject} custom fields response".format(
484
+ sObject=sObjectName
485
+ )
486
+ )
487
+
488
+ except Exception as e:
489
+ error = "Salesforce CustomField query failed. "
490
+ if "sObject type 'CustomField' is not supported." in str(e):
491
+ # https://github.com/afawcett/apex-toolingapi/issues/19
492
+ error += "Please verify if user has 'View All Data' permission."
493
+
494
+ self.report.warning(message=error, exc=e)
495
+ else:
496
+ customFields = {
497
+ record["DeveloperName"]: record
498
+ for record in sObject_custom_fields_response["records"]
499
+ }
500
+
501
+ return customFields
502
+
503
+ def get_approximate_record_count(self, sObjectName: str) -> SObjectRecordCount:
504
+ sObject_records_count_url = (
505
+ f"{self.base_url}limits/recordCount?sObjects={sObjectName}"
506
+ )
507
+
508
+ sObject_record_count_response = self.sf._call_salesforce(
509
+ "GET", sObject_records_count_url
510
+ ).json()
511
+
512
+ logger.debug(
513
+ "Received Salesforce {sObject} record count response".format(
514
+ sObject=sObjectName
515
+ )
516
+ )
517
+ sobject_record_counts = sObject_record_count_response.get("sObjects", [])
518
+ return sobject_record_counts[0]
519
+
520
+
189
521
  @platform_name("Salesforce")
190
522
  @config_class(SalesforceConfig)
191
523
  @support_status(SupportStatus.INCUBATING)
@@ -214,123 +546,44 @@ FIELD_TYPE_MAPPING = {
214
546
  capability_name=SourceCapability.TAGS,
215
547
  description="Enabled by default",
216
548
  )
217
- class SalesforceSource(Source):
218
- base_url: str
219
- config: SalesforceConfig
220
- report: SalesforceSourceReport
221
- session: requests.Session
222
- sf: Salesforce
223
- fieldCounts: Dict[str, int]
224
-
549
+ class SalesforceSource(StatefulIngestionSourceBase):
225
550
  def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None:
226
- super().__init__(ctx)
551
+ super().__init__(config, ctx)
552
+ self.ctx = ctx
227
553
  self.config = config
228
- self.report = SalesforceSourceReport()
229
- self.session = requests.Session()
554
+ self.report: SalesforceSourceReport = SalesforceSourceReport()
230
555
  self.platform: str = "salesforce"
231
- self.fieldCounts = {}
232
- common_args: Dict[str, Any] = {
233
- "domain": "test" if self.config.is_sandbox else None,
234
- "session": self.session,
235
- }
236
- if self.config.api_version:
237
- common_args["version"] = self.config.api_version
556
+ self.fieldCounts: Dict[str, int] = {}
557
+
558
+ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
559
+ return [
560
+ *super().get_workunit_processors(),
561
+ StaleEntityRemovalHandler.create(
562
+ self, self.config, self.ctx
563
+ ).workunit_processor,
564
+ ]
238
565
 
566
+ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
239
567
  try:
240
- if self.config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN:
241
- logger.debug("Access Token Provided in Config")
242
- assert self.config.access_token is not None, (
243
- "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
244
- )
245
- assert self.config.instance_url is not None, (
246
- "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
247
- )
248
-
249
- self.sf = Salesforce(
250
- instance_url=self.config.instance_url,
251
- session_id=self.config.access_token,
252
- **common_args,
253
- )
254
- elif self.config.auth is SalesforceAuthType.USERNAME_PASSWORD:
255
- logger.debug("Username/Password Provided in Config")
256
- assert self.config.username is not None, (
257
- "Config username is required for USERNAME_PASSWORD auth"
258
- )
259
- assert self.config.password is not None, (
260
- "Config password is required for USERNAME_PASSWORD auth"
261
- )
262
- assert self.config.security_token is not None, (
263
- "Config security_token is required for USERNAME_PASSWORD auth"
264
- )
265
-
266
- self.sf = Salesforce(
267
- username=self.config.username,
268
- password=self.config.password,
269
- security_token=self.config.security_token,
270
- **common_args,
271
- )
272
-
273
- elif self.config.auth is SalesforceAuthType.JSON_WEB_TOKEN:
274
- logger.debug("Json Web Token provided in the config")
275
- assert self.config.username is not None, (
276
- "Config username is required for JSON_WEB_TOKEN auth"
277
- )
278
- assert self.config.consumer_key is not None, (
279
- "Config consumer_key is required for JSON_WEB_TOKEN auth"
280
- )
281
- assert self.config.private_key is not None, (
282
- "Config private_key is required for JSON_WEB_TOKEN auth"
283
- )
284
-
285
- self.sf = Salesforce(
286
- username=self.config.username,
287
- consumer_key=self.config.consumer_key,
288
- privatekey=self.config.private_key,
289
- **common_args,
290
- )
291
-
568
+ sf = SalesforceApi.create_salesforce_client(self.config)
292
569
  except SalesforceAuthenticationFailed as e:
293
- logger.error(e)
294
570
  if "API_CURRENTLY_DISABLED" in str(e):
295
571
  # https://help.salesforce.com/s/articleView?id=001473830&type=1
296
- error = "Salesforce login failed. Please make sure user has API Enabled Access."
572
+ error = "Please make sure user has API Enabled Access."
297
573
  else:
298
- error = "Salesforce login failed. Please verify your credentials."
574
+ error = "Please verify your credentials."
299
575
  if (
300
576
  self.config.instance_url
301
577
  and "sandbox" in self.config.instance_url.lower()
302
578
  ):
303
579
  error += "Please set `is_sandbox: True` in recipe if this is sandbox account."
304
- raise ConfigurationError(error) from e
305
-
306
- if not self.config.api_version:
307
- # List all REST API versions and use latest one
308
- versions_url = "https://{instance}/services/data/".format(
309
- instance=self.sf.sf_instance,
310
- )
311
- versions_response = self.sf._call_salesforce("GET", versions_url).json()
312
- latest_version = versions_response[-1]
313
- version = latest_version["version"]
314
- # we could avoid setting the version like below (after the Salesforce object has been already initiated
315
- # above), since, according to the docs:
316
- # https://developer.salesforce.com/docs/atlas.en-us.api_rest.meta/api_rest/dome_versions.htm
317
- # we don't need to be authenticated to list the versions (so we could perform this call before even
318
- # authenticating)
319
- self.sf.sf_version = version
320
-
321
- self.base_url = "https://{instance}/services/data/v{sf_version}/".format(
322
- instance=self.sf.sf_instance, sf_version=self.sf.sf_version
323
- )
580
+ self.report.failure(title="Salesforce login failed", message=error, exc=e)
581
+ return
324
582
 
325
- logger.debug(
326
- "Using Salesforce REST API version: {version}".format(
327
- version=self.sf.sf_version
328
- )
329
- )
583
+ self.sf_api = SalesforceApi(sf, self.config, self.report)
330
584
 
331
- def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
332
585
  try:
333
- sObjects = self.get_salesforce_objects()
586
+ sObjects = self.sf_api.list_objects()
334
587
  except Exception as e:
335
588
  if "sObject type 'EntityDefinition' is not supported." in str(e):
336
589
  # https://developer.salesforce.com/docs/atlas.en-us.api_tooling.meta/api_tooling/tooling_api_objects_entitydefinition.htm
@@ -344,7 +597,7 @@ class SalesforceSource(Source):
344
597
  yield from self.get_salesforce_object_workunits(sObject)
345
598
 
346
599
  def get_salesforce_object_workunits(
347
- self, sObject: dict
600
+ self, sObject: EntityDefinition
348
601
  ) -> Iterable[MetadataWorkUnit]:
349
602
  sObjectName = sObject["QualifiedApiName"]
350
603
 
@@ -364,19 +617,50 @@ class SalesforceSource(Source):
364
617
  self.config.env,
365
618
  )
366
619
 
367
- customObject = {}
620
+ customObject = None
368
621
  if sObjectName.endswith("__c"): # Is Custom Object
369
- customObject = self.get_custom_object_details(sObject["DeveloperName"])
622
+ customObject = self.sf_api.get_custom_object_details(
623
+ sObject["DeveloperName"]
624
+ )
370
625
 
371
626
  # Table Created, LastModified is available for Custom Object
372
627
  yield from self.get_operation_workunit(customObject, datasetUrn)
373
628
 
374
629
  yield self.get_properties_workunit(sObject, customObject, datasetUrn)
375
630
 
631
+ allFields = self.sf_api.get_fields_for_object(sObjectName, sObject["DurableId"])
632
+
633
+ customFields = self.sf_api.get_custom_fields_for_object(
634
+ sObjectName, sObject["DurableId"]
635
+ )
636
+
637
+ if any(field["IsCalculated"] for field in allFields):
638
+ # Although formula is present in Metadata column of CustomField entity,
639
+ # we can not use it as it allows querying only for one field at a time
640
+ # and that would not be performant
641
+ calculated_field_formulae = self.get_calculated_field_formulae(sObjectName)
642
+ if calculated_field_formulae:
643
+ self.report.objects_with_calculated_field.append(sObjectName)
644
+ else:
645
+ # For some objects, although some fields are calculated, formula is absent
646
+ # These are typically salesforce system calculated fields whose formula
647
+ # is not exposed
648
+ self.report.num_objects_missing_formula += 1
649
+ else:
650
+ calculated_field_formulae = {}
651
+
376
652
  yield from self.get_schema_metadata_workunit(
377
- sObjectName, sObject, customObject, datasetUrn
653
+ sObjectName,
654
+ allFields,
655
+ customFields,
656
+ customObject,
657
+ datasetUrn,
658
+ calculated_field_formulae,
378
659
  )
379
660
 
661
+ if self.config.use_referenced_entities_as_upstreams:
662
+ yield from self.get_upstream_workunit(datasetUrn, allFields)
663
+
380
664
  yield self.get_subtypes_workunit(sObjectName, datasetUrn)
381
665
 
382
666
  if self.config.platform_instance is not None:
@@ -390,39 +674,33 @@ class SalesforceSource(Source):
390
674
  ):
391
675
  yield from self.get_profile_workunit(sObjectName, datasetUrn)
392
676
 
393
- def get_custom_object_details(self, sObjectDeveloperName: str) -> dict:
394
- customObject = {}
395
- query_url = (
396
- self.base_url
397
- + "tooling/query/?q=SELECT Description, Language, ManageableState, "
398
- + "CreatedDate, CreatedBy.Username, LastModifiedDate, LastModifiedBy.Username "
399
- + f"FROM CustomObject where DeveloperName='{sObjectDeveloperName}'"
400
- )
401
- custom_objects_response = self.sf._call_salesforce("GET", query_url).json()
402
- if len(custom_objects_response["records"]) > 0:
403
- logger.debug("Salesforce CustomObject query returned with details")
404
- customObject = custom_objects_response["records"][0]
405
- return customObject
406
-
407
- def get_salesforce_objects(self) -> List:
408
- # Using Describe Global REST API returns many more objects than required.
409
- # Response does not have the attribute ("customizable") that can be used
410
- # to filter out entities not on ObjectManager UI. Hence SOQL on EntityDefinition
411
- # object is used instead, as suggested by salesforce support.
412
-
413
- query_url = (
414
- self.base_url
415
- + "tooling/query/?q=SELECT DurableId,QualifiedApiName,DeveloperName,"
416
- + "Label,PluralLabel,InternalSharingModel,ExternalSharingModel,DeploymentStatus "
417
- + "FROM EntityDefinition WHERE IsCustomizable = true"
418
- )
419
- entities_response = self.sf._call_salesforce("GET", query_url).json()
420
- logger.debug(
421
- "Salesforce EntityDefinition query returned {count} sObjects".format(
422
- count=len(entities_response["records"])
423
- )
424
- )
425
- return entities_response["records"]
677
+ def get_upstream_workunit(
678
+ self, datasetUrn: str, allFields: List[EntityParticle]
679
+ ) -> Iterable[MetadataWorkUnit]:
680
+ upstreams: List[UpstreamClass] = []
681
+ for field in allFields:
682
+ if (
683
+ field["DataType"] == "reference"
684
+ and field["ReferenceTo"]
685
+ and field["ReferenceTo"]["referenceTo"]
686
+ ):
687
+ for referenced_sObjectName in field["ReferenceTo"]["referenceTo"]:
688
+ upstreams.append(
689
+ UpstreamClass(
690
+ dataset=builder.make_dataset_urn_with_platform_instance(
691
+ self.platform,
692
+ referenced_sObjectName,
693
+ self.config.platform_instance,
694
+ self.config.env,
695
+ ),
696
+ type=DatasetLineageTypeClass.TRANSFORMED,
697
+ )
698
+ )
699
+
700
+ if upstreams:
701
+ yield MetadataChangeProposalWrapper(
702
+ entityUrn=datasetUrn, aspect=UpstreamLineageClass(upstreams=upstreams)
703
+ ).as_workunit()
426
704
 
427
705
  def get_domain_workunit(
428
706
  self, dataset_name: str, datasetUrn: str
@@ -452,11 +730,15 @@ class SalesforceSource(Source):
452
730
  ).as_workunit()
453
731
 
454
732
  def get_operation_workunit(
455
- self, customObject: dict, datasetUrn: str
733
+ self, customObject: Optional[CustomObject], datasetUrn: str
456
734
  ) -> Iterable[MetadataWorkUnit]:
457
735
  reported_time: int = int(time.time() * 1000)
458
736
 
459
- if customObject.get("CreatedBy") and customObject.get("CreatedDate"):
737
+ if (
738
+ customObject
739
+ and customObject.get("CreatedBy")
740
+ and customObject.get("CreatedDate")
741
+ ):
460
742
  timestamp = self.get_time_from_salesforce_timestamp(
461
743
  customObject["CreatedDate"]
462
744
  )
@@ -499,7 +781,10 @@ class SalesforceSource(Source):
499
781
  )
500
782
 
501
783
  def get_properties_workunit(
502
- self, sObject: dict, customObject: Dict[str, str], datasetUrn: str
784
+ self,
785
+ sObject: EntityDefinition,
786
+ customObject: Optional[CustomObject],
787
+ datasetUrn: str,
503
788
  ) -> MetadataWorkUnit:
504
789
  propertyLabels = {
505
790
  # from EntityDefinition
@@ -520,17 +805,18 @@ class SalesforceSource(Source):
520
805
  for k, v in sObject.items()
521
806
  if k in propertyLabels and v is not None
522
807
  }
523
- sObjectProperties.update(
524
- {
525
- propertyLabels[k]: str(v)
526
- for k, v in customObject.items()
527
- if k in propertyLabels and v is not None
528
- }
529
- )
808
+ if customObject:
809
+ sObjectProperties.update(
810
+ {
811
+ propertyLabels[k]: str(v)
812
+ for k, v in customObject.items()
813
+ if k in propertyLabels and v is not None
814
+ }
815
+ )
530
816
 
531
817
  datasetProperties = DatasetPropertiesClass(
532
818
  name=sObject["Label"],
533
- description=customObject.get("Description"),
819
+ description=customObject.get("Description") if customObject else None,
534
820
  customProperties=sObjectProperties,
535
821
  )
536
822
  return MetadataChangeProposalWrapper(
@@ -555,58 +841,58 @@ class SalesforceSource(Source):
555
841
  ) -> Iterable[MetadataWorkUnit]:
556
842
  # Here approximate record counts as returned by recordCount API are used as rowCount
557
843
  # In future, count() SOQL query may be used instead, if required, might be more expensive
558
- sObject_records_count_url = (
559
- f"{self.base_url}limits/recordCount?sObjects={sObjectName}"
560
- )
561
-
562
- sObject_record_count_response = self.sf._call_salesforce(
563
- "GET", sObject_records_count_url
564
- ).json()
844
+ sobject_record_count = self.sf_api.get_approximate_record_count(sObjectName)
565
845
 
566
- logger.debug(
567
- "Received Salesforce {sObject} record count response".format(
568
- sObject=sObjectName
569
- )
846
+ datasetProfile = DatasetProfileClass(
847
+ timestampMillis=int(time.time() * 1000),
848
+ rowCount=sobject_record_count["count"],
849
+ columnCount=self.fieldCounts[sObjectName],
570
850
  )
571
-
572
- for entry in sObject_record_count_response.get("sObjects", []):
573
- datasetProfile = DatasetProfileClass(
574
- timestampMillis=int(time.time() * 1000),
575
- rowCount=entry["count"],
576
- columnCount=self.fieldCounts[sObjectName],
577
- )
578
- yield MetadataChangeProposalWrapper(
579
- entityUrn=datasetUrn, aspect=datasetProfile
580
- ).as_workunit()
851
+ yield MetadataChangeProposalWrapper(
852
+ entityUrn=datasetUrn, aspect=datasetProfile
853
+ ).as_workunit()
581
854
 
582
855
  # Here field description is created from label, description and inlineHelpText
583
- def _get_field_description(self, field: dict, customField: dict) -> str:
584
- if "Label" not in field or field["Label"] is None:
585
- desc = ""
586
- elif field["Label"].startswith("#"):
587
- desc = "\\" + field["Label"]
588
- else:
589
- desc = field["Label"]
856
+ def _get_field_description(
857
+ self,
858
+ field: EntityParticle,
859
+ customField: Optional[CustomField],
860
+ formula: Optional[str],
861
+ ) -> str:
862
+ description_parts: List[str] = []
863
+
864
+ if field.get("Label") and field["Label"].startswith("#"):
865
+ description_parts.append("\\" + field["Label"])
866
+ elif field.get("Label"):
867
+ description_parts.append(field["Label"])
590
868
 
591
869
  text = field.get("FieldDefinition", {}).get("Description", None)
592
870
  if text:
593
871
  prefix = "\\" if text.startswith("#") else ""
594
- desc += f"\n\n{prefix}{text}"
872
+ description_parts.append(f"{prefix}{text}")
595
873
 
596
- text = field.get("InlineHelpText", None)
874
+ text = field.get("InlineHelpText")
597
875
  if text:
598
876
  prefix = "\\" if text.startswith("#") else ""
599
- desc += f"\n\n{prefix}{text}"
877
+ description_parts.append(f"{prefix}{text}")
878
+
879
+ if formula:
880
+ description_parts.append(f"Formula: {formula}")
600
881
 
601
- return desc
882
+ return "\n\n".join(description_parts)
602
883
 
603
884
  # Here jsonProps is used to add additional salesforce field level properties.
604
- def _get_field_json_props(self, field: dict, customField: dict) -> str:
885
+ def _get_field_json_props(
886
+ self, field: EntityParticle, customField: Optional[CustomField]
887
+ ) -> str:
605
888
  jsonProps = {}
606
889
 
607
890
  if field.get("IsUnique"):
608
891
  jsonProps["IsUnique"] = True
609
892
 
893
+ if field.get("IsCalculated"):
894
+ jsonProps["IsCalculated"] = True
895
+
610
896
  return json.dumps(jsonProps)
611
897
 
612
898
  def _get_schema_field(
@@ -614,8 +900,9 @@ class SalesforceSource(Source):
614
900
  sObjectName: str,
615
901
  fieldName: str,
616
902
  fieldType: str,
617
- field: dict,
618
- customField: dict,
903
+ field: EntityParticle,
904
+ customField: Optional[CustomField],
905
+ formula: Optional[str] = None,
619
906
  ) -> SchemaFieldClass:
620
907
  fieldPath = fieldName
621
908
 
@@ -629,7 +916,7 @@ class SalesforceSource(Source):
629
916
 
630
917
  fieldTags: List[str] = self.get_field_tags(fieldName, field)
631
918
 
632
- description = self._get_field_description(field, customField)
919
+ description = self._get_field_description(field, customField, formula)
633
920
 
634
921
  schemaField = SchemaFieldClass(
635
922
  fieldPath=fieldPath,
@@ -644,11 +931,19 @@ class SalesforceSource(Source):
644
931
  )
645
932
 
646
933
  # Created and LastModified Date and Actor are available for Custom Fields only
647
- if customField.get("CreatedDate") and customField.get("CreatedBy"):
934
+ if (
935
+ customField
936
+ and customField.get("CreatedDate")
937
+ and customField.get("CreatedBy")
938
+ ):
648
939
  schemaField.created = self.get_audit_stamp(
649
940
  customField["CreatedDate"], customField["CreatedBy"]["Username"]
650
941
  )
651
- if customField.get("LastModifiedDate") and customField.get("LastModifiedBy"):
942
+ if (
943
+ customField
944
+ and customField.get("LastModifiedDate")
945
+ and customField.get("LastModifiedBy")
946
+ ):
652
947
  schemaField.lastModified = self.get_audit_stamp(
653
948
  customField["LastModifiedDate"],
654
949
  customField["LastModifiedBy"]["Username"],
@@ -656,7 +951,7 @@ class SalesforceSource(Source):
656
951
 
657
952
  return schemaField
658
953
 
659
- def get_field_tags(self, fieldName: str, field: dict) -> List[str]:
954
+ def get_field_tags(self, fieldName: str, field: EntityParticle) -> List[str]:
660
955
  fieldTags: List[str] = []
661
956
 
662
957
  if fieldName.endswith("__c"):
@@ -689,69 +984,39 @@ class SalesforceSource(Source):
689
984
  actor=builder.make_user_urn(username),
690
985
  )
691
986
 
692
- def get_schema_metadata_workunit(
693
- self, sObjectName: str, sObject: dict, customObject: dict, datasetUrn: str
694
- ) -> Iterable[MetadataWorkUnit]:
695
- sObject_fields_query_url = (
696
- self.base_url
697
- + "tooling/query?q=SELECT "
698
- + "QualifiedApiName,DeveloperName,Label, FieldDefinition.DataType, DataType,"
699
- + "FieldDefinition.LastModifiedDate, FieldDefinition.LastModifiedBy.Username,"
700
- + "Precision, Scale, Length, Digits ,FieldDefinition.IsIndexed, IsUnique,"
701
- + "IsCompound, IsComponent, ReferenceTo, FieldDefinition.ComplianceGroup,"
702
- + "RelationshipName, IsNillable, FieldDefinition.Description, InlineHelpText "
703
- + "FROM EntityParticle WHERE EntityDefinitionId='{}'".format(
704
- sObject["DurableId"]
705
- )
706
- )
987
+ def get_calculated_field_formulae(self, sObjectName: str) -> Dict[str, str]:
988
+ # extract field wise formula and return response
989
+ # Includes entries for calculated fields only
707
990
 
708
- sObject_fields_response = self.sf._call_salesforce(
709
- "GET", sObject_fields_query_url
710
- ).json()
711
-
712
- logger.debug(f"Received Salesforce {sObjectName} fields response")
713
-
714
- sObject_custom_fields_query_url = (
715
- self.base_url
716
- + "tooling/query?q=SELECT "
717
- + "DeveloperName,CreatedDate,CreatedBy.Username,InlineHelpText,"
718
- + "LastModifiedDate,LastModifiedBy.Username "
719
- + "FROM CustomField WHERE EntityDefinitionId='{}'".format(
720
- sObject["DurableId"]
721
- )
722
- )
723
-
724
- customFields: Dict[str, Dict] = {}
991
+ calculated_fields = {}
725
992
  try:
726
- sObject_custom_fields_response = self.sf._call_salesforce(
727
- "GET", sObject_custom_fields_query_url
728
- ).json()
729
-
730
- logger.debug(
731
- "Received Salesforce {sObject} custom fields response".format(
732
- sObject=sObjectName
733
- )
734
- )
735
-
993
+ describe_object_result = self.sf_api.describe_object(sObjectName)
994
+ for field in describe_object_result["fields"]:
995
+ if field["calculatedFormula"]:
996
+ calculated_fields[field["name"]] = field["calculatedFormula"]
736
997
  except Exception as e:
737
- error = "Salesforce CustomField query failed. "
738
- if "sObject type 'CustomField' is not supported." in str(e):
739
- # https://github.com/afawcett/apex-toolingapi/issues/19
740
- error += "Please verify if user has 'View All Data' permission."
741
-
742
- self.report.warning(message=error, exc=e)
743
- else:
744
- customFields = {
745
- record["DeveloperName"]: record
746
- for record in sObject_custom_fields_response["records"]
747
- }
998
+ self.report.warning(
999
+ message="Failed to get calculated field formulae",
1000
+ context=sObjectName,
1001
+ exc=e,
1002
+ )
1003
+ return calculated_fields
748
1004
 
1005
+ def get_schema_metadata_workunit(
1006
+ self,
1007
+ sObjectName: str,
1008
+ all_fields: List[EntityParticle],
1009
+ custom_fields: Dict[str, CustomField],
1010
+ customObject: Optional[CustomObject],
1011
+ datasetUrn: str,
1012
+ calculated_field_formulae: Dict[str, str],
1013
+ ) -> Iterable[MetadataWorkUnit]:
749
1014
  fields: List[SchemaFieldClass] = []
750
1015
  primaryKeys: List[str] = []
751
1016
  foreignKeys: List[ForeignKeyConstraintClass] = []
752
1017
 
753
- for field in sObject_fields_response["records"]:
754
- customField = customFields.get(field["DeveloperName"], {})
1018
+ for field in all_fields:
1019
+ customField = custom_fields.get(field["DeveloperName"])
755
1020
 
756
1021
  fieldName = field["QualifiedApiName"]
757
1022
  fieldType = field["DataType"]
@@ -761,20 +1026,21 @@ class SalesforceSource(Source):
761
1026
  continue
762
1027
 
763
1028
  schemaField: SchemaFieldClass = self._get_schema_field(
764
- sObjectName, fieldName, fieldType, field, customField
1029
+ sObjectName,
1030
+ fieldName,
1031
+ fieldType,
1032
+ field,
1033
+ customField,
1034
+ calculated_field_formulae.get(fieldName),
765
1035
  )
766
1036
  fields.append(schemaField)
767
1037
 
768
1038
  if fieldType == "id":
769
1039
  primaryKeys.append(fieldName)
770
1040
 
771
- if (
772
- fieldType == "reference"
773
- and field["ReferenceTo"]["referenceTo"] is not None
774
- ):
775
- foreignKeys.extend(
776
- list(self.get_foreign_keys_from_field(fieldName, field, datasetUrn))
777
- )
1041
+ foreignKeys.extend(
1042
+ list(self.get_foreign_keys_from_field(fieldName, field, datasetUrn))
1043
+ )
778
1044
 
779
1045
  schemaMetadata = SchemaMetadataClass(
780
1046
  schemaName="",
@@ -788,7 +1054,11 @@ class SalesforceSource(Source):
788
1054
  )
789
1055
 
790
1056
  # Created Date and Actor are available for Custom Object only
791
- if customObject.get("CreatedDate") and customObject.get("CreatedBy"):
1057
+ if (
1058
+ customObject
1059
+ and customObject.get("CreatedDate")
1060
+ and customObject.get("CreatedBy")
1061
+ ):
792
1062
  schemaMetadata.created = self.get_audit_stamp(
793
1063
  customObject["CreatedDate"], customObject["CreatedBy"]["Username"]
794
1064
  )
@@ -799,26 +1069,31 @@ class SalesforceSource(Source):
799
1069
  ).as_workunit()
800
1070
 
801
1071
  def get_foreign_keys_from_field(
802
- self, fieldName: str, field: dict, datasetUrn: str
1072
+ self, fieldName: str, field: EntityParticle, datasetUrn: str
803
1073
  ) -> Iterable[ForeignKeyConstraintClass]:
804
- # https://developer.salesforce.com/docs/atlas.en-us.object_reference.meta/object_reference/field_types.htm#i1435823
805
- foreignDatasets = [
806
- builder.make_dataset_urn_with_platform_instance(
807
- self.platform,
808
- fsObject,
809
- self.config.platform_instance,
810
- self.config.env,
811
- )
812
- for fsObject in field["ReferenceTo"]["referenceTo"]
813
- ]
814
-
815
- for foreignDataset in foreignDatasets:
816
- yield ForeignKeyConstraintClass(
817
- name=field["RelationshipName"] if field.get("RelationshipName") else "",
818
- foreignDataset=foreignDataset,
819
- foreignFields=[builder.make_schema_field_urn(foreignDataset, "Id")],
820
- sourceFields=[builder.make_schema_field_urn(datasetUrn, fieldName)],
821
- )
1074
+ if (
1075
+ field["DataType"] == "reference"
1076
+ and field["ReferenceTo"]
1077
+ and field["ReferenceTo"]["referenceTo"] is not None
1078
+ ):
1079
+ # https://developer.salesforce.com/docs/atlas.en-us.object_reference.meta/object_reference/field_types.htm#i1435823
1080
+ foreignDatasets = [
1081
+ builder.make_dataset_urn_with_platform_instance(
1082
+ self.platform,
1083
+ fsObject,
1084
+ self.config.platform_instance,
1085
+ self.config.env,
1086
+ )
1087
+ for fsObject in field["ReferenceTo"]["referenceTo"]
1088
+ ]
1089
+
1090
+ for foreignDataset in foreignDatasets:
1091
+ yield ForeignKeyConstraintClass(
1092
+ name=field["RelationshipName"] if field["RelationshipName"] else "",
1093
+ foreignDataset=foreignDataset,
1094
+ foreignFields=[builder.make_schema_field_urn(foreignDataset, "Id")],
1095
+ sourceFields=[builder.make_schema_field_urn(datasetUrn, fieldName)],
1096
+ )
822
1097
 
823
1098
  def get_report(self) -> SourceReport:
824
1099
  return self.report