acryl-datahub 1.0.0rc16__py3-none-any.whl → 1.0.0rc17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -4,7 +4,7 @@ import time
4
4
  from dataclasses import dataclass, field as dataclass_field
5
5
  from datetime import datetime
6
6
  from enum import Enum
7
- from typing import Any, Dict, Iterable, List, Optional
7
+ from typing import Any, Dict, Iterable, List, Literal, Optional, TypedDict
8
8
 
9
9
  import requests
10
10
  from pydantic import Field, validator
@@ -51,6 +51,7 @@ from datahub.metadata.schema_classes import (
51
51
  BooleanTypeClass,
52
52
  BytesTypeClass,
53
53
  DataPlatformInstanceClass,
54
+ DatasetLineageTypeClass,
54
55
  DatasetProfileClass,
55
56
  DatasetPropertiesClass,
56
57
  DateTypeClass,
@@ -69,6 +70,8 @@ from datahub.metadata.schema_classes import (
69
70
  StringTypeClass,
70
71
  SubTypesClass,
71
72
  TagAssociationClass,
73
+ UpstreamClass,
74
+ UpstreamLineageClass,
72
75
  )
73
76
  from datahub.utilities import config_clean
74
77
  from datahub.utilities.lossy_collections import LossyList
@@ -151,6 +154,12 @@ class SalesforceConfig(
151
154
  description="Regex patterns for profiles to filter in ingestion, allowed by the `object_pattern`.",
152
155
  )
153
156
 
157
+ # Given lack of ERD visual graph view support, this alternate is useful.
158
+ use_referenced_entities_as_upstreams: bool = Field(
159
+ default=False,
160
+ description="(Experimental) If enabled, referenced entities will be treated as upstream entities.",
161
+ )
162
+
154
163
  def is_profiling_enabled(self) -> bool:
155
164
  return self.profiling.enabled and is_profiling_enabled(
156
165
  self.profiling.operation_config
@@ -165,6 +174,12 @@ class SalesforceConfig(
165
174
  class SalesforceSourceReport(StaleEntityRemovalSourceReport):
166
175
  filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
167
176
 
177
+ objects_with_calculated_field: LossyList[str] = dataclass_field(
178
+ default_factory=LossyList
179
+ )
180
+
181
+ num_objects_missing_formula: int = 0
182
+
168
183
  def report_dropped(self, ent_name: str) -> None:
169
184
  self.filtered.append(ent_name)
170
185
 
@@ -199,6 +214,310 @@ FIELD_TYPE_MAPPING = {
199
214
  }
200
215
 
201
216
 
217
class EntityDefinition(TypedDict):
    """One record of the Tooling API `EntityDefinition` query.

    The keys mirror the columns selected by `SalesforceApi.list_objects`.
    """

    # Identifier later used to filter EntityParticle / CustomField queries.
    DurableId: str
    # API name of the sObject; custom objects end with "__c".
    QualifiedApiName: str
    DeveloperName: str
    Label: str
    PluralLabel: str
    InternalSharingModel: str
    ExternalSharingModel: str
    # Common values for DeploymentStatus
    DeploymentStatus: Literal["Deployed", "InDevelopment"]
228
+
229
+
230
class UserInfo(TypedDict):
    """Nested user record returned for `CreatedBy.Username` / `LastModifiedBy.Username`."""

    Username: str
232
+
233
+
234
class FieldDefinition(TypedDict):
    """Nested `FieldDefinition.*` columns selected alongside each EntityParticle."""

    DataType: str
    LastModifiedDate: str
    LastModifiedBy: UserInfo
    IsIndexed: bool
    # The two values below may be null in the API response.
    ComplianceGroup: Optional[str]
    Description: Optional[str]
241
+
242
+
243
class ReferenceTo(TypedDict):
    """Value of an EntityParticle's `ReferenceTo` column."""

    # Names of the sObjects that a reference field can point at.
    referenceTo: List[str]
245
+
246
+
247
class EntityParticle(TypedDict):
    """One field of an sObject, as returned by the Tooling API `EntityParticle` query.

    The keys mirror the columns selected by `SalesforceApi.get_fields_for_object`.
    """

    # Naming
    QualifiedApiName: str
    DeveloperName: str
    Label: str

    # Type and size information; the numeric attributes may be null.
    DataType: str
    Precision: Optional[int]
    Scale: Optional[int]
    Length: Optional[int]
    Digits: Optional[int]

    # Structural flags
    IsUnique: bool
    IsCompound: bool
    IsComponent: bool

    # Relationship information, populated for reference fields.
    ReferenceTo: Optional[ReferenceTo]
    RelationshipName: Optional[str]

    IsNillable: bool
    InlineHelpText: Optional[str]
    # True for calculated (formula) fields.
    IsCalculated: bool
    FieldDefinition: FieldDefinition
265
+
266
+
267
class CustomObject(TypedDict):
    """One record of the Tooling API `CustomObject` query.

    The keys mirror the columns selected by `SalesforceApi.get_custom_object_details`.
    """

    Description: Optional[str]
    Language: str
    ManageableState: Literal["unmanaged", "installed", "beta", "released"]
    # Audit information
    CreatedDate: str
    CreatedBy: UserInfo
    LastModifiedDate: str
    LastModifiedBy: UserInfo
275
+
276
+
277
class CustomField(TypedDict):
    """One record of the Tooling API `CustomField` query.

    The keys mirror the columns selected by
    `SalesforceApi.get_custom_fields_for_object`.
    """

    DeveloperName: str
    CreatedDate: str
    CreatedBy: UserInfo
    InlineHelpText: Optional[str]
    LastModifiedDate: str
    LastModifiedBy: UserInfo
284
+
285
+
286
class SObjectRecordCount(TypedDict):
    """One entry of the `sObjects` list returned by the `limits/recordCount` endpoint."""

    count: int
    name: str
289
+
290
+
291
class SObjectField(TypedDict):
    """Subset of a field entry from the sObject describe response."""

    name: str
    # Formula text of the field; may be null.
    calculatedFormula: Optional[str]
294
+
295
+
296
class SObjectDescribe(TypedDict):
    """Trimmed describe result: only the `fields` list is kept by `describe_object`."""

    fields: List[SObjectField]
298
+
299
+
300
class SalesforceApi:
    """Wrapper around an authenticated `Salesforce` client.

    Groups the REST / Tooling API calls issued by the Salesforce source and
    returns their JSON payloads as typed dictionaries.
    """

    def __init__(
        self, sf: Salesforce, config: SalesforceConfig, report: SalesforceSourceReport
    ) -> None:
        """Store the client and derive the versioned REST base URL from it."""
        self.config = config
        self.report = report
        self.sf = sf
        self.base_url = (
            f"https://{self.sf.sf_instance}/services/data/v{self.sf.sf_version}/"
        )

    @staticmethod
    def create_salesforce_client(config: SalesforceConfig) -> Salesforce:
        """Build a `Salesforce` client for the auth type selected in `config`.

        After login, the client's API version is resolved through
        `update_salesforce_api_version`.

        Raises:
            AssertionError: a credential required by the auth type is missing.
            ValueError: `config.auth` is not one of the handled auth types.
        """
        common_args: Dict[str, Any] = {
            "domain": "test" if config.is_sandbox else None,
            "session": requests.Session(),
        }
        if config.api_version:
            common_args["version"] = config.api_version

        if config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN:
            logger.debug("Access Token Provided in Config")
            assert config.access_token is not None, (
                "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
            )
            assert config.instance_url is not None, (
                "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
            )

            sf = Salesforce(
                instance_url=config.instance_url,
                session_id=config.access_token,
                **common_args,
            )
        elif config.auth is SalesforceAuthType.USERNAME_PASSWORD:
            logger.debug("Username/Password Provided in Config")
            assert config.username is not None, (
                "Config username is required for USERNAME_PASSWORD auth"
            )
            assert config.password is not None, (
                "Config password is required for USERNAME_PASSWORD auth"
            )
            assert config.security_token is not None, (
                "Config security_token is required for USERNAME_PASSWORD auth"
            )

            sf = Salesforce(
                username=config.username,
                password=config.password,
                security_token=config.security_token,
                **common_args,
            )
        elif config.auth is SalesforceAuthType.JSON_WEB_TOKEN:
            logger.debug("Json Web Token provided in the config")
            assert config.username is not None, (
                "Config username is required for JSON_WEB_TOKEN auth"
            )
            assert config.consumer_key is not None, (
                "Config consumer_key is required for JSON_WEB_TOKEN auth"
            )
            assert config.private_key is not None, (
                "Config private_key is required for JSON_WEB_TOKEN auth"
            )

            sf = Salesforce(
                username=config.username,
                consumer_key=config.consumer_key,
                privatekey=config.private_key,
                **common_args,
            )
        else:
            # Without this branch `sf` would be unbound below and surface as an
            # UnboundLocalError that says nothing about the real problem.
            raise ValueError(f"Unsupported Salesforce auth type: {config.auth}")

        SalesforceApi.update_salesforce_api_version(config, sf)

        return sf

    @staticmethod
    def update_salesforce_api_version(config: SalesforceConfig, sf: Salesforce) -> None:
        """Point `sf` at the newest REST API version unless one is configured.

        When `config.api_version` is unset, the available versions are listed
        and the numerically highest one is written to `sf.sf_version`.
        """
        if not config.api_version:
            # List all REST API versions and use latest one
            versions_url = f"https://{sf.sf_instance}/services/data/"
            versions_response = sf._call_salesforce("GET", versions_url).json()
            # Pick the highest version explicitly instead of relying on the
            # ordering of the response list.
            latest_version = max(
                versions_response, key=lambda entry: float(entry["version"])
            )
            # We could avoid setting the version after the Salesforce object has
            # already been created, since, according to the docs:
            # https://developer.salesforce.com/docs/atlas.en-us.api_rest.meta/api_rest/dome_versions.htm
            # listing the versions does not require authentication (so this call
            # could be performed before even authenticating).
            sf.sf_version = latest_version["version"]
        logger.debug(f"Using Salesforce REST API version: {sf.sf_version}")

    def list_objects(self) -> List[EntityDefinition]:
        """Return the customizable sObjects via a Tooling API SOQL query."""
        # Using Describe Global REST API returns many more objects than required.
        # Response does not have the attribute ("customizable") that can be used
        # to filter out entities not on ObjectManager UI. Hence SOQL on EntityDefinition
        # object is used instead, as suggested by salesforce support.
        # NOTE(review): only the `records` of the first response are read; confirm
        # that this query cannot be paginated by the API.
        query_url = (
            self.base_url
            + "tooling/query/?q=SELECT DurableId,QualifiedApiName,DeveloperName,"
            + "Label,PluralLabel,InternalSharingModel,ExternalSharingModel,DeploymentStatus "
            + "FROM EntityDefinition WHERE IsCustomizable = true"
        )
        entities_response = self.sf._call_salesforce("GET", query_url).json()
        records = entities_response["records"]
        logger.debug(
            f"Salesforce EntityDefinition query returned {len(records)} sObjects"
        )
        return records

    def describe_object(self, sObjectName: str) -> SObjectDescribe:
        """Call the sObject describe endpoint and keep only its `fields` list."""
        logger.debug(f"Querying Salesforce {sObjectName} describe REST API")

        describe_endpoint = f"{self.base_url}sobjects/{sObjectName}/describe/"
        response = self.sf._call_salesforce("GET", describe_endpoint)

        logger.debug(f"Received Salesforce {sObjectName} describe response")
        return {"fields": response.json()["fields"]}

    def get_custom_object_details(
        self, sObjectDeveloperName: str
    ) -> Optional[CustomObject]:
        """Return the first `CustomObject` record with this developer name, or None."""
        query_url = (
            self.base_url
            + "tooling/query/?q=SELECT Description, Language, ManageableState, "
            + "CreatedDate, CreatedBy.Username, LastModifiedDate, LastModifiedBy.Username "
            + f"FROM CustomObject where DeveloperName='{sObjectDeveloperName}'"
        )
        custom_objects_response = self.sf._call_salesforce("GET", query_url).json()
        records = custom_objects_response["records"]
        if records:
            logger.debug("Salesforce CustomObject query returned with details")
            return records[0]
        return None

    def get_fields_for_object(
        self, sObjectName: str, sObjectDurableId: str
    ) -> List[EntityParticle]:
        """Return the `EntityParticle` records of the object with `sObjectDurableId`.

        `sObjectName` is only used for logging.
        """
        sObject_fields_query_url = (
            self.base_url
            + "tooling/query?q=SELECT "
            + "QualifiedApiName,DeveloperName,Label, FieldDefinition.DataType, DataType,"
            + "FieldDefinition.LastModifiedDate, FieldDefinition.LastModifiedBy.Username,"
            + "Precision, Scale, Length, Digits ,FieldDefinition.IsIndexed, IsUnique,"
            + "IsCompound, IsComponent, ReferenceTo, FieldDefinition.ComplianceGroup,"
            + "RelationshipName, IsNillable, FieldDefinition.Description, InlineHelpText, "
            + f"IsCalculated FROM EntityParticle WHERE EntityDefinitionId='{sObjectDurableId}'"
        )

        sObject_fields_response = self.sf._call_salesforce(
            "GET", sObject_fields_query_url
        ).json()

        logger.debug(f"Received Salesforce {sObjectName} fields response")

        return sObject_fields_response["records"]

    def get_custom_fields_for_object(
        self, sObjectName: str, sObjectDurableId: str
    ) -> Dict[str, CustomField]:
        """Return the object's `CustomField` records keyed by `DeveloperName`.

        Best effort: if the query fails, a warning is added to the report and
        an empty mapping is returned.
        """
        sObject_custom_fields_query_url = (
            self.base_url
            + "tooling/query?q=SELECT "
            + "DeveloperName,CreatedDate,CreatedBy.Username,InlineHelpText,"
            + "LastModifiedDate,LastModifiedBy.Username "
            + f"FROM CustomField WHERE EntityDefinitionId='{sObjectDurableId}'"
        )

        customFields: Dict[str, CustomField] = {}
        try:
            sObject_custom_fields_response = self.sf._call_salesforce(
                "GET", sObject_custom_fields_query_url
            ).json()

            logger.debug(f"Received Salesforce {sObjectName} custom fields response")

        except Exception as e:
            error = "Salesforce CustomField query failed. "
            if "sObject type 'CustomField' is not supported." in str(e):
                # https://github.com/afawcett/apex-toolingapi/issues/19
                error += "Please verify if user has 'View All Data' permission."

            self.report.warning(message=error, exc=e)
        else:
            customFields = {
                record["DeveloperName"]: record
                for record in sObject_custom_fields_response["records"]
            }

        return customFields

    def get_approximate_record_count(self, sObjectName: str) -> SObjectRecordCount:
        """Return the first `limits/recordCount` entry for `sObjectName`.

        Raises:
            IndexError: the response contains no `sObjects` entry.
        """
        sObject_records_count_url = (
            f"{self.base_url}limits/recordCount?sObjects={sObjectName}"
        )

        sObject_record_count_response = self.sf._call_salesforce(
            "GET", sObject_records_count_url
        ).json()

        logger.debug(f"Received Salesforce {sObjectName} record count response")
        sobject_record_counts = sObject_record_count_response.get("sObjects", [])
        if not sobject_record_counts:
            # Same exception type as indexing an empty list, but with a message
            # that names the object instead of a bare "list index out of range".
            raise IndexError(
                f"Salesforce recordCount API returned no entry for {sObjectName}"
            )
        return sobject_record_counts[0]
519
+
520
+
202
521
  @platform_name("Salesforce")
203
522
  @config_class(SalesforceConfig)
204
523
  @support_status(SupportStatus.INCUBATING)
@@ -228,131 +547,43 @@ FIELD_TYPE_MAPPING = {
228
547
  description="Enabled by default",
229
548
  )
230
549
  class SalesforceSource(StatefulIngestionSourceBase):
231
- base_url: str
232
- config: SalesforceConfig
233
- report: SalesforceSourceReport
234
- session: requests.Session
235
- sf: Salesforce
236
- fieldCounts: Dict[str, int]
237
-
238
550
  def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None:
239
551
  super().__init__(config, ctx)
240
552
  self.ctx = ctx
241
553
  self.config = config
242
- self.report = SalesforceSourceReport()
243
- self.session = requests.Session()
554
+ self.report: SalesforceSourceReport = SalesforceSourceReport()
244
555
  self.platform: str = "salesforce"
245
- self.fieldCounts = {}
246
- common_args: Dict[str, Any] = {
247
- "domain": "test" if self.config.is_sandbox else None,
248
- "session": self.session,
249
- }
250
- if self.config.api_version:
251
- common_args["version"] = self.config.api_version
252
-
253
- try:
254
- if self.config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN:
255
- logger.debug("Access Token Provided in Config")
256
- assert self.config.access_token is not None, (
257
- "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
258
- )
259
- assert self.config.instance_url is not None, (
260
- "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
261
- )
556
+ self.fieldCounts: Dict[str, int] = {}
262
557
 
263
- self.sf = Salesforce(
264
- instance_url=self.config.instance_url,
265
- session_id=self.config.access_token,
266
- **common_args,
267
- )
268
- elif self.config.auth is SalesforceAuthType.USERNAME_PASSWORD:
269
- logger.debug("Username/Password Provided in Config")
270
- assert self.config.username is not None, (
271
- "Config username is required for USERNAME_PASSWORD auth"
272
- )
273
- assert self.config.password is not None, (
274
- "Config password is required for USERNAME_PASSWORD auth"
275
- )
276
- assert self.config.security_token is not None, (
277
- "Config security_token is required for USERNAME_PASSWORD auth"
278
- )
279
-
280
- self.sf = Salesforce(
281
- username=self.config.username,
282
- password=self.config.password,
283
- security_token=self.config.security_token,
284
- **common_args,
285
- )
286
-
287
- elif self.config.auth is SalesforceAuthType.JSON_WEB_TOKEN:
288
- logger.debug("Json Web Token provided in the config")
289
- assert self.config.username is not None, (
290
- "Config username is required for JSON_WEB_TOKEN auth"
291
- )
292
- assert self.config.consumer_key is not None, (
293
- "Config consumer_key is required for JSON_WEB_TOKEN auth"
294
- )
295
- assert self.config.private_key is not None, (
296
- "Config private_key is required for JSON_WEB_TOKEN auth"
297
- )
298
-
299
- self.sf = Salesforce(
300
- username=self.config.username,
301
- consumer_key=self.config.consumer_key,
302
- privatekey=self.config.private_key,
303
- **common_args,
304
- )
558
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
    """Return the inherited work unit processors followed by stale-entity removal."""
    processors = [*super().get_workunit_processors()]
    # Appended last, after the processors supplied by the base class.
    stale_entity_handler = StaleEntityRemovalHandler.create(
        self, self.config, self.ctx
    )
    processors.append(stale_entity_handler.workunit_processor)
    return processors
305
565
 
566
+ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
567
+ try:
568
+ sf = SalesforceApi.create_salesforce_client(self.config)
306
569
  except SalesforceAuthenticationFailed as e:
307
- logger.error(e)
308
570
  if "API_CURRENTLY_DISABLED" in str(e):
309
571
  # https://help.salesforce.com/s/articleView?id=001473830&type=1
310
- error = "Salesforce login failed. Please make sure user has API Enabled Access."
572
+ error = "Please make sure user has API Enabled Access."
311
573
  else:
312
- error = "Salesforce login failed. Please verify your credentials."
574
+ error = "Please verify your credentials."
313
575
  if (
314
576
  self.config.instance_url
315
577
  and "sandbox" in self.config.instance_url.lower()
316
578
  ):
317
579
  error += "Please set `is_sandbox: True` in recipe if this is sandbox account."
318
- raise ConfigurationError(error) from e
319
-
320
- if not self.config.api_version:
321
- # List all REST API versions and use latest one
322
- versions_url = "https://{instance}/services/data/".format(
323
- instance=self.sf.sf_instance,
324
- )
325
- versions_response = self.sf._call_salesforce("GET", versions_url).json()
326
- latest_version = versions_response[-1]
327
- version = latest_version["version"]
328
- # we could avoid setting the version like below (after the Salesforce object has been already initiated
329
- # above), since, according to the docs:
330
- # https://developer.salesforce.com/docs/atlas.en-us.api_rest.meta/api_rest/dome_versions.htm
331
- # we don't need to be authenticated to list the versions (so we could perform this call before even
332
- # authenticating)
333
- self.sf.sf_version = version
334
-
335
- self.base_url = "https://{instance}/services/data/v{sf_version}/".format(
336
- instance=self.sf.sf_instance, sf_version=self.sf.sf_version
337
- )
338
-
339
- logger.debug(
340
- "Using Salesforce REST API version: {version}".format(
341
- version=self.sf.sf_version
342
- )
343
- )
580
+ self.report.failure(title="Salesforce login failed", message=error, exc=e)
581
+ return
344
582
 
345
- def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
346
- return [
347
- *super().get_workunit_processors(),
348
- StaleEntityRemovalHandler.create(
349
- self, self.config, self.ctx
350
- ).workunit_processor,
351
- ]
583
+ self.sf_api = SalesforceApi(sf, self.config, self.report)
352
584
 
353
- def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
354
585
  try:
355
- sObjects = self.get_salesforce_objects()
586
+ sObjects = self.sf_api.list_objects()
356
587
  except Exception as e:
357
588
  if "sObject type 'EntityDefinition' is not supported." in str(e):
358
589
  # https://developer.salesforce.com/docs/atlas.en-us.api_tooling.meta/api_tooling/tooling_api_objects_entitydefinition.htm
@@ -366,7 +597,7 @@ class SalesforceSource(StatefulIngestionSourceBase):
366
597
  yield from self.get_salesforce_object_workunits(sObject)
367
598
 
368
599
  def get_salesforce_object_workunits(
369
- self, sObject: dict
600
+ self, sObject: EntityDefinition
370
601
  ) -> Iterable[MetadataWorkUnit]:
371
602
  sObjectName = sObject["QualifiedApiName"]
372
603
 
@@ -386,19 +617,50 @@ class SalesforceSource(StatefulIngestionSourceBase):
386
617
  self.config.env,
387
618
  )
388
619
 
389
- customObject = {}
620
+ customObject = None
390
621
  if sObjectName.endswith("__c"): # Is Custom Object
391
- customObject = self.get_custom_object_details(sObject["DeveloperName"])
622
+ customObject = self.sf_api.get_custom_object_details(
623
+ sObject["DeveloperName"]
624
+ )
392
625
 
393
626
  # Table Created, LastModified is available for Custom Object
394
627
  yield from self.get_operation_workunit(customObject, datasetUrn)
395
628
 
396
629
  yield self.get_properties_workunit(sObject, customObject, datasetUrn)
397
630
 
631
+ allFields = self.sf_api.get_fields_for_object(sObjectName, sObject["DurableId"])
632
+
633
+ customFields = self.sf_api.get_custom_fields_for_object(
634
+ sObjectName, sObject["DurableId"]
635
+ )
636
+
637
+ if any(field["IsCalculated"] for field in allFields):
638
+ # Although formula is present in Metadata column of CustomField entity,
639
+ # we can not use it as it allows querying only for one field at a time
640
+ # and that would not be performant
641
+ calculated_field_formulae = self.get_calculated_field_formulae(sObjectName)
642
+ if calculated_field_formulae:
643
+ self.report.objects_with_calculated_field.append(sObjectName)
644
+ else:
645
+ # For some objects, although some fields are calculated, formula is absent
646
+ # These are typically salesforce system calculated fields whose formula
647
+ # is not exposed
648
+ self.report.num_objects_missing_formula += 1
649
+ else:
650
+ calculated_field_formulae = {}
651
+
398
652
  yield from self.get_schema_metadata_workunit(
399
- sObjectName, sObject, customObject, datasetUrn
653
+ sObjectName,
654
+ allFields,
655
+ customFields,
656
+ customObject,
657
+ datasetUrn,
658
+ calculated_field_formulae,
400
659
  )
401
660
 
661
+ if self.config.use_referenced_entities_as_upstreams:
662
+ yield from self.get_upstream_workunit(datasetUrn, allFields)
663
+
402
664
  yield self.get_subtypes_workunit(sObjectName, datasetUrn)
403
665
 
404
666
  if self.config.platform_instance is not None:
@@ -412,39 +674,33 @@ class SalesforceSource(StatefulIngestionSourceBase):
412
674
  ):
413
675
  yield from self.get_profile_workunit(sObjectName, datasetUrn)
414
676
 
415
- def get_custom_object_details(self, sObjectDeveloperName: str) -> dict:
416
- customObject = {}
417
- query_url = (
418
- self.base_url
419
- + "tooling/query/?q=SELECT Description, Language, ManageableState, "
420
- + "CreatedDate, CreatedBy.Username, LastModifiedDate, LastModifiedBy.Username "
421
- + f"FROM CustomObject where DeveloperName='{sObjectDeveloperName}'"
422
- )
423
- custom_objects_response = self.sf._call_salesforce("GET", query_url).json()
424
- if len(custom_objects_response["records"]) > 0:
425
- logger.debug("Salesforce CustomObject query returned with details")
426
- customObject = custom_objects_response["records"][0]
427
- return customObject
428
-
429
- def get_salesforce_objects(self) -> List:
430
- # Using Describe Global REST API returns many more objects than required.
431
- # Response does not have the attribute ("customizable") that can be used
432
- # to filter out entities not on ObjectManager UI. Hence SOQL on EntityDefinition
433
- # object is used instead, as suggested by salesforce support.
434
-
435
- query_url = (
436
- self.base_url
437
- + "tooling/query/?q=SELECT DurableId,QualifiedApiName,DeveloperName,"
438
- + "Label,PluralLabel,InternalSharingModel,ExternalSharingModel,DeploymentStatus "
439
- + "FROM EntityDefinition WHERE IsCustomizable = true"
440
- )
441
- entities_response = self.sf._call_salesforce("GET", query_url).json()
442
- logger.debug(
443
- "Salesforce EntityDefinition query returned {count} sObjects".format(
444
- count=len(entities_response["records"])
445
- )
446
- )
447
- return entities_response["records"]
677
def get_upstream_workunit(
    self, datasetUrn: str, allFields: List[EntityParticle]
) -> Iterable[MetadataWorkUnit]:
    """Emit an upstream-lineage aspect built from the object's reference fields.

    Every sObject named in the `ReferenceTo` of a field whose `DataType` is
    "reference" becomes one upstream of `datasetUrn`. Nothing is yielded when
    no such object exists.
    """
    # Several fields can reference the same object. A dict keeps first-seen
    # order while ensuring each object is listed as an upstream only once.
    referenced_names: Dict[str, None] = {}
    for field in allFields:
        if field["DataType"] != "reference" or not field["ReferenceTo"]:
            continue
        for referenced_sObjectName in field["ReferenceTo"]["referenceTo"] or []:
            referenced_names.setdefault(referenced_sObjectName, None)

    upstreams: List[UpstreamClass] = [
        UpstreamClass(
            dataset=builder.make_dataset_urn_with_platform_instance(
                self.platform,
                referenced_sObjectName,
                self.config.platform_instance,
                self.config.env,
            ),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        for referenced_sObjectName in referenced_names
    ]

    if upstreams:
        yield MetadataChangeProposalWrapper(
            entityUrn=datasetUrn, aspect=UpstreamLineageClass(upstreams=upstreams)
        ).as_workunit()
448
704
 
449
705
  def get_domain_workunit(
450
706
  self, dataset_name: str, datasetUrn: str
@@ -474,11 +730,15 @@ class SalesforceSource(StatefulIngestionSourceBase):
474
730
  ).as_workunit()
475
731
 
476
732
  def get_operation_workunit(
477
- self, customObject: dict, datasetUrn: str
733
+ self, customObject: Optional[CustomObject], datasetUrn: str
478
734
  ) -> Iterable[MetadataWorkUnit]:
479
735
  reported_time: int = int(time.time() * 1000)
480
736
 
481
- if customObject.get("CreatedBy") and customObject.get("CreatedDate"):
737
+ if (
738
+ customObject
739
+ and customObject.get("CreatedBy")
740
+ and customObject.get("CreatedDate")
741
+ ):
482
742
  timestamp = self.get_time_from_salesforce_timestamp(
483
743
  customObject["CreatedDate"]
484
744
  )
@@ -521,7 +781,10 @@ class SalesforceSource(StatefulIngestionSourceBase):
521
781
  )
522
782
 
523
783
  def get_properties_workunit(
524
- self, sObject: dict, customObject: Dict[str, str], datasetUrn: str
784
+ self,
785
+ sObject: EntityDefinition,
786
+ customObject: Optional[CustomObject],
787
+ datasetUrn: str,
525
788
  ) -> MetadataWorkUnit:
526
789
  propertyLabels = {
527
790
  # from EntityDefinition
@@ -542,17 +805,18 @@ class SalesforceSource(StatefulIngestionSourceBase):
542
805
  for k, v in sObject.items()
543
806
  if k in propertyLabels and v is not None
544
807
  }
545
- sObjectProperties.update(
546
- {
547
- propertyLabels[k]: str(v)
548
- for k, v in customObject.items()
549
- if k in propertyLabels and v is not None
550
- }
551
- )
808
+ if customObject:
809
+ sObjectProperties.update(
810
+ {
811
+ propertyLabels[k]: str(v)
812
+ for k, v in customObject.items()
813
+ if k in propertyLabels and v is not None
814
+ }
815
+ )
552
816
 
553
817
  datasetProperties = DatasetPropertiesClass(
554
818
  name=sObject["Label"],
555
- description=customObject.get("Description"),
819
+ description=customObject.get("Description") if customObject else None,
556
820
  customProperties=sObjectProperties,
557
821
  )
558
822
  return MetadataChangeProposalWrapper(
@@ -577,58 +841,58 @@ class SalesforceSource(StatefulIngestionSourceBase):
577
841
  ) -> Iterable[MetadataWorkUnit]:
578
842
  # Here approximate record counts as returned by recordCount API are used as rowCount
579
843
  # In future, count() SOQL query may be used instead, if required, might be more expensive
580
- sObject_records_count_url = (
581
- f"{self.base_url}limits/recordCount?sObjects={sObjectName}"
582
- )
583
-
584
- sObject_record_count_response = self.sf._call_salesforce(
585
- "GET", sObject_records_count_url
586
- ).json()
844
+ sobject_record_count = self.sf_api.get_approximate_record_count(sObjectName)
587
845
 
588
- logger.debug(
589
- "Received Salesforce {sObject} record count response".format(
590
- sObject=sObjectName
591
- )
846
+ datasetProfile = DatasetProfileClass(
847
+ timestampMillis=int(time.time() * 1000),
848
+ rowCount=sobject_record_count["count"],
849
+ columnCount=self.fieldCounts[sObjectName],
592
850
  )
593
-
594
- for entry in sObject_record_count_response.get("sObjects", []):
595
- datasetProfile = DatasetProfileClass(
596
- timestampMillis=int(time.time() * 1000),
597
- rowCount=entry["count"],
598
- columnCount=self.fieldCounts[sObjectName],
599
- )
600
- yield MetadataChangeProposalWrapper(
601
- entityUrn=datasetUrn, aspect=datasetProfile
602
- ).as_workunit()
851
+ yield MetadataChangeProposalWrapper(
852
+ entityUrn=datasetUrn, aspect=datasetProfile
853
+ ).as_workunit()
603
854
 
604
855
  # Here field description is created from label, description and inlineHelpText
605
- def _get_field_description(self, field: dict, customField: dict) -> str:
606
- if "Label" not in field or field["Label"] is None:
607
- desc = ""
608
- elif field["Label"].startswith("#"):
609
- desc = "\\" + field["Label"]
610
- else:
611
- desc = field["Label"]
856
+ def _get_field_description(
857
+ self,
858
+ field: EntityParticle,
859
+ customField: Optional[CustomField],
860
+ formula: Optional[str],
861
+ ) -> str:
862
+ description_parts: List[str] = []
863
+
864
+ if field.get("Label") and field["Label"].startswith("#"):
865
+ description_parts.append("\\" + field["Label"])
866
+ elif field.get("Label"):
867
+ description_parts.append(field["Label"])
612
868
 
613
869
  text = field.get("FieldDefinition", {}).get("Description", None)
614
870
  if text:
615
871
  prefix = "\\" if text.startswith("#") else ""
616
- desc += f"\n\n{prefix}{text}"
872
+ description_parts.append(f"{prefix}{text}")
617
873
 
618
874
  text = field.get("InlineHelpText")
619
875
  if text:
620
876
  prefix = "\\" if text.startswith("#") else ""
621
- desc += f"\n\n{prefix}{text}"
877
+ description_parts.append(f"{prefix}{text}")
878
+
879
+ if formula:
880
+ description_parts.append(f"Formula: {formula}")
622
881
 
623
- return desc
882
+ return "\n\n".join(description_parts)
624
883
 
625
884
  # Here jsonProps is used to add additional salesforce field level properties.
626
- def _get_field_json_props(self, field: dict, customField: dict) -> str:
885
+ def _get_field_json_props(
886
+ self, field: EntityParticle, customField: Optional[CustomField]
887
+ ) -> str:
627
888
  jsonProps = {}
628
889
 
629
890
  if field.get("IsUnique"):
630
891
  jsonProps["IsUnique"] = True
631
892
 
893
+ if field.get("IsCalculated"):
894
+ jsonProps["IsCalculated"] = True
895
+
632
896
  return json.dumps(jsonProps)
633
897
 
634
898
  def _get_schema_field(
@@ -636,8 +900,9 @@ class SalesforceSource(StatefulIngestionSourceBase):
636
900
  sObjectName: str,
637
901
  fieldName: str,
638
902
  fieldType: str,
639
- field: dict,
640
- customField: dict,
903
+ field: EntityParticle,
904
+ customField: Optional[CustomField],
905
+ formula: Optional[str] = None,
641
906
  ) -> SchemaFieldClass:
642
907
  fieldPath = fieldName
643
908
 
@@ -651,7 +916,7 @@ class SalesforceSource(StatefulIngestionSourceBase):
651
916
 
652
917
  fieldTags: List[str] = self.get_field_tags(fieldName, field)
653
918
 
654
- description = self._get_field_description(field, customField)
919
+ description = self._get_field_description(field, customField, formula)
655
920
 
656
921
  schemaField = SchemaFieldClass(
657
922
  fieldPath=fieldPath,
@@ -666,11 +931,19 @@ class SalesforceSource(StatefulIngestionSourceBase):
666
931
  )
667
932
 
668
933
  # Created and LastModified Date and Actor are available for Custom Fields only
669
- if customField.get("CreatedDate") and customField.get("CreatedBy"):
934
+ if (
935
+ customField
936
+ and customField.get("CreatedDate")
937
+ and customField.get("CreatedBy")
938
+ ):
670
939
  schemaField.created = self.get_audit_stamp(
671
940
  customField["CreatedDate"], customField["CreatedBy"]["Username"]
672
941
  )
673
- if customField.get("LastModifiedDate") and customField.get("LastModifiedBy"):
942
+ if (
943
+ customField
944
+ and customField.get("LastModifiedDate")
945
+ and customField.get("LastModifiedBy")
946
+ ):
674
947
  schemaField.lastModified = self.get_audit_stamp(
675
948
  customField["LastModifiedDate"],
676
949
  customField["LastModifiedBy"]["Username"],
@@ -678,7 +951,7 @@ class SalesforceSource(StatefulIngestionSourceBase):
678
951
 
679
952
  return schemaField
680
953
 
681
- def get_field_tags(self, fieldName: str, field: dict) -> List[str]:
954
+ def get_field_tags(self, fieldName: str, field: EntityParticle) -> List[str]:
682
955
  fieldTags: List[str] = []
683
956
 
684
957
  if fieldName.endswith("__c"):
@@ -711,69 +984,39 @@ class SalesforceSource(StatefulIngestionSourceBase):
711
984
  actor=builder.make_user_urn(username),
712
985
  )
713
986
 
714
- def get_schema_metadata_workunit(
715
- self, sObjectName: str, sObject: dict, customObject: dict, datasetUrn: str
716
- ) -> Iterable[MetadataWorkUnit]:
717
- sObject_fields_query_url = (
718
- self.base_url
719
- + "tooling/query?q=SELECT "
720
- + "QualifiedApiName,DeveloperName,Label, FieldDefinition.DataType, DataType,"
721
- + "FieldDefinition.LastModifiedDate, FieldDefinition.LastModifiedBy.Username,"
722
- + "Precision, Scale, Length, Digits ,FieldDefinition.IsIndexed, IsUnique,"
723
- + "IsCompound, IsComponent, ReferenceTo, FieldDefinition.ComplianceGroup,"
724
- + "RelationshipName, IsNillable, FieldDefinition.Description, InlineHelpText "
725
- + "FROM EntityParticle WHERE EntityDefinitionId='{}'".format(
726
- sObject["DurableId"]
727
- )
728
- )
729
-
730
- sObject_fields_response = self.sf._call_salesforce(
731
- "GET", sObject_fields_query_url
732
- ).json()
733
-
734
- logger.debug(f"Received Salesforce {sObjectName} fields response")
735
-
736
- sObject_custom_fields_query_url = (
737
- self.base_url
738
- + "tooling/query?q=SELECT "
739
- + "DeveloperName,CreatedDate,CreatedBy.Username,InlineHelpText,"
740
- + "LastModifiedDate,LastModifiedBy.Username "
741
- + "FROM CustomField WHERE EntityDefinitionId='{}'".format(
742
- sObject["DurableId"]
743
- )
744
- )
987
+ def get_calculated_field_formulae(self, sObjectName: str) -> Dict[str, str]:
988
+ # extract field wise formula and return response
989
+ # Includes entries for calculated fields only
745
990
 
746
- customFields: Dict[str, Dict] = {}
991
+ calculated_fields = {}
747
992
  try:
748
- sObject_custom_fields_response = self.sf._call_salesforce(
749
- "GET", sObject_custom_fields_query_url
750
- ).json()
751
-
752
- logger.debug(
753
- "Received Salesforce {sObject} custom fields response".format(
754
- sObject=sObjectName
755
- )
756
- )
757
-
993
+ describe_object_result = self.sf_api.describe_object(sObjectName)
994
+ for field in describe_object_result["fields"]:
995
+ if field["calculatedFormula"]:
996
+ calculated_fields[field["name"]] = field["calculatedFormula"]
758
997
  except Exception as e:
759
- error = "Salesforce CustomField query failed. "
760
- if "sObject type 'CustomField' is not supported." in str(e):
761
- # https://github.com/afawcett/apex-toolingapi/issues/19
762
- error += "Please verify if user has 'View All Data' permission."
763
-
764
- self.report.warning(message=error, exc=e)
765
- else:
766
- customFields = {
767
- record["DeveloperName"]: record
768
- for record in sObject_custom_fields_response["records"]
769
- }
998
+ self.report.warning(
999
+ message="Failed to get calculated field formulae",
1000
+ context=sObjectName,
1001
+ exc=e,
1002
+ )
1003
+ return calculated_fields
770
1004
 
1005
+ def get_schema_metadata_workunit(
1006
+ self,
1007
+ sObjectName: str,
1008
+ all_fields: List[EntityParticle],
1009
+ custom_fields: Dict[str, CustomField],
1010
+ customObject: Optional[CustomObject],
1011
+ datasetUrn: str,
1012
+ calculated_field_formulae: Dict[str, str],
1013
+ ) -> Iterable[MetadataWorkUnit]:
771
1014
  fields: List[SchemaFieldClass] = []
772
1015
  primaryKeys: List[str] = []
773
1016
  foreignKeys: List[ForeignKeyConstraintClass] = []
774
1017
 
775
- for field in sObject_fields_response["records"]:
776
- customField = customFields.get(field["DeveloperName"], {})
1018
+ for field in all_fields:
1019
+ customField = custom_fields.get(field["DeveloperName"])
777
1020
 
778
1021
  fieldName = field["QualifiedApiName"]
779
1022
  fieldType = field["DataType"]
@@ -783,20 +1026,21 @@ class SalesforceSource(StatefulIngestionSourceBase):
783
1026
  continue
784
1027
 
785
1028
  schemaField: SchemaFieldClass = self._get_schema_field(
786
- sObjectName, fieldName, fieldType, field, customField
1029
+ sObjectName,
1030
+ fieldName,
1031
+ fieldType,
1032
+ field,
1033
+ customField,
1034
+ calculated_field_formulae.get(fieldName),
787
1035
  )
788
1036
  fields.append(schemaField)
789
1037
 
790
1038
  if fieldType == "id":
791
1039
  primaryKeys.append(fieldName)
792
1040
 
793
- if (
794
- fieldType == "reference"
795
- and field["ReferenceTo"]["referenceTo"] is not None
796
- ):
797
- foreignKeys.extend(
798
- list(self.get_foreign_keys_from_field(fieldName, field, datasetUrn))
799
- )
1041
+ foreignKeys.extend(
1042
+ list(self.get_foreign_keys_from_field(fieldName, field, datasetUrn))
1043
+ )
800
1044
 
801
1045
  schemaMetadata = SchemaMetadataClass(
802
1046
  schemaName="",
@@ -810,7 +1054,11 @@ class SalesforceSource(StatefulIngestionSourceBase):
810
1054
  )
811
1055
 
812
1056
  # Created Date and Actor are available for Custom Object only
813
- if customObject.get("CreatedDate") and customObject.get("CreatedBy"):
1057
+ if (
1058
+ customObject
1059
+ and customObject.get("CreatedDate")
1060
+ and customObject.get("CreatedBy")
1061
+ ):
814
1062
  schemaMetadata.created = self.get_audit_stamp(
815
1063
  customObject["CreatedDate"], customObject["CreatedBy"]["Username"]
816
1064
  )
@@ -821,26 +1069,31 @@ class SalesforceSource(StatefulIngestionSourceBase):
821
1069
  ).as_workunit()
822
1070
 
823
1071
  def get_foreign_keys_from_field(
824
- self, fieldName: str, field: dict, datasetUrn: str
1072
+ self, fieldName: str, field: EntityParticle, datasetUrn: str
825
1073
  ) -> Iterable[ForeignKeyConstraintClass]:
826
- # https://developer.salesforce.com/docs/atlas.en-us.object_reference.meta/object_reference/field_types.htm#i1435823
827
- foreignDatasets = [
828
- builder.make_dataset_urn_with_platform_instance(
829
- self.platform,
830
- fsObject,
831
- self.config.platform_instance,
832
- self.config.env,
833
- )
834
- for fsObject in field["ReferenceTo"]["referenceTo"]
835
- ]
836
-
837
- for foreignDataset in foreignDatasets:
838
- yield ForeignKeyConstraintClass(
839
- name=field["RelationshipName"] if field.get("RelationshipName") else "",
840
- foreignDataset=foreignDataset,
841
- foreignFields=[builder.make_schema_field_urn(foreignDataset, "Id")],
842
- sourceFields=[builder.make_schema_field_urn(datasetUrn, fieldName)],
843
- )
1074
+ if (
1075
+ field["DataType"] == "reference"
1076
+ and field["ReferenceTo"]
1077
+ and field["ReferenceTo"]["referenceTo"] is not None
1078
+ ):
1079
+ # https://developer.salesforce.com/docs/atlas.en-us.object_reference.meta/object_reference/field_types.htm#i1435823
1080
+ foreignDatasets = [
1081
+ builder.make_dataset_urn_with_platform_instance(
1082
+ self.platform,
1083
+ fsObject,
1084
+ self.config.platform_instance,
1085
+ self.config.env,
1086
+ )
1087
+ for fsObject in field["ReferenceTo"]["referenceTo"]
1088
+ ]
1089
+
1090
+ for foreignDataset in foreignDatasets:
1091
+ yield ForeignKeyConstraintClass(
1092
+ name=field["RelationshipName"] if field["RelationshipName"] else "",
1093
+ foreignDataset=foreignDataset,
1094
+ foreignFields=[builder.make_schema_field_urn(foreignDataset, "Id")],
1095
+ sourceFields=[builder.make_schema_field_urn(datasetUrn, fieldName)],
1096
+ )
844
1097
 
845
1098
  def get_report(self) -> SourceReport:
846
1099
  return self.report