acryl-datahub-cloud 0.3.10rc4__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub-cloud has been flagged as potentially problematic.

Files changed (243)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
  3. acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
  4. acryl_datahub_cloud/action_request/action_request_owner_source.py +37 -8
  5. acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
  6. acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
  7. acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
  8. acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
  9. acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
  10. acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
  11. acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
  12. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +39 -19
  13. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +60 -25
  14. acryl_datahub_cloud/datahub_reporting/extract_graph.py +9 -3
  15. acryl_datahub_cloud/datahub_reporting/extract_sql.py +248 -52
  16. acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
  17. acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
  18. acryl_datahub_cloud/datahub_restore/source.py +3 -2
  19. acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
  20. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
  21. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +532 -109
  22. acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
  23. acryl_datahub_cloud/graphql_utils.py +64 -0
  24. acryl_datahub_cloud/lineage_features/source.py +555 -49
  25. acryl_datahub_cloud/metadata/_urns/urn_defs.py +2390 -1938
  26. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
  27. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
  28. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  29. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +6 -2
  30. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  31. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
  32. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
  33. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
  34. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  35. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
  36. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
  37. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
  38. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  39. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +14 -0
  40. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
  41. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  42. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/monitor/__init__.py +6 -0
  43. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
  44. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  45. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  46. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  47. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
  48. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  49. acryl_datahub_cloud/metadata/schema.avsc +27843 -23200
  50. acryl_datahub_cloud/metadata/schema_classes.py +29901 -24310
  51. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
  52. acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
  53. acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
  54. acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
  55. acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
  56. acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
  57. acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
  58. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +375 -212
  59. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
  60. acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
  61. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +191 -21
  62. acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
  63. acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
  64. acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
  65. acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
  66. acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
  67. acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
  68. acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
  69. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  70. acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
  71. acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  72. acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
  73. acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
  74. acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
  75. acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  76. acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
  77. acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
  78. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
  79. acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
  80. acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
  81. acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
  82. acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
  83. acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +2 -1
  84. acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
  85. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
  86. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
  87. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
  88. acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
  89. acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
  90. acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  91. acryl_datahub_cloud/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  92. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  93. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
  94. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  95. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  96. acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  97. acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
  98. acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
  99. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
  100. acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
  101. acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  102. acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
  103. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
  104. acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
  105. acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +4 -2
  106. acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
  107. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
  108. acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
  109. acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  110. acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
  111. acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
  112. acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
  113. acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
  114. acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
  115. acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
  116. acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  117. acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  118. acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  119. acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  120. acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  121. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  122. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  123. acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  124. acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  125. acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  126. acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
  127. acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
  128. acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
  129. acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
  130. acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
  131. acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
  132. acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
  133. acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
  134. acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
  135. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
  136. acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
  137. acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  138. acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
  139. acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  140. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
  141. acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
  142. acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
  143. acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
  144. acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
  145. acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
  146. acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
  147. acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
  148. acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
  149. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
  150. acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
  151. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
  152. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  153. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
  154. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
  155. acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
  156. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
  157. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +424 -97
  158. acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +65 -44
  159. acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  160. acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +84 -29
  161. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +221 -23
  162. acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
  163. acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
  164. acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
  165. acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
  166. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
  167. acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
  168. acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
  169. acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +4 -2
  170. acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
  171. acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  172. acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
  173. acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  174. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
  175. acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
  176. acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
  177. acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
  178. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
  179. acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  180. acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
  181. acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
  182. acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +147 -0
  183. acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
  184. acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
  185. acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
  186. acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
  187. acryl_datahub_cloud/metadata/schemas/__init__.py +3 -3
  188. acryl_datahub_cloud/notifications/__init__.py +0 -0
  189. acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
  190. acryl_datahub_cloud/sdk/__init__.py +69 -0
  191. acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
  192. acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
  193. acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
  194. acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
  195. acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
  196. acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
  197. acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
  198. acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
  199. acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
  200. acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
  201. acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
  202. acryl_datahub_cloud/sdk/assertion/types.py +20 -0
  203. acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
  204. acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
  205. acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
  206. acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
  207. acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
  208. acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
  209. acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
  210. acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
  211. acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
  212. acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
  213. acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
  214. acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
  215. acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
  216. acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
  217. acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
  218. acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
  219. acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
  220. acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
  221. acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
  222. acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
  223. acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
  224. acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
  225. acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
  226. acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
  227. acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
  228. acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
  229. acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
  230. acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
  231. acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
  232. acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
  233. acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
  234. acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
  235. acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
  236. acryl_datahub_cloud/sdk/errors.py +34 -0
  237. acryl_datahub_cloud/sdk/resolver_client.py +42 -0
  238. acryl_datahub_cloud/sdk/subscription_client.py +737 -0
  239. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +49 -43
  240. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +243 -145
  241. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
  242. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
  243. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/lineage_features/source.py
@@ -1,11 +1,27 @@
 import logging
 import os
+import threading
+import time
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
-from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Set
+from typing import Any, Callable, Dict, Iterable, List, Optional, Set
 
 from opensearchpy import OpenSearch
+from opensearchpy.exceptions import (
+    ConnectionError as OpenSearchConnectionError,
+    ConnectionTimeout,
+    RequestError,
+    TransportError,
+)
+from pydantic import field_validator
+from tenacity import (
+    before_sleep_log,
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
 
 from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
 from acryl_datahub_cloud.elasticsearch.graph_service import ElasticGraphRow
@@ -20,6 +36,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.schema_classes import AuditStampClass, LineageFeaturesClass
 
 logger = logging.getLogger(__name__)
@@ -28,23 +45,71 @@ SYSTEM_ACTOR = "urn:li:corpuser:__datahub_system"
 
 
 class LineageFeaturesSourceConfig(ConfigModel):
+    enabled: bool = True
+    materialize_entities: bool = False
     search_index: ElasticSearchClientConfig = ElasticSearchClientConfig()
     query_timeout: int = 30
-    extract_batch_size: int = 2000
+    extract_batch_size: int = 3000
+    max_retries: int = 3
+    retry_delay_seconds: int = 5
+    retry_backoff_multiplier: float = 2.0
+
+    # Cleanup old features when they have not been updated for this many days
+    # This is required because we only emit this feature for cases where we find a lineage
+    # in the graph index
+    cleanup_batch_size: int = 100
+    cleanup_old_features_days: int = 2
+
+    @field_validator("max_retries")
+    @classmethod
+    def validate_max_retries(cls, v: int) -> int:
+        if v < 1:
+            raise ValueError("max_retries must be at least 1")
+        return v
+
+    @field_validator("retry_delay_seconds")
+    @classmethod
+    def validate_retry_delay_seconds(cls, v: int) -> int:
+        if v < 1:
+            raise ValueError("retry_delay_seconds must be at least 1")
+        return v
+
+    @field_validator("retry_backoff_multiplier")
+    @classmethod
+    def validate_retry_backoff_multiplier(cls, v: float) -> float:
+        if v < 1.0:
+            raise ValueError("retry_backoff_multiplier must be at least 1.0")
+        return v
 
 
 @dataclass
-class LineageExtractGraphSourceReport(SourceReport):
+class LineageExtractGraphSourceReport(SourceReport, IngestionStageReport):
+    valid_urns_count: int = 0
+    upstream_count: int = 0
+    downstream_count: int = 0
     edges_scanned: int = 0
+    skipped_materialized_urns_count: int = 0
+    zero_upstream_count: int = 0
+    zero_downstream_count: int = 0
+    has_asset_level_lineage_count: int = 0
+    zero_asset_level_lineage_count: int = 0
+    cleanup_old_features_time: int = 0
+    cleanup_old_features_count: int = 0
 
 
 @platform_name(id="datahub", platform_name="DataHub")
 @config_class(LineageFeaturesSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 class DataHubLineageFeaturesSource(Source):
+    """
+    DataHub Lineage Features Source that extracts lineage information from Elasticsearch/OpenSearch.
+    """
+
     platform = "datahub"
 
-    def __init__(self, config: LineageFeaturesSourceConfig, ctx: PipelineContext):
+    def __init__(
+        self, config: LineageFeaturesSourceConfig, ctx: PipelineContext
+    ) -> None:
         super().__init__(ctx)
         self.config: LineageFeaturesSourceConfig = config
         self.report = LineageExtractGraphSourceReport()
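
The new retry knobs above are guarded by pydantic v2 `field_validator` hooks, so an invalid value fails when the config is parsed rather than mid-run. A minimal sketch of the same pattern on a standalone model (the `RetrySettings` model here is hypothetical, not part of the package):

    from pydantic import BaseModel, ValidationError, field_validator

    class RetrySettings(BaseModel):
        # Hypothetical stand-in for the validated fields on LineageFeaturesSourceConfig
        max_retries: int = 3
        retry_backoff_multiplier: float = 2.0

        @field_validator("max_retries")
        @classmethod
        def validate_max_retries(cls, v: int) -> int:
            if v < 1:
                raise ValueError("max_retries must be at least 1")
            return v

    try:
        RetrySettings(max_retries=0)
    except ValidationError as err:
        print(err)  # rejected at parse time: "max_retries must be at least 1"
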
@@ -53,34 +118,337 @@ class DataHubLineageFeaturesSource(Source):
         self.valid_urns: Set[str] = set()
         self.upstream_counts: Dict[str, int] = defaultdict(int)
         self.downstream_counts: Dict[str, int] = defaultdict(int)
+        self.last_print_time = time.time()
+        # Lock for thread-safe updates to shared state
+        self._process_lock = threading.Lock()
+
+    def _get_retry_decorator(
+        self,
+    ) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+        """Create a retry decorator based on config parameters"""
+
+        def should_retry_exception(exception: Exception) -> bool:
+            """Custom retry predicate for OpenSearch exceptions"""
+            if isinstance(
+                exception,
+                (
+                    OpenSearchConnectionError,
+                    ConnectionTimeout,
+                    RequestError,
+                    TransportError,
+                ),
+            ):
+                return True
+            # Also retry on general connection and timeout errors
+            if isinstance(exception, (ConnectionError, TimeoutError)):
+                return True
+            return False
+
+        return retry(
+            retry=retry_if_exception_type(
+                (
+                    OpenSearchConnectionError,
+                    ConnectionTimeout,
+                    RequestError,
+                    TransportError,
+                    ConnectionError,
+                    TimeoutError,
+                )
+            ),
+            stop=stop_after_attempt(self.config.max_retries),
+            wait=wait_exponential(
+                multiplier=self.config.retry_backoff_multiplier,
+                min=self.config.retry_delay_seconds,
+                max=30,
+            ),
+            before_sleep=before_sleep_log(logger, logging.WARNING),
+            reraise=True,
+        )
+
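
With the defaults above (`max_retries=3`, `retry_delay_seconds=5`, `retry_backoff_multiplier=2.0`), tenacity makes at most three attempts and sleeps an exponentially growing interval between them, clamped to the `[retry_delay_seconds, 30]` second window. A stripped-down sketch of the same decorator shape, using a hypothetical `flaky_search` stand-in:

    import logging
    from tenacity import (
        before_sleep_log, retry, retry_if_exception_type,
        stop_after_attempt, wait_exponential,
    )

    logger = logging.getLogger(__name__)

    @retry(
        retry=retry_if_exception_type((ConnectionError, TimeoutError)),
        stop=stop_after_attempt(3),  # config.max_retries
        wait=wait_exponential(multiplier=2.0, min=5, max=30),  # backoff clamped to 5..30s
        before_sleep=before_sleep_log(logger, logging.WARNING),
        reraise=True,  # surface the last exception instead of tenacity's RetryError
    )
    def flaky_search() -> dict:
        # Hypothetical stand-in for server.search(...); raises to trigger the retry path
        raise TimeoutError("simulated transient failure")
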
+    def _search_with_retry(
+        self,
+        server: OpenSearch,
+        index: str,
+        query: dict,
+        batch_size: int,
+        scroll: Optional[str] = None,
+    ) -> dict:
+        """Execute search with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _search() -> dict:
+            logger.debug(f"Executing search with batch size: {batch_size}")
+            search_params: dict = {"timeout": self.config.query_timeout}
+            if scroll:
+                search_params["scroll"] = scroll
+            return server.search(
+                index=index,
+                body=query,
+                size=batch_size,
+                params=search_params,
+            )
+
+        return _search()
+
+    def _scroll_with_retry(
+        self, server: OpenSearch, scroll_id: str, scroll: str = "10m"
+    ) -> dict:
+        """Execute scroll with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _scroll() -> dict:
+            logger.debug(f"Executing scroll with scroll_id: {scroll_id}")
+            return server.scroll(
+                scroll_id=scroll_id,
+                scroll=scroll,
+                params={"timeout": self.config.query_timeout},
+            )
+
+        return _scroll()
+
+    def _clear_scroll_with_retry(self, server: OpenSearch, scroll_id: str) -> None:
+        """Clear scroll context with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _clear_scroll() -> None:
+            logger.debug(f"Clearing scroll: {scroll_id}")
+            server.clear_scroll(scroll_id=scroll_id)
+            logger.debug(f"Successfully cleared scroll: {scroll_id}")
+
+        _clear_scroll()
+
+    def _create_opensearch_client_with_retry(self) -> OpenSearch:
+        """Create OpenSearch client with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _create_client() -> OpenSearch:
+            logger.info(
+                f"Creating OpenSearch client for endpoint: {self.config.search_index.endpoint}"
+            )
+            return OpenSearch(
+                [self.config.search_index.endpoint],
+                http_auth=(
+                    self.config.search_index.username,
+                    self.config.search_index.password,
+                ),
+                use_ssl=self.config.search_index.use_ssl,
+            )
+
+        return _create_client()
+
+    def _get_index_shard_count_with_retry(self, server: OpenSearch, index: str) -> int:
+        """Get the number of primary shards for an index with retry logic.
+
+        Handles both direct index names and aliases. If an alias is provided,
+        the actual index name is resolved from the response.
+        """
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _get_shard_count() -> int:
+            logger.debug(f"Getting shard count for index/alias: {index}")
+            index_settings = server.indices.get_settings(index=index)
+
+            # Handle alias resolution: get_settings returns the actual index name as key
+            # even if we pass an alias. Get the first (and typically only) key from the response.
+            actual_index_names = list(index_settings.keys())
+            if not actual_index_names:
+                raise ValueError(f"No index found for: {index}")
+
+            # If alias resolves to multiple indices, use the first one
+            # (shouldn't happen with proper alias configuration, but handle gracefully)
+            actual_index_name = actual_index_names[0]
+            if len(actual_index_names) > 1:
+                logger.warning(
+                    f"Alias {index} resolves to {len(actual_index_names)} indices: {actual_index_names}. "
+                    f"Using first index: {actual_index_name}"
+                )
+
+            # Extract number_of_shards from the settings
+            # The structure is: index_settings[index_name]['settings']['index']['number_of_shards']
+            number_of_shards = int(
+                index_settings[actual_index_name]["settings"]["index"][
+                    "number_of_shards"
+                ]
+            )
+
+            if actual_index_name != index:
+                logger.info(
+                    f"Alias {index} resolved to index {actual_index_name}, which has {number_of_shards} primary shards"
+                )
+            else:
+                logger.info(f"Index {index} has {number_of_shards} primary shards")
+
+            return number_of_shards
+
+        return _get_shard_count()
+
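
As the comments in `_get_shard_count` note, `indices.get_settings` keys its response by the concrete index name even when queried via an alias, and shard counts come back as strings. A toy walk-through on a canned response dict (the index name is made up):

    # Canned get_settings-style response, keyed by the resolved index name
    index_settings = {
        "graph_service_v1_000001": {
            "settings": {"index": {"number_of_shards": "6", "number_of_replicas": "1"}}
        }
    }

    actual_index_name = next(iter(index_settings))  # alias -> concrete index
    number_of_shards = int(
        index_settings[actual_index_name]["settings"]["index"]["number_of_shards"]
    )
    assert number_of_shards == 6
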
+    def _get_data_node_count_with_retry(self, server: OpenSearch) -> int:
+        """Get the number of data nodes in the cluster with retry logic."""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _get_data_node_count() -> int:
+            logger.debug("Getting data node count from cluster")
+            # Get node information including roles
+            nodes_info = server.nodes.info()
+
+            data_node_count = 0
+            nodes = nodes_info.get("nodes", {})
+
+            if not nodes:
+                logger.warning("No nodes found in cluster info")
+                return 0
+
+            for _, node_info in nodes.items():
+                # Check if node has data role
+                # Roles can be a list of strings or a dict with boolean flags
+                roles = node_info.get("roles", [])
+
+                # Handle both list format ["data", "ingest"] and dict format {"data": true, "ingest": true}
+                is_data_node = False
+                if isinstance(roles, list):
+                    # List format: check if any data role is present
+                    is_data_node = any(
+                        role in roles
+                        for role in [
+                            "data",
+                            "data_content",
+                            "data_hot",
+                            "data_warm",
+                            "data_cold",
+                        ]
+                    )
+                elif isinstance(roles, dict):
+                    # Dict format: check if any data role is True
+                    is_data_node = any(
+                        roles.get(role, False)
+                        for role in [
+                            "data",
+                            "data_content",
+                            "data_hot",
+                            "data_warm",
+                            "data_cold",
+                        ]
+                    )
+
+                if is_data_node:
+                    data_node_count += 1
+
+            if data_node_count == 0:
+                logger.warning(
+                    "No data nodes detected in cluster - will use single-threaded processing"
+                )
+            else:
+                logger.info(f"Cluster has {data_node_count} data node(s)")
+
+            return data_node_count
+
+        return _get_data_node_count()
+
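
The role check above has to cope with `nodes.info()` payloads that report roles either as a list of strings or as a dict of boolean flags. That normalization can be exercised in isolation; a small sketch with canned payloads:

    DATA_ROLES = ("data", "data_content", "data_hot", "data_warm", "data_cold")

    def is_data_node(roles: object) -> bool:
        # Mirrors the branch above: list format vs. dict-of-flags format
        if isinstance(roles, list):
            return any(role in roles for role in DATA_ROLES)
        if isinstance(roles, dict):
            return any(roles.get(role, False) for role in DATA_ROLES)
        return False

    assert is_data_node(["data_hot", "ingest"])          # list format
    assert is_data_node({"data": True, "ingest": True})  # dict format
    assert not is_data_node(["master"])
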
+    def _update_report(self) -> None:
+        """
+        Information to see whether we are close to hitting the memory limits
+        """
+        self.report.valid_urns_count = len(self.valid_urns)
+        self.report.upstream_count = len(self.upstream_counts.keys())
+        self.report.downstream_count = len(self.downstream_counts.keys())
+
+    def _print_report(self) -> None:
+        """
+        Printing is required like this because the report is only printed
+        when the workunits are yielded
+        In case of background processes we won't know the progress if this is not done
+        """
+        # Thread-safe: protect access to last_print_time and report state
+        with self._process_lock:
+            time_taken = round(time.time() - self.last_print_time, 1)
+            # Print report every 2 minutes
+            if time_taken > 120:
+                self._update_report()
+                self.last_print_time = time.time()
+                logger.info(f"\n{self.report.as_string()}")
 
     def process_batch(self, results: Iterable[dict]) -> None:
+        """Process a batch of results. Thread-safe for parallel processing."""
         for doc in results:
+            self._print_report()
             row = ElasticGraphRow.from_elastic_doc(doc["_source"])
-            self.report.edges_scanned += 1
-            if (
-                row.source_urn in self.valid_urns
-                and row.destination_urn in self.valid_urns
-            ):
-                self.upstream_counts[row.source_urn] += 1
-                self.downstream_counts[row.destination_urn] += 1
+            # Thread-safe updates to shared state
+            with self._process_lock:
+                self.report.edges_scanned += 1
+                if (
+                    row.source_urn in self.valid_urns
+                    and row.destination_urn in self.valid_urns
+                ):
+                    self.upstream_counts[row.source_urn] += 1
+                    self.downstream_counts[row.destination_urn] += 1
 
-    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+    def _process_slice(
+        self,
+        server: OpenSearch,
+        index: str,
+        query: dict,
+        slice_id: int,
+        num_slices: int,
+        batch_size: int,
+        scroll: str = "10m",
+    ) -> None:
+        """Process a single slice in parallel. This method is thread-safe."""
+        # Create a copy of the base query for this slice
+        slice_query = {**query}
+
+        # Add slice parameter for parallel processing
+        # Each slice corresponds to a shard for optimal performance
+        slice_query.update({"slice": {"id": slice_id, "max": num_slices}})
+        logger.info(f"Processing slice {slice_id + 1} of {num_slices} in thread")
+
+        scroll_id = None
+        try:
+            # Initial search with scroll
+            results = self._search_with_retry(
+                server, index, slice_query, batch_size, scroll=scroll
+            )
+            scroll_id = results.get("_scroll_id")
+            self.process_batch(results["hits"]["hits"])
+
+            # Process all pages for this slice using scroll
+            while True:
+                if len(results["hits"]["hits"]) < batch_size:
+                    break
+                if not scroll_id:
+                    break
+                results = self._scroll_with_retry(server, scroll_id, scroll=scroll)
+                scroll_id = results.get("_scroll_id")
+                self.process_batch(results["hits"]["hits"])
+        finally:
+            # Clear scroll context - ensure cleanup even if exceptions occur
+            if scroll_id:
+                try:
+                    self._clear_scroll_with_retry(server, scroll_id)
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to clear scroll for slice {slice_id + 1}: {e}"
+                    )
+
+        logger.info(f"Completed processing slice {slice_id + 1} of {num_slices}")
+
+    def populate_valid_urns(self) -> None:
         graph = self.ctx.require_graph("Load non soft-deleted urns")
         for urn in graph.get_urns_by_filter(batch_size=self.config.extract_batch_size):
+            self._print_report()
             self.valid_urns.add(urn)
 
-        timestamp = datetime.now(tz=timezone.utc)
-        server = OpenSearch(
-            [self.config.search_index.endpoint],
-            http_auth=(
-                self.config.search_index.username,
-                self.config.search_index.password,
-            ),
-            use_ssl=self.config.search_index.use_ssl,
-        )
+    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        with self.report.new_stage("Load valid URNs"):
+            self.populate_valid_urns()
 
-        query = {
+        server = self._create_opensearch_client_with_retry()
+        query: Dict[str, Any] = {
             "query": {
                 "bool": {
                     "should": [
@@ -102,57 +470,195 @@
                             }
                         },
                     ],
+                    "must_not": [
+                        {"term": {"source.entityType": "schemaField"}},
+                        {"term": {"destination.entityType": "schemaField"}},
+                    ],
                 },
-            },
-            "sort": [
-                {"source.urn": {"order": "desc"}},
-                {"destination.urn": {"order": "desc"}},
-                {"relationshipType": {"order": "desc"}},
-                {"lifecycleOwner": {"order": "desc"}},
-            ],
+            }
         }
 
         index = f"{self.config.search_index.index_prefix}graph_service_v1"
-        response = server.create_pit(index, keep_alive="10m")
 
-        # TODO: Save PIT, we can resume processing based on <pit, search_after> tuple
-        pit = response.get("pit_id")
-        query.update({"pit": {"id": pit, "keep_alive": "10m"}})
+        # Get the number of data nodes and calculate slices as datanodes * 2
+        # This will be used to set the slice count for parallel search processing
+        data_node_count = self._get_data_node_count_with_retry(server)
+        num_slices = data_node_count * 2
+
+        # Add slicing for parallel search processing
+        # Slicing divides the search into multiple slices that can be processed in parallel
+        # Each slice processes a subset of the data independently
 
-        # TODO: Using slicing we can parallelize the ES calls below:
-        # https://opensearch.org/docs/latest/search-plugins/searching-data/point-in-time/#search-slicing
        batch_size = self.config.extract_batch_size
-        while True:
-            results = server.search(
-                body=query,
-                size=batch_size,
-                params={"timeout": self.config.query_timeout},
-            )
-            self.process_batch(results["hits"]["hits"])
-            if len(results["hits"]["hits"]) < batch_size:
-                break
-            query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
+        scroll = "10m"
+        with self.report.new_stage("Extract lineage features"):
+            try:
+                # Only use slicing if we have more than 1 slice (max must be > 1)
+                if num_slices > 1:
+                    logger.info(
+                        f"Using {num_slices} slices for parallel processing with threading "
+                        f"(based on {data_node_count} data node(s) * 2)"
+                    )
+                    # Process slices in parallel using ThreadPoolExecutor
+                    # Each slice runs in its own thread, truly parallelizing the OpenSearch queries
+                    with ThreadPoolExecutor(max_workers=num_slices) as executor:
+                        # Submit all slice processing tasks
+                        future_to_slice = {
+                            executor.submit(
+                                self._process_slice,
+                                server,
+                                index,
+                                query,
+                                slice_id,
+                                num_slices,
+                                batch_size,
+                                scroll,
+                            ): slice_id
+                            for slice_id in range(num_slices)
+                        }
+
+                        # Wait for all slices to complete and handle any exceptions
+                        for future in as_completed(future_to_slice):
+                            slice_id = future_to_slice[future]
+                            try:
+                                future.result()  # This will raise any exception that occurred
+                            except Exception as exc:
+                                logger.error(
+                                    f"Slice {slice_id + 1} generated an exception: {exc}"
+                                )
+                                raise
+                else:
+                    # Single slice - no slicing needed
+                    logger.info(
+                        "Processing without slicing (single slice or no data nodes)"
+                    )
+                    scroll_id = None
+                    try:
+                        # Initial search with scroll
+                        results = self._search_with_retry(
+                            server, index, query, batch_size, scroll=scroll
+                        )
+                        scroll_id = results.get("_scroll_id")
+                        self.process_batch(results["hits"]["hits"])
 
-        server.delete_pit(body={"pit_id": pit})
+                        # Process all pages using scroll
+                        while True:
+                            if len(results["hits"]["hits"]) < batch_size:
+                                break
+                            if not scroll_id:
+                                break
+                            results = self._scroll_with_retry(
+                                server, scroll_id, scroll=scroll
+                            )
+                            scroll_id = results.get("_scroll_id")
+                            self.process_batch(results["hits"]["hits"])
+                    finally:
+                        # Clear scroll context - ensure cleanup even if exceptions occur
+                        if scroll_id:
+                            try:
+                                self._clear_scroll_with_retry(server, scroll_id)
+                            except Exception as cleanup_error:
+                                logger.warning(
+                                    f"Failed to clear scroll after error: {cleanup_error}"
+                                )
+            except Exception as e:
+                logger.error(f"Error during lineage extraction: {e}")
+                self.report.report_failure(
+                    title="Lineage extraction failed",
+                    message="Failed to extract lineage features from Elasticsearch",
+                    context=f"Error: {str(e)}",
+                    exc=e,
+                )
+                raise
+        self._update_report()
 
+        with self.report.new_stage("emission of lineage features"):
+            yield from self._emit_lineage_features()
+
+        with self.report.new_stage("cleanup old lineage features"):
+            yield from self._cleanup_old_features()
+
+    def _cleanup_old_features(self) -> Iterable[MetadataWorkUnit]:
+        """
+        This is required because we only emit this feature for cases where we find a lineage
+        in the graph index
+        """
+        cutoff_time = int(
+            (time.time() - (self.config.cleanup_old_features_days * 24 * 60 * 60))
+            * 1000
+        )
+        self.report.cleanup_old_features_time = cutoff_time
+
+        for urn in self.ctx.require_graph("Cleanup old features").get_urns_by_filter(
+            extraFilters=[
+                {
+                    "field": "hasAssetLevelLineageFeature",
+                    "negated": False,
+                    "condition": "EQUAL",
+                    "values": ["true"],
+                },
+                {
+                    "field": "lineageFeaturesComputedAt",
+                    "negated": False,
+                    "condition": "LESS_THAN",
+                    "values": [str(cutoff_time)],
+                },
+            ],
+            batch_size=self.config.cleanup_batch_size,
+        ):
+            # Emit lineage features with zero upstreams and downstreams for cleanup
+            wu = MetadataChangeProposalWrapper(
+                entityUrn=urn,
+                aspect=LineageFeaturesClass(
+                    upstreamCount=0,
+                    downstreamCount=0,
+                    hasAssetLevelLineage=False,
+                    computedAt=AuditStampClass(
+                        time=int(time.time() * 1000),
+                        actor=SYSTEM_ACTOR,
+                    ),
+                ),
+            ).as_workunit()
+            self.report.cleanup_old_features_count += 1
+            self.report.report_workunit(wu)
+            yield wu
+
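
The cutoff in `_cleanup_old_features` is plain epoch-millisecond arithmetic over `cleanup_old_features_days`; entities whose `lineageFeaturesComputedAt` is older than the cutoff get their features zeroed out. Worked through with the default of 2 days:

    import time

    cleanup_old_features_days = 2  # default from LineageFeaturesSourceConfig
    cutoff_time = int((time.time() - cleanup_old_features_days * 24 * 60 * 60) * 1000)
    # e.g. at 2024-01-03T00:00:00Z (epoch 1704240000) the cutoff is
    # (1704240000 - 172800) * 1000 = 1704067200000, i.e. 2024-01-01T00:00:00Z
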
+    def _emit_lineage_features(self) -> Iterable[MetadataWorkUnit]:
         # In Python 3.9, can be replaced by `self.self.upstream_counts.keys() | self.downstream_counts.keys()`
         for urn in set(self.upstream_counts.keys()).union(
             self.downstream_counts.keys()
         ):
+            if (not self.config.materialize_entities) and urn not in self.valid_urns:
+                self.report.skipped_materialized_urns_count += 1
+                continue
             logger.debug(
                 f"{urn}: {self.upstream_counts[urn]}, {self.downstream_counts[urn]}"
             )
-            yield MetadataChangeProposalWrapper(
+            if self.upstream_counts[urn] == 0:
+                self.report.zero_upstream_count += 1
+            if self.downstream_counts[urn] == 0:
+                self.report.zero_downstream_count += 1
+            has_asset_level_lineage = (
+                self.upstream_counts[urn] > 0 or self.downstream_counts[urn] > 0
+            )
+            if has_asset_level_lineage:
+                self.report.has_asset_level_lineage_count += 1
+            else:
+                self.report.zero_asset_level_lineage_count += 1
+            wu = MetadataChangeProposalWrapper(
                 entityUrn=urn,
                 aspect=LineageFeaturesClass(
                     upstreamCount=self.upstream_counts[urn],
                     downstreamCount=self.downstream_counts[urn],
+                    hasAssetLevelLineage=has_asset_level_lineage,
                     computedAt=AuditStampClass(
-                        time=int(timestamp.timestamp() * 1000),
+                        time=int(time.time() * 1000),
                         actor=SYSTEM_ACTOR,
                     ),
                 ),
             ).as_workunit()
+            self.report.report_workunit(wu)
+            yield wu
 
     def get_report(self) -> SourceReport:
         return self.report