acryl-datahub-cloud 0.3.10rc4__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

This release has been flagged as potentially problematic.

Files changed (243)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
  3. acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
  4. acryl_datahub_cloud/action_request/action_request_owner_source.py +37 -8
  5. acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
  6. acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
  7. acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
  8. acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
  9. acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
  10. acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
  11. acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
  12. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +39 -19
  13. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +60 -25
  14. acryl_datahub_cloud/datahub_reporting/extract_graph.py +9 -3
  15. acryl_datahub_cloud/datahub_reporting/extract_sql.py +248 -52
  16. acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
  17. acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
  18. acryl_datahub_cloud/datahub_restore/source.py +3 -2
  19. acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
  20. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
  21. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +532 -109
  22. acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
  23. acryl_datahub_cloud/graphql_utils.py +64 -0
  24. acryl_datahub_cloud/lineage_features/source.py +555 -49
  25. acryl_datahub_cloud/metadata/_urns/urn_defs.py +2390 -1938
  26. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
  27. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
  28. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  29. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +6 -2
  30. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  31. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
  32. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
  33. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
  34. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  35. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
  36. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
  37. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
  38. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  39. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +14 -0
  40. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
  41. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  42. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/monitor/__init__.py +6 -0
  43. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
  44. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  45. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  46. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  47. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
  48. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  49. acryl_datahub_cloud/metadata/schema.avsc +27843 -23200
  50. acryl_datahub_cloud/metadata/schema_classes.py +29901 -24310
  51. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
  52. acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
  53. acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
  54. acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
  55. acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
  56. acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
  57. acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
  58. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +375 -212
  59. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
  60. acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
  61. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +191 -21
  62. acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
  63. acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
  64. acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
  65. acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
  66. acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
  67. acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
  68. acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
  69. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  70. acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
  71. acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  72. acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
  73. acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
  74. acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
  75. acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  76. acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
  77. acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
  78. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
  79. acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
  80. acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
  81. acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
  82. acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
  83. acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +2 -1
  84. acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
  85. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
  86. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
  87. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
  88. acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
  89. acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
  90. acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  91. acryl_datahub_cloud/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  92. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  93. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
  94. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  95. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  96. acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  97. acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
  98. acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
  99. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
  100. acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
  101. acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  102. acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
  103. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
  104. acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
  105. acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +4 -2
  106. acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
  107. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
  108. acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
  109. acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  110. acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
  111. acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
  112. acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
  113. acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
  114. acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
  115. acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
  116. acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  117. acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  118. acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  119. acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  120. acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  121. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  122. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  123. acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  124. acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  125. acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  126. acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
  127. acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
  128. acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
  129. acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
  130. acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
  131. acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
  132. acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
  133. acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
  134. acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
  135. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
  136. acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
  137. acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  138. acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
  139. acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  140. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
  141. acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
  142. acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
  143. acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
  144. acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
  145. acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
  146. acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
  147. acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
  148. acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
  149. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
  150. acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
  151. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
  152. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  153. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
  154. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
  155. acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
  156. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
  157. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +424 -97
  158. acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +65 -44
  159. acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  160. acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +84 -29
  161. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +221 -23
  162. acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
  163. acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
  164. acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
  165. acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
  166. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
  167. acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
  168. acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
  169. acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +4 -2
  170. acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
  171. acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  172. acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
  173. acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  174. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
  175. acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
  176. acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
  177. acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
  178. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
  179. acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  180. acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
  181. acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
  182. acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +147 -0
  183. acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
  184. acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
  185. acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
  186. acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
  187. acryl_datahub_cloud/metadata/schemas/__init__.py +3 -3
  188. acryl_datahub_cloud/notifications/__init__.py +0 -0
  189. acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
  190. acryl_datahub_cloud/sdk/__init__.py +69 -0
  191. acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
  192. acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
  193. acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
  194. acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
  195. acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
  196. acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
  197. acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
  198. acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
  199. acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
  200. acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
  201. acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
  202. acryl_datahub_cloud/sdk/assertion/types.py +20 -0
  203. acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
  204. acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
  205. acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
  206. acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
  207. acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
  208. acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
  209. acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
  210. acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
  211. acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
  212. acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
  213. acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
  214. acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
  215. acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
  216. acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
  217. acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
  218. acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
  219. acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
  220. acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
  221. acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
  222. acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
  223. acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
  224. acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
  225. acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
  226. acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
  227. acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
  228. acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
  229. acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
  230. acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
  231. acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
  232. acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
  233. acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
  234. acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
  235. acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
  236. acryl_datahub_cloud/sdk/errors.py +34 -0
  237. acryl_datahub_cloud/sdk/resolver_client.py +42 -0
  238. acryl_datahub_cloud/sdk/subscription_client.py +737 -0
  239. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +49 -43
  240. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +243 -145
  241. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
  242. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
  243. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
@@ -4,10 +4,14 @@ import shutil
  import zipfile
  from datetime import datetime, timedelta
  from pathlib import Path
- from typing import Iterable, List, Optional
+ from typing import TYPE_CHECKING, Iterable, List, Literal, Optional

  import boto3
- from pydantic import validator
+ from botocore.exceptions import ClientError
+ from pydantic import field_validator
+
+ if TYPE_CHECKING:
+     from mypy_boto3_s3.service_resource import ObjectSummary

  from acryl_datahub_cloud.datahub_reporting.datahub_dataset import (
      DataHubBasedS3Dataset,
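The `if TYPE_CHECKING:` block added above imports the mypy-boto3 `ObjectSummary` stub for annotations only, so the stub package is not required at runtime. A minimal sketch of the same pattern (the helper below is illustrative, not part of the package):

    from typing import TYPE_CHECKING, List

    if TYPE_CHECKING:
        # Resolved only by type checkers; mypy_boto3_s3 need not be installed at runtime.
        from mypy_boto3_s3.service_resource import ObjectSummary

    def total_size(objects: List["ObjectSummary"]) -> int:
        # The string annotation keeps this importable even when the stub package is absent.
        return sum(obj.size for obj in objects)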
@@ -38,28 +42,40 @@ class S3ClientConfig(ConfigModel):


  class DataHubReportingExtractSQLSourceConfig(ConfigModel):
+     enabled: bool = True
      server: Optional[DatahubClientConfig] = None
      sql_backup_config: S3ClientConfig
      extract_sql_store: FileStoreBackedDatasetConfig
-
-     @validator("extract_sql_store", pre=True, always=True)
+     # Maximum size (in bytes) of files to stream from S3 per batch using chunked streaming.
+     # Files are streamed in 8MB chunks directly from S3 to ZIP without writing to disk, processing
+     # files in batches to limit peak memory usage. This prevents both disk pressure and excessive
+     # memory consumption during batch processing.
+     # Default: 5GB (5 * 1024 * 1024 * 1024 bytes)
+     batch_size_bytes: int = 5 * 1024 * 1024 * 1024
+
+     @field_validator("extract_sql_store", mode="before")
+     @classmethod
      def set_default_extract_soft_delete_flag(cls, v):
-         if v is not None:
-             if "dataset_registration_spec" not in v:
-                 v["dataset_registration_spec"] = DatasetRegistrationSpec(
-                     soft_deleted=False
-                 )
-             elif "soft_deleted" not in v["dataset_registration_spec"]:
-                 v["dataset_registration_spec"]["soft_deleted"] = False
+         if v is None:
+             return v
+
+         # If v is already a FileStoreBackedDatasetConfig object, skip dict-based modifications
+         if isinstance(v, FileStoreBackedDatasetConfig):
+             return v
+
+         # v is a dictionary - apply default values
+         if "dataset_registration_spec" not in v:
+             v["dataset_registration_spec"] = DatasetRegistrationSpec(soft_deleted=False)
+         elif "soft_deleted" not in v["dataset_registration_spec"]:
+             v["dataset_registration_spec"]["soft_deleted"] = False
+
+         if "file" not in v:
+             default_config = FileStoreBackedDatasetConfig.dummy()
+             v["file"] = f"{default_config.file_name}.{default_config.file_extension}"
+         else:
+             v["file_name"] = v["file"].split(".")[0]
+             v["file_extension"] = v["file"].split(".")[-1]

-         if "file" not in v:
-             default_config = FileStoreBackedDatasetConfig.dummy()
-             v["file"] = (
-                 f"{default_config.file_name}.{default_config.file_extension}"
-             )
-         else:
-             v["file_name"] = v["file"].split(".")[0]
-             v["file_extension"] = v["file"].split(".")[-1]
          return v

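The validator rewrite above follows the pydantic v1-to-v2 migration: `@validator(..., pre=True, always=True)` becomes `@field_validator(..., mode="before")` stacked on `@classmethod`. A minimal sketch of that pattern on a hypothetical model (not the package's config class):

    from pydantic import BaseModel, field_validator

    class ExampleConfig(BaseModel):
        # Hypothetical field, used only to illustrate the v2 validator style.
        file: str = "data.parquet"

        @field_validator("file", mode="before")
        @classmethod
        def ensure_extension(cls, v):
            # Runs before parsing, mirroring pydantic v1's pre=True behaviour.
            if isinstance(v, str) and "." not in v:
                return f"{v}.parquet"
            return v

    print(ExampleConfig(file="backup").file)  # backup.parquet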
@@ -112,12 +128,16 @@ class DataHubReportingExtractSQLSource(Source):

          if skip_extract:
              logger.info(
-                 f"Skipping graph extract as dataset has been updated today {ts}"
+                 f"Skipping sql extract as dataset has been updated today {ts}"
              )

          return skip_extract

      def get_workunits(self):
+         if not self.config.enabled:
+             logger.info("Source is disabled, stopping")
+             return
+
          self.graph = (
              self.ctx.require_graph("Loading default graph coordinates.")
              if self.config.server is None
@@ -161,20 +181,17 @@ class DataHubReportingExtractSQLSource(Source):

          self._clean_up_old_state(state_directory=tmp_dir)

-         files_downloaded: bool = self._download_files(
+         files_downloaded: bool = self._download_and_zip_in_batches(
              bucket=self.config.sql_backup_config.bucket,
              prefix=bucket_prefix,
-             target_dir=f"{tmp_dir}/download/",
+             batch_dir=f"{tmp_dir}/download/",
+             output_zip=f"{tmp_dir}/{output_file}",
+             batch_size_bytes=self.config.batch_size_bytes,
          )
          if not files_downloaded:
              logger.warning(f"Skipping as no files were found in {bucket_prefix}")
              return

-         self._zip_folder(
-             folder_path=f"{tmp_dir}/download",
-             output_file=f"{tmp_dir}/{output_file}",
-         )
-
          # Compute profile & schema information, this is based on the parquet files that were downloaded and not the zip file.
          # We must hard-code the local file from which the dataset will be created, otherwise the upload to s3 will be in
          # unexpected path.
@@ -205,40 +222,219 @@ class DataHubReportingExtractSQLSource(Source):
          path = Path(f"{state_directory}/download/")
          path.mkdir(parents=True, exist_ok=True)

-     def _download_files(self, bucket: str, prefix: str, target_dir: str) -> bool:
-         objects = boto3.resource("s3").Bucket(bucket).objects.filter(Prefix=prefix)
+     @staticmethod
+     def _stream_file_to_zip_from_local(
+         local_file_path: str,
+         zipf: zipfile.ZipFile,
+         file_name: str,
+         chunk_size: int,
+     ) -> None:
+         """Stream file from local disk to ZIP using chunked reads."""
+         with (
+             open(local_file_path, "rb") as local_file,
+             zipf.open(file_name, "w") as zip_entry,
+         ):
+             while True:
+                 chunk = local_file.read(chunk_size)
+                 if not chunk:
+                     break
+                 zip_entry.write(chunk)
+
+     def _stream_file_to_zip_from_s3(
+         self,
+         bucket: str,
+         file_key: str,
+         zipf: zipfile.ZipFile,
+         file_name: str,
+         chunk_size: int,
+     ) -> None:
+         """Stream file from S3 to ZIP using chunked reads."""
+         s3_response = self.s3_client.get_object(Bucket=bucket, Key=file_key)
+         body_stream = s3_response["Body"]
+
+         with zipf.open(file_name, "w") as zip_entry:
+             while True:
+                 chunk = body_stream.read(chunk_size)
+                 if not chunk:
+                     break
+                 zip_entry.write(chunk)

-         files_downloaded = False
+     @staticmethod
+     def _group_objects_into_batches(
+         objects: List["ObjectSummary"], batch_size_bytes: int
+     ) -> List[List["ObjectSummary"]]:
+         """
+         Group S3 objects into batches based on cumulative size.
+
+         Files larger than batch_size_bytes get their own batch.
+         """
+         batches: List[List["ObjectSummary"]] = []
+         current_batch: List["ObjectSummary"] = []
+         current_batch_size = 0

-         # Iterate over objects in the time partition path
          for obj in objects:
-             # Extract file key
-             file_key = obj.key
+             obj_size = obj.size
+
+             # If file is larger than batch size, give it its own batch
+             if obj_size > batch_size_bytes:
+                 if current_batch:
+                     batches.append(current_batch)
+                     current_batch = []
+                     current_batch_size = 0
+
+                 batches.append([obj])  # Solo batch for large file
+                 logger.warning(
+                     f"File {obj.key} ({obj_size / (1024**2):.2f} MB) exceeds batch size "
+                     f"({batch_size_bytes / (1024**2):.2f} MB), processing in separate batch"
+                 )
+                 continue
+
+             # If adding this file would exceed batch size, start a new batch
+             if (
+                 current_batch_size > 0
+                 and current_batch_size + obj_size > batch_size_bytes
+             ):
+                 batches.append(current_batch)
+                 current_batch = []
+                 current_batch_size = 0
+
+             current_batch.append(obj)
+             current_batch_size += obj_size
+
+         # Add the last batch if it has files
+         if current_batch:
+             batches.append(current_batch)
+
+         return batches
+
+     def _download_and_zip_in_batches(
+         self,
+         bucket: str,
+         prefix: str,
+         batch_dir: str,
+         output_zip: str,
+         batch_size_bytes: int,
+     ) -> bool:
+         """
+         Stream files from S3 directly into ZIP using chunked streaming, processing in batches to limit memory usage.

-             # Generate local file path
-             local_file_path = os.path.join(
-                 os.getcwd(), target_dir, os.path.basename(file_key)
-             )
+         Downloads the first file to batch_dir for schema/profile computation, then streams all files to ZIP
+         using 8MB chunks to ensure constant memory usage regardless of individual file sizes.
+
+         Args:
+             bucket: S3 bucket name
+             prefix: S3 prefix to filter objects
+             batch_dir: Local directory for temporary sample file download (for schema computation)
+             output_zip: Output ZIP file path
+             batch_size_bytes: Maximum total size of files to stream in each batch before flushing

-             logger.info(f"Downloading s3://{bucket}/{file_key} to {local_file_path}")
+         Returns:
+             True if any files were processed, False otherwise
+         """
+         s3_resource = boto3.resource("s3")
+         objects = list(s3_resource.Bucket(bucket).objects.filter(Prefix=prefix))

-             # Download file from S3
-             self.s3_client.download_file(bucket, file_key, local_file_path)
+         if not objects:
+             return False

-             files_downloaded = True
+         logger.info(
+             f"Found {len(objects)} files in s3://{bucket}/{prefix}, streaming in batches of up to {batch_size_bytes / (1024**2):.2f} MB"
+         )

-         return files_downloaded
+         # Download first file to batch_dir for schema/profile computation
+         # This is required by register_dataset() which needs a local parquet file to generate schema
+         os.makedirs(batch_dir, exist_ok=True)
+         first_obj = objects[0]
+         sample_file_path = os.path.join(batch_dir, os.path.basename(first_obj.key))

-     @staticmethod
-     def _zip_folder(folder_path: str, output_file: str) -> None:
-         logger.info(f"Zipping {folder_path} to {output_file}")
-         with zipfile.ZipFile(output_file, "x", zipfile.ZIP_DEFLATED) as zipf:
-             for root, _, files in os.walk(folder_path):
-                 for file in files:
-                     file_path = os.path.join(root, file)
-                     logger.info(f"Adding {file_path} to ZIP file")
-                     # Add file to zip archive with relative path
-                     zipf.write(file_path, os.path.relpath(file_path, folder_path))
+         try:
+             logger.info(
+                 f"Downloading first file s3://{bucket}/{first_obj.key} ({first_obj.size / (1024**2):.2f} MB) "
+                 f"to {sample_file_path} for schema computation"
+             )
+             self.s3_client.download_file(bucket, first_obj.key, sample_file_path)
+         except ClientError as e:
+             logger.error(f"Failed to download first file for schema computation: {e}")
+             raise RuntimeError(
+                 f"Cannot compute schema without at least one sample file: {e}"
+             ) from e
+
+         # Group objects into batches based on cumulative size
+         batches = self._group_objects_into_batches(objects, batch_size_bytes)
+         logger.info(f"Split {len(objects)} files into {len(batches)} batches")
+
+         # Track whether we've processed the first file to avoid downloading it twice
+         first_obj_processed = False
+
+         # Process each batch: stream from S3 directly to ZIP using chunked reads
+         zip_mode: Literal["x", "a"] = "x"  # Create new file for first batch
+         chunk_size = 8 * 1024 * 1024  # 8MB chunks for constant memory usage
+
+         for batch_idx, batch in enumerate(batches):
+             batch_size_mb = sum(obj.size for obj in batch) / (1024 * 1024)
+             logger.info(
+                 f"Processing batch {batch_idx + 1}/{len(batches)} with {len(batch)} files ({batch_size_mb:.2f} MB)"
+             )
+
+             # Stream files from S3 directly into ZIP using chunked reads
+             with zipfile.ZipFile(output_zip, zip_mode, zipfile.ZIP_DEFLATED) as zipf:
+                 for obj in batch:
+                     file_key = obj.key
+
+                     # Preserve S3 path structure in ZIP to avoid filename collisions
+                     # Strip only the common prefix, keep subdirectories
+                     relative_path = file_key[len(prefix) :].lstrip("/")
+                     file_name = (
+                         relative_path if relative_path else os.path.basename(file_key)
+                     )
+
+                     try:
+                         # If this is the first file and we already downloaded it, reuse local copy
+                         if not first_obj_processed and file_key == first_obj.key:
+                             logger.info(
+                                 f"Adding {file_name} ({obj.size / (1024**2):.2f} MB) to ZIP from local file "
+                                 f"(already downloaded for schema computation)"
+                             )
+                             self._stream_file_to_zip_from_local(
+                                 sample_file_path, zipf, file_name, chunk_size
+                             )
+                             first_obj_processed = True
+                         else:
+                             # Stream from S3 using chunked reads for constant memory usage
+                             logger.info(
+                                 f"Streaming {file_name} ({obj.size / (1024**2):.2f} MB) from S3 using chunked reads"
+                             )
+                             self._stream_file_to_zip_from_s3(
+                                 bucket, file_key, zipf, file_name, chunk_size
+                             )
+
+                         logger.info(f"Added {file_name} to ZIP file")
+
+                     except ClientError as e:
+                         logger.error(f"Failed to stream s3://{bucket}/{file_key}: {e}")
+                         raise RuntimeError(
+                             f"Failed to stream file {file_key} from S3: {e}"
+                         ) from e
+                     except Exception as e:
+                         logger.error(
+                             f"Unexpected error processing s3://{bucket}/{file_key}: {e}"
+                         )
+                         raise RuntimeError(
+                             f"Failed to process file {file_key}: {e}"
+                         ) from e
+
+             # After first batch, switch to append mode for subsequent batches
+             zip_mode = "a"
+
+             logger.info(
+                 f"Batch {batch_idx + 1}/{len(batches)} complete, streamed {len(batch)} files"
+             )
+
+         total_size_mb = sum(obj.size for obj in objects) / (1024 * 1024)
+         logger.info(
+             f"Successfully streamed all {len(objects)} files ({total_size_mb:.2f} MB) across {len(batches)} batches"
+         )
+         return True

      def get_report(self) -> SourceReport:
          return self.report
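The new `_download_and_zip_in_batches` flow above comes down to: list the objects under the prefix, group them by cumulative size, then stream each object's body into the archive in 8 MB chunks so peak memory stays flat. A standalone sketch of that streaming core, assuming a plain boto3 client and a writable output path (names here are illustrative, not the package's API):

    import zipfile

    import boto3

    def stream_s3_objects_to_zip(
        bucket: str, keys: list, output_zip: str, chunk_size: int = 8 * 1024 * 1024
    ) -> None:
        # Copy S3 objects into a ZIP without buffering whole files in memory.
        s3 = boto3.client("s3")
        with zipfile.ZipFile(output_zip, "w", zipfile.ZIP_DEFLATED) as zipf:
            for key in keys:
                body = s3.get_object(Bucket=bucket, Key=key)["Body"]
                with zipf.open(key.rsplit("/", 1)[-1], "w") as entry:
                    while chunk := body.read(chunk_size):
                        entry.write(chunk)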
@@ -75,7 +75,7 @@ class DataHubReportingFormsSource(Source):
                  enabled=False, dataset_urn=None, physical_uri_prefix=None
              )
          result_map = query_result.get(query_name, {})
-         return FormAnalyticsConfig.parse_obj(
+         return FormAnalyticsConfig.model_validate(
              dict(
                  (field, result_map.get(graphql_field))
                  for field, graphql_field in field_mappings.items()
@@ -2,7 +2,7 @@ from dataclasses import dataclass
  from enum import Enum
  from typing import List, Optional

- from pydantic import validator
+ from pydantic import field_validator

  from datahub.configuration.common import ConfigModel
  from datahub.ingestion.api.source import SourceReport
@@ -32,7 +32,8 @@ class DataHubReportingFormSourceConfig(ConfigModel):
      generate_presigned_url: bool = True
      presigned_url_expiry_days: int = 7

-     @validator("reporting_snapshot_partitioning_strategy")
+     @field_validator("reporting_snapshot_partitioning_strategy")
+     @classmethod
      def validate_partitioning_strategy(cls, v):
          if v not in PartitioningStrategy:
              raise ValueError(f"Unsupported partitioning strategy: {v}")
@@ -3,7 +3,7 @@ import time
  from functools import partial
  from typing import Any, Dict, Iterable, List, Optional

- from pydantic import Field, root_validator
+ from pydantic import Field, model_validator

  from acryl_datahub_cloud.datahub_restore.do_restore import restore_indices
  from datahub.configuration.common import ConfigModel
@@ -64,7 +64,8 @@ class DataHubRestoreIndicesConfig(ConfigModel, StatefulIngestionConfigBase):
          description="Same as restore indices endpoint.",
      )

-     @root_validator(pre=True)
+     @model_validator(mode="before")
+     @classmethod
      def extract_assertion_info(cls, values: Dict[str, Any]) -> Dict[str, Any]:
          if values.get("urn") is None and values.get("urn_like") is None:
              raise ValueError("Either urn or urn_like must be provided.")
@@ -0,0 +1,94 @@
+ EXCLUDED_PATTERNS = [
+     "_ingestion",
+     "amplitude",
+     "analytics",
+     "anomaly",
+     "anomalo",
+     "airflow",
+     "app",
+     "api",
+     "aws",
+     "braze",
+     "bigquery",
+     "backfill",
+     "billing",
+     "bot",
+     "census",
+     "customer_io",
+     "connector",
+     "composer",
+     "compute",
+     "circleci",
+     "classifier",
+     "cron",
+     "datahub",
+     "data-engine",
+     "dbt",
+     "datadog",
+     "deploy",
+     "databricks",
+     "dataflow",
+     "dataplex",
+     "dagster",
+     "enterprise",
+     "export",
+     "etl",
+     "fivetran",
+     "function",
+     "google",
+     "gcp",
+     "gke",
+     "grafana",
+     "hex",
+     "hightouch",
+     "ingest",
+     "infra",
+     "infer",
+     "integration",
+     "iam",
+     "job",
+     "jenkins",
+     "looker",
+     "lineage",
+     "monte_carlo",
+     "netsuite",
+     "process",
+     "prefect",
+     "pipeline",
+     "query",
+     "redash",
+     "realtime",
+     "report",
+     "remote-executor",
+     "runner",
+     "sagemaker",
+     "salesforce",
+     "sigma",
+     "sandbox",
+     "snowplow",
+     "segment",
+     "sync",
+     "schedul",
+     "svc",
+     "sa_",
+     "_sa",
+     "sa-",
+     "-sa",
+     "snowflake",
+     "service",
+     "system",
+     "spark",
+     "task",
+     "test",
+     "team",
+     "talend",
+     "teleskope",
+     "train",
+     "tableau",
+     "unknown",
+     "wiz",
+     "warehouse",
+     "workload",
+     "workflow",
+     "worker",
+ ]
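Judging by its name, this new module looks like a list of substrings used to screen out service-style and pipeline accounts from usage statistics; the matching logic itself lives in usage_feature_reporter.py and is not shown in this hunk. A hedged sketch of one plausible way such a list could be applied:

    EXCLUDED_PATTERNS = ["airflow", "dbt", "service", "svc", "test"]  # abridged

    def is_excluded(username: str) -> bool:
        # Case-insensitive substring match against the exclusion list.
        lowered = username.lower()
        return any(pattern in lowered for pattern in EXCLUDED_PATTERNS)

    print(is_excluded("airflow_prod_runner"))  # True
    print(is_excluded("jane.doe"))             # False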
@@ -1,3 +1,4 @@
+ from datetime import datetime, timedelta
  from typing import Dict


@@ -5,7 +6,7 @@ class QueryBuilder:
      @staticmethod
      def get_dataset_entities_query() -> Dict:
          return {
-             "sort": [{"urn": {"order": "asc"}}],
+             # "sort": [{"urn": {"order": "asc"}}],
              "_source": {
                  "includes": [
                      "urn",
@@ -19,15 +20,54 @@ class QueryBuilder:
          }

      @staticmethod
-     def get_query_entities_query() -> Dict:
+     def get_query_entities_query(days: int) -> Dict:
+         thirty_days_ago = datetime.now() - timedelta(days=days)
+         thirty_days_ago = thirty_days_ago.replace(
+             hour=0, minute=0, second=0, microsecond=0
+         )
+         epoch_ms = int(thirty_days_ago.timestamp() * 1000)
+
          return {
-             "sort": [{"urn": {"order": "asc"}}],
+             # "sort": [{"urn": {"order": "asc"}}],
              "_source": {"includes": ["urn", "lastModifiedAt", "platform", "removed"]},
              "query": {
                  "bool": {
                      "filter": [
                          {"bool": {"must_not": [{"term": {"source": "MANUAL"}}]}},
                          {"exists": {"field": "platform"}},
+                         {
+                             "bool": {
+                                 "should": [
+                                     {
+                                         "bool": {
+                                             "filter": [
+                                                 {"exists": {"field": "lastModifiedAt"}},
+                                                 {
+                                                     "range": {
+                                                         "lastModifiedAt": {
+                                                             "gte": epoch_ms
+                                                         }
+                                                     }
+                                                 },
+                                             ]
+                                         }
+                                     },
+                                     {
+                                         "bool": {
+                                             "must_not": {
+                                                 "exists": {"field": "lastModifiedAt"}
+                                             },
+                                             "filter": {
+                                                 "range": {
+                                                     "createdAt": {"gte": epoch_ms}
+                                                 }
+                                             },
+                                         }
+                                     },
+                                 ],
+                                 "minimum_should_match": 1,
+                             }
+                         },
                      ]
                  }
              },
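The filter added above keeps queries whose `lastModifiedAt` falls within the last N days, falling back to `createdAt` when `lastModifiedAt` is absent; the cutoff is midnight N days ago expressed in epoch milliseconds. A small sketch of that cutoff calculation, matching the diff's datetime arithmetic (the function name is illustrative):

    from datetime import datetime, timedelta

    def window_start_epoch_ms(days: int) -> int:
        # Midnight `days` ago, in epoch milliseconds, as used in the range filters above.
        cutoff = (datetime.now() - timedelta(days=days)).replace(
            hour=0, minute=0, second=0, microsecond=0
        )
        return int(cutoff.timestamp() * 1000)

    print(window_start_epoch_ms(30))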
@@ -36,7 +76,7 @@ class QueryBuilder:

      @staticmethod
      def get_upstreams_query() -> Dict:
-             "sort": [{"destination.urn": {"order": "asc"}}],
+             # "sort": [{"destination.urn": {"order": "asc"}}],
              "_source": {"includes": ["source.urn", "destination.urn"]},
              "query": {
                  "bool": {
@@ -51,7 +91,7 @@ class QueryBuilder:

      @staticmethod
      def get_dashboard_usage_query(days: int) -> Dict:
-             "sort": [{"urn": {"order": "asc"}}],
+             # "sort": [{"urn": {"order": "asc"}}],
              "_source": {
                  "includes": [
                      "timestampMillis",
@@ -80,7 +120,7 @@ class QueryBuilder:

      @staticmethod
      def get_dataset_usage_query(days: int) -> Dict:
-             "sort": [{"urn": {"order": "asc"}}],
+             # "sort": [{"urn": {"order": "asc"}}],
              "_source": {
                  "includes": [
                      "timestampMillis",
@@ -110,7 +150,7 @@ class QueryBuilder:

      @staticmethod
      def get_dataset_write_usage_raw_query(days: int) -> Dict:
-             "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
+             # "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
              "_source": {
                  "includes": [
                      "urn"  # Only field needed for platform extraction via regex
@@ -159,7 +199,7 @@ class QueryBuilder:

      @staticmethod
      def get_query_usage_query(days: int) -> Dict:
-             "sort": [{"urn": {"order": "asc"}}],
+             # "sort": [{"urn": {"order": "asc"}}],
              "_source": {
                  "includes": [
                      "timestampMillis",