acryl-datahub-cloud 0.3.11rc0__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub-cloud might be problematic.

Files changed (238)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
  3. acryl_datahub_cloud/action_request/action_request_owner_source.py +36 -6
  4. acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
  5. acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
  6. acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
  7. acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
  8. acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
  9. acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
  10. acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
  11. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +37 -13
  12. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +55 -24
  13. acryl_datahub_cloud/datahub_reporting/extract_graph.py +4 -3
  14. acryl_datahub_cloud/datahub_reporting/extract_sql.py +242 -51
  15. acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
  16. acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
  17. acryl_datahub_cloud/datahub_restore/source.py +3 -2
  18. acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
  19. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
  20. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +518 -77
  21. acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
  22. acryl_datahub_cloud/graphql_utils.py +64 -0
  23. acryl_datahub_cloud/lineage_features/source.py +555 -49
  24. acryl_datahub_cloud/metadata/_urns/urn_defs.py +2296 -1900
  25. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
  26. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
  27. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  28. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +4 -2
  29. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  30. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
  31. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
  32. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
  33. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  34. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
  35. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
  36. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
  37. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  38. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +12 -0
  39. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
  40. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  41. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
  42. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  43. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  44. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  45. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
  46. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  47. acryl_datahub_cloud/metadata/schema.avsc +25091 -20557
  48. acryl_datahub_cloud/metadata/schema_classes.py +29269 -23863
  49. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
  50. acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
  51. acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
  52. acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
  53. acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
  54. acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
  55. acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
  56. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +353 -215
  57. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
  58. acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
  59. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +166 -21
  60. acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
  61. acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
  62. acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
  63. acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
  64. acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
  65. acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
  66. acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
  67. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  68. acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
  69. acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  70. acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
  71. acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
  72. acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
  73. acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  74. acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
  75. acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
  76. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
  77. acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
  78. acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
  79. acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
  80. acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
  81. acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
  82. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
  83. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
  84. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
  85. acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
  86. acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
  87. acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  88. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  89. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
  90. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  91. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  92. acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  93. acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
  94. acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
  95. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
  96. acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
  97. acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  98. acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
  99. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
  100. acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
  101. acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
  102. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
  103. acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
  104. acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  105. acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
  106. acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
  107. acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
  108. acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
  109. acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
  110. acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
  111. acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  112. acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  113. acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  114. acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  115. acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  116. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  117. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  118. acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  119. acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  120. acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  121. acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
  122. acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
  123. acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
  124. acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
  125. acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
  126. acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
  127. acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
  128. acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
  129. acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
  130. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
  131. acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
  132. acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  133. acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
  134. acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  135. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
  136. acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
  137. acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
  138. acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
  139. acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
  140. acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
  141. acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
  142. acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
  143. acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
  144. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
  145. acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
  146. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
  147. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
  148. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
  149. acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
  150. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
  151. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +418 -97
  152. acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +62 -44
  153. acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  154. acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +54 -9
  155. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +163 -23
  156. acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
  157. acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
  158. acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
  159. acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
  160. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
  161. acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
  162. acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
  163. acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
  164. acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  165. acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
  166. acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  167. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
  168. acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
  169. acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
  170. acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
  171. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
  172. acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  173. acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
  174. acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
  175. acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +61 -0
  176. acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
  177. acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
  178. acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
  179. acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
  180. acryl_datahub_cloud/notifications/__init__.py +0 -0
  181. acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
  182. acryl_datahub_cloud/sdk/__init__.py +69 -0
  183. acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
  184. acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
  185. acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
  186. acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
  187. acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
  188. acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
  189. acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
  190. acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
  191. acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
  192. acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
  193. acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
  194. acryl_datahub_cloud/sdk/assertion/types.py +20 -0
  195. acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
  196. acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
  197. acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
  198. acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
  199. acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
  200. acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
  201. acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
  202. acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
  203. acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
  204. acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
  205. acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
  206. acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
  207. acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
  208. acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
  209. acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
  210. acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
  211. acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
  212. acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
  213. acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
  214. acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
  215. acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
  216. acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
  217. acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
  218. acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
  219. acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
  220. acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
  221. acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
  222. acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
  223. acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
  224. acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
  225. acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
  226. acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
  227. acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
  228. acryl_datahub_cloud/sdk/errors.py +34 -0
  229. acryl_datahub_cloud/sdk/resolver_client.py +42 -0
  230. acryl_datahub_cloud/sdk/subscription_client.py +737 -0
  231. {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +55 -49
  232. {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +235 -142
  233. {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
  234. {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
  235. acryl_datahub_cloud/_sdk_extras/__init__.py +0 -4
  236. acryl_datahub_cloud/_sdk_extras/assertion.py +0 -15
  237. acryl_datahub_cloud/_sdk_extras/assertions_client.py +0 -23
  238. {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0

acryl_datahub_cloud/datahub_reporting/datahub_dataset.py
@@ -5,14 +5,14 @@ import pathlib
  import tempfile
  import time
  from enum import Enum
- from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+ from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union, cast

  import boto3
  import duckdb
  import pandas
  import pyarrow as pa
  import pyarrow.parquet as pq
- from pydantic import BaseModel, validator
+ from pydantic import BaseModel, field_validator

  from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow, SchemaField
  from datahub.configuration.common import ConfigModel
@@ -73,7 +73,9 @@ class FileStoreBackedDatasetConfig(ConfigModel):
      store_platform: str = "s3"
      file_name: str = "data"
      file_extension: str = "parquet"
-     file_compression: str = "snappy"
+     file_compression: Literal[
+         "gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"
+     ] = "snappy"
      file_overwrite_existing: bool = True
      snapshot_partitioning_strategy: str = PartitioningStrategy.DATE
      generate_presigned_url: bool = True
@@ -85,7 +87,8 @@ class FileStoreBackedDatasetConfig(ConfigModel):

      datahub_platform: str = "acryl"

-     @validator("snapshot_partitioning_strategy")
+     @field_validator("snapshot_partitioning_strategy")
+     @classmethod
      def validate_partitioning_strategy(cls, v):
          if v not in PartitioningStrategy._value2member_map_:
              raise ValueError(f"Unsupported partitioning strategy: {v}")
@@ -119,9 +122,14 @@ class DataHubBasedS3Dataset:
          self.local_file_path: str = (
              config.file if config.file else self._initialize_local_file()
          )
-         self.file_writer = None
+         self.file_writer: Optional[pq.ParquetWriter] = None
          self.schema = (
-             pa.schema([(x.name, x.type) for x in self.dataset_metadata.schemaFields])
+             pa.schema(
+                 [
+                     pa.field(x.name, BaseModelRow.string_to_pyarrow_type(x.type))
+                     for x in self.dataset_metadata.schemaFields
+                 ]
+             )
              if self.dataset_metadata.schemaFields
              else None
          )
@@ -163,18 +171,32 @@ class DataHubBasedS3Dataset:
                  self.schema = row.arrow_schema()
              else:
                  # hail mary: infer schema from the first row and cast everything to string
-                 self.schema = pa.schema([(key, pa.string()) for key in row])
+                 self.schema = pa.schema([pa.field(key, pa.string()) for key in row])
                  self.stringify_row = True

              self._initialize_local_file()
+             # Map compression names to PyArrow format (most are direct mappings)
+             compression_map = {
+                 "gzip": "gzip",
+                 "bz2": "brotli",  # PyArrow doesn't support bz2, use brotli
+                 "brotli": "brotli",
+                 "lz4": "lz4",
+                 "zstd": "zstd",
+                 "snappy": "snappy",
+                 "none": "none",
+             }
+             compression = cast(
+                 Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"],
+                 compression_map.get(self.config.file_compression, "snappy"),
+             )
              self.file_writer = pq.ParquetWriter(
                  self.local_file_path,
                  self.schema,
-                 compression=self.config.file_compression,
+                 compression=compression,
              )
          if isinstance(row, (BaseModel, BaseModelRow)):
              # for anything extending BaseModel, we want to use the dict representation
-             write_row: Dict[str, Any] = row.dict()
+             write_row: Dict[str, Any] = row.model_dump()
          elif isinstance(row, dict):
              write_row = row
          else:
@@ -271,7 +293,7 @@ class DataHubBasedS3Dataset:
          )

      def _generate_schema_metadata(
-         self, duckdb_columns: List[Tuple[str, str]]
+         self, duckdb_columns: List[Tuple[str, Any]]
      ) -> SchemaMetadataClass:
          def get_type_from_dtype(dtype: str) -> SchemaFieldDataTypeClass:
              if "int" in dtype or "float" in dtype or "number" in dtype:
@@ -302,7 +324,7 @@ class DataHubBasedS3Dataset:
          )
          for column in duckdb_columns:
              # generate data type
-             data_type = column[1].lower()
+             data_type = str(column[1]).lower()
              schema_metadata.fields.append(
                  SchemaFieldClass(
                      fieldPath=column[0],
@@ -341,7 +363,7 @@ class DataHubBasedS3Dataset:
              # generate min, max, avg, distinct count, null count
              column_name = column[0]
              logger.info(f"Generating field profile for {column_name}")
-             data_type = column[1].lower()
+             data_type = str(column[1]).lower()
              if "int" in data_type or "float" in data_type:
                  query = (
                      f"SELECT COUNT(DISTINCT {column_name}), COUNT(*) - COUNT({column_name}), MIN({column_name}), MAX({column_name}), AVG({column_name})"
@@ -396,7 +418,9 @@ class DataHubBasedS3Dataset:
              assert dataset_profiles.fieldProfiles is not None
              dataset_profiles.fieldProfiles.append(field_profile)
          logger.info("Generated dataset profile")
-         schema_metadata = self._generate_schema_metadata(columns)
+         schema_metadata = self._generate_schema_metadata(
+             [(col[0], col[1]) for col in columns]
+         )
          return dataset_profiles, schema_metadata

      def register_dataset(
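
The compression change above narrows file_compression to the codecs that PyArrow's ParquetWriter actually accepts and remaps unsupported names before the writer is created. A minimal standalone sketch of the same pattern, assuming only pyarrow is installed (make_writer and the sample schema are illustrative, not part of the package):

    from typing import Literal, cast

    import pyarrow as pa
    import pyarrow.parquet as pq

    ParquetCompression = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"]

    # bz2 has no PyArrow codec, so it is remapped (the diff picks brotli);
    # unknown values fall back to snappy, mirroring .get(..., "snappy") above.
    _COMPRESSION_MAP = {
        "gzip": "gzip", "bz2": "brotli", "brotli": "brotli",
        "lz4": "lz4", "zstd": "zstd", "snappy": "snappy", "none": "none",
    }

    def make_writer(path: str, schema: pa.Schema, file_compression: str) -> pq.ParquetWriter:
        compression = cast(ParquetCompression, _COMPRESSION_MAP.get(file_compression, "snappy"))
        return pq.ParquetWriter(path, schema, compression=compression)

    schema = pa.schema([pa.field("name", pa.string()), pa.field("count", pa.int64())])
    writer = make_writer("data.parquet", schema, "zstd")
    writer.write_table(pa.table({"name": ["a"], "count": [1]}, schema=schema))
    writer.close()
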

acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py
@@ -1,16 +1,16 @@
- import json
  import logging
  from datetime import date, datetime, timezone
  from enum import Enum
- from typing import Any, Callable, Dict, Iterable, List, Optional
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union

  import pandas as pd
- from pydantic import BaseModel
+ from pydantic import BaseModel, field_validator

  from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow
+ from acryl_datahub_cloud.graphql_utils import parse_extra_properties_for_model
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.ingestion.graph.client import DataHubGraph
- from datahub.ingestion.graph.filters import RawSearchFilterRule
+ from datahub.ingestion.graph.filters import RawSearchFilter
  from datahub.metadata.schema_classes import (
      DomainPropertiesClass,
      FormAssociationClass,
@@ -130,6 +130,22 @@ class DataHubFormReportingData(FormData):
          platformInstance: Optional[str] = None
          domains: List[str] = []

+         @field_validator(
+             "completedFormsIncompletePromptResponseTimes",
+             "completedFormsCompletedPromptResponseTimes",
+             "incompleteFormsIncompletePromptResponseTimes",
+             "incompleteFormsCompletedPromptResponseTimes",
+             mode="before",
+         )
+         @classmethod
+         def convert_timestamps_to_strings(
+             cls, v: Union[List[int], List[str]]
+         ) -> List[str]:
+             """Convert timestamp integers to strings for compatibility with GMS data."""
+             if not isinstance(v, list):
+                 return v
+             return [str(item) for item in v]
+
      def __init__(self, graph: DataHubGraph, allowed_forms: Optional[List[str]] = None):
          self.graph: DataHubGraph = graph
          self.form_registry = FormRegistry(graph)
@@ -143,13 +159,13 @@ class DataHubFormReportingData(FormData):
          on_form_scanned: Callable[[str], Any],
      ) -> pd.DataFrame:
          return pd.DataFrame(
-             x.dict()
+             x.model_dump()
              for x in self.get_data(
                  on_asset_scanned=on_asset_scanned, on_form_scanned=on_form_scanned
              )
          )

-     def get_form_existence_or_filters(self) -> List[RawSearchFilterRule]:
+     def get_form_existence_or_filters(self) -> RawSearchFilter:
          """
          Datasets must either have completedForms or incompleteForms assigned to
          them
@@ -157,25 +173,41 @@ class DataHubFormReportingData(FormData):
          if self.allowed_forms:
              return [
                  {
-                     "field": "completedForms",
-                     "condition": "EQUAL",
-                     "values": self.allowed_forms,
+                     "and": [
+                         {
+                             "field": "completedForms",
+                             "condition": "EQUAL",
+                             "values": self.allowed_forms,
+                         }
+                     ]
                  },
                  {
-                     "field": "incompleteForms",
-                     "condition": "EQUAL",
-                     "values": self.allowed_forms,
+                     "and": [
+                         {
+                             "field": "incompleteForms",
+                             "condition": "EQUAL",
+                             "values": self.allowed_forms,
+                         }
+                     ]
                  },
              ]
          else:
              return [
                  {
-                     "field": "completedForms",
-                     "condition": "EXISTS",
+                     "and": [
+                         {
+                             "field": "completedForms",
+                             "condition": "EXISTS",
+                         }
+                     ]
                  },
                  {
-                     "field": "incompleteForms",
-                     "condition": "EXISTS",
+                     "and": [
+                         {
+                             "field": "incompleteForms",
+                             "condition": "EXISTS",
+                         }
+                     ]
                  },
              ]

@@ -290,10 +322,10 @@ class DataHubFormReportingData(FormData):
          on_asset_scanned: Optional[Callable[[str], Any]] = None,
          on_form_scanned: Optional[Callable[[str], Any]] = None,
      ) -> Iterable[FormReportingRow]:
-         extra_fields = [f for f in self.DataHubDatasetSearchRow.__fields__]
+         extra_fields = [f for f in self.DataHubDatasetSearchRow.model_fields]
          # TODO: Replace with the new search/filter SDK.
          result = self.graph.get_results_by_filter(
-             extra_or_filters=[{"and": self.get_form_existence_or_filters()}],
+             extra_or_filters=self.get_form_existence_or_filters(),
              extra_source_fields=extra_fields,
              skip_cache=True,
          )
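
For reference, the RawSearchFilter value now passed to get_results_by_filter above is a list of OR'd blocks, each wrapping its rules in an "and" list, whereas the old code returned a flat rule list and added the single {"and": [...]} wrapper at the call site. A condensed sketch of the new shape (the form urn is a placeholder):

    from typing import Any, Dict, List

    def form_existence_or_filters(allowed_forms: List[str]) -> List[Dict[str, Any]]:
        """Build the OR-of-ANDs filter shape used by get_form_existence_or_filters()."""
        if allowed_forms:
            return [
                {"and": [{"field": "completedForms", "condition": "EQUAL", "values": allowed_forms}]},
                {"and": [{"field": "incompleteForms", "condition": "EQUAL", "values": allowed_forms}]},
            ]
        return [
            {"and": [{"field": "completedForms", "condition": "EXISTS"}]},
            {"and": [{"field": "incompleteForms", "condition": "EXISTS"}]},
        ]

    # Placeholder urn; the real source passes self.allowed_forms.
    print(form_existence_or_filters(["urn:li:form:example"]))
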
@@ -304,10 +336,9 @@ class DataHubFormReportingData(FormData):
              if row_index % 100 == 0:
                  logger.info(f"Scanned {row_index} assets")
              extra_properties = row["extraProperties"]
-
-             extra_properties_map = {
-                 x["name"]: json.loads(x["value"]) for x in extra_properties
-             }
+             extra_properties_map = parse_extra_properties_for_model(
+                 extra_properties, self.DataHubDatasetSearchRow
+             )
              search_row = self.DataHubDatasetSearchRow(**extra_properties_map)
              if on_asset_scanned:
                  on_asset_scanned(search_row.urn)
@@ -414,7 +445,7 @@ class DataHubFormReportingData(FormData):
                      question_status=QuestionStatus.COMPLETED,
                      question_completed_date=datetime.fromtimestamp(
                          float(prompt_response_time) / 1000, tz=timezone.utc
-                     ),
+                     ).date(),
                      snapshot_date=self.snapshot_date,
                  )
                  complete_forms = (
@@ -516,7 +547,7 @@ class DataHubFormReportingData(FormData):
                      question_status=QuestionStatus.COMPLETED,
                      question_completed_date=datetime.fromtimestamp(
                          float(prompt_response_time) / 1000, tz=timezone.utc
-                     ),
+                     ).date(),
                      snapshot_date=self.snapshot_date,
                  )


acryl_datahub_cloud/datahub_reporting/extract_graph.py
@@ -6,7 +6,7 @@ from typing import List, Optional

  import boto3
  from opensearchpy import OpenSearch
- from pydantic import validator
+ from pydantic import field_validator

  from acryl_datahub_cloud.datahub_reporting.datahub_dataset import (
      DataHubBasedS3Dataset,
@@ -43,8 +43,9 @@ class DataHubReportingExtractGraphSourceConfig(ConfigModel):
      query_timeout: int = 30
      extract_batch_size: int = 2000

-     @validator("extract_graph_store", pre=True, always=True)
-     def set_default_extract_soft_delete_flag(cls, v, values):
+     @field_validator("extract_graph_store", mode="before")
+     @classmethod
+     def set_default_extract_soft_delete_flag(cls, v):
          if v is not None:
              if "dataset_registration_spec" not in v:
                  v["dataset_registration_spec"] = DatasetRegistrationSpec(

acryl_datahub_cloud/datahub_reporting/extract_sql.py
@@ -4,10 +4,14 @@ import shutil
  import zipfile
  from datetime import datetime, timedelta
  from pathlib import Path
- from typing import Iterable, List, Optional
+ from typing import TYPE_CHECKING, Iterable, List, Literal, Optional

  import boto3
- from pydantic import validator
+ from botocore.exceptions import ClientError
+ from pydantic import field_validator
+
+ if TYPE_CHECKING:
+     from mypy_boto3_s3.service_resource import ObjectSummary

  from acryl_datahub_cloud.datahub_reporting.datahub_dataset import (
      DataHubBasedS3Dataset,
@@ -42,25 +46,36 @@ class DataHubReportingExtractSQLSourceConfig(ConfigModel):
      server: Optional[DatahubClientConfig] = None
      sql_backup_config: S3ClientConfig
      extract_sql_store: FileStoreBackedDatasetConfig
-
-     @validator("extract_sql_store", pre=True, always=True)
+     # Maximum size (in bytes) of files to stream from S3 per batch using chunked streaming.
+     # Files are streamed in 8MB chunks directly from S3 to ZIP without writing to disk, processing
+     # files in batches to limit peak memory usage. This prevents both disk pressure and excessive
+     # memory consumption during batch processing.
+     # Default: 5GB (5 * 1024 * 1024 * 1024 bytes)
+     batch_size_bytes: int = 5 * 1024 * 1024 * 1024
+
+     @field_validator("extract_sql_store", mode="before")
+     @classmethod
      def set_default_extract_soft_delete_flag(cls, v):
-         if v is not None:
-             if "dataset_registration_spec" not in v:
-                 v["dataset_registration_spec"] = DatasetRegistrationSpec(
-                     soft_deleted=False
-                 )
-             elif "soft_deleted" not in v["dataset_registration_spec"]:
-                 v["dataset_registration_spec"]["soft_deleted"] = False
+         if v is None:
+             return v
+
+         # If v is already a FileStoreBackedDatasetConfig object, skip dict-based modifications
+         if isinstance(v, FileStoreBackedDatasetConfig):
+             return v
+
+         # v is a dictionary - apply default values
+         if "dataset_registration_spec" not in v:
+             v["dataset_registration_spec"] = DatasetRegistrationSpec(soft_deleted=False)
+         elif "soft_deleted" not in v["dataset_registration_spec"]:
+             v["dataset_registration_spec"]["soft_deleted"] = False
+
+         if "file" not in v:
+             default_config = FileStoreBackedDatasetConfig.dummy()
+             v["file"] = f"{default_config.file_name}.{default_config.file_extension}"
+         else:
+             v["file_name"] = v["file"].split(".")[0]
+             v["file_extension"] = v["file"].split(".")[-1]

-         if "file" not in v:
-             default_config = FileStoreBackedDatasetConfig.dummy()
-             v["file"] = (
-                 f"{default_config.file_name}.{default_config.file_extension}"
-             )
-         else:
-             v["file_name"] = v["file"].split(".")[0]
-             v["file_extension"] = v["file"].split(".")[-1]
          return v

@@ -166,20 +181,17 @@ class DataHubReportingExtractSQLSource(Source):

          self._clean_up_old_state(state_directory=tmp_dir)

-         files_downloaded: bool = self._download_files(
+         files_downloaded: bool = self._download_and_zip_in_batches(
              bucket=self.config.sql_backup_config.bucket,
              prefix=bucket_prefix,
-             target_dir=f"{tmp_dir}/download/",
+             batch_dir=f"{tmp_dir}/download/",
+             output_zip=f"{tmp_dir}/{output_file}",
+             batch_size_bytes=self.config.batch_size_bytes,
          )
          if not files_downloaded:
              logger.warning(f"Skipping as no files were found in {bucket_prefix}")
              return

-         self._zip_folder(
-             folder_path=f"{tmp_dir}/download",
-             output_file=f"{tmp_dir}/{output_file}",
-         )
-
          # Compute profile & schema information, this is based on the parquet files that were downloaded and not the zip file.
          # We must hard-code the local file from which the dataset will be created, otherwise the upload to s3 will be in
          # unexpected path.
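
The batch_size_bytes argument added to this call caps how much data is streamed into the ZIP per batch: files are accumulated until the next one would push the running total over the cap, and a file larger than the cap gets a batch of its own (see _group_objects_into_batches below). A small self-contained sketch of that grouping policy over plain byte counts (group_sizes is illustrative, not part of the source):

    from typing import List

    def group_sizes(sizes: List[int], batch_size_bytes: int) -> List[List[int]]:
        """Greedy grouping: keep adding files until the cap would be exceeded."""
        batches: List[List[int]] = []
        current: List[int] = []
        current_total = 0
        for size in sizes:
            if size > batch_size_bytes:  # oversized file gets its own batch
                if current:
                    batches.append(current)
                    current, current_total = [], 0
                batches.append([size])
                continue
            if current_total > 0 and current_total + size > batch_size_bytes:
                batches.append(current)
                current, current_total = [], 0
            current.append(size)
            current_total += size
        if current:
            batches.append(current)
        return batches

    # With a 5 MB cap: [[3 MB], [4 MB], [6 MB], [2 MB, 1 MB]]
    print(group_sizes([3_000_000, 4_000_000, 6_000_000, 2_000_000, 1_000_000], 5_000_000))
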
@@ -210,40 +222,219 @@ class DataHubReportingExtractSQLSource(Source):
          path = Path(f"{state_directory}/download/")
          path.mkdir(parents=True, exist_ok=True)

-     def _download_files(self, bucket: str, prefix: str, target_dir: str) -> bool:
-         objects = boto3.resource("s3").Bucket(bucket).objects.filter(Prefix=prefix)
+     @staticmethod
+     def _stream_file_to_zip_from_local(
+         local_file_path: str,
+         zipf: zipfile.ZipFile,
+         file_name: str,
+         chunk_size: int,
+     ) -> None:
+         """Stream file from local disk to ZIP using chunked reads."""
+         with (
+             open(local_file_path, "rb") as local_file,
+             zipf.open(file_name, "w") as zip_entry,
+         ):
+             while True:
+                 chunk = local_file.read(chunk_size)
+                 if not chunk:
+                     break
+                 zip_entry.write(chunk)
+
+     def _stream_file_to_zip_from_s3(
+         self,
+         bucket: str,
+         file_key: str,
+         zipf: zipfile.ZipFile,
+         file_name: str,
+         chunk_size: int,
+     ) -> None:
+         """Stream file from S3 to ZIP using chunked reads."""
+         s3_response = self.s3_client.get_object(Bucket=bucket, Key=file_key)
+         body_stream = s3_response["Body"]
+
+         with zipf.open(file_name, "w") as zip_entry:
+             while True:
+                 chunk = body_stream.read(chunk_size)
+                 if not chunk:
+                     break
+                 zip_entry.write(chunk)

-         files_downloaded = False
+     @staticmethod
+     def _group_objects_into_batches(
+         objects: List["ObjectSummary"], batch_size_bytes: int
+     ) -> List[List["ObjectSummary"]]:
+         """
+         Group S3 objects into batches based on cumulative size.
+
+         Files larger than batch_size_bytes get their own batch.
+         """
+         batches: List[List["ObjectSummary"]] = []
+         current_batch: List["ObjectSummary"] = []
+         current_batch_size = 0

-         # Iterate over objects in the time partition path
          for obj in objects:
-             # Extract file key
-             file_key = obj.key
+             obj_size = obj.size
+
+             # If file is larger than batch size, give it its own batch
+             if obj_size > batch_size_bytes:
+                 if current_batch:
+                     batches.append(current_batch)
+                     current_batch = []
+                     current_batch_size = 0
+
+                 batches.append([obj])  # Solo batch for large file
+                 logger.warning(
+                     f"File {obj.key} ({obj_size / (1024**2):.2f} MB) exceeds batch size "
+                     f"({batch_size_bytes / (1024**2):.2f} MB), processing in separate batch"
+                 )
+                 continue
+
+             # If adding this file would exceed batch size, start a new batch
+             if (
+                 current_batch_size > 0
+                 and current_batch_size + obj_size > batch_size_bytes
+             ):
+                 batches.append(current_batch)
+                 current_batch = []
+                 current_batch_size = 0
+
+             current_batch.append(obj)
+             current_batch_size += obj_size
+
+         # Add the last batch if it has files
+         if current_batch:
+             batches.append(current_batch)
+
+         return batches
+
+     def _download_and_zip_in_batches(
+         self,
+         bucket: str,
+         prefix: str,
+         batch_dir: str,
+         output_zip: str,
+         batch_size_bytes: int,
+     ) -> bool:
+         """
+         Stream files from S3 directly into ZIP using chunked streaming, processing in batches to limit memory usage.

-             # Generate local file path
-             local_file_path = os.path.join(
-                 os.getcwd(), target_dir, os.path.basename(file_key)
-             )
+         Downloads the first file to batch_dir for schema/profile computation, then streams all files to ZIP
+         using 8MB chunks to ensure constant memory usage regardless of individual file sizes.
+
+         Args:
+             bucket: S3 bucket name
+             prefix: S3 prefix to filter objects
+             batch_dir: Local directory for temporary sample file download (for schema computation)
+             output_zip: Output ZIP file path
+             batch_size_bytes: Maximum total size of files to stream in each batch before flushing

-             logger.info(f"Downloading s3://{bucket}/{file_key} to {local_file_path}")
+         Returns:
+             True if any files were processed, False otherwise
+         """
+         s3_resource = boto3.resource("s3")
+         objects = list(s3_resource.Bucket(bucket).objects.filter(Prefix=prefix))

-             # Download file from S3
-             self.s3_client.download_file(bucket, file_key, local_file_path)
+         if not objects:
+             return False

-             files_downloaded = True
+         logger.info(
+             f"Found {len(objects)} files in s3://{bucket}/{prefix}, streaming in batches of up to {batch_size_bytes / (1024**2):.2f} MB"
+         )

-         return files_downloaded
+         # Download first file to batch_dir for schema/profile computation
+         # This is required by register_dataset() which needs a local parquet file to generate schema
+         os.makedirs(batch_dir, exist_ok=True)
+         first_obj = objects[0]
+         sample_file_path = os.path.join(batch_dir, os.path.basename(first_obj.key))

-     @staticmethod
-     def _zip_folder(folder_path: str, output_file: str) -> None:
-         logger.info(f"Zipping {folder_path} to {output_file}")
-         with zipfile.ZipFile(output_file, "x", zipfile.ZIP_DEFLATED) as zipf:
-             for root, _, files in os.walk(folder_path):
-                 for file in files:
-                     file_path = os.path.join(root, file)
-                     logger.info(f"Adding {file_path} to ZIP file")
-                     # Add file to zip archive with relative path
-                     zipf.write(file_path, os.path.relpath(file_path, folder_path))
+         try:
+             logger.info(
+                 f"Downloading first file s3://{bucket}/{first_obj.key} ({first_obj.size / (1024**2):.2f} MB) "
+                 f"to {sample_file_path} for schema computation"
+             )
+             self.s3_client.download_file(bucket, first_obj.key, sample_file_path)
+         except ClientError as e:
+             logger.error(f"Failed to download first file for schema computation: {e}")
+             raise RuntimeError(
+                 f"Cannot compute schema without at least one sample file: {e}"
+             ) from e
+
+         # Group objects into batches based on cumulative size
+         batches = self._group_objects_into_batches(objects, batch_size_bytes)
+         logger.info(f"Split {len(objects)} files into {len(batches)} batches")
+
+         # Track whether we've processed the first file to avoid downloading it twice
+         first_obj_processed = False
+
+         # Process each batch: stream from S3 directly to ZIP using chunked reads
+         zip_mode: Literal["x", "a"] = "x"  # Create new file for first batch
+         chunk_size = 8 * 1024 * 1024  # 8MB chunks for constant memory usage
+
+         for batch_idx, batch in enumerate(batches):
+             batch_size_mb = sum(obj.size for obj in batch) / (1024 * 1024)
+             logger.info(
+                 f"Processing batch {batch_idx + 1}/{len(batches)} with {len(batch)} files ({batch_size_mb:.2f} MB)"
+             )
+
+             # Stream files from S3 directly into ZIP using chunked reads
+             with zipfile.ZipFile(output_zip, zip_mode, zipfile.ZIP_DEFLATED) as zipf:
+                 for obj in batch:
+                     file_key = obj.key
+
+                     # Preserve S3 path structure in ZIP to avoid filename collisions
+                     # Strip only the common prefix, keep subdirectories
+                     relative_path = file_key[len(prefix) :].lstrip("/")
+                     file_name = (
+                         relative_path if relative_path else os.path.basename(file_key)
+                     )
+
+                     try:
+                         # If this is the first file and we already downloaded it, reuse local copy
+                         if not first_obj_processed and file_key == first_obj.key:
+                             logger.info(
+                                 f"Adding {file_name} ({obj.size / (1024**2):.2f} MB) to ZIP from local file "
+                                 f"(already downloaded for schema computation)"
+                             )
+                             self._stream_file_to_zip_from_local(
+                                 sample_file_path, zipf, file_name, chunk_size
+                             )
+                             first_obj_processed = True
+                         else:
+                             # Stream from S3 using chunked reads for constant memory usage
+                             logger.info(
+                                 f"Streaming {file_name} ({obj.size / (1024**2):.2f} MB) from S3 using chunked reads"
+                             )
+                             self._stream_file_to_zip_from_s3(
+                                 bucket, file_key, zipf, file_name, chunk_size
+                             )
+
+                         logger.info(f"Added {file_name} to ZIP file")
+
+                     except ClientError as e:
+                         logger.error(f"Failed to stream s3://{bucket}/{file_key}: {e}")
+                         raise RuntimeError(
+                             f"Failed to stream file {file_key} from S3: {e}"
+                         ) from e
+                     except Exception as e:
+                         logger.error(
+                             f"Unexpected error processing s3://{bucket}/{file_key}: {e}"
+                         )
+                         raise RuntimeError(
+                             f"Failed to process file {file_key}: {e}"
+                         ) from e
+
+             # After first batch, switch to append mode for subsequent batches
+             zip_mode = "a"
+
+             logger.info(
+                 f"Batch {batch_idx + 1}/{len(batches)} complete, streamed {len(batch)} files"
+             )
+
+         total_size_mb = sum(obj.size for obj in objects) / (1024 * 1024)
+         logger.info(
+             f"Successfully streamed all {len(objects)} files ({total_size_mb:.2f} MB) across {len(batches)} batches"
+         )
+         return True

      def get_report(self) -> SourceReport:
          return self.report
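
The heart of the new helpers above is writing each S3 object into the archive in fixed-size chunks through zipfile.ZipFile.open(name, "w"), so peak memory stays near the chunk size rather than the file size. A stripped-down sketch of that loop, assuming boto3 credentials are configured; the bucket, key, and file names are placeholders:

    import zipfile

    import boto3

    CHUNK_SIZE = 8 * 1024 * 1024  # 8MB chunks, as in the diff

    def stream_s3_object_into_zip(bucket: str, key: str, archive_path: str, arcname: str) -> None:
        """Copy one S3 object into a ZIP entry without buffering the whole file in memory."""
        s3 = boto3.client("s3")
        body = s3.get_object(Bucket=bucket, Key=key)["Body"]
        # "x" creates the archive; the real source switches to "a" for later batches.
        with zipfile.ZipFile(archive_path, "x", zipfile.ZIP_DEFLATED) as zipf:
            with zipf.open(arcname, "w") as zip_entry:
                while True:
                    chunk = body.read(CHUNK_SIZE)
                    if not chunk:
                        break
                    zip_entry.write(chunk)

    # Placeholder bucket/key for illustration only.
    stream_s3_object_into_zip("my-backup-bucket", "exports/part-0.parquet", "backup.zip", "part-0.parquet")
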

acryl_datahub_cloud/datahub_reporting/forms.py
@@ -75,7 +75,7 @@ class DataHubReportingFormsSource(Source):
              enabled=False, dataset_urn=None, physical_uri_prefix=None
          )
          result_map = query_result.get(query_name, {})
-         return FormAnalyticsConfig.parse_obj(
+         return FormAnalyticsConfig.model_validate(
              dict(
                  (field, result_map.get(graphql_field))
                  for field, graphql_field in field_mappings.items()

acryl_datahub_cloud/datahub_reporting/forms_config.py
@@ -2,7 +2,7 @@ from dataclasses import dataclass
  from enum import Enum
  from typing import List, Optional

- from pydantic import validator
+ from pydantic import field_validator

  from datahub.configuration.common import ConfigModel
  from datahub.ingestion.api.source import SourceReport
@@ -32,7 +32,8 @@ class DataHubReportingFormSourceConfig(ConfigModel):
      generate_presigned_url: bool = True
      presigned_url_expiry_days: int = 7

-     @validator("reporting_snapshot_partitioning_strategy")
+     @field_validator("reporting_snapshot_partitioning_strategy")
+     @classmethod
      def validate_partitioning_strategy(cls, v):
          if v not in PartitioningStrategy:
              raise ValueError(f"Unsupported partitioning strategy: {v}")