acryl-datahub-cloud 0.3.11rc0__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
- acryl_datahub_cloud/action_request/action_request_owner_source.py +36 -6
- acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
- acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
- acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
- acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
- acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +37 -13
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +55 -24
- acryl_datahub_cloud/datahub_reporting/extract_graph.py +4 -3
- acryl_datahub_cloud/datahub_reporting/extract_sql.py +242 -51
- acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
- acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
- acryl_datahub_cloud/datahub_restore/source.py +3 -2
- acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +518 -77
- acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
- acryl_datahub_cloud/graphql_utils.py +64 -0
- acryl_datahub_cloud/lineage_features/source.py +555 -49
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +2296 -1900
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +4 -2
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +12 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- acryl_datahub_cloud/metadata/schema.avsc +25091 -20557
- acryl_datahub_cloud/metadata/schema_classes.py +29269 -23863
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
- acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
- acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
- acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +353 -215
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
- acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +166 -21
- acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
- acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
- acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
- acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
- acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
- acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
- acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
- acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
- acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
- acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
- acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
- acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
- acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
- acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
- acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
- acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
- acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
- acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +418 -97
- acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +62 -44
- acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +54 -9
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +163 -23
- acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
- acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
- acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
- acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
- acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
- acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
- acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +61 -0
- acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
- acryl_datahub_cloud/notifications/__init__.py +0 -0
- acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
- acryl_datahub_cloud/sdk/__init__.py +69 -0
- acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
- acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
- acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
- acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
- acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
- acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
- acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
- acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
- acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
- acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
- acryl_datahub_cloud/sdk/assertion/types.py +20 -0
- acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
- acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
- acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
- acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
- acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
- acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
- acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
- acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
- acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
- acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
- acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
- acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
- acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
- acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
- acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
- acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
- acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
- acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
- acryl_datahub_cloud/sdk/errors.py +34 -0
- acryl_datahub_cloud/sdk/resolver_client.py +42 -0
- acryl_datahub_cloud/sdk/subscription_client.py +737 -0
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +55 -49
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +235 -142
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
- acryl_datahub_cloud/_sdk_extras/__init__.py +0 -4
- acryl_datahub_cloud/_sdk_extras/assertion.py +0 -15
- acryl_datahub_cloud/_sdk_extras/assertions_client.py +0 -23
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/datahub_reporting/datahub_dataset.py

@@ -5,14 +5,14 @@ import pathlib
 import tempfile
 import time
 from enum import Enum
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
 
 import boto3
 import duckdb
 import pandas
 import pyarrow as pa
 import pyarrow.parquet as pq
-from pydantic import BaseModel,
+from pydantic import BaseModel, field_validator
 
 from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow, SchemaField
 from datahub.configuration.common import ConfigModel

@@ -73,7 +73,9 @@ class FileStoreBackedDatasetConfig(ConfigModel):
     store_platform: str = "s3"
     file_name: str = "data"
     file_extension: str = "parquet"
-    file_compression:
+    file_compression: Literal[
+        "gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"
+    ] = "snappy"
     file_overwrite_existing: bool = True
     snapshot_partitioning_strategy: str = PartitioningStrategy.DATE
     generate_presigned_url: bool = True

@@ -85,7 +87,8 @@ class FileStoreBackedDatasetConfig(ConfigModel):
 
     datahub_platform: str = "acryl"
 
-    @
+    @field_validator("snapshot_partitioning_strategy")
+    @classmethod
     def validate_partitioning_strategy(cls, v):
         if v not in PartitioningStrategy._value2member_map_:
             raise ValueError(f"Unsupported partitioning strategy: {v}")

@@ -119,9 +122,14 @@ class DataHubBasedS3Dataset:
         self.local_file_path: str = (
             config.file if config.file else self._initialize_local_file()
         )
-        self.file_writer = None
+        self.file_writer: Optional[pq.ParquetWriter] = None
         self.schema = (
-            pa.schema(
+            pa.schema(
+                [
+                    pa.field(x.name, BaseModelRow.string_to_pyarrow_type(x.type))
+                    for x in self.dataset_metadata.schemaFields
+                ]
+            )
             if self.dataset_metadata.schemaFields
             else None
         )

@@ -163,18 +171,32 @@ class DataHubBasedS3Dataset:
                 self.schema = row.arrow_schema()
             else:
                 # hail mary: infer schema from the first row and cast everything to string
-                self.schema = pa.schema([(key, pa.string()) for key in row])
+                self.schema = pa.schema([pa.field(key, pa.string()) for key in row])
                 self.stringify_row = True
 
             self._initialize_local_file()
+            # Map compression names to PyArrow format (most are direct mappings)
+            compression_map = {
+                "gzip": "gzip",
+                "bz2": "brotli",  # PyArrow doesn't support bz2, use brotli
+                "brotli": "brotli",
+                "lz4": "lz4",
+                "zstd": "zstd",
+                "snappy": "snappy",
+                "none": "none",
+            }
+            compression = cast(
+                Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"],
+                compression_map.get(self.config.file_compression, "snappy"),
+            )
            self.file_writer = pq.ParquetWriter(
                 self.local_file_path,
                 self.schema,
-                compression=
+                compression=compression,
             )
         if isinstance(row, (BaseModel, BaseModelRow)):
             # for anything extending BaseModel, we want to use the dict representation
-            write_row: Dict[str, Any] = row.
+            write_row: Dict[str, Any] = row.model_dump()
         elif isinstance(row, dict):
             write_row = row
         else:
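Note on the compression change above: the new file_compression field is constrained to PyArrow-compatible codec names and mapped before being handed to pyarrow.parquet.ParquetWriter. The standalone sketch below (not from this package; the function name, path, and rows are invented for illustration) shows the same writer/compression pattern:

# Hypothetical illustration of selecting a Parquet compression codec and passing it
# to pyarrow.parquet.ParquetWriter, mirroring the Literal values accepted above.
from typing import Any, Dict, List

import pyarrow as pa
import pyarrow.parquet as pq

def write_rows(path: str, rows: List[Dict[str, Any]], compression: str = "snappy") -> None:
    # Infer an all-string schema from the first row, like the "hail mary" branch above.
    schema = pa.schema([pa.field(key, pa.string()) for key in rows[0]])
    writer = pq.ParquetWriter(path, schema, compression=compression)
    try:
        table = pa.Table.from_pylist(
            [{k: str(v) for k, v in row.items()} for row in rows], schema=schema
        )
        writer.write_table(table)
    finally:
        writer.close()

# Example: "zstd" is one of the codecs allowed by the new Literal type.
write_rows("data.parquet", [{"urn": "urn:li:dataset:example", "rowCount": 42}], compression="zstd")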
@@ -271,7 +293,7 @@ class DataHubBasedS3Dataset:
         )
 
     def _generate_schema_metadata(
-        self, duckdb_columns: List[Tuple[str,
+        self, duckdb_columns: List[Tuple[str, Any]]
     ) -> SchemaMetadataClass:
         def get_type_from_dtype(dtype: str) -> SchemaFieldDataTypeClass:
             if "int" in dtype or "float" in dtype or "number" in dtype:

@@ -302,7 +324,7 @@ class DataHubBasedS3Dataset:
         )
         for column in duckdb_columns:
             # generate data type
-            data_type = column[1].lower()
+            data_type = str(column[1]).lower()
             schema_metadata.fields.append(
                 SchemaFieldClass(
                     fieldPath=column[0],

@@ -341,7 +363,7 @@ class DataHubBasedS3Dataset:
             # generate min, max, avg, distinct count, null count
             column_name = column[0]
             logger.info(f"Generating field profile for {column_name}")
-            data_type = column[1].lower()
+            data_type = str(column[1]).lower()
             if "int" in data_type or "float" in data_type:
                 query = (
                     f"SELECT COUNT(DISTINCT {column_name}), COUNT(*) - COUNT({column_name}), MIN({column_name}), MAX({column_name}), AVG({column_name})"

@@ -396,7 +418,9 @@ class DataHubBasedS3Dataset:
         assert dataset_profiles.fieldProfiles is not None
         dataset_profiles.fieldProfiles.append(field_profile)
         logger.info("Generated dataset profile")
-        schema_metadata = self._generate_schema_metadata(
+        schema_metadata = self._generate_schema_metadata(
+            [(col[0], col[1]) for col in columns]
+        )
         return dataset_profiles, schema_metadata
 
     def register_dataset(
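Note: a recurring change in this release is the migration from pydantic v1 validators to pydantic v2's @field_validator paired with @classmethod, visible in the hunks above and in the files below. A minimal sketch of the pattern, using a made-up config model rather than the package's own classes:

# Minimal pydantic v2 validator pattern, mirroring the @field_validator/@classmethod
# pairs added in this release. ExampleStoreConfig and ALLOWED_STRATEGIES are illustrative.
from pydantic import BaseModel, field_validator

ALLOWED_STRATEGIES = {"DATE", "NONE"}

class ExampleStoreConfig(BaseModel):
    snapshot_partitioning_strategy: str = "DATE"

    @field_validator("snapshot_partitioning_strategy")
    @classmethod
    def validate_partitioning_strategy(cls, v: str) -> str:
        if v not in ALLOWED_STRATEGIES:
            raise ValueError(f"Unsupported partitioning strategy: {v}")
        return v

ExampleStoreConfig(snapshot_partitioning_strategy="DATE")    # passes validation
# ExampleStoreConfig(snapshot_partitioning_strategy="HOUR")  # would raise a ValidationError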
acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py

@@ -1,16 +1,16 @@
-import json
 import logging
 from datetime import date, datetime, timezone
 from enum import Enum
-from typing import Any, Callable, Dict, Iterable, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import pandas as pd
-from pydantic import BaseModel
+from pydantic import BaseModel, field_validator
 
 from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow
+from acryl_datahub_cloud.graphql_utils import parse_extra_properties_for_model
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import
+from datahub.ingestion.graph.filters import RawSearchFilter
 from datahub.metadata.schema_classes import (
     DomainPropertiesClass,
     FormAssociationClass,

@@ -130,6 +130,22 @@ class DataHubFormReportingData(FormData):
     platformInstance: Optional[str] = None
     domains: List[str] = []
 
+    @field_validator(
+        "completedFormsIncompletePromptResponseTimes",
+        "completedFormsCompletedPromptResponseTimes",
+        "incompleteFormsIncompletePromptResponseTimes",
+        "incompleteFormsCompletedPromptResponseTimes",
+        mode="before",
+    )
+    @classmethod
+    def convert_timestamps_to_strings(
+        cls, v: Union[List[int], List[str]]
+    ) -> List[str]:
+        """Convert timestamp integers to strings for compatibility with GMS data."""
+        if not isinstance(v, list):
+            return v
+        return [str(item) for item in v]
+
     def __init__(self, graph: DataHubGraph, allowed_forms: Optional[List[str]] = None):
         self.graph: DataHubGraph = graph
         self.form_registry = FormRegistry(graph)

@@ -143,13 +159,13 @@ class DataHubFormReportingData(FormData):
         on_form_scanned: Callable[[str], Any],
     ) -> pd.DataFrame:
         return pd.DataFrame(
-            x.
+            x.model_dump()
             for x in self.get_data(
                 on_asset_scanned=on_asset_scanned, on_form_scanned=on_form_scanned
             )
         )
 
-    def get_form_existence_or_filters(self) ->
+    def get_form_existence_or_filters(self) -> RawSearchFilter:
         """
         Datasets must either have completedForms or incompleteForms assigned to
         them

@@ -157,25 +173,41 @@ class DataHubFormReportingData(FormData):
         if self.allowed_forms:
             return [
                 {
-                    "
-
-
+                    "and": [
+                        {
+                            "field": "completedForms",
+                            "condition": "EQUAL",
+                            "values": self.allowed_forms,
+                        }
+                    ]
                 },
                 {
-                    "
-
-
+                    "and": [
+                        {
+                            "field": "incompleteForms",
+                            "condition": "EQUAL",
+                            "values": self.allowed_forms,
+                        }
+                    ]
                 },
             ]
         else:
             return [
                 {
-                    "
-
+                    "and": [
+                        {
+                            "field": "completedForms",
+                            "condition": "EXISTS",
+                        }
+                    ]
                 },
                 {
-                    "
-
+                    "and": [
+                        {
+                            "field": "incompleteForms",
+                            "condition": "EXISTS",
+                        }
+                    ]
                 },
             ]
 

@@ -290,10 +322,10 @@ class DataHubFormReportingData(FormData):
         on_asset_scanned: Optional[Callable[[str], Any]] = None,
         on_form_scanned: Optional[Callable[[str], Any]] = None,
     ) -> Iterable[FormReportingRow]:
-        extra_fields = [f for f in self.DataHubDatasetSearchRow.
+        extra_fields = [f for f in self.DataHubDatasetSearchRow.model_fields]
         # TODO: Replace with the new search/filter SDK.
         result = self.graph.get_results_by_filter(
-            extra_or_filters=
+            extra_or_filters=self.get_form_existence_or_filters(),
             extra_source_fields=extra_fields,
             skip_cache=True,
         )

@@ -304,10 +336,9 @@ class DataHubFormReportingData(FormData):
             if row_index % 100 == 0:
                 logger.info(f"Scanned {row_index} assets")
             extra_properties = row["extraProperties"]
-
-
-
-            }
+            extra_properties_map = parse_extra_properties_for_model(
+                extra_properties, self.DataHubDatasetSearchRow
+            )
             search_row = self.DataHubDatasetSearchRow(**extra_properties_map)
             if on_asset_scanned:
                 on_asset_scanned(search_row.urn)

@@ -414,7 +445,7 @@ class DataHubFormReportingData(FormData):
                 question_status=QuestionStatus.COMPLETED,
                 question_completed_date=datetime.fromtimestamp(
                     float(prompt_response_time) / 1000, tz=timezone.utc
-                ),
+                ).date(),
                 snapshot_date=self.snapshot_date,
             )
             complete_forms = (

@@ -516,7 +547,7 @@ class DataHubFormReportingData(FormData):
                 question_status=QuestionStatus.COMPLETED,
                 question_completed_date=datetime.fromtimestamp(
                     float(prompt_response_time) / 1000, tz=timezone.utc
-                ),
+                ).date(),
                 snapshot_date=self.snapshot_date,
             )
 
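Note: the rewritten get_form_existence_or_filters above now returns a RawSearchFilter, i.e. a list of OR'd clauses, each holding an "and" list of field conditions. A simplified standalone builder for the same structure (the helper name and the example form urn are illustrative, not from the package):

# Simplified builder for the OR-of-AND raw search filter shape used above.
from typing import Any, Dict, List, Optional

def build_form_filters(allowed_forms: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    # Top-level entries are OR'd together; each "and" list holds field conditions.
    if allowed_forms:
        return [
            {"and": [{"field": "completedForms", "condition": "EQUAL", "values": allowed_forms}]},
            {"and": [{"field": "incompleteForms", "condition": "EQUAL", "values": allowed_forms}]},
        ]
    return [
        {"and": [{"field": "completedForms", "condition": "EXISTS"}]},
        {"and": [{"field": "incompleteForms", "condition": "EXISTS"}]},
    ]

print(build_form_filters(["urn:li:form:example"]))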
acryl_datahub_cloud/datahub_reporting/extract_graph.py

@@ -6,7 +6,7 @@ from typing import List, Optional
 
 import boto3
 from opensearchpy import OpenSearch
-from pydantic import
+from pydantic import field_validator
 
 from acryl_datahub_cloud.datahub_reporting.datahub_dataset import (
     DataHubBasedS3Dataset,

@@ -43,8 +43,9 @@ class DataHubReportingExtractGraphSourceConfig(ConfigModel):
     query_timeout: int = 30
     extract_batch_size: int = 2000
 
-    @
-
+    @field_validator("extract_graph_store", mode="before")
+    @classmethod
+    def set_default_extract_soft_delete_flag(cls, v):
         if v is not None:
             if "dataset_registration_spec" not in v:
                 v["dataset_registration_spec"] = DatasetRegistrationSpec(
acryl_datahub_cloud/datahub_reporting/extract_sql.py

@@ -4,10 +4,14 @@ import shutil
 import zipfile
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import TYPE_CHECKING, Iterable, List, Literal, Optional
 
 import boto3
-from
+from botocore.exceptions import ClientError
+from pydantic import field_validator
+
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
 
 from acryl_datahub_cloud.datahub_reporting.datahub_dataset import (
     DataHubBasedS3Dataset,

@@ -42,25 +46,36 @@ class DataHubReportingExtractSQLSourceConfig(ConfigModel):
     server: Optional[DatahubClientConfig] = None
     sql_backup_config: S3ClientConfig
     extract_sql_store: FileStoreBackedDatasetConfig
-
-
+    # Maximum size (in bytes) of files to stream from S3 per batch using chunked streaming.
+    # Files are streamed in 8MB chunks directly from S3 to ZIP without writing to disk, processing
+    # files in batches to limit peak memory usage. This prevents both disk pressure and excessive
+    # memory consumption during batch processing.
+    # Default: 5GB (5 * 1024 * 1024 * 1024 bytes)
+    batch_size_bytes: int = 5 * 1024 * 1024 * 1024
+
+    @field_validator("extract_sql_store", mode="before")
+    @classmethod
     def set_default_extract_soft_delete_flag(cls, v):
-        if v is
-
-
-
-
-
-
+        if v is None:
+            return v
+
+        # If v is already a FileStoreBackedDatasetConfig object, skip dict-based modifications
+        if isinstance(v, FileStoreBackedDatasetConfig):
+            return v
+
+        # v is a dictionary - apply default values
+        if "dataset_registration_spec" not in v:
+            v["dataset_registration_spec"] = DatasetRegistrationSpec(soft_deleted=False)
+        elif "soft_deleted" not in v["dataset_registration_spec"]:
+            v["dataset_registration_spec"]["soft_deleted"] = False
+
+        if "file" not in v:
+            default_config = FileStoreBackedDatasetConfig.dummy()
+            v["file"] = f"{default_config.file_name}.{default_config.file_extension}"
+        else:
+            v["file_name"] = v["file"].split(".")[0]
+            v["file_extension"] = v["file"].split(".")[-1]
 
-        if "file" not in v:
-            default_config = FileStoreBackedDatasetConfig.dummy()
-            v["file"] = (
-                f"{default_config.file_name}.{default_config.file_extension}"
-            )
-        else:
-            v["file_name"] = v["file"].split(".")[0]
-            v["file_extension"] = v["file"].split(".")[-1]
         return v
 
 
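Note: the new batch_size_bytes option above relies on streaming S3 objects into the ZIP archive in 8MB chunks instead of buffering whole files. The sketch below shows that chunked-write pattern against zipfile.ZipFile.open(..., "w"), using a local file as a stand-in for boto3's StreamingBody (which exposes the same read(size) interface); the names and paths are illustrative:

# Chunked streaming into a ZIP entry, as described by the batch_size_bytes comment above.
import zipfile

CHUNK_SIZE = 8 * 1024 * 1024  # 8MB chunks keep peak memory constant

def stream_into_zip(source_path: str, archive_path: str, entry_name: str) -> None:
    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as zipf:
        with open(source_path, "rb") as src, zipf.open(entry_name, "w") as entry:
            while True:
                chunk = src.read(CHUNK_SIZE)
                if not chunk:
                    break
                entry.write(chunk)

stream_into_zip("part-0000.parquet", "backup.zip", "download/part-0000.parquet")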
@@ -166,20 +181,17 @@ class DataHubReportingExtractSQLSource(Source):
 
             self._clean_up_old_state(state_directory=tmp_dir)
 
-            files_downloaded: bool = self.
+            files_downloaded: bool = self._download_and_zip_in_batches(
                 bucket=self.config.sql_backup_config.bucket,
                 prefix=bucket_prefix,
-
+                batch_dir=f"{tmp_dir}/download/",
+                output_zip=f"{tmp_dir}/{output_file}",
+                batch_size_bytes=self.config.batch_size_bytes,
             )
             if not files_downloaded:
                 logger.warning(f"Skipping as no files were found in {bucket_prefix}")
                 return
 
-            self._zip_folder(
-                folder_path=f"{tmp_dir}/download",
-                output_file=f"{tmp_dir}/{output_file}",
-            )
-
             # Compute profile & schema information, this is based on the parquet files that were downloaded and not the zip file.
             # We must hard-code the local file from which the dataset will be created, otherwise the upload to s3 will be in
             # unexpected path.

@@ -210,40 +222,219 @@ class DataHubReportingExtractSQLSource(Source):
         path = Path(f"{state_directory}/download/")
         path.mkdir(parents=True, exist_ok=True)
 
-
-
+    @staticmethod
+    def _stream_file_to_zip_from_local(
+        local_file_path: str,
+        zipf: zipfile.ZipFile,
+        file_name: str,
+        chunk_size: int,
+    ) -> None:
+        """Stream file from local disk to ZIP using chunked reads."""
+        with (
+            open(local_file_path, "rb") as local_file,
+            zipf.open(file_name, "w") as zip_entry,
+        ):
+            while True:
+                chunk = local_file.read(chunk_size)
+                if not chunk:
+                    break
+                zip_entry.write(chunk)
+
+    def _stream_file_to_zip_from_s3(
+        self,
+        bucket: str,
+        file_key: str,
+        zipf: zipfile.ZipFile,
+        file_name: str,
+        chunk_size: int,
+    ) -> None:
+        """Stream file from S3 to ZIP using chunked reads."""
+        s3_response = self.s3_client.get_object(Bucket=bucket, Key=file_key)
+        body_stream = s3_response["Body"]
+
+        with zipf.open(file_name, "w") as zip_entry:
+            while True:
+                chunk = body_stream.read(chunk_size)
+                if not chunk:
+                    break
+                zip_entry.write(chunk)
 
-
+    @staticmethod
+    def _group_objects_into_batches(
+        objects: List["ObjectSummary"], batch_size_bytes: int
+    ) -> List[List["ObjectSummary"]]:
+        """
+        Group S3 objects into batches based on cumulative size.
+
+        Files larger than batch_size_bytes get their own batch.
+        """
+        batches: List[List["ObjectSummary"]] = []
+        current_batch: List["ObjectSummary"] = []
+        current_batch_size = 0
 
-        # Iterate over objects in the time partition path
         for obj in objects:
-
-
+            obj_size = obj.size
+
+            # If file is larger than batch size, give it its own batch
+            if obj_size > batch_size_bytes:
+                if current_batch:
+                    batches.append(current_batch)
+                    current_batch = []
+                    current_batch_size = 0
+
+                batches.append([obj])  # Solo batch for large file
+                logger.warning(
+                    f"File {obj.key} ({obj_size / (1024**2):.2f} MB) exceeds batch size "
+                    f"({batch_size_bytes / (1024**2):.2f} MB), processing in separate batch"
+                )
+                continue
+
+            # If adding this file would exceed batch size, start a new batch
+            if (
+                current_batch_size > 0
+                and current_batch_size + obj_size > batch_size_bytes
+            ):
+                batches.append(current_batch)
+                current_batch = []
+                current_batch_size = 0
+
+            current_batch.append(obj)
+            current_batch_size += obj_size
+
+        # Add the last batch if it has files
+        if current_batch:
+            batches.append(current_batch)
+
+        return batches
+
+    def _download_and_zip_in_batches(
+        self,
+        bucket: str,
+        prefix: str,
+        batch_dir: str,
+        output_zip: str,
+        batch_size_bytes: int,
+    ) -> bool:
+        """
+        Stream files from S3 directly into ZIP using chunked streaming, processing in batches to limit memory usage.
 
-
-
-
-
+        Downloads the first file to batch_dir for schema/profile computation, then streams all files to ZIP
+        using 8MB chunks to ensure constant memory usage regardless of individual file sizes.
+
+        Args:
+            bucket: S3 bucket name
+            prefix: S3 prefix to filter objects
+            batch_dir: Local directory for temporary sample file download (for schema computation)
+            output_zip: Output ZIP file path
+            batch_size_bytes: Maximum total size of files to stream in each batch before flushing
 
-
+        Returns:
+            True if any files were processed, False otherwise
+        """
+        s3_resource = boto3.resource("s3")
+        objects = list(s3_resource.Bucket(bucket).objects.filter(Prefix=prefix))
 
-
-
+        if not objects:
+            return False
 
-
+        logger.info(
+            f"Found {len(objects)} files in s3://{bucket}/{prefix}, streaming in batches of up to {batch_size_bytes / (1024**2):.2f} MB"
+        )
 
-
+        # Download first file to batch_dir for schema/profile computation
+        # This is required by register_dataset() which needs a local parquet file to generate schema
+        os.makedirs(batch_dir, exist_ok=True)
+        first_obj = objects[0]
+        sample_file_path = os.path.join(batch_dir, os.path.basename(first_obj.key))
 
-
-
-
-
-
-
-
-
-
-
+        try:
+            logger.info(
+                f"Downloading first file s3://{bucket}/{first_obj.key} ({first_obj.size / (1024**2):.2f} MB) "
+                f"to {sample_file_path} for schema computation"
+            )
+            self.s3_client.download_file(bucket, first_obj.key, sample_file_path)
+        except ClientError as e:
+            logger.error(f"Failed to download first file for schema computation: {e}")
+            raise RuntimeError(
+                f"Cannot compute schema without at least one sample file: {e}"
+            ) from e
+
+        # Group objects into batches based on cumulative size
+        batches = self._group_objects_into_batches(objects, batch_size_bytes)
+        logger.info(f"Split {len(objects)} files into {len(batches)} batches")
+
+        # Track whether we've processed the first file to avoid downloading it twice
+        first_obj_processed = False
+
+        # Process each batch: stream from S3 directly to ZIP using chunked reads
+        zip_mode: Literal["x", "a"] = "x"  # Create new file for first batch
+        chunk_size = 8 * 1024 * 1024  # 8MB chunks for constant memory usage
+
+        for batch_idx, batch in enumerate(batches):
+            batch_size_mb = sum(obj.size for obj in batch) / (1024 * 1024)
+            logger.info(
+                f"Processing batch {batch_idx + 1}/{len(batches)} with {len(batch)} files ({batch_size_mb:.2f} MB)"
+            )
+
+            # Stream files from S3 directly into ZIP using chunked reads
+            with zipfile.ZipFile(output_zip, zip_mode, zipfile.ZIP_DEFLATED) as zipf:
+                for obj in batch:
+                    file_key = obj.key
+
+                    # Preserve S3 path structure in ZIP to avoid filename collisions
+                    # Strip only the common prefix, keep subdirectories
+                    relative_path = file_key[len(prefix) :].lstrip("/")
+                    file_name = (
+                        relative_path if relative_path else os.path.basename(file_key)
+                    )
+
+                    try:
+                        # If this is the first file and we already downloaded it, reuse local copy
+                        if not first_obj_processed and file_key == first_obj.key:
+                            logger.info(
+                                f"Adding {file_name} ({obj.size / (1024**2):.2f} MB) to ZIP from local file "
+                                f"(already downloaded for schema computation)"
+                            )
+                            self._stream_file_to_zip_from_local(
+                                sample_file_path, zipf, file_name, chunk_size
+                            )
+                            first_obj_processed = True
+                        else:
+                            # Stream from S3 using chunked reads for constant memory usage
+                            logger.info(
+                                f"Streaming {file_name} ({obj.size / (1024**2):.2f} MB) from S3 using chunked reads"
+                            )
+                            self._stream_file_to_zip_from_s3(
+                                bucket, file_key, zipf, file_name, chunk_size
+                            )
+
+                        logger.info(f"Added {file_name} to ZIP file")
+
+                    except ClientError as e:
+                        logger.error(f"Failed to stream s3://{bucket}/{file_key}: {e}")
+                        raise RuntimeError(
+                            f"Failed to stream file {file_key} from S3: {e}"
+                        ) from e
+                    except Exception as e:
+                        logger.error(
+                            f"Unexpected error processing s3://{bucket}/{file_key}: {e}"
+                        )
+                        raise RuntimeError(
+                            f"Failed to process file {file_key}: {e}"
+                        ) from e
+
+            # After first batch, switch to append mode for subsequent batches
+            zip_mode = "a"
+
+            logger.info(
+                f"Batch {batch_idx + 1}/{len(batches)} complete, streamed {len(batch)} files"
+            )
+
+        total_size_mb = sum(obj.size for obj in objects) / (1024 * 1024)
+        logger.info(
+            f"Successfully streamed all {len(objects)} files ({total_size_mb:.2f} MB) across {len(batches)} batches"
+        )
+        return True
 
     def get_report(self) -> SourceReport:
         return self.report
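Note: _download_and_zip_in_batches above groups files by cumulative size via _group_objects_into_batches, giving oversized files their own batch. A simplified version of that grouping, operating on plain byte counts (the function name and example sizes are illustrative):

# Simplified version of the cumulative-size batching used by _group_objects_into_batches.
from typing import List

def group_into_batches(sizes: List[int], batch_size_bytes: int) -> List[List[int]]:
    batches: List[List[int]] = []
    current: List[int] = []
    current_size = 0
    for size in sizes:
        if size > batch_size_bytes:
            # Oversized files get their own batch so other batches stay under the cap.
            if current:
                batches.append(current)
                current, current_size = [], 0
            batches.append([size])
            continue
        if current_size > 0 and current_size + size > batch_size_bytes:
            batches.append(current)
            current, current_size = [], 0
        current.append(size)
        current_size += size
    if current:
        batches.append(current)
    return batches

print(group_into_batches([3, 2, 6, 1, 4], batch_size_bytes=5))  # [[3, 2], [6], [1, 4]]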
acryl_datahub_cloud/datahub_reporting/forms.py

@@ -75,7 +75,7 @@ class DataHubReportingFormsSource(Source):
             enabled=False, dataset_urn=None, physical_uri_prefix=None
         )
         result_map = query_result.get(query_name, {})
-        return FormAnalyticsConfig.
+        return FormAnalyticsConfig.model_validate(
             dict(
                 (field, result_map.get(graphql_field))
                 for field, graphql_field in field_mappings.items()
acryl_datahub_cloud/datahub_reporting/forms_config.py

@@ -2,7 +2,7 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import List, Optional
 
-from pydantic import
+from pydantic import field_validator
 
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.api.source import SourceReport

@@ -32,7 +32,8 @@ class DataHubReportingFormSourceConfig(ConfigModel):
     generate_presigned_url: bool = True
     presigned_url_expiry_days: int = 7
 
-    @
+    @field_validator("reporting_snapshot_partitioning_strategy")
+    @classmethod
     def validate_partitioning_strategy(cls, v):
         if v not in PartitioningStrategy:
             raise ValueError(f"Unsupported partitioning strategy: {v}")