acryl-datahub-cloud 0.3.10rc4__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
- acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
- acryl_datahub_cloud/action_request/action_request_owner_source.py +37 -8
- acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
- acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
- acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
- acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
- acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +39 -19
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +60 -25
- acryl_datahub_cloud/datahub_reporting/extract_graph.py +9 -3
- acryl_datahub_cloud/datahub_reporting/extract_sql.py +248 -52
- acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
- acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
- acryl_datahub_cloud/datahub_restore/source.py +3 -2
- acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +532 -109
- acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
- acryl_datahub_cloud/graphql_utils.py +64 -0
- acryl_datahub_cloud/lineage_features/source.py +555 -49
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +2390 -1938
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +6 -2
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +14 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/monitor/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- acryl_datahub_cloud/metadata/schema.avsc +27843 -23200
- acryl_datahub_cloud/metadata/schema_classes.py +29901 -24310
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
- acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
- acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
- acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +375 -212
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
- acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +191 -21
- acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
- acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
- acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
- acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
- acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
- acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
- acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
- acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
- acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
- acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
- acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
- acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
- acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
- acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
- acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
- acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
- acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
- acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +424 -97
- acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +65 -44
- acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +84 -29
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +221 -23
- acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
- acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
- acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
- acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
- acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
- acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
- acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +147 -0
- acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
- acryl_datahub_cloud/metadata/schemas/__init__.py +3 -3
- acryl_datahub_cloud/notifications/__init__.py +0 -0
- acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
- acryl_datahub_cloud/sdk/__init__.py +69 -0
- acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
- acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
- acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
- acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
- acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
- acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
- acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
- acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
- acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
- acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
- acryl_datahub_cloud/sdk/assertion/types.py +20 -0
- acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
- acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
- acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
- acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
- acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
- acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
- acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
- acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
- acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
- acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
- acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
- acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
- acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
- acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
- acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
- acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
- acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
- acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
- acryl_datahub_cloud/sdk/errors.py +34 -0
- acryl_datahub_cloud/sdk/resolver_client.py +42 -0
- acryl_datahub_cloud/sdk/subscription_client.py +737 -0
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +49 -43
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +243 -145
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
|
@@ -5,14 +5,14 @@ import pathlib
|
|
|
5
5
|
import tempfile
|
|
6
6
|
import time
|
|
7
7
|
from enum import Enum
|
|
8
|
-
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
8
|
+
from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
|
|
9
9
|
|
|
10
10
|
import boto3
|
|
11
11
|
import duckdb
|
|
12
12
|
import pandas
|
|
13
13
|
import pyarrow as pa
|
|
14
14
|
import pyarrow.parquet as pq
|
|
15
|
-
from pydantic import BaseModel,
|
|
15
|
+
from pydantic import BaseModel, field_validator
|
|
16
16
|
|
|
17
17
|
from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow, SchemaField
|
|
18
18
|
from datahub.configuration.common import ConfigModel
|
|
@@ -73,7 +73,9 @@ class FileStoreBackedDatasetConfig(ConfigModel):
|
|
|
73
73
|
store_platform: str = "s3"
|
|
74
74
|
file_name: str = "data"
|
|
75
75
|
file_extension: str = "parquet"
|
|
76
|
-
file_compression:
|
|
76
|
+
file_compression: Literal[
|
|
77
|
+
"gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"
|
|
78
|
+
] = "snappy"
|
|
77
79
|
file_overwrite_existing: bool = True
|
|
78
80
|
snapshot_partitioning_strategy: str = PartitioningStrategy.DATE
|
|
79
81
|
generate_presigned_url: bool = True
|
|
@@ -85,7 +87,8 @@ class FileStoreBackedDatasetConfig(ConfigModel):
|
|
|
85
87
|
|
|
86
88
|
datahub_platform: str = "acryl"
|
|
87
89
|
|
|
88
|
-
@
|
|
90
|
+
@field_validator("snapshot_partitioning_strategy")
|
|
91
|
+
@classmethod
|
|
89
92
|
def validate_partitioning_strategy(cls, v):
|
|
90
93
|
if v not in PartitioningStrategy._value2member_map_:
|
|
91
94
|
raise ValueError(f"Unsupported partitioning strategy: {v}")
|
|
@@ -119,9 +122,14 @@ class DataHubBasedS3Dataset:
|
|
|
119
122
|
self.local_file_path: str = (
|
|
120
123
|
config.file if config.file else self._initialize_local_file()
|
|
121
124
|
)
|
|
122
|
-
self.file_writer = None
|
|
125
|
+
self.file_writer: Optional[pq.ParquetWriter] = None
|
|
123
126
|
self.schema = (
|
|
124
|
-
pa.schema(
|
|
127
|
+
pa.schema(
|
|
128
|
+
[
|
|
129
|
+
pa.field(x.name, BaseModelRow.string_to_pyarrow_type(x.type))
|
|
130
|
+
for x in self.dataset_metadata.schemaFields
|
|
131
|
+
]
|
|
132
|
+
)
|
|
125
133
|
if self.dataset_metadata.schemaFields
|
|
126
134
|
else None
|
|
127
135
|
)
|
|
@@ -163,18 +171,32 @@ class DataHubBasedS3Dataset:
|
|
|
163
171
|
self.schema = row.arrow_schema()
|
|
164
172
|
else:
|
|
165
173
|
# hail mary: infer schema from the first row and cast everything to string
|
|
166
|
-
self.schema = pa.schema([(key, pa.string()) for key in row
|
|
174
|
+
self.schema = pa.schema([pa.field(key, pa.string()) for key in row])
|
|
167
175
|
self.stringify_row = True
|
|
168
176
|
|
|
169
177
|
self._initialize_local_file()
|
|
178
|
+
# Map compression names to PyArrow format (most are direct mappings)
|
|
179
|
+
compression_map = {
|
|
180
|
+
"gzip": "gzip",
|
|
181
|
+
"bz2": "brotli", # PyArrow doesn't support bz2, use brotli
|
|
182
|
+
"brotli": "brotli",
|
|
183
|
+
"lz4": "lz4",
|
|
184
|
+
"zstd": "zstd",
|
|
185
|
+
"snappy": "snappy",
|
|
186
|
+
"none": "none",
|
|
187
|
+
}
|
|
188
|
+
compression = cast(
|
|
189
|
+
Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"],
|
|
190
|
+
compression_map.get(self.config.file_compression, "snappy"),
|
|
191
|
+
)
|
|
170
192
|
self.file_writer = pq.ParquetWriter(
|
|
171
193
|
self.local_file_path,
|
|
172
194
|
self.schema,
|
|
173
|
-
compression=
|
|
195
|
+
compression=compression,
|
|
174
196
|
)
|
|
175
|
-
if isinstance(row, BaseModel
|
|
197
|
+
if isinstance(row, (BaseModel, BaseModelRow)):
|
|
176
198
|
# for anything extending BaseModel, we want to use the dict representation
|
|
177
|
-
write_row: Dict[str, Any] = row.
|
|
199
|
+
write_row: Dict[str, Any] = row.model_dump()
|
|
178
200
|
elif isinstance(row, dict):
|
|
179
201
|
write_row = row
|
|
180
202
|
else:
|
|
@@ -271,14 +293,10 @@ class DataHubBasedS3Dataset:
|
|
|
271
293
|
)
|
|
272
294
|
|
|
273
295
|
def _generate_schema_metadata(
|
|
274
|
-
self, duckdb_columns: List[Tuple[str,
|
|
296
|
+
self, duckdb_columns: List[Tuple[str, Any]]
|
|
275
297
|
) -> SchemaMetadataClass:
|
|
276
298
|
def get_type_from_dtype(dtype: str) -> SchemaFieldDataTypeClass:
|
|
277
|
-
if "int" in dtype:
|
|
278
|
-
return SchemaFieldDataTypeClass(type=NumberTypeClass())
|
|
279
|
-
elif "float" in dtype:
|
|
280
|
-
return SchemaFieldDataTypeClass(type=NumberTypeClass())
|
|
281
|
-
elif "number" in dtype:
|
|
299
|
+
if "int" in dtype or "float" in dtype or "number" in dtype:
|
|
282
300
|
return SchemaFieldDataTypeClass(type=NumberTypeClass())
|
|
283
301
|
elif "bool" in dtype:
|
|
284
302
|
return SchemaFieldDataTypeClass(type=BooleanTypeClass())
|
|
@@ -306,7 +324,7 @@ class DataHubBasedS3Dataset:
|
|
|
306
324
|
)
|
|
307
325
|
for column in duckdb_columns:
|
|
308
326
|
# generate data type
|
|
309
|
-
data_type = column[1].lower()
|
|
327
|
+
data_type = str(column[1]).lower()
|
|
310
328
|
schema_metadata.fields.append(
|
|
311
329
|
SchemaFieldClass(
|
|
312
330
|
fieldPath=column[0],
|
|
@@ -345,7 +363,7 @@ class DataHubBasedS3Dataset:
|
|
|
345
363
|
# generate min, max, avg, distinct count, null count
|
|
346
364
|
column_name = column[0]
|
|
347
365
|
logger.info(f"Generating field profile for {column_name}")
|
|
348
|
-
data_type = column[1].lower()
|
|
366
|
+
data_type = str(column[1]).lower()
|
|
349
367
|
if "int" in data_type or "float" in data_type:
|
|
350
368
|
query = (
|
|
351
369
|
f"SELECT COUNT(DISTINCT {column_name}), COUNT(*) - COUNT({column_name}), MIN({column_name}), MAX({column_name}), AVG({column_name})"
|
|
@@ -400,7 +418,9 @@ class DataHubBasedS3Dataset:
|
|
|
400
418
|
assert dataset_profiles.fieldProfiles is not None
|
|
401
419
|
dataset_profiles.fieldProfiles.append(field_profile)
|
|
402
420
|
logger.info("Generated dataset profile")
|
|
403
|
-
schema_metadata = self._generate_schema_metadata(
|
|
421
|
+
schema_metadata = self._generate_schema_metadata(
|
|
422
|
+
[(col[0], col[1]) for col in columns]
|
|
423
|
+
)
|
|
404
424
|
return dataset_profiles, schema_metadata
|
|
405
425
|
|
|
406
426
|
def register_dataset(
|
|
@@ -1,13 +1,16 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import logging
|
|
3
2
|
from datetime import date, datetime, timezone
|
|
4
3
|
from enum import Enum
|
|
5
|
-
from typing import Any, Callable, Dict, Iterable, List, Optional
|
|
4
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
|
|
6
5
|
|
|
7
6
|
import pandas as pd
|
|
7
|
+
from pydantic import BaseModel, field_validator
|
|
8
|
+
|
|
9
|
+
from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow
|
|
10
|
+
from acryl_datahub_cloud.graphql_utils import parse_extra_properties_for_model
|
|
8
11
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
9
12
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
10
|
-
from datahub.ingestion.graph.filters import
|
|
13
|
+
from datahub.ingestion.graph.filters import RawSearchFilter
|
|
11
14
|
from datahub.metadata.schema_classes import (
|
|
12
15
|
DomainPropertiesClass,
|
|
13
16
|
FormAssociationClass,
|
|
@@ -16,9 +19,6 @@ from datahub.metadata.schema_classes import (
|
|
|
16
19
|
FormStateClass,
|
|
17
20
|
FormTypeClass,
|
|
18
21
|
)
|
|
19
|
-
from pydantic import BaseModel
|
|
20
|
-
|
|
21
|
-
from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow
|
|
22
22
|
|
|
23
23
|
logger = logging.getLogger(__name__)
|
|
24
24
|
|
|
@@ -130,6 +130,22 @@ class DataHubFormReportingData(FormData):
|
|
|
130
130
|
platformInstance: Optional[str] = None
|
|
131
131
|
domains: List[str] = []
|
|
132
132
|
|
|
133
|
+
@field_validator(
|
|
134
|
+
"completedFormsIncompletePromptResponseTimes",
|
|
135
|
+
"completedFormsCompletedPromptResponseTimes",
|
|
136
|
+
"incompleteFormsIncompletePromptResponseTimes",
|
|
137
|
+
"incompleteFormsCompletedPromptResponseTimes",
|
|
138
|
+
mode="before",
|
|
139
|
+
)
|
|
140
|
+
@classmethod
|
|
141
|
+
def convert_timestamps_to_strings(
|
|
142
|
+
cls, v: Union[List[int], List[str]]
|
|
143
|
+
) -> List[str]:
|
|
144
|
+
"""Convert timestamp integers to strings for compatibility with GMS data."""
|
|
145
|
+
if not isinstance(v, list):
|
|
146
|
+
return v
|
|
147
|
+
return [str(item) for item in v]
|
|
148
|
+
|
|
133
149
|
def __init__(self, graph: DataHubGraph, allowed_forms: Optional[List[str]] = None):
|
|
134
150
|
self.graph: DataHubGraph = graph
|
|
135
151
|
self.form_registry = FormRegistry(graph)
|
|
@@ -143,13 +159,13 @@ class DataHubFormReportingData(FormData):
|
|
|
143
159
|
on_form_scanned: Callable[[str], Any],
|
|
144
160
|
) -> pd.DataFrame:
|
|
145
161
|
return pd.DataFrame(
|
|
146
|
-
x.
|
|
162
|
+
x.model_dump()
|
|
147
163
|
for x in self.get_data(
|
|
148
164
|
on_asset_scanned=on_asset_scanned, on_form_scanned=on_form_scanned
|
|
149
165
|
)
|
|
150
166
|
)
|
|
151
167
|
|
|
152
|
-
def get_form_existence_or_filters(self) ->
|
|
168
|
+
def get_form_existence_or_filters(self) -> RawSearchFilter:
|
|
153
169
|
"""
|
|
154
170
|
Datasets must either have completedForms or incompleteForms assigned to
|
|
155
171
|
them
|
|
@@ -157,25 +173,41 @@ class DataHubFormReportingData(FormData):
|
|
|
157
173
|
if self.allowed_forms:
|
|
158
174
|
return [
|
|
159
175
|
{
|
|
160
|
-
"
|
|
161
|
-
|
|
162
|
-
|
|
176
|
+
"and": [
|
|
177
|
+
{
|
|
178
|
+
"field": "completedForms",
|
|
179
|
+
"condition": "EQUAL",
|
|
180
|
+
"values": self.allowed_forms,
|
|
181
|
+
}
|
|
182
|
+
]
|
|
163
183
|
},
|
|
164
184
|
{
|
|
165
|
-
"
|
|
166
|
-
|
|
167
|
-
|
|
185
|
+
"and": [
|
|
186
|
+
{
|
|
187
|
+
"field": "incompleteForms",
|
|
188
|
+
"condition": "EQUAL",
|
|
189
|
+
"values": self.allowed_forms,
|
|
190
|
+
}
|
|
191
|
+
]
|
|
168
192
|
},
|
|
169
193
|
]
|
|
170
194
|
else:
|
|
171
195
|
return [
|
|
172
196
|
{
|
|
173
|
-
"
|
|
174
|
-
|
|
197
|
+
"and": [
|
|
198
|
+
{
|
|
199
|
+
"field": "completedForms",
|
|
200
|
+
"condition": "EXISTS",
|
|
201
|
+
}
|
|
202
|
+
]
|
|
175
203
|
},
|
|
176
204
|
{
|
|
177
|
-
"
|
|
178
|
-
|
|
205
|
+
"and": [
|
|
206
|
+
{
|
|
207
|
+
"field": "incompleteForms",
|
|
208
|
+
"condition": "EXISTS",
|
|
209
|
+
}
|
|
210
|
+
]
|
|
179
211
|
},
|
|
180
212
|
]
|
|
181
213
|
|
|
@@ -257,6 +289,7 @@ class DataHubFormReportingData(FormData):
|
|
|
257
289
|
for prompt_id, response_time in zip(
|
|
258
290
|
search_row.completedFormsCompletedPromptIds,
|
|
259
291
|
search_row.completedFormsCompletedPromptResponseTimes,
|
|
292
|
+
strict=False,
|
|
260
293
|
)
|
|
261
294
|
if prompt_id in form_prompts
|
|
262
295
|
}
|
|
@@ -289,7 +322,8 @@ class DataHubFormReportingData(FormData):
|
|
|
289
322
|
on_asset_scanned: Optional[Callable[[str], Any]] = None,
|
|
290
323
|
on_form_scanned: Optional[Callable[[str], Any]] = None,
|
|
291
324
|
) -> Iterable[FormReportingRow]:
|
|
292
|
-
extra_fields = [f for f in self.DataHubDatasetSearchRow.
|
|
325
|
+
extra_fields = [f for f in self.DataHubDatasetSearchRow.model_fields]
|
|
326
|
+
# TODO: Replace with the new search/filter SDK.
|
|
293
327
|
result = self.graph.get_results_by_filter(
|
|
294
328
|
extra_or_filters=self.get_form_existence_or_filters(),
|
|
295
329
|
extra_source_fields=extra_fields,
|
|
@@ -302,10 +336,9 @@ class DataHubFormReportingData(FormData):
|
|
|
302
336
|
if row_index % 100 == 0:
|
|
303
337
|
logger.info(f"Scanned {row_index} assets")
|
|
304
338
|
extra_properties = row["extraProperties"]
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
}
|
|
339
|
+
extra_properties_map = parse_extra_properties_for_model(
|
|
340
|
+
extra_properties, self.DataHubDatasetSearchRow
|
|
341
|
+
)
|
|
309
342
|
search_row = self.DataHubDatasetSearchRow(**extra_properties_map)
|
|
310
343
|
if on_asset_scanned:
|
|
311
344
|
on_asset_scanned(search_row.urn)
|
|
@@ -388,6 +421,7 @@ class DataHubFormReportingData(FormData):
|
|
|
388
421
|
for (p, p_response_time) in zip(
|
|
389
422
|
search_row.incompleteFormsCompletedPromptIds,
|
|
390
423
|
search_row.incompleteFormsCompletedPromptResponseTimes,
|
|
424
|
+
strict=False,
|
|
391
425
|
)
|
|
392
426
|
if p in form_prompts
|
|
393
427
|
]:
|
|
@@ -411,7 +445,7 @@ class DataHubFormReportingData(FormData):
|
|
|
411
445
|
question_status=QuestionStatus.COMPLETED,
|
|
412
446
|
question_completed_date=datetime.fromtimestamp(
|
|
413
447
|
float(prompt_response_time) / 1000, tz=timezone.utc
|
|
414
|
-
),
|
|
448
|
+
).date(),
|
|
415
449
|
snapshot_date=self.snapshot_date,
|
|
416
450
|
)
|
|
417
451
|
complete_forms = (
|
|
@@ -485,6 +519,7 @@ class DataHubFormReportingData(FormData):
|
|
|
485
519
|
for (p, p_response_time) in zip(
|
|
486
520
|
search_row.completedFormsCompletedPromptIds,
|
|
487
521
|
search_row.completedFormsCompletedPromptResponseTimes,
|
|
522
|
+
strict=False,
|
|
488
523
|
)
|
|
489
524
|
if p in form_prompts
|
|
490
525
|
]:
|
|
@@ -512,7 +547,7 @@ class DataHubFormReportingData(FormData):
|
|
|
512
547
|
question_status=QuestionStatus.COMPLETED,
|
|
513
548
|
question_completed_date=datetime.fromtimestamp(
|
|
514
549
|
float(prompt_response_time) / 1000, tz=timezone.utc
|
|
515
|
-
),
|
|
550
|
+
).date(),
|
|
516
551
|
snapshot_date=self.snapshot_date,
|
|
517
552
|
)
|
|
518
553
|
|
|
@@ -6,7 +6,7 @@ from typing import List, Optional
|
|
|
6
6
|
|
|
7
7
|
import boto3
|
|
8
8
|
from opensearchpy import OpenSearch
|
|
9
|
-
from pydantic import
|
|
9
|
+
from pydantic import field_validator
|
|
10
10
|
|
|
11
11
|
from acryl_datahub_cloud.datahub_reporting.datahub_dataset import (
|
|
12
12
|
DataHubBasedS3Dataset,
|
|
@@ -32,6 +32,7 @@ logger = logging.getLogger(__name__)
|
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
class DataHubReportingExtractGraphSourceConfig(ConfigModel):
|
|
35
|
+
enabled: bool = True
|
|
35
36
|
server: Optional[DatahubClientConfig] = None
|
|
36
37
|
search_index: ElasticSearchClientConfig = ElasticSearchClientConfig()
|
|
37
38
|
extract_graph_store: FileStoreBackedDatasetConfig
|
|
@@ -42,8 +43,9 @@ class DataHubReportingExtractGraphSourceConfig(ConfigModel):
|
|
|
42
43
|
query_timeout: int = 30
|
|
43
44
|
extract_batch_size: int = 2000
|
|
44
45
|
|
|
45
|
-
@
|
|
46
|
-
|
|
46
|
+
@field_validator("extract_graph_store", mode="before")
|
|
47
|
+
@classmethod
|
|
48
|
+
def set_default_extract_soft_delete_flag(cls, v):
|
|
47
49
|
if v is not None:
|
|
48
50
|
if "dataset_registration_spec" not in v:
|
|
49
51
|
v["dataset_registration_spec"] = DatasetRegistrationSpec(
|
|
@@ -118,6 +120,10 @@ class DataHubReportingExtractGraphSource(Source):
|
|
|
118
120
|
return skip_extract
|
|
119
121
|
|
|
120
122
|
def get_workunits(self):
|
|
123
|
+
if not self.config.enabled:
|
|
124
|
+
logger.info("Source is disabled, stopping")
|
|
125
|
+
return
|
|
126
|
+
|
|
121
127
|
self.graph = (
|
|
122
128
|
self.ctx.require_graph("Loading default graph coordinates.")
|
|
123
129
|
if self.config.server is None
|