acryl-datahub-cloud 0.3.10rc4__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
- acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
- acryl_datahub_cloud/action_request/action_request_owner_source.py +37 -8
- acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
- acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
- acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
- acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
- acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +39 -19
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +60 -25
- acryl_datahub_cloud/datahub_reporting/extract_graph.py +9 -3
- acryl_datahub_cloud/datahub_reporting/extract_sql.py +248 -52
- acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
- acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
- acryl_datahub_cloud/datahub_restore/source.py +3 -2
- acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +532 -109
- acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
- acryl_datahub_cloud/graphql_utils.py +64 -0
- acryl_datahub_cloud/lineage_features/source.py +555 -49
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +2390 -1938
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +6 -2
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +14 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/monitor/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- acryl_datahub_cloud/metadata/schema.avsc +27843 -23200
- acryl_datahub_cloud/metadata/schema_classes.py +29901 -24310
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
- acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
- acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
- acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +375 -212
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
- acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +191 -21
- acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
- acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
- acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
- acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
- acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
- acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
- acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
- acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
- acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
- acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
- acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
- acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
- acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
- acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
- acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
- acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
- acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
- acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +424 -97
- acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +65 -44
- acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +84 -29
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +221 -23
- acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
- acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
- acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
- acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
- acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
- acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
- acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +147 -0
- acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
- acryl_datahub_cloud/metadata/schemas/__init__.py +3 -3
- acryl_datahub_cloud/notifications/__init__.py +0 -0
- acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
- acryl_datahub_cloud/sdk/__init__.py +69 -0
- acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
- acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
- acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
- acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
- acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
- acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
- acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
- acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
- acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
- acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
- acryl_datahub_cloud/sdk/assertion/types.py +20 -0
- acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
- acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
- acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
- acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
- acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
- acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
- acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
- acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
- acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
- acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
- acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
- acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
- acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
- acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
- acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
- acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
- acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
- acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
- acryl_datahub_cloud/sdk/errors.py +34 -0
- acryl_datahub_cloud/sdk/resolver_client.py +42 -0
- acryl_datahub_cloud/sdk/subscription_client.py +737 -0
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +49 -43
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +243 -145
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/datahub_reporting/extract_sql.py

@@ -4,10 +4,14 @@ import shutil
 import zipfile
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import TYPE_CHECKING, Iterable, List, Literal, Optional
 
 import boto3
-from
+from botocore.exceptions import ClientError
+from pydantic import field_validator
+
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
 
 from acryl_datahub_cloud.datahub_reporting.datahub_dataset import (
     DataHubBasedS3Dataset,
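Note: the TYPE_CHECKING block added above is the standard way to pull in typing-only dependencies (here the mypy_boto3_s3 stubs) without a runtime import cost. A minimal self-contained sketch of the same pattern, not taken from the package:

# Typing-only import guarded by TYPE_CHECKING; the quoted annotation is resolved
# only by a type checker, so the stub package is not needed at runtime.
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    from mypy_boto3_s3.service_resource import ObjectSummary


def total_size(objects: List["ObjectSummary"]) -> int:
    # Works with any object exposing a .size attribute at runtime.
    return sum(obj.size for obj in objects)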
@@ -38,28 +42,40 @@ class S3ClientConfig(ConfigModel):
 
 
 class DataHubReportingExtractSQLSourceConfig(ConfigModel):
+    enabled: bool = True
     server: Optional[DatahubClientConfig] = None
     sql_backup_config: S3ClientConfig
     extract_sql_store: FileStoreBackedDatasetConfig
-
-
+    # Maximum size (in bytes) of files to stream from S3 per batch using chunked streaming.
+    # Files are streamed in 8MB chunks directly from S3 to ZIP without writing to disk, processing
+    # files in batches to limit peak memory usage. This prevents both disk pressure and excessive
+    # memory consumption during batch processing.
+    # Default: 5GB (5 * 1024 * 1024 * 1024 bytes)
+    batch_size_bytes: int = 5 * 1024 * 1024 * 1024
+
+    @field_validator("extract_sql_store", mode="before")
+    @classmethod
     def set_default_extract_soft_delete_flag(cls, v):
-        if v is
-
-
-
-
-
-
+        if v is None:
+            return v
+
+        # If v is already a FileStoreBackedDatasetConfig object, skip dict-based modifications
+        if isinstance(v, FileStoreBackedDatasetConfig):
+            return v
+
+        # v is a dictionary - apply default values
+        if "dataset_registration_spec" not in v:
+            v["dataset_registration_spec"] = DatasetRegistrationSpec(soft_deleted=False)
+        elif "soft_deleted" not in v["dataset_registration_spec"]:
+            v["dataset_registration_spec"]["soft_deleted"] = False
+
+        if "file" not in v:
+            default_config = FileStoreBackedDatasetConfig.dummy()
+            v["file"] = f"{default_config.file_name}.{default_config.file_extension}"
+        else:
+            v["file_name"] = v["file"].split(".")[0]
+            v["file_extension"] = v["file"].split(".")[-1]
 
-        if "file" not in v:
-            default_config = FileStoreBackedDatasetConfig.dummy()
-            v["file"] = (
-                f"{default_config.file_name}.{default_config.file_extension}"
-            )
-        else:
-            v["file_name"] = v["file"].split(".")[0]
-            v["file_extension"] = v["file"].split(".")[-1]
         return v
 
 
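Note: the config change above relies on pydantic v2's field_validator(..., mode="before") to normalize a raw dict before model construction. A self-contained sketch of that pattern, using made-up StoreConfig/SourceConfig models rather than the package's real classes:

# Illustrative only: shows the mode="before" default-filling pattern from the hunk above.
from pydantic import BaseModel, field_validator


class StoreConfig(BaseModel):
    file_name: str = "extract"
    file_extension: str = "zip"


class SourceConfig(BaseModel):
    enabled: bool = True
    batch_size_bytes: int = 5 * 1024 * 1024 * 1024  # 5GB default, as in the diff
    store: StoreConfig

    @field_validator("store", mode="before")
    @classmethod
    def fill_store_defaults(cls, v):
        # Runs before validation: v may be None, an already-built StoreConfig, or a raw dict.
        if v is None or isinstance(v, StoreConfig):
            return v
        if "file" in v:
            # Derive the structured fields from a single "file" entry, as the validator above does.
            v["file_name"] = v["file"].split(".")[0]
            v["file_extension"] = v["file"].split(".")[-1]
        return v


cfg = SourceConfig(store={"file": "backup.zip"})
print(cfg.store.file_name, cfg.store.file_extension)  # backup zip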
@@ -112,12 +128,16 @@ class DataHubReportingExtractSQLSource(Source):
 
         if skip_extract:
             logger.info(
-                f"Skipping
+                f"Skipping sql extract as dataset has been updated today {ts}"
             )
 
         return skip_extract
 
     def get_workunits(self):
+        if not self.config.enabled:
+            logger.info("Source is disabled, stopping")
+            return
+
         self.graph = (
             self.ctx.require_graph("Loading default graph coordinates.")
             if self.config.server is None
@@ -161,20 +181,17 @@ class DataHubReportingExtractSQLSource(Source):
 
         self._clean_up_old_state(state_directory=tmp_dir)
 
-        files_downloaded: bool = self.
+        files_downloaded: bool = self._download_and_zip_in_batches(
             bucket=self.config.sql_backup_config.bucket,
             prefix=bucket_prefix,
-
+            batch_dir=f"{tmp_dir}/download/",
+            output_zip=f"{tmp_dir}/{output_file}",
+            batch_size_bytes=self.config.batch_size_bytes,
         )
         if not files_downloaded:
             logger.warning(f"Skipping as no files were found in {bucket_prefix}")
             return
 
-        self._zip_folder(
-            folder_path=f"{tmp_dir}/download",
-            output_file=f"{tmp_dir}/{output_file}",
-        )
-
         # Compute profile & schema information, this is based on the parquet files that were downloaded and not the zip file.
         # We must hard-code the local file from which the dataset will be created, otherwise the upload to s3 will be in
         # unexpected path.
@@ -205,40 +222,219 @@ class DataHubReportingExtractSQLSource(Source):
         path = Path(f"{state_directory}/download/")
         path.mkdir(parents=True, exist_ok=True)
 
-
-
+    @staticmethod
+    def _stream_file_to_zip_from_local(
+        local_file_path: str,
+        zipf: zipfile.ZipFile,
+        file_name: str,
+        chunk_size: int,
+    ) -> None:
+        """Stream file from local disk to ZIP using chunked reads."""
+        with (
+            open(local_file_path, "rb") as local_file,
+            zipf.open(file_name, "w") as zip_entry,
+        ):
+            while True:
+                chunk = local_file.read(chunk_size)
+                if not chunk:
+                    break
+                zip_entry.write(chunk)
+
+    def _stream_file_to_zip_from_s3(
+        self,
+        bucket: str,
+        file_key: str,
+        zipf: zipfile.ZipFile,
+        file_name: str,
+        chunk_size: int,
+    ) -> None:
+        """Stream file from S3 to ZIP using chunked reads."""
+        s3_response = self.s3_client.get_object(Bucket=bucket, Key=file_key)
+        body_stream = s3_response["Body"]
+
+        with zipf.open(file_name, "w") as zip_entry:
+            while True:
+                chunk = body_stream.read(chunk_size)
+                if not chunk:
+                    break
+                zip_entry.write(chunk)
 
-
+    @staticmethod
+    def _group_objects_into_batches(
+        objects: List["ObjectSummary"], batch_size_bytes: int
+    ) -> List[List["ObjectSummary"]]:
+        """
+        Group S3 objects into batches based on cumulative size.
+
+        Files larger than batch_size_bytes get their own batch.
+        """
+        batches: List[List["ObjectSummary"]] = []
+        current_batch: List["ObjectSummary"] = []
+        current_batch_size = 0
 
-        # Iterate over objects in the time partition path
         for obj in objects:
-
-
+            obj_size = obj.size
+
+            # If file is larger than batch size, give it its own batch
+            if obj_size > batch_size_bytes:
+                if current_batch:
+                    batches.append(current_batch)
+                    current_batch = []
+                    current_batch_size = 0
+
+                batches.append([obj])  # Solo batch for large file
+                logger.warning(
+                    f"File {obj.key} ({obj_size / (1024**2):.2f} MB) exceeds batch size "
+                    f"({batch_size_bytes / (1024**2):.2f} MB), processing in separate batch"
+                )
+                continue
+
+            # If adding this file would exceed batch size, start a new batch
+            if (
+                current_batch_size > 0
+                and current_batch_size + obj_size > batch_size_bytes
+            ):
+                batches.append(current_batch)
+                current_batch = []
+                current_batch_size = 0
+
+            current_batch.append(obj)
+            current_batch_size += obj_size
+
+        # Add the last batch if it has files
+        if current_batch:
+            batches.append(current_batch)
+
+        return batches
+
+    def _download_and_zip_in_batches(
+        self,
+        bucket: str,
+        prefix: str,
+        batch_dir: str,
+        output_zip: str,
+        batch_size_bytes: int,
+    ) -> bool:
+        """
+        Stream files from S3 directly into ZIP using chunked streaming, processing in batches to limit memory usage.
 
-
-
-
+        Downloads the first file to batch_dir for schema/profile computation, then streams all files to ZIP
+        using 8MB chunks to ensure constant memory usage regardless of individual file sizes.
+
+        Args:
+            bucket: S3 bucket name
+            prefix: S3 prefix to filter objects
+            batch_dir: Local directory for temporary sample file download (for schema computation)
+            output_zip: Output ZIP file path
+            batch_size_bytes: Maximum total size of files to stream in each batch before flushing
 
-
+        Returns:
+            True if any files were processed, False otherwise
+        """
+        s3_resource = boto3.resource("s3")
+        objects = list(s3_resource.Bucket(bucket).objects.filter(Prefix=prefix))
 
-
-
+        if not objects:
+            return False
 
-
+        logger.info(
+            f"Found {len(objects)} files in s3://{bucket}/{prefix}, streaming in batches of up to {batch_size_bytes / (1024**2):.2f} MB"
+        )
 
-
+        # Download first file to batch_dir for schema/profile computation
+        # This is required by register_dataset() which needs a local parquet file to generate schema
+        os.makedirs(batch_dir, exist_ok=True)
+        first_obj = objects[0]
+        sample_file_path = os.path.join(batch_dir, os.path.basename(first_obj.key))
 
-
-
-
-
-
-
-
-
-
-
+        try:
+            logger.info(
+                f"Downloading first file s3://{bucket}/{first_obj.key} ({first_obj.size / (1024**2):.2f} MB) "
+                f"to {sample_file_path} for schema computation"
+            )
+            self.s3_client.download_file(bucket, first_obj.key, sample_file_path)
+        except ClientError as e:
+            logger.error(f"Failed to download first file for schema computation: {e}")
+            raise RuntimeError(
+                f"Cannot compute schema without at least one sample file: {e}"
+            ) from e
+
+        # Group objects into batches based on cumulative size
+        batches = self._group_objects_into_batches(objects, batch_size_bytes)
+        logger.info(f"Split {len(objects)} files into {len(batches)} batches")
+
+        # Track whether we've processed the first file to avoid downloading it twice
+        first_obj_processed = False
+
+        # Process each batch: stream from S3 directly to ZIP using chunked reads
+        zip_mode: Literal["x", "a"] = "x"  # Create new file for first batch
+        chunk_size = 8 * 1024 * 1024  # 8MB chunks for constant memory usage
+
+        for batch_idx, batch in enumerate(batches):
+            batch_size_mb = sum(obj.size for obj in batch) / (1024 * 1024)
+            logger.info(
+                f"Processing batch {batch_idx + 1}/{len(batches)} with {len(batch)} files ({batch_size_mb:.2f} MB)"
+            )
+
+            # Stream files from S3 directly into ZIP using chunked reads
+            with zipfile.ZipFile(output_zip, zip_mode, zipfile.ZIP_DEFLATED) as zipf:
+                for obj in batch:
+                    file_key = obj.key
+
+                    # Preserve S3 path structure in ZIP to avoid filename collisions
+                    # Strip only the common prefix, keep subdirectories
+                    relative_path = file_key[len(prefix) :].lstrip("/")
+                    file_name = (
+                        relative_path if relative_path else os.path.basename(file_key)
+                    )
+
+                    try:
+                        # If this is the first file and we already downloaded it, reuse local copy
+                        if not first_obj_processed and file_key == first_obj.key:
+                            logger.info(
+                                f"Adding {file_name} ({obj.size / (1024**2):.2f} MB) to ZIP from local file "
+                                f"(already downloaded for schema computation)"
+                            )
+                            self._stream_file_to_zip_from_local(
+                                sample_file_path, zipf, file_name, chunk_size
+                            )
+                            first_obj_processed = True
+                        else:
+                            # Stream from S3 using chunked reads for constant memory usage
+                            logger.info(
+                                f"Streaming {file_name} ({obj.size / (1024**2):.2f} MB) from S3 using chunked reads"
+                            )
+                            self._stream_file_to_zip_from_s3(
+                                bucket, file_key, zipf, file_name, chunk_size
+                            )
+
+                        logger.info(f"Added {file_name} to ZIP file")
+
+                    except ClientError as e:
+                        logger.error(f"Failed to stream s3://{bucket}/{file_key}: {e}")
+                        raise RuntimeError(
+                            f"Failed to stream file {file_key} from S3: {e}"
+                        ) from e
+                    except Exception as e:
+                        logger.error(
+                            f"Unexpected error processing s3://{bucket}/{file_key}: {e}"
+                        )
+                        raise RuntimeError(
+                            f"Failed to process file {file_key}: {e}"
+                        ) from e
+
+            # After first batch, switch to append mode for subsequent batches
+            zip_mode = "a"
+
+            logger.info(
+                f"Batch {batch_idx + 1}/{len(batches)} complete, streamed {len(batch)} files"
+            )
+
+        total_size_mb = sum(obj.size for obj in objects) / (1024 * 1024)
+        logger.info(
+            f"Successfully streamed all {len(objects)} files ({total_size_mb:.2f} MB) across {len(batches)} batches"
+        )
+        return True
 
     def get_report(self) -> SourceReport:
         return self.report
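Note: the core of the new hunk above is a greedy cumulative-size grouping followed by chunked S3-to-ZIP streaming. A standalone sketch of just the grouping step, using plain (key, size) tuples instead of boto3 ObjectSummary objects so it runs without AWS:

# Standalone sketch of the greedy size-based batching shown in the diff (illustrative only).
from typing import List, Tuple

Obj = Tuple[str, int]  # (key, size_in_bytes)


def group_into_batches(objects: List[Obj], batch_size_bytes: int) -> List[List[Obj]]:
    batches: List[List[Obj]] = []
    current: List[Obj] = []
    current_size = 0
    for key, size in objects:
        if size > batch_size_bytes:
            # Oversized files get a batch of their own.
            if current:
                batches.append(current)
                current, current_size = [], 0
            batches.append([(key, size)])
            continue
        if current_size > 0 and current_size + size > batch_size_bytes:
            batches.append(current)
            current, current_size = [], 0
        current.append((key, size))
        current_size += size
    if current:
        batches.append(current)
    return batches


# With a 100-byte limit: a and b fit together, c is oversized and goes alone, d starts a new batch.
print(group_into_batches([("a", 40), ("b", 50), ("c", 150), ("d", 30)], 100))
# [[('a', 40), ('b', 50)], [('c', 150)], [('d', 30)]]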
acryl_datahub_cloud/datahub_reporting/forms.py

@@ -75,7 +75,7 @@ class DataHubReportingFormsSource(Source):
             enabled=False, dataset_urn=None, physical_uri_prefix=None
         )
         result_map = query_result.get(query_name, {})
-        return FormAnalyticsConfig.
+        return FormAnalyticsConfig.model_validate(
             dict(
                 (field, result_map.get(graphql_field))
                 for field, graphql_field in field_mappings.items()
acryl_datahub_cloud/datahub_reporting/forms_config.py

@@ -2,7 +2,7 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import List, Optional
 
-from pydantic import
+from pydantic import field_validator
 
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.api.source import SourceReport

@@ -32,7 +32,8 @@ class DataHubReportingFormSourceConfig(ConfigModel):
     generate_presigned_url: bool = True
     presigned_url_expiry_days: int = 7
 
-    @
+    @field_validator("reporting_snapshot_partitioning_strategy")
+    @classmethod
     def validate_partitioning_strategy(cls, v):
         if v not in PartitioningStrategy:
             raise ValueError(f"Unsupported partitioning strategy: {v}")
acryl_datahub_cloud/datahub_restore/source.py

@@ -3,7 +3,7 @@ import time
 from functools import partial
 from typing import Any, Dict, Iterable, List, Optional
 
-from pydantic import Field,
+from pydantic import Field, model_validator
 
 from acryl_datahub_cloud.datahub_restore.do_restore import restore_indices
 from datahub.configuration.common import ConfigModel

@@ -64,7 +64,8 @@ class DataHubRestoreIndicesConfig(ConfigModel, StatefulIngestionConfigBase):
         description="Same as restore indices endpoint.",
     )
 
-    @
+    @model_validator(mode="before")
+    @classmethod
     def extract_assertion_info(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         if values.get("urn") is None and values.get("urn_like") is None:
             raise ValueError("Either urn or urn_like must be provided.")
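Note: these hunks (together with the forms_config.py change above) follow the standard pydantic v1-to-v2 validator migration: field validators become field_validator(...) plus classmethod, and whole-model validators become model_validator(mode="before") plus classmethod. The removed decorator lines are truncated in this view, but the added lines are the v2 forms. A generic sketch of both patterns with a made-up RestoreSpec model, not the package's config class:

# Generic pydantic v2 versions of the validator patterns used in these hunks (illustrative only).
from typing import Any, Dict, Optional

from pydantic import BaseModel, field_validator, model_validator


class RestoreSpec(BaseModel):
    urn: Optional[str] = None
    urn_like: Optional[str] = None
    strategy: str = "latest"

    # Per-field check: v2 uses @field_validator(...) stacked with @classmethod.
    @field_validator("strategy")
    @classmethod
    def check_strategy(cls, v: str) -> str:
        if v not in {"latest", "full"}:
            raise ValueError(f"Unsupported strategy: {v}")
        return v

    # Whole-input check before parsing: v2 uses @model_validator(mode="before").
    @model_validator(mode="before")
    @classmethod
    def require_urn_or_urn_like(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        if values.get("urn") is None and values.get("urn_like") is None:
            raise ValueError("Either urn or urn_like must be provided.")
        return values


RestoreSpec(urn="urn:li:dataset:example")  # validates fine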
acryl_datahub_cloud/datahub_usage_reporting/excluded.py

@@ -0,0 +1,94 @@
+EXCLUDED_PATTERNS = [
+    "_ingestion",
+    "amplitude",
+    "analytics",
+    "anomaly",
+    "anomalo",
+    "airflow",
+    "app",
+    "api",
+    "aws",
+    "braze",
+    "bigquery",
+    "backfill",
+    "billing",
+    "bot",
+    "census",
+    "customer_io",
+    "connector",
+    "composer",
+    "compute",
+    "circleci",
+    "classifier",
+    "cron",
+    "datahub",
+    "data-engine",
+    "dbt",
+    "datadog",
+    "deploy",
+    "databricks",
+    "dataflow",
+    "dataplex",
+    "dagster",
+    "enterprise",
+    "export",
+    "etl",
+    "fivetran",
+    "function",
+    "google",
+    "gcp",
+    "gke",
+    "grafana",
+    "hex",
+    "hightouch",
+    "ingest",
+    "infra",
+    "infer",
+    "integration",
+    "iam",
+    "job",
+    "jenkins",
+    "looker",
+    "lineage",
+    "monte_carlo",
+    "netsuite",
+    "process",
+    "prefect",
+    "pipeline",
+    "query",
+    "redash",
+    "realtime",
+    "report",
+    "remote-executor",
+    "runner",
+    "sagemaker",
+    "salesforce",
+    "sigma",
+    "sandbox",
+    "snowplow",
+    "segment",
+    "sync",
+    "schedul",
+    "svc",
+    "sa_",
+    "_sa",
+    "sa-",
+    "-sa",
+    "snowflake",
+    "service",
+    "system",
+    "spark",
+    "task",
+    "test",
+    "team",
+    "talend",
+    "teleskope",
+    "train",
+    "tableau",
+    "unknown",
+    "wiz",
+    "warehouse",
+    "workload",
+    "workflow",
+    "worker",
+]
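Note: the diff does not show how EXCLUDED_PATTERNS is consumed, but the name and contents suggest substring matching against user or actor identifiers to keep service accounts and automation out of usage reporting. A hypothetical helper illustrating that kind of check (is_excluded is not a function from the package, and the list here is abbreviated):

# Hypothetical usage sketch; the package's actual consumption of EXCLUDED_PATTERNS may differ.
EXCLUDED_PATTERNS = ["airflow", "dbt", "service", "sa_", "-sa"]  # abbreviated


def is_excluded(user_id: str) -> bool:
    # Case-insensitive substring match against the exclusion list.
    lowered = user_id.lower()
    return any(pattern in lowered for pattern in EXCLUDED_PATTERNS)


print(is_excluded("urn:li:corpuser:airflow_prod"))  # True
print(is_excluded("urn:li:corpuser:jane.doe"))      # False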
acryl_datahub_cloud/datahub_usage_reporting/query_builder.py

@@ -1,3 +1,4 @@
+from datetime import datetime, timedelta
 from typing import Dict
 
 

@@ -5,7 +6,7 @@ class QueryBuilder:
     @staticmethod
     def get_dataset_entities_query() -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "urn",

@@ -19,15 +20,54 @@ class QueryBuilder:
         }
 
     @staticmethod
-    def get_query_entities_query() -> Dict:
+    def get_query_entities_query(days: int) -> Dict:
+        thirty_days_ago = datetime.now() - timedelta(days=days)
+        thirty_days_ago = thirty_days_ago.replace(
+            hour=0, minute=0, second=0, microsecond=0
+        )
+        epoch_ms = int(thirty_days_ago.timestamp() * 1000)
+
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {"includes": ["urn", "lastModifiedAt", "platform", "removed"]},
             "query": {
                 "bool": {
                     "filter": [
                         {"bool": {"must_not": [{"term": {"source": "MANUAL"}}]}},
                         {"exists": {"field": "platform"}},
+                        {
+                            "bool": {
+                                "should": [
+                                    {
+                                        "bool": {
+                                            "filter": [
+                                                {"exists": {"field": "lastModifiedAt"}},
+                                                {
+                                                    "range": {
+                                                        "lastModifiedAt": {
+                                                            "gte": epoch_ms
+                                                        }
+                                                    }
+                                                },
+                                            ]
+                                        }
+                                    },
+                                    {
+                                        "bool": {
+                                            "must_not": {
+                                                "exists": {"field": "lastModifiedAt"}
+                                            },
+                                            "filter": {
+                                                "range": {
+                                                    "createdAt": {"gte": epoch_ms}
+                                                }
+                                            },
+                                        }
+                                    },
+                                ],
+                                "minimum_should_match": 1,
+                            }
+                        },
                     ]
                 }
             },
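Note: in plain terms, the bool/should block added to get_query_entities_query keeps a query entity if it was modified within the lookback window, or, when it has no lastModifiedAt at all, if it was created within that window. The same predicate written directly in Python (field names as in the query; the cutoff computation mirrors the method above; this is an illustration, not the package's code):

# Plain-Python restatement of the added Elasticsearch filter.
from datetime import datetime, timedelta
from typing import Optional


def within_lookback(
    last_modified_at_ms: Optional[int],
    created_at_ms: Optional[int],
    days: int,
) -> bool:
    cutoff = (datetime.now() - timedelta(days=days)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    epoch_ms = int(cutoff.timestamp() * 1000)
    if last_modified_at_ms is not None:
        return last_modified_at_ms >= epoch_ms
    # No lastModifiedAt: fall back to createdAt.
    return created_at_ms is not None and created_at_ms >= epoch_ms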
@@ -36,7 +76,7 @@ class QueryBuilder:
     @staticmethod
     def get_upstreams_query() -> Dict:
         return {
-            "sort": [{"destination.urn": {"order": "asc"}}],
+            # "sort": [{"destination.urn": {"order": "asc"}}],
             "_source": {"includes": ["source.urn", "destination.urn"]},
             "query": {
                 "bool": {

@@ -51,7 +91,7 @@ class QueryBuilder:
     @staticmethod
     def get_dashboard_usage_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "timestampMillis",

@@ -80,7 +120,7 @@ class QueryBuilder:
     @staticmethod
     def get_dataset_usage_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
            "_source": {
                 "includes": [
                     "timestampMillis",

@@ -110,7 +150,7 @@ class QueryBuilder:
     @staticmethod
     def get_dataset_write_usage_raw_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "urn"  # Only field needed for platform extraction via regex

@@ -159,7 +199,7 @@ class QueryBuilder:
     @staticmethod
     def get_query_usage_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "timestampMillis",