acryl-datahub-cloud 0.3.8.3__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
- acryl_datahub_cloud/action_request/__init__.py +0 -0
- acryl_datahub_cloud/action_request/action_request_owner_source.py +174 -0
- acryl_datahub_cloud/api/__init__.py +1 -1
- acryl_datahub_cloud/api/client.py +2 -2
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +6 -6
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +69 -35
- acryl_datahub_cloud/datahub_reporting/extract_sql.py +4 -4
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_patch_builder.py +21 -21
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +14 -13
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +1130 -484
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionrequest/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/executorglobalconfig/__init__.py +15 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/executorpool/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metric/__init__.py +29 -0
- acryl_datahub_cloud/metadata/schema.avsc +839 -49
- acryl_datahub_cloud/metadata/schema_classes.py +1286 -63
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +422 -12
- acryl_datahub_cloud/metadata/schemas/ActionRequestStatus.avsc +12 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +5 -3
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +5 -3
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +5 -3
- acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- acryl_datahub_cloud/metadata/schemas/BusinessAttributes.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/DataHubActionInfo.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/DataHubConnectionKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceInfo.avsc +9 -4
- acryl_datahub_cloud/metadata/schemas/DataHubMetricCubeDefinition.avsc +185 -0
- acryl_datahub_cloud/metadata/schemas/DataHubMetricCubeEvent.avsc +184 -0
- acryl_datahub_cloud/metadata/schemas/DataHubMetricCubeKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +4 -4
- acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataProcessInstanceInput.avsc +132 -2
- acryl_datahub_cloud/metadata/schemas/DataProcessInstanceOutput.avsc +131 -1
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +14 -13
- acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestInput.avsc +6 -1
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestSignal.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTerms.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
- acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +96 -0
- acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/InputFields.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +399 -176
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +6 -4
- acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/Operation.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorGlobalConfigKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorPoolGlobalConfig.avsc +16 -0
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorPoolInfo.avsc +85 -0
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorPoolKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorStatus.avsc +5 -5
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +2 -2
- acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/VersionProperties.avsc +18 -0
- acryl_datahub_cloud/metadata/schemas/VersionSetProperties.avsc +5 -0
- {acryl_datahub_cloud-0.3.8.3.dist-info → acryl_datahub_cloud-0.3.9.dist-info}/METADATA +42 -42
- {acryl_datahub_cloud-0.3.8.3.dist-info → acryl_datahub_cloud-0.3.9.dist-info}/RECORD +78 -68
- {acryl_datahub_cloud-0.3.8.3.dist-info → acryl_datahub_cloud-0.3.9.dist-info}/entry_points.txt +1 -0
- acryl_datahub_cloud/api/entity_versioning.py +0 -167
- {acryl_datahub_cloud-0.3.8.3.dist-info → acryl_datahub_cloud-0.3.9.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.8.3.dist-info → acryl_datahub_cloud-0.3.9.dist-info}/top_level.txt +0 -0
|
@@ -370,7 +370,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
370
370
|
for doc in results:
|
|
371
371
|
if "platform" not in doc["_source"] or not doc["_source"]["platform"]:
|
|
372
372
|
logger.warning(
|
|
373
|
-
f"Platform not found in query {
|
|
373
|
+
f"Platform not found in query {doc['_source']['urn']}. Skipping..."
|
|
374
374
|
)
|
|
375
375
|
continue
|
|
376
376
|
|
|
@@ -688,10 +688,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
688
688
|
|
|
689
689
|
@staticmethod
|
|
690
690
|
def polars_to_arrow_schema(
|
|
691
|
-
polars_schema: Dict[str, Union[DataTypeClass, polars.DataType]]
|
|
691
|
+
polars_schema: Dict[str, Union[DataTypeClass, polars.DataType]],
|
|
692
692
|
) -> pa.Schema:
|
|
693
693
|
def convert_dtype(
|
|
694
|
-
polars_dtype: Union[DataTypeClass, polars.DataType]
|
|
694
|
+
polars_dtype: Union[DataTypeClass, polars.DataType],
|
|
695
695
|
) -> pa.DataType:
|
|
696
696
|
type_mapping: Dict[Union[DataTypeClass, polars.DataType], pa.DataType] = {
|
|
697
697
|
polars.Boolean(): pa.bool_(),
|
|
@@ -949,14 +949,14 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
949
949
|
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
|
|
950
950
|
if self.config.dataset_usage_enabled:
|
|
951
951
|
with self.report.dataset_usage_processing_time as timer:
|
|
952
|
-
self.report.
|
|
952
|
+
self.report.new_stage("generate dataset usage")
|
|
953
953
|
yield from self.generate_dataset_usage_mcps()
|
|
954
954
|
time_taken = timer.elapsed_seconds()
|
|
955
955
|
logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")
|
|
956
956
|
|
|
957
957
|
if self.config.dashboard_usage_enabled:
|
|
958
958
|
with self.report.dashboard_usage_processing_time as timer:
|
|
959
|
-
self.report.
|
|
959
|
+
self.report.new_stage("generate dashboard usage")
|
|
960
960
|
yield from self.generate_dashboard_usage_mcps()
|
|
961
961
|
|
|
962
962
|
time_taken = timer.elapsed_seconds()
|
|
@@ -964,7 +964,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
964
964
|
|
|
965
965
|
if self.config.chart_usage_enabled:
|
|
966
966
|
with self.report.chart_usage_processing_time as timer:
|
|
967
|
-
self.report.
|
|
967
|
+
self.report.new_stage("generate chart usage")
|
|
968
968
|
|
|
969
969
|
yield from self.generate_chart_usage_mcps()
|
|
970
970
|
|
|
@@ -973,7 +973,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
973
973
|
|
|
974
974
|
if self.config.query_usage_enabled:
|
|
975
975
|
with self.report.query_usage_processing_time as timer:
|
|
976
|
-
self.report.
|
|
976
|
+
self.report.new_stage("generate query usage")
|
|
977
977
|
|
|
978
978
|
yield from self.generate_query_usage_mcps()
|
|
979
979
|
|
|
@@ -1260,7 +1260,8 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1260
1260
|
# We drop last_modified_at to not use it in merge because we are getting last_modified_at from the usage index
|
|
1261
1261
|
df_with_search_scores = (
|
|
1262
1262
|
entities_lf.filter(
|
|
1263
|
-
polars.col("combinedSearchRankingMultiplier")
|
|
1263
|
+
polars.col("combinedSearchRankingMultiplier")
|
|
1264
|
+
.is_not_null()
|
|
1264
1265
|
# We only want to downrank datasets that have a search score multiplier greater than 1. 1 is the minimum score of a dataset
|
|
1265
1266
|
.and_(polars.col("combinedSearchRankingMultiplier").ne(1))
|
|
1266
1267
|
) # noqa: E712
|
|
@@ -1277,7 +1278,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1277
1278
|
usages_lf = df_with_search_scores.join(
|
|
1278
1279
|
usages_lf, on="urn", how="full", suffix="_right"
|
|
1279
1280
|
)
|
|
1280
|
-
|
|
1281
|
+
## Merge all common fields automatically
|
|
1281
1282
|
for common_field in common_fields:
|
|
1282
1283
|
right_col = f"{common_field}_right"
|
|
1283
1284
|
usages_lf = usages_lf.with_columns(
|
|
@@ -1451,9 +1452,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1451
1452
|
if not self.config.streaming_mode:
|
|
1452
1453
|
return polars.LazyFrame(data, schema)
|
|
1453
1454
|
else:
|
|
1454
|
-
assert (
|
|
1455
|
-
|
|
1456
|
-
)
|
|
1455
|
+
assert self.temp_dir is not None, (
|
|
1456
|
+
"In Streaming mode temp dir should be set. Normally this should not happen..."
|
|
1457
|
+
)
|
|
1457
1458
|
|
|
1458
1459
|
with tempfile.NamedTemporaryFile(
|
|
1459
1460
|
delete=False,
|
|
@@ -1623,7 +1624,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1623
1624
|
time_taken = timer.elapsed_seconds()
|
|
1624
1625
|
processed_count += len(results["hits"]["hits"])
|
|
1625
1626
|
logger.info(
|
|
1626
|
-
f"Processed {len(results['hits']['hits'
|
|
1627
|
+
f"Processed {len(results['hits']['hits'])} data from {index} index in {time_taken:.3f} seconds. Total: {processed_count} processed."
|
|
1627
1628
|
)
|
|
1628
1629
|
if len(results["hits"]["hits"]) < batch_size:
|
|
1629
1630
|
break
|