acryl-datahub-cloud 0.3.7.9rc1__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/acryl_cs_issues/source.py +0 -1
- acryl_datahub_cloud/api/__init__.py +1 -0
- acryl_datahub_cloud/api/client.py +6 -0
- acryl_datahub_cloud/api/entity_versioning.py +167 -0
- acryl_datahub_cloud/datahub_metadata_sharing/__init__.py +0 -0
- acryl_datahub_cloud/datahub_metadata_sharing/metadata_sharing_source.py +267 -0
- acryl_datahub_cloud/datahub_metadata_sharing/query.py +7 -0
- acryl_datahub_cloud/datahub_metadata_sharing/scroll_shared_entities.gql +204 -0
- acryl_datahub_cloud/datahub_metadata_sharing/share_entity.gql +9 -0
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +0 -2
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +0 -1
- acryl_datahub_cloud/datahub_reporting/extract_graph.py +0 -1
- acryl_datahub_cloud/datahub_reporting/extract_sql.py +0 -1
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +79 -57
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +284 -258
- acryl_datahub_cloud/lineage_features/source.py +22 -5
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +1564 -1465
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/executor/__init__.py +15 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- acryl_datahub_cloud/metadata/schema.avsc +23777 -22729
- acryl_datahub_cloud/metadata/schema_classes.py +1322 -519
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/AssertionInferenceDetails.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceInfo.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/DataHubViewInfo.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/DataProcessInstanceKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +63 -0
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/Deprecation.avsc +12 -0
- acryl_datahub_cloud/metadata/schemas/DynamicFormAssignment.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/EntityTypeKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestInput.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestResult.avsc +14 -0
- acryl_datahub_cloud/metadata/schemas/Filter.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureProperties.avsc +51 -0
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- acryl_datahub_cloud/metadata/schemas/MLModelGroupProperties.avsc +155 -0
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +155 -47
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- acryl_datahub_cloud/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +178 -47
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +10 -1
- acryl_datahub_cloud/metadata/schemas/PostInfo.avsc +23 -0
- acryl_datahub_cloud/metadata/schemas/RecommendationModule.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorStatus.avsc +80 -0
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/VersionProperties.avsc +216 -0
- acryl_datahub_cloud/metadata/schemas/VersionSetKey.avsc +26 -0
- acryl_datahub_cloud/metadata/schemas/VersionSetProperties.avsc +49 -0
- {acryl_datahub_cloud-0.3.7.9rc1.dist-info → acryl_datahub_cloud-0.3.8.dist-info}/METADATA +57 -47
- {acryl_datahub_cloud-0.3.7.9rc1.dist-info → acryl_datahub_cloud-0.3.8.dist-info}/RECORD +66 -49
- {acryl_datahub_cloud-0.3.7.9rc1.dist-info → acryl_datahub_cloud-0.3.8.dist-info}/WHEEL +1 -1
- {acryl_datahub_cloud-0.3.7.9rc1.dist-info → acryl_datahub_cloud-0.3.8.dist-info}/entry_points.txt +1 -0
- {acryl_datahub_cloud-0.3.7.9rc1.dist-info → acryl_datahub_cloud-0.3.8.dist-info}/top_level.txt +0 -0
The detailed diff below covers acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py (+284 -258); removed lines that the rendered view truncated are marked with `…`.

```diff
@@ -9,7 +9,8 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from functools import partial
 from itertools import chain
-from …
+from tempfile import TemporaryDirectory
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union
 
 import numpy
 import polars
```
```diff
@@ -17,6 +18,7 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 from elasticsearch.client import Elasticsearch
 from opensearchpy import OpenSearch
+from polars.datatypes import DataTypeClass
 from pydantic import Field
 from scipy.stats import expon
 
```
```diff
@@ -171,7 +173,7 @@ class DataHubUsageFeatureReportingSourceConfig(
     )
 
     disable_write_usage: bool = Field(
-        …
+        True,
         description="Flag to disable write usage statistics collection.'",
     )
 
```
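The hunk above flips the `disable_write_usage` default to `True`. A minimal pydantic sketch of how that default behaves (the model name is hypothetical, and the removed default value is truncated in the rendered diff, so the prior behavior is not asserted here):

```python
# Minimal sketch, assuming a standalone pydantic model; only the
# disable_write_usage field mirrors the diff above.
from pydantic import BaseModel, Field


class UsageReportingConfigSketch(BaseModel):
    disable_write_usage: bool = Field(
        True,  # the new default per the diff
        description="Flag to disable write usage statistics collection.",
    )


assert UsageReportingConfigSketch().disable_write_usage is True
assert UsageReportingConfigSketch(disable_write_usage=False).disable_write_usage is False
```

Per the field description, write usage statistics collection is now skipped unless a recipe explicitly sets `disable_write_usage: false`.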
```diff
@@ -245,6 +247,7 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
 class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     platform = "datahub"
     temp_files_to_clean: List[str] = []
+    temp_dir: Optional[TemporaryDirectory] = None
 
     def __init__(
         self, ctx: PipelineContext, config: DataHubUsageFeatureReportingSourceConfig
```
```diff
@@ -267,6 +270,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         if num > 0:
             logger.info(f"Compiled {num} regexp factors")
 
+        if self.config.streaming_mode:
+            self.temp_dir = tempfile.TemporaryDirectory(prefix="datahub-usage-")
+            logger.info(f"Using temp dir: {self.temp_dir.name}")
+
     def soft_deleted_batch(self, results: Iterable) -> Iterable[Dict]:
         with PerfTimer() as timer:
             for doc in results:
```
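When streaming mode is enabled, the source now provisions a `tempfile.TemporaryDirectory` up front. A standalone sketch of that provisioning (the `streaming_mode` flag and logger stand in for the real source's config and logging):

```python
# Provisioning sketch for the TemporaryDirectory created above.
import logging
import os
import tempfile

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("datahub-usage-sketch")

streaming_mode = True  # stands in for self.config.streaming_mode
temp_dir = None
if streaming_mode:
    temp_dir = tempfile.TemporaryDirectory(prefix="datahub-usage-")
    logger.info(f"Using temp dir: {temp_dir.name}")
    assert os.path.isdir(temp_dir.name)  # exists until cleanup/finalization
```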
```diff
@@ -389,60 +396,30 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 continue
 
             yield {
-                "timestampMillis": doc["_source"]…
-                "lastObserved": doc["_source"]…
-                "…
-                "…
-…
-                ),
-                "…
-                "viewsCount": (
-                    doc["_source"]["viewsCount"]
-                    if "viewsCount" in doc["_source"]
-                    else 0
-                ),
-                "uniqueUserCount": (
-                    doc["_source"]["uniqueUserCount"]
-                    if "uniqueUserCount" in doc["_source"]
-                    else None
-                ),
-                "userCounts": (
-                    doc["_source"]["event"]["userCounts"]
-                    if "userCounts" in doc["_source"]["event"]
-                    else []
-                ),
+                "timestampMillis": doc["_source"].get("timestampMillis"),
+                "lastObserved": doc["_source"]
+                .get("systemMetadata", {})
+                .get("lastObserved"),
+                "urn": doc["_source"].get("urn"),
+                "eventGranularity": doc["_source"].get("eventGranularity"),
+                "viewsCount": doc["_source"].get("viewsCount", 0),
+                "uniqueUserCount": doc["_source"].get("uniqueUserCount"),
+                "userCounts": doc["_source"].get("event", {}).get("userCounts", []),
                 "platform": platform,
             }
 
     def process_query_usage(self, results: Iterable) -> Iterable[Dict]:
         for doc in results:
             yield {
-                "timestampMillis": doc["_source"]…
-                "lastObserved": doc["_source"]…
-                "…
-                "…
-…
-                ),
-                "…
-                "queryCount": (
-                    doc["_source"]["queryCount"]
-                    if "queryCount" in doc["_source"]
-                    else 0
-                ),
-                "uniqueUserCount": (
-                    doc["_source"]["uniqueUserCount"]
-                    if "uniqueUserCount" in doc["_source"]
-                    else None
-                ),
-                "userCounts": (
-                    doc["_source"]["event"]["userCounts"]
-                    if "userCounts" in doc["_source"]["event"]
-                    else []
-                ),
+                "timestampMillis": doc["_source"].get("timestampMillis"),
+                "lastObserved": doc["_source"]
+                .get("systemMetadata", {})
+                .get("lastObserved"),
+                "urn": doc["_source"].get("urn"),
+                "eventGranularity": doc["_source"].get("eventGranularity"),
+                "queryCount": doc["_source"].get("queryCount", 0),
+                "uniqueUserCount": doc["_source"].get("uniqueUserCount"),
+                "userCounts": doc["_source"].get("event", {}).get("userCounts", []),
             }
 
     def upstream_lineage_batch(self, results: Iterable) -> Iterable[Dict]:
```
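Both rewritten generators above replace explicit `if key in doc ... else default` conditionals with chained `dict.get()` lookups, which also makes the nested `systemMetadata` access tolerant of missing keys. A self-contained equivalence sketch with a fabricated Elasticsearch hit:

```python
# Equivalence sketch for the .get() refactor; the document is fabricated.
doc = {"_source": {"urn": "urn:li:dataset:example", "viewsCount": 7, "event": {}}}
src = doc["_source"]

# Old style (per the removed lines): explicit membership tests.
views_old = src["viewsCount"] if "viewsCount" in src else 0
users_old = src["event"]["userCounts"] if "userCounts" in src["event"] else []

# New style: .get() with defaults; nested lookups chain through a {} fallback,
# so a hit without systemMetadata yields None instead of raising KeyError.
views_new = src.get("viewsCount", 0)
users_new = src.get("event", {}).get("userCounts", [])
last_observed = src.get("systemMetadata", {}).get("lastObserved")

assert (views_old, users_old) == (views_new, users_new) == (7, [])
assert last_observed is None
```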
```diff
@@ -497,7 +474,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "timestampMillis": doc["_source"]["timestampMillis"],
                 "urn": doc["_source"]["urn"],
                 "eventGranularity": doc["_source"]["eventGranularity"],
-                "partitionSpec": doc["_source"]["partitionSpec"],
                 "totalSqlQueries": doc["_source"]["totalSqlQueries"],
                 "uniqueUserCount": doc["_source"]["uniqueUserCount"],
                 "userCounts": (
```
```diff
@@ -695,9 +671,13 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         return lf
 
     @staticmethod
-    def polars_to_arrow_schema(…
-…
+    def polars_to_arrow_schema(
+        polars_schema: Dict[str, Union[DataTypeClass, polars.DataType]]
+    ) -> pa.Schema:
+        def convert_dtype(
+            polars_dtype: Union[DataTypeClass, polars.DataType]
+        ) -> pa.DataType:
+            type_mapping: Dict[Union[DataTypeClass, polars.DataType], pa.DataType] = {
                 polars.Boolean(): pa.bool_(),
                 polars.Int8(): pa.int8(),
                 polars.Int16(): pa.int16(),
```
```diff
@@ -710,6 +690,8 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 polars.Float32(): pa.float32(),
                 polars.Float64(): pa.float64(),
                 polars.Utf8(): pa.string(),
+                polars.Utf8(): pa.utf8(),
+                polars.String(): pa.string(),
                 polars.Date(): pa.date32(),
                 polars.Datetime(): pa.timestamp("ns"),
                 polars.Time(): pa.time64("ns"),
```
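Note that the two added lines leave `polars.Utf8()` in the mapping literal twice. A short sketch of why this is redundant rather than harmful (assuming a polars release where `Utf8` is an alias of `String`, which the diff's own use of `polars.String` suggests):

```python
# Duplicate-key sketch; in a dict literal the later entry simply wins.
import polars
import pyarrow as pa

assert pa.utf8() == pa.string()  # pyarrow documents utf8() as an alias of string()

mapping = {
    polars.Utf8(): pa.string(),
    polars.Utf8(): pa.utf8(),      # same key: overwrites the entry above
    polars.String(): pa.string(),  # Utf8 aliases String, so this overwrites again
}
print(mapping)  # collapses to a single String -> string() entry
```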
```diff
@@ -718,85 +700,97 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
             if polars_dtype in [type(key) for key in type_mapping.keys()]:
                 return type_mapping[polars_dtype]
-            elif polars_dtype == polars.Categorical…
+            elif polars_dtype == polars.Categorical:
                 return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
+            elif isinstance(polars_dtype, polars.Struct):
+                return pa.struct(
+                    {
+                        field.name: convert_dtype(field.dtype)
+                        for field in polars_dtype.fields
+                    }
+                )
+            elif isinstance(polars_dtype, polars.List):
+                return pa.list_(convert_dtype(polars_dtype.inner))
             else:
                 raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")
 
         fields = [(name, convert_dtype(dtype)) for name, dtype in polars_schema.items()]
         return pa.schema(fields)
 
-    def …
-        self,
-…
-        current_batch = []
-…
-        for row in es_data:
-            current_batch.append(row)
-…
-            if len(current_batch) >= batch_size:
-                # Convert the batch to a PyArrow Table
-                table = pa.Table.from_pylist(current_batch, schema=pa_schema)
+    def batch_write_parquet(
+        self,
+        data_iterator: Iterable[Dict[Any, Any]],
+        pl_schema: Dict,
+        output_path: str,
+        batch_size: int = 50000,
+        append: bool = False,
+        parquet_writer: Optional[pq.ParquetWriter] = None,
+    ) -> None:
+        """
+        Write data in batches to a file with support for appending to existing files.
+
+        Args:
+            data_iterator: Iterator of dictionaries containing the data
+            pa_schema: PyArrow schema for the data
+            output_path: Path for the output file
+            format_type: One of "ipc", "feather", "csv", "parquet", "pl_parquet"
+            batch_size: Number of rows per batch
+            append: If True, append to existing file. If False, create new file.
+            parquet_writer: Parquet doesn't let to append to existing file, so we need to pass the writer object
+        Returns:
+            LazyFrame pointing to the written data
+        """
+        arrow_schema = self.polars_to_arrow_schema(pl_schema)
 
-…
+        total_rows = 0
+        total_batches = 0
 
-…
+        try:
+            if parquet_writer:
+                writer = parquet_writer
+            else:
+                writer = pq.ParquetWriter(output_path, arrow_schema)
 
-…
-        table = pa.Table.from_pylist(…
+            try:
+                for batch in self._get_batches(data_iterator, batch_size):
+                    table = pa.Table.from_pylist(batch, schema=arrow_schema)
                     writer.write_table(table)
+                    total_rows += len(batch)
+                    total_batches += 1
+                    logger.debug(f"Wrote batch {total_batches} ({len(batch)} rows)")
+            finally:
+                if not parquet_writer:
+                    writer.close()
+        except Exception as e:
+            logger.exception(f"Error during batch writing: {str(e)}", exc_info=True)
+            raise
+
+    def _get_batches(
+        self, iterator: Iterable[Dict], batch_size: int
+    ) -> Iterator[List[Dict]]:
+        """Helper generator to create batches from an iterator."""
+        current_batch = []
+        for item in iterator:
+            current_batch.append(item)
+            if len(current_batch) >= batch_size:
+                yield current_batch
+                current_batch = []
 
-…
+        if current_batch:
+            yield current_batch
 
     def load_write_usage(
         self, soft_deleted_entities_df: polars.LazyFrame
     ) -> polars.LazyFrame:
-…
-        wdf = wdf.cast({polars.String: polars.Categorical})
-        else:
-            wdf = polars.LazyFrame(
-                self.load_data_from_es(
-                    "dataset_operationaspect_v1",
-                    QueryBuilder.get_dataset_write_usage_raw_query(
-                        self.config.lookback_days
-                    ),
-                    self.write_stat_raw_batch,
-                ),
-                schema={"urn": polars.Categorical, "platform": polars.Categorical},
-                strict=True,
-            )
+        wdf = self.load_data_from_es_to_lf(
+            index="dataset_operationaspect_v1",
+            query=QueryBuilder.get_dataset_write_usage_raw_query(
+                self.config.lookback_days
+            ),
+            process_function=self.write_stat_raw_batch,
+            schema={"urn": polars.Categorical, "platform": polars.Categorical},
+        )
+        wdf = wdf.cast({polars.String: polars.Categorical})
 
         wdf = wdf.group_by(polars.col("urn"), polars.col("platform")).agg(
             polars.col("urn").count().alias("write_count"),
```
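The new `batch_write_parquet`/`_get_batches` pair streams dicts into a parquet file one bounded batch at a time instead of materializing the whole result set. A trimmed, standalone sketch of the same `pyarrow.parquet.ParquetWriter` pattern (schema and rows fabricated):

```python
# Batched ParquetWriter sketch mirroring _get_batches/batch_write_parquet.
from typing import Dict, Iterable, Iterator, List

import pyarrow as pa
import pyarrow.parquet as pq


def get_batches(rows: Iterable[Dict], batch_size: int) -> Iterator[List[Dict]]:
    batch: List[Dict] = []
    for row in rows:
        batch.append(row)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:
        yield batch  # flush the final partial batch


schema = pa.schema([("urn", pa.string()), ("write_count", pa.int64())])
rows = ({"urn": f"urn:li:dataset:{i}", "write_count": i} for i in range(10))

with pq.ParquetWriter("/tmp/usage_sketch.parquet", schema) as writer:
    for batch in get_batches(rows, batch_size=4):
        # Each write_table call appends a row group; peak memory is bounded
        # by batch_size rather than by the full result set.
        writer.write_table(pa.Table.from_pylist(batch, schema=schema))

print(pq.read_table("/tmp/usage_sketch.parquet").num_rows)  # 10
```

Passing an already-open writer, as `load_data_from_es_to_lf` does later in this diff, is how the code works around parquet's lack of an append mode.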
```diff
@@ -851,18 +845,18 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def set_table_modification_time_for_views(
         self, datasets_df: polars.LazyFrame
     ) -> polars.LazyFrame:
-…
-            schema=…
-…
-            strict=True,
+        schema = {
+            "source_urn": polars.Categorical,
+            "destination_urn": polars.Categorical,
+        }
+
+        upstreams_lf = self.load_data_from_es_to_lf(
+            schema=schema,
+            index="graph_service_v1",
+            query=QueryBuilder.get_upstreams_query(),
+            process_function=self.upstream_lineage_batch,
         )
+
         wdf = (
             (
                 upstreams_lf.join(
```
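The urn columns above are typed `polars.Categorical`, and the method immediately joins the resulting frame. A side sketch (not from the diff) of the string-cache caveat that, in many polars versions, accompanies joining categoricals built from independent sources:

```python
# Categorical join sketch; data is fabricated. Without a shared StringCache,
# many polars versions refuse to compare categoricals from different sources.
import polars

with polars.StringCache():
    upstreams = polars.LazyFrame(
        {"source_urn": ["a", "b"], "destination_urn": ["c", "d"]},
        schema={"source_urn": polars.Categorical, "destination_urn": polars.Categorical},
    )
    datasets = polars.LazyFrame(
        {"source_urn": ["a"], "last_modified_at": [1700000000000]},
        schema={"source_urn": polars.Categorical, "last_modified_at": polars.Int64},
    )
    print(upstreams.join(datasets, on="source_urn", how="left").collect())
```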
```diff
@@ -1116,7 +1110,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
         num = 0
-        for row in lazy_frame.collect().…
+        for row in lazy_frame.collect().iter_rows(named=True):
             num += 1
 
             query_usage_features = QueryUsageFeaturesClass(
```
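The completed call is polars' `iter_rows(named=True)`, which yields one dict per row instead of a positional tuple:

```python
# iter_rows(named=True) sketch with a fabricated frame.
import polars

lf = polars.LazyFrame({"urn": ["urn:li:query:1"], "queryCount": [42]})
for row in lf.collect().iter_rows(named=True):
    print(row["urn"], row["queryCount"])  # dict-style access by column name
```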
```diff
@@ -1186,49 +1180,43 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_dashboard_chart_usage(
         self, entity_index: str, usage_index: str
     ) -> polars.LazyFrame:
-…
-            strict=True,
+        soft_deleted_schema = {
+            "entity_urn": polars.Categorical,
+            "removed": polars.Boolean,
+            "last_modified_at": polars.Int64,
+            "siblings": polars.List(polars.String),
+            "isView": polars.Boolean,
+        }
+
+        soft_deleted_df = self.load_data_from_es_to_lf(
+            schema=soft_deleted_schema,
+            index=entity_index,
+            query=QueryBuilder.get_dataset_entities_query(),
+            process_function=self.soft_deleted_batch,
         )
 
-…
-                "urn": polars.Categorical,
-                "platform": polars.Categorical,
-                "eventGranularity": polars.String,
-                "partitionSpec": polars.Struct(
+        dashboard_usage_schema = {
+            "timestampMillis": polars.Int64,
+            "lastObserved": polars.Int64,
+            "urn": polars.Categorical,
+            "platform": polars.Categorical,
+            "eventGranularity": polars.String,
+            "viewsCount": polars.Int64,
+            "userCounts": polars.List(
+                polars.Struct(
                     {
-                        "…
+                        "usageCount": polars.Int64,
+                        "user": polars.String,
                     }
-                )
-…
-            },
+                )
+            ),
+        }
+
+        lf = self.load_data_from_es_to_lf(
+            schema=dashboard_usage_schema,
+            index=usage_index,
+            query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
+            process_function=self.process_dashboard_usage,
         )
 
         lf = (
```
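The `userCounts` field above is declared as a `polars.List` of `polars.Struct`, i.e. one nested column rather than a separate table. A small sketch of how such a schema materializes and can be flattened (fabricated data):

```python
# Nested List(Struct) schema sketch matching the userCounts declaration above.
import polars

schema = {
    "urn": polars.Categorical,
    "viewsCount": polars.Int64,
    "userCounts": polars.List(
        polars.Struct({"usageCount": polars.Int64, "user": polars.String})
    ),
}
df = polars.DataFrame(
    {
        "urn": ["urn:li:dashboard:1"],
        "viewsCount": [3],
        "userCounts": [
            [{"usageCount": 2, "user": "alice"}, {"usageCount": 1, "user": "bob"}]
        ],
    },
    schema=schema,
)
# explode turns each list element into a row; unnest lifts struct fields
# into top-level columns.
print(df.explode("userCounts").unnest("userCounts"))
```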
```diff
@@ -1301,48 +1289,41 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_query_usage(self) -> polars.LazyFrame:
         usage_index = "query_queryusagestatisticsaspect_v1"
         entity_index = "queryindex_v2"
-…
-            },
-            strict=True,
+        query_entities_schema = {
+            "entity_urn": polars.Categorical,
+            "last_modified_at": polars.Int64,
+            "platform": polars.Categorical,
+            "removed": polars.Boolean,
+        }
+
+        query_entities = self.load_data_from_es_to_lf(
+            schema=query_entities_schema,
+            index=entity_index,
+            query=QueryBuilder.get_query_entities_query(),
+            process_function=self.queries_entities_batch,
         )
 
-…
-                "lastObserved": polars.Int64,
-                "urn": polars.Categorical,
-                "eventGranularity": polars.String,
-                "partitionSpec": polars.Struct(
+        query_usage_schema = {
+            "timestampMillis": polars.Int64,
+            "lastObserved": polars.Int64,
+            "urn": polars.Categorical,
+            "eventGranularity": polars.String,
+            "queryCount": polars.Int64,
+            "userCounts": polars.List(
+                polars.Struct(
                     {
-                        "…
+                        "usageCount": polars.Int64,
+                        "user": polars.String,
                     }
-                )
-…
-            ),
-            },
+                )
+            ),
+        }
+
+        lf = self.load_data_from_es_to_lf(
+            schema=query_usage_schema,
+            index=usage_index,
+            query=QueryBuilder.get_query_usage_query(self.config.lookback_days),
+            process_function=self.process_query_usage,
         )
 
         lf = query_entities.join(
```
```diff
@@ -1380,36 +1361,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         if self.config.set_upstream_table_max_modification_time_for_views:
             datasets_lf = self.set_table_modification_time_for_views(datasets_lf)
 
-…
-        lf: polars.LazyFrame = polars.LazyFrame(
-            self.load_data_from_es(
-                index=index,
-                query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
-                process_function=self.process_batch,
-            ),
-            schema={
-                "timestampMillis": polars.Int64,
-                "urn": polars.Categorical,
-                "platform": polars.Categorical,
-                "eventGranularity": polars.String,
-                "partitionSpec": polars.Struct(
-                    {
-                        "partition": polars.String,
-                    }
-                ),
-                "totalSqlQueries": polars.Int64,
-                "uniqueUserCount": polars.Int64,
-                "userCounts": polars.List(
-                    polars.Struct(
-                        {
-                            "count": polars.Int64,
-                            "user": polars.String,
-                            "userEmail": polars.String,
-                        }
-                    )
-                ),
-            },
-        )
+        lf = self.load_dataset_usage()
 
         # Polaris/pandas join merges the join column into one column and that's why we need to filter based on the removed column
         lf = (
```
```diff
@@ -1472,23 +1424,101 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         )
         return usage_and_write_lf
 
-    def …
-…
+    def load_data_from_es_to_lf(
+        self,
+        index: str,
+        schema: Dict,
+        query: Dict,
+        process_function: Callable,
+        aggregation_key: Optional[str] = None,
+        file_to_load: Optional[str] = None,
+    ) -> polars.LazyFrame:
+        data = self.load_data_from_es(
+            index=index,
+            query=query,
+            process_function=process_function,
+            aggregation_key=aggregation_key,
+        )
+
+        if not self.config.streaming_mode:
+            return polars.LazyFrame(data, schema)
+        else:
+            assert (
+                self.temp_dir is not None
+            ), "In Streaming mode temp dir should be set. Normally this should not happen..."
+
+            with tempfile.NamedTemporaryFile(
+                delete=False,
+                mode="wb",
+                dir=self.temp_dir.name,
+                prefix=f"{index}_",
+                suffix=".parquet",
+            ) as temp_file:
+                tempfile_name = temp_file.name
+                with pq.ParquetWriter(
+                    tempfile_name, self.polars_to_arrow_schema(schema)
+                ) as writer:
+                    logger.debug(f"Creating temporary file {tempfile_name}")
+
+                    self.batch_write_parquet(
+                        data,
+                        schema,
+                        temp_file.name,
+                        parquet_writer=writer,
+                    )
+                # Scan parquet fails in some cases with
+                # thread 'polars-1' panicked at crates/polars-parquet/src/arrow/read/deserialize/dictionary_encoded/required_masked_dense.rs:113:72:
+                # called `Option::unwrap()` on a `None` value
+                # Which only happens if we don't collect immediately
+                # return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
+                return (
+                    polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
+                    .collect()
+                    .lazy()
+                )
+
+    def load_dataset_usage(self) -> polars.LazyFrame:
+        index = "dataset_datasetusagestatisticsaspect_v1"
+        schema = {
+            "timestampMillis": polars.Int64,
+            "urn": polars.Categorical,
+            "platform": polars.Categorical,
+            "eventGranularity": polars.String,
+            "totalSqlQueries": polars.Int64,
+            "uniqueUserCount": polars.Int64,
+            "userCounts": polars.List(
+                polars.Struct(
+                    {
+                        "count": polars.Int64,
+                        "user": polars.String,
+                        "userEmail": polars.String,
+                    }
+                )
             ),
-…
+        }
+
+        return self.load_data_from_es_to_lf(
+            schema=schema,
+            index=index,
+            query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
+            process_function=self.process_batch,
+        )
+
+    def get_datasets(self) -> polars.LazyFrame:
+        schema = {
+            "entity_urn": polars.Categorical,
+            "removed": polars.Boolean,
+            "last_modified_at": polars.Int64,
+            "siblings": polars.List(polars.String),
+            "isView": polars.Boolean,
+        }
+
+        return self.load_data_from_es_to_lf(
+            schema=schema,
+            index="datasetindex_v2",
+            query=QueryBuilder.get_dataset_entities_query(),
+            process_function=self.soft_deleted_batch,
         )
-        return datasets_df
 
     def generate_top_users(
         self, lf: polars.LazyFrame, count_field_name: str = "count"
```
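In streaming mode, `load_data_from_es_to_lf` deliberately materializes the parquet scan right away (`scan_parquet(...).collect().lazy()`) to sidestep the polars panic quoted in its comments, while still handing a `LazyFrame` to downstream code. A sketch of that collect-then-relazify pattern with a fabricated file:

```python
# collect().lazy() sketch: read eagerly, then re-wrap the in-memory frame so
# callers can keep composing lazy operations. Path and schema are fabricated.
import polars

path = "/tmp/usage_relazify_sketch.parquet"
schema = {"urn": polars.String, "queryCount": polars.Int64}
polars.DataFrame(
    {"urn": ["urn:li:query:1"], "queryCount": [5]}, schema=schema
).write_parquet(path)

lf = polars.scan_parquet(path, schema=schema, low_memory=True).collect().lazy()
print(lf.select(polars.col("queryCount").sum()).collect())
```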
```diff
@@ -1560,6 +1590,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         batch_size: int = 1000,
         delay: Optional[float] = None,
     ) -> Iterable[Dict[str, Any]]:
+        processed_count = 0
         while True:
             with PerfTimer() as timer:
                 logger.debug(f"ES query: {query}")
```
```diff
@@ -1581,8 +1612,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             yield from process_function(results["hits"]["hits"])
 
             time_taken = timer.elapsed_seconds()
+            processed_count += len(results["hits"]["hits"])
             logger.info(
-                f"Processed {len(results['hits']['hits'])} data from {index} index in {time_taken:.3f} seconds"
+                f"Processed {len(results['hits']['hits'])} data from {index} index in {time_taken:.3f} seconds. Total: {processed_count} processed."
             )
             if len(results["hits"]["hits"]) < batch_size:
                 break
```
```diff
@@ -1609,9 +1641,3 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
     def get_report(self) -> SourceReport:
         return self.report
-
-    def __del__(self) -> None:
-        for temp_file in self.temp_files_to_clean:
-            logger.info(f"Cleaning up temp file: {temp_file}")
-            os.remove(temp_file)
-        self.temp_files_to_clean = []
```