acryl-datahub-cloud 0.3.7.9.1__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub-cloud might be problematic.

Files changed (64)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/acryl_cs_issues/source.py +0 -1
  3. acryl_datahub_cloud/api/__init__.py +1 -0
  4. acryl_datahub_cloud/api/client.py +6 -0
  5. acryl_datahub_cloud/api/entity_versioning.py +167 -0
  6. acryl_datahub_cloud/datahub_metadata_sharing/__init__.py +0 -0
  7. acryl_datahub_cloud/datahub_metadata_sharing/metadata_sharing_source.py +267 -0
  8. acryl_datahub_cloud/datahub_metadata_sharing/query.py +7 -0
  9. acryl_datahub_cloud/datahub_metadata_sharing/scroll_shared_entities.gql +204 -0
  10. acryl_datahub_cloud/datahub_metadata_sharing/share_entity.gql +9 -0
  11. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +0 -2
  12. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +0 -1
  13. acryl_datahub_cloud/datahub_reporting/extract_graph.py +0 -1
  14. acryl_datahub_cloud/datahub_reporting/extract_sql.py +0 -1
  15. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +79 -57
  16. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +284 -258
  17. acryl_datahub_cloud/lineage_features/source.py +22 -5
  18. acryl_datahub_cloud/metadata/_urns/urn_defs.py +1593 -1494
  19. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  20. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/executor/__init__.py +15 -0
  21. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  22. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  23. acryl_datahub_cloud/metadata/schema.avsc +20140 -19735
  24. acryl_datahub_cloud/metadata/schema_classes.py +1083 -486
  25. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +1 -1
  26. acryl_datahub_cloud/metadata/schemas/AssertionInferenceDetails.avsc +1 -1
  27. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +1 -1
  28. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +1 -1
  29. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  30. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
  31. acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceInfo.avsc +6 -0
  32. acryl_datahub_cloud/metadata/schemas/DataHubViewInfo.avsc +2 -0
  33. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +3 -1
  34. acryl_datahub_cloud/metadata/schemas/DataProcessInstanceInput.avsc +2 -4
  35. acryl_datahub_cloud/metadata/schemas/DataProcessInstanceOutput.avsc +0 -2
  36. acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +63 -0
  37. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +2 -1
  38. acryl_datahub_cloud/metadata/schemas/Deprecation.avsc +12 -0
  39. acryl_datahub_cloud/metadata/schemas/DynamicFormAssignment.avsc +2 -0
  40. acryl_datahub_cloud/metadata/schemas/EntityTypeKey.avsc +1 -0
  41. acryl_datahub_cloud/metadata/schemas/ExecutionRequestInput.avsc +9 -0
  42. acryl_datahub_cloud/metadata/schemas/ExecutionRequestResult.avsc +14 -0
  43. acryl_datahub_cloud/metadata/schemas/Filter.avsc +2 -0
  44. acryl_datahub_cloud/metadata/schemas/MLFeatureProperties.avsc +51 -0
  45. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  46. acryl_datahub_cloud/metadata/schemas/MLModelGroupProperties.avsc +51 -0
  47. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +2 -1
  48. acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +51 -0
  49. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  50. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +20 -0
  51. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +10 -1
  52. acryl_datahub_cloud/metadata/schemas/PostInfo.avsc +23 -0
  53. acryl_datahub_cloud/metadata/schemas/RecommendationModule.avsc +2 -0
  54. acryl_datahub_cloud/metadata/schemas/RemoteExecutorKey.avsc +21 -0
  55. acryl_datahub_cloud/metadata/schemas/RemoteExecutorStatus.avsc +80 -0
  56. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +2 -1
  57. acryl_datahub_cloud/metadata/schemas/VersionProperties.avsc +216 -0
  58. acryl_datahub_cloud/metadata/schemas/VersionSetKey.avsc +26 -0
  59. acryl_datahub_cloud/metadata/schemas/VersionSetProperties.avsc +49 -0
  60. {acryl_datahub_cloud-0.3.7.9.1.dist-info → acryl_datahub_cloud-0.3.8.dist-info}/METADATA +52 -44
  61. {acryl_datahub_cloud-0.3.7.9.1.dist-info → acryl_datahub_cloud-0.3.8.dist-info}/RECORD +64 -48
  62. {acryl_datahub_cloud-0.3.7.9.1.dist-info → acryl_datahub_cloud-0.3.8.dist-info}/entry_points.txt +1 -0
  63. {acryl_datahub_cloud-0.3.7.9.1.dist-info → acryl_datahub_cloud-0.3.8.dist-info}/WHEEL +0 -0
  64. {acryl_datahub_cloud-0.3.7.9.1.dist-info → acryl_datahub_cloud-0.3.8.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py

@@ -9,7 +9,8 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from functools import partial
 from itertools import chain
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+from tempfile import TemporaryDirectory
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union

 import numpy
 import polars
@@ -17,6 +18,7 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 from elasticsearch.client import Elasticsearch
 from opensearchpy import OpenSearch
+from polars.datatypes import DataTypeClass
 from pydantic import Field
 from scipy.stats import expon

@@ -171,7 +173,7 @@ class DataHubUsageFeatureReportingSourceConfig(
     )

     disable_write_usage: bool = Field(
-        False,
+        True,
         description="Flag to disable write usage statistics collection.'",
     )

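Note on the hunk above: the default for disable_write_usage flips from False to True, so write-usage collection is now skipped unless a recipe turns it back on explicitly. A minimal stand-in sketch (plain pydantic, not the actual acryl_datahub_cloud config class) of how the new default behaves:

    from pydantic import BaseModel, Field

    class UsageConfigSketch(BaseModel):
        # Mirrors the flipped default shown above (0.3.8: True, 0.3.7.9.1: False).
        disable_write_usage: bool = Field(
            True,
            description="Flag to disable write usage statistics collection.",
        )

    print(UsageConfigSketch().disable_write_usage)                           # True
    print(UsageConfigSketch(disable_write_usage=False).disable_write_usage)  # False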
@@ -245,6 +247,7 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
 class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     platform = "datahub"
     temp_files_to_clean: List[str] = []
+    temp_dir: Optional[TemporaryDirectory] = None

     def __init__(
         self, ctx: PipelineContext, config: DataHubUsageFeatureReportingSourceConfig
@@ -267,6 +270,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         if num > 0:
             logger.info(f"Compiled {num} regexp factors")

+        if self.config.streaming_mode:
+            self.temp_dir = tempfile.TemporaryDirectory(prefix="datahub-usage-")
+            logger.info(f"Using temp dir: {self.temp_dir.name}")
+
     def soft_deleted_batch(self, results: Iterable) -> Iterable[Dict]:
         with PerfTimer() as timer:
             for doc in results:
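The constructor now creates one tempfile.TemporaryDirectory when streaming_mode is enabled. Standard-library behaviour (shown on its own below, outside this source) is that everything written under the directory is removed when cleanup() runs or the object is garbage-collected, which is what makes the per-file bookkeeping and the __del__ hook removed at the end of this diff unnecessary:

    import os
    import tempfile

    temp_dir = tempfile.TemporaryDirectory(prefix="datahub-usage-")
    path = os.path.join(temp_dir.name, "example.parquet")
    with open(path, "wb") as f:
        f.write(b"placeholder bytes")

    print(os.path.exists(path))  # True while the directory is alive
    temp_dir.cleanup()           # removes the directory and everything inside it
    print(os.path.exists(path))  # False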
@@ -389,60 +396,30 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 continue

             yield {
-                "timestampMillis": doc["_source"]["timestampMillis"],
-                "lastObserved": doc["_source"]["systemMetadata"]["lastObserved"],
-                "urn": doc["_source"]["urn"],
-                "eventGranularity": (
-                    doc["_source"]["eventGranularity"]
-                    if "eventGranularity" in doc["_source"]
-                    else None
-                ),
-                "partitionSpec": doc["_source"]["partitionSpec"],
-                "viewsCount": (
-                    doc["_source"]["viewsCount"]
-                    if "viewsCount" in doc["_source"]
-                    else 0
-                ),
-                "uniqueUserCount": (
-                    doc["_source"]["uniqueUserCount"]
-                    if "uniqueUserCount" in doc["_source"]
-                    else None
-                ),
-                "userCounts": (
-                    doc["_source"]["event"]["userCounts"]
-                    if "userCounts" in doc["_source"]["event"]
-                    else []
-                ),
+                "timestampMillis": doc["_source"].get("timestampMillis"),
+                "lastObserved": doc["_source"]
+                .get("systemMetadata", {})
+                .get("lastObserved"),
+                "urn": doc["_source"].get("urn"),
+                "eventGranularity": doc["_source"].get("eventGranularity"),
+                "viewsCount": doc["_source"].get("viewsCount", 0),
+                "uniqueUserCount": doc["_source"].get("uniqueUserCount"),
+                "userCounts": doc["_source"].get("event", {}).get("userCounts", []),
                 "platform": platform,
             }

     def process_query_usage(self, results: Iterable) -> Iterable[Dict]:
         for doc in results:
             yield {
-                "timestampMillis": doc["_source"]["timestampMillis"],
-                "lastObserved": doc["_source"]["systemMetadata"]["lastObserved"],
-                "urn": doc["_source"]["urn"],
-                "eventGranularity": (
-                    doc["_source"]["eventGranularity"]
-                    if "eventGranularity" in doc["_source"]
-                    else None
-                ),
-                "partitionSpec": doc["_source"]["partitionSpec"],
-                "queryCount": (
-                    doc["_source"]["queryCount"]
-                    if "queryCount" in doc["_source"]
-                    else 0
-                ),
-                "uniqueUserCount": (
-                    doc["_source"]["uniqueUserCount"]
-                    if "uniqueUserCount" in doc["_source"]
-                    else None
-                ),
-                "userCounts": (
-                    doc["_source"]["event"]["userCounts"]
-                    if "userCounts" in doc["_source"]["event"]
-                    else []
-                ),
+                "timestampMillis": doc["_source"].get("timestampMillis"),
+                "lastObserved": doc["_source"]
+                .get("systemMetadata", {})
+                .get("lastObserved"),
+                "urn": doc["_source"].get("urn"),
+                "eventGranularity": doc["_source"].get("eventGranularity"),
+                "queryCount": doc["_source"].get("queryCount", 0),
+                "uniqueUserCount": doc["_source"].get("uniqueUserCount"),
+                "userCounts": doc["_source"].get("event", {}).get("userCounts", []),
             }

     def upstream_lineage_batch(self, results: Iterable) -> Iterable[Dict]:
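The rewrite above replaces the "key in doc" conditionals with chained dict.get lookups and drops the partitionSpec field from both generators. A generic illustration with a made-up Elasticsearch hit (not data from this source):

    hit = {"_source": {"timestampMillis": 1700000000000, "urn": "urn:li:dataset:example"}}

    row = {
        "timestampMillis": hit["_source"].get("timestampMillis"),
        "lastObserved": hit["_source"].get("systemMetadata", {}).get("lastObserved"),
        "urn": hit["_source"].get("urn"),
        "viewsCount": hit["_source"].get("viewsCount", 0),
        "userCounts": hit["_source"].get("event", {}).get("userCounts", []),
    }
    print(row)  # absent keys fall back to None, 0 or [] instead of raising KeyError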
@@ -497,7 +474,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "timestampMillis": doc["_source"]["timestampMillis"],
                 "urn": doc["_source"]["urn"],
                 "eventGranularity": doc["_source"]["eventGranularity"],
-                "partitionSpec": doc["_source"]["partitionSpec"],
                 "totalSqlQueries": doc["_source"]["totalSqlQueries"],
                 "uniqueUserCount": doc["_source"]["uniqueUserCount"],
                 "userCounts": (
@@ -695,9 +671,13 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         return lf

     @staticmethod
-    def polars_to_arrow_schema(polars_schema: Dict[str, polars.DataType]) -> pa.Schema:
-        def convert_dtype(polars_dtype: polars.DataType) -> pa.DataType:
-            type_mapping: Dict[polars.DataType, pa.DataType] = {
+    def polars_to_arrow_schema(
+        polars_schema: Dict[str, Union[DataTypeClass, polars.DataType]]
+    ) -> pa.Schema:
+        def convert_dtype(
+            polars_dtype: Union[DataTypeClass, polars.DataType]
+        ) -> pa.DataType:
+            type_mapping: Dict[Union[DataTypeClass, polars.DataType], pa.DataType] = {
                 polars.Boolean(): pa.bool_(),
                 polars.Int8(): pa.int8(),
                 polars.Int16(): pa.int16(),
@@ -710,6 +690,8 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 polars.Float32(): pa.float32(),
                 polars.Float64(): pa.float64(),
                 polars.Utf8(): pa.string(),
+                polars.Utf8(): pa.utf8(),
+                polars.String(): pa.string(),
                 polars.Date(): pa.date32(),
                 polars.Datetime(): pa.timestamp("ns"),
                 polars.Time(): pa.time64("ns"),
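The two hunks above widen polars_to_arrow_schema to accept both dtype classes and dtype instances and add String/Utf8 entries to the lookup table. For comparison only (not what the package does), Polars can also produce an Arrow schema itself by round-tripping an empty frame:

    import polars

    pl_schema = {"urn": polars.Categorical, "timestampMillis": polars.Int64}
    arrow_schema = polars.DataFrame(schema=pl_schema).to_arrow().schema
    print(arrow_schema)  # Arrow types chosen by Polars' own conversion rules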
@@ -718,85 +700,97 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

             if polars_dtype in [type(key) for key in type_mapping.keys()]:
                 return type_mapping[polars_dtype]
-            elif polars_dtype == polars.Categorical():
+            elif polars_dtype == polars.Categorical:
                 return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
+            elif isinstance(polars_dtype, polars.Struct):
+                return pa.struct(
+                    {
+                        field.name: convert_dtype(field.dtype)
+                        for field in polars_dtype.fields
+                    }
+                )
+            elif isinstance(polars_dtype, polars.List):
+                return pa.list_(convert_dtype(polars_dtype.inner))
             else:
                 raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")

         fields = [(name, convert_dtype(dtype)) for name, dtype in polars_schema.items()]
         return pa.schema(fields)

-    def load_es_data_to_lf(
-        self, index: str, query: Dict, read_function: Callable, schema: Dict
-    ) -> polars.LazyFrame:
-        es_data = self.load_data_from_es(
-            index,
-            query,
-            read_function,
-        )
-
-        with tempfile.NamedTemporaryFile(
-            delete=False, mode="wb", suffix=".parquet"
-        ) as temp_file:
-            tempfile_name = temp_file.name
-            logger.debug(f"Creating temporary file {tempfile_name}")
-            self.temp_files_to_clean.append(tempfile_name)
-
-            # Create a PyArrow schema from the provided schema dict
-            pa_schema = self.polars_to_arrow_schema(schema)
-
-            # Initialize the ParquetWriter
-            with pq.ParquetWriter(tempfile_name, pa_schema) as writer:
-                batch_size = (
-                    1000  # Adjust this value based on your data and memory constraints
-                )
-                current_batch = []
-
-                for row in es_data:
-                    current_batch.append(row)
-
-                    if len(current_batch) >= batch_size:
-                        # Convert the batch to a PyArrow Table
-                        table = pa.Table.from_pylist(current_batch, schema=pa_schema)
+    def batch_write_parquet(
+        self,
+        data_iterator: Iterable[Dict[Any, Any]],
+        pl_schema: Dict,
+        output_path: str,
+        batch_size: int = 50000,
+        append: bool = False,
+        parquet_writer: Optional[pq.ParquetWriter] = None,
+    ) -> None:
+        """
+        Write data in batches to a file with support for appending to existing files.
+
+        Args:
+            data_iterator: Iterator of dictionaries containing the data
+            pa_schema: PyArrow schema for the data
+            output_path: Path for the output file
+            format_type: One of "ipc", "feather", "csv", "parquet", "pl_parquet"
+            batch_size: Number of rows per batch
+            append: If True, append to existing file. If False, create new file.
+            parquet_writer: Parquet doesn't let to append to existing file, so we need to pass the writer object
+        Returns:
+            LazyFrame pointing to the written data
+        """
+        arrow_schema = self.polars_to_arrow_schema(pl_schema)

-                        # Write the batch
-                        writer.write_table(table)
+        total_rows = 0
+        total_batches = 0

-                        # Clear the current batch
-                        current_batch = []
+        try:
+            if parquet_writer:
+                writer = parquet_writer
+            else:
+                writer = pq.ParquetWriter(output_path, arrow_schema)

-                # Write any remaining rows
-                if current_batch:
-                    table = pa.Table.from_pylist(current_batch, schema=pa_schema)
+            try:
+                for batch in self._get_batches(data_iterator, batch_size):
+                    table = pa.Table.from_pylist(batch, schema=arrow_schema)
                     writer.write_table(table)
+                    total_rows += len(batch)
+                    total_batches += 1
+                    logger.debug(f"Wrote batch {total_batches} ({len(batch)} rows)")
+            finally:
+                if not parquet_writer:
+                    writer.close()
+        except Exception as e:
+            logger.exception(f"Error during batch writing: {str(e)}", exc_info=True)
+            raise
+
+    def _get_batches(
+        self, iterator: Iterable[Dict], batch_size: int
+    ) -> Iterator[List[Dict]]:
+        """Helper generator to create batches from an iterator."""
+        current_batch = []
+        for item in iterator:
+            current_batch.append(item)
+            if len(current_batch) >= batch_size:
+                yield current_batch
+                current_batch = []

-        return polars.scan_parquet(tempfile_name)
+        if current_batch:
+            yield current_batch

     def load_write_usage(
         self, soft_deleted_entities_df: polars.LazyFrame
     ) -> polars.LazyFrame:
-        if self.config.streaming_mode:
-            wdf = self.load_es_data_to_lf(
-                index="dataset_operationaspect_v1",
-                query=QueryBuilder.get_dataset_write_usage_raw_query(
-                    self.config.lookback_days
-                ),
-                read_function=self.write_stat_raw_batch,
-                schema={"urn": polars.Categorical, "platform": polars.Categorical},
-            )
-            wdf = wdf.cast({polars.String: polars.Categorical})
-        else:
-            wdf = polars.LazyFrame(
-                self.load_data_from_es(
-                    "dataset_operationaspect_v1",
-                    QueryBuilder.get_dataset_write_usage_raw_query(
-                        self.config.lookback_days
-                    ),
-                    self.write_stat_raw_batch,
-                ),
-                schema={"urn": polars.Categorical, "platform": polars.Categorical},
-                strict=True,
-            )
+        wdf = self.load_data_from_es_to_lf(
+            index="dataset_operationaspect_v1",
+            query=QueryBuilder.get_dataset_write_usage_raw_query(
+                self.config.lookback_days
+            ),
+            process_function=self.write_stat_raw_batch,
+            schema={"urn": polars.Categorical, "platform": polars.Categorical},
+        )
+        wdf = wdf.cast({polars.String: polars.Categorical})

         wdf = wdf.group_by(polars.col("urn"), polars.col("platform")).agg(
             polars.col("urn").count().alias("write_count"),
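batch_write_parquet and _get_batches replace the inline buffering loop of load_es_data_to_lf: rows are accumulated into fixed-size batches, each batch becomes an Arrow table, and all batches are appended through a single ParquetWriter. A self-contained sketch of the same pattern with illustrative names and toy data (not the package API):

    from typing import Dict, Iterable, Iterator, List

    import polars
    import pyarrow as pa
    import pyarrow.parquet as pq

    def batches(rows: Iterable[Dict], size: int) -> Iterator[List[Dict]]:
        batch: List[Dict] = []
        for row in rows:
            batch.append(row)
            if len(batch) >= size:
                yield batch
                batch = []
        if batch:
            yield batch  # flush the final partial batch

    schema = pa.schema([("urn", pa.string()), ("write_count", pa.int64())])
    rows = ({"urn": f"urn:li:dataset:{i}", "write_count": i} for i in range(2500))

    with pq.ParquetWriter("writes.parquet", schema) as writer:
        for batch in batches(rows, 1000):
            writer.write_table(pa.Table.from_pylist(batch, schema=schema))

    print(polars.scan_parquet("writes.parquet").collect().height)  # 2500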
@@ -851,18 +845,18 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def set_table_modification_time_for_views(
         self, datasets_df: polars.LazyFrame
     ) -> polars.LazyFrame:
-        upstreams_lf = polars.LazyFrame(
-            self.load_data_from_es(
-                "graph_service_v1",
-                QueryBuilder.get_upstreams_query(),
-                self.upstream_lineage_batch,
-            ),
-            schema={
-                "source_urn": polars.Categorical,
-                "destination_urn": polars.Categorical,
-            },
-            strict=True,
+        schema = {
+            "source_urn": polars.Categorical,
+            "destination_urn": polars.Categorical,
+        }
+
+        upstreams_lf = self.load_data_from_es_to_lf(
+            schema=schema,
+            index="graph_service_v1",
+            query=QueryBuilder.get_upstreams_query(),
+            process_function=self.upstream_lineage_batch,
         )
+
         wdf = (
             (
                 upstreams_lf.join(
@@ -1116,7 +1110,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
         num = 0
-        for row in lazy_frame.collect().to_struct():
+        for row in lazy_frame.collect().iter_rows(named=True):
             num += 1

             query_usage_features = QueryUsageFeaturesClass(
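The iteration change above swaps DataFrame.to_struct() for iter_rows(named=True), which yields one plain dict per row keyed by column name. A generic Polars example (sample data, not this source's frame):

    import polars

    df = polars.DataFrame({"urn": ["urn:li:dataset:a", "urn:li:dataset:b"], "queryCount": [3, 7]})
    for row in df.iter_rows(named=True):
        print(row["urn"], row["queryCount"])  # each row arrives as a dict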
@@ -1186,49 +1180,43 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_dashboard_chart_usage(
         self, entity_index: str, usage_index: str
     ) -> polars.LazyFrame:
-        soft_deleted_df = polars.LazyFrame(
-            self.load_data_from_es(
-                index=entity_index,
-                query=QueryBuilder.get_soft_deleted_entities_query(),
-                process_function=self.soft_deleted_batch,
-            ),
-            schema={
-                "entity_urn": polars.Categorical,
-                "removed": bool,
-                "last_modified_at": polars.Int64,
-                "siblings": polars.List(polars.String),
-                "isView": polars.Boolean,
-            },
-            strict=True,
+        soft_deleted_schema = {
+            "entity_urn": polars.Categorical,
+            "removed": polars.Boolean,
+            "last_modified_at": polars.Int64,
+            "siblings": polars.List(polars.String),
+            "isView": polars.Boolean,
+        }
+
+        soft_deleted_df = self.load_data_from_es_to_lf(
+            schema=soft_deleted_schema,
+            index=entity_index,
+            query=QueryBuilder.get_dataset_entities_query(),
+            process_function=self.soft_deleted_batch,
         )

-        lf: polars.LazyFrame = polars.LazyFrame(
-            self.load_data_from_es(
-                index=usage_index,
-                query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
-                process_function=self.process_dashboard_usage,
-            ),
-            schema={
-                "timestampMillis": polars.Int64,
-                "lastObserved": polars.Int64,
-                "urn": polars.Categorical,
-                "platform": polars.Categorical,
-                "eventGranularity": polars.String,
-                "partitionSpec": polars.Struct(
+        dashboard_usage_schema = {
+            "timestampMillis": polars.Int64,
+            "lastObserved": polars.Int64,
+            "urn": polars.Categorical,
+            "platform": polars.Categorical,
+            "eventGranularity": polars.String,
+            "viewsCount": polars.Int64,
+            "userCounts": polars.List(
+                polars.Struct(
                     {
-                        "partition": polars.String,
+                        "usageCount": polars.Int64,
+                        "user": polars.String,
                     }
-                ),
-                "viewsCount": polars.Int64,
-                "userCounts": polars.List(
-                    polars.Struct(
-                        {
-                            "usageCount": polars.Int64,
-                            "user": polars.String,
-                        }
-                    )
-                ),
-            },
+                )
+            ),
+        }
+
+        lf = self.load_data_from_es_to_lf(
+            schema=dashboard_usage_schema,
+            index=usage_index,
+            query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
+            process_function=self.process_dashboard_usage,
         )

         lf = (
@@ -1301,48 +1289,41 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_query_usage(self) -> polars.LazyFrame:
         usage_index = "query_queryusagestatisticsaspect_v1"
         entity_index = "queryindex_v2"
-
-        query_entities = polars.LazyFrame(
-            self.load_data_from_es(
-                index=entity_index,
-                query=QueryBuilder.get_query_entities_query(),
-                process_function=self.queries_entities_batch,
-            ),
-            schema={
-                "entity_urn": polars.Categorical,
-                "last_modified_at": polars.Int64,
-                "platform": polars.Categorical,
-                "removed": polars.Boolean,
-            },
-            strict=True,
+        query_entities_schema = {
+            "entity_urn": polars.Categorical,
+            "last_modified_at": polars.Int64,
+            "platform": polars.Categorical,
+            "removed": polars.Boolean,
+        }
+
+        query_entities = self.load_data_from_es_to_lf(
+            schema=query_entities_schema,
+            index=entity_index,
+            query=QueryBuilder.get_query_entities_query(),
+            process_function=self.queries_entities_batch,
         )

-        lf: polars.LazyFrame = polars.LazyFrame(
-            self.load_data_from_es(
-                index=usage_index,
-                query=QueryBuilder.get_query_usage_query(self.config.lookback_days),
-                process_function=self.process_query_usage,
-            ),
-            schema={
-                "timestampMillis": polars.Int64,
-                "lastObserved": polars.Int64,
-                "urn": polars.Categorical,
-                "eventGranularity": polars.String,
-                "partitionSpec": polars.Struct(
+        query_usage_schema = {
+            "timestampMillis": polars.Int64,
+            "lastObserved": polars.Int64,
+            "urn": polars.Categorical,
+            "eventGranularity": polars.String,
+            "queryCount": polars.Int64,
+            "userCounts": polars.List(
+                polars.Struct(
                     {
-                        "partition": polars.String,
+                        "usageCount": polars.Int64,
+                        "user": polars.String,
                     }
-                ),
-                "queryCount": polars.Int64,
-                "userCounts": polars.List(
-                    polars.Struct(
-                        {
-                            "usageCount": polars.Int64,
-                            "user": polars.String,
-                        }
-                    )
-                ),
-            },
+                )
+            ),
+        }
+
+        lf = self.load_data_from_es_to_lf(
+            schema=query_usage_schema,
+            index=usage_index,
+            query=QueryBuilder.get_query_usage_query(self.config.lookback_days),
+            process_function=self.process_query_usage,
         )

         lf = query_entities.join(
@@ -1380,36 +1361,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         if self.config.set_upstream_table_max_modification_time_for_views:
             datasets_lf = self.set_table_modification_time_for_views(datasets_lf)

-        index = "dataset_datasetusagestatisticsaspect_v1"
-        lf: polars.LazyFrame = polars.LazyFrame(
-            self.load_data_from_es(
-                index=index,
-                query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
-                process_function=self.process_batch,
-            ),
-            schema={
-                "timestampMillis": polars.Int64,
-                "urn": polars.Categorical,
-                "platform": polars.Categorical,
-                "eventGranularity": polars.String,
-                "partitionSpec": polars.Struct(
-                    {
-                        "partition": polars.String,
-                    }
-                ),
-                "totalSqlQueries": polars.Int64,
-                "uniqueUserCount": polars.Int64,
-                "userCounts": polars.List(
-                    polars.Struct(
-                        {
-                            "count": polars.Int64,
-                            "user": polars.String,
-                            "userEmail": polars.String,
-                        }
-                    )
-                ),
-            },
-        )
+        lf = self.load_dataset_usage()

         # Polaris/pandas join merges the join column into one column and that's why we need to filter based on the removed column
         lf = (
@@ -1472,23 +1424,101 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         )
         return usage_and_write_lf

-    def get_datasets(self) -> polars.LazyFrame:
-        datasets_df = polars.LazyFrame(
-            self.load_data_from_es(
-                index="datasetindex_v2",
-                query=QueryBuilder.get_soft_deleted_entities_query(),
-                process_function=self.soft_deleted_batch,
+    def load_data_from_es_to_lf(
+        self,
+        index: str,
+        schema: Dict,
+        query: Dict,
+        process_function: Callable,
+        aggregation_key: Optional[str] = None,
+        file_to_load: Optional[str] = None,
+    ) -> polars.LazyFrame:
+        data = self.load_data_from_es(
+            index=index,
+            query=query,
+            process_function=process_function,
+            aggregation_key=aggregation_key,
+        )
+
+        if not self.config.streaming_mode:
+            return polars.LazyFrame(data, schema)
+        else:
+            assert (
+                self.temp_dir is not None
+            ), "In Streaming mode temp dir should be set. Normally this should not happen..."
+
+            with tempfile.NamedTemporaryFile(
+                delete=False,
+                mode="wb",
+                dir=self.temp_dir.name,
+                prefix=f"{index}_",
+                suffix=".parquet",
+            ) as temp_file:
+                tempfile_name = temp_file.name
+                with pq.ParquetWriter(
+                    tempfile_name, self.polars_to_arrow_schema(schema)
+                ) as writer:
+                    logger.debug(f"Creating temporary file {tempfile_name}")
+
+                    self.batch_write_parquet(
+                        data,
+                        schema,
+                        temp_file.name,
+                        parquet_writer=writer,
+                    )
+                # Scan parquet fails in some cases with
+                # thread 'polars-1' panicked at crates/polars-parquet/src/arrow/read/deserialize/dictionary_encoded/required_masked_dense.rs:113:72:
+                # called `Option::unwrap()` on a `None` value
+                # Which only happens if we don't collect immediately
+                # return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
+                return (
+                    polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
+                    .collect()
+                    .lazy()
+                )
+
+    def load_dataset_usage(self) -> polars.LazyFrame:
+        index = "dataset_datasetusagestatisticsaspect_v1"
+        schema = {
+            "timestampMillis": polars.Int64,
+            "urn": polars.Categorical,
+            "platform": polars.Categorical,
+            "eventGranularity": polars.String,
+            "totalSqlQueries": polars.Int64,
+            "uniqueUserCount": polars.Int64,
+            "userCounts": polars.List(
+                polars.Struct(
+                    {
+                        "count": polars.Int64,
+                        "user": polars.String,
+                        "userEmail": polars.String,
+                    }
+                )
             ),
-            schema={
-                "entity_urn": polars.Categorical,
-                "removed": bool,
-                "last_modified_at": polars.Int64,
-                "siblings": polars.List(polars.String),
-                "isView": polars.Boolean,
-            },
-            strict=True,
+        }
+
+        return self.load_data_from_es_to_lf(
+            schema=schema,
+            index=index,
+            query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
+            process_function=self.process_batch,
+        )
+
+    def get_datasets(self) -> polars.LazyFrame:
+        schema = {
+            "entity_urn": polars.Categorical,
+            "removed": polars.Boolean,
+            "last_modified_at": polars.Int64,
+            "siblings": polars.List(polars.String),
+            "isView": polars.Boolean,
+        }
+
+        return self.load_data_from_es_to_lf(
+            schema=schema,
+            index="datasetindex_v2",
+            query=QueryBuilder.get_dataset_entities_query(),
+            process_function=self.soft_deleted_batch,
         )
-        return datasets_df

     def generate_top_users(
         self, lf: polars.LazyFrame, count_field_name: str = "count"
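load_data_from_es_to_lf centralises the two loading paths that were previously duplicated at each call site: without streaming_mode the processed rows are wrapped directly in an in-memory LazyFrame with the given schema; with streaming_mode they are spilled to a Parquet file in the temp directory first. The non-streaming branch reduces to the plain Polars constructor; a generic illustration with sample rows (not ES output):

    import polars

    schema = {
        "entity_urn": polars.Categorical,
        "removed": polars.Boolean,
        "last_modified_at": polars.Int64,
    }
    data = [
        {"entity_urn": "urn:li:dataset:a", "removed": False, "last_modified_at": 1700000000000},
        {"entity_urn": "urn:li:dataset:b", "removed": True, "last_modified_at": 1700000100000},
    ]
    print(polars.LazyFrame(data, schema=schema).collect())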
@@ -1560,6 +1590,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         batch_size: int = 1000,
         delay: Optional[float] = None,
     ) -> Iterable[Dict[str, Any]]:
+        processed_count = 0
         while True:
             with PerfTimer() as timer:
                 logger.debug(f"ES query: {query}")
@@ -1581,8 +1612,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 yield from process_function(results["hits"]["hits"])

                 time_taken = timer.elapsed_seconds()
+                processed_count += len(results["hits"]["hits"])
                 logger.info(
-                    f"Processed {len(results['hits']['hits'])} data from {index} index in {time_taken:.3f} seconds"
+                    f"Processed {len(results['hits']['hits'])} data from {index} index in {time_taken:.3f} seconds. Total: {processed_count} processed."
                 )
                 if len(results["hits"]["hits"]) < batch_size:
                     break
@@ -1609,9 +1641,3 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

     def get_report(self) -> SourceReport:
         return self.report
-
-    def __del__(self) -> None:
-        for temp_file in self.temp_files_to_clean:
-            logger.info(f"Cleaning up temp file: {temp_file}")
-            os.remove(temp_file)
-        self.temp_files_to_clean = []