acryl-datahub-cloud 0.3.6.9rc2__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/datahub_reporting/extract_graph.py +1 -1
  3. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_patch_builder.py +466 -0
  4. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +432 -34
  5. acryl_datahub_cloud/metadata/_urns/urn_defs.py +788 -5
  6. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionrequest/__init__.py +2 -0
  7. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/ai/__init__.py +23 -0
  8. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +4 -0
  9. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  10. acryl_datahub_cloud/metadata/schema.avsc +773 -75
  11. acryl_datahub_cloud/metadata/schema_classes.py +750 -20
  12. acryl_datahub_cloud/metadata/schemas/Access.avsc +2 -1
  13. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +76 -0
  14. acryl_datahub_cloud/metadata/schemas/AiInferenceMetadata.avsc +42 -0
  15. acryl_datahub_cloud/metadata/schemas/AnomaliesSummary.avsc +16 -8
  16. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +3506 -0
  17. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +3 -2
  18. acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +2 -0
  19. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +4 -3
  20. acryl_datahub_cloud/metadata/schemas/AssertionSummary.avsc +50 -0
  21. acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -8
  22. acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +4 -2
  23. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  24. acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +3 -1
  25. acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +1 -1
  26. acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +29 -2
  27. acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +3 -1
  28. acryl_datahub_cloud/metadata/schemas/DataHubActionInfo.avsc +10 -1
  29. acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +4 -2
  30. acryl_datahub_cloud/metadata/schemas/DataHubViewInfo.avsc +2 -0
  31. acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +3 -1
  32. acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +3 -1
  33. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +5 -2
  34. acryl_datahub_cloud/metadata/schemas/DynamicFormAssignment.avsc +2 -0
  35. acryl_datahub_cloud/metadata/schemas/EntityInferenceMetadata.avsc +47 -0
  36. acryl_datahub_cloud/metadata/schemas/Filter.avsc +2 -0
  37. acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +75 -0
  38. acryl_datahub_cloud/metadata/schemas/Forms.avsc +18 -9
  39. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +18 -0
  40. acryl_datahub_cloud/metadata/schemas/GlossaryRelatedTerms.avsc +4 -4
  41. acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -1
  42. acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -1
  43. acryl_datahub_cloud/metadata/schemas/IncidentsSummary.avsc +4 -2
  44. acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +4 -2
  45. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +2 -1
  46. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
  47. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +3 -1
  48. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +5 -2
  49. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +119 -11
  50. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +3 -2
  51. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +2 -1
  52. acryl_datahub_cloud/metadata/schemas/Operation.avsc +0 -3
  53. acryl_datahub_cloud/metadata/schemas/Ownership.avsc +2 -1
  54. acryl_datahub_cloud/metadata/schemas/PlatformResourceInfo.avsc +2 -2
  55. acryl_datahub_cloud/metadata/schemas/PlatformResourceKey.avsc +4 -3
  56. acryl_datahub_cloud/metadata/schemas/PostInfo.avsc +2 -1
  57. acryl_datahub_cloud/metadata/schemas/RecommendationModule.avsc +2 -0
  58. acryl_datahub_cloud/metadata/schemas/Share.avsc +2 -1
  59. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +25 -1
  60. acryl_datahub_cloud/metadata/schemas/SubTypes.avsc +1 -1
  61. acryl_datahub_cloud/metadata/schemas/TestResults.avsc +8 -4
  62. {acryl_datahub_cloud-0.3.6.9rc2.dist-info → acryl_datahub_cloud-0.3.7.dist-info}/METADATA +35 -35
  63. {acryl_datahub_cloud-0.3.6.9rc2.dist-info → acryl_datahub_cloud-0.3.7.dist-info}/RECORD +66 -60
  64. {acryl_datahub_cloud-0.3.6.9rc2.dist-info → acryl_datahub_cloud-0.3.7.dist-info}/WHEEL +1 -1
  65. {acryl_datahub_cloud-0.3.6.9rc2.dist-info → acryl_datahub_cloud-0.3.7.dist-info}/entry_points.txt +0 -0
  66. {acryl_datahub_cloud-0.3.6.9rc2.dist-info → acryl_datahub_cloud-0.3.7.dist-info}/top_level.txt +0 -0
@@ -2,21 +2,32 @@ import logging
2
2
  import math
3
3
  import os
4
4
  import re
5
+ import tempfile
5
6
  import time
6
7
  from collections import defaultdict
7
8
  from dataclasses import dataclass, field
8
9
  from datetime import datetime
10
+ from functools import partial
9
11
  from itertools import chain
10
12
  from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
11
13
 
12
14
  import numpy
13
15
  import polars
16
+ import pyarrow as pa
17
+ import pyarrow.parquet as pq
14
18
  from elasticsearch.client import Elasticsearch
15
19
  from opensearchpy import OpenSearch
16
20
  from pydantic import Field
17
21
  from scipy.stats import expon
18
22
 
23
+ from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
24
+ UsageFeaturePatchBuilder,
25
+ )
19
26
  from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
27
+ from acryl_datahub_cloud.metadata.schema_classes import (
28
+ QueryUsageFeaturesClass,
29
+ UsageFeaturesClass,
30
+ )
20
31
  from datahub.configuration.common import ConfigModel
21
32
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
22
33
  from datahub.ingestion.api.common import PipelineContext
@@ -26,7 +37,8 @@ from datahub.ingestion.api.decorators import (
26
37
  platform_name,
27
38
  support_status,
28
39
  )
29
- from datahub.ingestion.api.source import SourceReport
40
+ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
41
+ from datahub.ingestion.api.source_helpers import auto_workunit_reporter
30
42
  from datahub.ingestion.api.workunit import MetadataWorkUnit
31
43
  from datahub.ingestion.graph.client import DatahubClientConfig
32
44
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -38,7 +50,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
38
50
  StatefulIngestionSourceBase,
39
51
  )
40
52
  from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
41
- from datahub.metadata.schema_classes import UsageFeaturesClass
42
53
  from datahub.utilities.perf_timer import PerfTimer
43
54
 
44
55
  logger = logging.getLogger(__name__)
@@ -51,6 +62,21 @@ GET_SOFT_DELETED_ENTITIES = {
51
62
  "sort": [{"urn": {"order": "asc"}}],
52
63
  }
53
64
 
65
+ GET_QUERY_ENTITIES = {
66
+ "sort": [{"urn": {"order": "asc"}}],
67
+ "query": {
68
+ "bool": {
69
+ "filter": {
70
+ "bool": {
71
+ "must_not": [
72
+ {"term": {"source": "MANUAL"}},
73
+ ]
74
+ }
75
+ }
76
+ }
77
+ },
78
+ }
79
+
54
80
  GET_UPSTREAMS = {
55
81
  "sort": [{"destination.urn": {"order": "asc"}}],
56
82
  "query": {
@@ -129,6 +155,22 @@ DATASET_WRITE_USAGE_COMPOSITE_QUERY = {
129
155
  },
130
156
  }
131
157
 
158
+ GET_QUERY_USAGE_QUERY = {
159
+ "sort": [{"urn": {"order": "asc"}}],
160
+ "query": {
161
+ "bool": {
162
+ "filter": {
163
+ "bool": {
164
+ "must": [
165
+ {"range": {"@timestamp": {"gte": "now-30d/d", "lt": "now/d"}}},
166
+ {"term": {"isExploded": False}},
167
+ ]
168
+ }
169
+ }
170
+ }
171
+ },
172
+ }
173
+
132
174
 
133
175
  class S3ClientConfig(ConfigModel):
134
176
  bucket: str = os.getenv("DATA_BUCKET", "")
@@ -208,6 +250,12 @@ class DataHubUsageFeatureReportingSourceConfig(StatefulIngestionConfigBase):
208
250
  chart_usage_enabled: bool = Field(
209
251
  True, description="Flag to enable or disable chart usage statistics collection."
210
252
  )
253
+
254
+ query_usage_enabled: bool = Field(
255
+ default=False,
256
+ description="Flag to enable or disable query usage statistics collection.",
257
+ )
258
+
211
259
  sibling_usage_enabled: bool = Field(
212
260
  True,
213
261
  description="Flag to enable or disable the setting dataset usage statistics for sibling entities (only DBT siblings are set).",
@@ -223,6 +271,21 @@ class DataHubUsageFeatureReportingSourceConfig(StatefulIngestionConfigBase):
223
271
  description="Flag to enable setting the max modification time for views based on their upstream tables' modification time.'",
224
272
  )
225
273
 
274
+ streaming_mode: bool = Field(
275
+ True,
276
+ description="Flag to enable polars streaming mode.'",
277
+ )
278
+
279
+ disable_write_usage: bool = Field(
280
+ False,
281
+ description="Flag to disable write usage statistics collection.'",
282
+ )
283
+
284
+ generate_patch: bool = Field(
285
+ True,
286
+ description="Flag to generate MCP patch for usage features.'",
287
+ )
288
+
226
289
 
227
290
  def exp_cdf(series: polars.Series) -> polars.Series:
228
291
  with PerfTimer() as timer:
@@ -276,6 +339,10 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
276
339
  dataset_usage_processing_time: PerfTimer = PerfTimer()
277
340
  dashboard_usage_processing_time: PerfTimer = PerfTimer()
278
341
  chart_usage_processing_time: PerfTimer = PerfTimer()
342
+ query_usage_processing_time: PerfTimer = PerfTimer()
343
+ query_platforms_count: Dict[str, int] = field(
344
+ default_factory=lambda: defaultdict(lambda: 0)
345
+ )
279
346
 
280
347
 
281
348
  @platform_name(id="datahub", platform_name="DataHub")
@@ -283,6 +350,7 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
283
350
  @support_status(SupportStatus.INCUBATING)
284
351
  class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
285
352
  platform = "datahub"
353
+ temp_files_to_clean: List[str] = []
286
354
 
287
355
  def __init__(
288
356
  self, ctx: PipelineContext, config: DataHubUsageFeatureReportingSourceConfig
@@ -291,6 +359,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
291
359
  # super().__init__(ctx)
292
360
  self.config: DataHubUsageFeatureReportingSourceConfig = config
293
361
  self.report: DatahubUsageFeatureReport = DatahubUsageFeatureReport()
362
+ self.ctx = ctx
294
363
 
295
364
  # We compile regexpes in advance for faster matching
296
365
  self.compiled_regexp_factor: List[Tuple[re.Pattern[str], float]] = []
@@ -346,7 +415,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
346
415
  if match:
347
416
  platform = match.group(1)
348
417
  else:
349
- logging.warning("Platform not found in urn. Skipping...")
418
+ logger.warning("Platform not found in urn. Skipping...")
350
419
  continue
351
420
 
352
421
  yield {
@@ -366,7 +435,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
366
435
  if match:
367
436
  platform = match.group(1)
368
437
  else:
369
- logging.warning("Platform not found in urn. Skipping...")
438
+ logger.warning("Platform not found in urn. Skipping...")
370
439
  continue
371
440
 
372
441
  yield {
@@ -378,6 +447,40 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
378
447
  f"Write Operation aspect processing took {time_taken:.3f} seconds"
379
448
  )
380
449
 
450
def queries_entities_batch(self, results: Iterable) -> Iterable[Dict]:
    """Flatten raw query-entity search hits into row dicts.

    Hits whose ``_source`` has no non-empty ``platform`` are logged and
    skipped. For every hit that is kept, the per-platform counter on the
    report (``query_platforms_count``) is incremented.

    Args:
        results: Iterable of search hits; each hit is read as
            ``doc["_source"][...]``.

    Yields:
        Dicts with the keys ``entity_urn``, ``last_modified_at`` (``None``
        when the hit has no ``lastModifiedAt``), ``platform`` and ``removed``
        (``False`` when the hit has no ``removed`` field).
    """
    with PerfTimer() as timer:
        for doc in results:
            source = doc["_source"]
            platform = source.get("platform")
            if not platform:
                logger.warning(
                    f"Platform not found in query {source['urn']}. Skipping..."
                )
                continue

            self.report.query_platforms_count[platform] += 1

            yield {
                "entity_urn": source["urn"],
                # A single lookup is enough; a missing field becomes None.
                "last_modified_at": source.get("lastModifiedAt"),
                "platform": platform,
                "removed": source.get("removed", False),
            }

        time_taken = timer.elapsed_seconds()
        logger.info(f"Query entities processing took {time_taken:.3f} seconds")
483
+
381
484
  def process_dashboard_usage(self, results: Iterable) -> Iterable[Dict]:
382
485
  for doc in results:
383
486
  match = re.match(dashboard_chart_platform_regexp, doc["_source"]["urn"])
@@ -385,7 +488,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
385
488
  platform = match.group(1)
386
489
  self.report.dashboard_platforms_count[platform] += 1
387
490
  else:
388
- logging.warning("Platform not found in urn. Skipping...")
491
+ logger.warning("Platform not found in urn. Skipping...")
389
492
  continue
390
493
 
391
494
  yield {
@@ -416,6 +519,35 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
416
519
  "platform": platform,
417
520
  }
418
521
 
522
def process_query_usage(self, results: Iterable) -> Iterable[Dict]:
    """Flatten raw query-usage search hits into row dicts.

    ``timestampMillis``, ``systemMetadata.lastObserved``, ``urn`` and
    ``partitionSpec`` are read unconditionally (a hit without them raises
    ``KeyError``). The remaining fields are optional and fall back to a
    default when absent.

    Args:
        results: Iterable of search hits; each hit is read as
            ``doc["_source"][...]``.

    Yields:
        One dict per hit with the keys ``timestampMillis``, ``lastObserved``,
        ``urn``, ``eventGranularity`` (default ``None``), ``partitionSpec``,
        ``queryCount`` (default ``0``), ``uniqueUserCount`` (default ``None``)
        and ``userCounts`` (default ``[]``).
    """
    for doc in results:
        source = doc["_source"]
        # ``event`` is treated as optional like the other optional fields, so
        # a hit without it yields an empty ``userCounts`` rather than raising.
        event = source.get("event") or {}
        yield {
            "timestampMillis": source["timestampMillis"],
            "lastObserved": source["systemMetadata"]["lastObserved"],
            "urn": source["urn"],
            "eventGranularity": source.get("eventGranularity"),
            "partitionSpec": source["partitionSpec"],
            "queryCount": source.get("queryCount", 0),
            "uniqueUserCount": source.get("uniqueUserCount"),
            "userCounts": event.get("userCounts", []),
        }
550
+
419
551
  def upstream_lineage_batch(self, results: Iterable) -> Iterable[Dict]:
420
552
  for doc in results:
421
553
  if (
@@ -431,7 +563,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
431
563
  if source_platform_match:
432
564
  source_platform = source_platform_match.group(1)
433
565
  else:
434
- logging.warning("Source Platform not found in urn. Skipping...")
566
+ logger.warning("Source Platform not found in urn. Skipping...")
435
567
  continue
436
568
 
437
569
  destination_platform_match = re.match(
@@ -440,7 +572,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
440
572
  if destination_platform_match:
441
573
  destination_platform = destination_platform_match.group(1)
442
574
  else:
443
- logging.warning("Destination Platform not found in urn. Skipping...")
575
+ logger.warning("Destination Platform not found in urn. Skipping...")
444
576
  continue
445
577
 
446
578
  # In some case like Tableau there is dataset which marked as view and points to a dataset on another platform
@@ -462,7 +594,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
462
594
  platform = match.group(1)
463
595
  self.report.dataset_platforms_count[platform] += 1
464
596
  else:
465
- logging.warning("Platform not found in urn. Skipping...")
597
+ logger.warning("Platform not found in urn. Skipping...")
466
598
  continue
467
599
 
468
600
  yield {
@@ -620,6 +752,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
620
752
  urn_field: str = "urn",
621
753
  platform_field: str = "platform",
622
754
  prefix: Optional[str] = None,
755
+ use_exp_cdf: Optional[bool] = None,
623
756
  ) -> polars.LazyFrame:
624
757
 
625
758
  logger.debug(f"Generating rank and percentile for {count_field} field")
@@ -630,7 +763,8 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
630
763
  .alias(f"{prefix}rank")
631
764
  )
632
765
 
633
- if self.config.use_exp_cdf:
766
+ use_exp_cdf = self.config.use_exp_cdf if use_exp_cdf is None else use_exp_cdf
767
+ if use_exp_cdf:
634
768
  lf = lf.with_columns(
635
769
  polars.col(count_field)
636
770
  .map_batches(exp_cdf, return_dtype=polars.Int64)
@@ -665,18 +799,107 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
665
799
 
666
800
  return lf
667
801
 
802
@staticmethod
def polars_to_arrow_schema(polars_schema: Dict[str, polars.DataType]) -> pa.Schema:
    """Build a PyArrow schema from a ``{column name: polars dtype}`` mapping.

    Args:
        polars_schema: Column names mapped to their polars dtype.

    Returns:
        A ``pa.Schema`` with one field per entry, in the mapping's order.

    Raises:
        ValueError: If a dtype is neither in the lookup table below nor
            equal to ``polars.Categorical()``.
    """

    def convert_dtype(polars_dtype: polars.DataType) -> pa.DataType:
        # Lookup table from polars dtype to the matching arrow dtype.
        type_mapping: Dict[polars.DataType, pa.DataType] = {
            polars.Boolean(): pa.bool_(),
            polars.Int8(): pa.int8(),
            polars.Int16(): pa.int16(),
            polars.Int32(): pa.int32(),
            polars.Int64(): pa.int64(),
            polars.UInt8(): pa.uint8(),
            polars.UInt16(): pa.uint16(),
            polars.UInt32(): pa.uint32(),
            polars.UInt64(): pa.uint64(),
            polars.Float32(): pa.float32(),
            polars.Float64(): pa.float64(),
            polars.Utf8(): pa.string(),
            polars.Date(): pa.date32(),
            polars.Datetime(): pa.timestamp("ns"),
            polars.Time(): pa.time64("ns"),
            polars.Duration(): pa.duration("ns"),
        }

        # NOTE(review): membership is tested against the *types* of the keys
        # while the lookup uses the dtype itself; this presumably relies on
        # polars dtype equality/hashing treating a dtype class and its
        # instances alike -- confirm against the pinned polars version.
        known_types = [type(key) for key in type_mapping.keys()]
        if polars_dtype in known_types:
            return type_mapping[polars_dtype]

        # Categorical columns become dictionary-encoded strings.
        if polars_dtype == polars.Categorical():
            return pa.dictionary(index_type=pa.int32(), value_type=pa.string())

        raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")

    return pa.schema(
        [(name, convert_dtype(dtype)) for name, dtype in polars_schema.items()]
    )
833
+
834
def load_es_data_to_lf(
    self, index: str, query: Dict, read_function: Callable, schema: Dict
) -> polars.LazyFrame:
    """Spill search results to a temporary parquet file and scan it lazily.

    Rows produced by ``self.load_data_from_es`` are written to the file in
    batches so the whole result set is never held in memory at once. The
    file path is appended to ``self.temp_files_to_clean`` for later removal.

    Args:
        index: Name of the index to read from.
        query: Search query body passed to ``load_data_from_es``.
        read_function: Callable passed to ``load_data_from_es`` to turn raw
            hits into row dicts.
        schema: ``{column name: polars dtype}`` mapping; converted to an
            arrow schema with ``polars_to_arrow_schema``.

    Returns:
        A ``polars.LazyFrame`` scanning the written parquet file.
    """
    es_data = self.load_data_from_es(
        index,
        query,
        read_function,
    )

    # Only the file *name* is needed. Close the handle before the parquet
    # writer opens the path: reopening a still-open NamedTemporaryFile by
    # name is platform-dependent, and the handle itself is never written to.
    with tempfile.NamedTemporaryFile(
        delete=False, mode="wb", suffix=".parquet"
    ) as temp_file:
        tempfile_name = temp_file.name
    logger.debug(f"Creating temporary file {tempfile_name}")
    self.temp_files_to_clean.append(tempfile_name)

    # Create a PyArrow schema from the provided schema dict
    pa_schema = self.polars_to_arrow_schema(schema)

    # Adjust this value based on your data and memory constraints
    batch_size = 1000

    with pq.ParquetWriter(tempfile_name, pa_schema) as writer:
        current_batch: List[Dict] = []

        for row in es_data:
            current_batch.append(row)

            if len(current_batch) >= batch_size:
                writer.write_table(
                    pa.Table.from_pylist(current_batch, schema=pa_schema)
                )
                current_batch = []

        # Write any remaining rows
        if current_batch:
            writer.write_table(
                pa.Table.from_pylist(current_batch, schema=pa_schema)
            )

    return polars.scan_parquet(tempfile_name)
879
+
668
880
  def load_write_usage(
669
881
  self, soft_deleted_entities_df: polars.LazyFrame
670
882
  ) -> polars.LazyFrame:
671
- wdf = polars.LazyFrame(
672
- self.load_data_from_es(
673
- "dataset_operationaspect_v1",
674
- DATASET_WRITE_USAGE_RAW_QUERY,
675
- self.write_stat_raw_batch,
676
- ),
677
- schema={"urn": polars.Categorical, "platform": polars.Categorical},
678
- strict=True,
679
- )
883
+
884
+ if self.config.streaming_mode:
885
+ wdf = self.load_es_data_to_lf(
886
+ index="dataset_operationaspect_v1",
887
+ query=DATASET_WRITE_USAGE_RAW_QUERY,
888
+ read_function=self.write_stat_raw_batch,
889
+ schema={"urn": polars.Categorical, "platform": polars.Categorical},
890
+ )
891
+ wdf = wdf.cast({polars.String: polars.Categorical})
892
+ else:
893
+ wdf = polars.LazyFrame(
894
+ self.load_data_from_es(
895
+ "dataset_operationaspect_v1",
896
+ DATASET_WRITE_USAGE_RAW_QUERY,
897
+ self.write_stat_raw_batch,
898
+ ),
899
+ schema={"urn": polars.Categorical, "platform": polars.Categorical},
900
+ strict=True,
901
+ )
902
+
680
903
  wdf = wdf.group_by(polars.col("urn"), polars.col("platform")).agg(
681
904
  polars.col("urn").count().alias("write_count"),
682
905
  )
@@ -692,7 +915,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
692
915
  .drop(["removed"])
693
916
  )
694
917
 
695
- return wdf
918
+ return wdf.collect(streaming=self.config.streaming_mode).lazy()
696
919
 
697
920
  def load_write_usage_server_side_aggregation(
698
921
  self, soft_deleted_entities_df: polars.LazyFrame
@@ -800,7 +1023,22 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
800
1023
  chart_usage_df = self.generate_chart_usage()
801
1024
  yield from self.generate_mcp_from_lazyframe(chart_usage_df)
802
1025
 
803
- def get_workunits(self) -> Iterable[MetadataWorkUnit]:
1026
def generate_query_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
    """Yield the query-usage work units.

    The frame is built by ``generate_query_usage`` and converted by
    ``generate_query_usage_mcp_from_lazyframe``; both run inside one
    ``polars.StringCache`` context.
    """
    with polars.StringCache():
        logger.info("Generate Query Usage")
        yield from self.generate_query_usage_mcp_from_lazyframe(
            self.generate_query_usage()
        )
1031
+
1032
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
    """Return the processors applied, in order, to this source's workunits.

    Only the workunit reporter is installed, bound to this source's report.
    The first entry in the list is applied first, so be careful with
    ordering when overriding.
    """
    report_workunits = partial(auto_workunit_reporter, self.get_report())
    processors: List[Optional[MetadataWorkUnitProcessor]] = [report_workunits]
    return processors
1040
+
1041
+ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
804
1042
  if self.config.dataset_usage_enabled:
805
1043
  with self.report.dataset_usage_processing_time as timer:
806
1044
  self.report.report_ingestion_stage_start("generate dataset usage")
@@ -825,6 +1063,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
825
1063
  time_taken = timer.elapsed_seconds()
826
1064
  logger.info(f"Chart Usage generation took {time_taken:.3f}")
827
1065
 
1066
+ if self.config.query_usage_enabled:
1067
+ with self.report.query_usage_processing_time as timer:
1068
+ self.report.report_ingestion_stage_start("generate query usage")
1069
+
1070
+ yield from self.generate_query_usage_mcps()
1071
+
1072
+ time_taken = timer.elapsed_seconds()
1073
+ logger.info(f"Query Usage generation took {time_taken:.3f}")
1074
+
828
1075
  def generate_mcp_from_lazyframe(
829
1076
  self, lazy_frame: polars.LazyFrame
830
1077
  ) -> Iterable[MetadataWorkUnit]:
@@ -918,11 +1165,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
918
1165
  int(row["write_count"])
919
1166
  if "write_count" in row and row["write_count"]
920
1167
  else 0
1168
+ if not self.config.disable_write_usage
1169
+ else None
921
1170
  ),
922
1171
  writeCountPercentileLast30Days=(
923
1172
  int(row["write_rank_percentile"])
924
1173
  if "write_count" in row and row["write_rank_percentile"]
925
1174
  else 0
1175
+ if not self.config.disable_write_usage
1176
+ else None
926
1177
  ),
927
1178
  writeCountRankLast30Days=(
928
1179
  int(row["write_rank"])
@@ -950,10 +1201,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
950
1201
  combinedSearchRankingMultiplier=search_ranking_multipliers.combinedSearchRankingMultiplier,
951
1202
  )
952
1203
 
953
- mcp = MetadataChangeProposalWrapper(
954
- entityUrn=row["urn"], aspect=usage_feature
955
- )
956
- yield mcp.as_workunit(is_primary_source=False)
1204
+ yield from self.generate_usage_feature_mcp(row["urn"], usage_feature)
957
1205
 
958
1206
  if (
959
1207
  "siblings" in row
@@ -962,15 +1210,72 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
962
1210
  ):
963
1211
  for sibling in row["siblings"]:
964
1212
  if dbt_platform_regexp.match(sibling):
965
- dbt_sibling_mcp = MetadataChangeProposalWrapper(
966
- entityUrn=sibling, aspect=usage_feature
1213
+ yield from self.generate_usage_feature_mcp(
1214
+ sibling, usage_feature
967
1215
  )
968
- self.report.sibling_usage_count += 1
969
- yield dbt_sibling_mcp.as_workunit(is_primary_source=False)
1216
+
1217
def generate_query_usage_mcp_from_lazyframe(
    self, lazy_frame: polars.LazyFrame
) -> Iterable[MetadataWorkUnit]:
    """Collect the frame and yield one query-usage work unit per row.

    Each row is turned into a ``QueryUsageFeaturesClass`` and passed, with
    the row's ``urn``, to ``generate_query_usage_feature_mcp``.

    Args:
        lazy_frame: Frame whose rows expose ``urn`` and ``top_users`` and,
            optionally, ``totalSqlQueries``, ``queries_rank_percentile`` and
            ``last_modified_at``.

    Yields:
        The work units produced by ``generate_query_usage_feature_mcp``.
    """

    def int_or_zero(row: Dict[str, Any], column: str) -> int:
        # A missing column and a falsy value (None, 0) both map to 0.
        return int(row[column]) if column in row and row[column] else 0

    for row in lazy_frame.collect().to_struct():
        query_usage_features = QueryUsageFeaturesClass(
            queryCountLast30Days=int_or_zero(row, "totalSqlQueries"),
            queryCountTotal=None,  # This is not implemented
            runsPercentileLast30days=int_or_zero(row, "queries_rank_percentile"),
            # lastExecutedAt is filled from the row's last_modified_at column.
            lastExecutedAt=int_or_zero(row, "last_modified_at"),
            topUsersLast30Days=(
                list(chain.from_iterable(row["top_users"]))
                if row["top_users"]
                else None
            ),
            queryCostLast30Days=None,  # Not implemented yet
        )

        yield from self.generate_query_usage_feature_mcp(
            row["urn"], query_usage_features
        )
1253
+
1254
def generate_usage_feature_mcp(
    self, urn: str, usage_feature: UsageFeaturesClass
) -> Iterable[MetadataWorkUnit]:
    """Yield work unit(s) carrying ``usage_feature`` for ``urn``.

    When ``config.generate_patch`` is off, a single work unit wrapping the
    whole aspect is emitted. Otherwise the aspect is passed to a
    ``UsageFeaturePatchBuilder`` and one work unit is emitted per MCP it
    builds. Every work unit is marked as not coming from the primary source.
    """
    if not self.config.generate_patch:
        full_aspect = MetadataChangeProposalWrapper(
            entityUrn=urn, aspect=usage_feature
        )
        yield full_aspect.as_workunit(is_primary_source=False)
        return

    patch_builder = UsageFeaturePatchBuilder(urn=urn)
    patch_builder.apply_usage_features(usage_feature)
    for patch_mcp in patch_builder.build():
        yield MetadataWorkUnit(
            id=MetadataWorkUnit.generate_workunit_id(patch_mcp),
            mcp_raw=patch_mcp,
            is_primary_source=False,
        )
1269
+
1270
def generate_query_usage_feature_mcp(
    self, urn: str, query_usage_features: QueryUsageFeaturesClass
) -> Iterable[MetadataWorkUnit]:
    """Yield one work unit wrapping ``query_usage_features`` for ``urn``.

    The work unit is marked as not coming from the primary source.
    """
    yield MetadataChangeProposalWrapper(
        entityUrn=urn, aspect=query_usage_features
    ).as_workunit(is_primary_source=False)
970
1275
 
971
1276
  def generate_chart_usage(self) -> polars.LazyFrame:
972
- usage_index = "chartindex_v2"
973
- entity_index = "chart_chartusagestatisticsaspect_v1"
1277
+ entity_index = "chartindex_v2"
1278
+ usage_index = "chart_chartusagestatisticsaspect_v1"
974
1279
 
975
1280
  return self.generate_dashboard_chart_usage(entity_index, usage_index)
976
1281
 
@@ -1095,6 +1400,83 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1095
1400
 
1096
1401
  return lf
1097
1402
 
1403
def generate_query_usage(self) -> polars.LazyFrame:
    """Build the per-query usage frame with totals, top users and ranks.

    Query entities are read from ``queryindex_v2`` and usage rows from
    ``query_queryusagestatisticsaspect_v1``. Entities are left-joined to
    their usage rows and rows flagged ``removed`` are filtered out. The
    result is aggregated per ``urn``/``platform``, combined with the output
    of ``generate_top_users``, ranked on ``totalSqlQueries`` (with the
    exponential CDF explicitly disabled) and sorted by platform and rank.
    """
    entity_index = "queryindex_v2"
    usage_index = "query_queryusagestatisticsaspect_v1"

    # Column layout of the rows yielded for query entities.
    entity_schema = {
        "entity_urn": polars.Categorical,
        "last_modified_at": polars.Int64,
        "platform": polars.Categorical,
        "removed": polars.Boolean,
    }
    # Column layout of the usage-statistics rows.
    usage_schema = {
        "timestampMillis": polars.Int64,
        "lastObserved": polars.Int64,
        "urn": polars.Categorical,
        "eventGranularity": polars.String,
        "partitionSpec": polars.Struct(
            {
                "partition": polars.String,
            }
        ),
        "queryCount": polars.Int64,
        "userCounts": polars.List(
            polars.Struct(
                {
                    "usageCount": polars.Int64,
                    "user": polars.String,
                }
            )
        ),
    }

    query_entities = polars.LazyFrame(
        self.load_data_from_es(
            index=entity_index,
            query=GET_QUERY_ENTITIES,
            process_function=self.queries_entities_batch,
        ),
        schema=entity_schema,
        strict=True,
    )

    usage_rows: polars.LazyFrame = polars.LazyFrame(
        self.load_data_from_es(
            index=usage_index,
            query=GET_QUERY_USAGE_QUERY,
            process_function=self.process_query_usage,
        ),
        schema=usage_schema,
    )

    # Keep every entity (left join), then drop the removed ones.
    joined = query_entities.join(
        usage_rows,
        left_on="entity_urn",
        right_on="urn",
        how="left",
        coalesce=False,
    )
    lf = joined.filter(
        polars.col("removed") == False  # noqa: E712
    )

    total_queries = lf.group_by("urn", "platform").agg(
        polars.col("queryCount").sum().alias("totalSqlQueries"),
        polars.col("last_modified_at").max().alias("last_modified_at"),
    )

    top_users = self.generate_top_users(lf, "usageCount")
    usage_with_top_users = top_users.join(total_queries, on="urn", how="inner")

    ranked = self.gen_rank_and_percentile(
        lf=usage_with_top_users,
        count_field="totalSqlQueries",
        urn_field="urn",
        platform_field="platform",
        prefix="queries_",
        use_exp_cdf=False,
    )

    return ranked.sort(by=["platform", "queries_rank"], descending=[False, False])
1479
+
1098
1480
  def generate_dataset_usage(self) -> polars.LazyFrame:
1099
1481
  datasets_lf = self.get_datasets()
1100
1482
  if self.config.set_upstream_table_max_modification_time_for_views:
@@ -1155,11 +1537,21 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1155
1537
  by=["platform", "queries_rank"], descending=[False, False]
1156
1538
  )
1157
1539
 
1158
- # Calculate write usage
1159
- if self.config.use_server_side_aggregation:
1160
- write_lf = self.load_write_usage_server_side_aggregation(datasets_lf)
1540
+ if not self.config.disable_write_usage:
1541
+ # Calculate write usage
1542
+ if self.config.use_server_side_aggregation:
1543
+ write_lf = self.load_write_usage_server_side_aggregation(datasets_lf)
1544
+ else:
1545
+ write_lf = self.load_write_usage(datasets_lf)
1161
1546
  else:
1162
- write_lf = self.load_write_usage(datasets_lf)
1547
+ logger.info("Write usage disabled")
1548
+ write_lf = polars.LazyFrame(
1549
+ schema={
1550
+ "urn": polars.Categorical,
1551
+ "platform": polars.Categorical,
1552
+ "write_count": polars.Int64,
1553
+ }
1554
+ )
1163
1555
 
1164
1556
  usage_and_write_lf = (
1165
1557
  usage_with_top_users_with_ranks.join(
@@ -1321,3 +1713,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1321
1713
 
1322
1714
  def get_report(self) -> SourceReport:
1323
1715
  return self.report
1716
+
1717
def __del__(self) -> None:
    """Best-effort removal of the temporary files recorded by this source.

    A file that is already gone, or that cannot be removed, is logged and
    skipped so the remaining files are still cleaned up.
    """
    # ``temp_files_to_clean`` is declared on the class, so the list is shared.
    # Take a snapshot and empty it in place: rebinding the attribute on
    # ``self`` would leave the stale paths in the shared list.
    pending = list(self.temp_files_to_clean)
    self.temp_files_to_clean.clear()

    for temp_file in pending:
        logger.info(f"Cleaning up temp file: {temp_file}")
        try:
            os.remove(temp_file)
        except FileNotFoundError:
            logger.debug(f"Temp file already removed: {temp_file}")
        except OSError as e:
            logger.warning(f"Failed to remove temp file {temp_file}: {e}")