acryl-datahub-cloud 0.3.6.9rc2__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/datahub_reporting/extract_graph.py +1 -1
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_patch_builder.py +466 -0
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +432 -34
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +788 -5
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionrequest/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/ai/__init__.py +23 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- acryl_datahub_cloud/metadata/schema.avsc +773 -75
- acryl_datahub_cloud/metadata/schema_classes.py +750 -20
- acryl_datahub_cloud/metadata/schemas/Access.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +76 -0
- acryl_datahub_cloud/metadata/schemas/AiInferenceMetadata.avsc +42 -0
- acryl_datahub_cloud/metadata/schemas/AnomaliesSummary.avsc +16 -8
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +3506 -0
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +3 -2
- acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +4 -3
- acryl_datahub_cloud/metadata/schemas/AssertionSummary.avsc +50 -0
- acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -8
- acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +29 -2
- acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/DataHubActionInfo.avsc +10 -1
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/DataHubViewInfo.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/DynamicFormAssignment.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/EntityInferenceMetadata.avsc +47 -0
- acryl_datahub_cloud/metadata/schemas/Filter.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +75 -0
- acryl_datahub_cloud/metadata/schemas/Forms.avsc +18 -9
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +18 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryRelatedTerms.avsc +4 -4
- acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/IncidentsSummary.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +119 -11
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +3 -2
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/Operation.avsc +0 -3
- acryl_datahub_cloud/metadata/schemas/Ownership.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/PlatformResourceInfo.avsc +2 -2
- acryl_datahub_cloud/metadata/schemas/PlatformResourceKey.avsc +4 -3
- acryl_datahub_cloud/metadata/schemas/PostInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/RecommendationModule.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/Share.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +25 -1
- acryl_datahub_cloud/metadata/schemas/SubTypes.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/TestResults.avsc +8 -4
- {acryl_datahub_cloud-0.3.6.9rc2.dist-info → acryl_datahub_cloud-0.3.7.dist-info}/METADATA +35 -35
- {acryl_datahub_cloud-0.3.6.9rc2.dist-info → acryl_datahub_cloud-0.3.7.dist-info}/RECORD +66 -60
- {acryl_datahub_cloud-0.3.6.9rc2.dist-info → acryl_datahub_cloud-0.3.7.dist-info}/WHEEL +1 -1
- {acryl_datahub_cloud-0.3.6.9rc2.dist-info → acryl_datahub_cloud-0.3.7.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.6.9rc2.dist-info → acryl_datahub_cloud-0.3.7.dist-info}/top_level.txt +0 -0
The body of the diff below covers acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py (+432 −34), shown as a unified diff:

--- a/acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py
+++ b/acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py
@@ -2,21 +2,32 @@ import logging
 import math
 import os
 import re
+import tempfile
 import time
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
+from functools import partial
 from itertools import chain
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy
 import polars
+import pyarrow as pa
+import pyarrow.parquet as pq
 from elasticsearch.client import Elasticsearch
 from opensearchpy import OpenSearch
 from pydantic import Field
 from scipy.stats import expon
 
+from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
+    UsageFeaturePatchBuilder,
+)
 from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
+from acryl_datahub_cloud.metadata.schema_classes import (
+    QueryUsageFeaturesClass,
+    UsageFeaturesClass,
+)
 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -26,7 +37,8 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DatahubClientConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -38,7 +50,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
-from datahub.metadata.schema_classes import UsageFeaturesClass
 from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
@@ -51,6 +62,21 @@ GET_SOFT_DELETED_ENTITIES = {
     "sort": [{"urn": {"order": "asc"}}],
 }
 
+GET_QUERY_ENTITIES = {
+    "sort": [{"urn": {"order": "asc"}}],
+    "query": {
+        "bool": {
+            "filter": {
+                "bool": {
+                    "must_not": [
+                        {"term": {"source": "MANUAL"}},
+                    ]
+                }
+            }
+        }
+    },
+}
+
 GET_UPSTREAMS = {
     "sort": [{"destination.urn": {"order": "asc"}}],
     "query": {
@@ -129,6 +155,22 @@ DATASET_WRITE_USAGE_COMPOSITE_QUERY = {
     },
 }
 
+GET_QUERY_USAGE_QUERY = {
+    "sort": [{"urn": {"order": "asc"}}],
+    "query": {
+        "bool": {
+            "filter": {
+                "bool": {
+                    "must": [
+                        {"range": {"@timestamp": {"gte": "now-30d/d", "lt": "now/d"}}},
+                        {"term": {"isExploded": False}},
+                    ]
+                }
+            }
+        }
+    },
+}
+
 
 class S3ClientConfig(ConfigModel):
     bucket: str = os.getenv("DATA_BUCKET", "")
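GET_QUERY_USAGE_QUERY keeps only non-exploded query usage events from the trailing 30 days, sorted by urn. A minimal sketch of streaming such a query body out of OpenSearch with the client library this module already imports; the index name is taken from generate_query_usage further down, and the connection details are placeholders:

```python
from opensearchpy import OpenSearch
from opensearchpy.helpers import scan

GET_QUERY_USAGE_QUERY = {
    "sort": [{"urn": {"order": "asc"}}],
    "query": {"bool": {"filter": {"bool": {"must": [
        {"range": {"@timestamp": {"gte": "now-30d/d", "lt": "now/d"}}},
        {"term": {"isExploded": False}},
    ]}}}},
}

# Placeholder connection; the source builds its client from
# ElasticSearchClientConfig instead.
client = OpenSearch(hosts=["http://localhost:9200"])

# scan() pages through results with the scroll API; preserve_order=True
# is required because the query body carries a sort.
for hit in scan(
    client,
    index="query_queryusagestatisticsaspect_v1",
    query=GET_QUERY_USAGE_QUERY,
    preserve_order=True,
):
    print(hit["_source"]["urn"])
```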
@@ -208,6 +250,12 @@ class DataHubUsageFeatureReportingSourceConfig(StatefulIngestionConfigBase):
     chart_usage_enabled: bool = Field(
         True, description="Flag to enable or disable chart usage statistics collection."
     )
+
+    query_usage_enabled: bool = Field(
+        default=False,
+        description="Flag to enable or disable query usage statistics collection.",
+    )
+
     sibling_usage_enabled: bool = Field(
         True,
         description="Flag to enable or disable the setting dataset usage statistics for sibling entities (only DBT siblings are set).",
@@ -223,6 +271,21 @@ class DataHubUsageFeatureReportingSourceConfig(StatefulIngestionConfigBase):
         description="Flag to enable setting the max modification time for views based on their upstream tables' modification time.'",
     )
 
+    streaming_mode: bool = Field(
+        True,
+        description="Flag to enable polars streaming mode.'",
+    )
+
+    disable_write_usage: bool = Field(
+        False,
+        description="Flag to disable write usage statistics collection.'",
+    )
+
+    generate_patch: bool = Field(
+        True,
+        description="Flag to generate MCP patch for usage features.'",
+    )
+
 
 def exp_cdf(series: polars.Series) -> polars.Series:
     with PerfTimer() as timer:
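Taken together, this release adds four behavior flags to the source config. A hedged sketch of enabling them programmatically; it assumes the config's remaining fields (inherited from StatefulIngestionConfigBase and the Elasticsearch/S3 sections) have usable defaults:

```python
from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_reporter import (
    DataHubUsageFeatureReportingSourceConfig,
)

config = DataHubUsageFeatureReportingSourceConfig(
    query_usage_enabled=True,   # new: off by default
    streaming_mode=True,        # new: spill ES data to parquet, stream with polars
    disable_write_usage=False,  # new: True skips write-operation statistics
    generate_patch=True,        # new: emit MCP patches instead of full aspects
)
print(config.query_usage_enabled)
```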
@@ -276,6 +339,10 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
     dataset_usage_processing_time: PerfTimer = PerfTimer()
     dashboard_usage_processing_time: PerfTimer = PerfTimer()
     chart_usage_processing_time: PerfTimer = PerfTimer()
+    query_usage_processing_time: PerfTimer = PerfTimer()
+    query_platforms_count: Dict[str, int] = field(
+        default_factory=lambda: defaultdict(lambda: 0)
+    )
 
 
 @platform_name(id="datahub", platform_name="DataHub")
@@ -283,6 +350,7 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
 @support_status(SupportStatus.INCUBATING)
 class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     platform = "datahub"
+    temp_files_to_clean: List[str] = []
 
     def __init__(
         self, ctx: PipelineContext, config: DataHubUsageFeatureReportingSourceConfig
@@ -291,6 +359,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         # super().__init__(ctx)
         self.config: DataHubUsageFeatureReportingSourceConfig = config
         self.report: DatahubUsageFeatureReport = DatahubUsageFeatureReport()
+        self.ctx = ctx
 
         # We compile regexpes in advance for faster matching
         self.compiled_regexp_factor: List[Tuple[re.Pattern[str], float]] = []
@@ -346,7 +415,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             if match:
                 platform = match.group(1)
             else:
-
+                logger.warning("Platform not found in urn. Skipping...")
                 continue
 
             yield {
@@ -366,7 +435,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             if match:
                 platform = match.group(1)
             else:
-
+                logger.warning("Platform not found in urn. Skipping...")
                 continue
 
             yield {
@@ -378,6 +447,40 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             f"Write Operation aspect processing took {time_taken:.3f} seconds"
         )
 
+    def queries_entities_batch(self, results: Iterable) -> Iterable[Dict]:
+        with PerfTimer() as timer:
+
+            for doc in results:
+                if "platform" not in doc["_source"] or not doc["_source"]["platform"]:
+                    logger.warning(
+                        f"Platform not found in query { doc['_source']['urn']}. Skipping..."
+                    )
+                    continue
+
+                self.report.query_platforms_count[doc["_source"]["platform"]] = (
+                    self.report.query_platforms_count[doc["_source"]["platform"]] + 1
+                )
+
+                yield {
+                    "entity_urn": doc["_source"]["urn"],
+                    "last_modified_at": (
+                        doc["_source"]["lastModifiedAt"]
+                        if "lastModifiedAt" in doc["_source"]
+                        else (
+                            doc["_source"]["lastModifiedAt"]
+                            if "lastModifiedAt" in doc["_source"]
+                            else None
+                        )
+                    ),
+                    "platform": doc["_source"]["platform"],
+                    "removed": doc["_source"]["removed"]
+                    if "removed" in doc["_source"]
+                    else False,
+                }
+
+            time_taken = timer.elapsed_seconds()
+            logger.info(f"Query entities processing took {time_taken:.3f} seconds")
+
     def process_dashboard_usage(self, results: Iterable) -> Iterable[Dict]:
         for doc in results:
             match = re.match(dashboard_chart_platform_regexp, doc["_source"]["urn"])
@@ -385,7 +488,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 platform = match.group(1)
                 self.report.dashboard_platforms_count[platform] += 1
             else:
-
+                logger.warning("Platform not found in urn. Skipping...")
                 continue
 
             yield {
@@ -416,6 +519,35 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "platform": platform,
             }
 
+    def process_query_usage(self, results: Iterable) -> Iterable[Dict]:
+        for doc in results:
+            yield {
+                "timestampMillis": doc["_source"]["timestampMillis"],
+                "lastObserved": doc["_source"]["systemMetadata"]["lastObserved"],
+                "urn": doc["_source"]["urn"],
+                "eventGranularity": (
+                    doc["_source"]["eventGranularity"]
+                    if "eventGranularity" in doc["_source"]
+                    else None
+                ),
+                "partitionSpec": doc["_source"]["partitionSpec"],
+                "queryCount": (
+                    doc["_source"]["queryCount"]
+                    if "queryCount" in doc["_source"]
+                    else 0
+                ),
+                "uniqueUserCount": (
+                    doc["_source"]["uniqueUserCount"]
+                    if "uniqueUserCount" in doc["_source"]
+                    else None
+                ),
+                "userCounts": (
+                    doc["_source"]["event"]["userCounts"]
+                    if "userCounts" in doc["_source"]["event"]
+                    else []
+                ),
+            }
+
     def upstream_lineage_batch(self, results: Iterable) -> Iterable[Dict]:
         for doc in results:
             if (
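process_query_usage above guards every optional field with an `x if key in src else default` expression. The same mapping condenses to dict.get; a sketch against a fabricated document:

```python
# Fabricated ES hit, for illustration only.
src = {
    "urn": "urn:li:query:abc",
    "timestampMillis": 1700000000000,
    "systemMetadata": {"lastObserved": 1700000000000},
    "partitionSpec": None,
    "event": {},
}

row = {
    "timestampMillis": src["timestampMillis"],
    "lastObserved": src["systemMetadata"]["lastObserved"],
    "urn": src["urn"],
    "eventGranularity": src.get("eventGranularity"),  # None when absent
    "partitionSpec": src["partitionSpec"],
    "queryCount": src.get("queryCount", 0),
    "uniqueUserCount": src.get("uniqueUserCount"),
    "userCounts": src["event"].get("userCounts", []),
}
print(row["queryCount"], row["userCounts"])  # 0 []
```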
@@ -431,7 +563,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             if source_platform_match:
                 source_platform = source_platform_match.group(1)
             else:
-
+                logger.warning("Source Platform not found in urn. Skipping...")
                 continue
 
             destination_platform_match = re.match(
@@ -440,7 +572,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             if destination_platform_match:
                 destination_platform = destination_platform_match.group(1)
             else:
-
+                logger.warning("Destination Platform not found in urn. Skipping...")
                 continue
 
             # In some case like Tableau there is dataset which marked as view and points to a dataset on another platform
@@ -462,7 +594,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 platform = match.group(1)
                 self.report.dataset_platforms_count[platform] += 1
             else:
-
+                logger.warning("Platform not found in urn. Skipping...")
                 continue
 
             yield {
@@ -620,6 +752,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         urn_field: str = "urn",
         platform_field: str = "platform",
         prefix: Optional[str] = None,
+        use_exp_cdf: Optional[bool] = None,
     ) -> polars.LazyFrame:
 
         logger.debug(f"Generating rank and percentile for {count_field} field")
@@ -630,7 +763,8 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .alias(f"{prefix}rank")
         )
 
-        if self.config.use_exp_cdf:
+        use_exp_cdf = self.config.use_exp_cdf if use_exp_cdf is None else use_exp_cdf
+        if use_exp_cdf:
             lf = lf.with_columns(
                 polars.col(count_field)
                 .map_batches(exp_cdf, return_dtype=polars.Int64)
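The new use_exp_cdf parameter lets a caller bypass the module-level exp_cdf transform per call (generate_query_usage below passes use_exp_cdf=False). For orientation, a sketch of what an exponential-CDF percentile does; the package's actual exp_cdf fitting details are not visible in this diff:

```python
import numpy
import polars
from scipy.stats import expon

def exp_cdf_sketch(series: polars.Series) -> polars.Series:
    # Fit an exponential distribution to the counts, then map each count to
    # its CDF value scaled to 0-100: heavy-tailed usage counts become
    # evenly spread percentiles.
    values = series.fill_null(0).to_numpy()
    loc, scale = expon.fit(values)
    percentiles = expon.cdf(values, loc=loc, scale=scale) * 100
    return polars.Series(numpy.rint(percentiles).astype(numpy.int64))

print(exp_cdf_sketch(polars.Series([1, 2, 3, 50, 400])))
```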
@@ -665,18 +799,107 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return lf
 
+    @staticmethod
+    def polars_to_arrow_schema(polars_schema: Dict[str, polars.DataType]) -> pa.Schema:
+        def convert_dtype(polars_dtype: polars.DataType) -> pa.DataType:
+            type_mapping: Dict[polars.DataType, pa.DataType] = {
+                polars.Boolean(): pa.bool_(),
+                polars.Int8(): pa.int8(),
+                polars.Int16(): pa.int16(),
+                polars.Int32(): pa.int32(),
+                polars.Int64(): pa.int64(),
+                polars.UInt8(): pa.uint8(),
+                polars.UInt16(): pa.uint16(),
+                polars.UInt32(): pa.uint32(),
+                polars.UInt64(): pa.uint64(),
+                polars.Float32(): pa.float32(),
+                polars.Float64(): pa.float64(),
+                polars.Utf8(): pa.string(),
+                polars.Date(): pa.date32(),
+                polars.Datetime(): pa.timestamp("ns"),
+                polars.Time(): pa.time64("ns"),
+                polars.Duration(): pa.duration("ns"),
+            }
+
+            if polars_dtype in [type(key) for key in type_mapping.keys()]:
+                return type_mapping[polars_dtype]
+            elif polars_dtype == polars.Categorical():
+                return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
+            else:
+                raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")
+
+        fields = [(name, convert_dtype(dtype)) for name, dtype in polars_schema.items()]
+        return pa.schema(fields)
+
+    def load_es_data_to_lf(
+        self, index: str, query: Dict, read_function: Callable, schema: Dict
+    ) -> polars.LazyFrame:
+        es_data = self.load_data_from_es(
+            index,
+            query,
+            read_function,
+        )
+
+        with tempfile.NamedTemporaryFile(
+            delete=False, mode="wb", suffix=".parquet"
+        ) as temp_file:
+            tempfile_name = temp_file.name
+            logger.debug(f"Creating temporary file {tempfile_name}")
+            self.temp_files_to_clean.append(tempfile_name)
+
+            # Create a PyArrow schema from the provided schema dict
+            pa_schema = self.polars_to_arrow_schema(schema)
+
+            # Initialize the ParquetWriter
+            with pq.ParquetWriter(tempfile_name, pa_schema) as writer:
+                batch_size = (
+                    1000  # Adjust this value based on your data and memory constraints
+                )
+                current_batch = []
+
+                for row in es_data:
+                    current_batch.append(row)
+
+                    if len(current_batch) >= batch_size:
+                        # Convert the batch to a PyArrow Table
+                        table = pa.Table.from_pylist(current_batch, schema=pa_schema)
+
+                        # Write the batch
+                        writer.write_table(table)
+
+                        # Clear the current batch
+                        current_batch = []
+
+                # Write any remaining rows
+                if current_batch:
+                    table = pa.Table.from_pylist(current_batch, schema=pa_schema)
+                    writer.write_table(table)
+
+        return polars.scan_parquet(tempfile_name)
+
     def load_write_usage(
         self, soft_deleted_entities_df: polars.LazyFrame
     ) -> polars.LazyFrame:
-
-
-
-
-
-
-
-
-
+
+        if self.config.streaming_mode:
+            wdf = self.load_es_data_to_lf(
+                index="dataset_operationaspect_v1",
+                query=DATASET_WRITE_USAGE_RAW_QUERY,
+                read_function=self.write_stat_raw_batch,
+                schema={"urn": polars.Categorical, "platform": polars.Categorical},
+            )
+            wdf = wdf.cast({polars.String: polars.Categorical})
+        else:
+            wdf = polars.LazyFrame(
+                self.load_data_from_es(
+                    "dataset_operationaspect_v1",
+                    DATASET_WRITE_USAGE_RAW_QUERY,
+                    self.write_stat_raw_batch,
+                ),
+                schema={"urn": polars.Categorical, "platform": polars.Categorical},
+                strict=True,
+            )
+
         wdf = wdf.group_by(polars.col("urn"), polars.col("platform")).agg(
             polars.col("urn").count().alias("write_count"),
         )
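The streaming path above boils down to: batch a generator's rows through a single ParquetWriter, then hand the temp file to polars.scan_parquet so downstream work stays lazy. A self-contained sketch of that pattern (names are illustrative, not the package's):

```python
import tempfile

import polars
import pyarrow as pa
import pyarrow.parquet as pq

def rows():
    for i in range(2500):
        yield {"urn": f"urn:li:dataset:{i % 7}", "platform": "snowflake"}

schema = pa.schema([("urn", pa.string()), ("platform", pa.string())])
with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as f:
    path = f.name  # caller must remove this later, as __del__ does below

with pq.ParquetWriter(path, schema) as writer:
    batch = []
    for row in rows():
        batch.append(row)
        if len(batch) >= 1000:  # bounded memory: flush fixed-size batches
            writer.write_table(pa.Table.from_pylist(batch, schema=schema))
            batch = []
    if batch:  # flush the remainder
        writer.write_table(pa.Table.from_pylist(batch, schema=schema))

lf = polars.scan_parquet(path)  # lazy: nothing is read yet
print(
    lf.group_by("urn")
    .agg(polars.col("urn").count().alias("write_count"))
    .collect(streaming=True)
)
```

Spilling to parquet trades a little disk I/O for a bounded memory footprint, which is what the streaming_mode flag is buying.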
@@ -692,7 +915,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .drop(["removed"])
         )
 
-        return wdf
+        return wdf.collect(streaming=self.config.streaming_mode).lazy()
 
     def load_write_usage_server_side_aggregation(
         self, soft_deleted_entities_df: polars.LazyFrame
@@ -800,7 +1023,22 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             chart_usage_df = self.generate_chart_usage()
             yield from self.generate_mcp_from_lazyframe(chart_usage_df)
 
-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def generate_query_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
+        with polars.StringCache():
+            logger.info("Generate Query Usage")
+            query_usage_df = self.generate_query_usage()
+            yield from self.generate_query_usage_mcp_from_lazyframe(query_usage_df)
+
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        """A list of functions that transforms the workunits produced by this source.
+        Run in order, first in list is applied first. Be careful with order when overriding.
+        """
+
+        return [
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         if self.config.dataset_usage_enabled:
             with self.report.dataset_usage_processing_time as timer:
                 self.report.report_ingestion_stage_start("generate dataset usage")
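get_workunit_processors is the hook the stateful ingestion base class uses to wrap the workunit stream; auto_workunit_reporter counts each workunit against the report as it flows past. A toy model of the same partial-application shape, not the datahub API itself:

```python
from functools import partial
from typing import Dict, Iterable, Iterator

def auto_reporter_sketch(
    report: Dict[str, int], stream: Iterable[str]
) -> Iterator[str]:
    # Pass workunits through unchanged, counting them as a side effect.
    for workunit in stream:
        report["workunits"] = report.get("workunits", 0) + 1
        yield workunit

report: Dict[str, int] = {}
processor = partial(auto_reporter_sketch, report)  # stream argument left open

for _ in processor(iter(["wu-1", "wu-2", "wu-3"])):
    pass
print(report)  # {'workunits': 3}
```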
@@ -825,6 +1063,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 time_taken = timer.elapsed_seconds()
                 logger.info(f"Chart Usage generation took {time_taken:.3f}")
 
+        if self.config.query_usage_enabled:
+            with self.report.query_usage_processing_time as timer:
+                self.report.report_ingestion_stage_start("generate query usage")
+
+                yield from self.generate_query_usage_mcps()
+
+                time_taken = timer.elapsed_seconds()
+                logger.info(f"Query Usage generation took {time_taken:.3f}")
+
     def generate_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
@@ -918,11 +1165,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                     int(row["write_count"])
                     if "write_count" in row and row["write_count"]
                     else 0
+                    if not self.config.disable_write_usage
+                    else None
                 ),
                 writeCountPercentileLast30Days=(
                     int(row["write_rank_percentile"])
                     if "write_count" in row and row["write_rank_percentile"]
                     else 0
+                    if not self.config.disable_write_usage
+                    else None
                 ),
                 writeCountRankLast30Days=(
                     int(row["write_rank"])
@@ -950,10 +1201,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 combinedSearchRankingMultiplier=search_ranking_multipliers.combinedSearchRankingMultiplier,
             )
 
-            mcp = MetadataChangeProposalWrapper(
-                entityUrn=row["urn"], aspect=usage_feature
-            )
-            yield mcp.as_workunit(is_primary_source=False)
+            yield from self.generate_usage_feature_mcp(row["urn"], usage_feature)
 
             if (
                 "siblings" in row
@@ -962,15 +1210,72 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             ):
                 for sibling in row["siblings"]:
                     if dbt_platform_regexp.match(sibling):
-                        mcp = MetadataChangeProposalWrapper(
-                            entityUrn=sibling, aspect=usage_feature
+                        yield from self.generate_usage_feature_mcp(
+                            sibling, usage_feature
                         )
-                        yield mcp.as_workunit(is_primary_source=False)
-
+
+    def generate_query_usage_mcp_from_lazyframe(
+        self, lazy_frame: polars.LazyFrame
+    ) -> Iterable[MetadataWorkUnit]:
+        num = 0
+        for row in lazy_frame.collect().to_struct():
+            num += 1
+
+            query_usage_features = QueryUsageFeaturesClass(
+                queryCountLast30Days=(
+                    int(row["totalSqlQueries"])
+                    if "totalSqlQueries" in row and row["totalSqlQueries"]
+                    else 0
+                ),
+                queryCountTotal=None,  # This is not implemented
+                runsPercentileLast30days=(
+                    int(row["queries_rank_percentile"])
+                    if "queries_rank_percentile" in row
+                    and row["queries_rank_percentile"]
+                    else 0
+                ),
+                lastExecutedAt=(
+                    int(row["last_modified_at"])
+                    if "last_modified_at" in row and row["last_modified_at"]
+                    else 0
+                ),
+                topUsersLast30Days=(
+                    list(chain.from_iterable(row["top_users"]))
+                    if row["top_users"]
+                    else None
+                ),
+                queryCostLast30Days=None,  # Not implemented yet
+            )
+
+            yield from self.generate_query_usage_feature_mcp(
+                row["urn"], query_usage_features
+            )
+
+    def generate_usage_feature_mcp(
+        self, urn: str, usage_feature: UsageFeaturesClass
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.config.generate_patch:
+            usage_feature_patch_builder = UsageFeaturePatchBuilder(urn=urn)
+            usage_feature_patch_builder.apply_usage_features(usage_feature)
+            for mcp in usage_feature_patch_builder.build():
+                yield MetadataWorkUnit(
+                    id=MetadataWorkUnit.generate_workunit_id(mcp),
+                    mcp_raw=mcp,
+                    is_primary_source=False,
+                )
+        else:
+            mcw = MetadataChangeProposalWrapper(entityUrn=urn, aspect=usage_feature)
+            yield mcw.as_workunit(is_primary_source=False)
+
+    def generate_query_usage_feature_mcp(
+        self, urn: str, query_usage_features: QueryUsageFeaturesClass
+    ) -> Iterable[MetadataWorkUnit]:
+        mcw = MetadataChangeProposalWrapper(entityUrn=urn, aspect=query_usage_features)
+        yield mcw.as_workunit(is_primary_source=False)
 
     def generate_chart_usage(self) -> polars.LazyFrame:
-
-
+        entity_index = "chartindex_v2"
+        usage_index = "chart_chartusagestatisticsaspect_v1"
 
         return self.generate_dashboard_chart_usage(entity_index, usage_index)
 
@@ -1095,6 +1400,83 @@
 
         return lf
 
+    def generate_query_usage(self) -> polars.LazyFrame:
+        usage_index = "query_queryusagestatisticsaspect_v1"
+        entity_index = "queryindex_v2"
+
+        query_entities = polars.LazyFrame(
+            self.load_data_from_es(
+                index=entity_index,
+                query=GET_QUERY_ENTITIES,
+                process_function=self.queries_entities_batch,
+            ),
+            schema={
+                "entity_urn": polars.Categorical,
+                "last_modified_at": polars.Int64,
+                "platform": polars.Categorical,
+                "removed": polars.Boolean,
+            },
+            strict=True,
+        )
+
+        lf: polars.LazyFrame = polars.LazyFrame(
+            self.load_data_from_es(
+                index=usage_index,
+                query=GET_QUERY_USAGE_QUERY,
+                process_function=self.process_query_usage,
+            ),
+            schema={
+                "timestampMillis": polars.Int64,
+                "lastObserved": polars.Int64,
+                "urn": polars.Categorical,
+                "eventGranularity": polars.String,
+                "partitionSpec": polars.Struct(
+                    {
+                        "partition": polars.String,
+                    }
+                ),
+                "queryCount": polars.Int64,
+                "userCounts": polars.List(
+                    polars.Struct(
+                        {
+                            "usageCount": polars.Int64,
+                            "user": polars.String,
+                        }
+                    )
+                ),
+            },
+        )
+
+        lf = query_entities.join(
+            lf, left_on="entity_urn", right_on="urn", how="left", coalesce=False
+        ).filter(
+            polars.col("removed") == False  # noqa: E712
+        )
+
+        total_queries = lf.group_by("urn", "platform").agg(
+            polars.col("queryCount").sum().alias("totalSqlQueries"),
+            polars.col("last_modified_at").max().alias("last_modified_at"),
+        )
+
+        top_users = self.generate_top_users(lf, "usageCount")
+
+        usage_with_top_users = top_users.join(total_queries, on="urn", how="inner")
+
+        usage_with_top_users_with_ranks = self.gen_rank_and_percentile(
+            lf=usage_with_top_users,
+            count_field="totalSqlQueries",
+            urn_field="urn",
+            platform_field="platform",
+            prefix="queries_",
+            use_exp_cdf=False,
+        )
+
+        usage_with_top_users_with_ranks = usage_with_top_users_with_ranks.sort(
+            by=["platform", "queries_rank"], descending=[False, False]
+        )
+
+        return usage_with_top_users_with_ranks
+
     def generate_dataset_usage(self) -> polars.LazyFrame:
         datasets_lf = self.get_datasets()
         if self.config.set_upstream_table_max_modification_time_for_views:
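generate_query_usage above follows the same shape as the dataset pipeline: left-join usage onto entities, drop soft-deleted rows, aggregate per urn, then rank. Its join-filter-aggregate core, reduced to toy frames:

```python
import polars

entities = polars.LazyFrame({
    "entity_urn": ["urn:li:query:a", "urn:li:query:b"],
    "last_modified_at": [1700000000000, 1700000100000],
    "platform": ["snowflake", "snowflake"],
    "removed": [False, True],
})
usage = polars.LazyFrame({
    "urn": ["urn:li:query:a", "urn:li:query:a"],
    "queryCount": [3, 4],
})

totals = (
    entities.join(usage, left_on="entity_urn", right_on="urn", how="left")
    .filter(polars.col("removed") == False)  # noqa: E712  live entities only
    .group_by("entity_urn", "platform")
    .agg(
        polars.col("queryCount").sum().alias("totalSqlQueries"),
        polars.col("last_modified_at").max().alias("last_modified_at"),
    )
)
print(totals.collect())  # one row: urn:li:query:a, totalSqlQueries == 7
```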
@@ -1155,11 +1537,21 @@
             by=["platform", "queries_rank"], descending=[False, False]
         )
 
-        # Calculate write usage
-        if self.config.use_server_side_aggregation:
-            write_lf = self.load_write_usage_server_side_aggregation(datasets_lf)
+        if not self.config.disable_write_usage:
+            # Calculate write usage
+            if self.config.use_server_side_aggregation:
+                write_lf = self.load_write_usage_server_side_aggregation(datasets_lf)
+            else:
+                write_lf = self.load_write_usage(datasets_lf)
         else:
-            write_lf = self.load_write_usage(datasets_lf)
+            logger.info("Write usage disabled")
+            write_lf = polars.LazyFrame(
+                schema={
+                    "urn": polars.Categorical,
+                    "platform": polars.Categorical,
+                    "write_count": polars.Int64,
+                }
+            )
 
         usage_and_write_lf = (
             usage_with_top_users_with_ranks.join(
@@ -1321,3 +1713,9 @@
 
     def get_report(self) -> SourceReport:
         return self.report
+
+    def __del__(self) -> None:
+        for temp_file in self.temp_files_to_clean:
+            logger.info(f"Cleaning up temp file: {temp_file}")
+            os.remove(temp_file)
+        self.temp_files_to_clean = []