acryl-datahub-cloud 0.3.14rc0__py3-none-any.whl → 0.3.14rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +26 -7
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +2034 -2034
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +6 -0
- acryl_datahub_cloud/metadata/schema.avsc +25471 -25146
- acryl_datahub_cloud/metadata/schema_classes.py +1173 -685
- acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
- acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +123 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +123 -2
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +77 -1
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +82 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +123 -2
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +8 -2
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +0 -3
- acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +123 -2
- {acryl_datahub_cloud-0.3.14rc0.dist-info → acryl_datahub_cloud-0.3.14rc2.dist-info}/METADATA +50 -50
- {acryl_datahub_cloud-0.3.14rc0.dist-info → acryl_datahub_cloud-0.3.14rc2.dist-info}/RECORD +29 -26
- {acryl_datahub_cloud-0.3.14rc0.dist-info → acryl_datahub_cloud-0.3.14rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.14rc0.dist-info → acryl_datahub_cloud-0.3.14rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.14rc0.dist-info → acryl_datahub_cloud-0.3.14rc2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Substrings used by the usage-feature reporter as the default value of its
# ``excluded_platforms`` setting: a user whose identifier contains any of
# these entries is dropped from the combined user totals.
#
# Entries are grouped by first letter for readability; the order is not
# otherwise significant to the substring check.
#
# NOTE(review): several entries are very short ("app", "api", "bot", "hex",
# "iam", "job", "sa-", "-sa", ...) and, because matching is by substring, they
# also hit ordinary names (e.g. "william" contains "iam"). Confirm this level
# of aggressiveness is intended.
EXCLUDED_PATTERNS = [
    "_ingestion",
    # a
    "amplitude", "analytics", "anomaly", "anomalo", "airflow", "app", "api", "aws",
    # b
    "braze", "bigquery", "backfill", "billing", "bot",
    # c
    "census", "customer_io", "connector", "composer", "compute", "circleci",
    "classifier", "cron",
    # d
    "datahub", "data-engine", "dbt", "datadog", "deploy", "databricks",
    "dataflow", "dataplex", "dagster",
    # e
    "enterprise", "export", "etl",
    # f
    "fivetran", "function",
    # g
    "google", "gcp", "gke", "grafana",
    # h
    "hex", "hightouch",
    # i
    "ingest", "infra", "infer", "integration", "iam",
    # j
    "job", "jenkins",
    # l
    "looker", "lineage",
    # m
    "monte_carlo",
    # n
    "netsuite",
    # p
    "process", "prefect", "pipeline",
    # q
    "query",
    # r
    "redash", "realtime", "report", "remote-executor", "runner",
    # s ("schedul" is a stem; "sa_", "_sa", "sa-", "-sa" are affix variants)
    "sagemaker", "salesforce", "sigma", "sandbox", "snowplow", "segment",
    "sync", "schedul", "svc", "sa_", "_sa", "sa-", "-sa", "snowflake",
    "service", "system", "spark",
    # t
    "task", "test", "team", "talend", "teleskope", "train", "tableau",
    # u
    "unknown",
    # w
    "wiz", "warehouse", "workload", "workflow", "worker",
]
|
|
@@ -22,6 +22,7 @@ from polars.datatypes import DataTypeClass
|
|
|
22
22
|
from pydantic import Field
|
|
23
23
|
from scipy.stats import expon
|
|
24
24
|
|
|
25
|
+
from acryl_datahub_cloud.datahub_usage_reporting.excluded import EXCLUDED_PATTERNS
|
|
25
26
|
from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
|
|
26
27
|
from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
|
|
27
28
|
UsageFeaturePatchBuilder,
|
|
@@ -196,6 +197,11 @@ class DataHubUsageFeatureReportingSourceConfig(
|
|
|
196
197
|
description="Flag to generate MCP patch for usage features.'",
|
|
197
198
|
)
|
|
198
199
|
|
|
200
|
+
# Despite the field name, the entries are matched as substrings against the
# ``user`` column (see ``_filter_users``); any user containing one of them is
# dropped. The name is kept for config compatibility; only the description
# text is corrected to say what the setting actually does.
excluded_platforms: List[str] = Field(
    EXCLUDED_PATTERNS,
    description="List of substrings (typically platform or tool names) used to exclude matching users from usage statistics. This prevents the invite-user functionality from being filled with service accounts.",
)
|
|
204
|
+
|
|
199
205
|
|
|
200
206
|
def exp_cdf(series: polars.Series) -> polars.Series:
|
|
201
207
|
with PerfTimer() as timer:
|
|
@@ -1573,6 +1579,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1573
1579
|
polars.col("platform_usage_pairs").fill_null(polars.lit([]))
|
|
1574
1580
|
)
|
|
1575
1581
|
|
|
1582
|
+
def _filter_users(self, users_lf: polars.LazyFrame) -> polars.LazyFrame:
    """Keep only rows whose ``user`` value passes the exclusion rules.

    A row is kept when its ``user`` value contains "@" and contains none of
    the entries in ``self.config.excluded_platforms``.

    Args:
        users_lf: Lazy frame with a string ``user`` column.

    Returns:
        The same lazy frame with the filter appended; nothing is collected.
    """
    user_col = polars.col("user")
    filter_condition = user_col.str.contains("@", literal=True)
    for pattern in self.config.excluded_platforms:
        # literal=True: the configured entries are plain substrings. Without
        # it polars compiles each entry as a regex, so a user-supplied entry
        # containing ".", "(", "+", etc. would over-match or fail when the
        # frame is collected. The shipped defaults contain no regex
        # metacharacters, so their results are unchanged.
        filter_condition = filter_condition & ~user_col.str.contains(
            pattern, literal=True
        )

    return users_lf.filter(filter_condition)
|
|
1590
|
+
|
|
1576
1591
|
def _combine_user_totals(
|
|
1577
1592
|
self,
|
|
1578
1593
|
dataset_usage_lf: polars.LazyFrame,
|
|
@@ -1581,13 +1596,17 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1581
1596
|
) -> polars.LazyFrame:
|
|
1582
1597
|
"""Combine user totals and top_datasets_map from all sources."""
|
|
1583
1598
|
# Collect all unique users in one operation
|
|
1584
|
-
all_users_lf =
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1599
|
+
all_users_lf = (
|
|
1600
|
+
polars.concat(
|
|
1601
|
+
[
|
|
1602
|
+
dataset_usage_lf.select("user"),
|
|
1603
|
+
dashboard_usage_lf.select("user"),
|
|
1604
|
+
chart_usage_lf.select("user"),
|
|
1605
|
+
]
|
|
1606
|
+
)
|
|
1607
|
+
.unique()
|
|
1608
|
+
.pipe(self._filter_users)
|
|
1609
|
+
)
|
|
1591
1610
|
|
|
1592
1611
|
return (
|
|
1593
1612
|
all_users_lf.join(
|