acryl-datahub-cloud 0.3.14rc0__py3-none-any.whl → 0.3.14rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub-cloud might be problematic. Click here for more details.

Files changed (29)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
  3. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +26 -7
  4. acryl_datahub_cloud/metadata/_urns/urn_defs.py +2034 -2034
  5. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
  6. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  7. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +6 -0
  8. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +6 -0
  9. acryl_datahub_cloud/metadata/schema.avsc +25471 -25146
  10. acryl_datahub_cloud/metadata/schema_classes.py +1173 -685
  11. acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
  12. acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +123 -2
  13. acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +123 -2
  14. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +8 -0
  15. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +77 -1
  16. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +1 -0
  17. acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +1 -0
  18. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +82 -0
  19. acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
  20. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +1 -0
  21. acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +123 -2
  22. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +8 -2
  23. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +0 -3
  24. acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +123 -2
  25. {acryl_datahub_cloud-0.3.14rc0.dist-info → acryl_datahub_cloud-0.3.14rc2.dist-info}/METADATA +50 -50
  26. {acryl_datahub_cloud-0.3.14rc0.dist-info → acryl_datahub_cloud-0.3.14rc2.dist-info}/RECORD +29 -26
  27. {acryl_datahub_cloud-0.3.14rc0.dist-info → acryl_datahub_cloud-0.3.14rc2.dist-info}/WHEEL +0 -0
  28. {acryl_datahub_cloud-0.3.14rc0.dist-info → acryl_datahub_cloud-0.3.14rc2.dist-info}/entry_points.txt +0 -0
  29. {acryl_datahub_cloud-0.3.14rc0.dist-info → acryl_datahub_cloud-0.3.14rc2.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "acryl-datahub-cloud",
3
- "version": "0.3.14rc0",
3
+ "version": "0.3.14rc2",
4
4
  "install_requires": [
5
5
  "avro-gen3==0.7.16",
6
6
  "acryl-datahub"
@@ -0,0 +1,94 @@
1
+ EXCLUDED_PATTERNS = [
2
+ "_ingestion",
3
+ "amplitude",
4
+ "analytics",
5
+ "anomaly",
6
+ "anomalo",
7
+ "airflow",
8
+ "app",
9
+ "api",
10
+ "aws",
11
+ "braze",
12
+ "bigquery",
13
+ "backfill",
14
+ "billing",
15
+ "bot",
16
+ "census",
17
+ "customer_io",
18
+ "connector",
19
+ "composer",
20
+ "compute",
21
+ "circleci",
22
+ "classifier",
23
+ "cron",
24
+ "datahub",
25
+ "data-engine",
26
+ "dbt",
27
+ "datadog",
28
+ "deploy",
29
+ "databricks",
30
+ "dataflow",
31
+ "dataplex",
32
+ "dagster",
33
+ "enterprise",
34
+ "export",
35
+ "etl",
36
+ "fivetran",
37
+ "function",
38
+ "google",
39
+ "gcp",
40
+ "gke",
41
+ "grafana",
42
+ "hex",
43
+ "hightouch",
44
+ "ingest",
45
+ "infra",
46
+ "infer",
47
+ "integration",
48
+ "iam",
49
+ "job",
50
+ "jenkins",
51
+ "looker",
52
+ "lineage",
53
+ "monte_carlo",
54
+ "netsuite",
55
+ "process",
56
+ "prefect",
57
+ "pipeline",
58
+ "query",
59
+ "redash",
60
+ "realtime",
61
+ "report",
62
+ "remote-executor",
63
+ "runner",
64
+ "sagemaker",
65
+ "salesforce",
66
+ "sigma",
67
+ "sandbox",
68
+ "snowplow",
69
+ "segment",
70
+ "sync",
71
+ "schedul",
72
+ "svc",
73
+ "sa_",
74
+ "_sa",
75
+ "sa-",
76
+ "-sa",
77
+ "snowflake",
78
+ "service",
79
+ "system",
80
+ "spark",
81
+ "task",
82
+ "test",
83
+ "team",
84
+ "talend",
85
+ "teleskope",
86
+ "train",
87
+ "tableau",
88
+ "unknown",
89
+ "wiz",
90
+ "warehouse",
91
+ "workload",
92
+ "workflow",
93
+ "worker",
94
+ ]
@@ -22,6 +22,7 @@ from polars.datatypes import DataTypeClass
22
22
  from pydantic import Field
23
23
  from scipy.stats import expon
24
24
 
25
+ from acryl_datahub_cloud.datahub_usage_reporting.excluded import EXCLUDED_PATTERNS
25
26
  from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
26
27
  from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
27
28
  UsageFeaturePatchBuilder,
@@ -196,6 +197,11 @@ class DataHubUsageFeatureReportingSourceConfig(
196
197
  description="Flag to generate MCP patch for usage features.'",
197
198
  )
198
199
 
200
+ excluded_platforms: List[str] = Field(
201
+ EXCLUDED_PATTERNS,
202
+ description="List of platforms to exclude from usage statistics collection. This is done to avoid invite user functionality to be filled with service accounts.",
203
+ )
204
+
199
205
 
200
206
  def exp_cdf(series: polars.Series) -> polars.Series:
201
207
  with PerfTimer() as timer:
@@ -1573,6 +1579,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1573
1579
  polars.col("platform_usage_pairs").fill_null(polars.lit([]))
1574
1580
  )
1575
1581
 
1582
+ def _filter_users(self, users_lf: polars.LazyFrame) -> polars.LazyFrame:
1583
+ filter_condition = polars.col("user").str.contains("@")
1584
+ for pattern in self.config.excluded_platforms:
1585
+ filter_condition = filter_condition & ~polars.col("user").str.contains(
1586
+ pattern
1587
+ )
1588
+
1589
+ return users_lf.filter(filter_condition)
1590
+
1576
1591
  def _combine_user_totals(
1577
1592
  self,
1578
1593
  dataset_usage_lf: polars.LazyFrame,
@@ -1581,13 +1596,17 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1581
1596
  ) -> polars.LazyFrame:
1582
1597
  """Combine user totals and top_datasets_map from all sources."""
1583
1598
  # Collect all unique users in one operation
1584
- all_users_lf = polars.concat(
1585
- [
1586
- dataset_usage_lf.select("user"),
1587
- dashboard_usage_lf.select("user"),
1588
- chart_usage_lf.select("user"),
1589
- ]
1590
- ).unique()
1599
+ all_users_lf = (
1600
+ polars.concat(
1601
+ [
1602
+ dataset_usage_lf.select("user"),
1603
+ dashboard_usage_lf.select("user"),
1604
+ chart_usage_lf.select("user"),
1605
+ ]
1606
+ )
1607
+ .unique()
1608
+ .pipe(self._filter_users)
1609
+ )
1591
1610
 
1592
1611
  return (
1593
1612
  all_users_lf.join(