acryl-datahub-cloud 0.3.12rc1__py3-none-any.whl → 0.3.12rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub-cloud might be problematic.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +559 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
- acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
- acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
- acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +49 -40
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +1842 -1786
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- acryl_datahub_cloud/metadata/schema.avsc +24861 -24050
- acryl_datahub_cloud/metadata/schema_classes.py +1031 -631
- acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
- acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +72 -0
- acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +40 -7
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +27 -6
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +31 -7
- acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +14 -0
- acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/FormKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +27 -6
- acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +1 -0
- acryl_datahub_cloud/notifications/__init__.py +0 -0
- acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
- acryl_datahub_cloud/sdk/__init__.py +29 -0
- acryl_datahub_cloud/{_sdk_extras → sdk}/assertion.py +501 -193
- acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
- acryl_datahub_cloud/{_sdk_extras → sdk/assertion_input}/assertion_input.py +733 -189
- acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +261 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +947 -0
- acryl_datahub_cloud/sdk/assertions_client.py +1639 -0
- acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
- acryl_datahub_cloud/{_sdk_extras → sdk}/entities/assertion.py +5 -2
- acryl_datahub_cloud/{_sdk_extras → sdk}/subscription_client.py +146 -33
- {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc4.dist-info}/METADATA +48 -43
- {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc4.dist-info}/RECORD +72 -54
- {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc4.dist-info}/entry_points.txt +1 -0
- acryl_datahub_cloud/_sdk_extras/__init__.py +0 -19
- acryl_datahub_cloud/_sdk_extras/assertions_client.py +0 -717
- /acryl_datahub_cloud/{_sdk_extras/entities → datahub_forms_notifications}/__init__.py +0 -0
- /acryl_datahub_cloud/{_sdk_extras → sdk}/entities/monitor.py +0 -0
- /acryl_datahub_cloud/{_sdk_extras → sdk}/entities/subscription.py +0 -0
- /acryl_datahub_cloud/{_sdk_extras → sdk}/errors.py +0 -0
- /acryl_datahub_cloud/{_sdk_extras → sdk}/resolver_client.py +0 -0
- {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc4.dist-info}/top_level.txt +0 -0
--- a/acryl_datahub_cloud/datahub_usage_reporting/query_builder.py
+++ b/acryl_datahub_cloud/datahub_usage_reporting/query_builder.py
@@ -1,3 +1,4 @@
+from datetime import datetime, timedelta
 from typing import Dict
 
 
@@ -5,7 +6,7 @@ class QueryBuilder:
     @staticmethod
     def get_dataset_entities_query() -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "urn",
@@ -19,15 +20,54 @@ class QueryBuilder:
         }
 
     @staticmethod
-    def get_query_entities_query() -> Dict:
+    def get_query_entities_query(days: int) -> Dict:
+        thirty_days_ago = datetime.now() - timedelta(days=days)
+        thirty_days_ago = thirty_days_ago.replace(
+            hour=0, minute=0, second=0, microsecond=0
+        )
+        epoch_ms = int(thirty_days_ago.timestamp() * 1000)
+
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {"includes": ["urn", "lastModifiedAt", "platform", "removed"]},
             "query": {
                 "bool": {
                     "filter": [
                         {"bool": {"must_not": [{"term": {"source": "MANUAL"}}]}},
                         {"exists": {"field": "platform"}},
+                        {
+                            "bool": {
+                                "should": [
+                                    {
+                                        "bool": {
+                                            "filter": [
+                                                {"exists": {"field": "lastModifiedAt"}},
+                                                {
+                                                    "range": {
+                                                        "lastModifiedAt": {
+                                                            "gte": epoch_ms
+                                                        }
+                                                    }
+                                                },
+                                            ]
+                                        }
+                                    },
+                                    {
+                                        "bool": {
+                                            "must_not": {
+                                                "exists": {"field": "lastModifiedAt"}
+                                            },
+                                            "filter": {
+                                                "range": {
+                                                    "createdAt": {"gte": epoch_ms}
+                                                }
+                                            },
+                                        }
+                                    },
+                                ],
+                                "minimum_should_match": 1,
+                            }
+                        },
                     ]
                 }
             },
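The new `should` block restricts query entities to the lookback window: a document passes if its `lastModifiedAt` is at or after the cutoff, or, when `lastModifiedAt` is absent, if its `createdAt` is (the upstream variable is still named `thirty_days_ago` even though the window is now configurable). A minimal sketch of the same predicate in plain Python; the `doc` dicts are hypothetical stand-ins for ES documents, not part of this package:

    from datetime import datetime, timedelta

    def cutoff_epoch_ms(days: int) -> int:
        # Midnight `days` ago as epoch milliseconds, mirroring the builder above.
        start = (datetime.now() - timedelta(days=days)).replace(
            hour=0, minute=0, second=0, microsecond=0
        )
        return int(start.timestamp() * 1000)

    def in_window(doc: dict, cutoff: int) -> bool:
        # bool/should with minimum_should_match=1: lastModifiedAt wins when
        # present; createdAt is the fallback for never-modified documents.
        if doc.get("lastModifiedAt") is not None:
            return doc["lastModifiedAt"] >= cutoff
        return doc.get("createdAt", 0) >= cutoff

    cutoff = cutoff_epoch_ms(30)
    print(in_window({"lastModifiedAt": cutoff + 1}, cutoff))  # True
    print(in_window({"createdAt": cutoff - 1}, cutoff))       # False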
@@ -36,7 +76,7 @@ class QueryBuilder:
     @staticmethod
     def get_upstreams_query() -> Dict:
         return {
-            "sort": [{"destination.urn": {"order": "asc"}}],
+            # "sort": [{"destination.urn": {"order": "asc"}}],
             "_source": {"includes": ["source.urn", "destination.urn"]},
             "query": {
                 "bool": {
@@ -51,7 +91,7 @@ class QueryBuilder:
     @staticmethod
     def get_dashboard_usage_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "timestampMillis",
@@ -80,7 +120,7 @@ class QueryBuilder:
     @staticmethod
     def get_dataset_usage_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "timestampMillis",
@@ -110,7 +150,7 @@ class QueryBuilder:
     @staticmethod
     def get_dataset_write_usage_raw_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "urn"  # Only field needed for platform extraction via regex
@@ -159,7 +199,7 @@ class QueryBuilder:
     @staticmethod
     def get_query_usage_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "timestampMillis",
--- a/acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py
+++ b/acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py
@@ -114,12 +114,12 @@ class DataHubUsageFeatureReportingSourceConfig(
         30, description="Timeout in seconds for the search queries."
     )
     extract_batch_size: int = Field(
-
+        5000,
         description="The number of documents to retrieve in each batch from ElasticSearch or OpenSearch.",
     )
 
     extract_delay: Optional[float] = Field(
-        0
+        0,
         description="The delay in seconds between each batch extraction from ElasticSearch or OpenSearch.",
     )
 
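Both extraction-tuning fields now carry explicit defaults: 5000 documents per batch and no delay between batches. A minimal sketch of the resulting behavior with pydantic, where `UsageReportingConfig` is a hypothetical stand-in for the real config class:

    from typing import Optional

    from pydantic import BaseModel, Field

    class UsageReportingConfig(BaseModel):  # hypothetical stand-in
        extract_batch_size: int = Field(
            5000,
            description="Documents per batch from ElasticSearch or OpenSearch.",
        )
        extract_delay: Optional[float] = Field(
            0,
            description="Delay in seconds between batch extractions.",
        )

    config = UsageReportingConfig()
    print(config.extract_batch_size)  # 5000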
@@ -177,7 +177,7 @@ class DataHubUsageFeatureReportingSourceConfig(
     # This option is only needed here until we are sure that the streaming mode is stable.
     # then we can remove it and control it with the streaming_mode option.
     experimental_full_streaming: bool = Field(
-
+        True,
         description="Flag to enable full streaming mode.'",
     )
 
@@ -617,11 +617,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 ),
             )
 
-            response = server.create_pit(index, keep_alive="10m")
+            # response = server.create_pit(index, keep_alive="10m")
 
             # TODO: Save PIT, we can resume processing based on <pit, search_after> tuple
-            pit = response.get("pit_id")
-            query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
+            # pit = response.get("pit_id")
+            # query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
         else:
             server = Elasticsearch(
                 [endpoint],
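These lines retire point-in-time (PIT) pagination on the OpenSearch branch; the matching `search_after` update is removed further down in the same release. For reference, a sketch of the retired pattern reconstructed from the removed lines, where `server` is assumed to be the opensearch-py client built just above and `index`, `query`, and `batch_size` are the values used elsewhere in this source:

    from typing import Any, Dict, Iterable

    def pit_paginate(server: Any, index: str, query: Dict, batch_size: int) -> Iterable[Dict]:
        # Open a PIT so every page sees the same snapshot of the index.
        response = server.create_pit(index, keep_alive="10m")
        query = dict(query, pit={"id": response.get("pit_id"), "keep_alive": "10m"})
        while True:
            results = server.search(body=query, size=batch_size)
            hits = results["hits"]["hits"]
            yield from hits
            if len(hits) < batch_size:
                break  # short page: nothing left to fetch
            # Resume after the last hit; this needs a deterministic sort in the
            # query, which is why the sort clauses were load-bearing for this scheme.
            query["search_after"] = hits[-1]["sort"]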
@@ -834,7 +834,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .drop(["removed"])
         )
 
-        return wdf
+        return wdf
 
     def load_write_usage_server_side_aggregation(
         self, soft_deleted_entities_df: polars.LazyFrame
@@ -995,7 +995,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
         for row in lazy_frame.collect(
-            streaming
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
         ).to_struct():
             if "siblings" in row and row["siblings"]:
                 logger.info(f"Siblings found for urn: {row['urn']} -> row['siblings']")
@@ -1086,7 +1086,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_query_usage_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        for row in lazy_frame.collect(
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
             query_usage_features = QueryUsageFeaturesClass(
                 queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
                 queryCountTotal=None,  # This is not implemented
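Both collect sites now pick the polars execution engine explicitly. In recent polars releases the boolean `streaming` flag on `LazyFrame.collect` was superseded by an `engine` argument: `engine="streaming"` processes the plan in batches, while `engine="auto"` lets polars choose. A small sketch with illustrative data:

    import polars

    lf = polars.LazyFrame({"urn": ["urn:a", "urn:b"], "totalSqlQueries": [3, 5]})

    full_streaming = True  # stand-in for self.config.experimental_full_streaming
    df = lf.collect(engine="streaming" if full_streaming else "auto")

    # iter_rows(named=True) yields one plain dict per row, as used above.
    for row in df.iter_rows(named=True):
        print(row["urn"], row["totalSqlQueries"])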
@@ -1308,7 +1310,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         query_entities = self.load_data_from_es_to_lf(
             schema=query_entities_schema,
             index=entity_index,
-            query=QueryBuilder.get_query_entities_query(),
+            query=QueryBuilder.get_query_entities_query(self.config.lookback_days),
             process_function=self.queries_entities_batch,
         )
 
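The call site now threads the configured lookback window through to the query builder instead of calling the old zero-argument form. A usage sketch, assuming the wheel is installed and with 30 standing in for `self.config.lookback_days`:

    from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder

    query = QueryBuilder.get_query_entities_query(30)
    # The last filter is the new lookback clause shown above.
    print(query["query"]["bool"]["filter"][-1]["bool"]["minimum_should_match"])  # 1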
@@ -1485,11 +1487,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         # called `Option::unwrap()` on a `None` value
         # Which only happens if we don't collect immediately
         # return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
-        return (
-            polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
-            .collect()
-            .lazy()
-        )
+        return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
 
     def load_dataset_usage(self) -> polars.LazyFrame:
         index = "dataset_datasetusagestatisticsaspect_v1"
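The rewrite returns the `scan_parquet` plan as-is rather than forcing an immediate `.collect().lazy()` round trip, so the frame is only materialized when a downstream consumer collects it. A hedged illustration of the difference:

    import os
    import tempfile

    import polars

    path = os.path.join(tempfile.mkdtemp(), "example.parquet")
    polars.DataFrame({"urn": ["urn:a"], "removed": [False]}).write_parquet(path)

    eager = polars.scan_parquet(path).collect().lazy()  # reads the file right now
    lazy = polars.scan_parquet(path)                    # reads only on .collect()
    print(lazy.collect().shape)  # (1, 2)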
@@ -1606,23 +1604,40 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         delay: Optional[float] = None,
     ) -> Iterable[Dict[str, Any]]:
         processed_count = 0
+        scroll_id = None
         while True:
             with PerfTimer() as timer:
                 logger.debug(f"ES query: {query}")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if not scroll_id:
+                    logger.debug(
+                        f"Getting inital data from index {index} without scroll id"
+                    )
+                    results = server.search(
+                        body=query,
+                        size=batch_size,
+                        scroll="2m",
+                        index=index,
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                else:
+                    logger.debug(
+                        f"Getting data from index {index} using scroll_id: {scroll_id}"
+                    )
+                    results = server.scroll(
+                        scroll_id=scroll_id,
+                        scroll="2m",
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                scroll_id = results["_scroll_id"]
+
             if not aggregation_key:
                 yield from process_function(results["hits"]["hits"])
 
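The fetch loop replaces `search_after` paging with the scroll API: the first `search` opens a two-minute scroll context, and every later page is fetched by `scroll_id` alone. Because scroll keeps a server-side cursor, the deterministic sort the old scheme depended on is unnecessary, which is presumably why the `sort` clauses in query_builder.py could be commented out. A condensed sketch of the new loop, with client construction and the timeout params elided; `search` and `scroll` are the standard elasticsearch-py/opensearch-py methods used in the diff:

    from typing import Any, Dict, Iterable

    def scroll_paginate(server: Any, index: str, query: Dict, batch_size: int) -> Iterable[Dict]:
        scroll_id = None
        while True:
            if scroll_id is None:
                # First page: run the query and open a scroll context.
                results = server.search(body=query, size=batch_size, scroll="2m", index=index)
            else:
                # Later pages: advance the server-side cursor.
                results = server.scroll(scroll_id=scroll_id, scroll="2m")
            scroll_id = results["_scroll_id"]
            hits = results["hits"]["hits"]
            yield from hits
            if len(hits) < batch_size:
                break  # short page ends the scroll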
@@ -1633,7 +1648,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 )
                 if len(results["hits"]["hits"]) < batch_size:
                     break
-                query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
             else:
                 yield from process_function(
                     results["aggregations"][aggregation_key]["buckets"]
@@ -1643,16 +1657,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                     < batch_size
                 ):
                     break
-
-
-
-
-
-                if delay:
-                    logger.debug(
-                        f"Sleeping for {delay} seconds before getting next batch from ES"
-                    )
-                    time.sleep(delay)
+            if delay:
+                logger.debug(
+                    f"Sleeping for {delay} seconds before getting next batch from ES"
+                )
+                time.sleep(delay)
 
     def get_report(self) -> SourceReport:
         return self.report
return self.report
|