acryl-datahub-cloud 0.3.7.7rc6__py3-none-any.whl → 0.3.7.7rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +163 -0
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +24 -126
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +1507 -1507
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- acryl_datahub_cloud/metadata/schema.avsc +24528 -24891
- acryl_datahub_cloud/metadata/schema_classes.py +611 -490
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- {acryl_datahub_cloud-0.3.7.7rc6.dist-info → acryl_datahub_cloud-0.3.7.7rc8.dist-info}/METADATA +37 -37
- {acryl_datahub_cloud-0.3.7.7rc6.dist-info → acryl_datahub_cloud-0.3.7.7rc8.dist-info}/RECORD +15 -13
- {acryl_datahub_cloud-0.3.7.7rc6.dist-info → acryl_datahub_cloud-0.3.7.7rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.7.7rc6.dist-info → acryl_datahub_cloud-0.3.7.7rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.7.7rc6.dist-info → acryl_datahub_cloud-0.3.7.7rc8.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class QueryBuilder:
    """Builds the OpenSearch request bodies used for usage reporting.

    Every public method constructs and returns a new dict on each call, so a
    caller may add keys to the result (for example a page size) without
    affecting the dict returned by any later call.
    """

    @staticmethod
    def _lookback(days: int, round_to_day: bool = True) -> str:
        """Return the date-math lower bound ``now-<days>d``.

        Args:
            days: Size of the look-back window in days.
            round_to_day: When true, append ``/d`` to the expression.

        Raises:
            ValueError: If ``days`` is not a non-negative integer. Without
                this check a negative value would be interpolated as
                ``now--5d``, which is not a well-formed look-back expression.
        """
        # bool is a subclass of int; reject it explicitly so that True/False
        # never end up inside the expression as "now-Trued".
        if isinstance(days, bool) or not isinstance(days, int) or days < 0:
            raise ValueError(f"days must be a non-negative integer, got {days!r}")
        suffix = "/d" if round_to_day else ""
        return f"now-{days}d{suffix}"

    @staticmethod
    def _usage_query(gte: str, *extra_must: Dict) -> Dict:
        """Return a urn-sorted query filtered on time window and ``isExploded``.

        Args:
            gte: Date-math expression used as the inclusive lower bound of
                ``@timestamp``; the upper bound is always ``lt: now/d``.
            extra_must: Additional clauses appended, in order, to the ``must``
                list after the two standard ones.
        """
        must = [
            {"range": {"@timestamp": {"gte": gte, "lt": "now/d"}}},
            {"term": {"isExploded": False}},
        ]
        must.extend(extra_must)
        return {
            "sort": [{"urn": {"order": "asc"}}],
            "query": {"bool": {"filter": {"bool": {"must": must}}}},
        }

    @staticmethod
    def _write_operations_query(days: int) -> Dict:
        """Return the ``query`` clause shared by both write-usage requests.

        It matches documents whose ``@timestamp`` is within the look-back
        window (upper bound ``lte: now/d``) and whose ``operationType`` is one
        of INSERT, UPDATE or CREATE.
        """
        return {
            "bool": {
                "must": [
                    {
                        "range": {
                            "@timestamp": {
                                "gte": QueryBuilder._lookback(days),
                                "lte": "now/d",
                            }
                        }
                    },
                    {"terms": {"operationType": ["INSERT", "UPDATE", "CREATE"]}},
                ]
            }
        }

    @staticmethod
    def get_soft_deleted_entities_query() -> Dict:
        """Return a request body that only sorts by ``urn`` ascending."""
        return {
            "sort": [{"urn": {"order": "asc"}}],
        }

    @staticmethod
    def get_query_entities_query() -> Dict:
        """Return a urn-sorted query excluding documents with source MANUAL."""
        return {
            "sort": [{"urn": {"order": "asc"}}],
            "query": {
                "bool": {
                    "filter": {
                        "bool": {
                            "must_not": [
                                {"term": {"source": "MANUAL"}},
                            ]
                        }
                    }
                }
            },
        }

    @staticmethod
    def get_upstreams_query() -> Dict:
        """Return a query for edges whose source and destination are datasets.

        Results are sorted by ``destination.urn`` ascending.
        """
        return {
            "sort": [{"destination.urn": {"order": "asc"}}],
            "query": {
                "bool": {
                    "must": [
                        {"terms": {"destination.entityType": ["dataset"]}},
                        {"terms": {"source.entityType": ["dataset"]}},
                    ]
                }
            },
        }

    @staticmethod
    def get_dashboard_usage_query(days: int) -> Dict:
        """Return the dashboard usage query for the last ``days`` days.

        Raises:
            ValueError: If ``days`` is not a non-negative integer.
        """
        # NOTE(review): unlike the dataset and query usage requests, the lower
        # bound here is not rounded with "/d". Kept as-is to preserve the
        # existing window; confirm whether the difference is intentional.
        return QueryBuilder._usage_query(
            QueryBuilder._lookback(days, round_to_day=False)
        )

    @staticmethod
    def get_dataset_usage_query(days: int) -> Dict:
        """Return the dataset usage query for the last ``days`` days.

        In addition to the time window and ``isExploded`` filter, only
        documents with ``totalSqlQueries`` greater than zero are matched.

        Raises:
            ValueError: If ``days`` is not a non-negative integer.
        """
        return QueryBuilder._usage_query(
            QueryBuilder._lookback(days),
            {"range": {"totalSqlQueries": {"gt": 0}}},
        )

    @staticmethod
    def get_dataset_write_usage_raw_query(days: int) -> Dict:
        """Return the raw write-operation query for the last ``days`` days.

        Results are sorted by ``urn`` then ``@timestamp`` (both ascending) and
        only those two fields are requested in ``_source``.

        Raises:
            ValueError: If ``days`` is not a non-negative integer.
        """
        return {
            "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
            "query": QueryBuilder._write_operations_query(days),
            "_source": {
                "includes": ["urn", "@timestamp"],
            },
        }

    @staticmethod
    def get_dataset_write_usage_composite_query(days: int) -> Dict:
        """Return the write-operation query with a composite agg on ``urn``.

        The aggregation is named ``urn_count``. No ``size`` is set on the
        composite aggregation by this method.

        Raises:
            ValueError: If ``days`` is not a non-negative integer.
        """
        return {
            "query": QueryBuilder._write_operations_query(days),
            "aggs": {
                "urn_count": {
                    "composite": {
                        "sources": [
                            {"dataset_operationaspect_v1": {"terms": {"field": "urn"}}}
                        ]
                    }
                }
            },
        }

    @staticmethod
    def get_query_usage_query(days: int) -> Dict:
        """Return the query usage query for the last ``days`` days.

        Raises:
            ValueError: If ``days`` is not a non-negative integer.
        """
        return QueryBuilder._usage_query(QueryBuilder._lookback(days))
|
|
@@ -20,6 +20,7 @@ from opensearchpy import OpenSearch
|
|
|
20
20
|
from pydantic import Field
|
|
21
21
|
from scipy.stats import expon
|
|
22
22
|
|
|
23
|
+
from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
|
|
23
24
|
from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
|
|
24
25
|
UsageFeaturePatchBuilder,
|
|
25
26
|
)
|
|
@@ -58,119 +59,6 @@ platform_regexp = re.compile(r"urn:li:dataset:\(urn:li:dataPlatform:(.+?),.*")
|
|
|
58
59
|
dashboard_chart_platform_regexp = re.compile(r"urn:li:(?:dashboard|chart):\((.+?),.*")
|
|
59
60
|
dbt_platform_regexp = re.compile(r"urn:li:dataset:\(urn:li:dataPlatform:dbt,.*\)")
|
|
60
61
|
|
|
61
|
-
GET_SOFT_DELETED_ENTITIES = {
|
|
62
|
-
"sort": [{"urn": {"order": "asc"}}],
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
GET_QUERY_ENTITIES = {
|
|
66
|
-
"sort": [{"urn": {"order": "asc"}}],
|
|
67
|
-
"query": {
|
|
68
|
-
"bool": {
|
|
69
|
-
"filter": {
|
|
70
|
-
"bool": {
|
|
71
|
-
"must_not": [
|
|
72
|
-
{"term": {"source": "MANUAL"}},
|
|
73
|
-
]
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
},
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
GET_UPSTREAMS = {
|
|
81
|
-
"sort": [{"destination.urn": {"order": "asc"}}],
|
|
82
|
-
"query": {
|
|
83
|
-
"bool": {
|
|
84
|
-
"must": [
|
|
85
|
-
{"terms": {"destination.entityType": ["dataset"]}},
|
|
86
|
-
{"terms": {"source.entityType": ["dataset"]}},
|
|
87
|
-
]
|
|
88
|
-
}
|
|
89
|
-
},
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
GET_DASHBOARD_USAGE_QUERY = {
|
|
93
|
-
"sort": [{"urn": {"order": "asc"}}],
|
|
94
|
-
"query": {
|
|
95
|
-
"bool": {
|
|
96
|
-
"filter": {
|
|
97
|
-
"bool": {
|
|
98
|
-
"must": [
|
|
99
|
-
{"range": {"@timestamp": {"gte": "now-30d", "lt": "now/d"}}},
|
|
100
|
-
{"term": {"isExploded": False}},
|
|
101
|
-
]
|
|
102
|
-
}
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
},
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
GET_DATASET_USAGE_QUERY = {
|
|
109
|
-
"sort": [{"urn": {"order": "asc"}}],
|
|
110
|
-
"query": {
|
|
111
|
-
"bool": {
|
|
112
|
-
"filter": {
|
|
113
|
-
"bool": {
|
|
114
|
-
"must": [
|
|
115
|
-
{"range": {"@timestamp": {"gte": "now-30d/d", "lt": "now/d"}}},
|
|
116
|
-
{"term": {"isExploded": False}},
|
|
117
|
-
{"range": {"totalSqlQueries": {"gt": 0}}},
|
|
118
|
-
]
|
|
119
|
-
}
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
},
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
DATASET_WRITE_USAGE_RAW_QUERY = {
|
|
126
|
-
"sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
|
|
127
|
-
"query": {
|
|
128
|
-
"bool": {
|
|
129
|
-
"must": [
|
|
130
|
-
{"range": {"@timestamp": {"gte": "now-30d/d", "lte": "now/d"}}},
|
|
131
|
-
{"terms": {"operationType": ["INSERT", "UPDATE", "CREATE"]}},
|
|
132
|
-
]
|
|
133
|
-
}
|
|
134
|
-
},
|
|
135
|
-
"_source": {
|
|
136
|
-
"includes": ["urn", "@timestamp"],
|
|
137
|
-
},
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
DATASET_WRITE_USAGE_COMPOSITE_QUERY = {
|
|
141
|
-
"query": {
|
|
142
|
-
"bool": {
|
|
143
|
-
"must": [
|
|
144
|
-
{"range": {"@timestamp": {"gte": "now-30d/d", "lte": "now/d"}}},
|
|
145
|
-
{"terms": {"operationType": ["INSERT", "UPDATE", "CREATE"]}},
|
|
146
|
-
]
|
|
147
|
-
}
|
|
148
|
-
},
|
|
149
|
-
"aggs": {
|
|
150
|
-
"urn_count": {
|
|
151
|
-
"composite": {
|
|
152
|
-
"sources": [{"dataset_operationaspect_v1": {"terms": {"field": "urn"}}}]
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
},
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
GET_QUERY_USAGE_QUERY = {
|
|
159
|
-
"sort": [{"urn": {"order": "asc"}}],
|
|
160
|
-
"query": {
|
|
161
|
-
"bool": {
|
|
162
|
-
"filter": {
|
|
163
|
-
"bool": {
|
|
164
|
-
"must": [
|
|
165
|
-
{"range": {"@timestamp": {"gte": "now-30d/d", "lt": "now/d"}}},
|
|
166
|
-
{"term": {"isExploded": False}},
|
|
167
|
-
]
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
},
|
|
172
|
-
}
|
|
173
|
-
|
|
174
62
|
|
|
175
63
|
class S3ClientConfig(ConfigModel):
|
|
176
64
|
bucket: str = os.getenv("DATA_BUCKET", "")
|
|
@@ -206,7 +94,13 @@ class RankingPolicy(ConfigModel):
|
|
|
206
94
|
regexp_based_factors: List[RegexpFactor] = []
|
|
207
95
|
|
|
208
96
|
|
|
209
|
-
class DataHubUsageFeatureReportingSourceConfig(
|
|
97
|
+
class DataHubUsageFeatureReportingSourceConfig(
|
|
98
|
+
ConfigModel, StatefulIngestionConfigBase
|
|
99
|
+
):
|
|
100
|
+
lookback_days: int = Field(
|
|
101
|
+
30, description="Number of days to look back for usage data."
|
|
102
|
+
)
|
|
103
|
+
|
|
210
104
|
server: Optional[DatahubClientConfig] = Field(
|
|
211
105
|
None, description="Optional configuration for the DataHub server connection."
|
|
212
106
|
)
|
|
@@ -882,7 +776,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
882
776
|
if self.config.streaming_mode:
|
|
883
777
|
wdf = self.load_es_data_to_lf(
|
|
884
778
|
index="dataset_operationaspect_v1",
|
|
885
|
-
query=
|
|
779
|
+
query=QueryBuilder.get_dataset_write_usage_raw_query(
|
|
780
|
+
self.config.lookback_days
|
|
781
|
+
),
|
|
886
782
|
read_function=self.write_stat_raw_batch,
|
|
887
783
|
schema={"urn": polars.Categorical, "platform": polars.Categorical},
|
|
888
784
|
)
|
|
@@ -891,7 +787,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
891
787
|
wdf = polars.LazyFrame(
|
|
892
788
|
self.load_data_from_es(
|
|
893
789
|
"dataset_operationaspect_v1",
|
|
894
|
-
|
|
790
|
+
QueryBuilder.get_dataset_write_usage_raw_query(
|
|
791
|
+
self.config.lookback_days
|
|
792
|
+
),
|
|
895
793
|
self.write_stat_raw_batch,
|
|
896
794
|
),
|
|
897
795
|
schema={"urn": polars.Categorical, "platform": polars.Categorical},
|
|
@@ -918,12 +816,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
918
816
|
def load_write_usage_server_side_aggregation(
|
|
919
817
|
self, soft_deleted_entities_df: polars.LazyFrame
|
|
920
818
|
) -> polars.LazyFrame:
|
|
921
|
-
query: Dict = DATASET_WRITE_USAGE_COMPOSITE_QUERY
|
|
922
|
-
query["aggs"]["urn_count"]["composite"]["size"] = self.config.extract_batch_size
|
|
923
819
|
wdf = polars.LazyFrame(
|
|
924
820
|
self.load_data_from_es(
|
|
925
821
|
"dataset_operationaspect_v1",
|
|
926
|
-
|
|
822
|
+
QueryBuilder.get_dataset_write_usage_composite_query(
|
|
823
|
+
self.config.lookback_days
|
|
824
|
+
),
|
|
927
825
|
self.write_stat_batch,
|
|
928
826
|
aggregation_key="urn_count",
|
|
929
827
|
),
|
|
@@ -954,7 +852,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
954
852
|
upstreams_lf = polars.LazyFrame(
|
|
955
853
|
self.load_data_from_es(
|
|
956
854
|
"graph_service_v1",
|
|
957
|
-
|
|
855
|
+
QueryBuilder.get_upstreams_query(),
|
|
958
856
|
self.upstream_lineage_batch,
|
|
959
857
|
),
|
|
960
858
|
schema={
|
|
@@ -1289,7 +1187,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1289
1187
|
soft_deleted_df = polars.LazyFrame(
|
|
1290
1188
|
self.load_data_from_es(
|
|
1291
1189
|
index=entity_index,
|
|
1292
|
-
query=
|
|
1190
|
+
query=QueryBuilder.get_soft_deleted_entities_query(),
|
|
1293
1191
|
process_function=self.soft_deleted_batch,
|
|
1294
1192
|
),
|
|
1295
1193
|
schema={
|
|
@@ -1305,7 +1203,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1305
1203
|
lf: polars.LazyFrame = polars.LazyFrame(
|
|
1306
1204
|
self.load_data_from_es(
|
|
1307
1205
|
index=usage_index,
|
|
1308
|
-
query=
|
|
1206
|
+
query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
|
|
1309
1207
|
process_function=self.process_dashboard_usage,
|
|
1310
1208
|
),
|
|
1311
1209
|
schema={
|
|
@@ -1405,7 +1303,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1405
1303
|
query_entities = polars.LazyFrame(
|
|
1406
1304
|
self.load_data_from_es(
|
|
1407
1305
|
index=entity_index,
|
|
1408
|
-
query=
|
|
1306
|
+
query=QueryBuilder.get_query_entities_query(),
|
|
1409
1307
|
process_function=self.queries_entities_batch,
|
|
1410
1308
|
),
|
|
1411
1309
|
schema={
|
|
@@ -1420,7 +1318,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1420
1318
|
lf: polars.LazyFrame = polars.LazyFrame(
|
|
1421
1319
|
self.load_data_from_es(
|
|
1422
1320
|
index=usage_index,
|
|
1423
|
-
query=
|
|
1321
|
+
query=QueryBuilder.get_query_usage_query(self.config.lookback_days),
|
|
1424
1322
|
process_function=self.process_query_usage,
|
|
1425
1323
|
),
|
|
1426
1324
|
schema={
|
|
@@ -1484,7 +1382,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1484
1382
|
lf: polars.LazyFrame = polars.LazyFrame(
|
|
1485
1383
|
self.load_data_from_es(
|
|
1486
1384
|
index=index,
|
|
1487
|
-
query=
|
|
1385
|
+
query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
|
|
1488
1386
|
process_function=self.process_batch,
|
|
1489
1387
|
),
|
|
1490
1388
|
schema={
|
|
@@ -1576,7 +1474,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1576
1474
|
datasets_df = polars.LazyFrame(
|
|
1577
1475
|
self.load_data_from_es(
|
|
1578
1476
|
index="datasetindex_v2",
|
|
1579
|
-
query=
|
|
1477
|
+
query=QueryBuilder.get_soft_deleted_entities_query(),
|
|
1580
1478
|
process_function=self.soft_deleted_batch,
|
|
1581
1479
|
),
|
|
1582
1480
|
schema={
|