acryl-datahub-cloud 0.3.8rc0__py3-none-any.whl → 0.3.8rc2__py3-none-any.whl
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/acryl_cs_issues/source.py +0 -1
- acryl_datahub_cloud/datahub_metadata_sharing/__init__.py +0 -0
- acryl_datahub_cloud/datahub_metadata_sharing/metadata_sharing_source.py +262 -0
- acryl_datahub_cloud/datahub_metadata_sharing/query.py +7 -0
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +0 -2
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +0 -1
- acryl_datahub_cloud/datahub_reporting/extract_graph.py +0 -1
- acryl_datahub_cloud/datahub_reporting/extract_sql.py +0 -1
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +163 -0
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +29 -129
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +1612 -1567
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/executor/__init__.py +15 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- acryl_datahub_cloud/metadata/schema.avsc +25096 -25347
- acryl_datahub_cloud/metadata/schema_classes.py +807 -503
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +63 -0
- acryl_datahub_cloud/metadata/schemas/EntityTypeKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestInput.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestResult.avsc +14 -0
- acryl_datahub_cloud/metadata/schemas/PostInfo.avsc +23 -0
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorStatus.avsc +80 -0
- acryl_datahub_cloud/metadata/schemas/VersionProperties.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/__init__.py +3 -3
- {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc2.dist-info}/METADATA +43 -37
- {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc2.dist-info}/RECORD +33 -25
- {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc2.dist-info}/WHEEL +1 -1
- {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc2.dist-info}/entry_points.txt +1 -0
- {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc2.dist-info}/top_level.txt +0 -0
|
@@ -20,6 +20,7 @@ from opensearchpy import OpenSearch
|
|
|
20
20
|
from pydantic import Field
|
|
21
21
|
from scipy.stats import expon
|
|
22
22
|
|
|
23
|
+
from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
|
|
23
24
|
from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
|
|
24
25
|
UsageFeaturePatchBuilder,
|
|
25
26
|
)
|
|
@@ -58,119 +59,6 @@ platform_regexp = re.compile(r"urn:li:dataset:\(urn:li:dataPlatform:(.+?),.*")
|
|
|
58
59
|
dashboard_chart_platform_regexp = re.compile(r"urn:li:(?:dashboard|chart):\((.+?),.*")
|
|
59
60
|
dbt_platform_regexp = re.compile(r"urn:li:dataset:\(urn:li:dataPlatform:dbt,.*\)")
|
|
60
61
|
|
|
61
|
-
GET_SOFT_DELETED_ENTITIES = {
|
|
62
|
-
"sort": [{"urn": {"order": "asc"}}],
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
GET_QUERY_ENTITIES = {
|
|
66
|
-
"sort": [{"urn": {"order": "asc"}}],
|
|
67
|
-
"query": {
|
|
68
|
-
"bool": {
|
|
69
|
-
"filter": {
|
|
70
|
-
"bool": {
|
|
71
|
-
"must_not": [
|
|
72
|
-
{"term": {"source": "MANUAL"}},
|
|
73
|
-
]
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
},
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
GET_UPSTREAMS = {
|
|
81
|
-
"sort": [{"destination.urn": {"order": "asc"}}],
|
|
82
|
-
"query": {
|
|
83
|
-
"bool": {
|
|
84
|
-
"must": [
|
|
85
|
-
{"terms": {"destination.entityType": ["dataset"]}},
|
|
86
|
-
{"terms": {"source.entityType": ["dataset"]}},
|
|
87
|
-
]
|
|
88
|
-
}
|
|
89
|
-
},
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
GET_DASHBOARD_USAGE_QUERY = {
|
|
93
|
-
"sort": [{"urn": {"order": "asc"}}],
|
|
94
|
-
"query": {
|
|
95
|
-
"bool": {
|
|
96
|
-
"filter": {
|
|
97
|
-
"bool": {
|
|
98
|
-
"must": [
|
|
99
|
-
{"range": {"@timestamp": {"gte": "now-30d", "lt": "now/d"}}},
|
|
100
|
-
{"term": {"isExploded": False}},
|
|
101
|
-
]
|
|
102
|
-
}
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
},
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
GET_DATASET_USAGE_QUERY = {
|
|
109
|
-
"sort": [{"urn": {"order": "asc"}}],
|
|
110
|
-
"query": {
|
|
111
|
-
"bool": {
|
|
112
|
-
"filter": {
|
|
113
|
-
"bool": {
|
|
114
|
-
"must": [
|
|
115
|
-
{"range": {"@timestamp": {"gte": "now-30d/d", "lt": "now/d"}}},
|
|
116
|
-
{"term": {"isExploded": False}},
|
|
117
|
-
{"range": {"totalSqlQueries": {"gt": 0}}},
|
|
118
|
-
]
|
|
119
|
-
}
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
},
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
DATASET_WRITE_USAGE_RAW_QUERY = {
|
|
126
|
-
"sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
|
|
127
|
-
"query": {
|
|
128
|
-
"bool": {
|
|
129
|
-
"must": [
|
|
130
|
-
{"range": {"@timestamp": {"gte": "now-30d/d", "lte": "now/d"}}},
|
|
131
|
-
{"terms": {"operationType": ["INSERT", "UPDATE", "CREATE"]}},
|
|
132
|
-
]
|
|
133
|
-
}
|
|
134
|
-
},
|
|
135
|
-
"_source": {
|
|
136
|
-
"includes": ["urn", "@timestamp"],
|
|
137
|
-
},
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
DATASET_WRITE_USAGE_COMPOSITE_QUERY = {
|
|
141
|
-
"query": {
|
|
142
|
-
"bool": {
|
|
143
|
-
"must": [
|
|
144
|
-
{"range": {"@timestamp": {"gte": "now-30d/d", "lte": "now/d"}}},
|
|
145
|
-
{"terms": {"operationType": ["INSERT", "UPDATE", "CREATE"]}},
|
|
146
|
-
]
|
|
147
|
-
}
|
|
148
|
-
},
|
|
149
|
-
"aggs": {
|
|
150
|
-
"urn_count": {
|
|
151
|
-
"composite": {
|
|
152
|
-
"sources": [{"dataset_operationaspect_v1": {"terms": {"field": "urn"}}}]
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
},
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
GET_QUERY_USAGE_QUERY = {
|
|
159
|
-
"sort": [{"urn": {"order": "asc"}}],
|
|
160
|
-
"query": {
|
|
161
|
-
"bool": {
|
|
162
|
-
"filter": {
|
|
163
|
-
"bool": {
|
|
164
|
-
"must": [
|
|
165
|
-
{"range": {"@timestamp": {"gte": "now-30d/d", "lt": "now/d"}}},
|
|
166
|
-
{"term": {"isExploded": False}},
|
|
167
|
-
]
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
},
|
|
172
|
-
}
|
|
173
|
-
|
|
174
62
|
|
|
175
63
|
class S3ClientConfig(ConfigModel):
|
|
176
64
|
bucket: str = os.getenv("DATA_BUCKET", "")
|
|
@@ -206,7 +94,13 @@ class RankingPolicy(ConfigModel):
|
|
|
206
94
|
regexp_based_factors: List[RegexpFactor] = []
|
|
207
95
|
|
|
208
96
|
|
|
209
|
-
class DataHubUsageFeatureReportingSourceConfig(
|
|
97
|
+
class DataHubUsageFeatureReportingSourceConfig(
|
|
98
|
+
ConfigModel, StatefulIngestionConfigBase
|
|
99
|
+
):
|
|
100
|
+
lookback_days: int = Field(
|
|
101
|
+
30, description="Number of days to look back for usage data."
|
|
102
|
+
)
|
|
103
|
+
|
|
210
104
|
server: Optional[DatahubClientConfig] = Field(
|
|
211
105
|
None, description="Optional configuration for the DataHub server connection."
|
|
212
106
|
)
|
|
@@ -381,25 +275,27 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
381
275
|
"last_modified_at": (
|
|
382
276
|
doc["_source"]["lastModifiedAt"]
|
|
383
277
|
if "lastModifiedAt" in doc["_source"]
|
|
278
|
+
and doc["_source"]["lastModifiedAt"]
|
|
384
279
|
else (
|
|
385
280
|
doc["_source"]["lastModifiedAt"]
|
|
386
281
|
if "lastModifiedAt" in doc["_source"]
|
|
282
|
+
and doc["_source"]["lastModifiedAt"]
|
|
387
283
|
else None
|
|
388
284
|
)
|
|
389
285
|
),
|
|
390
286
|
"removed": (
|
|
391
287
|
doc["_source"]["removed"]
|
|
392
|
-
if "removed" in doc["_source"]
|
|
288
|
+
if "removed" in doc["_source"] and doc["_source"]["removed"]
|
|
393
289
|
else False
|
|
394
290
|
),
|
|
395
291
|
"siblings": (
|
|
396
292
|
doc["_source"]["siblings"]
|
|
397
|
-
if "siblings" in doc["_source"]
|
|
293
|
+
if "siblings" in doc["_source"] and doc["_source"]["siblings"]
|
|
398
294
|
else []
|
|
399
295
|
),
|
|
400
296
|
"isView": (
|
|
401
297
|
"View" in doc["_source"]["typeNames"]
|
|
402
|
-
if "typeNames" in doc["_source"]
|
|
298
|
+
if "typeNames" in doc["_source"] and doc["_source"]["typeNames"]
|
|
403
299
|
else False
|
|
404
300
|
),
|
|
405
301
|
}
|
|
@@ -882,7 +778,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
882
778
|
if self.config.streaming_mode:
|
|
883
779
|
wdf = self.load_es_data_to_lf(
|
|
884
780
|
index="dataset_operationaspect_v1",
|
|
885
|
-
query=
|
|
781
|
+
query=QueryBuilder.get_dataset_write_usage_raw_query(
|
|
782
|
+
self.config.lookback_days
|
|
783
|
+
),
|
|
886
784
|
read_function=self.write_stat_raw_batch,
|
|
887
785
|
schema={"urn": polars.Categorical, "platform": polars.Categorical},
|
|
888
786
|
)
|
|
@@ -891,7 +789,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
891
789
|
wdf = polars.LazyFrame(
|
|
892
790
|
self.load_data_from_es(
|
|
893
791
|
"dataset_operationaspect_v1",
|
|
894
|
-
|
|
792
|
+
QueryBuilder.get_dataset_write_usage_raw_query(
|
|
793
|
+
self.config.lookback_days
|
|
794
|
+
),
|
|
895
795
|
self.write_stat_raw_batch,
|
|
896
796
|
),
|
|
897
797
|
schema={"urn": polars.Categorical, "platform": polars.Categorical},
|
|
@@ -918,12 +818,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
918
818
|
def load_write_usage_server_side_aggregation(
|
|
919
819
|
self, soft_deleted_entities_df: polars.LazyFrame
|
|
920
820
|
) -> polars.LazyFrame:
|
|
921
|
-
query: Dict = DATASET_WRITE_USAGE_COMPOSITE_QUERY
|
|
922
|
-
query["aggs"]["urn_count"]["composite"]["size"] = self.config.extract_batch_size
|
|
923
821
|
wdf = polars.LazyFrame(
|
|
924
822
|
self.load_data_from_es(
|
|
925
823
|
"dataset_operationaspect_v1",
|
|
926
|
-
|
|
824
|
+
QueryBuilder.get_dataset_write_usage_composite_query(
|
|
825
|
+
self.config.lookback_days
|
|
826
|
+
),
|
|
927
827
|
self.write_stat_batch,
|
|
928
828
|
aggregation_key="urn_count",
|
|
929
829
|
),
|
|
@@ -954,7 +854,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
954
854
|
upstreams_lf = polars.LazyFrame(
|
|
955
855
|
self.load_data_from_es(
|
|
956
856
|
"graph_service_v1",
|
|
957
|
-
|
|
857
|
+
QueryBuilder.get_upstreams_query(),
|
|
958
858
|
self.upstream_lineage_batch,
|
|
959
859
|
),
|
|
960
860
|
schema={
|
|
@@ -1289,7 +1189,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1289
1189
|
soft_deleted_df = polars.LazyFrame(
|
|
1290
1190
|
self.load_data_from_es(
|
|
1291
1191
|
index=entity_index,
|
|
1292
|
-
query=
|
|
1192
|
+
query=QueryBuilder.get_soft_deleted_entities_query(),
|
|
1293
1193
|
process_function=self.soft_deleted_batch,
|
|
1294
1194
|
),
|
|
1295
1195
|
schema={
|
|
@@ -1305,7 +1205,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1305
1205
|
lf: polars.LazyFrame = polars.LazyFrame(
|
|
1306
1206
|
self.load_data_from_es(
|
|
1307
1207
|
index=usage_index,
|
|
1308
|
-
query=
|
|
1208
|
+
query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
|
|
1309
1209
|
process_function=self.process_dashboard_usage,
|
|
1310
1210
|
),
|
|
1311
1211
|
schema={
|
|
@@ -1405,7 +1305,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1405
1305
|
query_entities = polars.LazyFrame(
|
|
1406
1306
|
self.load_data_from_es(
|
|
1407
1307
|
index=entity_index,
|
|
1408
|
-
query=
|
|
1308
|
+
query=QueryBuilder.get_query_entities_query(),
|
|
1409
1309
|
process_function=self.queries_entities_batch,
|
|
1410
1310
|
),
|
|
1411
1311
|
schema={
|
|
@@ -1420,7 +1320,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1420
1320
|
lf: polars.LazyFrame = polars.LazyFrame(
|
|
1421
1321
|
self.load_data_from_es(
|
|
1422
1322
|
index=usage_index,
|
|
1423
|
-
query=
|
|
1323
|
+
query=QueryBuilder.get_query_usage_query(self.config.lookback_days),
|
|
1424
1324
|
process_function=self.process_query_usage,
|
|
1425
1325
|
),
|
|
1426
1326
|
schema={
|
|
@@ -1484,7 +1384,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1484
1384
|
lf: polars.LazyFrame = polars.LazyFrame(
|
|
1485
1385
|
self.load_data_from_es(
|
|
1486
1386
|
index=index,
|
|
1487
|
-
query=
|
|
1387
|
+
query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
|
|
1488
1388
|
process_function=self.process_batch,
|
|
1489
1389
|
),
|
|
1490
1390
|
schema={
|
|
@@ -1576,7 +1476,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1576
1476
|
datasets_df = polars.LazyFrame(
|
|
1577
1477
|
self.load_data_from_es(
|
|
1578
1478
|
index="datasetindex_v2",
|
|
1579
|
-
query=
|
|
1479
|
+
query=QueryBuilder.get_soft_deleted_entities_query(),
|
|
1580
1480
|
process_function=self.soft_deleted_batch,
|
|
1581
1481
|
),
|
|
1582
1482
|
schema={
|