acryl-datahub-cloud 0.3.8rc0__py3-none-any.whl → 0.3.8rc1__py3-none-any.whl

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub-cloud might be problematic. Click here for more details.

Files changed (33)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/acryl_cs_issues/source.py +0 -1
  3. acryl_datahub_cloud/datahub_metadata_sharing/__init__.py +0 -0
  4. acryl_datahub_cloud/datahub_metadata_sharing/metadata_sharing_source.py +262 -0
  5. acryl_datahub_cloud/datahub_metadata_sharing/query.py +7 -0
  6. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +0 -2
  7. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +0 -1
  8. acryl_datahub_cloud/datahub_reporting/extract_graph.py +0 -1
  9. acryl_datahub_cloud/datahub_reporting/extract_sql.py +0 -1
  10. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +163 -0
  11. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +29 -129
  12. acryl_datahub_cloud/metadata/_urns/urn_defs.py +1612 -1567
  13. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  14. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/executor/__init__.py +15 -0
  15. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  16. acryl_datahub_cloud/metadata/schema.avsc +25096 -25347
  17. acryl_datahub_cloud/metadata/schema_classes.py +807 -503
  18. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  19. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +2 -1
  20. acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +63 -0
  21. acryl_datahub_cloud/metadata/schemas/EntityTypeKey.avsc +1 -0
  22. acryl_datahub_cloud/metadata/schemas/ExecutionRequestInput.avsc +9 -0
  23. acryl_datahub_cloud/metadata/schemas/ExecutionRequestResult.avsc +14 -0
  24. acryl_datahub_cloud/metadata/schemas/PostInfo.avsc +23 -0
  25. acryl_datahub_cloud/metadata/schemas/RemoteExecutorKey.avsc +21 -0
  26. acryl_datahub_cloud/metadata/schemas/RemoteExecutorStatus.avsc +80 -0
  27. acryl_datahub_cloud/metadata/schemas/VersionProperties.avsc +4 -0
  28. acryl_datahub_cloud/metadata/schemas/__init__.py +3 -3
  29. {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc1.dist-info}/METADATA +34 -33
  30. {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc1.dist-info}/RECORD +33 -25
  31. {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc1.dist-info}/WHEEL +1 -1
  32. {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc1.dist-info}/entry_points.txt +1 -0
  33. {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc1.dist-info}/top_level.txt +0 -0
@@ -20,6 +20,7 @@ from opensearchpy import OpenSearch
20
20
  from pydantic import Field
21
21
  from scipy.stats import expon
22
22
 
23
+ from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
23
24
  from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
24
25
  UsageFeaturePatchBuilder,
25
26
  )
@@ -58,119 +59,6 @@ platform_regexp = re.compile(r"urn:li:dataset:\(urn:li:dataPlatform:(.+?),.*")
58
59
  dashboard_chart_platform_regexp = re.compile(r"urn:li:(?:dashboard|chart):\((.+?),.*")
59
60
  dbt_platform_regexp = re.compile(r"urn:li:dataset:\(urn:li:dataPlatform:dbt,.*\)")
60
61
 
61
- GET_SOFT_DELETED_ENTITIES = {
62
- "sort": [{"urn": {"order": "asc"}}],
63
- }
64
-
65
- GET_QUERY_ENTITIES = {
66
- "sort": [{"urn": {"order": "asc"}}],
67
- "query": {
68
- "bool": {
69
- "filter": {
70
- "bool": {
71
- "must_not": [
72
- {"term": {"source": "MANUAL"}},
73
- ]
74
- }
75
- }
76
- }
77
- },
78
- }
79
-
80
- GET_UPSTREAMS = {
81
- "sort": [{"destination.urn": {"order": "asc"}}],
82
- "query": {
83
- "bool": {
84
- "must": [
85
- {"terms": {"destination.entityType": ["dataset"]}},
86
- {"terms": {"source.entityType": ["dataset"]}},
87
- ]
88
- }
89
- },
90
- }
91
-
92
- GET_DASHBOARD_USAGE_QUERY = {
93
- "sort": [{"urn": {"order": "asc"}}],
94
- "query": {
95
- "bool": {
96
- "filter": {
97
- "bool": {
98
- "must": [
99
- {"range": {"@timestamp": {"gte": "now-30d", "lt": "now/d"}}},
100
- {"term": {"isExploded": False}},
101
- ]
102
- }
103
- }
104
- }
105
- },
106
- }
107
-
108
- GET_DATASET_USAGE_QUERY = {
109
- "sort": [{"urn": {"order": "asc"}}],
110
- "query": {
111
- "bool": {
112
- "filter": {
113
- "bool": {
114
- "must": [
115
- {"range": {"@timestamp": {"gte": "now-30d/d", "lt": "now/d"}}},
116
- {"term": {"isExploded": False}},
117
- {"range": {"totalSqlQueries": {"gt": 0}}},
118
- ]
119
- }
120
- }
121
- }
122
- },
123
- }
124
-
125
- DATASET_WRITE_USAGE_RAW_QUERY = {
126
- "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
127
- "query": {
128
- "bool": {
129
- "must": [
130
- {"range": {"@timestamp": {"gte": "now-30d/d", "lte": "now/d"}}},
131
- {"terms": {"operationType": ["INSERT", "UPDATE", "CREATE"]}},
132
- ]
133
- }
134
- },
135
- "_source": {
136
- "includes": ["urn", "@timestamp"],
137
- },
138
- }
139
-
140
- DATASET_WRITE_USAGE_COMPOSITE_QUERY = {
141
- "query": {
142
- "bool": {
143
- "must": [
144
- {"range": {"@timestamp": {"gte": "now-30d/d", "lte": "now/d"}}},
145
- {"terms": {"operationType": ["INSERT", "UPDATE", "CREATE"]}},
146
- ]
147
- }
148
- },
149
- "aggs": {
150
- "urn_count": {
151
- "composite": {
152
- "sources": [{"dataset_operationaspect_v1": {"terms": {"field": "urn"}}}]
153
- }
154
- }
155
- },
156
- }
157
-
158
- GET_QUERY_USAGE_QUERY = {
159
- "sort": [{"urn": {"order": "asc"}}],
160
- "query": {
161
- "bool": {
162
- "filter": {
163
- "bool": {
164
- "must": [
165
- {"range": {"@timestamp": {"gte": "now-30d/d", "lt": "now/d"}}},
166
- {"term": {"isExploded": False}},
167
- ]
168
- }
169
- }
170
- }
171
- },
172
- }
173
-
174
62
 
175
63
  class S3ClientConfig(ConfigModel):
176
64
  bucket: str = os.getenv("DATA_BUCKET", "")
@@ -206,7 +94,13 @@ class RankingPolicy(ConfigModel):
206
94
  regexp_based_factors: List[RegexpFactor] = []
207
95
 
208
96
 
209
- class DataHubUsageFeatureReportingSourceConfig(StatefulIngestionConfigBase):
97
+ class DataHubUsageFeatureReportingSourceConfig(
98
+ ConfigModel, StatefulIngestionConfigBase
99
+ ):
100
+ lookback_days: int = Field(
101
+ 30, description="Number of days to look back for usage data."
102
+ )
103
+
210
104
  server: Optional[DatahubClientConfig] = Field(
211
105
  None, description="Optional configuration for the DataHub server connection."
212
106
  )
@@ -381,25 +275,27 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
381
275
  "last_modified_at": (
382
276
  doc["_source"]["lastModifiedAt"]
383
277
  if "lastModifiedAt" in doc["_source"]
278
+ and doc["_source"]["lastModifiedAt"]
384
279
  else (
385
280
  doc["_source"]["lastModifiedAt"]
386
281
  if "lastModifiedAt" in doc["_source"]
282
+ and doc["_source"]["lastModifiedAt"]
387
283
  else None
388
284
  )
389
285
  ),
390
286
  "removed": (
391
287
  doc["_source"]["removed"]
392
- if "removed" in doc["_source"]
288
+ if "removed" in doc["_source"] and doc["_source"]["removed"]
393
289
  else False
394
290
  ),
395
291
  "siblings": (
396
292
  doc["_source"]["siblings"]
397
- if "siblings" in doc["_source"]
293
+ if "siblings" in doc["_source"] and doc["_source"]["siblings"]
398
294
  else []
399
295
  ),
400
296
  "isView": (
401
297
  "View" in doc["_source"]["typeNames"]
402
- if "typeNames" in doc["_source"]
298
+ if "typeNames" in doc["_source"] and doc["_source"]["typeNames"]
403
299
  else False
404
300
  ),
405
301
  }
@@ -882,7 +778,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
882
778
  if self.config.streaming_mode:
883
779
  wdf = self.load_es_data_to_lf(
884
780
  index="dataset_operationaspect_v1",
885
- query=DATASET_WRITE_USAGE_RAW_QUERY,
781
+ query=QueryBuilder.get_dataset_write_usage_raw_query(
782
+ self.config.lookback_days
783
+ ),
886
784
  read_function=self.write_stat_raw_batch,
887
785
  schema={"urn": polars.Categorical, "platform": polars.Categorical},
888
786
  )
@@ -891,7 +789,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
891
789
  wdf = polars.LazyFrame(
892
790
  self.load_data_from_es(
893
791
  "dataset_operationaspect_v1",
894
- DATASET_WRITE_USAGE_RAW_QUERY,
792
+ QueryBuilder.get_dataset_write_usage_raw_query(
793
+ self.config.lookback_days
794
+ ),
895
795
  self.write_stat_raw_batch,
896
796
  ),
897
797
  schema={"urn": polars.Categorical, "platform": polars.Categorical},
@@ -918,12 +818,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
918
818
  def load_write_usage_server_side_aggregation(
919
819
  self, soft_deleted_entities_df: polars.LazyFrame
920
820
  ) -> polars.LazyFrame:
921
- query: Dict = DATASET_WRITE_USAGE_COMPOSITE_QUERY
922
- query["aggs"]["urn_count"]["composite"]["size"] = self.config.extract_batch_size
923
821
  wdf = polars.LazyFrame(
924
822
  self.load_data_from_es(
925
823
  "dataset_operationaspect_v1",
926
- DATASET_WRITE_USAGE_COMPOSITE_QUERY,
824
+ QueryBuilder.get_dataset_write_usage_composite_query(
825
+ self.config.lookback_days
826
+ ),
927
827
  self.write_stat_batch,
928
828
  aggregation_key="urn_count",
929
829
  ),
@@ -954,7 +854,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
954
854
  upstreams_lf = polars.LazyFrame(
955
855
  self.load_data_from_es(
956
856
  "graph_service_v1",
957
- GET_UPSTREAMS,
857
+ QueryBuilder.get_upstreams_query(),
958
858
  self.upstream_lineage_batch,
959
859
  ),
960
860
  schema={
@@ -1289,7 +1189,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1289
1189
  soft_deleted_df = polars.LazyFrame(
1290
1190
  self.load_data_from_es(
1291
1191
  index=entity_index,
1292
- query=GET_SOFT_DELETED_ENTITIES,
1192
+ query=QueryBuilder.get_soft_deleted_entities_query(),
1293
1193
  process_function=self.soft_deleted_batch,
1294
1194
  ),
1295
1195
  schema={
@@ -1305,7 +1205,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1305
1205
  lf: polars.LazyFrame = polars.LazyFrame(
1306
1206
  self.load_data_from_es(
1307
1207
  index=usage_index,
1308
- query=GET_DASHBOARD_USAGE_QUERY,
1208
+ query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
1309
1209
  process_function=self.process_dashboard_usage,
1310
1210
  ),
1311
1211
  schema={
@@ -1405,7 +1305,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1405
1305
  query_entities = polars.LazyFrame(
1406
1306
  self.load_data_from_es(
1407
1307
  index=entity_index,
1408
- query=GET_QUERY_ENTITIES,
1308
+ query=QueryBuilder.get_query_entities_query(),
1409
1309
  process_function=self.queries_entities_batch,
1410
1310
  ),
1411
1311
  schema={
@@ -1420,7 +1320,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1420
1320
  lf: polars.LazyFrame = polars.LazyFrame(
1421
1321
  self.load_data_from_es(
1422
1322
  index=usage_index,
1423
- query=GET_QUERY_USAGE_QUERY,
1323
+ query=QueryBuilder.get_query_usage_query(self.config.lookback_days),
1424
1324
  process_function=self.process_query_usage,
1425
1325
  ),
1426
1326
  schema={
@@ -1484,7 +1384,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1484
1384
  lf: polars.LazyFrame = polars.LazyFrame(
1485
1385
  self.load_data_from_es(
1486
1386
  index=index,
1487
- query=GET_DATASET_USAGE_QUERY,
1387
+ query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
1488
1388
  process_function=self.process_batch,
1489
1389
  ),
1490
1390
  schema={
@@ -1576,7 +1476,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1576
1476
  datasets_df = polars.LazyFrame(
1577
1477
  self.load_data_from_es(
1578
1478
  index="datasetindex_v2",
1579
- query=GET_SOFT_DELETED_ENTITIES,
1479
+ query=QueryBuilder.get_soft_deleted_entities_query(),
1580
1480
  process_function=self.soft_deleted_batch,
1581
1481
  ),
1582
1482
  schema={