acryl-datahub-cloud 0.3.12rc1__py3-none-any.whl → 0.3.12rc3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub-cloud might be problematic.

Files changed (70)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +524 -0
  3. acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
  4. acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
  5. acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
  6. acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
  7. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
  8. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +49 -40
  9. acryl_datahub_cloud/metadata/_urns/urn_defs.py +1842 -1786
  10. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  11. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +4 -0
  12. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
  13. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  14. acryl_datahub_cloud/metadata/schema.avsc +24747 -23945
  15. acryl_datahub_cloud/metadata/schema_classes.py +1031 -631
  16. acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
  17. acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +72 -0
  18. acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
  19. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +31 -7
  20. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +27 -6
  21. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +31 -7
  22. acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +14 -0
  23. acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
  24. acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
  25. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  26. acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
  27. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +2 -1
  28. acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
  29. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
  30. acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  31. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
  32. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +1 -0
  33. acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +1 -1
  34. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +1 -0
  35. acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
  36. acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
  37. acryl_datahub_cloud/metadata/schemas/FormKey.avsc +2 -1
  38. acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
  39. acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +3 -0
  40. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +22 -0
  41. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +1 -0
  42. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +1 -0
  43. acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  44. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +1 -0
  45. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +1 -0
  46. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  47. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +12 -1
  48. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +27 -6
  49. acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
  50. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +1 -0
  51. acryl_datahub_cloud/notifications/__init__.py +0 -0
  52. acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
  53. acryl_datahub_cloud/sdk/__init__.py +25 -0
  54. acryl_datahub_cloud/{_sdk_extras → sdk}/assertion.py +202 -45
  55. acryl_datahub_cloud/{_sdk_extras → sdk}/assertion_input.py +344 -83
  56. acryl_datahub_cloud/{_sdk_extras → sdk}/assertions_client.py +635 -199
  57. acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
  58. acryl_datahub_cloud/{_sdk_extras → sdk}/entities/assertion.py +1 -1
  59. acryl_datahub_cloud/{_sdk_extras → sdk}/subscription_client.py +146 -33
  60. {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc3.dist-info}/METADATA +48 -43
  61. {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc3.dist-info}/RECORD +69 -54
  62. {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc3.dist-info}/entry_points.txt +1 -0
  63. acryl_datahub_cloud/_sdk_extras/__init__.py +0 -19
  64. /acryl_datahub_cloud/{_sdk_extras/entities → datahub_forms_notifications}/__init__.py +0 -0
  65. /acryl_datahub_cloud/{_sdk_extras → sdk}/entities/monitor.py +0 -0
  66. /acryl_datahub_cloud/{_sdk_extras → sdk}/entities/subscription.py +0 -0
  67. /acryl_datahub_cloud/{_sdk_extras → sdk}/errors.py +0 -0
  68. /acryl_datahub_cloud/{_sdk_extras → sdk}/resolver_client.py +0 -0
  69. {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc3.dist-info}/WHEEL +0 -0
  70. {acryl_datahub_cloud-0.3.12rc1.dist-info → acryl_datahub_cloud-0.3.12rc3.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/datahub_usage_reporting/query_builder.py

@@ -1,3 +1,4 @@
+from datetime import datetime, timedelta
 from typing import Dict
 
 
@@ -5,7 +6,7 @@ class QueryBuilder:
     @staticmethod
     def get_dataset_entities_query() -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "urn",
@@ -19,15 +20,54 @@ class QueryBuilder:
         }
 
     @staticmethod
-    def get_query_entities_query() -> Dict:
+    def get_query_entities_query(days: int) -> Dict:
+        thirty_days_ago = datetime.now() - timedelta(days=days)
+        thirty_days_ago = thirty_days_ago.replace(
+            hour=0, minute=0, second=0, microsecond=0
+        )
+        epoch_ms = int(thirty_days_ago.timestamp() * 1000)
+
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {"includes": ["urn", "lastModifiedAt", "platform", "removed"]},
             "query": {
                 "bool": {
                     "filter": [
                         {"bool": {"must_not": [{"term": {"source": "MANUAL"}}]}},
                         {"exists": {"field": "platform"}},
+                        {
+                            "bool": {
+                                "should": [
+                                    {
+                                        "bool": {
+                                            "filter": [
+                                                {"exists": {"field": "lastModifiedAt"}},
+                                                {
+                                                    "range": {
+                                                        "lastModifiedAt": {
+                                                            "gte": epoch_ms
+                                                        }
+                                                    }
+                                                },
+                                            ]
+                                        }
+                                    },
+                                    {
+                                        "bool": {
+                                            "must_not": {
+                                                "exists": {"field": "lastModifiedAt"}
+                                            },
+                                            "filter": {
+                                                "range": {
+                                                    "createdAt": {"gte": epoch_ms}
+                                                }
+                                            },
+                                        }
+                                    },
+                                ],
+                                "minimum_should_match": 1,
+                            }
+                        },
                     ]
                 }
             },
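The new clause restricts query entities to a lookback window: a document passes if its lastModifiedAt falls within the last `days` days or, when lastModifiedAt is absent, if its createdAt does. A rough Python sketch of the equivalent predicate (the function and its arguments are illustrative; only the two field names come from the query):

    from datetime import datetime, timedelta
    from typing import Optional

    def is_recent(
        last_modified_at: Optional[int],  # epoch millis, may be absent
        created_at: Optional[int],  # epoch millis
        days: int,
    ) -> bool:
        # Same cutoff as get_query_entities_query: midnight, `days` days ago.
        cutoff = (datetime.now() - timedelta(days=days)).replace(
            hour=0, minute=0, second=0, microsecond=0
        )
        epoch_ms = int(cutoff.timestamp() * 1000)
        if last_modified_at is not None:
            return last_modified_at >= epoch_ms
        return created_at is not None and created_at >= epoch_ms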
@@ -36,7 +76,7 @@ class QueryBuilder:
     @staticmethod
     def get_upstreams_query() -> Dict:
         return {
-            "sort": [{"destination.urn": {"order": "asc"}}],
+            # "sort": [{"destination.urn": {"order": "asc"}}],
             "_source": {"includes": ["source.urn", "destination.urn"]},
             "query": {
                 "bool": {
@@ -51,7 +91,7 @@ class QueryBuilder:
     @staticmethod
     def get_dashboard_usage_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "timestampMillis",
@@ -80,7 +120,7 @@ class QueryBuilder:
     @staticmethod
     def get_dataset_usage_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "timestampMillis",
@@ -110,7 +150,7 @@ class QueryBuilder:
     @staticmethod
     def get_dataset_write_usage_raw_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "urn"  # Only field needed for platform extraction via regex
@@ -159,7 +199,7 @@ class QueryBuilder:
     @staticmethod
     def get_query_usage_query(days: int) -> Dict:
         return {
-            "sort": [{"urn": {"order": "asc"}}],
+            # "sort": [{"urn": {"order": "asc"}}],
             "_source": {
                 "includes": [
                     "timestampMillis",
acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py

@@ -114,12 +114,12 @@ class DataHubUsageFeatureReportingSourceConfig(
         30, description="Timeout in seconds for the search queries."
     )
     extract_batch_size: int = Field(
-        1000,
+        5000,
         description="The number of documents to retrieve in each batch from ElasticSearch or OpenSearch.",
     )
 
     extract_delay: Optional[float] = Field(
-        0.25,
+        0,
         description="The delay in seconds between each batch extraction from ElasticSearch or OpenSearch.",
     )
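rc3 raises the default extract_batch_size from 1000 to 5000 and drops the default extract_delay from 0.25s to 0, so extraction pulls larger batches back-to-back. A minimal sketch of restoring the rc1 behaviour by overriding the new defaults; the import path is inferred from the file list above, and the config class may require further fields in practice:

    # Sketch only: import path inferred from this wheel's layout; the config
    # class may require additional fields beyond the two overridden here.
    from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_reporter import (
        DataHubUsageFeatureReportingSourceConfig,
    )

    config = DataHubUsageFeatureReportingSourceConfig.parse_obj(
        {
            "extract_batch_size": 1000,  # rc1 default; rc3 default is 5000
            "extract_delay": 0.25,  # rc1 default; rc3 default is 0
        }
    )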
@@ -177,7 +177,7 @@ class DataHubUsageFeatureReportingSourceConfig(
     # This option is only needed here until we are sure that the streaming mode is stable.
     # then we can remove it and control it with the streaming_mode option.
     experimental_full_streaming: bool = Field(
-        False,
+        True,
         description="Flag to enable full streaming mode.'",
     )
 
@@ -617,11 +617,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 ),
             )
 
-            response = server.create_pit(index, keep_alive="10m")
+            # response = server.create_pit(index, keep_alive="10m")
 
             # TODO: Save PIT, we can resume processing based on <pit, search_after> tuple
-            pit = response.get("pit_id")
-            query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
+            # pit = response.get("pit_id")
+            # query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
         else:
             server = Elasticsearch(
                 [endpoint],
@@ -834,7 +834,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .drop(["removed"])
         )
 
-        return wdf.collect(streaming=self.config.streaming_mode).lazy()
+        return wdf
 
     def load_write_usage_server_side_aggregation(
         self, soft_deleted_entities_df: polars.LazyFrame
@@ -995,7 +995,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
         for row in lazy_frame.collect(
-            streaming=self.config.experimental_full_streaming
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
         ).to_struct():
             if "siblings" in row and row["siblings"]:
                 logger.info(f"Siblings found for urn: {row['urn']} -> row['siblings']")
@@ -1086,7 +1086,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_query_usage_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        for row in lazy_frame.collect().iter_rows(named=True):
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
             query_usage_features = QueryUsageFeaturesClass(
                 queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
                 queryCountTotal=None,  # This is not implemented
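Both collect() sites move off the boolean streaming= flag: recent polars releases deprecate LazyFrame.collect(streaming=True) in favour of an engine= argument accepting values such as "auto", "in-memory", and "streaming". A minimal standalone sketch of the new spelling (data is illustrative):

    import polars as pl

    lf = pl.LazyFrame({"urn": ["a", "b", "c"], "totalSqlQueries": [3, 0, 2]})

    # Old spelling, deprecated in recent polars: lf.collect(streaming=True)
    # New spelling, as used in the diff; "auto" lets polars pick the engine.
    df = lf.collect(engine="streaming")
    for row in df.iter_rows(named=True):
        print(row["urn"], row["totalSqlQueries"])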
@@ -1308,7 +1310,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         query_entities = self.load_data_from_es_to_lf(
             schema=query_entities_schema,
             index=entity_index,
-            query=QueryBuilder.get_query_entities_query(),
+            query=QueryBuilder.get_query_entities_query(self.config.lookback_days),
             process_function=self.queries_entities_batch,
         )
 
@@ -1485,11 +1487,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         # called `Option::unwrap()` on a `None` value
         # Which only happens if we don't collect immediately
         # return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
-        return (
-            polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
-            .collect()
-            .lazy()
-        )
+        return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
 
     def load_dataset_usage(self) -> polars.LazyFrame:
         index = "dataset_datasetusagestatisticsaspect_v1"
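The helper now returns the scan_parquet plan directly instead of eagerly collecting it and re-wrapping with .lazy(), so the parquet read is deferred until a downstream collect(). A tiny sketch of the difference, with an illustrative file name:

    import polars as pl

    pl.DataFrame({"urn": ["a", "b"], "removed": [False, True]}).write_parquet(
        "example.parquet"  # illustrative temp file
    )

    # Old shape: eager read, then re-wrapped as lazy (whole file in memory).
    eager_then_lazy = pl.read_parquet("example.parquet").lazy()

    # New shape: a deferred plan; I/O only happens at collect().
    lazy = pl.scan_parquet("example.parquet")
    result = lazy.filter(~pl.col("removed")).collect()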
@@ -1606,23 +1604,40 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         delay: Optional[float] = None,
     ) -> Iterable[Dict[str, Any]]:
         processed_count = 0
+        scroll_id = None
         while True:
             with PerfTimer() as timer:
                 logger.debug(f"ES query: {query}")
-                results = server.search(
-                    body=query,
-                    size=batch_size,
-                    index=(
-                        index
-                        if not self.config.search_index.opensearch_dialect
-                        else None
-                    ),
-                    params=(
-                        {"timeout": self.config.query_timeout}
-                        if self.config.search_index.opensearch_dialect
-                        else {"request_timeout": self.config.query_timeout}
-                    ),
-                )
+                if not scroll_id:
+                    logger.debug(
+                        f"Getting inital data from index {index} without scroll id"
+                    )
+                    results = server.search(
+                        body=query,
+                        size=batch_size,
+                        scroll="2m",
+                        index=index,
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                else:
+                    logger.debug(
+                        f"Getting data from index {index} using scroll_id: {scroll_id}"
+                    )
+                    results = server.scroll(
+                        scroll_id=scroll_id,
+                        scroll="2m",
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                scroll_id = results["_scroll_id"]
+
             if not aggregation_key:
                 yield from process_function(results["hits"]["hits"])
 
@@ -1633,7 +1648,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 )
                 if len(results["hits"]["hits"]) < batch_size:
                     break
-                query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
             else:
                 yield from process_function(
                     results["aggregations"][aggregation_key]["buckets"]
@@ -1643,16 +1657,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                     < batch_size
                 ):
                     break
-                if "after_key" in results["aggregations"][aggregation_key]:
-                    query["aggs"][aggregation_key]["composite"]["after"] = results[
-                        "aggregations"
-                    ][aggregation_key]["after_key"]
-
-                if delay:
-                    logger.debug(
-                        f"Sleeping for {delay} seconds before getting next batch from ES"
-                    )
-                    time.sleep(delay)
+            if delay:
+                logger.debug(
+                    f"Sleeping for {delay} seconds before getting next batch from ES"
+                )
+                time.sleep(delay)
 
     def get_report(self) -> SourceReport:
         return self.report
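Taken together, these hunks replace PIT plus sorted search_after pagination (and the composite-aggregation after_key bookkeeping) with the scroll API, which is also why the sort clauses in query_builder.py are commented out: scroll keeps its own cursor server-side. A minimal standalone sketch of the same pattern using elasticsearch-py 7.x-style calls; the endpoint, index, and query here are placeholders:

    from elasticsearch import Elasticsearch

    # Placeholders: point these at a real cluster and index to run.
    server = Elasticsearch(["http://localhost:9200"])
    index = "queryindex_v2"
    query = {"query": {"match_all": {}}}
    batch_size = 1000

    # The first page opens a scroll context kept alive for 2 minutes.
    results = server.search(index=index, body=query, size=batch_size, scroll="2m")
    scroll_id = results["_scroll_id"]
    try:
        while True:
            hits = results["hits"]["hits"]
            for hit in hits:
                ...  # process each document, as process_function does in the diff
            if len(hits) < batch_size:
                break  # short page: no more results
            # Later pages are fetched by scroll id, not by re-running the query.
            results = server.scroll(scroll_id=scroll_id, scroll="2m")
            scroll_id = results["_scroll_id"]
    finally:
        server.clear_scroll(scroll_id=scroll_id)  # release the server-side context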