acryl-datahub-cloud 0.3.10.4__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This release of acryl-datahub-cloud has been flagged as potentially problematic.
Files changed (33)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/_sdk_extras/__init__.py +4 -0
  3. acryl_datahub_cloud/_sdk_extras/assertion.py +15 -0
  4. acryl_datahub_cloud/_sdk_extras/assertions_client.py +23 -0
  5. acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
  6. acryl_datahub_cloud/action_request/action_request_owner_source.py +1 -2
  7. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +3 -7
  8. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +9 -5
  9. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +14 -32
  10. acryl_datahub_cloud/metadata/_urns/urn_defs.py +1849 -1793
  11. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
  12. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  13. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/monitor/__init__.py +6 -0
  14. acryl_datahub_cloud/metadata/schema.avsc +25538 -25429
  15. acryl_datahub_cloud/metadata/schema_classes.py +861 -676
  16. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +25 -0
  17. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +25 -0
  18. acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +2 -1
  19. acryl_datahub_cloud/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  20. acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +4 -2
  21. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  22. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  23. acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +3 -0
  24. acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  25. acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +36 -26
  26. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +58 -0
  27. acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +4 -2
  28. acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +86 -0
  29. {acryl_datahub_cloud-0.3.10.4.dist-info → acryl_datahub_cloud-0.3.11.dist-info}/METADATA +43 -43
  30. {acryl_datahub_cloud-0.3.10.4.dist-info → acryl_datahub_cloud-0.3.11.dist-info}/RECORD +33 -28
  31. {acryl_datahub_cloud-0.3.10.4.dist-info → acryl_datahub_cloud-0.3.11.dist-info}/WHEEL +1 -1
  32. {acryl_datahub_cloud-0.3.10.4.dist-info → acryl_datahub_cloud-0.3.11.dist-info}/entry_points.txt +0 -0
  33. {acryl_datahub_cloud-0.3.10.4.dist-info → acryl_datahub_cloud-0.3.11.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/_codegen_config.json
@@ -1,6 +1,6 @@
 {
     "name": "acryl-datahub-cloud",
-    "version": "0.3.10.4",
+    "version": "0.3.11",
     "install_requires": [
         "avro-gen3==0.7.16",
         "acryl-datahub"
acryl_datahub_cloud/_sdk_extras/__init__.py
@@ -0,0 +1,4 @@
+from acryl_datahub_cloud._sdk_extras.assertion import Assertion
+from acryl_datahub_cloud._sdk_extras.assertions_client import AssertionsClient
+
+__all__ = ["Assertion", "AssertionsClient"]
acryl_datahub_cloud/_sdk_extras/assertion.py
@@ -0,0 +1,15 @@
+"""
+This file contains the Assertion class, which is used to represent an assertion in DataHub.
+
+The Assertion class is currently not implemented, this is a placeholder for future implementation.
+"""
+
+from typing import Union
+
+from datahub.metadata.urns import AssertionUrn
+
+
+class Assertion:
+    def __init__(self, urn: Union[str, AssertionUrn]):
+        print(f"The Assertion class is currently not implemented. Urn provided: {urn}")
+        self.urn = AssertionUrn(urn)
acryl_datahub_cloud/_sdk_extras/assertions_client.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Union
+
+from acryl_datahub_cloud._sdk_extras.assertion import Assertion
+from datahub.metadata.urns import AssertionUrn
+
+if TYPE_CHECKING:
+    from datahub.sdk.main_client import DataHubClient
+
+
+class AssertionsClient:
+    def __init__(self, client: DataHubClient):
+        self.client = client
+
+    def get_assertions(
+        self, urn: Union[str, list[str], AssertionUrn, list[AssertionUrn]]
+    ) -> list[Assertion]:
+        print(
+            "get_assertions is not implemented, this is a placeholder. Returning empty list."
+        )
+        print(f"urn provided: {urn}")
+        return []
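
Note: a minimal usage sketch of how these new placeholder classes fit together, assuming a reachable DataHub instance; the server URL and token below are hypothetical, and in this release the client method only prints notices:

    from datahub.sdk.main_client import DataHubClient

    from acryl_datahub_cloud._sdk_extras import AssertionsClient

    # Hypothetical connection details, for illustration only.
    client = DataHubClient(server="http://localhost:8080", token="<token>")
    assertions_client = AssertionsClient(client)

    # get_assertions() is a stub in this release: it prints a notice and returns [].
    results = assertions_client.get_assertions("urn:li:assertion:abc123")
    assert results == []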
acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py
@@ -219,6 +219,7 @@ class AcrylCustomer:
         self._emitted_containers: Dict[str, bool] = {}

     def _get_owner_from_assignee(self, assignee: ExternalUser) -> str:
+        assert assignee.email, "Assignee must have an email"
         owner_urn_options = [
             urn
             for urn in self.graph.get_urns_by_filter(
@@ -233,7 +234,6 @@ class AcrylCustomer:
             )
         ]
         if not owner_urn_options:
-            assert assignee.email, "Assignee must have an email"
            owner_urn = "urn:li:corpuser:" + assignee.email
            self.graph.emit(
                MetadataChangeProposalWrapper(
acryl_datahub_cloud/action_request/action_request_owner_source.py
@@ -2,6 +2,7 @@ import logging
 from typing import Dict, Iterable, List, Optional

 from datahub.configuration import ConfigModel
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -14,10 +15,8 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     ActionRequestInfoClass,
 )
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.utilities.urns.urn import guess_entity_type

-
 logger = logging.getLogger(__name__)

acryl_datahub_cloud/datahub_reporting/datahub_dataset.py
@@ -163,7 +163,7 @@ class DataHubBasedS3Dataset:
             self.schema = row.arrow_schema()
         else:
             # hail mary: infer schema from the first row and cast everything to string
-            self.schema = pa.schema([(key, pa.string()) for key in row.keys()])
+            self.schema = pa.schema([(key, pa.string()) for key in row])
             self.stringify_row = True

         self._initialize_local_file()
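
Note: the change above works because iterating a dict yields its keys, making the explicit .keys() call redundant; a self-contained spot-check with a hypothetical first row:

    import pyarrow as pa

    row = {"id": 1, "name": "example"}  # hypothetical first row
    schema = pa.schema([(key, pa.string()) for key in row])
    assert schema.names == ["id", "name"]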
@@ -172,7 +172,7 @@ class DataHubBasedS3Dataset:
             self.schema,
             compression=self.config.file_compression,
         )
-        if isinstance(row, BaseModel) or isinstance(row, BaseModelRow):
+        if isinstance(row, (BaseModel, BaseModelRow)):
             # for anything extending BaseModel, we want to use the dict representation
             write_row: Dict[str, Any] = row.dict()
         elif isinstance(row, dict):
@@ -274,11 +274,7 @@ class DataHubBasedS3Dataset:
         self, duckdb_columns: List[Tuple[str, str]]
     ) -> SchemaMetadataClass:
         def get_type_from_dtype(dtype: str) -> SchemaFieldDataTypeClass:
-            if "int" in dtype:
-                return SchemaFieldDataTypeClass(type=NumberTypeClass())
-            elif "float" in dtype:
-                return SchemaFieldDataTypeClass(type=NumberTypeClass())
-            elif "number" in dtype:
+            if "int" in dtype or "float" in dtype or "number" in dtype:
                 return SchemaFieldDataTypeClass(type=NumberTypeClass())
             elif "bool" in dtype:
                 return SchemaFieldDataTypeClass(type=BooleanTypeClass())
acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py
@@ -5,6 +5,9 @@ from enum import Enum
 from typing import Any, Callable, Dict, Iterable, List, Optional

 import pandas as pd
+from pydantic import BaseModel
+
+from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.graph.filters import RawSearchFilterRule
@@ -16,9 +19,6 @@ from datahub.metadata.schema_classes import (
     FormStateClass,
     FormTypeClass,
 )
-from pydantic import BaseModel
-
-from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow

 logger = logging.getLogger(__name__)

@@ -257,6 +257,7 @@ class DataHubFormReportingData(FormData):
             for prompt_id, response_time in zip(
                 search_row.completedFormsCompletedPromptIds,
                 search_row.completedFormsCompletedPromptResponseTimes,
+                strict=False,
             )
             if prompt_id in form_prompts
         }
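
Note: the strict parameter was added to zip() in Python 3.10; strict=False spells out the default truncate-to-shortest behavior (linters such as ruff's B905 flag zip() calls that omit it). A quick illustration, which also applies to the two zip() hunks further below:

    ids = ["p1", "p2", "p3"]
    times = [100, 200]  # one element short

    # strict=False silently truncates to the shortest input (the default).
    assert list(zip(ids, times, strict=False)) == [("p1", 100), ("p2", 200)]

    # strict=True would raise ValueError on the length mismatch instead.
    try:
        list(zip(ids, times, strict=True))
    except ValueError:
        pass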
@@ -289,9 +290,10 @@ class DataHubFormReportingData(FormData):
         on_asset_scanned: Optional[Callable[[str], Any]] = None,
         on_form_scanned: Optional[Callable[[str], Any]] = None,
     ) -> Iterable[FormReportingRow]:
-        extra_fields = [f for f in self.DataHubDatasetSearchRow.__fields__.keys()]
+        extra_fields = [f for f in self.DataHubDatasetSearchRow.__fields__]
+        # TODO: Replace with the new search/filter SDK.
         result = self.graph.get_results_by_filter(
-            extra_or_filters=self.get_form_existence_or_filters(),
+            extra_or_filters=[{"and": self.get_form_existence_or_filters()}],
             extra_source_fields=extra_fields,
             skip_cache=True,
         )
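
Note: the reshaped extra_or_filters argument follows the [{"and": [...]}] shape of DataHub's orFilters input, where each list element is one OR branch containing ANDed criteria. A sketch of the shape only, with hypothetical rules standing in for get_form_existence_or_filters():

    # Hypothetical rules; the real ones come from get_form_existence_or_filters().
    rules = [
        {"field": "completedForms", "condition": "EXISTS", "values": []},
        {"field": "incompleteForms", "condition": "EXISTS", "values": []},
    ]

    # Before: extra_or_filters=rules
    # After: a single OR branch whose criteria are grouped under "and".
    extra_or_filters = [{"and": rules}]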
@@ -388,6 +390,7 @@ class DataHubFormReportingData(FormData):
             for (p, p_response_time) in zip(
                 search_row.incompleteFormsCompletedPromptIds,
                 search_row.incompleteFormsCompletedPromptResponseTimes,
+                strict=False,
             )
             if p in form_prompts
         ]:
@@ -485,6 +488,7 @@ class DataHubFormReportingData(FormData):
             for (p, p_response_time) in zip(
                 search_row.completedFormsCompletedPromptIds,
                 search_row.completedFormsCompletedPromptResponseTimes,
+                strict=False,
             )
             if p in form_prompts
         ]:
acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py
@@ -395,18 +395,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             "last_modified_at": (
                 doc["_source"]["lastModifiedAt"]
                 if "lastModifiedAt" in doc["_source"]
-                else (
-                    doc["_source"]["lastModifiedAt"]
-                    if "lastModifiedAt" in doc["_source"]
-                    else None
-                )
+                else (doc["_source"].get("lastModifiedAt", None))
             ),
             "platform": doc["_source"]["platform"],
-            "removed": (
-                doc["_source"]["removed"]
-                if "removed" in doc["_source"]
-                else False
-            ),
+            "removed": (doc["_source"].get("removed", False)),
         }

         time_taken = timer.elapsed_seconds()
@@ -509,11 +501,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             "eventGranularity": doc["_source"].get("eventGranularity"),
             "totalSqlQueries": doc["_source"].get("totalSqlQueries", 0),
             "uniqueUserCount": doc["_source"].get("uniqueUserCount", 0),
-            "userCounts": (
-                doc["_source"]["event"]["userCounts"]
-                if "userCounts" in doc["_source"]["event"]
-                else None
-            ),
+            "userCounts": (doc["_source"]["event"].get("userCounts", None)),
             "platform": platform,
         }
     except KeyError as e:
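
Note: this hunk and the previous one replace the x[k] if k in x else default pattern with the equivalent dict.get(k, default); the first hunk also collapses a conditional whose two branches had accidentally become identical. The equivalence is easy to spot-check:

    source = {"platform": "snowflake"}  # hypothetical _source document

    verbose = source["removed"] if "removed" in source else False
    concise = source.get("removed", False)
    assert verbose is False
    assert concise is False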
@@ -525,7 +513,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         time_taken = timer.elapsed_seconds()
         logger.info(f"DatasetUsage processing took {time_taken:.3f} seconds")

-    def search_score(
+    def search_score(  # noqa: C901
         self, urn: str, last_update_time: int, usage_percentile: int
     ) -> SearchRankingMultipliers:
         usage_search_score_multiplier = 1.0
@@ -622,10 +610,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             [endpoint],
             http_auth=(user, password),
             use_ssl=(
-                True
-                if self.config.search_index
-                and self.config.search_index.use_ssl
-                else False
+                bool(
+                    self.config.search_index
+                    and self.config.search_index.use_ssl
+                )
             ),
         )

@@ -639,10 +627,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             [endpoint],
             http_auth=(user, password),
             use_ssl=(
-                True
-                if self.config.search_index
-                and self.config.search_index.use_ssl
-                else False
+                bool(
+                    self.config.search_index
+                    and self.config.search_index.use_ssl
+                )
             ),
         )

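Note: in both use_ssl hunks, bool(x and y) replaces the True if x and y else False ternary; the two forms agree for any truthiness of the operands:

    class SearchIndexConfig:  # hypothetical stand-in for the config object
        use_ssl = True

    for search_index in (None, SearchIndexConfig()):
        ternary = True if search_index and search_index.use_ssl else False
        concise = bool(search_index and search_index.use_ssl)
        assert ternary == concise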
@@ -737,7 +725,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             polars.Duration(): pa.duration("ns"),
         }

-        if polars_dtype in [type(key) for key in type_mapping.keys()]:
+        if polars_dtype in [type(key) for key in type_mapping]:
             return type_mapping[polars_dtype]
         elif polars_dtype == polars.Categorical:
             return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
@@ -1006,12 +994,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        num = 0
         for row in lazy_frame.collect(
             streaming=self.config.experimental_full_streaming
         ).to_struct():
-            num += 1
-
             if "siblings" in row and row["siblings"]:
                 logger.info(f"Siblings found for urn: {row['urn']} -> row['siblings']")

@@ -1101,10 +1086,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_query_usage_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        num = 0
         for row in lazy_frame.collect().iter_rows(named=True):
-            num += 1
-
             query_usage_features = QueryUsageFeaturesClass(
                 queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
                 queryCountTotal=None,  # This is not implemented
@@ -1287,7 +1269,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 .is_not_null()
                 # We only want to downrank datasets that have a search score multiplier greater than 1. 1 is the minimum score of a dataset
                 .and_(polars.col("combinedSearchRankingMultiplier").ne(1))
-            )  # noqa: E712
+            )
             .filter(polars.col("removed") == False)  # noqa: E712
             .drop(["removed"])
             .drop(["last_modified_at"])