acryl-datahub-cloud 0.3.7.6__py3-none-any.whl → 0.3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +7 -11
  3. acryl_datahub_cloud/metadata/_urns/urn_defs.py +54 -0
  4. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  5. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  6. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  7. acryl_datahub_cloud/metadata/schema.avsc +207 -19
  8. acryl_datahub_cloud/metadata/schema_classes.py +262 -2
  9. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +1 -1
  10. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +1 -1
  11. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +1 -1
  12. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +2 -1
  13. acryl_datahub_cloud/metadata/schemas/InputFields.avsc +1 -1
  14. acryl_datahub_cloud/metadata/schemas/MLFeatureProperties.avsc +51 -0
  15. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  16. acryl_datahub_cloud/metadata/schemas/MLModelGroupProperties.avsc +51 -0
  17. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +2 -1
  18. acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +51 -0
  19. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  20. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +9 -1
  21. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +1 -1
  22. acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +1 -1
  23. acryl_datahub_cloud/metadata/schemas/VersionProperties.avsc +212 -0
  24. acryl_datahub_cloud/metadata/schemas/VersionSetKey.avsc +26 -0
  25. acryl_datahub_cloud/metadata/schemas/VersionSetProperties.avsc +49 -0
  26. {acryl_datahub_cloud-0.3.7.6.dist-info → acryl_datahub_cloud-0.3.7.7.dist-info}/METADATA +39 -36
  27. {acryl_datahub_cloud-0.3.7.6.dist-info → acryl_datahub_cloud-0.3.7.7.dist-info}/RECORD +30 -26
  28. {acryl_datahub_cloud-0.3.7.6.dist-info → acryl_datahub_cloud-0.3.7.7.dist-info}/WHEEL +0 -0
  29. {acryl_datahub_cloud-0.3.7.6.dist-info → acryl_datahub_cloud-0.3.7.7.dist-info}/entry_points.txt +0 -0
  30. {acryl_datahub_cloud-0.3.7.6.dist-info → acryl_datahub_cloud-0.3.7.7.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "acryl-datahub-cloud",
3
- "version": "0.3.7.6",
3
+ "version": "0.3.7.7",
4
4
  "install_requires": [
5
5
  "avro-gen3==0.7.16",
6
6
  "acryl-datahub"
@@ -449,7 +449,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
449
449
 
450
450
  def queries_entities_batch(self, results: Iterable) -> Iterable[Dict]:
451
451
  with PerfTimer() as timer:
452
-
453
452
  for doc in results:
454
453
  if "platform" not in doc["_source"] or not doc["_source"]["platform"]:
455
454
  logger.warning(
@@ -473,9 +472,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
473
472
  )
474
473
  ),
475
474
  "platform": doc["_source"]["platform"],
476
- "removed": doc["_source"]["removed"]
477
- if "removed" in doc["_source"]
478
- else False,
475
+ "removed": (
476
+ doc["_source"]["removed"]
477
+ if "removed" in doc["_source"]
478
+ else False
479
+ ),
479
480
  }
480
481
 
481
482
  time_taken = timer.elapsed_seconds()
@@ -587,7 +588,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
587
588
 
588
589
  def process_batch(self, results: Iterable) -> Iterable[Dict]:
589
590
  with PerfTimer() as timer:
590
-
591
591
  for doc in results:
592
592
  match = re.match(platform_regexp, doc["_source"]["urn"])
593
593
  if match:
@@ -754,7 +754,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
754
754
  prefix: Optional[str] = None,
755
755
  use_exp_cdf: Optional[bool] = None,
756
756
  ) -> polars.LazyFrame:
757
-
758
757
  logger.debug(f"Generating rank and percentile for {count_field} field")
759
758
  lf = lf.with_columns(
760
759
  polars.col(count_field)
@@ -880,7 +879,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
880
879
  def load_write_usage(
881
880
  self, soft_deleted_entities_df: polars.LazyFrame
882
881
  ) -> polars.LazyFrame:
883
-
884
882
  if self.config.streaming_mode:
885
883
  wdf = self.load_es_data_to_lf(
886
884
  index="dataset_operationaspect_v1",
@@ -1557,10 +1555,8 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1557
1555
  usage_with_top_users_with_ranks.join(
1558
1556
  write_lf, on="urn", how="full", suffix="_write"
1559
1557
  )
1560
- .with_columns("write_count")
1561
- .fill_null(polars.lit(0))
1562
- .with_columns("totalSqlQueries")
1563
- .fill_null(polars.lit(0))
1558
+ .with_columns(polars.col("write_count").fill_null(polars.lit(0)))
1559
+ .with_columns(polars.col("totalSqlQueries").fill_null(polars.lit(0)))
1564
1560
  )
1565
1561
 
1566
1562
  # If we get a dataset from the operation aspect index only then we have to use its urn and platform
@@ -21,6 +21,60 @@ from datahub.utilities.urns.error import InvalidUrnError
21
21
 
22
22
  deprecated = functools.partial(_sphinx_deprecated, version="0.12.0.2")
23
23
 
24
+ if TYPE_CHECKING:
25
+ from datahub.metadata.schema_classes import VersionSetKeyClass
26
+
27
+ class VersionSetUrn(_SpecificUrn):
28
+ ENTITY_TYPE: ClassVar[str] = "versionSet"
29
+ URN_PARTS: ClassVar[int] = 2
30
+
31
+ def __init__(self, id: str, entity_type: str, *, _allow_coercion: bool = True) -> None:
32
+ if _allow_coercion:
33
+ # Field coercion logic (if any is required).
34
+ id = UrnEncoder.encode_string(id)
35
+ entity_type = UrnEncoder.encode_string(entity_type)
36
+
37
+ # Validation logic.
38
+ if not id:
39
+ raise InvalidUrnError("VersionSetUrn id cannot be empty")
40
+ if UrnEncoder.contains_reserved_char(id):
41
+ raise InvalidUrnError(f'VersionSetUrn id contains reserved characters')
42
+ if not entity_type:
43
+ raise InvalidUrnError("VersionSetUrn entity_type cannot be empty")
44
+ if UrnEncoder.contains_reserved_char(entity_type):
45
+ raise InvalidUrnError(f'VersionSetUrn entity_type contains reserved characters')
46
+
47
+ super().__init__(self.ENTITY_TYPE, [id, entity_type])
48
+
49
+ @classmethod
50
+ def _parse_ids(cls, entity_ids: List[str]) -> "VersionSetUrn":
51
+ if len(entity_ids) != cls.URN_PARTS:
52
+ raise InvalidUrnError(f"VersionSetUrn should have {cls.URN_PARTS} parts, got {len(entity_ids)}: {entity_ids}")
53
+ return cls(id=entity_ids[0], entity_type=entity_ids[1], _allow_coercion=False)
54
+
55
+ @classmethod
56
+ def underlying_key_aspect_type(cls) -> Type["VersionSetKeyClass"]:
57
+ from datahub.metadata.schema_classes import VersionSetKeyClass
58
+
59
+ return VersionSetKeyClass
60
+
61
+ def to_key_aspect(self) -> "VersionSetKeyClass":
62
+ from datahub.metadata.schema_classes import VersionSetKeyClass
63
+
64
+ return VersionSetKeyClass(id=self.id, entityType=self.entity_type)
65
+
66
+ @classmethod
67
+ def from_key_aspect(cls, key_aspect: "VersionSetKeyClass") -> "VersionSetUrn":
68
+ return cls(id=key_aspect.id, entity_type=key_aspect.entityType)
69
+
70
+ @property
71
+ def id(self) -> str:
72
+ return self.entity_ids[0]
73
+
74
+ @property
75
+ def entity_type(self) -> str:
76
+ return self.entity_ids[1]
77
+
24
78
  if TYPE_CHECKING:
25
79
  from datahub.metadata.schema_classes import DataHubConnectionKeyClass
26
80
 
@@ -87,6 +87,7 @@ from .....schema_classes import SubTypesClass
87
87
  from .....schema_classes import SyncMechanismClass
88
88
  from .....schema_classes import TagAssociationClass
89
89
  from .....schema_classes import TimeStampClass
90
+ from .....schema_classes import VersionPropertiesClass
90
91
  from .....schema_classes import VersionTagClass
91
92
  from .....schema_classes import WindowDurationClass
92
93
 
@@ -171,6 +172,7 @@ SubTypes = SubTypesClass
171
172
  SyncMechanism = SyncMechanismClass
172
173
  TagAssociation = TagAssociationClass
173
174
  TimeStamp = TimeStampClass
175
+ VersionProperties = VersionPropertiesClass
174
176
  VersionTag = VersionTagClass
175
177
  WindowDuration = WindowDurationClass
176
178
 
@@ -65,6 +65,7 @@ from ......schema_classes import SubscriptionKeyClass
65
65
  from ......schema_classes import TagKeyClass
66
66
  from ......schema_classes import TelemetryKeyClass
67
67
  from ......schema_classes import TestKeyClass
68
+ from ......schema_classes import VersionSetKeyClass
68
69
 
69
70
 
70
71
  ActionRequestKey = ActionRequestKeyClass
@@ -125,5 +126,6 @@ SubscriptionKey = SubscriptionKeyClass
125
126
  TagKey = TagKeyClass
126
127
  TelemetryKey = TelemetryKeyClass
127
128
  TestKey = TestKeyClass
129
+ VersionSetKey = VersionSetKeyClass
128
130
 
129
131
  # fmt: on
@@ -0,0 +1,17 @@
1
+ # mypy: ignore-errors
2
+ # flake8: noqa
3
+
4
+ # This file is autogenerated by /metadata-ingestion/scripts/avro_codegen.py
5
+ # Do not modify manually!
6
+
7
+ # pylint: skip-file
8
+ # fmt: off
9
+ # isort: skip_file
10
+ from .....schema_classes import VersionSetPropertiesClass
11
+ from .....schema_classes import VersioningSchemeClass
12
+
13
+
14
+ VersionSetProperties = VersionSetPropertiesClass
15
+ VersioningScheme = VersioningSchemeClass
16
+
17
+ # fmt: on
@@ -2418,6 +2418,32 @@
2418
2418
  }
2419
2419
  ]
2420
2420
  },
2421
+ {
2422
+ "type": "record",
2423
+ "Aspect": {
2424
+ "name": "versionSetKey",
2425
+ "keyForEntity": "versionSet",
2426
+ "entityCategory": "core",
2427
+ "entityAspects": [
2428
+ "versionSetProperties"
2429
+ ]
2430
+ },
2431
+ "name": "VersionSetKey",
2432
+ "namespace": "com.linkedin.pegasus2avro.metadata.key",
2433
+ "fields": [
2434
+ {
2435
+ "type": "string",
2436
+ "name": "id",
2437
+ "doc": "ID of the Version Set, generated from platform + asset id / name"
2438
+ },
2439
+ {
2440
+ "type": "string",
2441
+ "name": "entityType",
2442
+ "doc": "Type of entities included in version set, limits to a single entity type between linked versioned entities"
2443
+ }
2444
+ ],
2445
+ "doc": "Key for a Version Set entity"
2446
+ },
2421
2447
  {
2422
2448
  "type": "record",
2423
2449
  "Aspect": {
@@ -4090,7 +4116,8 @@
4090
4116
  "share",
4091
4117
  "origin",
4092
4118
  "documentation",
4093
- "entityInferenceMetadata"
4119
+ "entityInferenceMetadata",
4120
+ "versionProperties"
4094
4121
  ],
4095
4122
  "entityDoc": "Datasets represent logical or physical data assets stored or represented in various data platforms. Tables, Views, Streams are all instances of datasets."
4096
4123
  },
@@ -4409,7 +4436,8 @@
4409
4436
  "origin",
4410
4437
  "lineageFeatures",
4411
4438
  "documentation",
4412
- "incidentsSummary"
4439
+ "incidentsSummary",
4440
+ "versionProperties"
4413
4441
  ]
4414
4442
  },
4415
4443
  "name": "MLModelKey",
@@ -9292,7 +9320,7 @@
9292
9320
  "fields": [
9293
9321
  {
9294
9322
  "Searchable": {
9295
- "boostScore": 5.0,
9323
+ "boostScore": 1.0,
9296
9324
  "fieldName": "fieldPaths",
9297
9325
  "fieldType": "TEXT",
9298
9326
  "queryByDefault": "true"
@@ -11820,6 +11848,132 @@
11820
11848
  ],
11821
11849
  "doc": "Proposals aspect for proposed tags and glossary terms for an entity.\nThese are secondary indices and ActionRequests remain the source of truth."
11822
11850
  },
11851
+ {
11852
+ "type": "record",
11853
+ "Aspect": {
11854
+ "name": "versionProperties"
11855
+ },
11856
+ "name": "VersionProperties",
11857
+ "namespace": "com.linkedin.pegasus2avro.common",
11858
+ "fields": [
11859
+ {
11860
+ "Relationship": {
11861
+ "entityTypes": [
11862
+ "versionSet"
11863
+ ],
11864
+ "name": "VersionOf"
11865
+ },
11866
+ "Searchable": {
11867
+ "queryByDefault": false
11868
+ },
11869
+ "java": {
11870
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
11871
+ },
11872
+ "Urn": "Urn",
11873
+ "entityTypes": [
11874
+ "versionSet"
11875
+ ],
11876
+ "type": "string",
11877
+ "name": "versionSet",
11878
+ "doc": "The linked Version Set entity that ties multiple versioned assets together"
11879
+ },
11880
+ {
11881
+ "Searchable": {
11882
+ "/versionTag": {
11883
+ "fieldName": "version",
11884
+ "queryByDefault": false
11885
+ }
11886
+ },
11887
+ "type": {
11888
+ "type": "record",
11889
+ "name": "VersionTag",
11890
+ "namespace": "com.linkedin.pegasus2avro.common",
11891
+ "fields": [
11892
+ {
11893
+ "type": [
11894
+ "null",
11895
+ "string"
11896
+ ],
11897
+ "name": "versionTag",
11898
+ "default": null
11899
+ },
11900
+ {
11901
+ "type": [
11902
+ "null",
11903
+ "com.linkedin.pegasus2avro.common.MetadataAttribution"
11904
+ ],
11905
+ "name": "metadataAttribution",
11906
+ "default": null
11907
+ }
11908
+ ],
11909
+ "doc": "A resource-defined string representing the resource state for the purpose of concurrency control"
11910
+ },
11911
+ "name": "version",
11912
+ "doc": "Label for this versioned asset, is unique within a version set"
11913
+ },
11914
+ {
11915
+ "Searchable": {
11916
+ "/*/versionTag": {
11917
+ "fieldName": "aliases",
11918
+ "queryByDefault": false
11919
+ }
11920
+ },
11921
+ "type": {
11922
+ "type": "array",
11923
+ "items": "com.linkedin.pegasus2avro.common.VersionTag"
11924
+ },
11925
+ "name": "aliases",
11926
+ "default": [],
11927
+ "doc": "Associated aliases for this versioned asset"
11928
+ },
11929
+ {
11930
+ "type": [
11931
+ "null",
11932
+ "string"
11933
+ ],
11934
+ "name": "comment",
11935
+ "default": null,
11936
+ "doc": "Comment documenting what this version was created for, changes, or represents"
11937
+ },
11938
+ {
11939
+ "Searchable": {
11940
+ "fieldName": "versionSortId",
11941
+ "queryByDefault": false
11942
+ },
11943
+ "type": "string",
11944
+ "name": "sortId",
11945
+ "doc": "Sort identifier that determines where a version lives in the order of the Version Set.\nWhat this looks like depends on the Version Scheme. For sort ids generated by DataHub we use an 8 character string representation."
11946
+ },
11947
+ {
11948
+ "type": [
11949
+ "null",
11950
+ "com.linkedin.pegasus2avro.common.AuditStamp"
11951
+ ],
11952
+ "name": "sourceCreatedTimestamp",
11953
+ "default": null,
11954
+ "doc": "Timestamp reflecting when this asset version was created in the source system."
11955
+ },
11956
+ {
11957
+ "type": [
11958
+ "null",
11959
+ "com.linkedin.pegasus2avro.common.AuditStamp"
11960
+ ],
11961
+ "name": "metadataCreatedTimestamp",
11962
+ "default": null,
11963
+ "doc": "Timestamp reflecting when the metadata for this version was created in DataHub"
11964
+ },
11965
+ {
11966
+ "type": [
11967
+ "null",
11968
+ "boolean"
11969
+ ],
11970
+ "name": "isLatest",
11971
+ "default": null,
11972
+ "doc": "Marks whether this version is currently the latest. Set by a side effect and should not be modified by API."
11973
+ }
11974
+ ],
11975
+ "doc": "Properties about a versioned asset i.e. dataset, ML Model, etc."
11976
+ },
11823
11977
  {
11824
11978
  "type": "record",
11825
11979
  "Aspect": {
@@ -22048,22 +22202,7 @@
22048
22202
  {
22049
22203
  "type": [
22050
22204
  "null",
22051
- {
22052
- "type": "record",
22053
- "name": "VersionTag",
22054
- "namespace": "com.linkedin.pegasus2avro.common",
22055
- "fields": [
22056
- {
22057
- "type": [
22058
- "null",
22059
- "string"
22060
- ],
22061
- "name": "versionTag",
22062
- "default": null
22063
- }
22064
- ],
22065
- "doc": "A resource-defined string representing the resource state for the purpose of concurrency control"
22066
- }
22205
+ "com.linkedin.pegasus2avro.common.VersionTag"
22067
22206
  ],
22068
22207
  "name": "version",
22069
22208
  "default": null,
@@ -26385,6 +26524,55 @@
26385
26524
  ],
26386
26525
  "doc": "A request to send a notification."
26387
26526
  },
26527
+ {
26528
+ "type": "record",
26529
+ "Aspect": {
26530
+ "name": "versionSetProperties"
26531
+ },
26532
+ "name": "VersionSetProperties",
26533
+ "namespace": "com.linkedin.pegasus2avro.versionset",
26534
+ "fields": [
26535
+ {
26536
+ "Searchable": {
26537
+ "/*": {
26538
+ "fieldType": "TEXT",
26539
+ "queryByDefault": true
26540
+ }
26541
+ },
26542
+ "type": {
26543
+ "type": "map",
26544
+ "values": "string"
26545
+ },
26546
+ "name": "customProperties",
26547
+ "default": {},
26548
+ "doc": "Custom property bag."
26549
+ },
26550
+ {
26551
+ "Searchable": {
26552
+ "queryByDefault": "false"
26553
+ },
26554
+ "java": {
26555
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
26556
+ },
26557
+ "Urn": "Urn",
26558
+ "type": "string",
26559
+ "name": "latest",
26560
+ "doc": "The latest versioned entity linked to in this version set"
26561
+ },
26562
+ {
26563
+ "type": {
26564
+ "type": "enum",
26565
+ "name": "VersioningScheme",
26566
+ "namespace": "com.linkedin.pegasus2avro.versionset",
26567
+ "symbols": [
26568
+ "ALPHANUMERIC_GENERATED_BY_DATAHUB"
26569
+ ]
26570
+ },
26571
+ "name": "versioningScheme",
26572
+ "doc": "What versioning scheme is being utilized for the versioned entities sort criterion. Static once set"
26573
+ }
26574
+ ]
26575
+ },
26388
26576
  {
26389
26577
  "type": "record",
26390
26578
  "Aspect": {