acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (221)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2558 -2531
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +221 -187
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/emitter/rest_emitter.py +70 -12
  36. datahub/entrypoints.py +4 -3
  37. datahub/ingestion/api/decorators.py +15 -3
  38. datahub/ingestion/api/report.py +332 -3
  39. datahub/ingestion/api/sink.py +3 -0
  40. datahub/ingestion/api/source.py +48 -44
  41. datahub/ingestion/autogenerated/__init__.py +0 -0
  42. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  43. datahub/ingestion/autogenerated/lineage.json +401 -0
  44. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  45. datahub/ingestion/extractor/schema_util.py +13 -4
  46. datahub/ingestion/glossary/classification_mixin.py +5 -0
  47. datahub/ingestion/graph/client.py +100 -15
  48. datahub/ingestion/graph/config.py +1 -0
  49. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  50. datahub/ingestion/run/pipeline.py +54 -2
  51. datahub/ingestion/sink/datahub_rest.py +13 -0
  52. datahub/ingestion/source/abs/source.py +1 -1
  53. datahub/ingestion/source/aws/aws_common.py +4 -0
  54. datahub/ingestion/source/aws/glue.py +489 -244
  55. datahub/ingestion/source/aws/tag_entities.py +292 -0
  56. datahub/ingestion/source/azure/azure_common.py +2 -2
  57. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  58. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  59. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  60. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  61. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  62. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  63. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  64. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  65. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  66. datahub/ingestion/source/common/subtypes.py +45 -0
  67. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  68. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  69. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  70. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  71. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  72. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  73. datahub/ingestion/source/debug/__init__.py +0 -0
  74. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  75. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  76. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  77. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  78. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  79. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  80. datahub/ingestion/source/file.py +3 -0
  81. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  82. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  83. datahub/ingestion/source/ge_data_profiler.py +76 -28
  84. datahub/ingestion/source/ge_profiling_config.py +11 -0
  85. datahub/ingestion/source/hex/api.py +26 -1
  86. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +1 -1
  88. datahub/ingestion/source/identity/okta.py +1 -14
  89. datahub/ingestion/source/kafka/kafka.py +16 -0
  90. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  91. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  92. datahub/ingestion/source/looker/looker_source.py +1 -0
  93. datahub/ingestion/source/mlflow.py +11 -1
  94. datahub/ingestion/source/mock_data/__init__.py +0 -0
  95. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  97. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  98. datahub/ingestion/source/nifi.py +1 -1
  99. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  100. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  101. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  102. datahub/ingestion/source/preset.py +2 -2
  103. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  104. datahub/ingestion/source/redshift/redshift.py +21 -1
  105. datahub/ingestion/source/redshift/usage.py +4 -3
  106. datahub/ingestion/source/s3/report.py +4 -2
  107. datahub/ingestion/source/s3/source.py +367 -115
  108. datahub/ingestion/source/sac/sac.py +3 -1
  109. datahub/ingestion/source/salesforce.py +6 -3
  110. datahub/ingestion/source/sigma/sigma.py +7 -1
  111. datahub/ingestion/source/slack/slack.py +2 -1
  112. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  113. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  114. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  115. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  116. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  117. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  118. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  119. datahub/ingestion/source/sql/athena.py +119 -11
  120. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  121. datahub/ingestion/source/sql/clickhouse.py +3 -1
  122. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  123. datahub/ingestion/source/sql/hana.py +3 -1
  124. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  125. datahub/ingestion/source/sql/mariadb.py +0 -1
  126. datahub/ingestion/source/sql/mssql/source.py +239 -34
  127. datahub/ingestion/source/sql/mysql.py +0 -1
  128. datahub/ingestion/source/sql/oracle.py +1 -1
  129. datahub/ingestion/source/sql/postgres.py +0 -1
  130. datahub/ingestion/source/sql/sql_common.py +121 -34
  131. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  132. datahub/ingestion/source/sql/teradata.py +997 -235
  133. datahub/ingestion/source/sql/vertica.py +10 -6
  134. datahub/ingestion/source/sql_queries.py +2 -2
  135. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  136. datahub/ingestion/source/superset.py +58 -3
  137. datahub/ingestion/source/tableau/tableau.py +58 -37
  138. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  139. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  140. datahub/ingestion/source/unity/config.py +5 -0
  141. datahub/ingestion/source/unity/proxy.py +118 -0
  142. datahub/ingestion/source/unity/source.py +195 -17
  143. datahub/ingestion/source/unity/tag_entities.py +295 -0
  144. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  145. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  146. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  147. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  148. datahub/metadata/_internal_schema_classes.py +1433 -546
  149. datahub/metadata/_urns/urn_defs.py +1826 -1658
  150. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  151. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  157. datahub/metadata/schema.avsc +17736 -17112
  158. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  159. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  160. datahub/metadata/schemas/Applications.avsc +38 -0
  161. datahub/metadata/schemas/ChartKey.avsc +1 -0
  162. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  164. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  165. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  166. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  167. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  168. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  169. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  170. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  171. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  172. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  173. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  176. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  177. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  178. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  179. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  180. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  181. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  182. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  183. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  184. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  185. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  186. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  187. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  188. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  189. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  190. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  191. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  192. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  193. datahub/metadata/schemas/__init__.py +3 -3
  194. datahub/sdk/__init__.py +2 -0
  195. datahub/sdk/_all_entities.py +7 -0
  196. datahub/sdk/_shared.py +116 -0
  197. datahub/sdk/chart.py +315 -0
  198. datahub/sdk/container.py +7 -0
  199. datahub/sdk/dashboard.py +432 -0
  200. datahub/sdk/dataflow.py +7 -0
  201. datahub/sdk/datajob.py +45 -13
  202. datahub/sdk/dataset.py +8 -2
  203. datahub/sdk/entity_client.py +82 -2
  204. datahub/sdk/lineage_client.py +683 -82
  205. datahub/sdk/main_client.py +46 -16
  206. datahub/sdk/mlmodel.py +101 -38
  207. datahub/sdk/mlmodelgroup.py +7 -0
  208. datahub/sdk/search_client.py +4 -3
  209. datahub/specific/chart.py +1 -1
  210. datahub/specific/dataproduct.py +4 -0
  211. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  212. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  213. datahub/telemetry/telemetry.py +17 -11
  214. datahub/testing/sdk_v2_helpers.py +7 -1
  215. datahub/upgrade/upgrade.py +46 -13
  216. datahub/utilities/server_config_util.py +8 -0
  217. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  218. datahub/utilities/stats_collections.py +4 -0
  219. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +0 -0
  220. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/sdk/dataset.py CHANGED
@@ -26,12 +26,14 @@ from datahub.sdk._shared import (
26
26
  HasInstitutionalMemory,
27
27
  HasOwnership,
28
28
  HasPlatformInstance,
29
+ HasStructuredProperties,
29
30
  HasSubtype,
30
31
  HasTags,
31
32
  HasTerms,
32
33
  LinksInputType,
33
34
  OwnersInputType,
34
35
  ParentContainerInputType,
36
+ StructuredPropertyInputType,
35
37
  TagInputType,
36
38
  TagsInputType,
37
39
  TermInputType,
@@ -428,6 +430,7 @@ class Dataset(
428
430
  HasTags,
429
431
  HasTerms,
430
432
  HasDomain,
433
+ HasStructuredProperties,
431
434
  Entity,
432
435
  ):
433
436
  """Represents a dataset in DataHub.
@@ -471,12 +474,12 @@ class Dataset(
471
474
  links: Optional[LinksInputType] = None,
472
475
  tags: Optional[TagsInputType] = None,
473
476
  terms: Optional[TermsInputType] = None,
474
- # TODO structured_properties
475
477
  domain: Optional[DomainInputType] = None,
476
- extra_aspects: ExtraAspectsType = None,
477
478
  # Dataset-specific aspects.
478
479
  schema: Optional[SchemaFieldsInputType] = None,
479
480
  upstreams: Optional[models.UpstreamLineageClass] = None,
481
+ structured_properties: Optional[StructuredPropertyInputType] = None,
482
+ extra_aspects: ExtraAspectsType = None,
480
483
  ):
481
484
  """Initialize a new Dataset instance.
482
485
 
@@ -548,6 +551,9 @@ class Dataset(
548
551
  self.set_terms(terms)
549
552
  if domain is not None:
550
553
  self.set_domain(domain)
554
+ if structured_properties is not None:
555
+ for key, value in structured_properties.items():
556
+ self.set_structured_property(property_urn=key, values=value)
551
557
 
552
558
  @classmethod
553
559
  def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
datahub/sdk/entity_client.py CHANGED
@@ -9,7 +9,9 @@ from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
9
9
  from datahub.errors import IngestionAttributionWarning, ItemNotFoundError, SdkUsageError
10
10
  from datahub.ingestion.graph.client import DataHubGraph
11
11
  from datahub.metadata.urns import (
12
+ ChartUrn,
12
13
  ContainerUrn,
14
+ DashboardUrn,
13
15
  DataFlowUrn,
14
16
  DataJobUrn,
15
17
  DatasetUrn,
@@ -19,7 +21,9 @@ from datahub.metadata.urns import (
19
21
  )
20
22
  from datahub.sdk._all_entities import ENTITY_CLASSES
21
23
  from datahub.sdk._shared import UrnOrStr
24
+ from datahub.sdk.chart import Chart
22
25
  from datahub.sdk.container import Container
26
+ from datahub.sdk.dashboard import Dashboard
23
27
  from datahub.sdk.dataflow import DataFlow
24
28
  from datahub.sdk.datajob import DataJob
25
29
  from datahub.sdk.dataset import Dataset
@@ -65,6 +69,10 @@ class EntityClient:
65
69
  @overload
66
70
  def get(self, urn: DataJobUrn) -> DataJob: ...
67
71
  @overload
72
+ def get(self, urn: DashboardUrn) -> Dashboard: ...
73
+ @overload
74
+ def get(self, urn: ChartUrn) -> Chart: ...
75
+ @overload
68
76
  def get(self, urn: Union[Urn, str]) -> Entity: ...
69
77
  def get(self, urn: UrnOrStr) -> Entity:
70
78
  """Retrieve an entity by its urn.
@@ -84,7 +92,26 @@ class EntityClient:
84
92
  urn = Urn.from_string(urn)
85
93
 
86
94
  # TODO: add error handling around this with a suggested alternative if not yet supported
87
- EntityClass = ENTITY_CLASSES[urn.entity_type]
95
+ try:
96
+ EntityClass = ENTITY_CLASSES[urn.entity_type]
97
+ except KeyError as e:
98
+ # Try to import cloud-specific entities if not found
99
+ try:
100
+ from acryl_datahub_cloud._sdk_extras.entities.assertion import Assertion
101
+ from acryl_datahub_cloud._sdk_extras.entities.monitor import Monitor
102
+
103
+ if urn.entity_type == "assertion":
104
+ EntityClass = Assertion
105
+ elif urn.entity_type == "monitor":
106
+ EntityClass = Monitor
107
+ else:
108
+ raise SdkUsageError(
109
+ f"Entity type {urn.entity_type} is not yet supported"
110
+ ) from e
111
+ except ImportError as e:
112
+ raise SdkUsageError(
113
+ f"Entity type {urn.entity_type} is not yet supported"
114
+ ) from e
88
115
 
89
116
  if not self._graph.exists(str(urn)):
90
117
  raise ItemNotFoundError(f"Entity {urn} not found")
@@ -92,7 +119,19 @@ class EntityClient:
92
119
  aspects = self._graph.get_entity_semityped(str(urn))
93
120
 
94
121
  # TODO: save the timestamp so we can use If-Unmodified-Since on the updates
95
- return EntityClass._new_from_graph(urn, aspects)
122
+ entity = EntityClass._new_from_graph(urn, aspects)
123
+
124
+ # Type narrowing for cloud-specific entities
125
+ if urn.entity_type == "assertion":
126
+ from acryl_datahub_cloud._sdk_extras.entities.assertion import Assertion
127
+
128
+ assert isinstance(entity, Assertion)
129
+ elif urn.entity_type == "monitor":
130
+ from acryl_datahub_cloud._sdk_extras.entities.monitor import Monitor
131
+
132
+ assert isinstance(entity, Monitor)
133
+
134
+ return entity
96
135
 
97
136
  def create(self, entity: Entity) -> None:
98
137
  mcps = []
@@ -153,3 +192,44 @@ class EntityClient:
153
192
 
154
193
  mcps = updater.build()
155
194
  self._graph.emit_mcps(mcps)
195
+
196
+ def delete(
197
+ self,
198
+ urn: UrnOrStr,
199
+ check_exists: bool = True,
200
+ cascade: bool = False,
201
+ hard: bool = False,
202
+ ) -> None:
203
+ """Delete an entity by its urn.
204
+
205
+ Args:
206
+ urn: The urn of the entity to delete. Can be a string or :py:class:`Urn` object.
207
+ check_exists: Whether to check if the entity exists before deletion. Defaults to True.
208
+ cascade: Whether to cascade delete related entities. When True, deletes child entities
209
+ like datajobs within dataflows, datasets within containers, etc. Not yet supported.
210
+ hard: Whether to perform a hard delete (permanent) or soft delete. Defaults to False.
211
+
212
+ Raises:
213
+ SdkUsageError: If the entity does not exist and check_exists is True, or if cascade is True (not supported).
214
+
215
+ Note:
216
+ When hard is True, the operation is irreversible and the entity will be permanently removed.
217
+
218
+ Impact of cascade deletion (still to be done) depends on the input entity type:
219
+ - Container: Recursively deletes all containers and data assets within the container.
220
+ - Dataflow: Recursively deletes all data jobs within the dataflow.
221
+ - Dashboard: TBD
222
+ - DataPlatformInstance: TBD
223
+ - ...
224
+ """
225
+ urn_str = str(urn) if isinstance(urn, Urn) else urn
226
+ if check_exists and not self._graph.exists(entity_urn=urn_str):
227
+ raise SdkUsageError(
228
+ f"Entity {urn_str} does not exist, and hence cannot be deleted. "
229
+ "You can bypass this check by setting check_exists=False."
230
+ )
231
+
232
+ if cascade:
233
+ raise SdkUsageError("The 'cascade' parameter is not yet supported.")
234
+
235
+ self._graph.delete_entity(urn=urn_str, hard=hard)