acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
@@ -69,9 +69,19 @@ class MetabaseConfig(DatasetLineageProviderConfigBase, StatefulIngestionConfigBa
69
69
  default=None,
70
70
  description="optional URL to use in links (if `connect_uri` is only for ingestion)",
71
71
  )
72
- username: Optional[str] = Field(default=None, description="Metabase username.")
72
+ username: Optional[str] = Field(
73
+ default=None,
74
+ description="Metabase username, used when an API key is not provided.",
75
+ )
73
76
  password: Optional[pydantic.SecretStr] = Field(
74
- default=None, description="Metabase password."
77
+ default=None,
78
+ description="Metabase password, used when an API key is not provided.",
79
+ )
80
+
81
+ # https://www.metabase.com/learn/metabase-basics/administration/administration-and-operation/metabase-api#example-get-request
82
+ api_key: Optional[pydantic.SecretStr] = Field(
83
+ default=None,
84
+ description="Metabase API key. If provided, the username and password will be ignored. Recommended method.",
75
85
  )
76
86
  # TODO: Check and remove this if no longer needed.
77
87
  # Config database_alias is removed from sql sources.
@@ -178,30 +188,40 @@ class MetabaseSource(StatefulIngestionSourceBase):
178
188
  self.source_config: MetabaseConfig = config
179
189
 
180
190
  def setup_session(self) -> None:
181
- login_response = requests.post(
182
- f"{self.config.connect_uri}/api/session",
183
- None,
184
- {
185
- "username": self.config.username,
186
- "password": (
187
- self.config.password.get_secret_value()
188
- if self.config.password
189
- else None
190
- ),
191
- },
192
- )
191
+ self.session = requests.session()
192
+ if self.config.api_key:
193
+ self.session.headers.update(
194
+ {
195
+ "x-api-key": self.config.api_key.get_secret_value(),
196
+ "Content-Type": "application/json",
197
+ "Accept": "*/*",
198
+ }
199
+ )
200
+ else:
201
+ # If no API key is provided, generate a session token using username and password.
202
+ login_response = requests.post(
203
+ f"{self.config.connect_uri}/api/session",
204
+ None,
205
+ {
206
+ "username": self.config.username,
207
+ "password": (
208
+ self.config.password.get_secret_value()
209
+ if self.config.password
210
+ else None
211
+ ),
212
+ },
213
+ )
193
214
 
194
- login_response.raise_for_status()
195
- self.access_token = login_response.json().get("id", "")
215
+ login_response.raise_for_status()
216
+ self.access_token = login_response.json().get("id", "")
196
217
 
197
- self.session = requests.session()
198
- self.session.headers.update(
199
- {
200
- "X-Metabase-Session": f"{self.access_token}",
201
- "Content-Type": "application/json",
202
- "Accept": "*/*",
203
- }
204
- )
218
+ self.session.headers.update(
219
+ {
220
+ "X-Metabase-Session": f"{self.access_token}",
221
+ "Content-Type": "application/json",
222
+ "Accept": "*/*",
223
+ }
224
+ )
205
225
 
206
226
  # Test the connection
207
227
  try:
@@ -217,15 +237,17 @@ class MetabaseSource(StatefulIngestionSourceBase):
217
237
  )
218
238
 
219
239
  def close(self) -> None:
220
- response = requests.delete(
221
- f"{self.config.connect_uri}/api/session",
222
- headers={"X-Metabase-Session": self.access_token},
223
- )
224
- if response.status_code not in (200, 204):
225
- self.report.report_failure(
226
- title="Unable to Log User Out",
227
- message=f"Unable to logout for user {self.config.username}",
240
+ # API key authentication does not require session closure.
241
+ if not self.config.api_key:
242
+ response = requests.delete(
243
+ f"{self.config.connect_uri}/api/session",
244
+ headers={"X-Metabase-Session": self.access_token},
228
245
  )
246
+ if response.status_code not in (200, 204):
247
+ self.report.report_failure(
248
+ title="Unable to Log User Out",
249
+ message=f"Unable to logout for user {self.config.username}",
250
+ )
229
251
  super().close()
230
252
 
231
253
  def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
@@ -291,7 +313,7 @@ class MetabaseSource(StatefulIngestionSourceBase):
291
313
  return None
292
314
 
293
315
  dashboard_urn = builder.make_dashboard_urn(
294
- self.platform, dashboard_details.get("id", "")
316
+ self.platform, str(dashboard_details.get("id", ""))
295
317
  )
296
318
  dashboard_snapshot = DashboardSnapshot(
297
319
  urn=dashboard_urn,
@@ -315,7 +337,7 @@ class MetabaseSource(StatefulIngestionSourceBase):
315
337
  card_id = card_info.get("card").get("id", "")
316
338
  if not card_id:
317
339
  continue # most likely a virtual card without an id (text or heading), not relevant.
318
- chart_urn = builder.make_chart_urn(self.platform, card_id)
340
+ chart_urn = builder.make_chart_urn(self.platform, str(card_id))
319
341
  chart_urns.append(chart_urn)
320
342
 
321
343
  dashboard_info_class = DashboardInfoClass(
@@ -437,7 +459,7 @@ class MetabaseSource(StatefulIngestionSourceBase):
437
459
  )
438
460
  return None
439
461
 
440
- chart_urn = builder.make_chart_urn(self.platform, card_id)
462
+ chart_urn = builder.make_chart_urn(self.platform, str(card_id))
441
463
  chart_snapshot = ChartSnapshot(
442
464
  urn=chart_urn,
443
465
  aspects=[],
@@ -1,5 +1,6 @@
1
1
  import logging
2
2
  import pathlib
3
+ import re
3
4
  import time
4
5
  from dataclasses import dataclass, field
5
6
  from typing import Any, Dict, Iterable, List, Optional, TypeVar, Union
@@ -118,17 +119,58 @@ class BusinessGlossaryConfig(DefaultConfig):
118
119
  return v
119
120
 
120
121
 
122
+ def clean_url(text: str) -> str:
123
+ """
124
+ Clean text for use in URLs by:
125
+ 1. Replacing spaces with hyphens
126
+ 2. Removing special characters (preserving hyphens and periods)
127
+ 3. Collapsing multiple hyphens and periods into single ones
128
+ """
129
+ # Replace spaces with hyphens
130
+ text = text.replace(" ", "-")
131
+ # Remove special characters except hyphens and periods
132
+ text = re.sub(r"[^a-zA-Z0-9\-.]", "", text)
133
+ # Collapse multiple hyphens into one
134
+ text = re.sub(r"-+", "-", text)
135
+ # Collapse multiple periods into one
136
+ text = re.sub(r"\.+", ".", text)
137
+ # Remove leading/trailing hyphens and periods
138
+ text = text.strip("-.")
139
+ return text
140
+
141
+
121
142
  def create_id(path: List[str], default_id: Optional[str], enable_auto_id: bool) -> str:
143
+ """
144
+ Create an ID for a glossary node or term.
145
+
146
+ Args:
147
+ path: List of path components leading to this node/term
148
+ default_id: Optional manually specified ID
149
+ enable_auto_id: Whether to generate GUIDs
150
+ """
122
151
  if default_id is not None:
123
- return default_id # No need to create id from path as default_id is provided
152
+ return default_id # Use explicitly provided ID
124
153
 
125
154
  id_: str = ".".join(path)
126
155
 
127
- if UrnEncoder.contains_extended_reserved_char(id_):
128
- enable_auto_id = True
156
+ # Check for non-ASCII characters before cleaning
157
+ if any(ord(c) > 127 for c in id_):
158
+ return datahub_guid({"path": id_})
129
159
 
130
160
  if enable_auto_id:
161
+ # Generate GUID for auto_id mode
131
162
  id_ = datahub_guid({"path": id_})
163
+ else:
164
+ # Clean the URL for better readability when not using auto_id
165
+ id_ = clean_url(id_)
166
+
167
+ # Force auto_id if the cleaned URL still contains problematic characters
168
+ if UrnEncoder.contains_extended_reserved_char(id_):
169
+ logger.warning(
170
+ f"ID '{id_}' contains problematic characters after URL cleaning. Falling back to GUID generation for stability."
171
+ )
172
+ id_ = datahub_guid({"path": id_})
173
+
132
174
  return id_
133
175
 
134
176
 
@@ -104,8 +104,8 @@ class FineGrainedLineageConfig(ConfigModel):
104
104
 
105
105
  class EntityNodeConfig(ConfigModel):
106
106
  entity: EntityConfig
107
- upstream: Optional[List["EntityNodeConfig"]]
108
- fineGrainedLineages: Optional[List[FineGrainedLineageConfig]]
107
+ upstream: Optional[List["EntityNodeConfig"]] = None
108
+ fineGrainedLineages: Optional[List[FineGrainedLineageConfig]] = None
109
109
 
110
110
 
111
111
  # https://pydantic-docs.helpmanual.io/usage/postponed_annotations/ required for when you reference a model within itself