acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (221)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2558 -2531
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +221 -187
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/emitter/rest_emitter.py +70 -12
  36. datahub/entrypoints.py +4 -3
  37. datahub/ingestion/api/decorators.py +15 -3
  38. datahub/ingestion/api/report.py +332 -3
  39. datahub/ingestion/api/sink.py +3 -0
  40. datahub/ingestion/api/source.py +48 -44
  41. datahub/ingestion/autogenerated/__init__.py +0 -0
  42. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  43. datahub/ingestion/autogenerated/lineage.json +401 -0
  44. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  45. datahub/ingestion/extractor/schema_util.py +13 -4
  46. datahub/ingestion/glossary/classification_mixin.py +5 -0
  47. datahub/ingestion/graph/client.py +100 -15
  48. datahub/ingestion/graph/config.py +1 -0
  49. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  50. datahub/ingestion/run/pipeline.py +54 -2
  51. datahub/ingestion/sink/datahub_rest.py +13 -0
  52. datahub/ingestion/source/abs/source.py +1 -1
  53. datahub/ingestion/source/aws/aws_common.py +4 -0
  54. datahub/ingestion/source/aws/glue.py +489 -244
  55. datahub/ingestion/source/aws/tag_entities.py +292 -0
  56. datahub/ingestion/source/azure/azure_common.py +2 -2
  57. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  58. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  59. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  60. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  61. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  62. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  63. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  64. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  65. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  66. datahub/ingestion/source/common/subtypes.py +45 -0
  67. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  68. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  69. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  70. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  71. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  72. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  73. datahub/ingestion/source/debug/__init__.py +0 -0
  74. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  75. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  76. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  77. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  78. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  79. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  80. datahub/ingestion/source/file.py +3 -0
  81. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  82. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  83. datahub/ingestion/source/ge_data_profiler.py +76 -28
  84. datahub/ingestion/source/ge_profiling_config.py +11 -0
  85. datahub/ingestion/source/hex/api.py +26 -1
  86. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +1 -1
  88. datahub/ingestion/source/identity/okta.py +1 -14
  89. datahub/ingestion/source/kafka/kafka.py +16 -0
  90. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  91. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  92. datahub/ingestion/source/looker/looker_source.py +1 -0
  93. datahub/ingestion/source/mlflow.py +11 -1
  94. datahub/ingestion/source/mock_data/__init__.py +0 -0
  95. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  97. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  98. datahub/ingestion/source/nifi.py +1 -1
  99. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  100. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  101. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  102. datahub/ingestion/source/preset.py +2 -2
  103. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  104. datahub/ingestion/source/redshift/redshift.py +21 -1
  105. datahub/ingestion/source/redshift/usage.py +4 -3
  106. datahub/ingestion/source/s3/report.py +4 -2
  107. datahub/ingestion/source/s3/source.py +367 -115
  108. datahub/ingestion/source/sac/sac.py +3 -1
  109. datahub/ingestion/source/salesforce.py +6 -3
  110. datahub/ingestion/source/sigma/sigma.py +7 -1
  111. datahub/ingestion/source/slack/slack.py +2 -1
  112. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  113. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  114. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  115. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  116. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  117. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  118. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  119. datahub/ingestion/source/sql/athena.py +119 -11
  120. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  121. datahub/ingestion/source/sql/clickhouse.py +3 -1
  122. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  123. datahub/ingestion/source/sql/hana.py +3 -1
  124. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  125. datahub/ingestion/source/sql/mariadb.py +0 -1
  126. datahub/ingestion/source/sql/mssql/source.py +239 -34
  127. datahub/ingestion/source/sql/mysql.py +0 -1
  128. datahub/ingestion/source/sql/oracle.py +1 -1
  129. datahub/ingestion/source/sql/postgres.py +0 -1
  130. datahub/ingestion/source/sql/sql_common.py +121 -34
  131. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  132. datahub/ingestion/source/sql/teradata.py +997 -235
  133. datahub/ingestion/source/sql/vertica.py +10 -6
  134. datahub/ingestion/source/sql_queries.py +2 -2
  135. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  136. datahub/ingestion/source/superset.py +58 -3
  137. datahub/ingestion/source/tableau/tableau.py +58 -37
  138. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  139. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  140. datahub/ingestion/source/unity/config.py +5 -0
  141. datahub/ingestion/source/unity/proxy.py +118 -0
  142. datahub/ingestion/source/unity/source.py +195 -17
  143. datahub/ingestion/source/unity/tag_entities.py +295 -0
  144. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  145. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  146. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  147. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  148. datahub/metadata/_internal_schema_classes.py +1433 -546
  149. datahub/metadata/_urns/urn_defs.py +1826 -1658
  150. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  151. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  157. datahub/metadata/schema.avsc +17736 -17112
  158. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  159. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  160. datahub/metadata/schemas/Applications.avsc +38 -0
  161. datahub/metadata/schemas/ChartKey.avsc +1 -0
  162. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  164. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  165. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  166. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  167. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  168. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  169. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  170. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  171. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  172. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  173. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  176. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  177. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  178. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  179. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  180. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  181. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  182. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  183. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  184. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  185. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  186. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  187. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  188. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  189. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  190. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  191. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  192. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  193. datahub/metadata/schemas/__init__.py +3 -3
  194. datahub/sdk/__init__.py +2 -0
  195. datahub/sdk/_all_entities.py +7 -0
  196. datahub/sdk/_shared.py +116 -0
  197. datahub/sdk/chart.py +315 -0
  198. datahub/sdk/container.py +7 -0
  199. datahub/sdk/dashboard.py +432 -0
  200. datahub/sdk/dataflow.py +7 -0
  201. datahub/sdk/datajob.py +45 -13
  202. datahub/sdk/dataset.py +8 -2
  203. datahub/sdk/entity_client.py +82 -2
  204. datahub/sdk/lineage_client.py +683 -82
  205. datahub/sdk/main_client.py +46 -16
  206. datahub/sdk/mlmodel.py +101 -38
  207. datahub/sdk/mlmodelgroup.py +7 -0
  208. datahub/sdk/search_client.py +4 -3
  209. datahub/specific/chart.py +1 -1
  210. datahub/specific/dataproduct.py +4 -0
  211. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  212. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  213. datahub/telemetry/telemetry.py +17 -11
  214. datahub/testing/sdk_v2_helpers.py +7 -1
  215. datahub/upgrade/upgrade.py +46 -13
  216. datahub/utilities/server_config_util.py +8 -0
  217. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  218. datahub/utilities/stats_collections.py +4 -0
  219. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +0 -0
  220. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/data_lake_common/object_store.py

@@ -1,3 +1,4 @@
+import re
 from abc import ABC, abstractmethod
 
 # Add imports for source customization

@@ -236,42 +237,76 @@ class ABSObjectStore(ObjectStoreInterface):
     """Implementation of ObjectStoreInterface for Azure Blob Storage."""
 
     PREFIX = "abfss://"
+    HTTPS_REGEX = re.compile(r"(https?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)")
 
     @classmethod
     def is_uri(cls, uri: str) -> bool:
-        return uri.startswith(cls.PREFIX)
+        return uri.startswith(cls.PREFIX) or bool(cls.HTTPS_REGEX.match(uri))
 
     @classmethod
     def get_prefix(cls, uri: str) -> Optional[str]:
         if uri.startswith(cls.PREFIX):
             return cls.PREFIX
+
+        # Check for HTTPS format
+        match = cls.HTTPS_REGEX.match(uri)
+        if match:
+            return match.group(1)
+
         return None
 
     @classmethod
     def strip_prefix(cls, uri: str) -> str:
-        prefix = cls.get_prefix(uri)
-        if not prefix:
-            raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
-        return uri[len(prefix) :]
+        if uri.startswith(cls.PREFIX):
+            return uri[len(cls.PREFIX) :]
+
+        # Handle HTTPS format
+        match = cls.HTTPS_REGEX.match(uri)
+        if match:
+            return uri[len(match.group(1)) :]
+
+        raise ValueError(
+            f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+        )
 
     @classmethod
     def get_bucket_name(cls, uri: str) -> str:
         if not cls.is_uri(uri):
-            raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
-        return cls.strip_prefix(uri).split("@")[0]
+            raise ValueError(
+                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+            )
+
+        if uri.startswith(cls.PREFIX):
+            # abfss://container@account.dfs.core.windows.net/path
+            return cls.strip_prefix(uri).split("@")[0]
+        else:
+            # https://account.blob.core.windows.net/container/path
+            return cls.strip_prefix(uri).split("/")[0]
 
     @classmethod
     def get_object_key(cls, uri: str) -> str:
         if not cls.is_uri(uri):
-            raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
-        parts = cls.strip_prefix(uri).split("@", 1)
-        if len(parts) < 2:
-            return ""
-        account_path = parts[1]
-        path_parts = account_path.split("/", 1)
-        if len(path_parts) < 2:
-            return ""
-        return path_parts[1]
+            raise ValueError(
+                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+            )
+
+        if uri.startswith(cls.PREFIX):
+            # abfss://container@account.dfs.core.windows.net/path
+            parts = cls.strip_prefix(uri).split("@", 1)
+            if len(parts) < 2:
+                return ""
+            account_path = parts[1]
+            path_parts = account_path.split("/", 1)
+            if len(path_parts) < 2:
+                return ""
+            return path_parts[1]
+        else:
+            # https://account.blob.core.windows.net/container/path
+            stripped = cls.strip_prefix(uri)
+            parts = stripped.split("/", 1)
+            if len(parts) < 2:
+                return ""
+            return parts[1]
 
 
 # Registry of all object store implementations
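
A minimal usage sketch of the updated ABSObjectStore helpers (illustrative values, not part of the package) showing how the two supported URI styles are parsed:

# Sketch only: mirrors the behavior of the diff above.
from datahub.ingestion.source.data_lake_common.object_store import ABSObjectStore

abfss_uri = "abfss://mycontainer@myaccount.dfs.core.windows.net/data/file.parquet"
https_uri = "https://myaccount.blob.core.windows.net/mycontainer/data/file.parquet"

assert ABSObjectStore.is_uri(abfss_uri) and ABSObjectStore.is_uri(https_uri)
# abfss://<container>@<account>...: the container precedes the "@"
assert ABSObjectStore.get_bucket_name(abfss_uri) == "mycontainer"
# https://<account>.blob.core.windows.net/<container>/...: the container is the first path segment
assert ABSObjectStore.get_bucket_name(https_uri) == "mycontainer"
assert ABSObjectStore.get_object_key(abfss_uri) == "data/file.parquet"
assert ABSObjectStore.get_object_key(https_uri) == "data/file.parquet"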
@@ -331,6 +366,12 @@ def get_object_store_bucket_name(uri: str) -> str:
         return uri[prefix_length:].split("/")[0]
     elif uri.startswith(ABSObjectStore.PREFIX):
         return uri[len(ABSObjectStore.PREFIX) :].split("@")[0]
+    elif ABSObjectStore.HTTPS_REGEX.match(uri):
+        # Handle HTTPS Azure Blob Storage URLs
+        match = ABSObjectStore.HTTPS_REGEX.match(uri)
+        if match:
+            stripped = uri[len(match.group(1)) :]
+            return stripped.split("/")[0]
 
     raise ValueError(f"Unsupported URI format: {uri}")
 
@@ -470,18 +511,25 @@ class ObjectStoreSourceAdapter:
         if not ABSObjectStore.is_uri(table_data.table_path):
             return None
 
-        # Parse the ABS URI
         try:
-            # URI format: abfss://container@account.dfs.core.windows.net/path
-            path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
-            parts = path_without_prefix.split("@", 1)
-            if len(parts) < 2:
-                return None
-
-            container_name = parts[0]
-            account_parts = parts[1].split("/", 1)
-            account_domain = account_parts[0]
-            account_name = account_domain.split(".")[0]
+            if table_data.table_path.startswith("abfss://"):
+                # URI format: abfss://container@account.dfs.core.windows.net/path
+                path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
+                parts = path_without_prefix.split("@", 1)
+                if len(parts) < 2:
+                    return None
+
+                container_name = parts[0]
+                account_parts = parts[1].split("/", 1)
+                account_domain = account_parts[0]
+                account_name = account_domain.split(".")[0]
+            else:
+                # Handle HTTPS format: https://account.blob.core.windows.net/container/path
+                container_name = ABSObjectStore.get_bucket_name(table_data.table_path)
+                if "blob.core.windows.net" in table_data.table_path:
+                    account_name = table_data.table_path.split("//")[1].split(".")[0]
+                else:
+                    return None
 
             # Construct Azure portal URL
             return f"https://portal.azure.com/#blade/Microsoft_Azure_Storage/ContainerMenuBlade/overview/storageAccountId/{account_name}/containerName/{container_name}"
@@ -519,6 +567,13 @@ class ObjectStoreSourceAdapter:
                 "get_external_url",
                 lambda table_data: self.get_gcs_external_url(table_data),
             )
+            # Fix URI mismatch issue in pattern matching
+            self.register_customization(
+                "_normalize_uri_for_pattern_matching",
+                self._normalize_gcs_uri_for_pattern_matching,
+            )
+            # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
+            self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
         elif platform == "s3":
             self.register_customization("is_s3_platform", lambda: True)
             self.register_customization("create_s3_path", self.create_s3_path)
@@ -612,6 +667,39 @@ class ObjectStoreSourceAdapter:
             return self.get_abs_external_url(table_data)
         return None
 
+    def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+        """
+        Normalize GCS URI for pattern matching.
+
+        This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+        fixing the URI mismatch issue in GCS ingestion.
+
+        Args:
+            uri: The URI to normalize
+
+        Returns:
+            The normalized URI for pattern matching
+        """
+        if uri.startswith("gs://"):
+            return uri.replace("gs://", "s3://", 1)
+        return uri
+
+    def _strip_gcs_prefix(self, uri: str) -> str:
+        """
+        Strip GCS prefix from URI.
+
+        This method removes the gs:// prefix from GCS URIs for path processing.
+
+        Args:
+            uri: The URI to strip the prefix from
+
+        Returns:
+            The URI without the gs:// prefix
+        """
+        if uri.startswith("gs://"):
+            return uri[5:]  # Remove "gs://" prefix
+        return uri
+
 
 # Factory function to create an adapter for a specific platform
 def create_object_store_adapter(
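
These two helpers exist so that path_spec patterns written against s3://-style paths keep matching GCS inputs. A rough usage sketch (the "gcs" platform string passed to the factory is an assumption based on the surrounding code):

# Sketch only; assumes create_object_store_adapter("gcs") returns an adapter with the
# GCS customizations above registered.
adapter = create_object_store_adapter("gcs")
uri = "gs://my-bucket/events/2024/01/data.parquet"
# gs:// is rewritten to s3:// purely for pattern matching...
assert adapter._normalize_gcs_uri_for_pattern_matching(uri) == "s3://my-bucket/events/2024/01/data.parquet"
# ...and stripped entirely when extracting the bucket-relative path.
assert adapter._strip_gcs_prefix(uri) == "my-bucket/events/2024/01/data.parquet"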
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -166,7 +166,6 @@ class PathSpec(ConfigModel):
         return False
 
     def allowed(self, path: str, ignore_ext: bool = False) -> bool:
-        logger.debug(f"Checking file to inclusion: {path}")
         if self.is_path_hidden(path) and not self.include_hidden_folders:
             return False
 
@@ -174,19 +173,17 @@
             self.glob_include, flags=pathlib.GLOBSTAR
         ):
             return False
-        logger.debug(f"{path} matched include ")
+
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path).globmatch(
                     exclude_path, flags=pathlib.GLOBSTAR
                 ):
                     return False
-            logger.debug(f"{path} is not excluded")
 
         table_name, _ = self.extract_table_name_and_path(path)
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
 
         ext = os.path.splitext(path)[1].strip(".")
 
@@ -196,8 +193,6 @@
         ):
             return False
 
-        logger.debug(f"{path} had selected extension {ext}")
-        logger.debug(f"{path} allowed for dataset creation")
         return True
 
     def dir_allowed(self, path: str) -> bool:
@@ -219,10 +214,8 @@
         for _ in range(slash_to_remove_from_glob):
             glob_include = glob_include.rsplit("/", 1)[0]
 
-        logger.debug(f"Checking dir to inclusion: {path}")
         if not pathlib.PurePath(path).globmatch(glob_include, flags=pathlib.GLOBSTAR):
             return False
-        logger.debug(f"{path} matched include ")
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path.rstrip("/")).globmatch(
@@ -236,7 +229,7 @@
             )
             if not self.tables_filter_pattern.allowed(table_name):
                 return False
-            logger.debug(f"{path} is passed table name check")
+            # logger.debug(f"{path} is passed table name check")
 
         return True
 
@@ -246,10 +239,10 @@
         if parsable_include.endswith("/{table}/**"):
             # Remove the last two characters to make it parsable if it ends with {table}/** which marks autodetect partition
             parsable_include = parsable_include[:-2]
-        else:
-            # Replace all * with {folder[i]} to make it parsable
-            for i in range(parsable_include.count("*")):
-                parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
+
+        # Replace all * with {folder[i]} to make it parsable
+        for i in range(parsable_include.count("*")):
+            parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
         return parsable_include
 
     def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]:
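
Previously the wildcard substitution sat in an else branch, so an include ending in {table}/** never had its other * wildcards converted; now the substitution always runs. A standalone mirror of the updated logic (illustrative, not the package code):

def get_parsable_include(include: str) -> str:
    parsable = include
    if parsable.endswith("/{table}/**"):
        parsable = parsable[:-2]  # keep "{table}/" so autodetected partitions still parse
    for i in range(parsable.count("*")):
        parsable = parsable.replace("*", f"{{folder[{i}]}}", 1)
    return parsable

# Wildcards ahead of a trailing {table}/** are now converted too:
assert get_parsable_include("s3://bucket/*/{table}/**") == "s3://bucket/{folder[0]}/{table}/"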
@@ -330,8 +323,6 @@
         if "{table}" in values["include"]:
             v = "{table}"
         else:
-            logger.debug(f"include fields: {compiled_include.named_fields}")
-            logger.debug(f"table_name fields: {parse.compile(v).named_fields}")
             if not all(
                 x in compiled_include.named_fields
                 for x in parse.compile(v).named_fields
@@ -356,9 +347,7 @@
     @cached_property
     def compiled_include(self):
         parsable_include = PathSpec.get_parsable_include(self.include)
-        logger.debug(f"parsable_include: {parsable_include}")
         compiled_include = parse.compile(parsable_include)
-        logger.debug(f"Setting compiled_include: {compiled_include}")
         return compiled_include
 
     @cached_property
@@ -366,9 +355,8 @@
         parsable_folder_include = PathSpec.get_parsable_include(self.include).rsplit(
             "/", 1
         )[0]
-        logger.debug(f"parsable_folder_include: {parsable_folder_include}")
         compiled_folder_include = parse.compile(parsable_folder_include)
-        logger.debug(f"Setting compiled_folder_include: {compiled_folder_include}")
+
         return compiled_folder_include
 
     @cached_property
@@ -376,7 +364,8 @@
         # Regular expression to find all substrings enclosed in {}
         pattern = r"\{(.*?)\}"
         # Find all matches
-        matches = re.findall(pattern, self.include.split("{table}/")[1])
+        split_parts = self.include.split("{table}/")
+        matches = re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []
        return matches
 
     def get_partition_from_path(self, path: str) -> Optional[List[Tuple[str, str]]]:
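
The old code indexed self.include.split("{table}/")[1] unconditionally, which raised an IndexError for include paths without a {table}/ segment; the new code falls back to an empty list. An illustrative check of the same guard (standalone sketch, invented paths):

import re

def partition_keys(include: str) -> list:
    pattern = r"\{(.*?)\}"
    split_parts = include.split("{table}/")
    return re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []

assert partition_keys("s3://bucket/{table}/{year}/{month}/*.parquet") == ["year", "month"]
assert partition_keys("s3://bucket/data/*.parquet") == []  # previously raised IndexError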
@@ -563,7 +552,7 @@
                             f"{{{template_key}}}", var[key]
                         )
                 else:
-                    partition_format.replace(f"{{{var_key}}}", var)
+                    partition_format = partition_format.replace(f"{{{var_key}}}", var)
         return datetime.datetime.strptime(partition_format, datetime_format).replace(
             tzinfo=datetime.timezone.utc
         )
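
This is a genuine bug fix: str.replace returns a new string and leaves the original untouched, so the old call discarded its result and non-dict partition values were never substituted into partition_format. A two-line illustration:

partition_format = "{year}-{month}"
partition_format.replace("{year}", "2024")                      # result discarded; no effect
assert partition_format == "{year}-{month}"
partition_format = partition_format.replace("{year}", "2024")   # the fix: reassign
assert partition_format == "2024-{month}"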
datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -12,7 +12,7 @@ from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.report import DataHubSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
-from datahub.metadata.schema_classes import ChangeTypeClass, SystemMetadataClass
+from datahub.metadata.schema_classes import SystemMetadataClass
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 
 logger = logging.getLogger(__name__)
@@ -374,7 +374,6 @@ class DataHubDatabaseReader:
                 entityUrn=row["urn"],
                 aspect=ASPECT_MAP[row["aspect"]].from_obj(json_aspect),
                 systemMetadata=system_metadata,
-                changeType=ChangeTypeClass.UPSERT,
             )
         except Exception as e:
             logger.warning(
datahub/ingestion/source/dbt/dbt_cloud.py

@@ -9,7 +9,9 @@ import requests
 from pydantic import Field, root_validator
 
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -24,6 +26,7 @@ from datahub.ingestion.source.dbt.dbt_common import (
     DBTCommonConfig,
     DBTNode,
     DBTSourceBase,
+    DBTSourceReport,
 )
 from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 
@@ -261,8 +264,10 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig
+    report: DBTSourceReport  # nothing cloud-specific in the report
 
     @classmethod
     def create(cls, config_dict, ctx):
@@ -401,8 +406,11 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
             if node["resourceType"] in {"model", "seed", "snapshot"}:
                 status = node["status"]
                 if status is None and materialization != "ephemeral":
-                    self.report.report_warning(
-                        key, "node is missing a status, schema metadata will be incomplete"
+                    self.report.warning(
+                        title="Schema information may be incomplete",
+                        message="Some nodes are missing the `status` field, which dbt uses to track the status of the node in the target database.",
+                        context=key,
+                        log=False,
                     )
 
         # The code fields are new in dbt 1.3, and replace the sql ones.
datahub/ingestion/source/dbt/dbt_common.py

@@ -355,7 +355,7 @@ class DBTCommonConfig(
     # override default value to True.
     incremental_lineage: bool = Field(
         default=True,
-        description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run.",
+        description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run. This would also require enabling 'incremental_lineage' in the counterpart warehouse ingestion (_e.g._ BigQuery, Redshift, etc).",
     )
 
     _remove_use_compiled_code = pydantic_removed_field("use_compiled_code")
@@ -823,7 +823,9 @@ def get_column_type(
 @platform_name("dbt")
 @config_class(DBTCommonConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE,
@@ -1334,6 +1336,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             self.config.tag_prefix,
             "SOURCE_CONTROL",
             self.config.strip_user_ids_from_email,
+            match_nested_props=True,
         )
 
         action_processor_tag = OperationProcessor(
@@ -1705,6 +1708,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             self.config.tag_prefix,
             "SOURCE_CONTROL",
             self.config.strip_user_ids_from_email,
+            match_nested_props=True,
         )
 
         canonical_schema: List[SchemaField] = []
datahub/ingestion/source/dbt/dbt_core.py

@@ -15,7 +15,9 @@ from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -464,6 +466,7 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
     report: DBTCoreReport