acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (226)
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +2 -1
  7. datahub/api/entities/external/__init__.py +0 -0
  8. datahub/api/entities/external/external_entities.py +239 -0
  9. datahub/api/entities/external/external_tag.py +145 -0
  10. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  11. datahub/api/entities/external/restricted_text.py +247 -0
  12. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  13. datahub/cli/check_cli.py +88 -7
  14. datahub/cli/cli_utils.py +63 -0
  15. datahub/cli/container_cli.py +5 -0
  16. datahub/cli/delete_cli.py +124 -27
  17. datahub/cli/docker_check.py +107 -12
  18. datahub/cli/docker_cli.py +149 -227
  19. datahub/cli/exists_cli.py +0 -2
  20. datahub/cli/get_cli.py +0 -2
  21. datahub/cli/iceberg_cli.py +5 -0
  22. datahub/cli/ingest_cli.py +12 -16
  23. datahub/cli/migrate.py +2 -0
  24. datahub/cli/put_cli.py +1 -4
  25. datahub/cli/quickstart_versioning.py +50 -7
  26. datahub/cli/specific/assertions_cli.py +0 -4
  27. datahub/cli/specific/datacontract_cli.py +0 -3
  28. datahub/cli/specific/dataproduct_cli.py +0 -11
  29. datahub/cli/specific/dataset_cli.py +1 -8
  30. datahub/cli/specific/forms_cli.py +0 -4
  31. datahub/cli/specific/group_cli.py +0 -2
  32. datahub/cli/specific/structuredproperties_cli.py +1 -4
  33. datahub/cli/specific/user_cli.py +0 -2
  34. datahub/cli/state_cli.py +0 -2
  35. datahub/cli/timeline_cli.py +0 -2
  36. datahub/emitter/response_helper.py +86 -1
  37. datahub/emitter/rest_emitter.py +71 -13
  38. datahub/entrypoints.py +4 -3
  39. datahub/ingestion/api/decorators.py +15 -3
  40. datahub/ingestion/api/report.py +332 -3
  41. datahub/ingestion/api/sink.py +3 -0
  42. datahub/ingestion/api/source.py +48 -44
  43. datahub/ingestion/autogenerated/__init__.py +0 -0
  44. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  45. datahub/ingestion/autogenerated/lineage.json +401 -0
  46. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  47. datahub/ingestion/extractor/schema_util.py +13 -4
  48. datahub/ingestion/glossary/classification_mixin.py +5 -0
  49. datahub/ingestion/graph/client.py +100 -15
  50. datahub/ingestion/graph/config.py +1 -0
  51. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  52. datahub/ingestion/run/pipeline.py +54 -2
  53. datahub/ingestion/sink/datahub_rest.py +13 -0
  54. datahub/ingestion/source/abs/source.py +1 -1
  55. datahub/ingestion/source/aws/aws_common.py +4 -0
  56. datahub/ingestion/source/aws/glue.py +489 -244
  57. datahub/ingestion/source/aws/tag_entities.py +292 -0
  58. datahub/ingestion/source/azure/azure_common.py +2 -2
  59. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  60. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  61. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  62. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  63. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  64. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  65. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  66. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  67. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  68. datahub/ingestion/source/common/subtypes.py +45 -0
  69. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  70. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  71. datahub/ingestion/source/datahub/config.py +11 -0
  72. datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
  73. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  74. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  75. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  76. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  77. datahub/ingestion/source/debug/__init__.py +0 -0
  78. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  79. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  80. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  81. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  82. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  83. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  84. datahub/ingestion/source/file.py +3 -0
  85. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  86. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  87. datahub/ingestion/source/ge_data_profiler.py +76 -28
  88. datahub/ingestion/source/ge_profiling_config.py +11 -0
  89. datahub/ingestion/source/hex/api.py +26 -1
  90. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  91. datahub/ingestion/source/identity/azure_ad.py +1 -1
  92. datahub/ingestion/source/identity/okta.py +1 -14
  93. datahub/ingestion/source/kafka/kafka.py +16 -0
  94. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  95. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  96. datahub/ingestion/source/looker/looker_source.py +1 -0
  97. datahub/ingestion/source/mlflow.py +11 -1
  98. datahub/ingestion/source/mock_data/__init__.py +0 -0
  99. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  100. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  101. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  102. datahub/ingestion/source/nifi.py +1 -1
  103. datahub/ingestion/source/openapi.py +12 -0
  104. datahub/ingestion/source/openapi_parser.py +56 -37
  105. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  106. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  108. datahub/ingestion/source/preset.py +2 -2
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  110. datahub/ingestion/source/redshift/redshift.py +21 -1
  111. datahub/ingestion/source/redshift/usage.py +4 -3
  112. datahub/ingestion/source/s3/report.py +4 -2
  113. datahub/ingestion/source/s3/source.py +367 -115
  114. datahub/ingestion/source/sac/sac.py +3 -1
  115. datahub/ingestion/source/salesforce.py +6 -3
  116. datahub/ingestion/source/sigma/sigma.py +7 -1
  117. datahub/ingestion/source/slack/slack.py +2 -1
  118. datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
  119. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  120. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  121. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  122. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  123. datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
  124. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  125. datahub/ingestion/source/sql/athena.py +119 -11
  126. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  127. datahub/ingestion/source/sql/clickhouse.py +3 -1
  128. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  129. datahub/ingestion/source/sql/hana.py +3 -1
  130. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  131. datahub/ingestion/source/sql/mariadb.py +0 -1
  132. datahub/ingestion/source/sql/mssql/source.py +239 -34
  133. datahub/ingestion/source/sql/mysql.py +0 -1
  134. datahub/ingestion/source/sql/oracle.py +1 -1
  135. datahub/ingestion/source/sql/postgres.py +0 -1
  136. datahub/ingestion/source/sql/sql_common.py +121 -34
  137. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  138. datahub/ingestion/source/sql/teradata.py +997 -235
  139. datahub/ingestion/source/sql/vertica.py +10 -6
  140. datahub/ingestion/source/sql_queries.py +2 -2
  141. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  142. datahub/ingestion/source/superset.py +58 -3
  143. datahub/ingestion/source/tableau/tableau.py +58 -37
  144. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  145. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  146. datahub/ingestion/source/unity/config.py +5 -0
  147. datahub/ingestion/source/unity/proxy.py +118 -0
  148. datahub/ingestion/source/unity/source.py +195 -17
  149. datahub/ingestion/source/unity/tag_entities.py +295 -0
  150. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  151. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  152. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  153. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  154. datahub/metadata/_internal_schema_classes.py +1446 -559
  155. datahub/metadata/_urns/urn_defs.py +1721 -1553
  156. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  158. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  159. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  160. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  161. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  162. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  163. datahub/metadata/schema.avsc +18055 -17802
  164. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  165. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  166. datahub/metadata/schemas/Applications.avsc +38 -0
  167. datahub/metadata/schemas/ChartKey.avsc +1 -0
  168. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  169. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  170. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  171. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  172. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  175. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  176. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  177. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  178. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  179. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  180. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  181. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  182. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  183. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  184. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  185. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  186. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  187. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  188. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  189. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  190. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  191. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  192. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  193. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  194. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  195. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  196. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  197. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  198. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  199. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  200. datahub/sdk/__init__.py +6 -0
  201. datahub/sdk/_all_entities.py +11 -0
  202. datahub/sdk/_shared.py +118 -1
  203. datahub/sdk/chart.py +315 -0
  204. datahub/sdk/container.py +7 -0
  205. datahub/sdk/dashboard.py +432 -0
  206. datahub/sdk/dataflow.py +309 -0
  207. datahub/sdk/datajob.py +367 -0
  208. datahub/sdk/dataset.py +8 -2
  209. datahub/sdk/entity_client.py +90 -2
  210. datahub/sdk/lineage_client.py +683 -82
  211. datahub/sdk/main_client.py +46 -16
  212. datahub/sdk/mlmodel.py +101 -38
  213. datahub/sdk/mlmodelgroup.py +7 -0
  214. datahub/sdk/search_client.py +4 -3
  215. datahub/specific/chart.py +1 -1
  216. datahub/specific/dataproduct.py +4 -0
  217. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  218. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  219. datahub/telemetry/telemetry.py +17 -11
  220. datahub/testing/sdk_v2_helpers.py +7 -1
  221. datahub/upgrade/upgrade.py +46 -13
  222. datahub/utilities/server_config_util.py +8 -0
  223. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  224. datahub/utilities/stats_collections.py +4 -0
  225. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  226. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/data_lake_common/object_store.py

@@ -1,3 +1,4 @@
+import re
 from abc import ABC, abstractmethod
 
 # Add imports for source customization
@@ -236,42 +237,76 @@ class ABSObjectStore(ObjectStoreInterface):
     """Implementation of ObjectStoreInterface for Azure Blob Storage."""
 
     PREFIX = "abfss://"
+    HTTPS_REGEX = re.compile(r"(https?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)")
 
     @classmethod
     def is_uri(cls, uri: str) -> bool:
-        return uri.startswith(cls.PREFIX)
+        return uri.startswith(cls.PREFIX) or bool(cls.HTTPS_REGEX.match(uri))
 
     @classmethod
     def get_prefix(cls, uri: str) -> Optional[str]:
         if uri.startswith(cls.PREFIX):
             return cls.PREFIX
+
+        # Check for HTTPS format
+        match = cls.HTTPS_REGEX.match(uri)
+        if match:
+            return match.group(1)
+
         return None
 
     @classmethod
     def strip_prefix(cls, uri: str) -> str:
-        prefix = cls.get_prefix(uri)
-        if not prefix:
-            raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
-        return uri[len(prefix) :]
+        if uri.startswith(cls.PREFIX):
+            return uri[len(cls.PREFIX) :]
+
+        # Handle HTTPS format
+        match = cls.HTTPS_REGEX.match(uri)
+        if match:
+            return uri[len(match.group(1)) :]
+
+        raise ValueError(
+            f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+        )
 
     @classmethod
     def get_bucket_name(cls, uri: str) -> str:
         if not cls.is_uri(uri):
-            raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
-        return cls.strip_prefix(uri).split("@")[0]
+            raise ValueError(
+                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+            )
+
+        if uri.startswith(cls.PREFIX):
+            # abfss://container@account.dfs.core.windows.net/path
+            return cls.strip_prefix(uri).split("@")[0]
+        else:
+            # https://account.blob.core.windows.net/container/path
+            return cls.strip_prefix(uri).split("/")[0]
 
     @classmethod
     def get_object_key(cls, uri: str) -> str:
         if not cls.is_uri(uri):
-            raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
-        parts = cls.strip_prefix(uri).split("@", 1)
-        if len(parts) < 2:
-            return ""
-        account_path = parts[1]
-        path_parts = account_path.split("/", 1)
-        if len(path_parts) < 2:
-            return ""
-        return path_parts[1]
+            raise ValueError(
+                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+            )
+
+        if uri.startswith(cls.PREFIX):
+            # abfss://container@account.dfs.core.windows.net/path
+            parts = cls.strip_prefix(uri).split("@", 1)
+            if len(parts) < 2:
+                return ""
+            account_path = parts[1]
+            path_parts = account_path.split("/", 1)
+            if len(path_parts) < 2:
+                return ""
+            return path_parts[1]
+        else:
+            # https://account.blob.core.windows.net/container/path
+            stripped = cls.strip_prefix(uri)
+            parts = stripped.split("/", 1)
+            if len(parts) < 2:
+                return ""
+            return parts[1]
 
 
 # Registry of all object store implementations
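
For orientation, a short sketch of how the extended ABSObjectStore is expected to behave on both URI styles, per the hunk above. The account and container names are illustrative; the class lives in datahub/ingestion/source/data_lake_common/object_store.py.

from datahub.ingestion.source.data_lake_common.object_store import ABSObjectStore

# Existing abfss:// form: the container precedes the "@", the object key follows the host
ABSObjectStore.get_bucket_name("abfss://data@myacct.dfs.core.windows.net/raw/events.csv")   # -> "data"
ABSObjectStore.get_object_key("abfss://data@myacct.dfs.core.windows.net/raw/events.csv")    # -> "raw/events.csv"

# New https:// form: the container is the first path segment after the host
ABSObjectStore.is_uri("https://myacct.blob.core.windows.net/data/raw/events.csv")           # -> True
ABSObjectStore.get_bucket_name("https://myacct.blob.core.windows.net/data/raw/events.csv")  # -> "data"
ABSObjectStore.get_object_key("https://myacct.blob.core.windows.net/data/raw/events.csv")   # -> "raw/events.csv"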
@@ -331,6 +366,12 @@ def get_object_store_bucket_name(uri: str) -> str:
         return uri[prefix_length:].split("/")[0]
     elif uri.startswith(ABSObjectStore.PREFIX):
         return uri[len(ABSObjectStore.PREFIX) :].split("@")[0]
+    elif ABSObjectStore.HTTPS_REGEX.match(uri):
+        # Handle HTTPS Azure Blob Storage URLs
+        match = ABSObjectStore.HTTPS_REGEX.match(uri)
+        if match:
+            stripped = uri[len(match.group(1)) :]
+            return stripped.split("/")[0]
 
     raise ValueError(f"Unsupported URI format: {uri}")
 
@@ -470,18 +511,25 @@ class ObjectStoreSourceAdapter:
         if not ABSObjectStore.is_uri(table_data.table_path):
             return None
 
-        # Parse the ABS URI
         try:
-            # URI format: abfss://container@account.dfs.core.windows.net/path
-            path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
-            parts = path_without_prefix.split("@", 1)
-            if len(parts) < 2:
-                return None
-
-            container_name = parts[0]
-            account_parts = parts[1].split("/", 1)
-            account_domain = account_parts[0]
-            account_name = account_domain.split(".")[0]
+            if table_data.table_path.startswith("abfss://"):
+                # URI format: abfss://container@account.dfs.core.windows.net/path
+                path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
+                parts = path_without_prefix.split("@", 1)
+                if len(parts) < 2:
+                    return None
+
+                container_name = parts[0]
+                account_parts = parts[1].split("/", 1)
+                account_domain = account_parts[0]
+                account_name = account_domain.split(".")[0]
+            else:
+                # Handle HTTPS format: https://account.blob.core.windows.net/container/path
+                container_name = ABSObjectStore.get_bucket_name(table_data.table_path)
+                if "blob.core.windows.net" in table_data.table_path:
+                    account_name = table_data.table_path.split("//")[1].split(".")[0]
+                else:
+                    return None
 
             # Construct Azure portal URL
             return f"https://portal.azure.com/#blade/Microsoft_Azure_Storage/ContainerMenuBlade/overview/storageAccountId/{account_name}/containerName/{container_name}"
@@ -519,6 +567,13 @@ class ObjectStoreSourceAdapter:
                 "get_external_url",
                 lambda table_data: self.get_gcs_external_url(table_data),
             )
+            # Fix URI mismatch issue in pattern matching
+            self.register_customization(
+                "_normalize_uri_for_pattern_matching",
+                self._normalize_gcs_uri_for_pattern_matching,
+            )
+            # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
+            self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
         elif platform == "s3":
             self.register_customization("is_s3_platform", lambda: True)
             self.register_customization("create_s3_path", self.create_s3_path)
@@ -612,6 +667,39 @@ class ObjectStoreSourceAdapter:
             return self.get_abs_external_url(table_data)
         return None
 
+    def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+        """
+        Normalize GCS URI for pattern matching.
+
+        This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+        fixing the URI mismatch issue in GCS ingestion.
+
+        Args:
+            uri: The URI to normalize
+
+        Returns:
+            The normalized URI for pattern matching
+        """
+        if uri.startswith("gs://"):
+            return uri.replace("gs://", "s3://", 1)
+        return uri
+
+    def _strip_gcs_prefix(self, uri: str) -> str:
+        """
+        Strip GCS prefix from URI.
+
+        This method removes the gs:// prefix from GCS URIs for path processing.
+
+        Args:
+            uri: The URI to strip the prefix from
+
+        Returns:
+            The URI without the gs:// prefix
+        """
+        if uri.startswith("gs://"):
+            return uri[5:]  # Remove "gs://" prefix
+        return uri
+
 
 # Factory function to create an adapter for a specific platform
 def create_object_store_adapter(
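
The two GCS helpers added above are plain string transforms. A standalone sketch of the same logic (functions copied out of the adapter for illustration; the bucket and object names are made up):

def normalize_gcs_uri_for_pattern_matching(uri: str) -> str:
    # gs:// URIs are rewritten to s3:// so that S3-style path_spec patterns still match
    if uri.startswith("gs://"):
        return uri.replace("gs://", "s3://", 1)
    return uri


def strip_gcs_prefix(uri: str) -> str:
    # Drop the scheme so the remainder is "bucket/key"
    if uri.startswith("gs://"):
        return uri[len("gs://"):]
    return uri


assert normalize_gcs_uri_for_pattern_matching("gs://my-bucket/data/part-0.parquet") == "s3://my-bucket/data/part-0.parquet"
assert strip_gcs_prefix("gs://my-bucket/data/part-0.parquet") == "my-bucket/data/part-0.parquet"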
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -166,7 +166,6 @@ class PathSpec(ConfigModel):
         return False
 
     def allowed(self, path: str, ignore_ext: bool = False) -> bool:
-        logger.debug(f"Checking file to inclusion: {path}")
         if self.is_path_hidden(path) and not self.include_hidden_folders:
             return False
 
@@ -174,19 +173,17 @@ class PathSpec(ConfigModel):
             self.glob_include, flags=pathlib.GLOBSTAR
         ):
             return False
-        logger.debug(f"{path} matched include ")
+
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path).globmatch(
                     exclude_path, flags=pathlib.GLOBSTAR
                 ):
                     return False
-            logger.debug(f"{path} is not excluded")
 
         table_name, _ = self.extract_table_name_and_path(path)
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
 
         ext = os.path.splitext(path)[1].strip(".")
 
@@ -196,8 +193,6 @@ class PathSpec(ConfigModel):
         ):
             return False
 
-        logger.debug(f"{path} had selected extension {ext}")
-        logger.debug(f"{path} allowed for dataset creation")
         return True
 
     def dir_allowed(self, path: str) -> bool:
@@ -219,10 +214,8 @@ class PathSpec(ConfigModel):
         for _ in range(slash_to_remove_from_glob):
             glob_include = glob_include.rsplit("/", 1)[0]
 
-        logger.debug(f"Checking dir to inclusion: {path}")
        if not pathlib.PurePath(path).globmatch(glob_include, flags=pathlib.GLOBSTAR):
             return False
-        logger.debug(f"{path} matched include ")
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path.rstrip("/")).globmatch(
@@ -236,7 +229,7 @@ class PathSpec(ConfigModel):
             )
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
+        # logger.debug(f"{path} is passed table name check")
 
         return True
 
@@ -246,10 +239,10 @@ class PathSpec(ConfigModel):
         if parsable_include.endswith("/{table}/**"):
             # Remove the last two characters to make it parsable if it ends with {table}/** which marks autodetect partition
             parsable_include = parsable_include[:-2]
-        else:
-            # Replace all * with {folder[i]} to make it parsable
-            for i in range(parsable_include.count("*")):
-                parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
+
+        # Replace all * with {folder[i]} to make it parsable
+        for i in range(parsable_include.count("*")):
+            parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
         return parsable_include
 
     def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]:
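
Previously the wildcard substitution only ran when the include did not end in /{table}/**; it now runs in both branches. A standalone sketch of the reworked helper (logic copied from the hunk above; the include path is illustrative):

def get_parsable_include(include: str) -> str:
    parsable_include = include
    if parsable_include.endswith("/{table}/**"):
        # Trim the trailing "**" that marks partition autodetection
        parsable_include = parsable_include[:-2]
    # Wildcards are now templated in both branches
    for i in range(parsable_include.count("*")):
        parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
    return parsable_include


print(get_parsable_include("s3://bucket/*/logs/{table}/**"))
# s3://bucket/{folder[0]}/logs/{table}/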
@@ -330,8 +323,6 @@ class PathSpec(ConfigModel):
         if "{table}" in values["include"]:
             v = "{table}"
         else:
-            logger.debug(f"include fields: {compiled_include.named_fields}")
-            logger.debug(f"table_name fields: {parse.compile(v).named_fields}")
             if not all(
                 x in compiled_include.named_fields
                 for x in parse.compile(v).named_fields
@@ -356,9 +347,7 @@ class PathSpec(ConfigModel):
     @cached_property
     def compiled_include(self):
         parsable_include = PathSpec.get_parsable_include(self.include)
-        logger.debug(f"parsable_include: {parsable_include}")
         compiled_include = parse.compile(parsable_include)
-        logger.debug(f"Setting compiled_include: {compiled_include}")
         return compiled_include
 
     @cached_property
@@ -366,9 +355,8 @@ class PathSpec(ConfigModel):
         parsable_folder_include = PathSpec.get_parsable_include(self.include).rsplit(
             "/", 1
         )[0]
-        logger.debug(f"parsable_folder_include: {parsable_folder_include}")
         compiled_folder_include = parse.compile(parsable_folder_include)
-        logger.debug(f"Setting compiled_folder_include: {compiled_folder_include}")
+
         return compiled_folder_include
 
     @cached_property
@@ -376,7 +364,8 @@ class PathSpec(ConfigModel):
         # Regular expression to find all substrings enclosed in {}
         pattern = r"\{(.*?)\}"
         # Find all matches
-        matches = re.findall(pattern, self.include.split("{table}/")[1])
+        split_parts = self.include.split("{table}/")
+        matches = re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []
         return matches
 
     def get_partition_from_path(self, path: str) -> Optional[List[Tuple[str, str]]]:
@@ -563,7 +552,7 @@ class PathSpec(ConfigModel):
                     f"{{{template_key}}}", var[key]
                 )
             else:
-                partition_format.replace(f"{{{var_key}}}", var)
+                partition_format = partition_format.replace(f"{{{var_key}}}", var)
         return datetime.datetime.strptime(partition_format, datetime_format).replace(
             tzinfo=datetime.timezone.utc
         )
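
The last change fixes a silent no-op: str.replace returns a new string, so without the assignment the {var_key} placeholder was never substituted into partition_format. A minimal illustration with made-up values:

partition_format = "{year}/{month}/{day}"
var_key, var = "year", "2024"

partition_format.replace(f"{{{var_key}}}", var)  # old code: result discarded, template unchanged
partition_format = partition_format.replace(f"{{{var_key}}}", var)  # fixed: placeholder filled in
print(partition_format)  # 2024/{month}/{day}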
datahub/ingestion/source/datahub/config.py

@@ -118,6 +118,17 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
     )
 
+    structured_properties_template_cache_invalidation_interval: int = Field(
+        hidden_from_docs=True,
+        default=60,
+        description="Interval in seconds to invalidate the structured properties template cache.",
+    )
+
+    query_timeout: Optional[int] = Field(
+        default=None,
+        description="Timeout for each query in seconds. ",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
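
Both options surface on the DataHub-to-DataHub source config. A hedged sketch of a recipe fragment as a Python dict: only the two new keys come from the hunk above, the surrounding structure and values are illustrative.

recipe_fragment = {
    "source": {
        "type": "datahub",
        "config": {
            # New: per-query timeout in seconds (None leaves the server default in place)
            "query_timeout": 300,
            # New (hidden from docs): seconds to wait between the structured-properties
            # pass and the main pass so the server-side template cache can invalidate
            "structured_properties_template_cache_invalidation_interval": 60,
        },
    }
}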
datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -1,10 +1,10 @@
-import contextlib
 import json
 import logging
+import time
 from datetime import datetime
 from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar
 
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, text
 
 from datahub.emitter.aspect import ASPECT_MAP
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -12,13 +12,14 @@ from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.report import DataHubSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
-from datahub.metadata.schema_classes import ChangeTypeClass, SystemMetadataClass
+from datahub.metadata.schema_classes import SystemMetadataClass
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 
 logger = logging.getLogger(__name__)
 
 # Should work for at least mysql, mariadb, postgres
 DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
+DATE_FORMAT = "%Y-%m-%d"
 
 ROW = TypeVar("ROW", bound=Dict[str, Any])
 
@@ -85,6 +86,9 @@ class DataHubDatabaseReader:
             **connection_config.options,
         )
 
+        # Cache for available dates to avoid redundant queries
+        self.available_dates_cache: Optional[List[datetime]] = None
+
     @property
     def soft_deleted_urns_query(self) -> str:
         return f"""
@@ -100,14 +104,12 @@ class DataHubDatabaseReader:
            ORDER BY mav.urn
        """
 
-    @property
-    def query(self) -> str:
-        # May repeat rows for the same date
-        # Offset is generally 0, unless we repeat the same createdon twice
+    def query(self, set_structured_properties_filter: bool) -> str:
+        """
+        Main query that gets data for specified date range with appropriate filters.
+        """
+        structured_prop_filter = f" AND urn {'' if set_structured_properties_filter else 'NOT'} like 'urn:li:structuredProperty:%%'"
 
-        # Ensures stable order, chronological per (urn, aspect)
-        # Relies on createdon order to reflect version order
-        # Ordering of entries with the same createdon is handled by VersionOrderer
         return f"""
            SELECT *
            FROM (
@@ -132,6 +134,7 @@ class DataHubDatabaseReader:
                {"" if self.config.include_all_versions else "AND mav.version = 0"}
                {"" if not self.config.exclude_aspects else "AND mav.aspect NOT IN %(exclude_aspects)s"}
                AND mav.createdon >= %(since_createdon)s
+               AND mav.createdon < %(end_createdon)s
                ORDER BY
                    createdon,
                    urn,
@@ -139,50 +142,194 @@ class DataHubDatabaseReader:
                    version
            ) as t
            WHERE 1=1
-           {"" if self.config.include_soft_deleted_entities else "AND (removed = false or removed is NULL)"}
+           {"" if self.config.include_soft_deleted_entities else " AND (removed = false or removed is NULL)"}
+           {structured_prop_filter}
            ORDER BY
                createdon,
                urn,
                aspect,
                version
+           LIMIT %(limit)s
+           OFFSET %(offset)s
        """
 
+    def execute_with_params(
+        self, query: str, params: Dict[str, Any]
+    ) -> List[Dict[str, Any]]:
+        """Execute query with proper parameter binding that works with your database"""
+        with self.engine.connect() as conn:
+            result = conn.execute(query, params or {})
+            return [dict(row) for row in result.fetchall()]
+
     def execute_server_cursor(
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
+        """Execute a query with server-side cursor"""
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with (
                     conn.begin()
                 ):  # Transaction required for PostgreSQL server-side cursor
-                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
-                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
+                    # Set query timeout at the connection level
+                    if self.config.query_timeout:
+                        if self.engine.dialect.name == "postgresql":
+                            conn.execute(
+                                text(
+                                    f"SET statement_timeout = {self.config.query_timeout * 1000}"
+                                )
+                            )  # milliseconds
+                        elif self.engine.dialect.name in ["mysql", "mariadb"]:
+                            conn.execute(
+                                text(
+                                    f"SET max_execution_time = {self.config.query_timeout * 1000}"
+                                )
+                            )  # milliseconds
+
+                    # Stream results with batch size
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
                     )
+
+                    # Execute query - using native parameterization without text()
+                    # to maintain compatibility with your original code
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
+
+                    return  # Success, exit the retry loop
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
 
     def _get_rows(
-        self, from_createdon: datetime, stop_time: datetime
+        self,
+        start_date: datetime,
+        end_date: datetime,
+        set_structured_properties_filter: bool,
+        limit: int,
     ) -> Iterable[Dict[str, Any]]:
-        params = {
-            "exclude_aspects": list(self.config.exclude_aspects),
-            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
-        }
-        yield from self.execute_server_cursor(self.query, params)
+        """
+        Retrieves data rows within a specified date range using pagination.
 
-    def get_aspects(
+        Implements a hybrid pagination strategy that switches between time-based and
+        offset-based approaches depending on the returned data. Uses server-side
+        cursors for efficient memory usage.
+
+        Note: May return duplicate rows across batch boundaries when multiple rows
+        share the same 'createdon' timestamp. This is expected behavior when
+        transitioning between pagination methods.
+
+        Args:
+            start_date: Beginning of date range (inclusive)
+            end_date: End of date range (exclusive)
+            set_structured_properties_filter: Whether to apply structured filtering
+            limit: Maximum rows to fetch per query
+
+        Returns:
+            An iterable of database rows as dictionaries
+        """
+        offset = 0
+        last_createdon = None
+        first_iteration = True
+
+        while True:
+            try:
+                # Set up query and parameters - using named parameters
+                query = self.query(set_structured_properties_filter)
+                params: Dict[str, Any] = {
+                    "since_createdon": start_date.strftime(DATETIME_FORMAT),
+                    "end_createdon": end_date.strftime(DATETIME_FORMAT),
+                    "limit": limit,
+                    "offset": offset,
+                }
+
+                # Add exclude_aspects if needed
+                if (
+                    hasattr(self.config, "exclude_aspects")
+                    and self.config.exclude_aspects
+                ):
+                    params["exclude_aspects"] = tuple(self.config.exclude_aspects)
+
+                logger.info(
+                    f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
+                    f"with limit {limit} and offset {offset} (inclusive range)"
+                )
+
+                # Execute query with server-side cursor
+                rows = self.execute_server_cursor(query, params)
+                # Process and yield rows
+                rows_processed = 0
+                for row in rows:
+                    if first_iteration:
+                        start_date = row.get("createdon", start_date)
+                        first_iteration = False
+
+                    last_createdon = row.get("createdon")
+                    rows_processed += 1
+                    yield row
+
+                # If we processed fewer than the limit or no last_createdon, we're done
+                if rows_processed < limit or not last_createdon:
+                    break
+
+                # Update parameters for next iteration
+                if start_date != last_createdon:
+                    start_date = last_createdon
+                    offset = 0
+                else:
+                    offset += limit
+
+                logger.info(
+                    f"Processed {rows_processed} rows for date range {start_date} to {end_date}. Continuing to next batch."
+                )
+
+            except Exception as e:
+                logger.error(
+                    f"Error processing date range {start_date} to {end_date}: {str(e)}"
+                )
+                # Re-raise the exception after logging
+                raise
+
+    def get_all_aspects(
         self, from_createdon: datetime, stop_time: datetime
+    ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
+        logger.info("Fetching Structured properties aspects")
+        yield from self.get_aspects(
+            from_createdon=from_createdon,
+            stop_time=stop_time,
+            set_structured_properties_filter=True,
+        )
+
+        logger.info(
+            f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+        )
+
+        time.sleep(
+            self.config.structured_properties_template_cache_invalidation_interval
+        )
+
+        logger.info("Fetching aspects")
+        yield from self.get_aspects(
+            from_createdon=from_createdon,
+            stop_time=stop_time,
+            set_structured_properties_filter=False,
+        )
+
+    def get_aspects(
+        self,
+        from_createdon: datetime,
+        stop_time: datetime,
+        set_structured_properties_filter: bool = False,
     ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
         orderer = VersionOrderer[Dict[str, Any]](
             enabled=self.config.include_all_versions
         )
-        rows = self._get_rows(from_createdon=from_createdon, stop_time=stop_time)
+        rows = self._get_rows(
+            start_date=from_createdon,
+            end_date=stop_time,
+            set_structured_properties_filter=set_structured_properties_filter,
+            limit=self.config.database_query_batch_size,
+        )
         for row in orderer(rows):
             mcp = self._parse_row(row)
             if mcp:
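
Stripped of the SQL and logging, the paging loop in _get_rows alternates between advancing the time window and advancing an offset. A simplified standalone sketch of that control flow (fetch_page stands in for the server-side-cursor query and is not part of the package):

def paginate(fetch_page, start, limit):
    """fetch_page(start, offset, limit) -> rows ordered by row['createdon'], at most `limit` of them."""
    offset = 0
    while True:
        rows = list(fetch_page(start, offset, limit))
        yield from rows
        if len(rows) < limit:
            break  # short page: nothing left in the window
        last_createdon = rows[-1]["createdon"]
        if last_createdon != start:
            # Time-based step: restart the window at the newest timestamp seen
            start, offset = last_createdon, 0
        else:
            # Whole page shared one timestamp: fall back to offset-based paging
            offset += limit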
@@ -190,23 +337,29 @@ class DataHubDatabaseReader:
 
     def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
         """
-        Fetches all soft-deleted entities from the database.
+        Fetches all soft-deleted entities from the database using pagination.
 
         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-        with self.engine.connect() as conn, contextlib.closing(
-            conn.connection.cursor()
-        ) as cursor:
-            logger.debug("Polling soft-deleted urns from database")
-            cursor.execute(self.soft_deleted_urns_query)
-            columns = [desc[0] for desc in cursor.description]
-            while True:
-                rows = cursor.fetchmany(self.config.database_query_batch_size)
-                if not rows:
-                    return
-                for row in rows:
-                    yield dict(zip(columns, row))
+        try:
+            params: Dict = {}
+
+            logger.debug("Fetching soft-deleted URNs")
+
+            # Use server-side cursor implementation
+            rows = self.execute_server_cursor(self.soft_deleted_urns_query, params)
+            processed_rows = 0
+            # Process and yield rows
+            for row in rows:
+                processed_rows += 1
+                yield row
+
+            logger.debug(f"Fetched batch of {processed_rows} soft-deleted URNs")
+
+        except Exception:
+            logger.exception("Error fetching soft-deleted row", exc_info=True)
+            raise
 
     def _parse_row(
         self, row: Dict[str, Any]
@@ -221,7 +374,6 @@ class DataHubDatabaseReader:
                 entityUrn=row["urn"],
                 aspect=ASPECT_MAP[row["aspect"]].from_obj(json_aspect),
                 systemMetadata=system_metadata,
-                changeType=ChangeTypeClass.UPSERT,
             )
         except Exception as e:
             logger.warning(
datahub/ingestion/source/datahub/datahub_source.py

@@ -117,7 +117,7 @@ class DataHubSource(StatefulIngestionSourceBase):
     ) -> Iterable[MetadataWorkUnit]:
         logger.info(f"Fetching database aspects starting from {from_createdon}")
         progress = ProgressTimer(report_every=timedelta(seconds=60))
-        mcps = reader.get_aspects(from_createdon, self.report.stop_time)
+        mcps = reader.get_all_aspects(from_createdon, self.report.stop_time)
         for i, (mcp, createdon) in enumerate(mcps):
             if not self.urn_pattern.allowed(str(mcp.entityUrn)):
                 continue