acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.



Files changed (156)
  1. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2528 -2530
  2. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +156 -138
  3. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/cli/check_cli.py +65 -11
  7. datahub/cli/cli_utils.py +63 -0
  8. datahub/cli/container_cli.py +5 -0
  9. datahub/cli/delete_cli.py +3 -4
  10. datahub/cli/docker_check.py +107 -12
  11. datahub/cli/docker_cli.py +149 -227
  12. datahub/cli/exists_cli.py +0 -2
  13. datahub/cli/get_cli.py +0 -2
  14. datahub/cli/iceberg_cli.py +5 -0
  15. datahub/cli/ingest_cli.py +3 -15
  16. datahub/cli/migrate.py +2 -0
  17. datahub/cli/put_cli.py +1 -4
  18. datahub/cli/quickstart_versioning.py +50 -7
  19. datahub/cli/specific/assertions_cli.py +0 -4
  20. datahub/cli/specific/datacontract_cli.py +0 -3
  21. datahub/cli/specific/dataproduct_cli.py +0 -11
  22. datahub/cli/specific/dataset_cli.py +1 -8
  23. datahub/cli/specific/forms_cli.py +0 -4
  24. datahub/cli/specific/group_cli.py +0 -2
  25. datahub/cli/specific/structuredproperties_cli.py +1 -4
  26. datahub/cli/specific/user_cli.py +0 -2
  27. datahub/cli/state_cli.py +0 -2
  28. datahub/cli/timeline_cli.py +0 -2
  29. datahub/emitter/rest_emitter.py +41 -8
  30. datahub/entrypoints.py +4 -3
  31. datahub/ingestion/api/decorators.py +15 -3
  32. datahub/ingestion/api/report.py +332 -3
  33. datahub/ingestion/api/sink.py +3 -0
  34. datahub/ingestion/api/source.py +47 -45
  35. datahub/ingestion/autogenerated/__init__.py +0 -0
  36. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  37. datahub/ingestion/autogenerated/lineage.json +401 -0
  38. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  39. datahub/ingestion/extractor/schema_util.py +13 -4
  40. datahub/ingestion/graph/client.py +73 -30
  41. datahub/ingestion/run/pipeline.py +54 -2
  42. datahub/ingestion/sink/datahub_rest.py +12 -0
  43. datahub/ingestion/source/abs/source.py +1 -1
  44. datahub/ingestion/source/aws/glue.py +1 -1
  45. datahub/ingestion/source/azure/azure_common.py +2 -2
  46. datahub/ingestion/source/bigquery_v2/bigquery.py +49 -23
  47. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  48. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  49. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  50. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  51. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  52. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  53. datahub/ingestion/source/common/subtypes.py +45 -0
  54. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  55. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  56. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  57. datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
  58. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  59. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  60. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  61. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  62. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  63. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  64. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  65. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  66. datahub/ingestion/source/ge_data_profiler.py +76 -28
  67. datahub/ingestion/source/hex/api.py +26 -1
  68. datahub/ingestion/source/identity/azure_ad.py +1 -1
  69. datahub/ingestion/source/identity/okta.py +1 -14
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  72. datahub/ingestion/source/mlflow.py +11 -1
  73. datahub/ingestion/source/mock_data/__init__.py +0 -0
  74. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  75. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  76. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  77. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  78. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  79. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  80. datahub/ingestion/source/preset.py +2 -2
  81. datahub/ingestion/source/redshift/redshift.py +17 -0
  82. datahub/ingestion/source/redshift/usage.py +4 -3
  83. datahub/ingestion/source/s3/report.py +4 -2
  84. datahub/ingestion/source/s3/source.py +367 -115
  85. datahub/ingestion/source/salesforce.py +6 -3
  86. datahub/ingestion/source/sigma/sigma.py +6 -1
  87. datahub/ingestion/source/slack/slack.py +2 -1
  88. datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
  89. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  90. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  91. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
  92. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  93. datahub/ingestion/source/sql/athena.py +119 -12
  94. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  95. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  96. datahub/ingestion/source/sql/mssql/source.py +24 -15
  97. datahub/ingestion/source/sql/oracle.py +1 -1
  98. datahub/ingestion/source/sql/sql_common.py +11 -0
  99. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  100. datahub/ingestion/source/sql/teradata.py +997 -235
  101. datahub/ingestion/source/sql/vertica.py +10 -6
  102. datahub/ingestion/source/sql_queries.py +2 -2
  103. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  104. datahub/ingestion/source/superset.py +57 -2
  105. datahub/ingestion/source/tableau/tableau.py +57 -37
  106. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  107. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  108. datahub/ingestion/source/unity/proxy.py +4 -3
  109. datahub/ingestion/source/unity/source.py +56 -30
  110. datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
  111. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  112. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  113. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  114. datahub/metadata/_internal_schema_classes.py +1253 -536
  115. datahub/metadata/_urns/urn_defs.py +1797 -1685
  116. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  117. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  118. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  119. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  120. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  121. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  122. datahub/metadata/schema.avsc +16614 -16538
  123. datahub/metadata/schemas/ContainerProperties.avsc +2 -0
  124. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  125. datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
  126. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  127. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  128. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  129. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  130. datahub/metadata/schemas/DataJobInfo.avsc +2 -0
  131. datahub/metadata/schemas/DataProcessKey.avsc +2 -0
  132. datahub/metadata/schemas/DatasetKey.avsc +4 -1
  133. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  134. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
  135. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  136. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
  137. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
  138. datahub/metadata/schemas/MLModelKey.avsc +2 -0
  139. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  140. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  141. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  142. datahub/sdk/datajob.py +39 -15
  143. datahub/sdk/lineage_client.py +2 -0
  144. datahub/sdk/main_client.py +14 -2
  145. datahub/sdk/search_client.py +4 -3
  146. datahub/specific/dataproduct.py +4 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  148. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  149. datahub/telemetry/telemetry.py +17 -11
  150. datahub/upgrade/upgrade.py +46 -13
  151. datahub/utilities/server_config_util.py +8 -0
  152. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  153. datahub/utilities/stats_collections.py +4 -0
  154. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
  155. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
  156. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0

datahub/ingestion/graph/client.py

@@ -22,6 +22,7 @@ from typing import (
     Union,
 )
 
+import progressbar
 from avro.schema import RecordSchema
 from pydantic import BaseModel
 from requests.models import HTTPError
@@ -504,7 +505,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "limit": limit,
             "filter": filter,
         }
-        end_point = f"{self.config.server}/aspects?action=getTimeseriesAspectValues"
+        end_point = f"{self._gms_server}/aspects?action=getTimeseriesAspectValues"
         resp: Dict = self._post_generic(end_point, query_body)
 
         values: Optional[List] = resp.get("value", {}).get("values")
@@ -524,7 +525,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_entity_raw(
         self, entity_urn: str, aspects: Optional[List[str]] = None
     ) -> Dict:
-        endpoint: str = f"{self.config.server}/entitiesV2/{Urn.url_encode(entity_urn)}"
+        endpoint: str = f"{self._gms_server}/entitiesV2/{Urn.url_encode(entity_urn)}"
         if aspects is not None:
             assert aspects, "if provided, aspects must be a non-empty list"
             endpoint = f"{endpoint}?aspects=List(" + ",".join(aspects) + ")"
@@ -654,15 +655,15 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
 
     @property
     def _search_endpoint(self):
-        return f"{self.config.server}/entities?action=search"
+        return f"{self._gms_server}/entities?action=search"
 
     @property
     def _relationships_endpoint(self):
-        return f"{self.config.server}/openapi/relationships/v1/"
+        return f"{self._gms_server}/openapi/relationships/v1/"
 
     @property
     def _aspect_count_endpoint(self):
-        return f"{self.config.server}/aspects?action=getCount"
+        return f"{self._gms_server}/aspects?action=getCount"
 
     def get_domain_urn_by_name(self, domain_name: str) -> Optional[str]:
         """Retrieve a domain urn based on its name. Returns None if there is no match found"""
@@ -1209,7 +1210,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         operation_name: Optional[str] = None,
         format_exception: bool = True,
     ) -> Dict:
-        url = f"{self.config.server}/api/graphql"
+        url = f"{self._gms_server}/api/graphql"
 
         body: Dict = {
             "query": query,
@@ -1434,40 +1435,82 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         related_aspects = response.get("relatedAspects", [])
         return reference_count, related_aspects
 
+    def get_kafka_consumer_offsets(
+        self,
+    ) -> dict:
+        """
+        Get Kafka consumer offsets from the DataHub API.
+
+        Args:
+            graph (DataHubGraph): The DataHub graph client
+
+        """
+        urls = {
+            "mcp": f"{self.config.server}/openapi/operations/kafka/mcp/consumer/offsets",
+            "mcl": f"{self.config.server}/openapi/operations/kafka/mcl/consumer/offsets",
+            "mcl-timeseries": f"{self.config.server}/openapi/operations/kafka/mcl-timeseries/consumer/offsets",
+        }
+
+        params = {"skipCache": "true", "detailed": "true"}
+        results = {}
+        for key, url in urls.items():
+            response = self._get_generic(url=url, params=params)
+            results[key] = response
+            if "errors" in response:
+                logger.error(f"Error: {response['errors']}")
+        return results
+
+    def _restore_index_call(self, payload_obj: dict) -> None:
+        result = self._post_generic(
+            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
+        )
+        logger.debug(f"Restore indices result: {result}")
+
     def restore_indices(
         self,
-        urn_pattern: str,
+        urn_pattern: Optional[str] = None,
         aspect: Optional[str] = None,
         start: Optional[int] = None,
         batch_size: Optional[int] = None,
-    ) -> str:
+        file: Optional[str] = None,
+    ) -> None:
         """Restore the indices for a given urn or urn-like pattern.
 
         Args:
-            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs.
+            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs. If not provided, will restore indices from the file.
             aspect: Optional aspect string to restore indices for a specific aspect.
-            start: Optional integer to decide which row number of sql store to restore from. Default: 0.
-            batch_size: Optional integer to decide how many rows to restore. Default: 10.
+            start: Optional integer to decide which row number of sql store to restore from. Default: 0. Ignored in case file is provided.
+            batch_size: Optional integer to decide how many rows to restore. Default: 10. Ignored in case file is provided.
+            file: Optional file path to a file containing URNs to restore indices for.
 
         Returns:
             A string containing the result of the restore indices operation. This format is subject to change.
         """
-        if "%" in urn_pattern:
-            payload_obj: dict = {"urnLike": urn_pattern}
+        payload_obj = {}
+        if file is not None:
+            with open(file) as f:
+                for urn in progressbar.progressbar(f.readlines()):
+                    urn = urn.strip()
+                    if "%" in urn:
+                        payload_obj["urnLike"] = urn
+                    else:
+                        payload_obj["urn"] = urn
+                    if aspect is not None:
+                        payload_obj["aspect"] = aspect
+                    self._restore_index_call(payload_obj)
         else:
-            payload_obj = {"urn": urn_pattern}
-        if aspect is not None:
-            payload_obj["aspect"] = aspect
-        if start is not None:
-            payload_obj["start"] = start
-        if batch_size is not None:
-            payload_obj["batchSize"] = batch_size
-        raw_result = self._post_generic(
-            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
-        )
-        result = raw_result["value"]
-        logger.debug(f"Restore indices result: {result}")
-        return result
+            if urn_pattern is not None:
+                if "%" in urn_pattern:
+                    payload_obj["urnLike"] = urn_pattern
+                else:
+                    payload_obj["urn"] = urn_pattern
+                if aspect is not None:
+                    payload_obj["aspect"] = aspect
+                if start is not None:
+                    payload_obj["start"] = start
+                if batch_size is not None:
+                    payload_obj["batchSize"] = batch_size
+                self._restore_index_call(payload_obj)
 
     @functools.lru_cache
     def _make_schema_resolver(
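
For reference, a minimal sketch of how the reworked helpers above might be called. The server URL, file name, and URN pattern are placeholders, and the client construction is illustrative rather than taken from this release:

# Illustrative usage of the DataHubGraph helpers changed above.
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Restore indices for URNs matching a SQL-style wildcard pattern.
graph.restore_indices(urn_pattern="urn:li:dataset:%", aspect="status", batch_size=100)

# Or drive the restore from a file with one URN (or pattern) per line;
# start/batch_size are ignored in this mode and progress is reported via progressbar.
graph.restore_indices(file="urns.txt")

# Inspect Kafka consumer offsets for the mcp / mcl / mcl-timeseries consumers.
offsets = graph.get_kafka_consumer_offsets()
print(offsets["mcp"])

Note that restore_indices now returns None and issues one restore call per URN when a file is supplied.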
@@ -1533,7 +1576,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         env: str = DEFAULT_ENV,
         default_db: Optional[str] = None,
         default_schema: Optional[str] = None,
-        default_dialect: Optional[str] = None,
+        override_dialect: Optional[str] = None,
     ) -> "SqlParsingResult":
         from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
 
@@ -1547,7 +1590,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-            default_dialect=default_dialect,
+            override_dialect=override_dialect,
         )
 
     def create_tag(self, tag_name: str) -> str:
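
Callers that passed default_dialect need to switch to override_dialect. A hedged sketch of an adapted call site follows, reusing the graph client from the earlier sketch; the enclosing method name is not shown in these hunks, so parse_sql_lineage and the SQL/argument values are assumptions for illustration only:

# Assumed call site: only the keyword argument is renamed; forcing a specific
# SQL dialect behaves as before.
result = graph.parse_sql_lineage(
    "SELECT id, amount FROM analytics.orders",
    platform="snowflake",
    default_db="ANALYTICS",
    override_dialect="snowflake",  # previously: default_dialect="snowflake"
)
print(result.in_tables, result.out_tables)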
@@ -1774,7 +1817,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Accept": "application/json",
             "Content-Type": "application/json",
         }
-        url = f"{self.config.server}/openapi/v2/entity/batch/{entity_name}"
+        url = f"{self._gms_server}/openapi/v2/entity/batch/{entity_name}"
         response = self._session.post(url, data=json.dumps(payload), headers=headers)
         response.raise_for_status()
 
@@ -1831,7 +1874,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Content-Type": "application/json",
         }
 
-        url = f"{self.config.server}/openapi/v3/entity/{entity_name}/batchGet"
+        url = f"{self._gms_server}/openapi/v3/entity/{entity_name}/batchGet"
         if with_system_metadata:
             url += "?systemMetadata=true"
 

datahub/ingestion/run/pipeline.py

@@ -44,6 +44,10 @@ from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
 from datahub.telemetry.telemetry import telemetry_instance
+from datahub.upgrade.upgrade import (
+    is_server_default_cli_ahead,
+    retrieve_version_stats,
+)
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,
@@ -171,7 +175,10 @@ class Pipeline:
         self.last_time_printed = int(time.time())
         self.cli_report = CliReport()
 
-        with contextlib.ExitStack() as exit_stack, contextlib.ExitStack() as inner_exit_stack:
+        with (
+            contextlib.ExitStack() as exit_stack,
+            contextlib.ExitStack() as inner_exit_stack,
+        ):
             self.graph: Optional[DataHubGraph] = None
             with _add_init_error_context("connect to DataHub"):
                 if self.config.datahub_api:
@@ -340,6 +347,44 @@ class Pipeline:
             except Exception as e:
                 logger.warning("Reporting failed on start", exc_info=e)
 
+    def _warn_old_cli_version(self) -> None:
+        """
+        Check if the server default CLI version is ahead of the CLI version being used.
+        If so, add a warning to the report.
+        """
+
+        try:
+            version_stats = retrieve_version_stats(timeout=2.0, graph=self.graph)
+        except RuntimeError as e:
+            # Handle case where there's no event loop available (e.g., in ThreadPoolExecutor)
+            if "no current event loop" in str(e):
+                logger.debug("Skipping version check - no event loop available")
+                return
+            raise
+
+        if not version_stats or not self.graph:
+            return
+
+        if is_server_default_cli_ahead(version_stats):
+            server_default_version = (
+                version_stats.server.current_server_default_cli_version.version
+                if version_stats.server.current_server_default_cli_version
+                else None
+            )
+            current_version = version_stats.client.current.version
+
+            logger.debug(f"""
+                client_version: {current_version}
+                server_default_version: {server_default_version}
+                server_default_cli_ahead: True
+            """)
+
+            self.source.get_report().warning(
+                title="Server default CLI version is ahead of CLI version",
+                message="Please upgrade the CLI version being used",
+                context=f"Server Default CLI version: {server_default_version}, Used CLI version: {current_version}",
+            )
+
     def _notify_reporters_on_ingestion_completion(self) -> None:
         for reporter in self.reporters:
             try:
@@ -396,6 +441,7 @@ class Pipeline:
             return False
 
     def run(self) -> None:
+        self._warn_old_cli_version()
         with self.exit_stack, self.inner_exit_stack:
             if self.config.flags.generate_memory_profiles:
                 import memray
@@ -502,7 +548,7 @@ class Pipeline:
             self._handle_uncaught_pipeline_exception(exc)
         finally:
             clear_global_warnings()
-
+            self.sink.flush()
         self._notify_reporters_on_ingestion_completion()
 
     def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:
@@ -578,11 +624,17 @@ class Pipeline:
         sink_failures = len(self.sink.get_report().failures)
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
+        source_aspects = self.source.get_report().get_aspects_dict()
+        source_aspects_by_subtype = (
+            self.source.get_report().get_aspects_by_subtypes_dict()
+        )
 
         telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
+                "source_aspects": source_aspects,
+                "source_aspects_by_subtype": source_aspects_by_subtype,
                 "sink_type": self.sink_type,
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []

datahub/ingestion/sink/datahub_rest.py

@@ -5,6 +5,7 @@ import functools
 import logging
 import os
 import threading
+import time
 import uuid
 from enum import auto
 from typing import List, Optional, Tuple, Union
@@ -346,6 +347,17 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             RecordEnvelope(item, metadata={}), NoopWriteCallback()
         )
 
+    def flush(self) -> None:
+        """Wait for all pending records to be written."""
+        i = 0
+        while self.report.pending_requests > 0:
+            time.sleep(0.1)
+            i += 1
+            if i % 1000 == 0:
+                logger.info(
+                    f"Waiting for {self.report.pending_requests} records to be written"
+                )
+
     def close(self):
         with self.report.main_thread_blocking_timer:
             self.executor.shutdown()
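
Because the REST sink hands records to a background executor, the new flush() lets callers block until pending writes are acknowledged, which is what the pipeline now does before notifying reporters. A self-contained sketch of that pattern (server URL and URN are placeholders; in a real pipeline the sink is built from the recipe rather than constructed by hand):

# Hedged sketch: emit one aspect through the REST sink, flush, then close.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
from datahub.ingestion.api.sink import NoopWriteCallback
from datahub.ingestion.sink.datahub_rest import DatahubRestSink, DatahubRestSinkConfig
from datahub.metadata.schema_classes import StatusClass

sink = DatahubRestSink(
    ctx=PipelineContext(run_id="flush-demo"),
    config=DatahubRestSinkConfig(server="http://localhost:8080"),
)
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
    aspect=StatusClass(removed=False),
)
sink.write_record_async(RecordEnvelope(mcp, metadata={}), NoopWriteCallback())
sink.flush()  # poll until report.pending_requests drops to 0
sink.close()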

datahub/ingestion/source/abs/source.py

@@ -533,7 +533,7 @@ class ABSSource(StatefulIngestionSourceBase):
         )
         path_spec.sample_files = False
         for obj in container_client.list_blobs(
-            prefix=f"{prefix}", results_per_page=PAGE_SIZE
+            name_starts_with=f"{prefix}", results_per_page=PAGE_SIZE
         ):
             abs_path = self.create_abs_path(obj.name)
             logger.debug(f"Path: {abs_path}")

datahub/ingestion/source/aws/glue.py

@@ -269,7 +269,7 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default when stateful ingestion is turned on.",
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(

datahub/ingestion/source/azure/azure_common.py

@@ -61,13 +61,13 @@ class AzureConnectionConfig(ConfigModel):
     def get_blob_service_client(self):
         return BlobServiceClient(
             account_url=f"https://{self.account_name}.blob.core.windows.net",
-            credential=f"{self.get_credentials()}",
+            credential=self.get_credentials(),
         )
 
     def get_data_lake_service_client(self) -> DataLakeServiceClient:
         return DataLakeServiceClient(
             account_url=f"https://{self.account_name}.dfs.core.windows.net",
-            credential=f"{self.get_credentials()}",
+            credential=self.get_credentials(),
         )
 
     def get_credentials(

datahub/ingestion/source/bigquery_v2/bigquery.py

@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -44,6 +45,7 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
     BigQueryQueriesExtractorConfig,
 )
 from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
@@ -77,7 +79,14 @@ def cleanup(config: BigQueryV2Config) -> None:
     supported=False,
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.BIGQUERY_PROJECT,
+        SourceCapabilityModifier.BIGQUERY_DATASET,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -242,7 +251,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]
 
+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
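
For recipes that trip this new warning, the fix is to opt into fully qualified matching. A hedged config sketch follows; only the two options named in the warning are taken from the diff, and the project and dataset names are placeholders:

# Assumed BigQuery source recipe fragment, expressed as a Python dict.
bigquery_source_config = {
    "project_ids": ["my-project"],  # placeholder
    "match_fully_qualified_names": True,
    # With fully qualified matching, patterns apply to "<project>.<dataset>".
    "schema_pattern": {"allow": ["^my-project\\.analytics_.*$"]},
}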
@@ -271,28 +296,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return
 
-            with self.report.new_stage(
-                f"*: {QUERIES_EXTRACTION}"
-            ), BigQueryQueriesExtractor(
-                connection=self.config.get_bigquery_client(),
-                schema_api=self.bq_schema_extractor.schema_api,
-                config=BigQueryQueriesExtractorConfig(
-                    window=self.config,
-                    user_email_pattern=self.config.usage.user_email_pattern,
-                    include_lineage=self.config.include_table_lineage,
-                    include_usage_statistics=self.config.include_usage_statistics,
-                    include_operations=self.config.usage.include_operational_stats,
-                    include_queries=self.config.include_queries,
-                    include_query_usage_statistics=self.config.include_query_usage_statistics,
-                    top_n_queries=self.config.usage.top_n_queries,
-                    region_qualifiers=self.config.region_qualifiers,
-                ),
-                structured_report=self.report,
-                filters=self.filters,
-                identifiers=self.identifiers,
-                schema_resolver=self.sql_parser_schema_resolver,
-                discovered_tables=self.bq_schema_extractor.table_refs,
-            ) as queries_extractor:
+            with (
+                self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
+                BigQueryQueriesExtractor(
+                    connection=self.config.get_bigquery_client(),
+                    schema_api=self.bq_schema_extractor.schema_api,
+                    config=BigQueryQueriesExtractorConfig(
+                        window=self.config,
+                        user_email_pattern=self.config.usage.user_email_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_statistics,
+                        include_operations=self.config.usage.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        region_qualifiers=self.config.region_qualifiers,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=self.sql_parser_schema_resolver,
+                    discovered_tables=self.bq_schema_extractor.table_refs,
+                ) as queries_extractor,
+            ):
                 self.report.queries_extractor = queries_extractor.report
                 yield from queries_extractor.get_workunits_internal()
             else:

datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -342,7 +342,7 @@ class BigQueryV2Config(
     )
 
     use_queries_v2: bool = Field(
-        default=False,
+        default=True,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
     include_queries: bool = Field(
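
Because the default flips from False to True here, pipelines that want the previous extraction path have to opt out explicitly; a minimal sketch (project id is a placeholder):

# Restore the pre-1.1.0.5 behavior by disabling the v2 queries extractor.
bigquery_source_config = {
    "project_ids": ["my-project"],  # placeholder
    "use_queries_v2": False,
}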

datahub/ingestion/source/bigquery_v2/bigquery_queries.py

@@ -94,3 +94,4 @@ class BigQueryQueriesSource(Source):
     def close(self) -> None:
         self.queries_extractor.close()
         self.connection.close()
+        super().close()

datahub/ingestion/source/bigquery_v2/profiler.py

@@ -189,6 +189,7 @@ WHERE
 
         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )
 
-        if partition is None and bq_table.partition_info:
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as partitioned table is empty or partition id or type was invalid",
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None

datahub/ingestion/source/bigquery_v2/queries.py

@@ -45,12 +45,12 @@ SELECT
   tos.OPTION_VALUE as comment,
   t.is_insertable_into,
   t.ddl,
-  ts.row_count,
+  ts.row_count as row_count,
   ts.size_bytes as bytes,
   p.num_partitions,
   p.max_partition_id,
-  p.active_billable_bytes,
-  p.long_term_billable_bytes,
+  p.active_billable_bytes as active_billable_bytes,
+  IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
   REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
   REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
 

datahub/ingestion/source/cassandra/cassandra.py

@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class CassandraSource(StatefulIngestionSourceBase):

datahub/ingestion/source/cassandra/cassandra_profiling.py

@@ -70,11 +70,12 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(
-                f"{keyspace_name}: {PROFILING}"
-            ), ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
+            with (
+                self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(
                         self.generate_profile,

datahub/ingestion/source/common/subtypes.py

@@ -1,5 +1,10 @@
+import logging
+from typing import Any, Dict
+
 from datahub.utilities.str_enum import StrEnum
 
+logger = logging.getLogger(__name__)
+
 
 class DatasetSubTypes(StrEnum):
     # Generic SubTypes
@@ -26,6 +31,8 @@ class DatasetSubTypes(StrEnum):
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
     API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
@@ -52,6 +59,8 @@ class BIContainerSubTypes(StrEnum):
     LOOKER_FOLDER = "Folder"
     LOOKML_PROJECT = "LookML Project"
     LOOKML_MODEL = "LookML Model"
+    TABLEAU_SITE = "Site"
+    TABLEAU_PROJECT = "Project"
     TABLEAU_WORKBOOK = "Workbook"
     POWERBI_DATASET = "Semantic Model"
     POWERBI_DATASET_TABLE = "Table"
@@ -74,6 +83,9 @@ class JobContainerSubTypes(StrEnum):
 
 
 class BIAssetSubTypes(StrEnum):
+    DASHBOARD = "Dashboard"
+    CHART = "Chart"
+
     # Generic SubTypes
     REPORT = "Report"
 
@@ -116,3 +128,36 @@ class MLAssetSubTypes(StrEnum):
     VERTEX_PIPELINE = "Pipeline Job"
     VERTEX_PIPELINE_TASK = "Pipeline Task"
     VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
+
+
+def create_source_capability_modifier_enum():
+    all_values: Dict[str, Any] = {}
+    source_enums = [
+        DatasetSubTypes,
+        DatasetContainerSubTypes,
+        BIContainerSubTypes,
+        FlowContainerSubTypes,
+        JobContainerSubTypes,
+        BIAssetSubTypes,
+        MLAssetSubTypes,
+    ]
+
+    for enum_class in source_enums:
+        for member in enum_class:  # type: ignore[var-annotated]
+            if member.name in all_values:
+                logger.debug(
+                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
+                )
+                continue
+            all_values[member.name] = member.value
+
+    enum_code = "class SourceCapabilityModifier(StrEnum):\n"
+    for name, value in all_values.items():
+        enum_code += f'    {name} = "{value}"\n'
+
+    exec(enum_code, globals())
+    return globals()["SourceCapabilityModifier"]
+
+
+# This will have all values from the enums above
+SourceCapabilityModifier = create_source_capability_modifier_enum()
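
The generated SourceCapabilityModifier enum simply mirrors the members of the subtype enums above, so it can be referenced by name in @capability(..., subtype_modifier=[...]) decorators such as the BigQuery one earlier in this diff. A small sketch; the BIGQUERY_DATASET value is an assumption based on DatasetContainerSubTypes (not shown in this diff), while TABLEAU_PROJECT comes straight from the hunk above:

# Members are merged from DatasetSubTypes, the *ContainerSubTypes enums,
# BIAssetSubTypes, and MLAssetSubTypes, first definition wins on name clashes.
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier

print(SourceCapabilityModifier.BIGQUERY_DATASET.value)  # assumed: "Dataset"
print(SourceCapabilityModifier.TABLEAU_PROJECT.value)   # "Project" (added in this release)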