acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -1,7 +1,7 @@
  import logging
  from enum import Enum
  from pathlib import Path
- from typing import Iterable, List, Optional
+ from typing import Iterable, List, Optional, Union

  import yaml
  from pydantic import validator
@@ -38,7 +38,7 @@ class AllowedTypes(Enum):


  class AllowedValue(ConfigModel):
- value: str
+ value: Union[int, float, str]
  description: Optional[str] = None

datahub/cli/check_cli.py CHANGED
@@ -5,7 +5,8 @@ import pathlib
  import pprint
  import shutil
  import tempfile
- from typing import Dict, List, Optional, Union
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional, Union

  import click

@@ -20,7 +21,10 @@ from datahub.ingestion.sink.sink_registry import sink_registry
  from datahub.ingestion.source.source_registry import source_registry
  from datahub.ingestion.transformer.transform_registry import transform_registry
  from datahub.telemetry import telemetry
- from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
+ from datahub.utilities.file_backed_collections import (
+ ConnectionWrapper,
+ FileBackedDict,
+ )

  logger = logging.getLogger(__name__)

@@ -391,29 +395,78 @@ def test_path_spec(config: str, input: str, path_spec_key: str) -> None:
  raise e


+ def _jsonify(data: Any) -> Any:
+ if dataclasses.is_dataclass(data):
+ # dataclasses.asdict() is recursive. We're doing the recursion
+ # manually here via _jsonify calls, so we can't use
+ # dataclasses.asdict() here.
+ return {
+ f.name: _jsonify(getattr(data, f.name)) for f in dataclasses.fields(data)
+ }
+ elif isinstance(data, list):
+ return [_jsonify(item) for item in data]
+ elif isinstance(data, dict):
+ return {_jsonify(k): _jsonify(v) for k, v in data.items()}
+ elif isinstance(data, datetime):
+ return data.isoformat()
+ else:
+ return data
+
+
  @check.command()
- @click.argument("query-log-file", type=click.Path(exists=True, dir_okay=False))
- @click.option("--output", type=click.Path())
- def extract_sql_agg_log(query_log_file: str, output: Optional[str]) -> None:
+ @click.argument("db-file", type=click.Path(exists=True, dir_okay=False))
+ def extract_sql_agg_log(db_file: str) -> None:
  """Convert a sqlite db generated by the SqlParsingAggregator into a JSON."""

- from datahub.sql_parsing.sql_parsing_aggregator import LoggedQuery
+ if pathlib.Path(db_file).suffix != ".db":
+ raise click.UsageError("DB file must be a sqlite db")
+
+ output_dir = pathlib.Path(db_file).with_suffix("")
+ output_dir.mkdir(exist_ok=True)
+
+ shared_connection = ConnectionWrapper(pathlib.Path(db_file))
+
+ tables: List[str] = [
+ row[0]
+ for row in shared_connection.execute(
+ """\
+ SELECT
+ name
+ FROM
+ sqlite_schema
+ WHERE
+ type ='table' AND
+ name NOT LIKE 'sqlite_%';
+ """,
+ parameters={},
+ )
+ ]
+ logger.info(f"Extracting {len(tables)} tables from {db_file}: {tables}")
+
+ for table in tables:
+ table_output_path = output_dir / f"{table}.json"
+ if table_output_path.exists():
+ logger.info(f"Skipping {table_output_path} because it already exists")
+ continue

- assert dataclasses.is_dataclass(LoggedQuery)
+ # Some of the tables might actually be FileBackedList. Because
+ # the list is built on top of the FileBackedDict, we don't
+ # need to distinguish between the two cases.

- shared_connection = ConnectionWrapper(pathlib.Path(query_log_file))
- query_log = FileBackedList[LoggedQuery](
- shared_connection=shared_connection, tablename="stored_queries"
- )
- logger.info(f"Extracting {len(query_log)} queries from {query_log_file}")
- queries = [dataclasses.asdict(query) for query in query_log]
+ table_data: FileBackedDict[Any] = FileBackedDict(
+ shared_connection=shared_connection, tablename=table
+ )

- if output:
- with open(output, "w") as f:
- json.dump(queries, f, indent=2, default=str)
- logger.info(f"Extracted {len(queries)} queries to {output}")
- else:
- click.echo(json.dumps(queries, indent=2))
+ data = {}
+ with click.progressbar(
+ table_data.items(), length=len(table_data), label=f"Extracting {table}"
+ ) as items:
+ for k, v in items:
+ data[k] = _jsonify(v)
+
+ with open(table_output_path, "w") as f:
+ json.dump(data, f, indent=2, default=str)
+ logger.info(f"Extracted {len(data)} entries to {table_output_path}")


  @check.command()
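
Note: a minimal sketch (not part of the diff) of exercising the reworked extract_sql_agg_log command through click's test runner. The module and command objects come from the diff above; the sqlite path is illustrative and must point at an existing SqlParsingAggregator db whose name ends in ".db".

    from click.testing import CliRunner

    from datahub.cli.check_cli import extract_sql_agg_log

    runner = CliRunner()
    # The command now takes a single .db argument and writes one <table>.json
    # file per sqlite table into a sibling directory named after the db file.
    result = runner.invoke(extract_sql_agg_log, ["aggregator.db"])
    print(result.output)
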
datahub/cli/docker_cli.py CHANGED
@@ -231,7 +231,7 @@ def _docker_compose_v2() -> List[str]:
  # docker-compose v1 is not installed either.
  raise DockerComposeVersionError(
  "You don't have Docker Compose installed. Please install Docker Compose. See https://docs.docker.com/compose/install/.",
- )
+ ) from None


  def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
@@ -430,7 +430,7 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures:
  return quickstart_arch


- @docker.command() # noqa: C901
+ @docker.command()
  @click.option(
  "--version",
  type=str,
@@ -592,7 +592,7 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures:
  "arch",
  ]
  )
- def quickstart( # noqa: C901
+ def quickstart(
  version: Optional[str],
  build_locally: bool,
  pull_images: bool,
datahub/cli/iceberg_cli.py CHANGED
@@ -14,6 +14,7 @@ from datahub.cli.cli_utils import post_entity
  from datahub.configuration.common import GraphError
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.metadata.schema_classes import SystemMetadataClass
+ from datahub.telemetry import telemetry

  logger = logging.getLogger(__name__)

@@ -161,6 +162,7 @@ def validate_warehouse(data_root: str) -> None:
  type=int,
  help=f"Expiration duration for temporary credentials used for role. Defaults to {DEFAULT_CREDS_EXPIRY_DURATION_SECONDS} seconds if unspecified",
  )
+ @telemetry.with_telemetry(capture_kwargs=["duration_seconds"])
  def create(
  warehouse: str,
  description: Optional[str],
@@ -313,6 +315,7 @@ def create(
  type=int,
  help=f"Expiration duration for temporary credentials used for role. Defaults to {DEFAULT_CREDS_EXPIRY_DURATION_SECONDS} seconds if unspecified",
  )
+ @telemetry.with_telemetry(capture_kwargs=["duration_seconds"])
  def update(
  warehouse: str,
  data_root: str,
@@ -398,6 +401,7 @@ def update(


  @iceberg.command()
+ @telemetry.with_telemetry()
  def list() -> None:
  """
  List iceberg warehouses
@@ -413,6 +417,7 @@ def list() -> None:
  @click.option(
  "-w", "--warehouse", required=True, type=str, help="The name of the warehouse"
  )
+ @telemetry.with_telemetry()
  def get(warehouse: str) -> None:
  """Fetches the details of the specified iceberg warehouse"""
  client = get_default_graph()
@@ -442,6 +447,7 @@ def get(warehouse: str) -> None:
  is_flag=True,
  help="force the delete if set without confirmation",
  )
+ @telemetry.with_telemetry(capture_kwargs=["dry_run", "force"])
  def delete(warehouse: str, dry_run: bool, force: bool) -> None:
  """
  Delete warehouse
@@ -470,11 +476,19 @@ def delete(warehouse: str, dry_run: bool, force: bool) -> None:
  # Do we really need this double-check?
  if "__typename" in entity and "urn" in entity:
  if entity["__typename"] in ["Container", "Dataset"]:
+ # add the Platform Resource URN to also be deleted for each dataset.
+ # This is not user visible, so no need to show a name to the user and include it in the count. Each
+ # instance corresponds to a dataset whose name is shown.
+ if entity["__typename"] == "Dataset":
+ resource_urn = platform_resource_urn(
+ entity["properties"]["qualifiedName"]
+ )
+ urns_to_delete.append(resource_urn)
+
  urns_to_delete.append(entity["urn"])
  resource_names_to_be_deleted.append(
  entity.get("name", entity.get("urn"))
  )
- # TODO: PlatformResource associated with datasets need to be deleted.

  if dry_run:
  click.echo(
@@ -485,18 +499,21 @@ def delete(warehouse: str, dry_run: bool, force: bool) -> None:
  else:
  if not force:
  click.confirm(
- f"This will delete {warehouse} warehouse, credentials, and {len(urns_to_delete)} datasets and namespaces from DataHub. Do you want to continue?",
+ f"This will delete {warehouse} warehouse, credentials, and {len(resource_names_to_be_deleted)} datasets and namespaces from DataHub. Do you want to continue?",
  abort=True,
  )
- client.hard_delete_entity(urn)
- client.hard_delete_entity(warehouse_aspect.clientId)
- client.hard_delete_entity(warehouse_aspect.clientSecret)

+ # Delete the resources in the warehouse first, so that in case it is interrupted, the warehouse itself is
+ # still available to enumerate the resources in it that are not yet deleted.
  for urn_to_delete in urns_to_delete:
  client.hard_delete_entity(urn_to_delete)

+ client.hard_delete_entity(urn)
+ client.hard_delete_entity(warehouse_aspect.clientId)
+ client.hard_delete_entity(warehouse_aspect.clientSecret)
+
  click.echo(
- f"✅ Successfully deleted iceberg warehouse {warehouse} and associated credentials, {len(urns_to_delete)} datasets and namespaces"
+ f"✅ Successfully deleted iceberg warehouse {warehouse} and associated credentials, {len(resource_names_to_be_deleted)} datasets and namespaces"
  )


@@ -504,6 +521,10 @@ def iceberg_data_platform_instance_urn(warehouse: str) -> str:
  return f"urn:li:dataPlatformInstance:({iceberg_data_platform()},{warehouse})"


+ def platform_resource_urn(dataset_name: str) -> str:
+ return f"urn:li:platformResource:iceberg.{dataset_name}"
+
+
  def iceberg_data_platform() -> str:
  return "urn:li:dataPlatform:iceberg"

@@ -624,7 +645,7 @@ def get_all_warehouses(client: DataHubGraph) -> Iterator[str]:
  graph_query = """
  query getIcebergWarehouses($start: Int, $count: Int) {
  search(
- input: {type: DATA_PLATFORM_INSTANCE, query: "*", start: $start, count: $count}
+ input: {type: DATA_PLATFORM_INSTANCE, query: "dataPlatform:iceberg", start: $start, count: $count}
  ) {
  start
  total
@@ -677,6 +698,9 @@ def get_related_entities_for_platform_instance(
  ... on Dataset {
  urn
  name
+ properties{
+ qualifiedName
+ }
  }
  }
  }
datahub/cli/ingest_cli.py CHANGED
@@ -15,14 +15,14 @@ from tabulate import tabulate
  from datahub._version import nice_version_name
  from datahub.cli import cli_utils
  from datahub.cli.config_utils import CONDENSED_DATAHUB_CONFIG_PATH
- from datahub.configuration.common import ConfigModel, GraphError
+ from datahub.configuration.common import GraphError
  from datahub.configuration.config_loader import load_config_file
- from datahub.emitter.mce_builder import datahub_guid
  from datahub.ingestion.graph.client import get_default_graph
  from datahub.ingestion.run.connection import ConnectionManager
  from datahub.ingestion.run.pipeline import Pipeline
  from datahub.telemetry import telemetry
  from datahub.upgrade import upgrade
+ from datahub.utilities.ingest_utils import deploy_source_vars
  from datahub.utilities.perf_timer import PerfTimer

  logger = logging.getLogger(__name__)
@@ -191,23 +191,6 @@ def run(
  # don't raise SystemExit if there's no error


- def _make_ingestion_urn(name: str) -> str:
- guid = datahub_guid(
- {
- "name": name,
- }
- )
- return f"urn:li:dataHubIngestionSource:deploy-{guid}"
-
-
- class DeployOptions(ConfigModel):
- name: str
- schedule: Optional[str] = None
- time_zone: str = "UTC"
- cli_version: Optional[str] = None
- executor_id: str = "default"
-
-
  @ingest.command()
  @upgrade.check_upgrade
  @telemetry.with_telemetry()
@@ -258,6 +241,16 @@ class DeployOptions(ConfigModel):
  required=False,
  default="UTC",
  )
+ @click.option(
+ "--debug", type=bool, help="Should we debug.", required=False, default=False
+ )
+ @click.option(
+ "--extra-pip",
+ type=str,
+ help='Extra pip packages. e.g. ["memray"]',
+ required=False,
+ default=None,
+ )
  def deploy(
  name: Optional[str],
  config: str,
@@ -266,6 +259,8 @@ def deploy(
  cli_version: Optional[str],
  schedule: Optional[str],
  time_zone: str,
+ extra_pip: Optional[str],
+ debug: bool = False,
  ) -> None:
  """
  Deploy an ingestion recipe to your DataHub instance.
@@ -276,83 +271,23 @@ def deploy(

  datahub_graph = get_default_graph()

- pipeline_config = load_config_file(
- config,
- allow_stdin=True,
- allow_remote=True,
- resolve_env_vars=False,
+ variables = deploy_source_vars(
+ name=name,
+ config=config,
+ urn=urn,
+ executor_id=executor_id,
+ cli_version=cli_version,
+ schedule=schedule,
+ time_zone=time_zone,
+ extra_pip=extra_pip,
+ debug=debug,
  )

- deploy_options_raw = pipeline_config.pop("deployment", None)
- if deploy_options_raw is not None:
- deploy_options = DeployOptions.parse_obj(deploy_options_raw)
-
- if name:
- logger.info(f"Overriding deployment name {deploy_options.name} with {name}")
- deploy_options.name = name
- else:
- if not name:
- raise click.UsageError(
- "Either --name must be set or deployment_name specified in the config"
- )
- deploy_options = DeployOptions(name=name)
-
- # Use remaining CLI args to override deploy_options
- if schedule:
- deploy_options.schedule = schedule
- if time_zone:
- deploy_options.time_zone = time_zone
- if cli_version:
- deploy_options.cli_version = cli_version
- if executor_id:
- deploy_options.executor_id = executor_id
-
- logger.info(f"Using {repr(deploy_options)}")
-
- if not urn:
- # When urn/name is not specified, we will generate a unique urn based on the deployment name.
- urn = _make_ingestion_urn(deploy_options.name)
- logger.info(f"Using recipe urn: {urn}")
-
- # Invariant - at this point, both urn and deploy_options are set.
-
- variables: dict = {
- "urn": urn,
- "name": deploy_options.name,
- "type": pipeline_config["source"]["type"],
- "recipe": json.dumps(pipeline_config),
- "executorId": deploy_options.executor_id,
- "version": deploy_options.cli_version,
- }
-
- if deploy_options.schedule is not None:
- variables["schedule"] = {
- "interval": deploy_options.schedule,
- "timezone": deploy_options.time_zone,
- }
-
  # The updateIngestionSource endpoint can actually do upserts as well.
  graphql_query: str = textwrap.dedent(
  """
- mutation updateIngestionSource(
- $urn: String!,
- $name: String!,
- $type: String!,
- $schedule: UpdateIngestionSourceScheduleInput,
- $recipe: String!,
- $executorId: String!
- $version: String) {
-
- updateIngestionSource(urn: $urn, input: {
- name: $name,
- type: $type,
- schedule: $schedule,
- config: {
- recipe: $recipe,
- executorId: $executorId,
- version: $version,
- }
- })
+ mutation updateIngestionSource($urn: String!, $input: UpdateIngestionSourceInput!) {
+ updateIngestionSource(urn: $urn, input: $input)
  }
  """
  )
@@ -372,7 +307,7 @@ def deploy(
  sys.exit(1)

  click.echo(
- f"✅ Successfully wrote data ingestion source metadata for recipe {deploy_options.name}:"
+ f"✅ Successfully wrote data ingestion source metadata for recipe {variables['input']['name']}:"
  )
  click.echo(response)

@@ -414,7 +349,9 @@ def parse_restli_response(response):


  @ingest.command()
- @click.argument("path", type=click.Path(exists=True))
+ @click.argument(
+ "path", type=click.Path(exists=False)
+ ) # exists=False since it only supports local filesystems
  def mcps(path: str) -> None:
  """
  Ingest metadata from a mcp json file or directory of files.
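
Note: a minimal sketch (assumption: deploy_source_vars keeps the keyword signature shown in the call above; the recipe path and schedule are illustrative) of how `datahub ingest deploy` now builds its GraphQL variables through the shared helper instead of assembling them inline.

    from datahub.utilities.ingest_utils import deploy_source_vars

    variables = deploy_source_vars(
        name="my-recipe",
        config="recipe.dhub.yaml",  # illustrative recipe path; must exist on disk
        urn=None,
        executor_id="default",
        cli_version=None,
        schedule="0 * * * *",
        time_zone="UTC",
        extra_pip=None,
        debug=False,
    )
    # The success message above reads the deployed name back via
    # variables["input"]["name"].
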
datahub/cli/lite_cli.py CHANGED
@@ -285,10 +285,12 @@ def search(
  ctx: click.Context,
  query: str = "",
  flavor: str = SearchFlavor.FREE_TEXT.name.lower(),
- aspect: List[str] = [],
+ aspect: Optional[List[str]] = None,
  details: bool = True,
  ) -> None:
  """Search with a free text or exact query string"""
+ if aspect is None:
+ aspect = []

  # query flavor should be sanitized by now, but we still need to convert it to a SearchFlavor
  try:
@@ -296,7 +298,7 @@
  except KeyError:
  raise click.UsageError(
  f"Failed to find a matching query flavor for {flavor}. Valid values are {[x.lower() for x in SearchFlavor._member_names_]}"
- )
+ ) from None
  catalog = _get_datahub_lite(read_only=True)
  # sanitize query
  result_ids = set()
datahub/cli/specific/dataproduct_cli.py CHANGED
@@ -49,7 +49,7 @@ def _abort_if_non_existent_urn(graph: DataHubGraph, urn: str, operation: str) ->
  entity_type = parsed_urn.get_type()
  except Exception:
  click.secho(f"Provided urn {urn} does not seem valid", fg="red")
- raise click.Abort()
+ raise click.Abort() from None
  else:
  if not graph.exists(urn):
  click.secho(
datahub/cli/specific/dataset_cli.py CHANGED
@@ -1,12 +1,15 @@
+ import filecmp
  import json
  import logging
+ import os
+ import shutil
  from pathlib import Path
- from typing import Set, Tuple
+ from typing import List, Set, Tuple

  import click
  from click_default_group import DefaultGroup

- from datahub.api.entities.dataset.dataset import Dataset
+ from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
@@ -30,18 +33,9 @@ def dataset() -> None:
  @telemetry.with_telemetry()
  def upsert(file: Path) -> None:
  """Upsert attributes to a Dataset in DataHub."""
-
- with get_default_graph() as graph:
- for dataset in Dataset.from_yaml(str(file)):
- try:
- for mcp in dataset.generate_mcp():
- graph.emit(mcp)
- click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
- except Exception as e:
- click.secho(
- f"Update failed for id {id}. due to {e}",
- fg="red",
- )
+ # Call the sync command with to_datahub=True to perform the upsert operation
+ ctx = click.get_current_context()
+ ctx.invoke(sync, file=str(file), to_datahub=True)


  @dataset.command(
@@ -111,3 +105,123 @@ def _get_existing_siblings(graph: DataHubGraph, urn: str) -> Set[str]:
  return set(existing.siblings)
  else:
  return set()
+
+
+ @dataset.command(
+ name="file",
+ )
+ @click.option("--lintCheck", required=False, is_flag=True)
+ @click.option("--lintFix", required=False, is_flag=True)
+ @click.argument("file", type=click.Path(exists=True))
+ @upgrade.check_upgrade
+ @telemetry.with_telemetry()
+ def file(lintcheck: bool, lintfix: bool, file: str) -> None:
+ """Operate on a Dataset file"""
+
+ if lintcheck or lintfix:
+ import tempfile
+ from pathlib import Path
+
+ # Create a temporary file in a secure way
+ # The file will be automatically deleted when the context manager exits
+ with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as temp:
+ temp_path = Path(temp.name)
+ try:
+ # Copy content to the temporary file
+ shutil.copyfile(file, temp_path)
+
+ # Run the linting
+ datasets = Dataset.from_yaml(temp.name)
+ for dataset in datasets:
+ dataset.to_yaml(temp_path)
+
+ # Compare the files
+ files_match = filecmp.cmp(file, temp_path)
+
+ if files_match:
+ click.secho("No differences found", fg="green")
+ else:
+ # Show diff for visibility
+ os.system(f"diff {file} {temp_path}")
+
+ if lintfix:
+ shutil.copyfile(temp_path, file)
+ click.secho(f"Fixed linting issues in {file}", fg="green")
+ else:
+ click.secho(
+ f"To fix these differences, run 'datahub dataset file --lintFix {file}'",
+ fg="yellow",
+ )
+ finally:
+ # Ensure the temporary file is removed
+ if temp_path.exists():
+ temp_path.unlink()
+ else:
+ click.secho(
+ "No operation specified. Choose from --lintCheck or --lintFix", fg="yellow"
+ )
+
+
+ @dataset.command(
+ name="sync",
+ )
+ @click.option("-f", "--file", required=True, type=click.Path(exists=True))
+ @click.option("--to-datahub/--from-datahub", required=True, is_flag=True)
+ @upgrade.check_upgrade
+ @telemetry.with_telemetry()
+ def sync(file: str, to_datahub: bool) -> None:
+ """Sync a Dataset file to/from DataHub"""
+
+ failures: List[str] = []
+ with get_default_graph() as graph:
+ datasets = Dataset.from_yaml(file)
+ for dataset in datasets:
+ assert (
+ dataset.urn is not None
+ ) # Validator should have ensured this is filled. Tell mypy it's not None
+ if to_datahub:
+ missing_entity_references = [
+ entity_reference
+ for entity_reference in dataset.entity_references()
+ if not graph.exists(entity_reference)
+ ]
+ if missing_entity_references:
+ click.secho(
+ "\n\t- ".join(
+ [
+ f"Skipping Dataset {dataset.urn} due to missing entity references: "
+ ]
+ + missing_entity_references
+ ),
+ fg="red",
+ )
+ failures.append(dataset.urn)
+ continue
+ try:
+ for mcp in dataset.generate_mcp():
+ graph.emit(mcp)
+ click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
+ except Exception as e:
+ click.secho(
+ f"Update failed for id {id}. due to {e}",
+ fg="red",
+ )
+ else:
+ # Sync from DataHub
+ if graph.exists(dataset.urn):
+ dataset_get_config = DatasetRetrievalConfig()
+ if dataset.downstreams:
+ dataset_get_config.include_downstreams = True
+ existing_dataset: Dataset = Dataset.from_datahub(
+ graph=graph, urn=dataset.urn, config=dataset_get_config
+ )
+ existing_dataset.to_yaml(Path(file))
+ else:
+ click.secho(f"Dataset {dataset.urn} does not exist")
+ failures.append(dataset.urn)
+ if failures:
+ click.secho(
+ f"\nFailed to sync the following Datasets: {', '.join(failures)}",
+ fg="red",
+ )
+ raise click.Abort()
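
Note: a minimal sketch (not part of the diff) of driving the new dataset sync command, which the slimmed-down upsert now delegates to. The group and command names come from the diff above; the YAML path is illustrative, and a reachable DataHub instance is assumed.

    from click.testing import CliRunner

    from datahub.cli.specific.dataset_cli import dataset

    runner = CliRunner()
    # Push the local YAML definition to DataHub ...
    runner.invoke(dataset, ["sync", "-f", "dataset.yaml", "--to-datahub"])
    # ... or pull the live state from DataHub back into the file.
    runner.invoke(dataset, ["sync", "-f", "dataset.yaml", "--from-datahub"])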