acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.
Files changed (204)
  1. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
  2. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
  3. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +1 -1
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +3 -5
  46. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  47. datahub/ingestion/source/delta_lake/config.py +8 -1
  48. datahub/ingestion/source/delta_lake/report.py +4 -2
  49. datahub/ingestion/source/delta_lake/source.py +20 -5
  50. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  51. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  52. datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
  53. datahub/ingestion/source/elastic_search.py +26 -6
  54. datahub/ingestion/source/feast.py +27 -8
  55. datahub/ingestion/source/file.py +6 -3
  56. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  57. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  58. datahub/ingestion/source/ge_data_profiler.py +12 -15
  59. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  60. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  61. datahub/ingestion/source/identity/okta.py +37 -7
  62. datahub/ingestion/source/kafka/kafka.py +1 -1
  63. datahub/ingestion/source/kafka_connect/common.py +2 -7
  64. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  65. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  66. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  67. datahub/ingestion/source/looker/looker_common.py +3 -3
  68. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  69. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  70. datahub/ingestion/source/looker/looker_source.py +1 -1
  71. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  72. datahub/ingestion/source/looker/lookml_source.py +3 -2
  73. datahub/ingestion/source/metabase.py +57 -35
  74. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  75. datahub/ingestion/source/metadata/lineage.py +2 -2
  76. datahub/ingestion/source/mlflow.py +365 -35
  77. datahub/ingestion/source/mode.py +18 -8
  78. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  79. datahub/ingestion/source/nifi.py +37 -11
  80. datahub/ingestion/source/openapi.py +1 -1
  81. datahub/ingestion/source/openapi_parser.py +49 -17
  82. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  83. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  84. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  85. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  86. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  87. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  88. datahub/ingestion/source/preset.py +7 -4
  89. datahub/ingestion/source/pulsar.py +3 -2
  90. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  91. datahub/ingestion/source/redash.py +31 -7
  92. datahub/ingestion/source/redshift/config.py +4 -0
  93. datahub/ingestion/source/redshift/datashares.py +236 -0
  94. datahub/ingestion/source/redshift/lineage.py +6 -2
  95. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  96. datahub/ingestion/source/redshift/profile.py +1 -1
  97. datahub/ingestion/source/redshift/query.py +133 -33
  98. datahub/ingestion/source/redshift/redshift.py +46 -73
  99. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  100. datahub/ingestion/source/redshift/report.py +3 -0
  101. datahub/ingestion/source/s3/config.py +5 -5
  102. datahub/ingestion/source/s3/source.py +20 -41
  103. datahub/ingestion/source/salesforce.py +550 -275
  104. datahub/ingestion/source/schema_inference/object.py +1 -1
  105. datahub/ingestion/source/sigma/sigma.py +1 -1
  106. datahub/ingestion/source/slack/slack.py +31 -10
  107. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  108. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  109. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  110. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  112. datahub/ingestion/source/sql/athena.py +10 -16
  113. datahub/ingestion/source/sql/druid.py +1 -5
  114. datahub/ingestion/source/sql/hive.py +15 -6
  115. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  116. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  117. datahub/ingestion/source/sql/mssql/source.py +11 -5
  118. datahub/ingestion/source/sql/oracle.py +127 -63
  119. datahub/ingestion/source/sql/sql_common.py +6 -12
  120. datahub/ingestion/source/sql/sql_types.py +2 -2
  121. datahub/ingestion/source/sql/teradata.py +7 -5
  122. datahub/ingestion/source/sql/trino.py +2 -2
  123. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  124. datahub/ingestion/source/superset.py +222 -62
  125. datahub/ingestion/source/tableau/tableau.py +22 -6
  126. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  127. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  128. datahub/ingestion/source/unity/source.py +11 -1
  129. datahub/ingestion/source/vertexai.py +697 -0
  130. datahub/ingestion/source_config/pulsar.py +3 -1
  131. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  132. datahub/lite/duckdb_lite.py +3 -10
  133. datahub/lite/lite_local.py +1 -1
  134. datahub/lite/lite_util.py +4 -3
  135. datahub/metadata/_schema_classes.py +714 -417
  136. datahub/metadata/_urns/urn_defs.py +1673 -1649
  137. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  138. datahub/metadata/schema.avsc +16438 -16603
  139. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  140. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  141. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  142. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  143. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  144. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  145. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  146. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  147. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  148. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  149. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  150. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  151. datahub/metadata/schemas/DomainKey.avsc +2 -1
  152. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  153. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  154. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  155. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  156. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  157. datahub/metadata/schemas/InputFields.avsc +3 -1
  158. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  159. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  160. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  162. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  163. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  164. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  165. datahub/metadata/schemas/PostKey.avsc +2 -1
  166. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  168. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  169. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  170. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  171. datahub/pydantic/__init__.py +0 -0
  172. datahub/pydantic/compat.py +58 -0
  173. datahub/sdk/__init__.py +30 -12
  174. datahub/sdk/_all_entities.py +1 -1
  175. datahub/sdk/_attribution.py +4 -0
  176. datahub/sdk/_shared.py +251 -16
  177. datahub/sdk/_utils.py +35 -0
  178. datahub/sdk/container.py +29 -5
  179. datahub/sdk/dataset.py +118 -20
  180. datahub/sdk/{_entity.py → entity.py} +24 -1
  181. datahub/sdk/entity_client.py +1 -1
  182. datahub/sdk/main_client.py +23 -0
  183. datahub/sdk/resolver_client.py +17 -29
  184. datahub/sdk/search_client.py +50 -0
  185. datahub/sdk/search_filters.py +374 -0
  186. datahub/specific/dataset.py +3 -4
  187. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  188. datahub/sql_parsing/schema_resolver.py +1 -1
  189. datahub/sql_parsing/split_statements.py +20 -13
  190. datahub/sql_parsing/sql_parsing_common.py +7 -0
  191. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  192. datahub/sql_parsing/sqlglot_utils.py +1 -4
  193. datahub/testing/check_sql_parser_result.py +5 -6
  194. datahub/testing/compare_metadata_json.py +7 -6
  195. datahub/testing/pytest_hooks.py +56 -0
  196. datahub/upgrade/upgrade.py +2 -2
  197. datahub/utilities/file_backed_collections.py +3 -14
  198. datahub/utilities/ingest_utils.py +106 -0
  199. datahub/utilities/mapping.py +1 -1
  200. datahub/utilities/memory_footprint.py +3 -2
  201. datahub/utilities/sentinels.py +22 -0
  202. datahub/utilities/unified_diff.py +5 -1
  203. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  204. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -1,7 +1,7 @@
  import logging
  from enum import Enum
  from pathlib import Path
- from typing import Iterable, List, Optional
+ from typing import Iterable, List, Optional, Union

  import yaml
  from pydantic import validator
@@ -38,7 +38,7 @@ class AllowedTypes(Enum):


  class AllowedValue(ConfigModel):
-     value: str
+     value: Union[int, float, str]
      description: Optional[str] = None
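The `AllowedValue` change above widens structured-property allowed values from strings only to strings or numbers. A minimal sketch of what that permits (model and field names come from the diff; the example values are made up):

```python
# Illustrative only: AllowedValue now accepts numeric values without stringifying them.
from datahub.api.entities.structuredproperties.structuredproperties import AllowedValue

allowed = [
    AllowedValue(value=30, description="30-day retention"),
    AllowedValue(value=0.5, description="Half-day retention"),
    AllowedValue(value="indefinite", description="Keep forever"),
]
print([v.value for v in allowed])
```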
datahub/cli/check_cli.py CHANGED
@@ -5,7 +5,8 @@ import pathlib
  import pprint
  import shutil
  import tempfile
- from typing import Dict, List, Optional, Union
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional, Union

  import click

@@ -20,7 +21,10 @@ from datahub.ingestion.sink.sink_registry import sink_registry
  from datahub.ingestion.source.source_registry import source_registry
  from datahub.ingestion.transformer.transform_registry import transform_registry
  from datahub.telemetry import telemetry
- from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
+ from datahub.utilities.file_backed_collections import (
+     ConnectionWrapper,
+     FileBackedDict,
+ )

  logger = logging.getLogger(__name__)

@@ -391,29 +395,78 @@ def test_path_spec(config: str, input: str, path_spec_key: str) -> None:
          raise e


+ def _jsonify(data: Any) -> Any:
+     if dataclasses.is_dataclass(data):
+         # dataclasses.asdict() is recursive. We're doing the recursion
+         # manually here via _jsonify calls, so we can't use
+         # dataclasses.asdict() here.
+         return {
+             f.name: _jsonify(getattr(data, f.name)) for f in dataclasses.fields(data)
+         }
+     elif isinstance(data, list):
+         return [_jsonify(item) for item in data]
+     elif isinstance(data, dict):
+         return {_jsonify(k): _jsonify(v) for k, v in data.items()}
+     elif isinstance(data, datetime):
+         return data.isoformat()
+     else:
+         return data
+
+
  @check.command()
- @click.argument("query-log-file", type=click.Path(exists=True, dir_okay=False))
- @click.option("--output", type=click.Path())
- def extract_sql_agg_log(query_log_file: str, output: Optional[str]) -> None:
+ @click.argument("db-file", type=click.Path(exists=True, dir_okay=False))
+ def extract_sql_agg_log(db_file: str) -> None:
      """Convert a sqlite db generated by the SqlParsingAggregator into a JSON."""

-     from datahub.sql_parsing.sql_parsing_aggregator import LoggedQuery
+     if pathlib.Path(db_file).suffix != ".db":
+         raise click.UsageError("DB file must be a sqlite db")
+
+     output_dir = pathlib.Path(db_file).with_suffix("")
+     output_dir.mkdir(exist_ok=True)
+
+     shared_connection = ConnectionWrapper(pathlib.Path(db_file))
+
+     tables: List[str] = [
+         row[0]
+         for row in shared_connection.execute(
+             """\
+ SELECT
+     name
+ FROM
+     sqlite_schema
+ WHERE
+     type ='table' AND
+     name NOT LIKE 'sqlite_%';
+ """,
+             parameters={},
+         )
+     ]
+     logger.info(f"Extracting {len(tables)} tables from {db_file}: {tables}")
+
+     for table in tables:
+         table_output_path = output_dir / f"{table}.json"
+         if table_output_path.exists():
+             logger.info(f"Skipping {table_output_path} because it already exists")
+             continue

-     assert dataclasses.is_dataclass(LoggedQuery)
+         # Some of the tables might actually be FileBackedList. Because
+         # the list is built on top of the FileBackedDict, we don't
+         # need to distinguish between the two cases.

-     shared_connection = ConnectionWrapper(pathlib.Path(query_log_file))
-     query_log = FileBackedList[LoggedQuery](
-         shared_connection=shared_connection, tablename="stored_queries"
-     )
-     logger.info(f"Extracting {len(query_log)} queries from {query_log_file}")
-     queries = [dataclasses.asdict(query) for query in query_log]
+         table_data: FileBackedDict[Any] = FileBackedDict(
+             shared_connection=shared_connection, tablename=table
+         )

-     if output:
-         with open(output, "w") as f:
-             json.dump(queries, f, indent=2, default=str)
-         logger.info(f"Extracted {len(queries)} queries to {output}")
-     else:
-         click.echo(json.dumps(queries, indent=2))
+         data = {}
+         with click.progressbar(
+             table_data.items(), length=len(table_data), label=f"Extracting {table}"
+         ) as items:
+             for k, v in items:
+                 data[k] = _jsonify(v)
+
+         with open(table_output_path, "w") as f:
+             json.dump(data, f, indent=2, default=str)
+         logger.info(f"Extracted {len(data)} entries to {table_output_path}")


  @check.command()
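For orientation, a minimal sketch of driving the reworked command through click's test runner; the dash-cased command name follows click's default naming, and `aggregator.db` is an assumed local SqlParsingAggregator database, not something taken from this diff:

```python
# Sketch only: the command now takes a single sqlite db argument and writes
# one JSON file per table into a directory named after the db file.
from click.testing import CliRunner

from datahub.cli.check_cli import check

runner = CliRunner()
# Assumes ./aggregator.db exists; click rejects missing paths (exists=True).
result = runner.invoke(check, ["extract-sql-agg-log", "aggregator.db"])
print(result.exit_code, result.output)
```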
datahub/cli/docker_cli.py CHANGED
@@ -231,7 +231,7 @@ def _docker_compose_v2() -> List[str]:
              # docker-compose v1 is not installed either.
              raise DockerComposeVersionError(
                  "You don't have Docker Compose installed. Please install Docker Compose. See https://docs.docker.com/compose/install/.",
-             )
+             ) from None


  def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
@@ -430,7 +430,7 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures:
      return quickstart_arch


- @docker.command()  # noqa: C901
+ @docker.command()
  @click.option(
      "--version",
      type=str,
@@ -592,7 +592,7 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures:
          "arch",
      ]
  )
- def quickstart(  # noqa: C901
+ def quickstart(
      version: Optional[str],
      build_locally: bool,
      pull_images: bool,

datahub/cli/iceberg_cli.py CHANGED
@@ -645,7 +645,7 @@ def get_all_warehouses(client: DataHubGraph) -> Iterator[str]:
      graph_query = """
      query getIcebergWarehouses($start: Int, $count: Int) {
          search(
-             input: {type: DATA_PLATFORM_INSTANCE, query: "*", start: $start, count: $count}
+             input: {type: DATA_PLATFORM_INSTANCE, query: "dataPlatform:iceberg", start: $start, count: $count}
          ) {
              start
              total
datahub/cli/ingest_cli.py CHANGED
@@ -15,14 +15,14 @@ from tabulate import tabulate
  from datahub._version import nice_version_name
  from datahub.cli import cli_utils
  from datahub.cli.config_utils import CONDENSED_DATAHUB_CONFIG_PATH
- from datahub.configuration.common import ConfigModel, GraphError
+ from datahub.configuration.common import GraphError
  from datahub.configuration.config_loader import load_config_file
- from datahub.emitter.mce_builder import datahub_guid
  from datahub.ingestion.graph.client import get_default_graph
  from datahub.ingestion.run.connection import ConnectionManager
  from datahub.ingestion.run.pipeline import Pipeline
  from datahub.telemetry import telemetry
  from datahub.upgrade import upgrade
+ from datahub.utilities.ingest_utils import deploy_source_vars
  from datahub.utilities.perf_timer import PerfTimer

  logger = logging.getLogger(__name__)
@@ -191,23 +191,6 @@ def run(
      # don't raise SystemExit if there's no error


- def _make_ingestion_urn(name: str) -> str:
-     guid = datahub_guid(
-         {
-             "name": name,
-         }
-     )
-     return f"urn:li:dataHubIngestionSource:deploy-{guid}"
-
-
- class DeployOptions(ConfigModel):
-     name: str
-     schedule: Optional[str] = None
-     time_zone: str = "UTC"
-     cli_version: Optional[str] = None
-     executor_id: str = "default"
-
-
  @ingest.command()
  @upgrade.check_upgrade
  @telemetry.with_telemetry()
@@ -258,6 +241,16 @@ class DeployOptions(ConfigModel):
      required=False,
      default="UTC",
  )
+ @click.option(
+     "--debug", type=bool, help="Should we debug.", required=False, default=False
+ )
+ @click.option(
+     "--extra-pip",
+     type=str,
+     help='Extra pip packages. e.g. ["memray"]',
+     required=False,
+     default=None,
+ )
  def deploy(
      name: Optional[str],
      config: str,
@@ -266,6 +259,8 @@ def deploy(
      cli_version: Optional[str],
      schedule: Optional[str],
      time_zone: str,
+     extra_pip: Optional[str],
+     debug: bool = False,
  ) -> None:
      """
      Deploy an ingestion recipe to your DataHub instance.
@@ -276,83 +271,23 @@ def deploy(

      datahub_graph = get_default_graph()

-     pipeline_config = load_config_file(
-         config,
-         allow_stdin=True,
-         allow_remote=True,
-         resolve_env_vars=False,
+     variables = deploy_source_vars(
+         name=name,
+         config=config,
+         urn=urn,
+         executor_id=executor_id,
+         cli_version=cli_version,
+         schedule=schedule,
+         time_zone=time_zone,
+         extra_pip=extra_pip,
+         debug=debug,
      )

-     deploy_options_raw = pipeline_config.pop("deployment", None)
-     if deploy_options_raw is not None:
-         deploy_options = DeployOptions.parse_obj(deploy_options_raw)
-
-         if name:
-             logger.info(f"Overriding deployment name {deploy_options.name} with {name}")
-             deploy_options.name = name
-     else:
-         if not name:
-             raise click.UsageError(
-                 "Either --name must be set or deployment_name specified in the config"
-             )
-         deploy_options = DeployOptions(name=name)
-
-     # Use remaining CLI args to override deploy_options
-     if schedule:
-         deploy_options.schedule = schedule
-     if time_zone:
-         deploy_options.time_zone = time_zone
-     if cli_version:
-         deploy_options.cli_version = cli_version
-     if executor_id:
-         deploy_options.executor_id = executor_id
-
-     logger.info(f"Using {repr(deploy_options)}")
-
-     if not urn:
-         # When urn/name is not specified, we will generate a unique urn based on the deployment name.
-         urn = _make_ingestion_urn(deploy_options.name)
-         logger.info(f"Using recipe urn: {urn}")
-
-     # Invariant - at this point, both urn and deploy_options are set.
-
-     variables: dict = {
-         "urn": urn,
-         "name": deploy_options.name,
-         "type": pipeline_config["source"]["type"],
-         "recipe": json.dumps(pipeline_config),
-         "executorId": deploy_options.executor_id,
-         "version": deploy_options.cli_version,
-     }
-
-     if deploy_options.schedule is not None:
-         variables["schedule"] = {
-             "interval": deploy_options.schedule,
-             "timezone": deploy_options.time_zone,
-         }
-
      # The updateIngestionSource endpoint can actually do upserts as well.
      graphql_query: str = textwrap.dedent(
          """
-         mutation updateIngestionSource(
-             $urn: String!,
-             $name: String!,
-             $type: String!,
-             $schedule: UpdateIngestionSourceScheduleInput,
-             $recipe: String!,
-             $executorId: String!
-             $version: String) {
-
-             updateIngestionSource(urn: $urn, input: {
-                 name: $name,
-                 type: $type,
-                 schedule: $schedule,
-                 config: {
-                     recipe: $recipe,
-                     executorId: $executorId,
-                     version: $version,
-                 }
-             })
+         mutation updateIngestionSource($urn: String!, $input: UpdateIngestionSourceInput!) {
+             updateIngestionSource(urn: $urn, input: $input)
          }
          """
      )
@@ -372,7 +307,7 @@ def deploy(
          sys.exit(1)

      click.echo(
-         f"✅ Successfully wrote data ingestion source metadata for recipe {deploy_options.name}:"
+         f"✅ Successfully wrote data ingestion source metadata for recipe {variables['input']['name']}:"
      )
      click.echo(response)

@@ -414,7 +349,9 @@ def parse_restli_response(response):


  @ingest.command()
- @click.argument("path", type=click.Path(exists=True))
+ @click.argument(
+     "path", type=click.Path(exists=False)
+ )  # exists=False since it only supports local filesystems
  def mcps(path: str) -> None:
      """
      Ingest metadata from a mcp json file or directory of files.
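The `deploy` refactor above moves recipe and variable preparation into `deploy_source_vars` in `datahub/utilities/ingest_utils.py`. A hedged sketch of calling it directly with the same keyword arguments the CLI now passes; the recipe path and source name are assumptions, and the return shape beyond the `urn` and `input`/`name` keys visible in this diff is not guaranteed:

```python
# Sketch only: build the GraphQL variables the reworked `datahub ingest deploy` sends.
from datahub.utilities.ingest_utils import deploy_source_vars

variables = deploy_source_vars(
    name="my-ingestion-source",  # assumed example name
    config="recipe.dhub.yaml",   # assumed local recipe file (must exist)
    urn=None,                    # assumed: helper derives a urn when none is given
    executor_id="default",
    cli_version=None,
    schedule="0 5 * * *",
    time_zone="UTC",
    extra_pip='["memray"]',      # new --extra-pip option, JSON-encoded list
    debug=False,                 # new --debug option
)
print(variables["urn"], variables["input"]["name"])
```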
datahub/cli/lite_cli.py CHANGED
@@ -285,10 +285,12 @@ def search(
      ctx: click.Context,
      query: str = "",
      flavor: str = SearchFlavor.FREE_TEXT.name.lower(),
-     aspect: List[str] = [],
+     aspect: Optional[List[str]] = None,
      details: bool = True,
  ) -> None:
      """Search with a free text or exact query string"""
+     if aspect is None:
+         aspect = []

      # query flavor should be sanitized by now, but we still need to convert it to a SearchFlavor
      try:
@@ -296,7 +298,7 @@ def search(
      except KeyError:
          raise click.UsageError(
              f"Failed to find a matching query flavor for {flavor}. Valid values are {[x.lower() for x in SearchFlavor._member_names_]}"
-         )
+         ) from None
      catalog = _get_datahub_lite(read_only=True)
      # sanitize query
      result_ids = set()
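The `aspect` change above replaces a mutable default argument with `None` plus an in-body default. A generic illustration of the pitfall being avoided (not DataHub code):

```python
# A list default is evaluated once at function definition time and shared
# across calls, so state leaks between invocations.
def bad(items=[]):
    items.append("x")
    return items

def good(items=None):
    if items is None:
        items = []
    items.append("x")
    return items

print(bad())   # ['x']
print(bad())   # ['x', 'x']  <- leaked from the first call
print(good())  # ['x']
print(good())  # ['x']
```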
datahub/cli/specific/dataproduct_cli.py CHANGED
@@ -49,7 +49,7 @@ def _abort_if_non_existent_urn(graph: DataHubGraph, urn: str, operation: str) ->
          entity_type = parsed_urn.get_type()
      except Exception:
          click.secho(f"Provided urn {urn} does not seem valid", fg="red")
-         raise click.Abort()
+         raise click.Abort() from None
      else:
          if not graph.exists(urn):
              click.secho(
datahub/cli/specific/dataset_cli.py CHANGED
@@ -1,12 +1,15 @@
+ import filecmp
  import json
  import logging
+ import os
+ import shutil
  from pathlib import Path
- from typing import Set, Tuple
+ from typing import List, Set, Tuple

  import click
  from click_default_group import DefaultGroup

- from datahub.api.entities.dataset.dataset import Dataset
+ from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
@@ -30,18 +33,9 @@ def dataset() -> None:
  @telemetry.with_telemetry()
  def upsert(file: Path) -> None:
      """Upsert attributes to a Dataset in DataHub."""
-
-     with get_default_graph() as graph:
-         for dataset in Dataset.from_yaml(str(file)):
-             try:
-                 for mcp in dataset.generate_mcp():
-                     graph.emit(mcp)
-                 click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
-             except Exception as e:
-                 click.secho(
-                     f"Update failed for id {id}. due to {e}",
-                     fg="red",
-                 )
+     # Call the sync command with to_datahub=True to perform the upsert operation
+     ctx = click.get_current_context()
+     ctx.invoke(sync, file=str(file), to_datahub=True)


  @dataset.command(
@@ -111,3 +105,123 @@ def _get_existing_siblings(graph: DataHubGraph, urn: str) -> Set[str]:
          return set(existing.siblings)
      else:
          return set()
+
+
+ @dataset.command(
+     name="file",
+ )
+ @click.option("--lintCheck", required=False, is_flag=True)
+ @click.option("--lintFix", required=False, is_flag=True)
+ @click.argument("file", type=click.Path(exists=True))
+ @upgrade.check_upgrade
+ @telemetry.with_telemetry()
+ def file(lintcheck: bool, lintfix: bool, file: str) -> None:
+     """Operate on a Dataset file"""
+
+     if lintcheck or lintfix:
+         import tempfile
+         from pathlib import Path
+
+         # Create a temporary file in a secure way
+         # The file will be automatically deleted when the context manager exits
+         with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as temp:
+             temp_path = Path(temp.name)
+             try:
+                 # Copy content to the temporary file
+                 shutil.copyfile(file, temp_path)
+
+                 # Run the linting
+                 datasets = Dataset.from_yaml(temp.name)
+                 for dataset in datasets:
+                     dataset.to_yaml(temp_path)
+
+                 # Compare the files
+                 files_match = filecmp.cmp(file, temp_path)
+
+                 if files_match:
+                     click.secho("No differences found", fg="green")
+                 else:
+                     # Show diff for visibility
+                     os.system(f"diff {file} {temp_path}")
+
+                     if lintfix:
+                         shutil.copyfile(temp_path, file)
+                         click.secho(f"Fixed linting issues in {file}", fg="green")
+                     else:
+                         click.secho(
+                             f"To fix these differences, run 'datahub dataset file --lintFix {file}'",
+                             fg="yellow",
+                         )
+             finally:
+                 # Ensure the temporary file is removed
+                 if temp_path.exists():
+                     temp_path.unlink()
+     else:
+         click.secho(
+             "No operation specified. Choose from --lintCheck or --lintFix", fg="yellow"
+         )
+
+
+ @dataset.command(
+     name="sync",
+ )
+ @click.option("-f", "--file", required=True, type=click.Path(exists=True))
+ @click.option("--to-datahub/--from-datahub", required=True, is_flag=True)
+ @upgrade.check_upgrade
+ @telemetry.with_telemetry()
+ def sync(file: str, to_datahub: bool) -> None:
+     """Sync a Dataset file to/from DataHub"""
+
+     failures: List[str] = []
+     with get_default_graph() as graph:
+         datasets = Dataset.from_yaml(file)
+         for dataset in datasets:
+             assert (
+                 dataset.urn is not None
+             )  # Validator should have ensured this is filled. Tell mypy it's not None
+             if to_datahub:
+                 missing_entity_references = [
+                     entity_reference
+                     for entity_reference in dataset.entity_references()
+                     if not graph.exists(entity_reference)
+                 ]
+                 if missing_entity_references:
+                     click.secho(
+                         "\n\t- ".join(
+                             [
+                                 f"Skipping Dataset {dataset.urn} due to missing entity references: "
+                             ]
+                             + missing_entity_references
+                         ),
+                         fg="red",
+                     )
+                     failures.append(dataset.urn)
+                     continue
+                 try:
+                     for mcp in dataset.generate_mcp():
+                         graph.emit(mcp)
+                     click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
+                 except Exception as e:
+                     click.secho(
+                         f"Update failed for id {id}. due to {e}",
+                         fg="red",
+                     )
+             else:
+                 # Sync from DataHub
+                 if graph.exists(dataset.urn):
+                     dataset_get_config = DatasetRetrievalConfig()
+                     if dataset.downstreams:
+                         dataset_get_config.include_downstreams = True
+                     existing_dataset: Dataset = Dataset.from_datahub(
+                         graph=graph, urn=dataset.urn, config=dataset_get_config
+                     )
+                     existing_dataset.to_yaml(Path(file))
+                 else:
+                     click.secho(f"Dataset {dataset.urn} does not exist")
+                     failures.append(dataset.urn)
+     if failures:
+         click.secho(
+             f"\nFailed to sync the following Datasets: {', '.join(failures)}",
+             fg="red",
+         )
+         raise click.Abort()
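The new `sync` command above (which `upsert` now delegates to) can be exercised from tests or scripts the same way; a hedged sketch, assuming a reachable DataHub instance configured via `datahub init` and a local `dataset.yaml`:

```python
# Sketch only: push a local Dataset YAML to DataHub with the new sync command.
from click.testing import CliRunner

from datahub.cli.specific.dataset_cli import dataset

runner = CliRunner()
# The "-f" and "--to-datahub/--from-datahub" options are defined in this diff;
# the dataset.yaml path is an assumed local file.
result = runner.invoke(dataset, ["sync", "-f", "dataset.yaml", "--to-datahub"])
print(result.exit_code, result.output)
```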
datahub/configuration/common.py CHANGED
@@ -20,7 +20,7 @@ from pydantic import BaseModel, Extra, ValidationError
  from pydantic.fields import Field
  from typing_extensions import Protocol, Self

- from datahub.configuration._config_enum import ConfigEnum as ConfigEnum  # noqa: I250
+ from datahub.configuration._config_enum import ConfigEnum as ConfigEnum
  from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
  from datahub.utilities.dedup_list import deduplicate_list

@@ -130,7 +130,7 @@ class PermissiveConfigModel(ConfigModel):
      # It is usually used for argument bags that are passed through to third-party libraries.

      class Config:
-         if PYDANTIC_VERSION_2:
+         if PYDANTIC_VERSION_2:  # noqa: SIM108
              extra = "allow"
          else:
              extra = Extra.allow
@@ -198,6 +198,14 @@ class IgnorableError(MetaError):
      """An error that can be ignored."""


+ class TraceTimeoutError(OperationalError):
+     """Failure to complete an API Trace within the timeout."""
+
+
+ class TraceValidationError(OperationalError):
+     """Failure to complete the expected write operation."""
+
+
  @runtime_checkable
  class ExceptionWithProps(Protocol):
      def get_telemetry_props(self) -> Dict[str, Any]: ...

datahub/configuration/git.py CHANGED
@@ -43,9 +43,7 @@ class GitReference(ConfigModel):

      @validator("repo", pre=True)
      def simplify_repo_url(cls, repo: str) -> str:
-         if repo.startswith("github.com/"):
-             repo = f"https://{repo}"
-         elif repo.startswith("gitlab.com"):
+         if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
              repo = f"https://{repo}"
          elif repo.count("/") == 1:
              repo = f"https://github.com/{repo}"

datahub/configuration/kafka.py CHANGED
@@ -44,7 +44,7 @@ class KafkaConsumerConnectionConfig(_KafkaConnectionConfig):
          try:
              value = CallableConsumerConfig(value).callable_config()
          except Exception as e:
-             raise ConfigurationError(e)
+             raise ConfigurationError(e) from e
          return value
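Several hunks in this release only append `from None` or `from e` when re-raising (docker_cli, lite_cli, dataproduct_cli, kafka config). A generic illustration of the difference in exception chaining (stand-in names, not DataHub code):

```python
# `from e` records the original exception as __cause__; `from None` suppresses
# the implicit "During handling of the above exception..." chained traceback.
class ConfigError(Exception):
    pass

def parse(raw: str) -> int:
    try:
        return int(raw)
    except ValueError as e:
        raise ConfigError(f"bad value: {raw!r}") from e  # keep the cause

def parse_quiet(raw: str) -> int:
    try:
        return int(raw)
    except ValueError:
        raise ConfigError(f"bad value: {raw!r}") from None  # hide the cause

parse_quiet("oops")  # traceback shows only ConfigError
```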