acryl-datahub 1.0.0rc3-py3-none-any.whl → 1.0.0rc4-py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registries. It is provided for informational purposes only.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.

@@ -1,6 +1,6 @@
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=ekJghJG0MWYsjIE7Q8oYJZcO06c5OXzZCJv84ngAWko,321
+datahub/_version.py,sha256=u34dk62J-E9ym97j4E0aj92IwtOxAQtbQHOnqvo6ESA,321
 datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
 datahub/errors.py,sha256=w6h8b27j9XlmPbTwqpu7-wgiTrXlHzcnUOnJ_iOrwzo,520
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -71,7 +71,7 @@ datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
 datahub/cli/exists_cli.py,sha256=IsuU86R-g7BJjAl1vULH6d-BWJHAKa4XHLZl5WxGUEM,1233
 datahub/cli/get_cli.py,sha256=VV80BCXfZ0-C8fr2k43SIuN9DB-fOYP9StWsTHnXwFw,2327
 datahub/cli/iceberg_cli.py,sha256=Jp3si_xZkOYr1uKA3h9_GlLJbiZPtVN_SpMgLa8OgoE,22984
-datahub/cli/ingest_cli.py,sha256=WAS_8BkoumzYVOsN8vbptKtQiQ61T958B_k49xJKdqI,22531
+datahub/cli/ingest_cli.py,sha256=_DznLADNNPe4sm_pFPC1OLT6a5qGRVXNOPTkk721uKE,20453
 datahub/cli/json_file.py,sha256=nWo-VVthaaW4Do1eUqgrzk0fShb29MjiKXvZVOTq76c,943
 datahub/cli/lite_cli.py,sha256=lolCnWWMMYojRMebbYTpHWBmOBQF_729RpW4A_y_xF4,13034
 datahub/cli/migrate.py,sha256=3orGfLNsdh1Q7gkPaCaf2bBWM5b3Ih4fGFw3poe0wiA,17937
@@ -514,7 +514,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=qYgjw0COscvUk8TvgWwZKgYvkYyA3j4yc
 datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
 datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
 datahub/ingestion/source/unity/report.py,sha256=XFT9oQfvEB4RkTvWGgFOoQuLPUN_AIoPXZ79xeDhGHQ,2831
-datahub/ingestion/source/unity/source.py,sha256=hdHthF3c9bdGwiyhu324WB7oElTv7N6bA_70hja4Zbk,41929
+datahub/ingestion/source/unity/source.py,sha256=5w24IC4oDhsycdt3TG7rtXOkoQpxE_-dHlLGYui4K8I,42368
 datahub/ingestion/source/unity/usage.py,sha256=0wETBAaZvHI_EGgBlxX3bKsVHEAdnUV8_bKI_lbyWjY,11500
 datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/usage/clickhouse_usage.py,sha256=jJ-EUJdS7t4d9RVjLWQQ2e36wmYzs8xtpD632z6pLiw,9974
@@ -942,6 +942,7 @@ datahub/utilities/file_backed_collections.py,sha256=B3gQS0isgbCM9cH3DEBzpA4PVixt
 datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
 datahub/utilities/groupby.py,sha256=pe6rP4ZCttYB98yjbs0Aey8C32aLb7rq-NJ_BFky0H4,524
 datahub/utilities/hive_schema_to_avro.py,sha256=1MP0a6FFVEYxLg_4lKF7hPxbHJJy0uRQYkML5zRwV3Q,11622
+datahub/utilities/ingest_utils.py,sha256=znIuvFkCdOAOg1dkF-mJn03A2YYFPHlDPZsfCPxKkaQ,3117
 datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
 datahub/utilities/logging_manager.py,sha256=bc-x5VZGvFUHT0HD-TF3Uz_nzw3dpKdJSbz6kjpAqAQ,10073
 datahub/utilities/lossy_collections.py,sha256=5rdtfK2pjwvOrrzLf_KGFOMiVvLLmoXj5EVQXTFSR3E,5704
@@ -1013,9 +1014,9 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.0.0rc3.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
-acryl_datahub-1.0.0rc3.dist-info/METADATA,sha256=OkC6rVw9ET8QynFmJiMs32eyG947NHAhIMkzoTbpv8U,175366
-acryl_datahub-1.0.0rc3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-acryl_datahub-1.0.0rc3.dist-info/entry_points.txt,sha256=U1e5ZwqPX1OaIbvGrwvozcdB8SbzFYXQM7plpdLKKeo,9592
-acryl_datahub-1.0.0rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
-acryl_datahub-1.0.0rc3.dist-info/RECORD,,
+acryl_datahub-1.0.0rc4.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+acryl_datahub-1.0.0rc4.dist-info/METADATA,sha256=DNdXlwXHLsQAxYJUquO4ZPH9g58WXUQwmZ71V7qcIWw,175366
+acryl_datahub-1.0.0rc4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+acryl_datahub-1.0.0rc4.dist-info/entry_points.txt,sha256=U1e5ZwqPX1OaIbvGrwvozcdB8SbzFYXQM7plpdLKKeo,9592
+acryl_datahub-1.0.0rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.0.0rc4.dist-info/RECORD,,
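Each RECORD entry pairs an installed path with an urlsafe-base64 sha256 digest and a byte size, which is what makes the per-file changes above detectable: ingest_cli.py shrank from 22531 to 20453 bytes, unity/source.py grew from 41929 to 42368, and utilities/ingest_utils.py is new. As a minimal sketch of how one of these entries can be checked against an installed file — the helper name and the site-packages path are illustrative, not part of this package:

    import base64
    import csv
    import hashlib
    from pathlib import Path

    # Hypothetical helper: checks one RECORD row ("path,sha256=<digest>,<size>")
    # against the file on disk. RECORD digests are urlsafe base64, unpadded.
    def verify_record_entry(site_packages: Path, row: list[str]) -> bool:
        path, hash_field, size = row[0], row[1], row[2]
        data = (site_packages / path).read_bytes()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
        return hash_field == f"sha256={digest.decode()}" and int(size) == len(data)

    # Usage sketch: scan the wheel's RECORD and report mismatched entries.
    site_packages = Path("/path/to/site-packages")  # assumption: install location
    record = site_packages / "acryl_datahub-1.0.0rc4.dist-info" / "RECORD"
    with record.open() as f:
        for row in csv.reader(f):
            if row and row[1]:  # the RECORD's own row has an empty hash field
                if not verify_record_entry(site_packages, row):
                    print("hash or size mismatch:", row[0])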
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "1.0.0rc3"
+__version__ = "1.0.0rc4"


 def is_dev_mode() -> bool:
datahub/cli/ingest_cli.py CHANGED
@@ -15,14 +15,14 @@ from tabulate import tabulate
 from datahub._version import nice_version_name
 from datahub.cli import cli_utils
 from datahub.cli.config_utils import CONDENSED_DATAHUB_CONFIG_PATH
-from datahub.configuration.common import ConfigModel, GraphError
+from datahub.configuration.common import GraphError
 from datahub.configuration.config_loader import load_config_file
-from datahub.emitter.mce_builder import datahub_guid
 from datahub.ingestion.graph.client import get_default_graph
 from datahub.ingestion.run.connection import ConnectionManager
 from datahub.ingestion.run.pipeline import Pipeline
 from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade
+from datahub.utilities.ingest_utils import deploy_source_vars
 from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)
@@ -191,23 +191,6 @@ def run(
     # don't raise SystemExit if there's no error


-def _make_ingestion_urn(name: str) -> str:
-    guid = datahub_guid(
-        {
-            "name": name,
-        }
-    )
-    return f"urn:li:dataHubIngestionSource:deploy-{guid}"
-
-
-class DeployOptions(ConfigModel):
-    name: str
-    schedule: Optional[str] = None
-    time_zone: str = "UTC"
-    cli_version: Optional[str] = None
-    executor_id: str = "default"
-
-
 @ingest.command()
 @upgrade.check_upgrade
 @telemetry.with_telemetry()
@@ -258,6 +241,16 @@ class DeployOptions(ConfigModel):
     required=False,
     default="UTC",
 )
+@click.option(
+    "--debug", type=bool, help="Should we debug.", required=False, default=False
+)
+@click.option(
+    "--extra-pip",
+    type=str,
+    help='Extra pip packages. e.g. ["memray"]',
+    required=False,
+    default=None,
+)
 def deploy(
     name: Optional[str],
     config: str,
@@ -266,6 +259,8 @@ def deploy(
     cli_version: Optional[str],
     schedule: Optional[str],
     time_zone: str,
+    extra_pip: Optional[str],
+    debug: bool = False,
 ) -> None:
     """
     Deploy an ingestion recipe to your DataHub instance.
@@ -276,83 +271,23 @@ def deploy(

     datahub_graph = get_default_graph()

-    pipeline_config = load_config_file(
-        config,
-        allow_stdin=True,
-        allow_remote=True,
-        resolve_env_vars=False,
+    variables = deploy_source_vars(
+        name=name,
+        config=config,
+        urn=urn,
+        executor_id=executor_id,
+        cli_version=cli_version,
+        schedule=schedule,
+        time_zone=time_zone,
+        extra_pip=extra_pip,
+        debug=debug,
     )

-    deploy_options_raw = pipeline_config.pop("deployment", None)
-    if deploy_options_raw is not None:
-        deploy_options = DeployOptions.parse_obj(deploy_options_raw)
-
-        if name:
-            logger.info(f"Overriding deployment name {deploy_options.name} with {name}")
-            deploy_options.name = name
-    else:
-        if not name:
-            raise click.UsageError(
-                "Either --name must be set or deployment_name specified in the config"
-            )
-        deploy_options = DeployOptions(name=name)
-
-    # Use remaining CLI args to override deploy_options
-    if schedule:
-        deploy_options.schedule = schedule
-    if time_zone:
-        deploy_options.time_zone = time_zone
-    if cli_version:
-        deploy_options.cli_version = cli_version
-    if executor_id:
-        deploy_options.executor_id = executor_id
-
-    logger.info(f"Using {repr(deploy_options)}")
-
-    if not urn:
-        # When urn/name is not specified, we will generate a unique urn based on the deployment name.
-        urn = _make_ingestion_urn(deploy_options.name)
-        logger.info(f"Using recipe urn: {urn}")
-
-    # Invariant - at this point, both urn and deploy_options are set.
-
-    variables: dict = {
-        "urn": urn,
-        "name": deploy_options.name,
-        "type": pipeline_config["source"]["type"],
-        "recipe": json.dumps(pipeline_config),
-        "executorId": deploy_options.executor_id,
-        "version": deploy_options.cli_version,
-    }
-
-    if deploy_options.schedule is not None:
-        variables["schedule"] = {
-            "interval": deploy_options.schedule,
-            "timezone": deploy_options.time_zone,
-        }
-
     # The updateIngestionSource endpoint can actually do upserts as well.
     graphql_query: str = textwrap.dedent(
         """
-        mutation updateIngestionSource(
-            $urn: String!,
-            $name: String!,
-            $type: String!,
-            $schedule: UpdateIngestionSourceScheduleInput,
-            $recipe: String!,
-            $executorId: String!
-            $version: String) {
-
-            updateIngestionSource(urn: $urn, input: {
-                name: $name,
-                type: $type,
-                schedule: $schedule,
-                config: {
-                    recipe: $recipe,
-                    executorId: $executorId,
-                    version: $version,
-                }
-            })
+        mutation updateIngestionSource($urn: String!, $input: UpdateIngestionSourceInput!) {
+            updateIngestionSource(urn: $urn, input: $input)
         }
         """
     )
@@ -372,7 +307,7 @@ def deploy(
         sys.exit(1)

     click.echo(
-        f"✅ Successfully wrote data ingestion source metadata for recipe {deploy_options.name}:"
+        f"✅ Successfully wrote data ingestion source metadata for recipe {variables['name']}:"
     )
     click.echo(response)
datahub/ingestion/source/unity/source.py CHANGED
@@ -464,7 +464,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):

             with self.report.new_stage(f"Ingest schema {schema.id}"):
                 yield from self.gen_schema_containers(schema)
-                yield from self.process_tables(schema)
+                try:
+                    yield from self.process_tables(schema)
+                except Exception as e:
+                    logger.exception(f"Error parsing schema {schema}")
+                    self.report.report_warning(
+                        message="Missed schema because of parsing issues",
+                        context=str(schema),
+                        title="Error parsing schema",
+                        exc=e,
+                    )
+                    continue

             self.report.schemas.processed(schema.id)

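This change isolates failures per schema: a parsing error in one schema is logged with its traceback, surfaced as a structured warning on the ingestion report, and the loop continues with the next schema instead of aborting the whole Unity Catalog run. A minimal standalone sketch of the same keep-going pattern for generator pipelines — the names here are illustrative, not DataHub APIs:

    import logging
    from typing import Callable, Iterable, Iterator, TypeVar

    logger = logging.getLogger(__name__)
    T = TypeVar("T")
    U = TypeVar("U")

    def process_all_isolated(
        items: Iterable[T],
        process: Callable[[T], Iterator[U]],
        warnings: list[str],
    ) -> Iterator[U]:
        """Yield results per item; one item's failure skips only that item."""
        for item in items:
            try:
                yield from process(item)
            except Exception:
                # Mirror the diff's behaviour: log with traceback, record a
                # warning, and continue with the remaining items.
                logger.exception(f"Error processing {item}")
                warnings.append(f"Skipped {item} because of a processing error")
                continue

    # Usage sketch: the second item fails, but the third is still processed.
    def explode(n: int) -> Iterator[int]:
        if n == 2:
            raise ValueError("boom")
        yield n * 10

    warns: list[str] = []
    print(list(process_all_isolated([1, 2, 3], explode, warns)))  # [10, 30]
    print(warns)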
datahub/utilities/ingest_utils.py ADDED
@@ -0,0 +1,106 @@
+import json
+import logging
+from typing import Optional
+
+import click
+
+from datahub.configuration.common import ConfigModel
+from datahub.configuration.config_loader import load_config_file
+from datahub.emitter.mce_builder import datahub_guid
+
+logger = logging.getLogger(__name__)
+
+
+def _make_ingestion_urn(name: str) -> str:
+    guid = datahub_guid(
+        {
+            "name": name,
+        }
+    )
+    return f"urn:li:dataHubIngestionSource:deploy-{guid}"
+
+
+class DeployOptions(ConfigModel):
+    name: str
+    schedule: Optional[str] = None
+    time_zone: str = "UTC"
+    cli_version: Optional[str] = None
+    executor_id: str = "default"
+
+
+def deploy_source_vars(
+    name: Optional[str],
+    config: str,
+    urn: Optional[str],
+    executor_id: str,
+    cli_version: Optional[str],
+    schedule: Optional[str],
+    time_zone: str,
+    extra_pip: Optional[str],
+    debug: bool = False,
+) -> dict:
+    pipeline_config = load_config_file(
+        config,
+        allow_stdin=True,
+        allow_remote=True,
+        resolve_env_vars=False,
+    )
+
+    deploy_options_raw = pipeline_config.pop("deployment", None)
+    if deploy_options_raw is not None:
+        deploy_options = DeployOptions.parse_obj(deploy_options_raw)
+
+        if name:
+            logger.info(f"Overriding deployment name {deploy_options.name} with {name}")
+            deploy_options.name = name
+    else:
+        if not name:
+            raise click.UsageError(
+                "Either --name must be set or deployment_name specified in the config"
+            )
+        deploy_options = DeployOptions(name=name)
+
+    # Use remaining CLI args to override deploy_options
+    if schedule:
+        deploy_options.schedule = schedule
+    if time_zone:
+        deploy_options.time_zone = time_zone
+    if cli_version:
+        deploy_options.cli_version = cli_version
+    if executor_id:
+        deploy_options.executor_id = executor_id
+
+    logger.info(f"Using {repr(deploy_options)}")
+
+    if not urn:
+        # When urn/name is not specified, we will generate a unique urn based on the deployment name.
+        urn = _make_ingestion_urn(deploy_options.name)
+        logger.info(f"Using recipe urn: {urn}")
+
+    variables: dict = {
+        "urn": urn,
+        "input": {
+            "name": deploy_options.name,
+            "type": pipeline_config["source"]["type"],
+            "config": {
+                "recipe": json.dumps(pipeline_config),
+                "executorId": deploy_options.executor_id,
+                "debugMode": debug,
+                "version": deploy_options.cli_version,
+            },
+        },
+    }
+
+    if deploy_options.schedule is not None:
+        variables["input"]["schedule"] = {
+            "interval": deploy_options.schedule,
+            "timezone": deploy_options.time_zone,
+        }
+    if extra_pip is not None:
+        extra_args_list = (
+            variables.get("input", {}).get("config", {}).get("extraArgs", [])
+        )
+        extra_args_list.append({"key": "extra_pip_requirements", "value": extra_pip})
+        variables["input"]["config"]["extraArgs"] = extra_args_list
+
+    return variables
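For reference, a sketch of what deploy_source_vars returns for a small recipe; the file path and the "demo-data" source type are placeholders, and the exact recipe string depends on what load_config_file produces:

    import json
    import tempfile

    from datahub.utilities.ingest_utils import deploy_source_vars

    # Minimal recipe on disk; any source type works for this sketch.
    with tempfile.NamedTemporaryFile("w", suffix=".yml", delete=False) as f:
        f.write("source:\n  type: demo-data\n  config: {}\n")
        recipe_path = f.name

    variables = deploy_source_vars(
        name="demo",
        config=recipe_path,
        urn=None,            # becomes urn:li:dataHubIngestionSource:deploy-<guid>
        executor_id="default",
        cli_version=None,
        schedule="0 * * * *",
        time_zone="UTC",
        extra_pip=None,
        debug=True,
    )
    print(json.dumps(variables, indent=2))
    # Expected shape (guid elided):
    # {
    #   "urn": "urn:li:dataHubIngestionSource:deploy-...",
    #   "input": {
    #     "name": "demo",
    #     "type": "demo-data",
    #     "config": {"recipe": "...", "executorId": "default",
    #                "debugMode": true, "version": null},
    #     "schedule": {"interval": "0 * * * *", "timezone": "UTC"}
    #   }
    # }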