acryl-datahub 1.0.0rc3__py3-none-any.whl → 1.0.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/METADATA +2535 -2535
- {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/RECORD +10 -9
- datahub/_version.py +1 -1
- datahub/cli/ingest_cli.py +27 -92
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/utilities/ingest_utils.py +106 -0
- {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/top_level.txt +0 -0
{acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/RECORD CHANGED

@@ -1,6 +1,6 @@
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=
+datahub/_version.py,sha256=u34dk62J-E9ym97j4E0aj92IwtOxAQtbQHOnqvo6ESA,321
 datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
 datahub/errors.py,sha256=w6h8b27j9XlmPbTwqpu7-wgiTrXlHzcnUOnJ_iOrwzo,520
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -71,7 +71,7 @@ datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
 datahub/cli/exists_cli.py,sha256=IsuU86R-g7BJjAl1vULH6d-BWJHAKa4XHLZl5WxGUEM,1233
 datahub/cli/get_cli.py,sha256=VV80BCXfZ0-C8fr2k43SIuN9DB-fOYP9StWsTHnXwFw,2327
 datahub/cli/iceberg_cli.py,sha256=Jp3si_xZkOYr1uKA3h9_GlLJbiZPtVN_SpMgLa8OgoE,22984
-datahub/cli/ingest_cli.py,sha256=
+datahub/cli/ingest_cli.py,sha256=_DznLADNNPe4sm_pFPC1OLT6a5qGRVXNOPTkk721uKE,20453
 datahub/cli/json_file.py,sha256=nWo-VVthaaW4Do1eUqgrzk0fShb29MjiKXvZVOTq76c,943
 datahub/cli/lite_cli.py,sha256=lolCnWWMMYojRMebbYTpHWBmOBQF_729RpW4A_y_xF4,13034
 datahub/cli/migrate.py,sha256=3orGfLNsdh1Q7gkPaCaf2bBWM5b3Ih4fGFw3poe0wiA,17937
@@ -514,7 +514,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=qYgjw0COscvUk8TvgWwZKgYvkYyA3j4yc
 datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
 datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
 datahub/ingestion/source/unity/report.py,sha256=XFT9oQfvEB4RkTvWGgFOoQuLPUN_AIoPXZ79xeDhGHQ,2831
-datahub/ingestion/source/unity/source.py,sha256=
+datahub/ingestion/source/unity/source.py,sha256=5w24IC4oDhsycdt3TG7rtXOkoQpxE_-dHlLGYui4K8I,42368
 datahub/ingestion/source/unity/usage.py,sha256=0wETBAaZvHI_EGgBlxX3bKsVHEAdnUV8_bKI_lbyWjY,11500
 datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/usage/clickhouse_usage.py,sha256=jJ-EUJdS7t4d9RVjLWQQ2e36wmYzs8xtpD632z6pLiw,9974
@@ -942,6 +942,7 @@ datahub/utilities/file_backed_collections.py,sha256=B3gQS0isgbCM9cH3DEBzpA4PVixt
 datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
 datahub/utilities/groupby.py,sha256=pe6rP4ZCttYB98yjbs0Aey8C32aLb7rq-NJ_BFky0H4,524
 datahub/utilities/hive_schema_to_avro.py,sha256=1MP0a6FFVEYxLg_4lKF7hPxbHJJy0uRQYkML5zRwV3Q,11622
+datahub/utilities/ingest_utils.py,sha256=znIuvFkCdOAOg1dkF-mJn03A2YYFPHlDPZsfCPxKkaQ,3117
 datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
 datahub/utilities/logging_manager.py,sha256=bc-x5VZGvFUHT0HD-TF3Uz_nzw3dpKdJSbz6kjpAqAQ,10073
 datahub/utilities/lossy_collections.py,sha256=5rdtfK2pjwvOrrzLf_KGFOMiVvLLmoXj5EVQXTFSR3E,5704
@@ -1013,9 +1014,9 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.0.
-acryl_datahub-1.0.
-acryl_datahub-1.0.
-acryl_datahub-1.0.
-acryl_datahub-1.0.
-acryl_datahub-1.0.
+acryl_datahub-1.0.0rc4.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+acryl_datahub-1.0.0rc4.dist-info/METADATA,sha256=DNdXlwXHLsQAxYJUquO4ZPH9g58WXUQwmZ71V7qcIWw,175366
+acryl_datahub-1.0.0rc4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+acryl_datahub-1.0.0rc4.dist-info/entry_points.txt,sha256=U1e5ZwqPX1OaIbvGrwvozcdB8SbzFYXQM7plpdLKKeo,9592
+acryl_datahub-1.0.0rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.0.0rc4.dist-info/RECORD,,
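Each RECORD entry has the form path,sha256=<digest>,<size>, where the digest is the unpadded URL-safe base64 encoding of the file's SHA-256 hash (per the wheel spec, PEP 376/427), so the changed entries above simply track the files whose bytes changed between rc3 and rc4. A minimal Python sketch for reproducing a digest by hand (the helper name is ours, not part of the wheel spec):

import base64
import hashlib


def record_digest(path: str) -> str:
    # RECORD uses the URL-safe base64 of the SHA-256 digest, '=' padding stripped.
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")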
datahub/_version.py CHANGED

datahub/cli/ingest_cli.py CHANGED
@@ -15,14 +15,14 @@ from tabulate import tabulate
 from datahub._version import nice_version_name
 from datahub.cli import cli_utils
 from datahub.cli.config_utils import CONDENSED_DATAHUB_CONFIG_PATH
-from datahub.configuration.common import ConfigModel, GraphError
+from datahub.configuration.common import GraphError
 from datahub.configuration.config_loader import load_config_file
-from datahub.emitter.mce_builder import datahub_guid
 from datahub.ingestion.graph.client import get_default_graph
 from datahub.ingestion.run.connection import ConnectionManager
 from datahub.ingestion.run.pipeline import Pipeline
 from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade
+from datahub.utilities.ingest_utils import deploy_source_vars
 from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
@@ -191,23 +191,6 @@ def run(
     # don't raise SystemExit if there's no error
 
 
-def _make_ingestion_urn(name: str) -> str:
-    guid = datahub_guid(
-        {
-            "name": name,
-        }
-    )
-    return f"urn:li:dataHubIngestionSource:deploy-{guid}"
-
-
-class DeployOptions(ConfigModel):
-    name: str
-    schedule: Optional[str] = None
-    time_zone: str = "UTC"
-    cli_version: Optional[str] = None
-    executor_id: str = "default"
-
-
 @ingest.command()
 @upgrade.check_upgrade
 @telemetry.with_telemetry()
@@ -258,6 +241,16 @@ class DeployOptions(ConfigModel):
     required=False,
     default="UTC",
 )
+@click.option(
+    "--debug", type=bool, help="Should we debug.", required=False, default=False
+)
+@click.option(
+    "--extra-pip",
+    type=str,
+    help='Extra pip packages. e.g. ["memray"]',
+    required=False,
+    default=None,
+)
 def deploy(
     name: Optional[str],
     config: str,
@@ -266,6 +259,8 @@ def deploy(
     cli_version: Optional[str],
     schedule: Optional[str],
     time_zone: str,
+    extra_pip: Optional[str],
+    debug: bool = False,
 ) -> None:
     """
     Deploy an ingestion recipe to your DataHub instance.
@@ -276,83 +271,23 @@ def deploy(
 
     datahub_graph = get_default_graph()
 
-    pipeline_config = load_config_file(
-        config,
-        allow_stdin=True,
-        allow_remote=True,
-        resolve_env_vars=False,
+    variables = deploy_source_vars(
+        name=name,
+        config=config,
+        urn=urn,
+        executor_id=executor_id,
+        cli_version=cli_version,
+        schedule=schedule,
+        time_zone=time_zone,
+        extra_pip=extra_pip,
+        debug=debug,
     )
 
-    deploy_options_raw = pipeline_config.pop("deployment", None)
-    if deploy_options_raw is not None:
-        deploy_options = DeployOptions.parse_obj(deploy_options_raw)
-
-        if name:
-            logger.info(f"Overriding deployment name {deploy_options.name} with {name}")
-            deploy_options.name = name
-    else:
-        if not name:
-            raise click.UsageError(
-                "Either --name must be set or deployment_name specified in the config"
-            )
-        deploy_options = DeployOptions(name=name)
-
-    # Use remaining CLI args to override deploy_options
-    if schedule:
-        deploy_options.schedule = schedule
-    if time_zone:
-        deploy_options.time_zone = time_zone
-    if cli_version:
-        deploy_options.cli_version = cli_version
-    if executor_id:
-        deploy_options.executor_id = executor_id
-
-    logger.info(f"Using {repr(deploy_options)}")
-
-    if not urn:
-        # When urn/name is not specified, we will generate a unique urn based on the deployment name.
-        urn = _make_ingestion_urn(deploy_options.name)
-        logger.info(f"Using recipe urn: {urn}")
-
-    # Invariant - at this point, both urn and deploy_options are set.
-
-    variables: dict = {
-        "urn": urn,
-        "name": deploy_options.name,
-        "type": pipeline_config["source"]["type"],
-        "recipe": json.dumps(pipeline_config),
-        "executorId": deploy_options.executor_id,
-        "version": deploy_options.cli_version,
-    }
-
-    if deploy_options.schedule is not None:
-        variables["schedule"] = {
-            "interval": deploy_options.schedule,
-            "timezone": deploy_options.time_zone,
-        }
-
     # The updateIngestionSource endpoint can actually do upserts as well.
     graphql_query: str = textwrap.dedent(
         """
-        mutation updateIngestionSource(
-            $urn: String!,
-            $name: String!,
-            $type: String!,
-            $schedule: UpdateIngestionSourceScheduleInput,
-            $recipe: String!,
-            $executorId: String!
-            $version: String) {
-
-            updateIngestionSource(urn: $urn, input: {
-                name: $name,
-                type: $type,
-                schedule: $schedule,
-                config: {
-                    recipe: $recipe,
-                    executorId: $executorId,
-                    version: $version,
-                }
-            })
+        mutation updateIngestionSource($urn: String!, $input: UpdateIngestionSourceInput!) {
+            updateIngestionSource(urn: $urn, input: $input)
         }
     """
    )
@@ -372,7 +307,7 @@ def deploy(
         sys.exit(1)
 
     click.echo(
-        f"✅ Successfully wrote data ingestion source metadata for recipe {
+        f"✅ Successfully wrote data ingestion source metadata for recipe {variables['name']}:"
     )
     click.echo(response)
 
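Taken together, this rework moves recipe parsing and variable assembly out of the CLI and into deploy_source_vars (the new datahub/utilities/ingest_utils.py, shown below), and collapses the mutation to a single $input of type UpdateIngestionSourceInput, so new config fields such as debugMode and extraArgs flow through without editing the query text. A hedged sketch of the resulting flow, not a verbatim excerpt of the diff; the deployment name and recipe path are hypothetical:

from datahub.ingestion.graph.client import get_default_graph
from datahub.utilities.ingest_utils import deploy_source_vars

graph = get_default_graph()
# Build the GraphQL variables the same way the reworked `datahub ingest deploy` does.
variables = deploy_source_vars(
    name="my-recipe",           # hypothetical deployment name
    config="recipe.dhub.yaml",  # hypothetical recipe path (stdin/remote also allowed)
    urn=None,                   # None -> deterministic urn:li:dataHubIngestionSource:deploy-<guid>
    executor_id="default",
    cli_version=None,
    schedule="0 * * * *",       # optional cron; lands under input.schedule with time_zone
    time_zone="UTC",
    extra_pip='["memray"]',     # lands in input.config.extraArgs as extra_pip_requirements
    debug=False,                # lands in input.config.debugMode
)
# The simplified mutation from the diff: one $input instead of per-field variables.
response = graph.execute_graphql(
    """
    mutation updateIngestionSource($urn: String!, $input: UpdateIngestionSourceInput!) {
        updateIngestionSource(urn: $urn, input: $input)
    }
    """,
    variables=variables,
)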
datahub/ingestion/source/unity/source.py CHANGED

@@ -464,7 +464,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
 
             with self.report.new_stage(f"Ingest schema {schema.id}"):
                 yield from self.gen_schema_containers(schema)
-                yield from self.process_tables(schema)
+                try:
+                    yield from self.process_tables(schema)
+                except Exception as e:
+                    logger.exception(f"Error parsing schema {schema}")
+                    self.report.report_warning(
+                        message="Missed schema because of parsing issues",
+                        context=str(schema),
+                        title="Error parsing schema",
+                        exc=e,
+                    )
+                    continue
 
             self.report.schemas.processed(schema.id)
 
datahub/utilities/ingest_utils.py ADDED

@@ -0,0 +1,106 @@
+import json
+import logging
+from typing import Optional
+
+import click
+
+from datahub.configuration.common import ConfigModel
+from datahub.configuration.config_loader import load_config_file
+from datahub.emitter.mce_builder import datahub_guid
+
+logger = logging.getLogger(__name__)
+
+
+def _make_ingestion_urn(name: str) -> str:
+    guid = datahub_guid(
+        {
+            "name": name,
+        }
+    )
+    return f"urn:li:dataHubIngestionSource:deploy-{guid}"
+
+
+class DeployOptions(ConfigModel):
+    name: str
+    schedule: Optional[str] = None
+    time_zone: str = "UTC"
+    cli_version: Optional[str] = None
+    executor_id: str = "default"
+
+
+def deploy_source_vars(
+    name: Optional[str],
+    config: str,
+    urn: Optional[str],
+    executor_id: str,
+    cli_version: Optional[str],
+    schedule: Optional[str],
+    time_zone: str,
+    extra_pip: Optional[str],
+    debug: bool = False,
+) -> dict:
+    pipeline_config = load_config_file(
+        config,
+        allow_stdin=True,
+        allow_remote=True,
+        resolve_env_vars=False,
+    )
+
+    deploy_options_raw = pipeline_config.pop("deployment", None)
+    if deploy_options_raw is not None:
+        deploy_options = DeployOptions.parse_obj(deploy_options_raw)
+
+        if name:
+            logger.info(f"Overriding deployment name {deploy_options.name} with {name}")
+            deploy_options.name = name
+    else:
+        if not name:
+            raise click.UsageError(
+                "Either --name must be set or deployment_name specified in the config"
+            )
+        deploy_options = DeployOptions(name=name)
+
+    # Use remaining CLI args to override deploy_options
+    if schedule:
+        deploy_options.schedule = schedule
+    if time_zone:
+        deploy_options.time_zone = time_zone
+    if cli_version:
+        deploy_options.cli_version = cli_version
+    if executor_id:
+        deploy_options.executor_id = executor_id
+
+    logger.info(f"Using {repr(deploy_options)}")
+
+    if not urn:
+        # When urn/name is not specified, we will generate a unique urn based on the deployment name.
+        urn = _make_ingestion_urn(deploy_options.name)
+        logger.info(f"Using recipe urn: {urn}")
+
+    variables: dict = {
+        "urn": urn,
+        "input": {
+            "name": deploy_options.name,
+            "type": pipeline_config["source"]["type"],
+            "config": {
+                "recipe": json.dumps(pipeline_config),
+                "executorId": deploy_options.executor_id,
+                "debugMode": debug,
+                "version": deploy_options.cli_version,
+            },
+        },
+    }
+
+    if deploy_options.schedule is not None:
+        variables["input"]["schedule"] = {
+            "interval": deploy_options.schedule,
+            "timezone": deploy_options.time_zone,
+        }
+    if extra_pip is not None:
+        extra_args_list = (
+            variables.get("input", {}).get("config", {}).get("extraArgs", [])
+        )
+        extra_args_list.append({"key": "extra_pip_requirements", "value": extra_pip})
+        variables["input"]["config"]["extraArgs"] = extra_args_list
+
+    return variables
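For reference, the shape of the dict deploy_source_vars returns, with illustrative values (the recipe values are hypothetical; the key layout follows the code above):

# Illustrative return value for a recipe whose source.type is "unity-catalog",
# deployed with schedule="0 * * * *" and extra_pip='["memray"]':
{
    "urn": "urn:li:dataHubIngestionSource:deploy-<guid>",
    "input": {
        "name": "my-recipe",
        "type": "unity-catalog",
        "config": {
            "recipe": "<pipeline config, JSON-serialized>",
            "executorId": "default",
            "debugMode": False,
            "version": None,
            "extraArgs": [{"key": "extra_pip_requirements", "value": '["memray"]'}],
        },
        "schedule": {"interval": "0 * * * *", "timezone": "UTC"},
    },
}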
{acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/LICENSE
File without changes

{acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/WHEEL
File without changes

{acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/entry_points.txt
File without changes

{acryl_datahub-1.0.0rc3.dist-info → acryl_datahub-1.0.0rc4.dist-info}/top_level.txt
File without changes