acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.1rc17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.1rc17.dist-info}/METADATA +2466 -2466
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.1rc17.dist-info}/RECORD +16 -16
- datahub/__init__.py +1 -1
- datahub/emitter/rest_emitter.py +15 -8
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/source/datahub/config.py +10 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/unity/source.py +0 -4
- datahub/sql_parsing/sql_parsing_aggregator.py +8 -5
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.1rc17.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.1rc17.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.1rc17.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
datahub/__init__.py,sha256=
|
|
1
|
+
datahub/__init__.py,sha256=GvEPomdTJt9ZrIZUgZuwaVdBYiJA2qKcUyUKxDy3owo,577
|
|
2
2
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
3
3
|
datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
|
|
4
4
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
|
|
|
119
119
|
datahub/emitter/mcp_builder.py,sha256=eOcuz41c4a3oTkNk39yYl9bTxpksxqATPHLcqyhPGT0,9856
|
|
120
120
|
datahub/emitter/mcp_patch_builder.py,sha256=oonC8iGOvDzqj890CxOjWlBdDEF1RnwvbSZy1sivlTY,4572
|
|
121
121
|
datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
|
|
122
|
-
datahub/emitter/rest_emitter.py,sha256=
|
|
122
|
+
datahub/emitter/rest_emitter.py,sha256=YpRQEyuDBq31Iw7bZtOe5arm4YviCQLpvaObzVwheBY,16759
|
|
123
123
|
datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
|
|
124
124
|
datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
|
|
125
125
|
datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
|
|
@@ -138,13 +138,13 @@ datahub/ingestion/api/registry.py,sha256=LGElUdzhNQoEr-k2SN23mJaIYnA1PYfF97LQxBm
|
|
|
138
138
|
datahub/ingestion/api/report.py,sha256=zb5Y_9ogmWm00KqX7_64sIMT24Wfpk7txRwEfKacw5I,4652
|
|
139
139
|
datahub/ingestion/api/report_helpers.py,sha256=WbUC1kQeaKqIagGV3XzfPmPs7slAT1mfNY4og2BH2A8,994
|
|
140
140
|
datahub/ingestion/api/sink.py,sha256=3jw7-x9gXGreOPwn49wG5fT3C8pYhaNMQITdMN6kbag,4478
|
|
141
|
-
datahub/ingestion/api/source.py,sha256=
|
|
141
|
+
datahub/ingestion/api/source.py,sha256=kSQ6AKDvLdFOIxaz9nPCmCSUsIMDdXHiOxzFiMdYN14,19001
|
|
142
142
|
datahub/ingestion/api/source_helpers.py,sha256=AVO0ogiCKgYmX1ubJaSs6L30TCCgOIalp6awXPF5XM0,19643
|
|
143
143
|
datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188mmC8J8,582
|
|
144
144
|
datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
|
|
145
145
|
datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
146
146
|
datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
|
|
147
|
-
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=
|
|
147
|
+
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=5jrl7cEyonce-YdWe1Iw6y3Okw5smJosqwOm5e-nvqM,4363
|
|
148
148
|
datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
149
149
|
datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
|
|
150
150
|
datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
|
|
@@ -160,7 +160,7 @@ datahub/ingestion/fs/local_fs.py,sha256=oWf-PZsl5sI-9eHWGeKlfKYagbQaSZ9fGfNbxcFj
|
|
|
160
160
|
datahub/ingestion/fs/s3_fs.py,sha256=FM6UK9A48UdOjkAO-gh1rAa4N7FTXz0Wutmp8TeX7kY,3199
|
|
161
161
|
datahub/ingestion/glossary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
162
162
|
datahub/ingestion/glossary/classification_mixin.py,sha256=pkb0Rv2SQH7VwAV5DPLoJLJwkDwTjIhOhg4mbXiz9CI,13332
|
|
163
|
-
datahub/ingestion/glossary/classifier.py,sha256=
|
|
163
|
+
datahub/ingestion/glossary/classifier.py,sha256=zp8Fe3he80H5Zz1EwymKjThUPkTpw6PgEJQvlmqrJmQ,3006
|
|
164
164
|
datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
|
|
165
165
|
datahub/ingestion/glossary/datahub_classifier.py,sha256=8VhwuLDhyOqqOr0jqAPIgorb4eAOnvTr4m13Y2Wy1-E,7515
|
|
166
166
|
datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -265,11 +265,11 @@ datahub/ingestion/source/data_lake_common/config.py,sha256=qUk83B01hjuBKHvVz8SmX
|
|
|
265
265
|
datahub/ingestion/source/data_lake_common/data_lake_utils.py,sha256=nxu7osuzqxScPFc-1ODA2M1c_xPNPpRH_SMMU7zKOIE,6212
|
|
266
266
|
datahub/ingestion/source/data_lake_common/path_spec.py,sha256=u3u2eMe70V5vur-j8mYtupZdoeA2hSeK262Whdsc2YU,23506
|
|
267
267
|
datahub/ingestion/source/datahub/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
268
|
-
datahub/ingestion/source/datahub/config.py,sha256=
|
|
268
|
+
datahub/ingestion/source/datahub/config.py,sha256=xBAZJpcw25aMI2zHi2wXi21sAfdy1rlmbBq9tY3adV0,4304
|
|
269
269
|
datahub/ingestion/source/datahub/datahub_api_reader.py,sha256=hlKADVEPoTFiRGKqRsMF5mL4fSu_IrIW8Nx7LpEzvkM,2134
|
|
270
|
-
datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=
|
|
270
|
+
datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=Rd61iHFhvrNmgzIk0jDDYxjxQUnEckbn1SKedoR5qic,8972
|
|
271
271
|
datahub/ingestion/source/datahub/datahub_kafka_reader.py,sha256=gnxhhlK-jrfnHqD_4eVmfcdtBNW6pi1N_qkDZ7uSb3o,4187
|
|
272
|
-
datahub/ingestion/source/datahub/datahub_source.py,sha256=
|
|
272
|
+
datahub/ingestion/source/datahub/datahub_source.py,sha256=5qGg_T0KJaO5WcvrsM0KM8_eTOjy0NvlMI4DUdIAiDo,8482
|
|
273
273
|
datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vBCU0XxGcZR6Xxs,940
|
|
274
274
|
datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
|
|
275
275
|
datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -432,7 +432,7 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
|
|
|
432
432
|
datahub/ingestion/source/snowflake/snowflake_config.py,sha256=jQGSa7ZQs3EsXB9ANShZ4xv9RqrhRfVHRSLeFiDwwxc,17974
|
|
433
433
|
datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
|
|
434
434
|
datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
|
|
435
|
-
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=
|
|
435
|
+
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
|
|
436
436
|
datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
|
|
437
437
|
datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=jTpnFWRqqFId6DKJvvAbNuFPxyNi1oQxxDUyMvh1iu4,26968
|
|
438
438
|
datahub/ingestion/source/snowflake/snowflake_query.py,sha256=5po2FWz41UVowykJYbTFGxsltbmlHBCPcHG20VOhdOE,38469
|
|
@@ -506,7 +506,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=_6kCI7M4-26pZ9ZMGJUh6LwYmbGAZlnvc
|
|
|
506
506
|
datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
|
|
507
507
|
datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
|
|
508
508
|
datahub/ingestion/source/unity/report.py,sha256=0Y-ciHVTI6ZKNCJ5zWoQh3Ze1c_GMqmTMKFwzXDuuOg,2788
|
|
509
|
-
datahub/ingestion/source/unity/source.py,sha256=
|
|
509
|
+
datahub/ingestion/source/unity/source.py,sha256=YdUPCMJtpmvYVnRNnpqb4BVowFobkLvSJ_K2gHwrvCI,41752
|
|
510
510
|
datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9dEBi2k9f15o,11436
|
|
511
511
|
datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
512
512
|
datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
|
|
@@ -881,7 +881,7 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
|
|
|
881
881
|
datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
|
|
882
882
|
datahub/sql_parsing/schema_resolver.py,sha256=9INZWdxA2dMSLK6RXaVqjbjyLY_VKMhCkQv_Xd6Ln3I,10848
|
|
883
883
|
datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
|
|
884
|
-
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=
|
|
884
|
+
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=ULvLZygN_LtZQg_DKLQ2lDzz3YsEhZBvZUx3wmYeP_Q,69976
|
|
885
885
|
datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
|
|
886
886
|
datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
|
|
887
887
|
datahub/sql_parsing/sqlglot_lineage.py,sha256=gUVq3NwZUzQByJs43JZXz8lZf0ZVzVt0FzaW5wZOwK4,47460
|
|
@@ -986,8 +986,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
986
986
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
987
987
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
988
988
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
989
|
-
acryl_datahub-0.15.0.
|
|
990
|
-
acryl_datahub-0.15.0.
|
|
991
|
-
acryl_datahub-0.15.0.
|
|
992
|
-
acryl_datahub-0.15.0.
|
|
993
|
-
acryl_datahub-0.15.0.
|
|
989
|
+
acryl_datahub-0.15.0.1rc17.dist-info/METADATA,sha256=dV3uL4e_h1lxLVGl_a9Cmnm5pBi8kgHTbR56ypsqjC8,173444
|
|
990
|
+
acryl_datahub-0.15.0.1rc17.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
|
|
991
|
+
acryl_datahub-0.15.0.1rc17.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
|
|
992
|
+
acryl_datahub-0.15.0.1rc17.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
993
|
+
acryl_datahub-0.15.0.1rc17.dist-info/RECORD,,
|
datahub/__init__.py
CHANGED
datahub/emitter/rest_emitter.py
CHANGED
|
@@ -13,6 +13,7 @@ from requests.exceptions import HTTPError, RequestException
|
|
|
13
13
|
from datahub import nice_version_name
|
|
14
14
|
from datahub.cli import config_utils
|
|
15
15
|
from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
|
|
16
|
+
from datahub.cli.env_utils import get_boolean_env_variable
|
|
16
17
|
from datahub.configuration.common import ConfigurationError, OperationalError
|
|
17
18
|
from datahub.emitter.generic_emitter import Emitter
|
|
18
19
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
@@ -46,6 +47,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(
|
|
|
46
47
|
os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
|
|
47
48
|
)
|
|
48
49
|
|
|
50
|
+
_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
|
|
51
|
+
|
|
49
52
|
# The limit is 16mb. We will use a max of 15mb to have some space
|
|
50
53
|
# for overhead like request headers.
|
|
51
54
|
# This applies to pretty much all calls to GMS.
|
|
@@ -291,7 +294,8 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
291
294
|
mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
|
|
292
295
|
async_flag: Optional[bool] = None,
|
|
293
296
|
) -> int:
|
|
294
|
-
|
|
297
|
+
if _DATAHUB_EMITTER_TRACE:
|
|
298
|
+
logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
|
|
295
299
|
url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
|
|
296
300
|
for mcp in mcps:
|
|
297
301
|
ensure_has_system_metadata(mcp)
|
|
@@ -304,22 +308,25 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
304
308
|
current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
|
|
305
309
|
for mcp_obj in mcp_objs:
|
|
306
310
|
mcp_obj_size = len(json.dumps(mcp_obj))
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
311
|
+
if _DATAHUB_EMITTER_TRACE:
|
|
312
|
+
logger.debug(
|
|
313
|
+
f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
|
|
314
|
+
)
|
|
310
315
|
|
|
311
316
|
if (
|
|
312
317
|
mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
|
|
313
318
|
or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
|
|
314
319
|
):
|
|
315
|
-
|
|
320
|
+
if _DATAHUB_EMITTER_TRACE:
|
|
321
|
+
logger.debug("Decided to create new chunk")
|
|
316
322
|
mcp_obj_chunks.append([])
|
|
317
323
|
current_chunk_size = 0
|
|
318
324
|
mcp_obj_chunks[-1].append(mcp_obj)
|
|
319
325
|
current_chunk_size += mcp_obj_size
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
326
|
+
if len(mcp_obj_chunks) > 0:
|
|
327
|
+
logger.debug(
|
|
328
|
+
f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
|
|
329
|
+
)
|
|
323
330
|
|
|
324
331
|
for mcp_obj_chunk in mcp_obj_chunks:
|
|
325
332
|
# TODO: We're calling json.dumps on each MCP object twice, once to estimate
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
-
from typing import Iterable, List
|
|
3
|
+
from typing import TYPE_CHECKING, Iterable, List
|
|
4
4
|
|
|
5
5
|
from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
|
|
6
6
|
from datahub.emitter.serialization_helper import pre_json_transform
|
|
7
|
-
from datahub.ingestion.api.source import SourceReport
|
|
8
7
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
9
8
|
from datahub.metadata.schema_classes import (
|
|
10
9
|
DatasetProfileClass,
|
|
@@ -12,12 +11,15 @@ from datahub.metadata.schema_classes import (
|
|
|
12
11
|
SchemaMetadataClass,
|
|
13
12
|
)
|
|
14
13
|
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from datahub.ingestion.api.source import SourceReport
|
|
16
|
+
|
|
15
17
|
logger = logging.getLogger(__name__)
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
class EnsureAspectSizeProcessor:
|
|
19
21
|
def __init__(
|
|
20
|
-
self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
|
|
22
|
+
self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
|
|
21
23
|
):
|
|
22
24
|
self.report = report
|
|
23
25
|
self.payload_constraint = payload_constraint
|
datahub/ingestion/api/source.py
CHANGED
|
@@ -31,6 +31,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
|
|
|
31
31
|
from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
|
|
32
32
|
auto_patch_last_modified,
|
|
33
33
|
)
|
|
34
|
+
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
|
|
35
|
+
EnsureAspectSizeProcessor,
|
|
36
|
+
)
|
|
34
37
|
from datahub.ingestion.api.closeable import Closeable
|
|
35
38
|
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
|
|
36
39
|
from datahub.ingestion.api.report import Report
|
|
@@ -450,6 +453,7 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
450
453
|
browse_path_processor,
|
|
451
454
|
partial(auto_workunit_reporter, self.get_report()),
|
|
452
455
|
auto_patch_last_modified,
|
|
456
|
+
EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
|
|
453
457
|
]
|
|
454
458
|
|
|
455
459
|
@staticmethod
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
1
|
from abc import ABCMeta, abstractmethod
|
|
3
2
|
from dataclasses import dataclass
|
|
4
3
|
from typing import Any, Dict, List, Optional
|
|
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
|
|
|
38
37
|
)
|
|
39
38
|
|
|
40
39
|
max_workers: int = Field(
|
|
41
|
-
default=
|
|
42
|
-
description="Number of worker processes to use for classification. Set to 1 to disable.",
|
|
40
|
+
default=1,
|
|
41
|
+
description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
|
|
43
42
|
)
|
|
44
43
|
|
|
45
44
|
table_pattern: AllowDenyPattern = Field(
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from typing import Optional, Set
|
|
3
3
|
|
|
4
|
+
import pydantic
|
|
4
5
|
from pydantic import Field, root_validator
|
|
5
6
|
|
|
6
7
|
from datahub.configuration.common import AllowDenyPattern
|
|
@@ -119,3 +120,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
|
|
|
119
120
|
" Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
|
|
120
121
|
)
|
|
121
122
|
return values
|
|
123
|
+
|
|
124
|
+
@pydantic.validator("database_connection")
|
|
125
|
+
def validate_mysql_scheme(
|
|
126
|
+
cls, v: SQLAlchemyConnectionConfig
|
|
127
|
+
) -> SQLAlchemyConnectionConfig:
|
|
128
|
+
if "mysql" in v.scheme:
|
|
129
|
+
if v.scheme != "mysql+pymysql":
|
|
130
|
+
raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
|
|
131
|
+
return v
|
|
@@ -151,8 +151,10 @@ class DataHubDatabaseReader:
|
|
|
151
151
|
self, query: str, params: Dict[str, Any]
|
|
152
152
|
) -> Iterable[Dict[str, Any]]:
|
|
153
153
|
with self.engine.connect() as conn:
|
|
154
|
-
if self.engine.dialect.name
|
|
154
|
+
if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
|
|
155
155
|
with conn.begin(): # Transaction required for PostgreSQL server-side cursor
|
|
156
|
+
# Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
|
|
157
|
+
# https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
|
|
156
158
|
conn = conn.execution_options(
|
|
157
159
|
stream_results=True,
|
|
158
160
|
yield_per=self.config.database_query_batch_size,
|
|
@@ -160,22 +162,6 @@ class DataHubDatabaseReader:
|
|
|
160
162
|
result = conn.execute(query, params)
|
|
161
163
|
for row in result:
|
|
162
164
|
yield dict(row)
|
|
163
|
-
elif self.engine.dialect.name == "mysql": # MySQL
|
|
164
|
-
import MySQLdb
|
|
165
|
-
|
|
166
|
-
with contextlib.closing(
|
|
167
|
-
conn.connection.cursor(MySQLdb.cursors.SSCursor)
|
|
168
|
-
) as cursor:
|
|
169
|
-
logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
|
|
170
|
-
cursor.execute(query, params)
|
|
171
|
-
|
|
172
|
-
columns = [desc[0] for desc in cursor.description]
|
|
173
|
-
while True:
|
|
174
|
-
rows = cursor.fetchmany(self.config.database_query_batch_size)
|
|
175
|
-
if not rows:
|
|
176
|
-
break # Use break instead of return in generator
|
|
177
|
-
for row in rows:
|
|
178
|
-
yield dict(zip(columns, row))
|
|
179
165
|
else:
|
|
180
166
|
raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
|
|
181
167
|
|
|
@@ -130,7 +130,7 @@ class DataHubSource(StatefulIngestionSourceBase):
|
|
|
130
130
|
self._commit_progress(i)
|
|
131
131
|
|
|
132
132
|
def _get_kafka_workunits(
|
|
133
|
-
self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
|
|
133
|
+
self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
|
|
134
134
|
) -> Iterable[MetadataWorkUnit]:
|
|
135
135
|
if self.config.kafka_connection is None:
|
|
136
136
|
return
|
|
@@ -40,6 +40,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
|
|
|
40
40
|
ColumnRef,
|
|
41
41
|
DownstreamColumnRef,
|
|
42
42
|
)
|
|
43
|
+
from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
|
|
43
44
|
from datahub.utilities.perf_timer import PerfTimer
|
|
44
45
|
from datahub.utilities.time import ts_millis_to_datetime
|
|
45
46
|
|
|
@@ -239,6 +240,9 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
239
240
|
downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name)
|
|
240
241
|
|
|
241
242
|
known_lineage = KnownQueryLineageInfo(
|
|
243
|
+
query_id=get_query_fingerprint(
|
|
244
|
+
query.query_text, self.identifiers.platform, fast=True
|
|
245
|
+
),
|
|
242
246
|
query_text=query.query_text,
|
|
243
247
|
downstream=downstream_table_urn,
|
|
244
248
|
upstreams=self.map_query_result_upstreams(
|
|
@@ -26,9 +26,6 @@ from datahub.emitter.mcp_builder import (
|
|
|
26
26
|
gen_containers,
|
|
27
27
|
)
|
|
28
28
|
from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
|
|
29
|
-
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
|
|
30
|
-
EnsureAspectSizeProcessor,
|
|
31
|
-
)
|
|
32
29
|
from datahub.ingestion.api.common import PipelineContext
|
|
33
30
|
from datahub.ingestion.api.decorators import (
|
|
34
31
|
SupportStatus,
|
|
@@ -263,7 +260,6 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
263
260
|
StaleEntityRemovalHandler.create(
|
|
264
261
|
self, self.config, self.ctx
|
|
265
262
|
).workunit_processor,
|
|
266
|
-
EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
|
|
267
263
|
]
|
|
268
264
|
|
|
269
265
|
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
|
|
@@ -165,6 +165,7 @@ class KnownQueryLineageInfo:
|
|
|
165
165
|
timestamp: Optional[datetime] = None
|
|
166
166
|
session_id: Optional[str] = None
|
|
167
167
|
query_type: QueryType = QueryType.UNKNOWN
|
|
168
|
+
query_id: Optional[str] = None
|
|
168
169
|
|
|
169
170
|
|
|
170
171
|
@dataclasses.dataclass
|
|
@@ -618,11 +619,13 @@ class SqlParsingAggregator(Closeable):
|
|
|
618
619
|
self.report.num_known_query_lineage += 1
|
|
619
620
|
|
|
620
621
|
# Generate a fingerprint for the query.
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
622
|
+
query_fingerprint = known_query_lineage.query_id
|
|
623
|
+
if not query_fingerprint:
|
|
624
|
+
with self.report.sql_fingerprinting_timer:
|
|
625
|
+
query_fingerprint = get_query_fingerprint(
|
|
626
|
+
known_query_lineage.query_text,
|
|
627
|
+
platform=self.platform.platform_name,
|
|
628
|
+
)
|
|
626
629
|
formatted_query = self._maybe_format_query(known_query_lineage.query_text)
|
|
627
630
|
|
|
628
631
|
# Register the query.
|
|
File without changes
|
{acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.1rc17.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|