acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.1rc17__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This release of acryl-datahub was flagged as potentially problematic.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=0dgSJoggO_qJtX-oEnxH20rGzNGGCstuwsxqUKzbKUA,577
+ datahub/__init__.py,sha256=GvEPomdTJt9ZrIZUgZuwaVdBYiJA2qKcUyUKxDy3owo,577
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
  datahub/emitter/mcp_builder.py,sha256=eOcuz41c4a3oTkNk39yYl9bTxpksxqATPHLcqyhPGT0,9856
  datahub/emitter/mcp_patch_builder.py,sha256=oonC8iGOvDzqj890CxOjWlBdDEF1RnwvbSZy1sivlTY,4572
  datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
- datahub/emitter/rest_emitter.py,sha256=oqyRuXG1o1dYjiEIH5TFMb1q0xhRbpxPIA5qkyz0iQ8,16407
+ datahub/emitter/rest_emitter.py,sha256=YpRQEyuDBq31Iw7bZtOe5arm4YviCQLpvaObzVwheBY,16759
  datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
  datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
  datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -138,13 +138,13 @@ datahub/ingestion/api/registry.py,sha256=LGElUdzhNQoEr-k2SN23mJaIYnA1PYfF97LQxBm
  datahub/ingestion/api/report.py,sha256=zb5Y_9ogmWm00KqX7_64sIMT24Wfpk7txRwEfKacw5I,4652
  datahub/ingestion/api/report_helpers.py,sha256=WbUC1kQeaKqIagGV3XzfPmPs7slAT1mfNY4og2BH2A8,994
  datahub/ingestion/api/sink.py,sha256=3jw7-x9gXGreOPwn49wG5fT3C8pYhaNMQITdMN6kbag,4478
- datahub/ingestion/api/source.py,sha256=pHfFIBZa57ySpZWnt03mmayWLdbbBAGOhWqWZnf1KUA,18815
+ datahub/ingestion/api/source.py,sha256=kSQ6AKDvLdFOIxaz9nPCmCSUsIMDdXHiOxzFiMdYN14,19001
  datahub/ingestion/api/source_helpers.py,sha256=AVO0ogiCKgYmX1ubJaSs6L30TCCgOIalp6awXPF5XM0,19643
  datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188mmC8J8,582
  datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
  datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=9mzg3vhCFPI9zHSi_j3FvJtK8kUvb8PmRSz-TM3tt6k,4323
+ datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=5jrl7cEyonce-YdWe1Iw6y3Okw5smJosqwOm5e-nvqM,4363
  datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
  datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
@@ -160,7 +160,7 @@ datahub/ingestion/fs/local_fs.py,sha256=oWf-PZsl5sI-9eHWGeKlfKYagbQaSZ9fGfNbxcFj
  datahub/ingestion/fs/s3_fs.py,sha256=FM6UK9A48UdOjkAO-gh1rAa4N7FTXz0Wutmp8TeX7kY,3199
  datahub/ingestion/glossary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/glossary/classification_mixin.py,sha256=pkb0Rv2SQH7VwAV5DPLoJLJwkDwTjIhOhg4mbXiz9CI,13332
- datahub/ingestion/glossary/classifier.py,sha256=daLxnVv_JlfB_jBOxH5LrU_xQRndrsGot6z9Cir5Vuc,2981
+ datahub/ingestion/glossary/classifier.py,sha256=zp8Fe3he80H5Zz1EwymKjThUPkTpw6PgEJQvlmqrJmQ,3006
  datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
  datahub/ingestion/glossary/datahub_classifier.py,sha256=8VhwuLDhyOqqOr0jqAPIgorb4eAOnvTr4m13Y2Wy1-E,7515
  datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -265,11 +265,11 @@ datahub/ingestion/source/data_lake_common/config.py,sha256=qUk83B01hjuBKHvVz8SmX
  datahub/ingestion/source/data_lake_common/data_lake_utils.py,sha256=nxu7osuzqxScPFc-1ODA2M1c_xPNPpRH_SMMU7zKOIE,6212
  datahub/ingestion/source/data_lake_common/path_spec.py,sha256=u3u2eMe70V5vur-j8mYtupZdoeA2hSeK262Whdsc2YU,23506
  datahub/ingestion/source/datahub/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/datahub/config.py,sha256=rqZFvEmjxjBcW2cTEPYDVTAk3OLzuGIjEFghXPNeZNY,3955
+ datahub/ingestion/source/datahub/config.py,sha256=xBAZJpcw25aMI2zHi2wXi21sAfdy1rlmbBq9tY3adV0,4304
  datahub/ingestion/source/datahub/datahub_api_reader.py,sha256=hlKADVEPoTFiRGKqRsMF5mL4fSu_IrIW8Nx7LpEzvkM,2134
- datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=F8JrOjSrmJ2B6m1MWh83A1EYFDcGMla749HUeQWMnL0,9464
+ datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=Rd61iHFhvrNmgzIk0jDDYxjxQUnEckbn1SKedoR5qic,8972
  datahub/ingestion/source/datahub/datahub_kafka_reader.py,sha256=gnxhhlK-jrfnHqD_4eVmfcdtBNW6pi1N_qkDZ7uSb3o,4187
- datahub/ingestion/source/datahub/datahub_source.py,sha256=2jDnsHEzpGhr00qQI9unSUJYD6Cb1McYFKOVbA-Zcm4,8487
+ datahub/ingestion/source/datahub/datahub_source.py,sha256=5qGg_T0KJaO5WcvrsM0KM8_eTOjy0NvlMI4DUdIAiDo,8482
  datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vBCU0XxGcZR6Xxs,940
  datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
  datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -432,7 +432,7 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
  datahub/ingestion/source/snowflake/snowflake_config.py,sha256=jQGSa7ZQs3EsXB9ANShZ4xv9RqrhRfVHRSLeFiDwwxc,17974
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=5Lpy_irZlbOFJbvVkgsZSBjdLCT3VZNjlEvttzSQAU4,21121
+ datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
  datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=jTpnFWRqqFId6DKJvvAbNuFPxyNi1oQxxDUyMvh1iu4,26968
  datahub/ingestion/source/snowflake/snowflake_query.py,sha256=5po2FWz41UVowykJYbTFGxsltbmlHBCPcHG20VOhdOE,38469
@@ -506,7 +506,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=_6kCI7M4-26pZ9ZMGJUh6LwYmbGAZlnvc
  datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
  datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
  datahub/ingestion/source/unity/report.py,sha256=0Y-ciHVTI6ZKNCJ5zWoQh3Ze1c_GMqmTMKFwzXDuuOg,2788
- datahub/ingestion/source/unity/source.py,sha256=ydjqJ91q-Mir_aJSmEdo1umgPvQ5la67rBhC04IyV78,41938
+ datahub/ingestion/source/unity/source.py,sha256=YdUPCMJtpmvYVnRNnpqb4BVowFobkLvSJ_K2gHwrvCI,41752
  datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9dEBi2k9f15o,11436
  datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
@@ -881,7 +881,7 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
  datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
  datahub/sql_parsing/schema_resolver.py,sha256=9INZWdxA2dMSLK6RXaVqjbjyLY_VKMhCkQv_Xd6Ln3I,10848
  datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
- datahub/sql_parsing/sql_parsing_aggregator.py,sha256=jVF6TbyM71XdJ34K0Setz3LgJALvJrJs1mVKdxU_6d4,69830
+ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=ULvLZygN_LtZQg_DKLQ2lDzz3YsEhZBvZUx3wmYeP_Q,69976
  datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
  datahub/sql_parsing/sqlglot_lineage.py,sha256=gUVq3NwZUzQByJs43JZXz8lZf0ZVzVt0FzaW5wZOwK4,47460
@@ -986,8 +986,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.1rc16.dist-info/METADATA,sha256=hMvfZy8EYOj5eb7yygEhb_kZJbHtpVx-bWNE6H6eu_c,173444
- acryl_datahub-0.15.0.1rc16.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
- acryl_datahub-0.15.0.1rc16.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.1rc16.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.1rc16.dist-info/RECORD,,
+ acryl_datahub-0.15.0.1rc17.dist-info/METADATA,sha256=dV3uL4e_h1lxLVGl_a9Cmnm5pBi8kgHTbR56ypsqjC8,173444
+ acryl_datahub-0.15.0.1rc17.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
+ acryl_datahub-0.15.0.1rc17.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.1rc17.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.1rc17.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.1rc16"
+ __version__ = "0.15.0.1rc17"
 
 
  def is_dev_mode() -> bool:
datahub/emitter/rest_emitter.py CHANGED
@@ -13,6 +13,7 @@ from requests.exceptions import HTTPError, RequestException
  from datahub import nice_version_name
  from datahub.cli import config_utils
  from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
+ from datahub.cli.env_utils import get_boolean_env_variable
  from datahub.configuration.common import ConfigurationError, OperationalError
  from datahub.emitter.generic_emitter import Emitter
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -46,6 +47,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(
  os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
  )
 
+ _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
+
  # The limit is 16mb. We will use a max of 15mb to have some space
  # for overhead like request headers.
  # This applies to pretty much all calls to GMS.
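The new _DATAHUB_EMITTER_TRACE flag puts the emitter's verbose per-object logging behind an opt-in environment variable. A minimal sketch of the intended semantics, assuming get_boolean_env_variable follows the usual truthy-string parsing (the real helper lives in datahub.cli.env_utils and may differ in detail):

    import os

    def get_boolean_env_variable(key: str, default: bool = False) -> bool:
        # Sketch only: treat common truthy strings as True, everything else as False.
        value = os.environ.get(key)
        if value is None:
            return default
        return value.strip().lower() in ("true", "1", "yes", "y", "t")

    _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)

Setting DATAHUB_EMITTER_TRACE=true would then re-enable the chunking debug logs shown below.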
@@ -291,7 +294,8 @@ class DataHubRestEmitter(Closeable, Emitter):
  mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
  async_flag: Optional[bool] = None,
  ) -> int:
- logger.debug("Attempting to emit batch mcps")
+ if _DATAHUB_EMITTER_TRACE:
+ logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
  url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
  for mcp in mcps:
  ensure_has_system_metadata(mcp)
@@ -304,22 +308,25 @@ class DataHubRestEmitter(Closeable, Emitter):
  current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
  for mcp_obj in mcp_objs:
  mcp_obj_size = len(json.dumps(mcp_obj))
- logger.debug(
- f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
- )
+ if _DATAHUB_EMITTER_TRACE:
+ logger.debug(
+ f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+ )
 
  if (
  mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
  or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
  ):
- logger.debug("Decided to create new chunk")
+ if _DATAHUB_EMITTER_TRACE:
+ logger.debug("Decided to create new chunk")
  mcp_obj_chunks.append([])
  current_chunk_size = 0
  mcp_obj_chunks[-1].append(mcp_obj)
  current_chunk_size += mcp_obj_size
- logger.debug(
- f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
- )
+ if len(mcp_obj_chunks) > 0:
+ logger.debug(
+ f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
+ )
 
  for mcp_obj_chunk in mcp_obj_chunks:
  # TODO: We're calling json.dumps on each MCP object twice, once to estimate
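For context, the loop above packs serialized MCP objects greedily into chunks bounded both by total payload bytes and by object count. A self-contained sketch of that strategy, with simplified names and hypothetical constants standing in for INGEST_MAX_PAYLOAD_BYTES and BATCH_INGEST_MAX_PAYLOAD_LENGTH:

    import json
    from typing import Any, Dict, List

    MAX_PAYLOAD_BYTES = 15 * 1024 * 1024  # stand-in for INGEST_MAX_PAYLOAD_BYTES
    MAX_CHUNK_LENGTH = 100  # stand-in for BATCH_INGEST_MAX_PAYLOAD_LENGTH

    def chunk_mcp_objs(objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
        chunks: List[List[Dict[str, Any]]] = [[]]
        current_size = 0
        for obj in objs:
            size = len(json.dumps(obj))
            # Open a new chunk when this object would exceed the byte or count budget.
            if current_size + size > MAX_PAYLOAD_BYTES or len(chunks[-1]) >= MAX_CHUNK_LENGTH:
                chunks.append([])
                current_size = 0
            chunks[-1].append(obj)
            current_size += size
        return chunks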
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py CHANGED
@@ -1,10 +1,9 @@
  import json
  import logging
- from typing import Iterable, List
+ from typing import TYPE_CHECKING, Iterable, List
 
  from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
  from datahub.emitter.serialization_helper import pre_json_transform
- from datahub.ingestion.api.source import SourceReport
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.metadata.schema_classes import (
  DatasetProfileClass,
@@ -12,12 +11,15 @@ from datahub.metadata.schema_classes import (
  SchemaMetadataClass,
  )
 
+ if TYPE_CHECKING:
+ from datahub.ingestion.api.source import SourceReport
+
  logger = logging.getLogger(__name__)
 
 
  class EnsureAspectSizeProcessor:
  def __init__(
- self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+ self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
  ):
  self.report = report
  self.payload_constraint = payload_constraint
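Moving the SourceReport import under TYPE_CHECKING avoids the circular import that would otherwise arise now that source.py (next file) imports this module. The pattern in isolation: the guarded import runs only for static type checkers, and the string annotation is resolved lazily.

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated by mypy/IDEs only; never executed at runtime,
        # so it cannot participate in an import cycle.
        from datahub.ingestion.api.source import SourceReport

    class Processor:
        def __init__(self, report: "SourceReport") -> None:
            self.report = report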
datahub/ingestion/api/source.py CHANGED
@@ -31,6 +31,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
  from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
  auto_patch_last_modified,
  )
+ from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+ EnsureAspectSizeProcessor,
+ )
  from datahub.ingestion.api.closeable import Closeable
  from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
  from datahub.ingestion.api.report import Report
@@ -450,6 +453,7 @@ class Source(Closeable, metaclass=ABCMeta):
  browse_path_processor,
  partial(auto_workunit_reporter, self.get_report()),
  auto_patch_last_modified,
+ EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
  ]
 
  @staticmethod
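With this change every Source runs EnsureAspectSizeProcessor by default instead of individual connectors opting in (the Unity Catalog source drops its own registration further down). A rough sketch of how such a processor list is applied, assuming each non-None entry is a callable that rewrites a stream of work units:

    from typing import Callable, Iterable, Optional

    WorkUnit = dict  # stand-in for MetadataWorkUnit
    Processor = Callable[[Iterable[WorkUnit]], Iterable[WorkUnit]]

    def apply_processors(
        stream: Iterable[WorkUnit], processors: Iterable[Optional[Processor]]
    ) -> Iterable[WorkUnit]:
        # Each processor wraps the stream produced by the previous one.
        for processor in processors:
            if processor is not None:
                stream = processor(stream)
        return stream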
datahub/ingestion/glossary/classifier.py CHANGED
@@ -1,4 +1,3 @@
- import os
  from abc import ABCMeta, abstractmethod
  from dataclasses import dataclass
  from typing import Any, Dict, List, Optional
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
  )
 
  max_workers: int = Field(
- default=(os.cpu_count() or 4),
- description="Number of worker processes to use for classification. Set to 1 to disable.",
+ default=1,
+ description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
  )
 
  table_pattern: AllowDenyPattern = Field(
datahub/ingestion/source/datahub/config.py CHANGED
@@ -1,6 +1,7 @@
  import os
  from typing import Optional, Set
 
+ import pydantic
  from pydantic import Field, root_validator
 
  from datahub.configuration.common import AllowDenyPattern
@@ -119,3 +120,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
  " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
  )
  return values
+
+ @pydantic.validator("database_connection")
+ def validate_mysql_scheme(
+ cls, v: SQLAlchemyConnectionConfig
+ ) -> SQLAlchemyConnectionConfig:
+ if "mysql" in v.scheme:
+ if v.scheme != "mysql+pymysql":
+ raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+ return v
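The new validator rejects any MySQL scheme other than mysql+pymysql (a bare mysql:// would make SQLAlchemy pick the MySQLdb driver, whose streaming path is removed below). A self-contained sketch of the same pydantic v1-style pattern, using hypothetical stand-in models:

    import pydantic

    class ConnConfig(pydantic.BaseModel):
        scheme: str

    class SourceConfig(pydantic.BaseModel):
        database_connection: ConnConfig

        @pydantic.validator("database_connection")
        def validate_mysql_scheme(cls, v: ConnConfig) -> ConnConfig:
            # Force the pure-Python pymysql driver for any MySQL-flavored scheme.
            if "mysql" in v.scheme and v.scheme != "mysql+pymysql":
                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
            return v

    SourceConfig(database_connection=ConnConfig(scheme="mysql+pymysql"))  # accepted
    # SourceConfig(database_connection=ConnConfig(scheme="mysql"))  -> ValidationError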
datahub/ingestion/source/datahub/datahub_database_reader.py CHANGED
@@ -151,8 +151,10 @@ class DataHubDatabaseReader:
  self, query: str, params: Dict[str, Any]
  ) -> Iterable[Dict[str, Any]]:
  with self.engine.connect() as conn:
- if self.engine.dialect.name == "postgresql":
+ if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
  with conn.begin(): # Transaction required for PostgreSQL server-side cursor
+ # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+ # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
  conn = conn.execution_options(
  stream_results=True,
  yield_per=self.config.database_query_batch_size,
@@ -160,22 +162,6 @@ class DataHubDatabaseReader:
  result = conn.execute(query, params)
  for row in result:
  yield dict(row)
- elif self.engine.dialect.name == "mysql": # MySQL
- import MySQLdb
-
- with contextlib.closing(
- conn.connection.cursor(MySQLdb.cursors.SSCursor)
- ) as cursor:
- logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
- cursor.execute(query, params)
-
- columns = [desc[0] for desc in cursor.description]
- while True:
- rows = cursor.fetchmany(self.config.database_query_batch_size)
- if not rows:
- break # Use break instead of return in generator
- for row in rows:
- yield dict(zip(columns, row))
  else:
  raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
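With pymysql enforced by the config validator, the dedicated MySQLdb server-side-cursor branch can go away: mysql and mariadb now take the same stream_results path as PostgreSQL. A minimal sketch of that streaming pattern (hypothetical helper, mirroring the SQLAlchemy 1.4-style options used in the hunk above):

    from typing import Any, Dict, Iterable
    from sqlalchemy import create_engine, text

    def stream_rows(url: str, query: str, batch_size: int = 1000) -> Iterable[Dict[str, Any]]:
        engine = create_engine(url)
        with engine.connect() as conn:
            with conn.begin():  # the transaction keeps the server-side cursor alive
                conn = conn.execution_options(stream_results=True, yield_per=batch_size)
                for row in conn.execute(text(query)):
                    # Rows are fetched lazily, batch_size at a time.
                    yield dict(row._mapping)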
datahub/ingestion/source/datahub/datahub_source.py CHANGED
@@ -130,7 +130,7 @@ class DataHubSource(StatefulIngestionSourceBase):
  self._commit_progress(i)
 
  def _get_kafka_workunits(
- self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] = []
+ self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
  ) -> Iterable[MetadataWorkUnit]:
  if self.config.kafka_connection is None:
  return
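This fixes the classic mutable-default-argument pitfall: a list in the def line is created once and shared across every call. The failure mode in miniature:

    def collect_bad(item, items=[]):  # default list created once, at definition time
        items.append(item)
        return items

    collect_bad("a")  # ["a"]
    collect_bad("b")  # ["a", "b"] -- state leaked from the previous call

    def collect_good(item, items=None):
        if items is None:
            items = []  # fresh list on every call
        items.append(item)
        return items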
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py CHANGED
@@ -40,6 +40,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
  ColumnRef,
  DownstreamColumnRef,
  )
+ from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
  from datahub.utilities.perf_timer import PerfTimer
  from datahub.utilities.time import ts_millis_to_datetime
 
@@ -239,6 +240,9 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
  downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name)
 
  known_lineage = KnownQueryLineageInfo(
+ query_id=get_query_fingerprint(
+ query.query_text, self.identifiers.platform, fast=True
+ ),
  query_text=query.query_text,
  downstream=downstream_table_urn,
  upstreams=self.map_query_result_upstreams(
datahub/ingestion/source/unity/source.py CHANGED
@@ -26,9 +26,6 @@ from datahub.emitter.mcp_builder import (
  gen_containers,
  )
  from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
- from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
- EnsureAspectSizeProcessor,
- )
  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.api.decorators import (
  SupportStatus,
@@ -263,7 +260,6 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
  StaleEntityRemovalHandler.create(
  self, self.config, self.ctx
  ).workunit_processor,
- EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
  ]
 
  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED
@@ -165,6 +165,7 @@ class KnownQueryLineageInfo:
  timestamp: Optional[datetime] = None
  session_id: Optional[str] = None
  query_type: QueryType = QueryType.UNKNOWN
+ query_id: Optional[str] = None
 
 
  @dataclasses.dataclass
@@ -618,11 +619,13 @@ class SqlParsingAggregator(Closeable):
  self.report.num_known_query_lineage += 1
 
  # Generate a fingerprint for the query.
- with self.report.sql_fingerprinting_timer:
- query_fingerprint = get_query_fingerprint(
- known_query_lineage.query_text,
- platform=self.platform.platform_name,
- )
+ query_fingerprint = known_query_lineage.query_id
+ if not query_fingerprint:
+ with self.report.sql_fingerprinting_timer:
+ query_fingerprint = get_query_fingerprint(
+ known_query_lineage.query_text,
+ platform=self.platform.platform_name,
+ )
  formatted_query = self._maybe_format_query(known_query_lineage.query_text)
 
  # Register the query.
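Taken together with the Snowflake change above, the aggregator now prefers a caller-supplied query_id and only falls back to fingerprinting the query text itself. The fallback logic in isolation, with a hypothetical hash standing in for the real get_query_fingerprint normalization:

    import hashlib
    from typing import Optional

    def _fingerprint(query_text: str) -> str:
        # Stand-in for get_query_fingerprint: a stable hash of whitespace-normalized text.
        return hashlib.sha256(" ".join(query_text.split()).lower().encode()).hexdigest()

    def resolve_fingerprint(query_text: str, query_id: Optional[str] = None) -> str:
        # Prefer the precomputed id (e.g. from the Snowflake extractor, fast=True);
        # compute one from the text only when the caller did not supply it.
        return query_id or _fingerprint(query_text)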