acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.4rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/METADATA +2614 -2614
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/RECORD +19 -19
- datahub/_version.py +1 -1
- datahub/emitter/rest_emitter.py +18 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +17 -0
- datahub/ingestion/source/dremio/dremio_api.py +98 -68
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +90 -77
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/ge_data_profiler.py +48 -8
- datahub/ingestion/source/redshift/redshift.py +17 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
acryl_datahub-1.1.0.4rc2.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
|
|
1
|
+
acryl_datahub-1.1.0.4rc3.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
|
|
2
2
|
datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
|
|
3
3
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
4
|
-
datahub/_version.py,sha256=
|
|
4
|
+
datahub/_version.py,sha256=OF251LJLh7moYp7lXruZj0uH4nIUOIgEh8RcvTFCPqU,323
|
|
5
5
|
datahub/entrypoints.py,sha256=H-YFTvxTJOgpWsFBVlxyb1opjkq-hjTzNmjy5Fq3RHg,8992
|
|
6
6
|
datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
|
|
7
7
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -132,7 +132,7 @@ datahub/emitter/mcp_builder.py,sha256=8IwJAlolQkPpMqQJPLtGrsUqAcuFNs98nrI5iYUxga
|
|
|
132
132
|
datahub/emitter/mcp_patch_builder.py,sha256=u7cpW6DkiN7KpLapmMaXgL_FneoN69boxiANbVgMdSI,4564
|
|
133
133
|
datahub/emitter/request_helper.py,sha256=2Sij9VJqgA7xZI6I7IuxsA8ioakbz0FJ3gvazxU_z3M,5738
|
|
134
134
|
datahub/emitter/response_helper.py,sha256=qGm45n43CepW7j6kP9wTXuP-U-SZnn7hQdJTdVaoqhQ,7504
|
|
135
|
-
datahub/emitter/rest_emitter.py,sha256=
|
|
135
|
+
datahub/emitter/rest_emitter.py,sha256=WrL-ldOJf2LoKv_5behyffsB6vVXjkT8xTdWMtpExtE,38101
|
|
136
136
|
datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
|
|
137
137
|
datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
|
|
138
138
|
datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
|
|
@@ -207,7 +207,7 @@ datahub/ingestion/source/demo_data.py,sha256=PbtCHlZx3wrKlOPPgkWhDQuPm7ZfIx2neXJ
|
|
|
207
207
|
datahub/ingestion/source/elastic_search.py,sha256=2dwIcSbYMaq_RoSnxLGz4Q_20oJ8AGgMKunVIBIgYM8,23406
|
|
208
208
|
datahub/ingestion/source/feast.py,sha256=rAqT7huVgi4c7iRU9qSbohPbNRrxZVw4PIvnfxNsiUk,18798
|
|
209
209
|
datahub/ingestion/source/file.py,sha256=sHCWbtrQcXMMYPs_LUqofx0mk6IFN0G7Lyk9b0yRZMI,16082
|
|
210
|
-
datahub/ingestion/source/ge_data_profiler.py,sha256=
|
|
210
|
+
datahub/ingestion/source/ge_data_profiler.py,sha256=dvwTLK95xx1vuLPzigredqXiv0nyZVKas1dP7zcy3jU,67807
|
|
211
211
|
datahub/ingestion/source/ge_profiling_config.py,sha256=sG_0BwPDRG3I4PnhfWGHf9AbePLDWG0kKcKEtlXHTuk,11544
|
|
212
212
|
datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
|
|
213
213
|
datahub/ingestion/source/ldap.py,sha256=PKoA5pVjuIxFfW1TcbYNIWSm7-C7shK2FDn7Zo5mrVM,18705
|
|
@@ -252,7 +252,7 @@ datahub/ingestion/source/azure/abs_folder_utils.py,sha256=7skXus-4fSIoKpqCeU-GG0
|
|
|
252
252
|
datahub/ingestion/source/azure/abs_utils.py,sha256=KdAlCK-PMrn35kFHxz5vrsjajyx2PD5GRgoBKdoRvcg,2075
|
|
253
253
|
datahub/ingestion/source/azure/azure_common.py,sha256=Zl0pPuE6L3QcM5B1P0LsPthZmD0h7fUUS0kg2okl6IY,4053
|
|
254
254
|
datahub/ingestion/source/bigquery_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
255
|
-
datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=
|
|
255
|
+
datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=dslbjihZMg9Utt8V8DYIucqQfychl_MB-gaDTmsMqe0,15005
|
|
256
256
|
datahub/ingestion/source/bigquery_v2/bigquery_audit.py,sha256=kEwWhq3ch6WT4q4hcX8-fvQh28KgrNfspFwIytO3vQA,25103
|
|
257
257
|
datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py,sha256=LuGJ6LgPViLIfDQfylxlQ3CA7fZYM5MDt8M-7sfzm84,5096
|
|
258
258
|
datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=bG1soRawCLhJw_15L2fJmFfj1kntTthV6ng4LZOnwko,21916
|
|
@@ -308,15 +308,15 @@ datahub/ingestion/source/delta_lake/delta_lake_utils.py,sha256=VqIDPEXepOnlk4oWM
|
|
|
308
308
|
datahub/ingestion/source/delta_lake/report.py,sha256=uR4e4QA_jv8lL3CV-wE5t43H8pUqrGmx_ItLqN9flPI,587
|
|
309
309
|
datahub/ingestion/source/delta_lake/source.py,sha256=1OxdbH_KcC6WFbf78XueKphnmCcIGizUepQ-LQK_hbk,13968
|
|
310
310
|
datahub/ingestion/source/dremio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
311
|
-
datahub/ingestion/source/dremio/dremio_api.py,sha256=
|
|
311
|
+
datahub/ingestion/source/dremio/dremio_api.py,sha256=_xtiftEFWfu1uqbh_W8j99oqJc4wah_M_4ho6W_XdzM,35001
|
|
312
312
|
datahub/ingestion/source/dremio/dremio_aspects.py,sha256=oWV2_mSpq3Bh42YJ1QVbAyp-Uihf2WIT6VsHGsGTgzk,18248
|
|
313
|
-
datahub/ingestion/source/dremio/dremio_config.py,sha256=
|
|
313
|
+
datahub/ingestion/source/dremio/dremio_config.py,sha256=xugXSYoqXuMo9q5LTjSWCx2P376fGxIl7Nc2cI-K_OQ,5882
|
|
314
314
|
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py,sha256=MQk8BAHLufN69CntFfOV8K59A_AvLC-vwMS33Jw8bBg,3069
|
|
315
315
|
datahub/ingestion/source/dremio/dremio_entities.py,sha256=1gZrNqTp3Pm6vqGDQaWt3HkxEuHKxpGYQ4geVoFvxWI,15147
|
|
316
316
|
datahub/ingestion/source/dremio/dremio_profiling.py,sha256=TAcnpo8ZRKhLDHnQSJzJg3YdwTSyEa73LUAzENs7wG4,12287
|
|
317
|
-
datahub/ingestion/source/dremio/dremio_reporting.py,sha256=
|
|
318
|
-
datahub/ingestion/source/dremio/dremio_source.py,sha256=
|
|
319
|
-
datahub/ingestion/source/dremio/dremio_sql_queries.py,sha256=
|
|
317
|
+
datahub/ingestion/source/dremio/dremio_reporting.py,sha256=YRKM6PvoJYHLBXmOGwkgou_8x8_oA2xaqTWWoVuwFMY,2247
|
|
318
|
+
datahub/ingestion/source/dremio/dremio_source.py,sha256=baUW3f6Y7WWbHXo9GqmBzZqXilMo1MbG3hvDS-bwthI,25164
|
|
319
|
+
datahub/ingestion/source/dremio/dremio_sql_queries.py,sha256=wA1hqKk9cKMJDyEdZRQcDDLZPGYwuNqrvleUHTkWgrQ,10508
|
|
320
320
|
datahub/ingestion/source/dynamodb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
321
321
|
datahub/ingestion/source/dynamodb/data_reader.py,sha256=vC77KpcP8LJN0g8wsPRDVw4sebv0ZWIP3tJkEIHaomA,3120
|
|
322
322
|
datahub/ingestion/source/dynamodb/dynamodb.py,sha256=vM3Ia5rZidqOcdPPigpuo6-7Ipoof8eF3RwxJ3SX2Ck,22771
|
|
@@ -428,7 +428,7 @@ datahub/ingestion/source/redshift/lineage.py,sha256=IPF8vHy2MFyhK-hu2-lxV2-kcnNA
|
|
|
428
428
|
datahub/ingestion/source/redshift/lineage_v2.py,sha256=dbTvuaJBV5yvCWM_oEAqZIA1JOlGxLJOexbEB47A_xE,17962
|
|
429
429
|
datahub/ingestion/source/redshift/profile.py,sha256=H1Xtc2rXScUv4w0b2BbM7POjYEwqIql_rpWvlumY_EM,4309
|
|
430
430
|
datahub/ingestion/source/redshift/query.py,sha256=vVIuNUaU4a7AfMFJZlgLuqi0cGVl0gVz8xZUSnPhWvs,47845
|
|
431
|
-
datahub/ingestion/source/redshift/redshift.py,sha256=
|
|
431
|
+
datahub/ingestion/source/redshift/redshift.py,sha256=p6rOOCjxNnPpTn-vFjgISMMjtUTzu6K-OrfWOIaIuJI,44683
|
|
432
432
|
datahub/ingestion/source/redshift/redshift_data_reader.py,sha256=zc69jwXHdF-w8J4Hq-ZQ6BjHQ75Ij2iNDMpoRJlcmlU,1724
|
|
433
433
|
datahub/ingestion/source/redshift/redshift_schema.py,sha256=7F-l_omOuKMuGE_rBWXVPG_GWXFKnCMzC4frNxZB9cs,24800
|
|
434
434
|
datahub/ingestion/source/redshift/report.py,sha256=O3QFozHlmMbH9b7KxbqhgTgr_0tCryj6FIzMiN6kRxw,3044
|
|
@@ -548,8 +548,8 @@ datahub/ingestion/source/unity/source.py,sha256=uJBjgZ7qhJpn25t0ZOcLuZ0vn2Uz4n9A
|
|
|
548
548
|
datahub/ingestion/source/unity/tag_entities.py,sha256=iWl6nRAWSye1hoFDx_Xh4aT53PN0sGzlX7n1-oTVUv8,11568
|
|
549
549
|
datahub/ingestion/source/unity/usage.py,sha256=0wETBAaZvHI_EGgBlxX3bKsVHEAdnUV8_bKI_lbyWjY,11500
|
|
550
550
|
datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
551
|
-
datahub/ingestion/source/usage/clickhouse_usage.py,sha256=
|
|
552
|
-
datahub/ingestion/source/usage/starburst_trino_usage.py,sha256=
|
|
551
|
+
datahub/ingestion/source/usage/clickhouse_usage.py,sha256=M6YVQqwJoFqJPxlTr62lFwxfDeX2-_9Diw6qtcq2XWM,10244
|
|
552
|
+
datahub/ingestion/source/usage/starburst_trino_usage.py,sha256=EnxKQ6IMt0o3VLvqfFJAE-mYMnLponnKGZEsVeGet1c,10802
|
|
553
553
|
datahub/ingestion/source/usage/usage_common.py,sha256=uuCgIduhlRL2zIAN8rymZ5cZn1WF6akZ-ZbbaVYo9_w,9813
|
|
554
554
|
datahub/ingestion/source/vertexai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
555
555
|
datahub/ingestion/source/vertexai/vertexai.py,sha256=RuHda0mbc1DElYZIZ_W_hvkN7Eg4LIvI1fRFMvpHPB0,56012
|
|
@@ -1025,7 +1025,7 @@ datahub/utilities/sql_formatter.py,sha256=tYXIsKjKmpKh0JXGxeAPrHkUWYd1SwJNLjUZsf
|
|
|
1025
1025
|
datahub/utilities/sqlalchemy_query_combiner.py,sha256=oxW20uXz8hV1Zb4fLXvTQ7c3LjACBsrF58TR2_-RSps,14982
|
|
1026
1026
|
datahub/utilities/sqlalchemy_type_converter.py,sha256=H4S4xnnyPozDBHFhBh4rjjoXa5novFzYIUBJy2KSrVc,9805
|
|
1027
1027
|
datahub/utilities/sqllineage_patch.py,sha256=0Buh50bmEqJFg1HFRCknCnePo1cecI4JmGxVhM_jh2g,1976
|
|
1028
|
-
datahub/utilities/stats_collections.py,sha256=
|
|
1028
|
+
datahub/utilities/stats_collections.py,sha256=9QDEk40UxhmQwDS6I63Gp6fcIBqmXVinKl7x2xHCD34,1702
|
|
1029
1029
|
datahub/utilities/str_enum.py,sha256=EsqCLPbrqyQ2YU_wt7QP-a6P5fnpIshXJ3AI8gLBlVA,474
|
|
1030
1030
|
datahub/utilities/tee_io.py,sha256=jBrsUfTPTk9IICntfGOG0HR-Fjp8BQMde-FPQ4r3kuI,601
|
|
1031
1031
|
datahub/utilities/threaded_iterator_executor.py,sha256=6BpCE0os3d-uMYxHBilPQC-JvEBkU6JQY4bGs06JKYI,2004
|
|
@@ -1075,8 +1075,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
1075
1075
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
1076
1076
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
1077
1077
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
1078
|
-
acryl_datahub-1.1.0.
|
|
1079
|
-
acryl_datahub-1.1.0.4rc2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
1080
|
-
acryl_datahub-1.1.0.4rc2.dist-info/entry_points.txt,sha256=-N2PGtn1uwKR7-VM9spziE_RNyOdKm_XNpOWL1lnaj4,9790
|
|
1081
|
-
acryl_datahub-1.1.0.4rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
1082
|
-
acryl_datahub-1.1.0.4rc2.dist-info/RECORD,,
|
|
1078
|
+
acryl_datahub-1.1.0.4rc3.dist-info/METADATA,sha256=2NyzaDzCIUEF2g5CM-vfMVqvzJvKXnWdJxawDLVJ_7c,182347
|
|
1079
|
+
acryl_datahub-1.1.0.4rc3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
1080
|
+
acryl_datahub-1.1.0.4rc3.dist-info/entry_points.txt,sha256=-N2PGtn1uwKR7-VM9spziE_RNyOdKm_XNpOWL1lnaj4,9790
|
|
1081
|
+
acryl_datahub-1.1.0.4rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
1082
|
+
acryl_datahub-1.1.0.4rc3.dist-info/RECORD,,
|
datahub/_version.py
CHANGED
datahub/emitter/rest_emitter.py
CHANGED
|
@@ -4,6 +4,7 @@ import functools
|
|
|
4
4
|
import json
|
|
5
5
|
import logging
|
|
6
6
|
import os
|
|
7
|
+
import re
|
|
7
8
|
import time
|
|
8
9
|
from collections import defaultdict
|
|
9
10
|
from dataclasses import dataclass
|
|
@@ -104,6 +105,22 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
|
|
|
104
105
|
)
|
|
105
106
|
|
|
106
107
|
|
|
108
|
+
def preserve_unicode_escapes(obj: Any) -> Any:
|
|
109
|
+
"""Recursively convert unicode characters back to escape sequences"""
|
|
110
|
+
if isinstance(obj, dict):
|
|
111
|
+
return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
|
|
112
|
+
elif isinstance(obj, list):
|
|
113
|
+
return [preserve_unicode_escapes(item) for item in obj]
|
|
114
|
+
elif isinstance(obj, str):
|
|
115
|
+
# Convert non-ASCII characters back to \u escapes
|
|
116
|
+
def escape_unicode(match: Any) -> Any:
|
|
117
|
+
return f"\\u{ord(match.group(0)):04x}"
|
|
118
|
+
|
|
119
|
+
return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
|
|
120
|
+
else:
|
|
121
|
+
return obj
|
|
122
|
+
|
|
123
|
+
|
|
107
124
|
class EmitMode(ConfigEnum):
|
|
108
125
|
# Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
|
|
109
126
|
# Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
|
|
@@ -611,7 +628,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
611
628
|
else:
|
|
612
629
|
url = f"{self._gms_server}/aspects?action=ingestProposal"
|
|
613
630
|
|
|
614
|
-
mcp_obj = pre_json_transform(mcp.to_obj())
|
|
631
|
+
mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
|
|
615
632
|
payload_dict = {
|
|
616
633
|
"proposal": mcp_obj,
|
|
617
634
|
"async": "true"
|
|
@@ -4,6 +4,7 @@ import logging
|
|
|
4
4
|
import os
|
|
5
5
|
from typing import Iterable, List, Optional
|
|
6
6
|
|
|
7
|
+
from datahub.configuration.common import AllowDenyPattern
|
|
7
8
|
from datahub.ingestion.api.common import PipelineContext
|
|
8
9
|
from datahub.ingestion.api.decorators import (
|
|
9
10
|
SupportStatus,
|
|
@@ -242,7 +243,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
|
|
|
242
243
|
).workunit_processor,
|
|
243
244
|
]
|
|
244
245
|
|
|
246
|
+
def _warn_deprecated_configs(self):
|
|
247
|
+
if (
|
|
248
|
+
self.config.match_fully_qualified_names is not None
|
|
249
|
+
and not self.config.match_fully_qualified_names
|
|
250
|
+
and self.config.schema_pattern is not None
|
|
251
|
+
and self.config.schema_pattern != AllowDenyPattern.allow_all()
|
|
252
|
+
):
|
|
253
|
+
self.report.report_warning(
|
|
254
|
+
message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
|
|
255
|
+
"Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
|
|
256
|
+
"The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
|
|
257
|
+
context="Config option deprecation warning",
|
|
258
|
+
title="Config option deprecation warning",
|
|
259
|
+
)
|
|
260
|
+
|
|
245
261
|
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
|
|
262
|
+
self._warn_deprecated_configs()
|
|
246
263
|
projects = get_projects(
|
|
247
264
|
self.bq_schema_extractor.schema_api,
|
|
248
265
|
self.report,
|
|
@@ -21,6 +21,7 @@ from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
|
|
|
21
21
|
)
|
|
22
22
|
from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
|
|
23
23
|
from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
|
|
24
|
+
from datahub.utilities.perf_timer import PerfTimer
|
|
24
25
|
|
|
25
26
|
logger = logging.getLogger(__name__)
|
|
26
27
|
|
|
@@ -54,6 +55,8 @@ class DremioAPIOperations:
|
|
|
54
55
|
self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
|
|
55
56
|
self._max_workers: int = connection_args.max_workers
|
|
56
57
|
self.is_dremio_cloud = connection_args.is_dremio_cloud
|
|
58
|
+
self.start_time = connection_args.start_time
|
|
59
|
+
self.end_time = connection_args.end_time
|
|
57
60
|
self.report = report
|
|
58
61
|
self.session = requests.Session()
|
|
59
62
|
if connection_args.is_dremio_cloud:
|
|
@@ -233,47 +236,71 @@ class DremioAPIOperations:
|
|
|
233
236
|
|
|
234
237
|
def get(self, url: str) -> Dict:
|
|
235
238
|
"""execute a get request on dremio"""
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
)
|
|
241
|
-
|
|
239
|
+
logger.debug(f"GET request to {self.base_url + url}")
|
|
240
|
+
self.report.api_calls_total += 1
|
|
241
|
+
self.report.api_calls_by_method_and_path["GET " + url] += 1
|
|
242
|
+
|
|
243
|
+
with PerfTimer() as timer:
|
|
244
|
+
response = self.session.get(
|
|
245
|
+
url=(self.base_url + url),
|
|
246
|
+
verify=self._verify,
|
|
247
|
+
timeout=self._timeout,
|
|
248
|
+
)
|
|
249
|
+
self.report.api_call_secs_by_method_and_path["GET " + url] += (
|
|
250
|
+
timer.elapsed_seconds()
|
|
251
|
+
)
|
|
252
|
+
# response.raise_for_status() # Enabling this line, makes integration tests to fail
|
|
253
|
+
return response.json()
|
|
242
254
|
|
|
243
255
|
def post(self, url: str, data: str) -> Dict:
|
|
244
256
|
"""execute a get request on dremio"""
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
257
|
+
logger.debug(f"POST request to {self.base_url + url}")
|
|
258
|
+
self.report.api_calls_total += 1
|
|
259
|
+
self.report.api_calls_by_method_and_path["POST " + url] += 1
|
|
260
|
+
|
|
261
|
+
with PerfTimer() as timer:
|
|
262
|
+
response = self.session.post(
|
|
263
|
+
url=(self.base_url + url),
|
|
264
|
+
data=data,
|
|
265
|
+
verify=self._verify,
|
|
266
|
+
timeout=self._timeout,
|
|
267
|
+
)
|
|
268
|
+
self.report.api_call_secs_by_method_and_path["POST " + url] += (
|
|
269
|
+
timer.elapsed_seconds()
|
|
270
|
+
)
|
|
271
|
+
# response.raise_for_status() # Enabling this line, makes integration tests to fail
|
|
272
|
+
return response.json()
|
|
252
273
|
|
|
253
274
|
def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
|
|
254
275
|
"""Execute SQL query with timeout and error handling"""
|
|
255
276
|
try:
|
|
256
|
-
|
|
277
|
+
with PerfTimer() as timer:
|
|
278
|
+
logger.info(f"Executing query: {query}")
|
|
279
|
+
response = self.post(url="/sql", data=json.dumps({"sql": query}))
|
|
257
280
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
281
|
+
if "errorMessage" in response:
|
|
282
|
+
self.report.failure(
|
|
283
|
+
message="SQL Error", context=f"{response['errorMessage']}"
|
|
284
|
+
)
|
|
285
|
+
raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
|
|
263
286
|
|
|
264
|
-
|
|
287
|
+
job_id = response["id"]
|
|
265
288
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
289
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
|
290
|
+
future = executor.submit(self.fetch_results, job_id)
|
|
291
|
+
try:
|
|
292
|
+
result = future.result(timeout=timeout)
|
|
293
|
+
logger.info(
|
|
294
|
+
f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
|
|
295
|
+
)
|
|
296
|
+
return result
|
|
297
|
+
except concurrent.futures.TimeoutError:
|
|
298
|
+
self.cancel_query(job_id)
|
|
299
|
+
raise DremioAPIException(
|
|
300
|
+
f"Query execution timed out after {timeout} seconds"
|
|
301
|
+
) from None
|
|
302
|
+
except RuntimeError as e:
|
|
303
|
+
raise DremioAPIException() from e
|
|
277
304
|
|
|
278
305
|
except requests.RequestException as e:
|
|
279
306
|
raise DremioAPIException("Error executing query") from e
|
|
@@ -603,10 +630,25 @@ class DremioAPIOperations:
|
|
|
603
630
|
return parents_list
|
|
604
631
|
|
|
605
632
|
def extract_all_queries(self) -> List[Dict[str, Any]]:
|
|
633
|
+
# Convert datetime objects to string format for SQL queries
|
|
634
|
+
start_timestamp_str = None
|
|
635
|
+
end_timestamp_str = None
|
|
636
|
+
|
|
637
|
+
if self.start_time:
|
|
638
|
+
start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
|
|
639
|
+
if self.end_time:
|
|
640
|
+
end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
|
|
641
|
+
|
|
606
642
|
if self.edition == DremioEdition.CLOUD:
|
|
607
|
-
jobs_query = DremioSQLQueries.
|
|
643
|
+
jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
|
|
644
|
+
start_timestamp_millis=start_timestamp_str,
|
|
645
|
+
end_timestamp_millis=end_timestamp_str,
|
|
646
|
+
)
|
|
608
647
|
else:
|
|
609
|
-
jobs_query = DremioSQLQueries.
|
|
648
|
+
jobs_query = DremioSQLQueries.get_query_all_jobs(
|
|
649
|
+
start_timestamp_millis=start_timestamp_str,
|
|
650
|
+
end_timestamp_millis=end_timestamp_str,
|
|
651
|
+
)
|
|
610
652
|
|
|
611
653
|
return self.execute_query(query=jobs_query)
|
|
612
654
|
|
|
@@ -685,6 +727,27 @@ class DremioAPIOperations:
|
|
|
685
727
|
|
|
686
728
|
return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)
|
|
687
729
|
|
|
730
|
+
def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
|
|
731
|
+
"""
|
|
732
|
+
Check if a container path could potentially match a schema pattern.
|
|
733
|
+
This handles hierarchical path matching for container filtering.
|
|
734
|
+
"""
|
|
735
|
+
if pattern == ".*":
|
|
736
|
+
return True
|
|
737
|
+
|
|
738
|
+
current_path = ".".join(path_components)
|
|
739
|
+
|
|
740
|
+
# Handle simple .* patterns (like "a.b.c.*")
|
|
741
|
+
if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
|
|
742
|
+
# Simple dotstar pattern - check prefix matching
|
|
743
|
+
pattern_prefix = pattern[:-2] # Remove ".*"
|
|
744
|
+
return current_path.lower().startswith(
|
|
745
|
+
pattern_prefix.lower()
|
|
746
|
+
) or pattern_prefix.lower().startswith(current_path.lower())
|
|
747
|
+
else:
|
|
748
|
+
# Complex regex pattern - use existing regex matching logic
|
|
749
|
+
return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
|
|
750
|
+
|
|
688
751
|
def should_include_container(self, path: List[str], name: str) -> bool:
|
|
689
752
|
"""
|
|
690
753
|
Helper method to check if a container should be included based on schema patterns.
|
|
@@ -711,41 +774,8 @@ class DremioAPIOperations:
|
|
|
711
774
|
|
|
712
775
|
# Check allow patterns
|
|
713
776
|
for pattern in self.allow_schema_pattern:
|
|
714
|
-
#
|
|
715
|
-
if
|
|
716
|
-
pattern_parts = pattern.split(".")
|
|
717
|
-
path_parts = path_components
|
|
718
|
-
|
|
719
|
-
# If pattern has exact same number of parts, check each component
|
|
720
|
-
if len(pattern_parts) == len(path_parts):
|
|
721
|
-
matches = True
|
|
722
|
-
for p_part, c_part in zip(pattern_parts, path_parts):
|
|
723
|
-
if p_part != "*" and p_part.lower() != c_part.lower():
|
|
724
|
-
matches = False
|
|
725
|
-
break
|
|
726
|
-
if matches:
|
|
727
|
-
self.report.report_container_scanned(full_path)
|
|
728
|
-
return True
|
|
729
|
-
# Otherwise check if current path is prefix match
|
|
730
|
-
else:
|
|
731
|
-
# Remove the trailing wildcard if present
|
|
732
|
-
if pattern_parts[-1] == "*":
|
|
733
|
-
pattern_parts = pattern_parts[:-1]
|
|
734
|
-
|
|
735
|
-
for i in range(len(path_parts)):
|
|
736
|
-
current_path = ".".join(path_parts[: i + 1])
|
|
737
|
-
pattern_prefix = ".".join(pattern_parts[: i + 1])
|
|
738
|
-
|
|
739
|
-
if pattern_prefix.startswith(current_path):
|
|
740
|
-
self.report.report_container_scanned(full_path)
|
|
741
|
-
return True
|
|
742
|
-
|
|
743
|
-
# Direct pattern matching
|
|
744
|
-
if self._check_pattern_match(
|
|
745
|
-
pattern=pattern,
|
|
746
|
-
paths=[full_path],
|
|
747
|
-
allow_prefix=True,
|
|
748
|
-
):
|
|
777
|
+
# Check if current path could potentially match this pattern
|
|
778
|
+
if self._could_match_pattern(pattern, path_components):
|
|
749
779
|
self.report.report_container_scanned(full_path)
|
|
750
780
|
return True
|
|
751
781
|
|
|
@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
|
|
|
9
9
|
EnvConfigMixin,
|
|
10
10
|
PlatformInstanceConfigMixin,
|
|
11
11
|
)
|
|
12
|
+
from datahub.configuration.time_window_config import BaseTimeWindowConfig
|
|
12
13
|
from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
|
|
13
14
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
14
15
|
StatefulStaleMetadataRemovalConfig,
|
|
@@ -118,6 +119,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
|
|
|
118
119
|
class DremioSourceConfig(
|
|
119
120
|
DremioConnectionConfig,
|
|
120
121
|
StatefulIngestionConfigBase,
|
|
122
|
+
BaseTimeWindowConfig,
|
|
121
123
|
EnvConfigMixin,
|
|
122
124
|
PlatformInstanceConfigMixin,
|
|
123
125
|
):
|
|
@@ -1,22 +1,43 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
2
|
from datetime import datetime
|
|
3
|
+
from typing import Optional
|
|
3
4
|
|
|
4
5
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
5
6
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
6
7
|
StaleEntityRemovalSourceReport,
|
|
7
8
|
)
|
|
8
9
|
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
|
|
10
|
+
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
|
|
11
|
+
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
|
|
12
|
+
from datahub.utilities.stats_collections import (
|
|
13
|
+
TopKDict,
|
|
14
|
+
float_top_k_dict,
|
|
15
|
+
int_top_k_dict,
|
|
16
|
+
)
|
|
9
17
|
|
|
10
18
|
|
|
11
19
|
@dataclass
|
|
12
20
|
class DremioSourceReport(
|
|
13
|
-
SQLSourceReport,
|
|
21
|
+
SQLSourceReport,
|
|
22
|
+
StaleEntityRemovalSourceReport,
|
|
23
|
+
IngestionStageReport,
|
|
24
|
+
BaseTimeWindowReport,
|
|
14
25
|
):
|
|
15
26
|
num_containers_failed: int = 0
|
|
16
27
|
num_datasets_failed: int = 0
|
|
17
28
|
containers_scanned: int = 0
|
|
18
29
|
containers_filtered: int = 0
|
|
19
30
|
|
|
31
|
+
api_calls_total: int = 0
|
|
32
|
+
api_calls_by_method_and_path: TopKDict[str, int] = field(
|
|
33
|
+
default_factory=int_top_k_dict
|
|
34
|
+
)
|
|
35
|
+
api_call_secs_by_method_and_path: TopKDict[str, float] = field(
|
|
36
|
+
default_factory=float_top_k_dict
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
sql_aggregator: Optional[SqlAggregatorReport] = None
|
|
40
|
+
|
|
20
41
|
def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
|
|
21
42
|
# recording total combined latency is not very useful, keeping this method as a placeholder
|
|
22
43
|
# for future implementation of min / max / percentiles etc.
|