acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.4rc3__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

acryl_datahub-1.1.0.4rc2.dist-info/RECORD → acryl_datahub-1.1.0.4rc3.dist-info/RECORD CHANGED
@@ -1,7 +1,7 @@
-acryl_datahub-1.1.0.4rc2.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+acryl_datahub-1.1.0.4rc3.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=M8rnTXUKlrf1evY-7XYua75EUQ0Yuv4TaMLJrEwD1g4,323
+datahub/_version.py,sha256=OF251LJLh7moYp7lXruZj0uH4nIUOIgEh8RcvTFCPqU,323
 datahub/entrypoints.py,sha256=H-YFTvxTJOgpWsFBVlxyb1opjkq-hjTzNmjy5Fq3RHg,8992
 datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -132,7 +132,7 @@ datahub/emitter/mcp_builder.py,sha256=8IwJAlolQkPpMqQJPLtGrsUqAcuFNs98nrI5iYUxga
 datahub/emitter/mcp_patch_builder.py,sha256=u7cpW6DkiN7KpLapmMaXgL_FneoN69boxiANbVgMdSI,4564
 datahub/emitter/request_helper.py,sha256=2Sij9VJqgA7xZI6I7IuxsA8ioakbz0FJ3gvazxU_z3M,5738
 datahub/emitter/response_helper.py,sha256=qGm45n43CepW7j6kP9wTXuP-U-SZnn7hQdJTdVaoqhQ,7504
-datahub/emitter/rest_emitter.py,sha256=ctPrtQ1S9wsy_lqE9LopP5pvmLn83Mu5R1mfSf9umdY,37467
+datahub/emitter/rest_emitter.py,sha256=WrL-ldOJf2LoKv_5behyffsB6vVXjkT8xTdWMtpExtE,38101
 datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
 datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
 datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -207,7 +207,7 @@ datahub/ingestion/source/demo_data.py,sha256=PbtCHlZx3wrKlOPPgkWhDQuPm7ZfIx2neXJ
 datahub/ingestion/source/elastic_search.py,sha256=2dwIcSbYMaq_RoSnxLGz4Q_20oJ8AGgMKunVIBIgYM8,23406
 datahub/ingestion/source/feast.py,sha256=rAqT7huVgi4c7iRU9qSbohPbNRrxZVw4PIvnfxNsiUk,18798
 datahub/ingestion/source/file.py,sha256=sHCWbtrQcXMMYPs_LUqofx0mk6IFN0G7Lyk9b0yRZMI,16082
-datahub/ingestion/source/ge_data_profiler.py,sha256=Y_sdKK4Ot6MOpSKNfkkCJhiL7hqcjpU0hcDqXpfcNA0,66162
+datahub/ingestion/source/ge_data_profiler.py,sha256=dvwTLK95xx1vuLPzigredqXiv0nyZVKas1dP7zcy3jU,67807
 datahub/ingestion/source/ge_profiling_config.py,sha256=sG_0BwPDRG3I4PnhfWGHf9AbePLDWG0kKcKEtlXHTuk,11544
 datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
 datahub/ingestion/source/ldap.py,sha256=PKoA5pVjuIxFfW1TcbYNIWSm7-C7shK2FDn7Zo5mrVM,18705
@@ -252,7 +252,7 @@ datahub/ingestion/source/azure/abs_folder_utils.py,sha256=7skXus-4fSIoKpqCeU-GG0
 datahub/ingestion/source/azure/abs_utils.py,sha256=KdAlCK-PMrn35kFHxz5vrsjajyx2PD5GRgoBKdoRvcg,2075
 datahub/ingestion/source/azure/azure_common.py,sha256=Zl0pPuE6L3QcM5B1P0LsPthZmD0h7fUUS0kg2okl6IY,4053
 datahub/ingestion/source/bigquery_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=nv_lHwcX07vRrrGamVIpqcAtqJ1tKscq6XVC4vwsRAk,13943
+datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=dslbjihZMg9Utt8V8DYIucqQfychl_MB-gaDTmsMqe0,15005
 datahub/ingestion/source/bigquery_v2/bigquery_audit.py,sha256=kEwWhq3ch6WT4q4hcX8-fvQh28KgrNfspFwIytO3vQA,25103
 datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py,sha256=LuGJ6LgPViLIfDQfylxlQ3CA7fZYM5MDt8M-7sfzm84,5096
 datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=bG1soRawCLhJw_15L2fJmFfj1kntTthV6ng4LZOnwko,21916
@@ -308,15 +308,15 @@ datahub/ingestion/source/delta_lake/delta_lake_utils.py,sha256=VqIDPEXepOnlk4oWM
 datahub/ingestion/source/delta_lake/report.py,sha256=uR4e4QA_jv8lL3CV-wE5t43H8pUqrGmx_ItLqN9flPI,587
 datahub/ingestion/source/delta_lake/source.py,sha256=1OxdbH_KcC6WFbf78XueKphnmCcIGizUepQ-LQK_hbk,13968
 datahub/ingestion/source/dremio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/dremio/dremio_api.py,sha256=h4rjnRacggFXzIQVVsKFNgTUixUZh2gPHH4_7rSGx2g,33413
+datahub/ingestion/source/dremio/dremio_api.py,sha256=_xtiftEFWfu1uqbh_W8j99oqJc4wah_M_4ho6W_XdzM,35001
 datahub/ingestion/source/dremio/dremio_aspects.py,sha256=oWV2_mSpq3Bh42YJ1QVbAyp-Uihf2WIT6VsHGsGTgzk,18248
-datahub/ingestion/source/dremio/dremio_config.py,sha256=5SP66ewGYN0OnyWgpU33EZOmtICsclTtBX5DSYLwl3c,5782
+datahub/ingestion/source/dremio/dremio_config.py,sha256=xugXSYoqXuMo9q5LTjSWCx2P376fGxIl7Nc2cI-K_OQ,5882
 datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py,sha256=MQk8BAHLufN69CntFfOV8K59A_AvLC-vwMS33Jw8bBg,3069
 datahub/ingestion/source/dremio/dremio_entities.py,sha256=1gZrNqTp3Pm6vqGDQaWt3HkxEuHKxpGYQ4geVoFvxWI,15147
 datahub/ingestion/source/dremio/dremio_profiling.py,sha256=TAcnpo8ZRKhLDHnQSJzJg3YdwTSyEa73LUAzENs7wG4,12287
-datahub/ingestion/source/dremio/dremio_reporting.py,sha256=BvdQA_T-VXl9EjOmj-D2NlM9pXZ4UdKXKGRpYsk1Eqw,1607
-datahub/ingestion/source/dremio/dremio_source.py,sha256=_52Z0ifntbhYNwlrMs6jZ59CI4aVpQzL0K16Sv7Xm8Y,24471
-datahub/ingestion/source/dremio/dremio_sql_queries.py,sha256=W0rcXawlwJOHNYr5o73rilMijtFOO3cVkn6pY-JLc6o,8186
+datahub/ingestion/source/dremio/dremio_reporting.py,sha256=YRKM6PvoJYHLBXmOGwkgou_8x8_oA2xaqTWWoVuwFMY,2247
+datahub/ingestion/source/dremio/dremio_source.py,sha256=baUW3f6Y7WWbHXo9GqmBzZqXilMo1MbG3hvDS-bwthI,25164
+datahub/ingestion/source/dremio/dremio_sql_queries.py,sha256=wA1hqKk9cKMJDyEdZRQcDDLZPGYwuNqrvleUHTkWgrQ,10508
 datahub/ingestion/source/dynamodb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/dynamodb/data_reader.py,sha256=vC77KpcP8LJN0g8wsPRDVw4sebv0ZWIP3tJkEIHaomA,3120
 datahub/ingestion/source/dynamodb/dynamodb.py,sha256=vM3Ia5rZidqOcdPPigpuo6-7Ipoof8eF3RwxJ3SX2Ck,22771
@@ -428,7 +428,7 @@ datahub/ingestion/source/redshift/lineage.py,sha256=IPF8vHy2MFyhK-hu2-lxV2-kcnNA
 datahub/ingestion/source/redshift/lineage_v2.py,sha256=dbTvuaJBV5yvCWM_oEAqZIA1JOlGxLJOexbEB47A_xE,17962
 datahub/ingestion/source/redshift/profile.py,sha256=H1Xtc2rXScUv4w0b2BbM7POjYEwqIql_rpWvlumY_EM,4309
 datahub/ingestion/source/redshift/query.py,sha256=vVIuNUaU4a7AfMFJZlgLuqi0cGVl0gVz8xZUSnPhWvs,47845
-datahub/ingestion/source/redshift/redshift.py,sha256=bM9pow8J6oX9jlTh029xsWGFDQ61lyXHdSz3Av9Et0M,43621
+datahub/ingestion/source/redshift/redshift.py,sha256=p6rOOCjxNnPpTn-vFjgISMMjtUTzu6K-OrfWOIaIuJI,44683
 datahub/ingestion/source/redshift/redshift_data_reader.py,sha256=zc69jwXHdF-w8J4Hq-ZQ6BjHQ75Ij2iNDMpoRJlcmlU,1724
 datahub/ingestion/source/redshift/redshift_schema.py,sha256=7F-l_omOuKMuGE_rBWXVPG_GWXFKnCMzC4frNxZB9cs,24800
 datahub/ingestion/source/redshift/report.py,sha256=O3QFozHlmMbH9b7KxbqhgTgr_0tCryj6FIzMiN6kRxw,3044
@@ -548,8 +548,8 @@ datahub/ingestion/source/unity/source.py,sha256=uJBjgZ7qhJpn25t0ZOcLuZ0vn2Uz4n9A
 datahub/ingestion/source/unity/tag_entities.py,sha256=iWl6nRAWSye1hoFDx_Xh4aT53PN0sGzlX7n1-oTVUv8,11568
 datahub/ingestion/source/unity/usage.py,sha256=0wETBAaZvHI_EGgBlxX3bKsVHEAdnUV8_bKI_lbyWjY,11500
 datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/usage/clickhouse_usage.py,sha256=6HtLuDjJ7__dLJmV-RwNKmdDh3Pns_nItizoulsvJPM,10161
-datahub/ingestion/source/usage/starburst_trino_usage.py,sha256=K412PkoPU3elilOP7iYby2NYfJxakEvryj78nKDI_IA,10681
+datahub/ingestion/source/usage/clickhouse_usage.py,sha256=M6YVQqwJoFqJPxlTr62lFwxfDeX2-_9Diw6qtcq2XWM,10244
+datahub/ingestion/source/usage/starburst_trino_usage.py,sha256=EnxKQ6IMt0o3VLvqfFJAE-mYMnLponnKGZEsVeGet1c,10802
 datahub/ingestion/source/usage/usage_common.py,sha256=uuCgIduhlRL2zIAN8rymZ5cZn1WF6akZ-ZbbaVYo9_w,9813
 datahub/ingestion/source/vertexai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/vertexai/vertexai.py,sha256=RuHda0mbc1DElYZIZ_W_hvkN7Eg4LIvI1fRFMvpHPB0,56012
@@ -1025,7 +1025,7 @@ datahub/utilities/sql_formatter.py,sha256=tYXIsKjKmpKh0JXGxeAPrHkUWYd1SwJNLjUZsf
 datahub/utilities/sqlalchemy_query_combiner.py,sha256=oxW20uXz8hV1Zb4fLXvTQ7c3LjACBsrF58TR2_-RSps,14982
 datahub/utilities/sqlalchemy_type_converter.py,sha256=H4S4xnnyPozDBHFhBh4rjjoXa5novFzYIUBJy2KSrVc,9805
 datahub/utilities/sqllineage_patch.py,sha256=0Buh50bmEqJFg1HFRCknCnePo1cecI4JmGxVhM_jh2g,1976
-datahub/utilities/stats_collections.py,sha256=CxaTcrF7J6am7iX5jPhFKne535UcyDk_oreVwR013fU,1625
+datahub/utilities/stats_collections.py,sha256=9QDEk40UxhmQwDS6I63Gp6fcIBqmXVinKl7x2xHCD34,1702
 datahub/utilities/str_enum.py,sha256=EsqCLPbrqyQ2YU_wt7QP-a6P5fnpIshXJ3AI8gLBlVA,474
 datahub/utilities/tee_io.py,sha256=jBrsUfTPTk9IICntfGOG0HR-Fjp8BQMde-FPQ4r3kuI,601
 datahub/utilities/threaded_iterator_executor.py,sha256=6BpCE0os3d-uMYxHBilPQC-JvEBkU6JQY4bGs06JKYI,2004
@@ -1075,8 +1075,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.1.0.4rc2.dist-info/METADATA,sha256=5mGKtVP2MPiCtTWr84Dn6JQYyPYttuv8wAlpbeIfx28,182347
-acryl_datahub-1.1.0.4rc2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-acryl_datahub-1.1.0.4rc2.dist-info/entry_points.txt,sha256=-N2PGtn1uwKR7-VM9spziE_RNyOdKm_XNpOWL1lnaj4,9790
-acryl_datahub-1.1.0.4rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
-acryl_datahub-1.1.0.4rc2.dist-info/RECORD,,
+acryl_datahub-1.1.0.4rc3.dist-info/METADATA,sha256=2NyzaDzCIUEF2g5CM-vfMVqvzJvKXnWdJxawDLVJ_7c,182347
+acryl_datahub-1.1.0.4rc3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+acryl_datahub-1.1.0.4rc3.dist-info/entry_points.txt,sha256=-N2PGtn1uwKR7-VM9spziE_RNyOdKm_XNpOWL1lnaj4,9790
+acryl_datahub-1.1.0.4rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.1.0.4rc3.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "1.1.0.4rc2"
+__version__ = "1.1.0.4rc3"
 
 
 def is_dev_mode() -> bool:
datahub/emitter/rest_emitter.py CHANGED
@@ -4,6 +4,7 @@ import functools
 import json
 import logging
 import os
+import re
 import time
 from collections import defaultdict
 from dataclasses import dataclass
@@ -104,6 +105,22 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )
 
 
+def preserve_unicode_escapes(obj: Any) -> Any:
+    """Recursively convert unicode characters back to escape sequences"""
+    if isinstance(obj, dict):
+        return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [preserve_unicode_escapes(item) for item in obj]
+    elif isinstance(obj, str):
+        # Convert non-ASCII characters back to \u escapes
+        def escape_unicode(match: Any) -> Any:
+            return f"\\u{ord(match.group(0)):04x}"
+
+        return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
+    else:
+        return obj
+
+
 class EmitMode(ConfigEnum):
     # Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
     # Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
@@ -611,7 +628,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         else:
             url = f"{self._gms_server}/aspects?action=ingestProposal"
 
-        mcp_obj = pre_json_transform(mcp.to_obj())
+        mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
         payload_dict = {
             "proposal": mcp_obj,
             "async": "true"
datahub/ingestion/source/bigquery_v2/bigquery.py CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -242,7 +243,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]
 
+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
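
For context, the warning above fires only when a non-default schema_pattern is combined with match_fully_qualified_names explicitly set to False. A sketch of such a configuration, using hypothetical project and dataset names:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical values mirroring the condition in _warn_deprecated_configs():
# schema_pattern differs from allow_all() while the legacy
# match_fully_qualified_names flag is explicitly False.
schema_pattern = AllowDenyPattern(allow=["my_project\\.my_dataset"])
match_fully_qualified_names = False
assert schema_pattern != AllowDenyPattern.allow_all()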
datahub/ingestion/source/dremio/dremio_api.py CHANGED
@@ -21,6 +21,7 @@ from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
 )
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
 from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
+from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
 
@@ -54,6 +55,8 @@ class DremioAPIOperations:
         self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
         self._max_workers: int = connection_args.max_workers
         self.is_dremio_cloud = connection_args.is_dremio_cloud
+        self.start_time = connection_args.start_time
+        self.end_time = connection_args.end_time
         self.report = report
         self.session = requests.Session()
         if connection_args.is_dremio_cloud:
@@ -233,47 +236,71 @@
 
     def get(self, url: str) -> Dict:
         """execute a get request on dremio"""
-        response = self.session.get(
-            url=(self.base_url + url),
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        logger.debug(f"GET request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path["GET " + url] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.get(
+                url=(self.base_url + url),
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+        self.report.api_call_secs_by_method_and_path["GET " + url] += (
+            timer.elapsed_seconds()
+        )
+        # response.raise_for_status() # Enabling this line, makes integration tests to fail
+        return response.json()
 
     def post(self, url: str, data: str) -> Dict:
         """execute a get request on dremio"""
-        response = self.session.post(
-            url=(self.base_url + url),
-            data=data,
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        logger.debug(f"POST request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path["POST " + url] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.post(
+                url=(self.base_url + url),
+                data=data,
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+        self.report.api_call_secs_by_method_and_path["POST " + url] += (
+            timer.elapsed_seconds()
+        )
+        # response.raise_for_status() # Enabling this line, makes integration tests to fail
+        return response.json()
 
     def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
         """Execute SQL query with timeout and error handling"""
         try:
-            response = self.post(url="/sql", data=json.dumps({"sql": query}))
+            with PerfTimer() as timer:
+                logger.info(f"Executing query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))
 
-            if "errorMessage" in response:
-                self.report.failure(
-                    message="SQL Error", context=f"{response['errorMessage']}"
-                )
-                raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
+                if "errorMessage" in response:
+                    self.report.failure(
+                        message="SQL Error", context=f"{response['errorMessage']}"
+                    )
+                    raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
 
-            job_id = response["id"]
+                job_id = response["id"]
 
-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-                future = executor.submit(self.fetch_results, job_id)
-                try:
-                    return future.result(timeout=timeout)
-                except concurrent.futures.TimeoutError:
-                    self.cancel_query(job_id)
-                    raise DremioAPIException(
-                        f"Query execution timed out after {timeout} seconds"
-                    ) from None
-                except RuntimeError as e:
-                    raise DremioAPIException() from e
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(self.fetch_results, job_id)
+                    try:
+                        result = future.result(timeout=timeout)
+                        logger.info(
+                            f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
+                        )
+                        return result
+                    except concurrent.futures.TimeoutError:
+                        self.cancel_query(job_id)
+                        raise DremioAPIException(
+                            f"Query execution timed out after {timeout} seconds"
+                        ) from None
+                    except RuntimeError as e:
+                        raise DremioAPIException() from e
 
         except requests.RequestException as e:
             raise DremioAPIException("Error executing query") from e
@@ -603,10 +630,25 @@
         return parents_list
 
     def extract_all_queries(self) -> List[Dict[str, Any]]:
+        # Convert datetime objects to string format for SQL queries
+        start_timestamp_str = None
+        end_timestamp_str = None
+
+        if self.start_time:
+            start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        if self.end_time:
+            end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
         if self.edition == DremioEdition.CLOUD:
-            jobs_query = DremioSQLQueries.QUERY_ALL_JOBS_CLOUD
+            jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
         else:
-            jobs_query = DremioSQLQueries.QUERY_ALL_JOBS
+            jobs_query = DremioSQLQueries.get_query_all_jobs(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
 
         return self.execute_query(query=jobs_query)
 
@@ -685,6 +727,27 @@
 
         return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)
 
+    def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
+        """
+        Check if a container path could potentially match a schema pattern.
+        This handles hierarchical path matching for container filtering.
+        """
+        if pattern == ".*":
+            return True
+
+        current_path = ".".join(path_components)
+
+        # Handle simple .* patterns (like "a.b.c.*")
+        if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
+            # Simple dotstar pattern - check prefix matching
+            pattern_prefix = pattern[:-2]  # Remove ".*"
+            return current_path.lower().startswith(
+                pattern_prefix.lower()
+            ) or pattern_prefix.lower().startswith(current_path.lower())
+        else:
+            # Complex regex pattern - use existing regex matching logic
+            return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
+
     def should_include_container(self, path: List[str], name: str) -> bool:
         """
         Helper method to check if a container should be included based on schema patterns.
@@ -711,41 +774,8 @@
 
         # Check allow patterns
         for pattern in self.allow_schema_pattern:
-            # For patterns with wildcards, check if this path is a parent of the pattern
-            if "*" in pattern:
-                pattern_parts = pattern.split(".")
-                path_parts = path_components
-
-                # If pattern has exact same number of parts, check each component
-                if len(pattern_parts) == len(path_parts):
-                    matches = True
-                    for p_part, c_part in zip(pattern_parts, path_parts):
-                        if p_part != "*" and p_part.lower() != c_part.lower():
-                            matches = False
-                            break
-                    if matches:
-                        self.report.report_container_scanned(full_path)
-                        return True
-                # Otherwise check if current path is prefix match
-                else:
-                    # Remove the trailing wildcard if present
-                    if pattern_parts[-1] == "*":
-                        pattern_parts = pattern_parts[:-1]
-
-                    for i in range(len(path_parts)):
-                        current_path = ".".join(path_parts[: i + 1])
-                        pattern_prefix = ".".join(pattern_parts[: i + 1])
-
-                        if pattern_prefix.startswith(current_path):
-                            self.report.report_container_scanned(full_path)
-                            return True
-
-            # Direct pattern matching
-            if self._check_pattern_match(
-                pattern=pattern,
-                paths=[full_path],
-                allow_prefix=True,
-            ):
+            # Check if current path could potentially match this pattern
+            if self._could_match_pattern(pattern, path_components):
                 self.report.report_container_scanned(full_path)
                 return True
 
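The timestamp strings passed to the new query builders keep millisecond precision: strftime("%f") yields six microsecond digits and the [:-3] slice drops the last three. A standalone sketch with a hypothetical datetime:

from datetime import datetime

# Hypothetical window start; "%f" emits microseconds (678901),
# and [:-3] truncates to milliseconds (678).
start = datetime(2024, 5, 1, 12, 30, 45, 678901)
print(start.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])  # 2024-05-01 12:30:45.678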
datahub/ingestion/source/dremio/dremio_config.py CHANGED
@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
@@ -118,6 +119,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
 class DremioSourceConfig(
     DremioConnectionConfig,
     StatefulIngestionConfigBase,
+    BaseTimeWindowConfig,
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 ):
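
Mixing BaseTimeWindowConfig into DremioSourceConfig is what provides the start_time and end_time fields that DremioAPIOperations now reads from connection_args (see the dremio_api.py hunks above). A hedged recipe sketch with hypothetical values, assuming the standard time-window field names:

# Hypothetical ingestion recipe fragment: the time window bounds the
# jobs/queries extraction performed by extract_all_queries().
source = {
    "type": "dremio",
    "config": {
        "start_time": "2024-05-01T00:00:00Z",
        "end_time": "2024-05-02T00:00:00Z",
    },
}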
datahub/ingestion/source/dremio/dremio_reporting.py CHANGED
@@ -1,22 +1,43 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Optional
 
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.stats_collections import (
+    TopKDict,
+    float_top_k_dict,
+    int_top_k_dict,
+)
 
 
 @dataclass
 class DremioSourceReport(
-    SQLSourceReport, StaleEntityRemovalSourceReport, IngestionStageReport
+    SQLSourceReport,
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+    BaseTimeWindowReport,
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
     containers_scanned: int = 0
     containers_filtered: int = 0
 
+    api_calls_total: int = 0
+    api_calls_by_method_and_path: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    api_call_secs_by_method_and_path: TopKDict[str, float] = field(
+        default_factory=float_top_k_dict
+    )
+
+    sql_aggregator: Optional[SqlAggregatorReport] = None
+
     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
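
The two new mappings are fed by get() and post() in dremio_api.py above, keyed as "METHOD path". A small sketch of how they accumulate; the endpoint and timing values are hypothetical, and the top-k factories are assumed to behave as bounded defaultdict-style counters, as their use in the diff suggests:

from datahub.utilities.stats_collections import float_top_k_dict, int_top_k_dict

api_calls_by_method_and_path = int_top_k_dict()
api_call_secs_by_method_and_path = float_top_k_dict()

# One instrumented GET, mirroring DremioAPIOperations.get():
api_calls_by_method_and_path["GET /catalog"] += 1          # hypothetical endpoint
api_call_secs_by_method_and_path["GET /catalog"] += 0.042  # hypothetical seconds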