acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (263)
  1. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2550 -2543
  2. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +263 -261
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/graph/client.py +5 -1
  39. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  40. datahub/ingestion/reporting/file_reporter.py +5 -4
  41. datahub/ingestion/run/pipeline.py +7 -6
  42. datahub/ingestion/run/pipeline_config.py +12 -14
  43. datahub/ingestion/run/sink_callback.py +1 -1
  44. datahub/ingestion/sink/datahub_rest.py +6 -4
  45. datahub/ingestion/source/abs/config.py +19 -19
  46. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  47. datahub/ingestion/source/abs/source.py +2 -2
  48. datahub/ingestion/source/aws/aws_common.py +1 -1
  49. datahub/ingestion/source/aws/glue.py +6 -4
  50. datahub/ingestion/source/aws/sagemaker.py +1 -1
  51. datahub/ingestion/source/azure/azure_common.py +8 -12
  52. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  53. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  54. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  55. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  56. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  57. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  58. datahub/ingestion/source/datahub/config.py +8 -8
  59. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  60. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  61. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  62. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  63. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  64. datahub/ingestion/source/delta_lake/config.py +6 -4
  65. datahub/ingestion/source/dremio/dremio_api.py +212 -78
  66. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  67. datahub/ingestion/source/dremio/dremio_entities.py +55 -39
  68. datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
  69. datahub/ingestion/source/dremio/dremio_source.py +24 -26
  70. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  71. datahub/ingestion/source/elastic_search.py +110 -32
  72. datahub/ingestion/source/excel/source.py +1 -1
  73. datahub/ingestion/source/feast.py +1 -1
  74. datahub/ingestion/source/file.py +5 -4
  75. datahub/ingestion/source/fivetran/config.py +17 -16
  76. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  77. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  78. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  79. datahub/ingestion/source/ge_profiling_config.py +8 -5
  80. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  81. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  82. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  83. datahub/ingestion/source/grafana/models.py +23 -5
  84. datahub/ingestion/source/hex/api.py +7 -5
  85. datahub/ingestion/source/hex/hex.py +4 -3
  86. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  87. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +10 -10
  90. datahub/ingestion/source/kafka/kafka.py +1 -1
  91. datahub/ingestion/source/ldap.py +1 -1
  92. datahub/ingestion/source/looker/looker_common.py +7 -5
  93. datahub/ingestion/source/looker/looker_config.py +21 -20
  94. datahub/ingestion/source/looker/lookml_config.py +47 -47
  95. datahub/ingestion/source/metabase.py +8 -8
  96. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  97. datahub/ingestion/source/metadata/lineage.py +13 -8
  98. datahub/ingestion/source/mlflow.py +1 -1
  99. datahub/ingestion/source/mode.py +6 -4
  100. datahub/ingestion/source/mongodb.py +4 -3
  101. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  102. datahub/ingestion/source/nifi.py +17 -23
  103. datahub/ingestion/source/openapi.py +6 -8
  104. datahub/ingestion/source/powerbi/config.py +33 -32
  105. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  106. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  108. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  109. datahub/ingestion/source/preset.py +8 -8
  110. datahub/ingestion/source/pulsar.py +1 -1
  111. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  112. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  113. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  114. datahub/ingestion/source/redshift/config.py +18 -20
  115. datahub/ingestion/source/redshift/redshift.py +2 -2
  116. datahub/ingestion/source/redshift/usage.py +23 -3
  117. datahub/ingestion/source/s3/config.py +83 -62
  118. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  119. datahub/ingestion/source/s3/source.py +8 -5
  120. datahub/ingestion/source/sac/sac.py +5 -4
  121. datahub/ingestion/source/salesforce.py +3 -2
  122. datahub/ingestion/source/schema/json_schema.py +2 -2
  123. datahub/ingestion/source/sigma/data_classes.py +3 -2
  124. datahub/ingestion/source/sigma/sigma.py +1 -1
  125. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  126. datahub/ingestion/source/slack/slack.py +1 -1
  127. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  128. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  129. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  130. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  131. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  132. datahub/ingestion/source/snowflake/snowflake_queries.py +28 -4
  133. datahub/ingestion/source/sql/athena.py +1 -1
  134. datahub/ingestion/source/sql/clickhouse.py +4 -2
  135. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  136. datahub/ingestion/source/sql/druid.py +1 -1
  137. datahub/ingestion/source/sql/hana.py +1 -1
  138. datahub/ingestion/source/sql/hive.py +7 -5
  139. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  140. datahub/ingestion/source/sql/mssql/source.py +13 -6
  141. datahub/ingestion/source/sql/mysql.py +1 -1
  142. datahub/ingestion/source/sql/oracle.py +17 -10
  143. datahub/ingestion/source/sql/postgres.py +2 -2
  144. datahub/ingestion/source/sql/presto.py +1 -1
  145. datahub/ingestion/source/sql/sql_config.py +8 -9
  146. datahub/ingestion/source/sql/sql_generic.py +1 -1
  147. datahub/ingestion/source/sql/teradata.py +1 -1
  148. datahub/ingestion/source/sql/trino.py +1 -1
  149. datahub/ingestion/source/sql/vertica.py +5 -4
  150. datahub/ingestion/source/sql_queries.py +174 -22
  151. datahub/ingestion/source/state/checkpoint.py +2 -2
  152. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  153. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  154. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  155. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  156. datahub/ingestion/source/superset.py +9 -9
  157. datahub/ingestion/source/tableau/tableau.py +14 -16
  158. datahub/ingestion/source/unity/azure_auth_config.py +15 -0
  159. datahub/ingestion/source/unity/config.py +51 -34
  160. datahub/ingestion/source/unity/connection.py +7 -1
  161. datahub/ingestion/source/unity/connection_test.py +1 -1
  162. datahub/ingestion/source/unity/proxy.py +216 -7
  163. datahub/ingestion/source/unity/proxy_types.py +91 -0
  164. datahub/ingestion/source/unity/source.py +29 -3
  165. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  166. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  167. datahub/ingestion/source/usage/usage_common.py +5 -3
  168. datahub/ingestion/source_config/csv_enricher.py +7 -6
  169. datahub/ingestion/source_config/operation_config.py +7 -4
  170. datahub/ingestion/source_config/pulsar.py +11 -15
  171. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  172. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  173. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  174. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  175. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  176. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  177. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  178. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  179. datahub/ingestion/transformer/dataset_domain.py +3 -3
  180. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  181. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  182. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  183. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  184. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  185. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  186. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  187. datahub/ingestion/transformer/replace_external_url.py +2 -2
  188. datahub/ingestion/transformer/set_browse_path.py +1 -1
  189. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  190. datahub/lite/duckdb_lite.py +1 -1
  191. datahub/lite/lite_util.py +2 -2
  192. datahub/metadata/_internal_schema_classes.py +62 -2
  193. datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
  194. datahub/metadata/schema.avsc +271 -91
  195. datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
  196. datahub/metadata/schemas/AssertionInfo.avsc +48 -5
  197. datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
  198. datahub/metadata/schemas/ChartInfo.avsc +12 -5
  199. datahub/metadata/schemas/ContainerProperties.avsc +12 -5
  200. datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  201. datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
  202. datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
  203. datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
  204. datahub/metadata/schemas/DashboardInfo.avsc +16 -4
  205. datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
  206. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
  207. datahub/metadata/schemas/DataJobInfo.avsc +9 -4
  208. datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
  209. datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  210. datahub/metadata/schemas/DataProductProperties.avsc +5 -2
  211. datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
  212. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  213. datahub/metadata/schemas/DatasetProperties.avsc +12 -5
  214. datahub/metadata/schemas/DomainProperties.avsc +7 -3
  215. datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
  216. datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  217. datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  218. datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  219. datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  220. datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  221. datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  222. datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  223. datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  224. datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  225. datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  226. datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
  227. datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
  228. datahub/metadata/schemas/GlobalTags.avsc +3 -2
  229. datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  230. datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  231. datahub/metadata/schemas/InputFields.avsc +3 -2
  232. datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
  233. datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
  234. datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
  235. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  236. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  237. datahub/metadata/schemas/MLModelProperties.avsc +4 -2
  238. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
  239. datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
  240. datahub/metadata/schemas/NotebookInfo.avsc +5 -2
  241. datahub/metadata/schemas/Ownership.avsc +3 -2
  242. datahub/metadata/schemas/QuerySubjects.avsc +1 -1
  243. datahub/metadata/schemas/RoleProperties.avsc +3 -1
  244. datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  245. datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
  246. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
  247. datahub/metadata/schemas/TagProperties.avsc +3 -1
  248. datahub/metadata/schemas/TestInfo.avsc +2 -1
  249. datahub/sdk/__init__.py +1 -0
  250. datahub/sdk/_all_entities.py +2 -0
  251. datahub/sdk/search_filters.py +68 -40
  252. datahub/sdk/tag.py +112 -0
  253. datahub/secret/datahub_secret_store.py +7 -4
  254. datahub/secret/file_secret_store.py +1 -1
  255. datahub/sql_parsing/schema_resolver.py +29 -0
  256. datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
  257. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  258. datahub/testing/check_sql_parser_result.py +2 -2
  259. datahub/utilities/ingest_utils.py +1 -1
  260. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
  261. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
  262. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
  263. {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,7 @@ from collections import defaultdict
 from enum import Enum
 from itertools import product
 from time import sleep, time
-from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union
 from urllib.parse import quote
 
 import requests
@@ -343,14 +343,149 @@ class DremioAPIOperations:
 
         while True:
             result = self.get_job_result(job_id, offset, limit)
-            rows.extend(result["rows"])
 
-            offset = offset + limit
-            if offset >= result["rowCount"]:
+            # Handle cases where API response doesn't contain 'rows' key
+            # This can happen with OOM errors or when no rows are returned
+            if "rows" not in result:
+                logger.warning(
+                    f"API response for job {job_id} missing 'rows' key. "
+                    f"Response keys: {list(result.keys())}"
+                )
+                # Check for error conditions
+                if "errorMessage" in result:
+                    raise DremioAPIException(f"Query error: {result['errorMessage']}")
+                elif "message" in result:
+                    logger.warning(
+                        f"Query warning for job {job_id}: {result['message']}"
+                    )
+                # Return empty list if no rows key and no error
+                break
+
+            # Handle empty rows response
+            result_rows = result["rows"]
+            if not result_rows:
+                logger.debug(
+                    f"No more rows returned for job {job_id} at offset {offset}"
+                )
                 break
 
+            rows.extend(result_rows)
+
+            # Check actual returned rows to determine if we should continue
+            actual_rows_returned = len(result_rows)
+            if actual_rows_returned == 0:
+                logger.debug(f"Query returned no rows for job {job_id}")
+                break
+
+            offset = offset + actual_rows_returned
+            # If we got fewer rows than requested, we've reached the end
+            if actual_rows_returned < limit:
+                break
+
+        logger.info(f"Fetched {len(rows)} total rows for job {job_id}")
         return rows
 
+    def _fetch_results_iter(self, job_id: str) -> Iterator[Dict]:
+        """
+        Fetch job results in a streaming fashion to reduce memory usage.
+        Yields individual rows instead of collecting all in memory.
+        """
+        limit = 500
+        offset = 0
+        total_rows_fetched = 0
+
+        while True:
+            result = self.get_job_result(job_id, offset, limit)
+
+            # Handle cases where API response doesn't contain 'rows' key
+            if "rows" not in result:
+                logger.warning(
+                    f"API response for job {job_id} missing 'rows' key. "
+                    f"Response keys: {list(result.keys())}"
+                )
+                # Check for error conditions
+                if "errorMessage" in result:
+                    raise DremioAPIException(f"Query error: {result['errorMessage']}")
+                elif "message" in result:
+                    logger.warning(
+                        f"Query warning for job {job_id}: {result['message']}"
+                    )
+                # Stop iteration if no rows key and no error
+                break
+
+            # Handle empty rows response
+            result_rows = result["rows"]
+            if not result_rows:
+                logger.debug(
+                    f"No more rows returned for job {job_id} at offset {offset}"
+                )
+                break
+
+            # Yield individual rows instead of collecting them
+            for row in result_rows:
+                yield row
+                total_rows_fetched += 1
+
+            # Check actual returned rows to determine if we should continue
+            actual_rows_returned = len(result_rows)
+            if actual_rows_returned == 0:
+                logger.debug(f"Query returned no rows for job {job_id}")
+                break
+
+            offset = offset + actual_rows_returned
+            # If we got fewer rows than requested, we've reached the end
+            if actual_rows_returned < limit:
+                break
+
+        logger.info(f"Streamed {total_rows_fetched} total rows for job {job_id}")
+
+    def execute_query_iter(
+        self, query: str, timeout: int = 3600
+    ) -> Iterator[Dict[str, Any]]:
+        """Execute SQL query and return results as a streaming iterator"""
+        try:
+            with PerfTimer() as timer:
+                logger.info(f"Executing streaming query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))
+
+                if "errorMessage" in response:
+                    self.report.failure(
+                        message="SQL Error", context=f"{response['errorMessage']}"
+                    )
+                    raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
+
+                job_id = response["id"]
+
+                # Wait for job completion
+                start_time = time()
+                while True:
+                    status = self.get_job_status(job_id)
+                    if status["jobState"] == "COMPLETED":
+                        break
+                    elif status["jobState"] == "FAILED":
+                        error_message = status.get("errorMessage", "Unknown error")
+                        raise RuntimeError(f"Query failed: {error_message}")
+                    elif status["jobState"] == "CANCELED":
+                        raise RuntimeError("Query was canceled")
+
+                    if time() - start_time > timeout:
+                        self.cancel_query(job_id)
+                        raise DremioAPIException(
+                            f"Query execution timed out after {timeout} seconds"
+                        )
+
+                    sleep(3)
+
+                logger.info(
+                    f"Query job completed in {timer.elapsed_seconds()} seconds, starting streaming"
+                )
+
+            # Return streaming iterator
+            return self._fetch_results_iter(job_id)
+
+        except requests.RequestException as e:
+            raise DremioAPIException("Error executing streaming query") from e
+
     def cancel_query(self, job_id: str) -> None:
         """Cancel a running query"""
         try:
@@ -499,8 +634,12 @@ class DremioAPIOperations:
         return f"AND {operator}({field}, '{pattern_str}')"
 
     def get_all_tables_and_columns(
-        self, containers: Deque["DremioContainer"]
-    ) -> List[Dict]:
+        self, containers: Iterator["DremioContainer"]
+    ) -> Iterator[Dict]:
+        """
+        Memory-efficient streaming version that yields tables one at a time.
+        Reduces memory usage for large datasets by processing results as they come.
+        """
         if self.edition == DremioEdition.ENTERPRISE:
             query_template = DremioSQLQueries.QUERY_DATASETS_EE
         elif self.edition == DremioEdition.CLOUD:
@@ -517,93 +656,85 @@ class DremioAPIOperations:
             self.deny_schema_pattern, schema_field, allow=False
         )
 
-        all_tables_and_columns = []
-
+        # Process each container's results separately to avoid memory buildup
         for schema in containers:
-            formatted_query = ""
             try:
                 formatted_query = query_template.format(
                     schema_pattern=schema_condition,
                     deny_schema_pattern=deny_schema_condition,
                     container_name=schema.container_name.lower(),
                 )
-                all_tables_and_columns.extend(
-                    self.execute_query(
-                        query=formatted_query,
-                    )
-                )
-            except DremioAPIException as e:
-                self.report.warning(
-                    message="Container has no tables or views",
-                    context=f"{schema.subclass} {schema.container_name}",
-                    exc=e,
-                )
 
-        tables = []
+                # Use streaming query execution
+                container_results = list(self.execute_query_iter(query=formatted_query))
 
-        if self.edition == DremioEdition.COMMUNITY:
-            tables = self.community_get_formatted_tables(all_tables_and_columns)
+                if self.edition == DremioEdition.COMMUNITY:
+                    # Process community edition results
+                    formatted_tables = self.community_get_formatted_tables(
+                        container_results
+                    )
+                    for table in formatted_tables:
+                        yield table
+                else:
+                    # Process enterprise/cloud edition results
+                    column_dictionary: Dict[str, List[Dict]] = defaultdict(list)
+                    table_metadata: Dict[str, Dict] = {}
 
-        else:
-            column_dictionary: Dict[str, List[Dict]] = defaultdict(list)
+                    for record in container_results:
+                        if not record.get("COLUMN_NAME"):
+                            continue
 
-            for record in all_tables_and_columns:
-                if not record.get("COLUMN_NAME"):
-                    continue
+                        table_full_path = record.get("FULL_TABLE_PATH")
+                        if not table_full_path:
+                            continue
 
-                table_full_path = record.get("FULL_TABLE_PATH")
-                if not table_full_path:
-                    continue
+                        # Store column information
+                        column_dictionary[table_full_path].append(
+                            {
+                                "name": record["COLUMN_NAME"],
+                                "ordinal_position": record["ORDINAL_POSITION"],
+                                "is_nullable": record["IS_NULLABLE"],
+                                "data_type": record["DATA_TYPE"],
+                                "column_size": record["COLUMN_SIZE"],
+                            }
+                        )
 
-                column_dictionary[table_full_path].append(
-                    {
-                        "name": record["COLUMN_NAME"],
-                        "ordinal_position": record["ORDINAL_POSITION"],
-                        "is_nullable": record["IS_NULLABLE"],
-                        "data_type": record["DATA_TYPE"],
-                        "column_size": record["COLUMN_SIZE"],
-                    }
-                )
+                        # Store table metadata (only once per table)
+                        if table_full_path not in table_metadata:
+                            table_metadata[table_full_path] = {
+                                "TABLE_NAME": record.get("TABLE_NAME"),
+                                "TABLE_SCHEMA": record.get("TABLE_SCHEMA"),
+                                "VIEW_DEFINITION": record.get("VIEW_DEFINITION"),
+                                "RESOURCE_ID": record.get("RESOURCE_ID"),
+                                "LOCATION_ID": record.get("LOCATION_ID"),
+                                "OWNER": record.get("OWNER"),
+                                "OWNER_TYPE": record.get("OWNER_TYPE"),
+                                "CREATED": record.get("CREATED"),
+                                "FORMAT_TYPE": record.get("FORMAT_TYPE"),
+                            }
 
-            distinct_tables_list = list(
-                {
-                    tuple(
-                        dictionary[key]
-                        for key in (
-                            "TABLE_SCHEMA",
-                            "TABLE_NAME",
-                            "FULL_TABLE_PATH",
-                            "VIEW_DEFINITION",
-                            "LOCATION_ID",
-                            "OWNER",
-                            "OWNER_TYPE",
-                            "CREATED",
-                            "FORMAT_TYPE",
-                        )
-                        if key in dictionary
-                    ): dictionary
-                    for dictionary in all_tables_and_columns
-                }.values()
-            )
+                    # Yield tables one at a time
+                    for table_path, table_info in table_metadata.items():
+                        yield {
+                            "TABLE_NAME": table_info.get("TABLE_NAME"),
+                            "TABLE_SCHEMA": table_info.get("TABLE_SCHEMA"),
+                            "COLUMNS": column_dictionary[table_path],
+                            "VIEW_DEFINITION": table_info.get("VIEW_DEFINITION"),
+                            "RESOURCE_ID": table_info.get("RESOURCE_ID"),
+                            "LOCATION_ID": table_info.get("LOCATION_ID"),
+                            "OWNER": table_info.get("OWNER"),
+                            "OWNER_TYPE": table_info.get("OWNER_TYPE"),
+                            "CREATED": table_info.get("CREATED"),
+                            "FORMAT_TYPE": table_info.get("FORMAT_TYPE"),
+                        }
 
-            for table in distinct_tables_list:
-                tables.append(
-                    {
-                        "TABLE_NAME": table.get("TABLE_NAME"),
-                        "TABLE_SCHEMA": table.get("TABLE_SCHEMA"),
-                        "COLUMNS": column_dictionary[table["FULL_TABLE_PATH"]],
-                        "VIEW_DEFINITION": table.get("VIEW_DEFINITION"),
-                        "RESOURCE_ID": table.get("RESOURCE_ID"),
-                        "LOCATION_ID": table.get("LOCATION_ID"),
-                        "OWNER": table.get("OWNER"),
-                        "OWNER_TYPE": table.get("OWNER_TYPE"),
-                        "CREATED": table.get("CREATED"),
-                        "FORMAT_TYPE": table.get("FORMAT_TYPE"),
-                    }
+            except DremioAPIException as e:
+                self.report.warning(
+                    message="Container has no tables or views",
+                    context=f"{schema.subclass} {schema.container_name}",
+                    exc=e,
                 )
 
-        return tables
-
     def validate_schema_format(self, schema):
         if "." in schema:
             schema_path = self.get(
@@ -640,7 +771,10 @@ class DremioAPIOperations:
 
         return parents_list
 
-    def extract_all_queries(self) -> List[Dict[str, Any]]:
+    def extract_all_queries(self) -> Iterator[Dict[str, Any]]:
+        """
+        Memory-efficient streaming version for extracting query results.
+        """
        # Convert datetime objects to string format for SQL queries
        start_timestamp_str = None
        end_timestamp_str = None
@@ -661,7 +795,7 @@ class DremioAPIOperations:
             end_timestamp_millis=end_timestamp_str,
         )
 
-        return self.execute_query(query=jobs_query)
+        return self.execute_query_iter(query=jobs_query)
 
     def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
         """
@@ -2,7 +2,7 @@ import os
 from typing import List, Literal, Optional
 
 import certifi
-from pydantic import Field, validator
+from pydantic import Field, ValidationInfo, field_validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
@@ -78,8 +78,9 @@ class DremioConnectionConfig(ConfigModel):
         description="ID of Dremio Cloud Project. Found in Project Settings in the Dremio Cloud UI",
     )
 
-    @validator("authentication_method")
-    def validate_auth_method(cls, value):
+    @field_validator("authentication_method", mode="after")
+    @classmethod
+    def validate_auth_method(cls, value: str) -> str:
         allowed_methods = ["password", "PAT"]
         if value not in allowed_methods:
             raise ValueError(
@@ -87,9 +88,12 @@ class DremioConnectionConfig(ConfigModel):
             )
         return value
 
-    @validator("password")
-    def validate_password(cls, value, values):
-        if values.get("authentication_method") == "PAT" and not value:
+    @field_validator("password", mode="after")
+    @classmethod
+    def validate_password(
+        cls, value: Optional[str], info: ValidationInfo
+    ) -> Optional[str]:
+        if info.data.get("authentication_method") == "PAT" and not value:
             raise ValueError(
                 "Password (Personal Access Token) is required when using PAT authentication",
             )
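
The DremioConnectionConfig hunks above migrate pydantic v1 @validator methods to the v2-style @field_validator API, where previously validated fields are read from ValidationInfo.data instead of the old values dict. A minimal standalone sketch of the same pattern, assuming pydantic v2 is installed (the class and field names are illustrative):

from typing import Optional

from pydantic import BaseModel, ValidationInfo, field_validator


class ExampleConnectionConfig(BaseModel):
    # Field order matters: authentication_method is validated before password,
    # so it is available in info.data inside the password validator.
    authentication_method: str = "password"
    password: Optional[str] = None

    @field_validator("password", mode="after")
    @classmethod
    def require_password_for_pat(
        cls, value: Optional[str], info: ValidationInfo
    ) -> Optional[str]:
        if info.data.get("authentication_method") == "PAT" and not value:
            raise ValueError("password is required when using PAT authentication")
        return value

With this sketch, ExampleConnectionConfig(authentication_method="PAT") raises a validation error, while ExampleConnectionConfig(authentication_method="PAT", password="token") validates.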
@@ -1,4 +1,3 @@
-import itertools
 import logging
 import re
 import uuid
@@ -6,7 +5,7 @@ from collections import deque
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
-from typing import Any, Deque, Dict, List, Optional
+from typing import Any, Deque, Dict, Iterator, List, Optional
 
 from sqlglot import parse_one
 
@@ -184,6 +183,7 @@ class DremioQuery:
         return ""
 
     def get_raw_query(self, sql_query: str) -> str:
+        """Remove comments from SQL query using sqlglot parser."""
        try:
            parsed = parse_one(sql_query)
            return parsed.sql(comments=False)
@@ -336,43 +336,26 @@ class DremioCatalog:
     def __init__(self, dremio_api: DremioAPIOperations):
         self.dremio_api = dremio_api
         self.edition = dremio_api.edition
-        self.datasets: Deque[DremioDataset] = deque()
         self.sources: Deque[DremioSourceContainer] = deque()
         self.spaces: Deque[DremioSpace] = deque()
         self.folders: Deque[DremioFolder] = deque()
-        self.glossary_terms: Deque[DremioGlossaryTerm] = deque()
         self.queries: Deque[DremioQuery] = deque()
 
-        self.datasets_populated = False
         self.containers_populated = False
         self.queries_populated = False
 
-    def set_datasets(self) -> None:
-        if not self.datasets_populated:
-            self.set_containers()
+    def get_datasets(self) -> Iterator[DremioDataset]:
+        """Get all Dremio datasets (tables and views) as an iterator."""
+        # Get containers directly without storing them
+        containers = self.get_containers()
 
-            containers: Deque[DremioContainer] = deque()
-            containers.extend(self.spaces)  # Add DremioSpace elements
-            containers.extend(self.sources)  # Add DremioSource elements
-
-            for dataset_details in self.dremio_api.get_all_tables_and_columns(
-                containers=containers
-            ):
-                dremio_dataset = DremioDataset(
-                    dataset_details=dataset_details,
-                    api_operations=self.dremio_api,
-                )
-                self.datasets.append(dremio_dataset)
-
-                for glossary_term in dremio_dataset.glossary_terms:
-                    if glossary_term not in self.glossary_terms:
-                        self.glossary_terms.append(glossary_term)
-
-            self.datasets_populated = True
+        for dataset_details in self.dremio_api.get_all_tables_and_columns(containers):
+            dremio_dataset = DremioDataset(
+                dataset_details=dataset_details,
+                api_operations=self.dremio_api,
+            )
 
-    def get_datasets(self) -> Deque[DremioDataset]:
-        self.set_datasets()
-        return self.datasets
+            yield dremio_dataset
 
     def set_containers(self) -> None:
         if not self.containers_populated:
@@ -423,18 +406,50 @@ class DremioCatalog:
 
         self.containers_populated = True
 
-    def get_containers(self) -> Deque:
-        self.set_containers()
-        return deque(itertools.chain(self.sources, self.spaces, self.folders))
+    def get_containers(self) -> Iterator[DremioContainer]:
+        """Get all containers (sources, spaces, folders) as an iterator."""
+        for container in self.dremio_api.get_all_containers():
+            container_type = container.get("container_type")
+            if container_type == DremioEntityContainerType.SOURCE:
+                yield DremioSourceContainer(
+                    container_name=container.get("name"),
+                    location_id=container.get("id"),
+                    path=[],
+                    api_operations=self.dremio_api,
+                    dremio_source_type=container.get("source_type") or "",
+                    root_path=container.get("root_path"),
+                    database_name=container.get("database_name"),
+                )
+            elif container_type == DremioEntityContainerType.SPACE:
+                yield DremioSpace(
+                    container_name=container.get("name"),
+                    location_id=container.get("id"),
+                    path=[],
+                    api_operations=self.dremio_api,
+                )
+            elif container_type == DremioEntityContainerType.FOLDER:
+                yield DremioFolder(
+                    container_name=container.get("name"),
+                    location_id=container.get("id"),
+                    path=container.get("path"),
+                    api_operations=self.dremio_api,
+                )
+
+    def get_sources(self) -> Iterator[DremioSourceContainer]:
+        """Get all Dremio source containers (external data connections) as an iterator."""
+        for container in self.get_containers():
+            if isinstance(container, DremioSourceContainer):
+                yield container
 
-    def get_sources(self) -> Deque[DremioSourceContainer]:
-        self.set_containers()
-        return self.sources
+    def get_glossary_terms(self) -> Iterator[DremioGlossaryTerm]:
+        """Get all unique glossary terms (tags) from datasets."""
+        glossary_terms_seen = set()
 
-    def get_glossary_terms(self) -> Deque[DremioGlossaryTerm]:
-        self.set_datasets()
-        self.set_containers()
-        return self.glossary_terms
+        for dataset in self.get_datasets():
+            for glossary_term in dataset.glossary_terms:
+                if glossary_term not in glossary_terms_seen:
+                    glossary_terms_seen.add(glossary_term)
+                    yield glossary_term
 
     def is_valid_query(self, query: Dict[str, Any]) -> bool:
         required_fields = [
@@ -447,6 +462,7 @@ class DremioCatalog:
         return all(query.get(field) for field in required_fields)
 
     def get_queries(self) -> Deque[DremioQuery]:
+        """Get all valid Dremio queries for lineage analysis."""
        for query in self.dremio_api.extract_all_queries():
            if not self.is_valid_query(query):
                continue
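
The DremioCatalog changes above replace eagerly populated deques (set_datasets, get_containers returning deques) with generator methods. One consequence is that each call to get_datasets() or get_glossary_terms() re-runs the underlying Dremio queries, so a caller that needs several views of the catalog may prefer a single pass. A hedged sketch, assuming a DremioCatalog instance named catalog built from a configured DremioAPIOperations (names below are illustrative):

def summarize_catalog(catalog) -> None:
    # Single pass over the lazily yielded datasets, collecting the same
    # de-duplicated glossary terms that get_glossary_terms() would produce.
    dataset_count = 0
    seen_terms = set()
    for dataset in catalog.get_datasets():
        dataset_count += 1
        for term in dataset.glossary_terms:
            seen_terms.add(term)
    print(f"datasets: {dataset_count}, unique glossary terms: {len(seen_terms)}")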
@@ -17,6 +17,7 @@ from datahub.metadata.schema_classes import (
     DatasetProfileClass,
     QuantileClass,
 )
+from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
 
@@ -64,8 +65,13 @@ class DremioProfiler:
             )
             return
 
-        profile_data = self.profile_table(full_table_name, columns)
-        profile_aspect = self.populate_profile_aspect(profile_data)
+        with PerfTimer() as timer:
+            profile_data = self.profile_table(full_table_name, columns)
+            profile_aspect = self.populate_profile_aspect(profile_data)
+
+        logger.info(
+            f"Profiled table {full_table_name} with {len(columns)} columns in {timer.elapsed_seconds():.2f} seconds"
+        )
 
         if profile_aspect:
             self.report.report_entity_profiled(dataset.resource_name)
@@ -131,7 +137,12 @@ class DremioProfiler:
     def _profile_chunk(self, table_name: str, columns: List[Tuple[str, str]]) -> Dict:
         profile_sql = self._build_profile_sql(table_name, columns)
         try:
-            results = self.api_operations.execute_query(profile_sql)
+            with PerfTimer() as timer:
+                results = self.api_operations.execute_query(profile_sql)
+
+            logger.debug(
+                f"Profiling query for {table_name} ({len(columns)} columns) completed in {timer.elapsed_seconds():.2f} seconds"
+            )
             return self._parse_profile_results(results, columns)
         except DremioAPIException as e:
            raise e
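
The DremioProfiler hunks above wrap profiling calls in PerfTimer (imported from datahub.utilities.perf_timer) and log elapsed_seconds(). For readers unfamiliar with that utility, the sketch below is a rough stand-in with the same context-manager shape; it is not the datahub implementation:

import time


class SimplePerfTimer:
    # Approximation of the PerfTimer interface used above: a context manager
    # exposing elapsed_seconds() after (or during) the timed block.
    def __enter__(self) -> "SimplePerfTimer":
        self._start = time.monotonic()
        self._end = None
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self._end = time.monotonic()

    def elapsed_seconds(self) -> float:
        end = self._end if self._end is not None else time.monotonic()
        return end - self._start


with SimplePerfTimer() as timer:
    sum(range(1_000_000))  # stand-in for a profiling query
print(f"completed in {timer.elapsed_seconds():.2f} seconds")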