acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1.1__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2550 -2543
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +263 -261
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +2 -2
- datahub/api/entities/corpgroup/corpgroup.py +11 -6
- datahub/api/entities/corpuser/corpuser.py +11 -11
- datahub/api/entities/dataproduct/dataproduct.py +47 -27
- datahub/api/entities/dataset/dataset.py +32 -21
- datahub/api/entities/external/lake_formation_external_entites.py +5 -6
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
- datahub/api/entities/forms/forms.py +16 -14
- datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
- datahub/cli/check_cli.py +2 -2
- datahub/cli/config_utils.py +3 -3
- datahub/cli/lite_cli.py +9 -7
- datahub/cli/migrate.py +4 -4
- datahub/cli/quickstart_versioning.py +3 -3
- datahub/cli/specific/group_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +1 -1
- datahub/cli/specific/user_cli.py +1 -1
- datahub/configuration/common.py +14 -2
- datahub/configuration/connection_resolver.py +2 -2
- datahub/configuration/git.py +47 -30
- datahub/configuration/import_resolver.py +2 -2
- datahub/configuration/kafka.py +4 -3
- datahub/configuration/time_window_config.py +26 -26
- datahub/configuration/validate_field_deprecation.py +2 -2
- datahub/configuration/validate_field_removal.py +2 -2
- datahub/configuration/validate_field_rename.py +2 -2
- datahub/configuration/validate_multiline_string.py +2 -1
- datahub/emitter/kafka_emitter.py +3 -1
- datahub/emitter/rest_emitter.py +2 -4
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/report.py +1 -1
- datahub/ingestion/api/sink.py +1 -1
- datahub/ingestion/api/source.py +1 -1
- datahub/ingestion/glossary/datahub_classifier.py +11 -8
- datahub/ingestion/graph/client.py +5 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/reporting/file_reporter.py +5 -4
- datahub/ingestion/run/pipeline.py +7 -6
- datahub/ingestion/run/pipeline_config.py +12 -14
- datahub/ingestion/run/sink_callback.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +6 -4
- datahub/ingestion/source/abs/config.py +19 -19
- datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/abs/source.py +2 -2
- datahub/ingestion/source/aws/aws_common.py +1 -1
- datahub/ingestion/source/aws/glue.py +6 -4
- datahub/ingestion/source/aws/sagemaker.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +8 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
- datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
- datahub/ingestion/source/datahub/config.py +8 -8
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
- datahub/ingestion/source/dbt/dbt_common.py +39 -37
- datahub/ingestion/source/dbt/dbt_core.py +10 -12
- datahub/ingestion/source/debug/datahub_debug.py +1 -1
- datahub/ingestion/source/delta_lake/config.py +6 -4
- datahub/ingestion/source/dremio/dremio_api.py +212 -78
- datahub/ingestion/source/dremio/dremio_config.py +10 -6
- datahub/ingestion/source/dremio/dremio_entities.py +55 -39
- datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
- datahub/ingestion/source/dremio/dremio_source.py +24 -26
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/elastic_search.py +110 -32
- datahub/ingestion/source/excel/source.py +1 -1
- datahub/ingestion/source/feast.py +1 -1
- datahub/ingestion/source/file.py +5 -4
- datahub/ingestion/source/fivetran/config.py +17 -16
- datahub/ingestion/source/fivetran/fivetran.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +8 -10
- datahub/ingestion/source/ge_profiling_config.py +8 -5
- datahub/ingestion/source/grafana/grafana_api.py +2 -2
- datahub/ingestion/source/grafana/grafana_config.py +4 -3
- datahub/ingestion/source/grafana/grafana_source.py +1 -1
- datahub/ingestion/source/grafana/models.py +23 -5
- datahub/ingestion/source/hex/api.py +7 -5
- datahub/ingestion/source/hex/hex.py +4 -3
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +10 -10
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +7 -5
- datahub/ingestion/source/looker/looker_config.py +21 -20
- datahub/ingestion/source/looker/lookml_config.py +47 -47
- datahub/ingestion/source/metabase.py +8 -8
- datahub/ingestion/source/metadata/business_glossary.py +2 -2
- datahub/ingestion/source/metadata/lineage.py +13 -8
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/mode.py +6 -4
- datahub/ingestion/source/mongodb.py +4 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +17 -23
- datahub/ingestion/source/openapi.py +6 -8
- datahub/ingestion/source/powerbi/config.py +33 -32
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +1 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
- datahub/ingestion/source/preset.py +8 -8
- datahub/ingestion/source/pulsar.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
- datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +18 -20
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/usage.py +23 -3
- datahub/ingestion/source/s3/config.py +83 -62
- datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/s3/source.py +8 -5
- datahub/ingestion/source/sac/sac.py +5 -4
- datahub/ingestion/source/salesforce.py +3 -2
- datahub/ingestion/source/schema/json_schema.py +2 -2
- datahub/ingestion/source/sigma/data_classes.py +3 -2
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/sigma/sigma_api.py +7 -7
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
- datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_queries.py +28 -4
- datahub/ingestion/source/sql/athena.py +1 -1
- datahub/ingestion/source/sql/clickhouse.py +4 -2
- datahub/ingestion/source/sql/cockroachdb.py +1 -1
- datahub/ingestion/source/sql/druid.py +1 -1
- datahub/ingestion/source/sql/hana.py +1 -1
- datahub/ingestion/source/sql/hive.py +7 -5
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +13 -6
- datahub/ingestion/source/sql/mysql.py +1 -1
- datahub/ingestion/source/sql/oracle.py +17 -10
- datahub/ingestion/source/sql/postgres.py +2 -2
- datahub/ingestion/source/sql/presto.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +8 -9
- datahub/ingestion/source/sql/sql_generic.py +1 -1
- datahub/ingestion/source/sql/teradata.py +1 -1
- datahub/ingestion/source/sql/trino.py +1 -1
- datahub/ingestion/source/sql/vertica.py +5 -4
- datahub/ingestion/source/sql_queries.py +174 -22
- datahub/ingestion/source/state/checkpoint.py +2 -2
- datahub/ingestion/source/state/entity_removal_state.py +2 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +9 -9
- datahub/ingestion/source/tableau/tableau.py +14 -16
- datahub/ingestion/source/unity/azure_auth_config.py +15 -0
- datahub/ingestion/source/unity/config.py +51 -34
- datahub/ingestion/source/unity/connection.py +7 -1
- datahub/ingestion/source/unity/connection_test.py +1 -1
- datahub/ingestion/source/unity/proxy.py +216 -7
- datahub/ingestion/source/unity/proxy_types.py +91 -0
- datahub/ingestion/source/unity/source.py +29 -3
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
- datahub/ingestion/source/usage/usage_common.py +5 -3
- datahub/ingestion/source_config/csv_enricher.py +7 -6
- datahub/ingestion/source_config/operation_config.py +7 -4
- datahub/ingestion/source_config/pulsar.py +11 -15
- datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
- datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
- datahub/ingestion/transformer/add_dataset_properties.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
- datahub/ingestion/transformer/add_dataset_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
- datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
- datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
- datahub/ingestion/transformer/mark_dataset_status.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
- datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/replace_external_url.py +2 -2
- datahub/ingestion/transformer/set_browse_path.py +1 -1
- datahub/ingestion/transformer/tags_to_terms.py +1 -1
- datahub/lite/duckdb_lite.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_internal_schema_classes.py +62 -2
- datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- datahub/metadata/schema.avsc +271 -91
- datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
- datahub/metadata/schemas/AssertionInfo.avsc +48 -5
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
- datahub/metadata/schemas/ChartInfo.avsc +12 -5
- datahub/metadata/schemas/ContainerProperties.avsc +12 -5
- datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
- datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
- datahub/metadata/schemas/DashboardInfo.avsc +16 -4
- datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
- datahub/metadata/schemas/DataJobInfo.avsc +9 -4
- datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
- datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- datahub/metadata/schemas/DataProductProperties.avsc +5 -2
- datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/DatasetProperties.avsc +12 -5
- datahub/metadata/schemas/DomainProperties.avsc +7 -3
- datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
- datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalTags.avsc +3 -2
- datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- datahub/metadata/schemas/InputFields.avsc +3 -2
- datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLModelProperties.avsc +4 -2
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
- datahub/metadata/schemas/NotebookInfo.avsc +5 -2
- datahub/metadata/schemas/Ownership.avsc +3 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -1
- datahub/metadata/schemas/RoleProperties.avsc +3 -1
- datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- datahub/metadata/schemas/TagProperties.avsc +3 -1
- datahub/metadata/schemas/TestInfo.avsc +2 -1
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +2 -0
- datahub/sdk/search_filters.py +68 -40
- datahub/sdk/tag.py +112 -0
- datahub/secret/datahub_secret_store.py +7 -4
- datahub/secret/file_secret_store.py +1 -1
- datahub/sql_parsing/schema_resolver.py +29 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
- datahub/sql_parsing/sqlglot_lineage.py +5 -2
- datahub/testing/check_sql_parser_result.py +2 -2
- datahub/utilities/ingest_utils.py +1 -1
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_api.py

```diff
@@ -7,7 +7,7 @@ from collections import defaultdict
 from enum import Enum
 from itertools import product
 from time import sleep, time
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union
 from urllib.parse import quote
 
 import requests
@@ -343,14 +343,149 @@ class DremioAPIOperations:
 
         while True:
             result = self.get_job_result(job_id, offset, limit)
-            rows.extend(result["rows"])
 
-
-
+            # Handle cases where API response doesn't contain 'rows' key
+            # This can happen with OOM errors or when no rows are returned
+            if "rows" not in result:
+                logger.warning(
+                    f"API response for job {job_id} missing 'rows' key. "
+                    f"Response keys: {list(result.keys())}"
+                )
+                # Check for error conditions
+                if "errorMessage" in result:
+                    raise DremioAPIException(f"Query error: {result['errorMessage']}")
+                elif "message" in result:
+                    logger.warning(
+                        f"Query warning for job {job_id}: {result['message']}"
+                    )
+                # Return empty list if no rows key and no error
+                break
+
+            # Handle empty rows response
+            result_rows = result["rows"]
+            if not result_rows:
+                logger.debug(
+                    f"No more rows returned for job {job_id} at offset {offset}"
+                )
                 break
 
+            rows.extend(result_rows)
+
+            # Check actual returned rows to determine if we should continue
+            actual_rows_returned = len(result_rows)
+            if actual_rows_returned == 0:
+                logger.debug(f"Query returned no rows for job {job_id}")
+                break
+
+            offset = offset + actual_rows_returned
+            # If we got fewer rows than requested, we've reached the end
+            if actual_rows_returned < limit:
+                break
+
+        logger.info(f"Fetched {len(rows)} total rows for job {job_id}")
         return rows
 
+    def _fetch_results_iter(self, job_id: str) -> Iterator[Dict]:
+        """
+        Fetch job results in a streaming fashion to reduce memory usage.
+        Yields individual rows instead of collecting all in memory.
+        """
+        limit = 500
+        offset = 0
+        total_rows_fetched = 0
+
+        while True:
+            result = self.get_job_result(job_id, offset, limit)
+
+            # Handle cases where API response doesn't contain 'rows' key
+            if "rows" not in result:
+                logger.warning(
+                    f"API response for job {job_id} missing 'rows' key. "
+                    f"Response keys: {list(result.keys())}"
+                )
+                # Check for error conditions
+                if "errorMessage" in result:
+                    raise DremioAPIException(f"Query error: {result['errorMessage']}")
+                elif "message" in result:
+                    logger.warning(
+                        f"Query warning for job {job_id}: {result['message']}"
+                    )
+                # Stop iteration if no rows key and no error
+                break
+
+            # Handle empty rows response
+            result_rows = result["rows"]
+            if not result_rows:
+                logger.debug(
+                    f"No more rows returned for job {job_id} at offset {offset}"
+                )
+                break
+
+            # Yield individual rows instead of collecting them
+            for row in result_rows:
+                yield row
+                total_rows_fetched += 1
+
+            # Check actual returned rows to determine if we should continue
+            actual_rows_returned = len(result_rows)
+            if actual_rows_returned == 0:
+                logger.debug(f"Query returned no rows for job {job_id}")
+                break
+
+            offset = offset + actual_rows_returned
+            # If we got fewer rows than requested, we've reached the end
+            if actual_rows_returned < limit:
+                break
+
+        logger.info(f"Streamed {total_rows_fetched} total rows for job {job_id}")
+
+    def execute_query_iter(
+        self, query: str, timeout: int = 3600
+    ) -> Iterator[Dict[str, Any]]:
+        """Execute SQL query and return results as a streaming iterator"""
+        try:
+            with PerfTimer() as timer:
+                logger.info(f"Executing streaming query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))
+
+                if "errorMessage" in response:
+                    self.report.failure(
+                        message="SQL Error", context=f"{response['errorMessage']}"
+                    )
+                    raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
+
+                job_id = response["id"]
+
+                # Wait for job completion
+                start_time = time()
+                while True:
+                    status = self.get_job_status(job_id)
+                    if status["jobState"] == "COMPLETED":
+                        break
+                    elif status["jobState"] == "FAILED":
+                        error_message = status.get("errorMessage", "Unknown error")
+                        raise RuntimeError(f"Query failed: {error_message}")
+                    elif status["jobState"] == "CANCELED":
+                        raise RuntimeError("Query was canceled")
+
+                    if time() - start_time > timeout:
+                        self.cancel_query(job_id)
+                        raise DremioAPIException(
+                            f"Query execution timed out after {timeout} seconds"
+                        )
+
+                    sleep(3)
+
+                logger.info(
+                    f"Query job completed in {timer.elapsed_seconds()} seconds, starting streaming"
+                )
+
+                # Return streaming iterator
+                return self._fetch_results_iter(job_id)
+
+        except requests.RequestException as e:
+            raise DremioAPIException("Error executing streaming query") from e
+
     def cancel_query(self, job_id: str) -> None:
         """Cancel a running query"""
         try:
```
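The rewritten pagination above is the core of the change: the loop now advances `offset` by the number of rows actually returned and stops on a short page, a missing `rows` key, or an empty page, so a truncated or error response can no longer spin forever. Below is a minimal sketch of that termination pattern in isolation; `fetch_page` and `stream_rows` are hypothetical stand-ins, not names from the source.

```python
from typing import Any, Callable, Dict, Iterator, List

# Hypothetical stand-in for DremioAPIOperations.get_job_result: returns one
# page of results as {"rows": [...]} for a given offset and limit.
PageFetcher = Callable[[int, int], Dict[str, Any]]


def stream_rows(fetch_page: PageFetcher, limit: int = 500) -> Iterator[Dict[str, Any]]:
    """Yield rows page by page, mirroring the diff's termination rules."""
    offset = 0
    while True:
        page = fetch_page(offset, limit)
        rows: List[Dict[str, Any]] = page.get("rows", [])
        if not rows:  # missing 'rows' key or an empty page: stop
            break
        yield from rows
        offset += len(rows)  # advance by the rows actually returned
        if len(rows) < limit:  # a short page means we've reached the end
            break
```

The same rules back both the list-building loop and the new `_fetch_results_iter` generator; the only difference is whether rows are collected into a list or yielded as they arrive.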
datahub/ingestion/source/dremio/dremio_api.py (continued)

```diff
@@ -499,8 +634,12 @@ class DremioAPIOperations:
         return f"AND {operator}({field}, '{pattern_str}')"
 
     def get_all_tables_and_columns(
-        self, containers:
-    ) ->
+        self, containers: Iterator["DremioContainer"]
+    ) -> Iterator[Dict]:
+        """
+        Memory-efficient streaming version that yields tables one at a time.
+        Reduces memory usage for large datasets by processing results as they come.
+        """
         if self.edition == DremioEdition.ENTERPRISE:
             query_template = DremioSQLQueries.QUERY_DATASETS_EE
         elif self.edition == DremioEdition.CLOUD:
@@ -517,93 +656,85 @@ class DremioAPIOperations:
             self.deny_schema_pattern, schema_field, allow=False
         )
 
-
-
+        # Process each container's results separately to avoid memory buildup
         for schema in containers:
-            formatted_query = ""
             try:
                 formatted_query = query_template.format(
                     schema_pattern=schema_condition,
                     deny_schema_pattern=deny_schema_condition,
                     container_name=schema.container_name.lower(),
                 )
-                all_tables_and_columns.extend(
-                    self.execute_query(
-                        query=formatted_query,
-                    )
-                )
-            except DremioAPIException as e:
-                self.report.warning(
-                    message="Container has no tables or views",
-                    context=f"{schema.subclass} {schema.container_name}",
-                    exc=e,
-                )
 
-
+                # Use streaming query execution
+                container_results = list(self.execute_query_iter(query=formatted_query))
 
-
-
+                if self.edition == DremioEdition.COMMUNITY:
+                    # Process community edition results
+                    formatted_tables = self.community_get_formatted_tables(
+                        container_results
+                    )
+                    for table in formatted_tables:
+                        yield table
+                else:
+                    # Process enterprise/cloud edition results
+                    column_dictionary: Dict[str, List[Dict]] = defaultdict(list)
+                    table_metadata: Dict[str, Dict] = {}
 
-
-
+                    for record in container_results:
+                        if not record.get("COLUMN_NAME"):
+                            continue
 
-
-
-
+                        table_full_path = record.get("FULL_TABLE_PATH")
+                        if not table_full_path:
+                            continue
 
-
-
-
+                        # Store column information
+                        column_dictionary[table_full_path].append(
+                            {
+                                "name": record["COLUMN_NAME"],
+                                "ordinal_position": record["ORDINAL_POSITION"],
+                                "is_nullable": record["IS_NULLABLE"],
+                                "data_type": record["DATA_TYPE"],
+                                "column_size": record["COLUMN_SIZE"],
+                            }
+                        )
 
-
-
-
-
-
-
-
-
-
+                        # Store table metadata (only once per table)
+                        if table_full_path not in table_metadata:
+                            table_metadata[table_full_path] = {
+                                "TABLE_NAME": record.get("TABLE_NAME"),
+                                "TABLE_SCHEMA": record.get("TABLE_SCHEMA"),
+                                "VIEW_DEFINITION": record.get("VIEW_DEFINITION"),
+                                "RESOURCE_ID": record.get("RESOURCE_ID"),
+                                "LOCATION_ID": record.get("LOCATION_ID"),
+                                "OWNER": record.get("OWNER"),
+                                "OWNER_TYPE": record.get("OWNER_TYPE"),
+                                "CREATED": record.get("CREATED"),
+                                "FORMAT_TYPE": record.get("FORMAT_TYPE"),
+                            }
 
-
-
-
-
-
-"
-"
-"
-"
-"
-"
-"
-"
-
-)
-if key in dictionary
-): dictionary
-for dictionary in all_tables_and_columns
-}.values()
-)
+                    # Yield tables one at a time
+                    for table_path, table_info in table_metadata.items():
+                        yield {
+                            "TABLE_NAME": table_info.get("TABLE_NAME"),
+                            "TABLE_SCHEMA": table_info.get("TABLE_SCHEMA"),
+                            "COLUMNS": column_dictionary[table_path],
+                            "VIEW_DEFINITION": table_info.get("VIEW_DEFINITION"),
+                            "RESOURCE_ID": table_info.get("RESOURCE_ID"),
+                            "LOCATION_ID": table_info.get("LOCATION_ID"),
+                            "OWNER": table_info.get("OWNER"),
+                            "OWNER_TYPE": table_info.get("OWNER_TYPE"),
+                            "CREATED": table_info.get("CREATED"),
+                            "FORMAT_TYPE": table_info.get("FORMAT_TYPE"),
+                        }
 
-
-
-
-
-
-            "COLUMNS": column_dictionary[table["FULL_TABLE_PATH"]],
-            "VIEW_DEFINITION": table.get("VIEW_DEFINITION"),
-            "RESOURCE_ID": table.get("RESOURCE_ID"),
-            "LOCATION_ID": table.get("LOCATION_ID"),
-            "OWNER": table.get("OWNER"),
-            "OWNER_TYPE": table.get("OWNER_TYPE"),
-            "CREATED": table.get("CREATED"),
-            "FORMAT_TYPE": table.get("FORMAT_TYPE"),
-        }
+            except DremioAPIException as e:
+                self.report.warning(
+                    message="Container has no tables or views",
+                    context=f"{schema.subclass} {schema.container_name}",
+                    exc=e,
                 )
 
-        return tables
-
     def validate_schema_format(self, schema):
         if "." in schema:
             schema_path = self.get(
```
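`get_all_tables_and_columns` is now a generator: for each container it materializes that container's query results, groups the flat column records by `FULL_TABLE_PATH`, and yields one dict per table instead of returning a single list for everything. Below is a self-contained sketch of just the grouping step, with invented records in the same shape the diff reads (`FULL_TABLE_PATH`, `TABLE_NAME`, `COLUMN_NAME`); the helper name is illustrative.

```python
from collections import defaultdict
from typing import Any, Dict, Iterator, List


def group_columns_by_table(records: List[Dict[str, Any]]) -> Iterator[Dict[str, Any]]:
    """Group flat (table, column) records into one dict per table."""
    columns: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    tables: Dict[str, Dict[str, Any]] = {}
    for record in records:
        path = record.get("FULL_TABLE_PATH")
        if not path or not record.get("COLUMN_NAME"):
            continue  # skip records without a table path or column name
        columns[path].append({"name": record["COLUMN_NAME"]})
        tables.setdefault(path, {"TABLE_NAME": record.get("TABLE_NAME")})
    for path, info in tables.items():
        yield {**info, "COLUMNS": columns[path]}


# Invented sample records:
rows = [
    {"FULL_TABLE_PATH": "space.folder.t1", "TABLE_NAME": "t1", "COLUMN_NAME": "id"},
    {"FULL_TABLE_PATH": "space.folder.t1", "TABLE_NAME": "t1", "COLUMN_NAME": "name"},
]
for table in group_columns_by_table(rows):
    print(table["TABLE_NAME"], [c["name"] for c in table["COLUMNS"]])
```

Note that the per-container `column_dictionary` and `table_metadata` still live in memory for the duration of one container; the savings come from not holding every container's results at once.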
datahub/ingestion/source/dremio/dremio_api.py (continued)

```diff
@@ -640,7 +771,10 @@ class DremioAPIOperations:
 
         return parents_list
 
-    def extract_all_queries(self) ->
+    def extract_all_queries(self) -> Iterator[Dict[str, Any]]:
+        """
+        Memory-efficient streaming version for extracting query results.
+        """
         # Convert datetime objects to string format for SQL queries
         start_timestamp_str = None
         end_timestamp_str = None
@@ -661,7 +795,7 @@
             end_timestamp_millis=end_timestamp_str,
         )
 
-        return self.
+        return self.execute_query_iter(query=jobs_query)
 
     def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
         """
```
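With this change `extract_all_queries` returns the lazy iterator from `execute_query_iter` rather than a materialized list, so job-history rows are fetched only as the caller consumes them, and the iterator can be traversed only once. A short, illustrative consumer (names below are not from the source):

```python
from itertools import islice
from typing import Any, Dict, Iterator


def sample_queries(queries: Iterator[Dict[str, Any]], n: int = 10) -> list:
    """Peek at the first n query records without loading the full history."""
    return list(islice(queries, n))

# A second pass over the same iterator yields nothing; call
# extract_all_queries() again if the results are needed twice.
```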
datahub/ingestion/source/dremio/dremio_config.py

```diff
@@ -2,7 +2,7 @@ import os
 from typing import List, Literal, Optional
 
 import certifi
-from pydantic import Field,
+from pydantic import Field, ValidationInfo, field_validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
@@ -78,8 +78,9 @@ class DremioConnectionConfig(ConfigModel):
         description="ID of Dremio Cloud Project. Found in Project Settings in the Dremio Cloud UI",
     )
 
-    @
-
+    @field_validator("authentication_method", mode="after")
+    @classmethod
+    def validate_auth_method(cls, value: str) -> str:
         allowed_methods = ["password", "PAT"]
         if value not in allowed_methods:
             raise ValueError(
@@ -87,9 +88,12 @@ class DremioConnectionConfig(ConfigModel):
             )
         return value
 
-    @
-
-
+    @field_validator("password", mode="after")
+    @classmethod
+    def validate_password(
+        cls, value: Optional[str], info: ValidationInfo
+    ) -> Optional[str]:
+        if info.data.get("authentication_method") == "PAT" and not value:
             raise ValueError(
                 "Password (Personal Access Token) is required when using PAT authentication",
             )
```
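The validators above move from pydantic v1's `@validator` to v2's `@field_validator` plus `@classmethod`, with cross-field state read from `ValidationInfo.data`. One subtlety: `info.data` only contains fields validated before the current one, so the `password` check works because `authentication_method` is declared earlier on the model. A minimal standalone sketch of the same pattern; the model class here is illustrative, though the field and validator shapes mirror the diff:

```python
from typing import Optional

from pydantic import BaseModel, ValidationInfo, field_validator


class ConnectionSketch(BaseModel):
    # Field order matters: authentication_method is validated before
    # password, so info.data contains it inside validate_password.
    authentication_method: str = "password"
    password: Optional[str] = None

    @field_validator("password", mode="after")
    @classmethod
    def validate_password(
        cls, value: Optional[str], info: ValidationInfo
    ) -> Optional[str]:
        if info.data.get("authentication_method") == "PAT" and not value:
            raise ValueError(
                "Password (Personal Access Token) is required when using PAT authentication"
            )
        return value


ConnectionSketch(authentication_method="password")  # ok
# ConnectionSketch(authentication_method="PAT")     # raises ValidationError
```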
datahub/ingestion/source/dremio/dremio_entities.py

```diff
@@ -1,4 +1,3 @@
-import itertools
 import logging
 import re
 import uuid
@@ -6,7 +5,7 @@ from collections import deque
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
-from typing import Any, Deque, Dict, List, Optional
+from typing import Any, Deque, Dict, Iterator, List, Optional
 
 from sqlglot import parse_one
 
@@ -184,6 +183,7 @@ class DremioQuery:
         return ""
 
     def get_raw_query(self, sql_query: str) -> str:
+        """Remove comments from SQL query using sqlglot parser."""
         try:
             parsed = parse_one(sql_query)
             return parsed.sql(comments=False)
@@ -336,43 +336,26 @@ class DremioCatalog:
     def __init__(self, dremio_api: DremioAPIOperations):
         self.dremio_api = dremio_api
        self.edition = dremio_api.edition
-        self.datasets: Deque[DremioDataset] = deque()
         self.sources: Deque[DremioSourceContainer] = deque()
         self.spaces: Deque[DremioSpace] = deque()
         self.folders: Deque[DremioFolder] = deque()
-        self.glossary_terms: Deque[DremioGlossaryTerm] = deque()
         self.queries: Deque[DremioQuery] = deque()
 
-        self.datasets_populated = False
         self.containers_populated = False
         self.queries_populated = False
 
-    def
-
-
+    def get_datasets(self) -> Iterator[DremioDataset]:
+        """Get all Dremio datasets (tables and views) as an iterator."""
+        # Get containers directly without storing them
+        containers = self.get_containers()
 
-
-
-
-
-
-            containers=containers
-        ):
-            dremio_dataset = DremioDataset(
-                dataset_details=dataset_details,
-                api_operations=self.dremio_api,
-            )
-            self.datasets.append(dremio_dataset)
-
-            for glossary_term in dremio_dataset.glossary_terms:
-                if glossary_term not in self.glossary_terms:
-                    self.glossary_terms.append(glossary_term)
-
-        self.datasets_populated = True
+        for dataset_details in self.dremio_api.get_all_tables_and_columns(containers):
+            dremio_dataset = DremioDataset(
+                dataset_details=dataset_details,
+                api_operations=self.dremio_api,
+            )
 
-
-        self.set_datasets()
-        return self.datasets
+            yield dremio_dataset
 
     def set_containers(self) -> None:
         if not self.containers_populated:
@@ -423,18 +406,50 @@ class DremioCatalog:
 
         self.containers_populated = True
 
-    def get_containers(self) ->
-
-
+    def get_containers(self) -> Iterator[DremioContainer]:
+        """Get all containers (sources, spaces, folders) as an iterator."""
+        for container in self.dremio_api.get_all_containers():
+            container_type = container.get("container_type")
+            if container_type == DremioEntityContainerType.SOURCE:
+                yield DremioSourceContainer(
+                    container_name=container.get("name"),
+                    location_id=container.get("id"),
+                    path=[],
+                    api_operations=self.dremio_api,
+                    dremio_source_type=container.get("source_type") or "",
+                    root_path=container.get("root_path"),
+                    database_name=container.get("database_name"),
+                )
+            elif container_type == DremioEntityContainerType.SPACE:
+                yield DremioSpace(
+                    container_name=container.get("name"),
+                    location_id=container.get("id"),
+                    path=[],
+                    api_operations=self.dremio_api,
+                )
+            elif container_type == DremioEntityContainerType.FOLDER:
+                yield DremioFolder(
+                    container_name=container.get("name"),
+                    location_id=container.get("id"),
+                    path=container.get("path"),
+                    api_operations=self.dremio_api,
+                )
+
+    def get_sources(self) -> Iterator[DremioSourceContainer]:
+        """Get all Dremio source containers (external data connections) as an iterator."""
+        for container in self.get_containers():
+            if isinstance(container, DremioSourceContainer):
+                yield container
 
-    def
-
-
+    def get_glossary_terms(self) -> Iterator[DremioGlossaryTerm]:
+        """Get all unique glossary terms (tags) from datasets."""
+        glossary_terms_seen = set()
 
-
-
-
-
+        for dataset in self.get_datasets():
+            for glossary_term in dataset.glossary_terms:
+                if glossary_term not in glossary_terms_seen:
+                    glossary_terms_seen.add(glossary_term)
+                    yield glossary_term
 
     def is_valid_query(self, query: Dict[str, Any]) -> bool:
         required_fields = [
@@ -447,6 +462,7 @@ class DremioCatalog:
         return all(query.get(field) for field in required_fields)
 
     def get_queries(self) -> Deque[DremioQuery]:
+        """Get all valid Dremio queries for lineage analysis."""
         for query in self.dremio_api.extract_all_queries():
             if not self.is_valid_query(query):
                 continue
```
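`DremioCatalog` drops its cached deques (`datasets`, `glossary_terms`) in favor of generators, so each call to `get_datasets()` or `get_glossary_terms()` re-runs the underlying API queries rather than replaying a cache; callers that need the data twice should materialize it themselves. The deduplication in `get_glossary_terms` also assumes `DremioGlossaryTerm` is hashable. The seen-set pattern in isolation (an illustrative helper, not from the source):

```python
from typing import Hashable, Iterable, Iterator, TypeVar

T = TypeVar("T", bound=Hashable)


def unique(items: Iterable[T]) -> Iterator[T]:
    """Yield each item once, in first-seen order; items must be hashable."""
    seen: set = set()
    for item in items:
        if item not in seen:
            seen.add(item)
            yield item


print(list(unique(["pii", "finance", "pii"])))  # ['pii', 'finance']
```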
datahub/ingestion/source/dremio/dremio_profiling.py

```diff
@@ -17,6 +17,7 @@ from datahub.metadata.schema_classes import (
     DatasetProfileClass,
     QuantileClass,
 )
+from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
 
@@ -64,8 +65,13 @@ class DremioProfiler:
             )
             return
 
-
-
+        with PerfTimer() as timer:
+            profile_data = self.profile_table(full_table_name, columns)
+            profile_aspect = self.populate_profile_aspect(profile_data)
+
+        logger.info(
+            f"Profiled table {full_table_name} with {len(columns)} columns in {timer.elapsed_seconds():.2f} seconds"
+        )
 
         if profile_aspect:
             self.report.report_entity_profiled(dataset.resource_name)
@@ -131,7 +137,12 @@ class DremioProfiler:
     def _profile_chunk(self, table_name: str, columns: List[Tuple[str, str]]) -> Dict:
         profile_sql = self._build_profile_sql(table_name, columns)
         try:
-
+            with PerfTimer() as timer:
+                results = self.api_operations.execute_query(profile_sql)
+
+            logger.debug(
+                f"Profiling query for {table_name} ({len(columns)} columns) completed in {timer.elapsed_seconds():.2f} seconds"
+            )
             return self._parse_profile_results(results, columns)
         except DremioAPIException as e:
             raise e
```