acryl-datahub 1.3.0.1rc8__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/METADATA +2654 -2647
- {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/RECORD +74 -74
- datahub/_version.py +1 -1
- datahub/ingestion/graph/client.py +5 -1
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/dremio/dremio_api.py +212 -78
- datahub/ingestion/source/dremio/dremio_entities.py +55 -39
- datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
- datahub/ingestion/source/dremio/dremio_source.py +9 -11
- datahub/ingestion/source/elastic_search.py +106 -29
- datahub/ingestion/source/snowflake/snowflake_queries.py +27 -3
- datahub/ingestion/source/sql_queries.py +164 -15
- datahub/metadata/_internal_schema_classes.py +62 -2
- datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- datahub/metadata/schema.avsc +264 -89
- datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
- datahub/metadata/schemas/AssertionInfo.avsc +48 -5
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
- datahub/metadata/schemas/ChartInfo.avsc +12 -5
- datahub/metadata/schemas/ContainerProperties.avsc +12 -5
- datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
- datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
- datahub/metadata/schemas/DashboardInfo.avsc +16 -4
- datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
- datahub/metadata/schemas/DataJobInfo.avsc +9 -4
- datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
- datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- datahub/metadata/schemas/DataProductProperties.avsc +5 -2
- datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/DatasetProperties.avsc +12 -5
- datahub/metadata/schemas/DomainProperties.avsc +7 -3
- datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
- datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalTags.avsc +3 -2
- datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- datahub/metadata/schemas/InputFields.avsc +3 -2
- datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLModelProperties.avsc +4 -2
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
- datahub/metadata/schemas/NotebookInfo.avsc +5 -2
- datahub/metadata/schemas/Ownership.avsc +3 -2
- datahub/metadata/schemas/RoleProperties.avsc +3 -1
- datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +9 -3
- datahub/metadata/schemas/TagProperties.avsc +3 -1
- datahub/metadata/schemas/TestInfo.avsc +2 -1
- datahub/sql_parsing/schema_resolver.py +29 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
- {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_api.py

@@ -7,7 +7,7 @@ from collections import defaultdict
 from enum import Enum
 from itertools import product
 from time import sleep, time
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union
 from urllib.parse import quote

 import requests
@@ -343,14 +343,149 @@ class DremioAPIOperations:

         while True:
             result = self.get_job_result(job_id, offset, limit)
-            rows.extend(result["rows"])

-
-
+            # Handle cases where API response doesn't contain 'rows' key
+            # This can happen with OOM errors or when no rows are returned
+            if "rows" not in result:
+                logger.warning(
+                    f"API response for job {job_id} missing 'rows' key. "
+                    f"Response keys: {list(result.keys())}"
+                )
+                # Check for error conditions
+                if "errorMessage" in result:
+                    raise DremioAPIException(f"Query error: {result['errorMessage']}")
+                elif "message" in result:
+                    logger.warning(
+                        f"Query warning for job {job_id}: {result['message']}"
+                    )
+                # Return empty list if no rows key and no error
+                break
+
+            # Handle empty rows response
+            result_rows = result["rows"]
+            if not result_rows:
+                logger.debug(
+                    f"No more rows returned for job {job_id} at offset {offset}"
+                )
                 break

+            rows.extend(result_rows)
+
+            # Check actual returned rows to determine if we should continue
+            actual_rows_returned = len(result_rows)
+            if actual_rows_returned == 0:
+                logger.debug(f"Query returned no rows for job {job_id}")
+                break
+
+            offset = offset + actual_rows_returned
+            # If we got fewer rows than requested, we've reached the end
+            if actual_rows_returned < limit:
+                break
+
+        logger.info(f"Fetched {len(rows)} total rows for job {job_id}")
         return rows

+    def _fetch_results_iter(self, job_id: str) -> Iterator[Dict]:
+        """
+        Fetch job results in a streaming fashion to reduce memory usage.
+        Yields individual rows instead of collecting all in memory.
+        """
+        limit = 500
+        offset = 0
+        total_rows_fetched = 0
+
+        while True:
+            result = self.get_job_result(job_id, offset, limit)
+
+            # Handle cases where API response doesn't contain 'rows' key
+            if "rows" not in result:
+                logger.warning(
+                    f"API response for job {job_id} missing 'rows' key. "
+                    f"Response keys: {list(result.keys())}"
+                )
+                # Check for error conditions
+                if "errorMessage" in result:
+                    raise DremioAPIException(f"Query error: {result['errorMessage']}")
+                elif "message" in result:
+                    logger.warning(
+                        f"Query warning for job {job_id}: {result['message']}"
+                    )
+                # Stop iteration if no rows key and no error
+                break
+
+            # Handle empty rows response
+            result_rows = result["rows"]
+            if not result_rows:
+                logger.debug(
+                    f"No more rows returned for job {job_id} at offset {offset}"
+                )
+                break
+
+            # Yield individual rows instead of collecting them
+            for row in result_rows:
+                yield row
+                total_rows_fetched += 1
+
+            # Check actual returned rows to determine if we should continue
+            actual_rows_returned = len(result_rows)
+            if actual_rows_returned == 0:
+                logger.debug(f"Query returned no rows for job {job_id}")
+                break
+
+            offset = offset + actual_rows_returned
+            # If we got fewer rows than requested, we've reached the end
+            if actual_rows_returned < limit:
+                break
+
+        logger.info(f"Streamed {total_rows_fetched} total rows for job {job_id}")
+
+    def execute_query_iter(
+        self, query: str, timeout: int = 3600
+    ) -> Iterator[Dict[str, Any]]:
+        """Execute SQL query and return results as a streaming iterator"""
+        try:
+            with PerfTimer() as timer:
+                logger.info(f"Executing streaming query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))
+
+                if "errorMessage" in response:
+                    self.report.failure(
+                        message="SQL Error", context=f"{response['errorMessage']}"
+                    )
+                    raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
+
+                job_id = response["id"]
+
+                # Wait for job completion
+                start_time = time()
+                while True:
+                    status = self.get_job_status(job_id)
+                    if status["jobState"] == "COMPLETED":
+                        break
+                    elif status["jobState"] == "FAILED":
+                        error_message = status.get("errorMessage", "Unknown error")
+                        raise RuntimeError(f"Query failed: {error_message}")
+                    elif status["jobState"] == "CANCELED":
+                        raise RuntimeError("Query was canceled")
+
+                    if time() - start_time > timeout:
+                        self.cancel_query(job_id)
+                        raise DremioAPIException(
+                            f"Query execution timed out after {timeout} seconds"
+                        )
+
+                    sleep(3)
+
+                logger.info(
+                    f"Query job completed in {timer.elapsed_seconds()} seconds, starting streaming"
+                )
+
+            # Return streaming iterator
+            return self._fetch_results_iter(job_id)
+
+        except requests.RequestException as e:
+            raise DremioAPIException("Error executing streaming query") from e
+
     def cancel_query(self, job_id: str) -> None:
         """Cancel a running query"""
         try:
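Note: the core of `_fetch_results_iter` above is offset/limit pagination wrapped in a generator. A minimal, self-contained sketch of that pattern, with `fake_get_job_result` standing in for the real `get_job_result` call (the data is invented):

from typing import Any, Dict, Iterator, List

_DATA: List[Dict[str, Any]] = [{"n": i} for i in range(1234)]

def fake_get_job_result(offset: int, limit: int) -> Dict[str, Any]:
    # Stand-in for DremioAPIOperations.get_job_result (hypothetical data).
    return {"rows": _DATA[offset : offset + limit]}

def fetch_rows_iter(limit: int = 500) -> Iterator[Dict[str, Any]]:
    offset = 0
    while True:
        rows = fake_get_job_result(offset, limit).get("rows", [])
        if not rows:
            break
        yield from rows  # stream rows instead of accumulating them
        offset += len(rows)
        if len(rows) < limit:  # a short page means we've reached the end
            break

assert sum(1 for _ in fetch_rows_iter()) == 1234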
@@ -499,8 +634,12 @@ class DremioAPIOperations:
         return f"AND {operator}({field}, '{pattern_str}')"

     def get_all_tables_and_columns(
-        self, containers:
-    ) ->
+        self, containers: Iterator["DremioContainer"]
+    ) -> Iterator[Dict]:
+        """
+        Memory-efficient streaming version that yields tables one at a time.
+        Reduces memory usage for large datasets by processing results as they come.
+        """
         if self.edition == DremioEdition.ENTERPRISE:
             query_template = DremioSQLQueries.QUERY_DATASETS_EE
         elif self.edition == DremioEdition.CLOUD:
@@ -517,93 +656,85 @@ class DremioAPIOperations:
             self.deny_schema_pattern, schema_field, allow=False
         )

-
-
+        # Process each container's results separately to avoid memory buildup
         for schema in containers:
-            formatted_query = ""
             try:
                 formatted_query = query_template.format(
                     schema_pattern=schema_condition,
                     deny_schema_pattern=deny_schema_condition,
                     container_name=schema.container_name.lower(),
                 )
-                all_tables_and_columns.extend(
-                    self.execute_query(
-                        query=formatted_query,
-                    )
-                )
-            except DremioAPIException as e:
-                self.report.warning(
-                    message="Container has no tables or views",
-                    context=f"{schema.subclass} {schema.container_name}",
-                    exc=e,
-                )

-
+                # Use streaming query execution
+                container_results = list(self.execute_query_iter(query=formatted_query))

-
-
+                if self.edition == DremioEdition.COMMUNITY:
+                    # Process community edition results
+                    formatted_tables = self.community_get_formatted_tables(
+                        container_results
+                    )
+                    for table in formatted_tables:
+                        yield table
+                else:
+                    # Process enterprise/cloud edition results
+                    column_dictionary: Dict[str, List[Dict]] = defaultdict(list)
+                    table_metadata: Dict[str, Dict] = {}

-
-
+                    for record in container_results:
+                        if not record.get("COLUMN_NAME"):
+                            continue

-
-
-
+                        table_full_path = record.get("FULL_TABLE_PATH")
+                        if not table_full_path:
+                            continue

-
-
-
+                        # Store column information
+                        column_dictionary[table_full_path].append(
+                            {
+                                "name": record["COLUMN_NAME"],
+                                "ordinal_position": record["ORDINAL_POSITION"],
+                                "is_nullable": record["IS_NULLABLE"],
+                                "data_type": record["DATA_TYPE"],
+                                "column_size": record["COLUMN_SIZE"],
+                            }
+                        )

-
-
-
-
-
-
-
-
-
+                        # Store table metadata (only once per table)
+                        if table_full_path not in table_metadata:
+                            table_metadata[table_full_path] = {
+                                "TABLE_NAME": record.get("TABLE_NAME"),
+                                "TABLE_SCHEMA": record.get("TABLE_SCHEMA"),
+                                "VIEW_DEFINITION": record.get("VIEW_DEFINITION"),
+                                "RESOURCE_ID": record.get("RESOURCE_ID"),
+                                "LOCATION_ID": record.get("LOCATION_ID"),
+                                "OWNER": record.get("OWNER"),
+                                "OWNER_TYPE": record.get("OWNER_TYPE"),
+                                "CREATED": record.get("CREATED"),
+                                "FORMAT_TYPE": record.get("FORMAT_TYPE"),
+                            }

-
-
-
-
-
-                        "
-                        "
-                        "
-                        "
-                        "
-                        "
-                        "
-                        "
-
-            )
-                if key in dictionary
-            ): dictionary
-            for dictionary in all_tables_and_columns
-        }.values()
-    )
+                    # Yield tables one at a time
+                    for table_path, table_info in table_metadata.items():
+                        yield {
+                            "TABLE_NAME": table_info.get("TABLE_NAME"),
+                            "TABLE_SCHEMA": table_info.get("TABLE_SCHEMA"),
+                            "COLUMNS": column_dictionary[table_path],
+                            "VIEW_DEFINITION": table_info.get("VIEW_DEFINITION"),
+                            "RESOURCE_ID": table_info.get("RESOURCE_ID"),
+                            "LOCATION_ID": table_info.get("LOCATION_ID"),
+                            "OWNER": table_info.get("OWNER"),
+                            "OWNER_TYPE": table_info.get("OWNER_TYPE"),
+                            "CREATED": table_info.get("CREATED"),
+                            "FORMAT_TYPE": table_info.get("FORMAT_TYPE"),
+                        }

-
-
-
-
-
-            "COLUMNS": column_dictionary[table["FULL_TABLE_PATH"]],
-            "VIEW_DEFINITION": table.get("VIEW_DEFINITION"),
-            "RESOURCE_ID": table.get("RESOURCE_ID"),
-            "LOCATION_ID": table.get("LOCATION_ID"),
-            "OWNER": table.get("OWNER"),
-            "OWNER_TYPE": table.get("OWNER_TYPE"),
-            "CREATED": table.get("CREATED"),
-            "FORMAT_TYPE": table.get("FORMAT_TYPE"),
-            }
+            except DremioAPIException as e:
+                self.report.warning(
+                    message="Container has no tables or views",
+                    context=f"{schema.subclass} {schema.container_name}",
+                    exc=e,
                 )

-        return tables
-
     def validate_schema_format(self, schema):
         if "." in schema:
             schema_path = self.get(
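For reference, the enterprise/cloud branch above groups flat column-level records into one dict per table before yielding. A self-contained sketch of that grouping pattern (field names trimmed down; the sample records are invented):

from collections import defaultdict
from typing import Any, Dict, Iterator, List

records = [
    {"FULL_TABLE_PATH": "src.db.t1", "TABLE_NAME": "t1", "COLUMN_NAME": "id"},
    {"FULL_TABLE_PATH": "src.db.t1", "TABLE_NAME": "t1", "COLUMN_NAME": "name"},
    {"FULL_TABLE_PATH": "src.db.t2", "TABLE_NAME": "t2", "COLUMN_NAME": "id"},
]

def group_by_table(rows: List[Dict[str, Any]]) -> Iterator[Dict[str, Any]]:
    columns: Dict[str, List[str]] = defaultdict(list)
    metadata: Dict[str, Dict[str, Any]] = {}
    for rec in rows:
        path = rec["FULL_TABLE_PATH"]
        columns[path].append(rec["COLUMN_NAME"])  # collect columns per table
        metadata.setdefault(path, {"TABLE_NAME": rec["TABLE_NAME"]})  # once per table
    for path, info in metadata.items():
        yield {**info, "COLUMNS": columns[path]}

for table in group_by_table(records):
    print(table["TABLE_NAME"], table["COLUMNS"])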
@@ -640,7 +771,10 @@ class DremioAPIOperations:

         return parents_list

-    def extract_all_queries(self) ->
+    def extract_all_queries(self) -> Iterator[Dict[str, Any]]:
+        """
+        Memory-efficient streaming version for extracting query results.
+        """
         # Convert datetime objects to string format for SQL queries
         start_timestamp_str = None
         end_timestamp_str = None
@@ -661,7 +795,7 @@ class DremioAPIOperations:
             end_timestamp_millis=end_timestamp_str,
         )

-        return self.
+        return self.execute_query_iter(query=jobs_query)

     def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
         """
datahub/ingestion/source/dremio/dremio_entities.py

@@ -1,4 +1,3 @@
-import itertools
 import logging
 import re
 import uuid
@@ -6,7 +5,7 @@ from collections import deque
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
-from typing import Any, Deque, Dict, List, Optional
+from typing import Any, Deque, Dict, Iterator, List, Optional

 from sqlglot import parse_one

@@ -184,6 +183,7 @@ class DremioQuery:
         return ""

     def get_raw_query(self, sql_query: str) -> str:
+        """Remove comments from SQL query using sqlglot parser."""
         try:
             parsed = parse_one(sql_query)
             return parsed.sql(comments=False)
@@ -336,43 +336,26 @@ class DremioCatalog:
     def __init__(self, dremio_api: DremioAPIOperations):
         self.dremio_api = dremio_api
         self.edition = dremio_api.edition
-        self.datasets: Deque[DremioDataset] = deque()
         self.sources: Deque[DremioSourceContainer] = deque()
         self.spaces: Deque[DremioSpace] = deque()
         self.folders: Deque[DremioFolder] = deque()
-        self.glossary_terms: Deque[DremioGlossaryTerm] = deque()
         self.queries: Deque[DremioQuery] = deque()

-        self.datasets_populated = False
         self.containers_populated = False
         self.queries_populated = False

-    def
-
-
+    def get_datasets(self) -> Iterator[DremioDataset]:
+        """Get all Dremio datasets (tables and views) as an iterator."""
+        # Get containers directly without storing them
+        containers = self.get_containers()

-
-
-
-
-
-            containers=containers
-        ):
-            dremio_dataset = DremioDataset(
-                dataset_details=dataset_details,
-                api_operations=self.dremio_api,
-            )
-            self.datasets.append(dremio_dataset)
-
-            for glossary_term in dremio_dataset.glossary_terms:
-                if glossary_term not in self.glossary_terms:
-                    self.glossary_terms.append(glossary_term)
-
-        self.datasets_populated = True
+        for dataset_details in self.dremio_api.get_all_tables_and_columns(containers):
+            dremio_dataset = DremioDataset(
+                dataset_details=dataset_details,
+                api_operations=self.dremio_api,
+            )

-
-        self.set_datasets()
-        return self.datasets
+            yield dremio_dataset

     def set_containers(self) -> None:
         if not self.containers_populated:
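As shown above, `get_datasets` now chains `get_containers()` straight into `get_all_tables_and_columns()`, so the whole path stays lazy. A toy sketch of that generator-pipeline shape (the names are illustrative, not the real API):

from typing import Iterator

def containers() -> Iterator[str]:
    yield from ["source_a", "space_b"]

def datasets(container_iter: Iterator[str]) -> Iterator[str]:
    for c in container_iter:
        # The real code issues one streamed query per container here.
        for t in ("t1", "t2"):
            yield f"{c}.{t}"

# Nothing executes until the pipeline is consumed:
for name in datasets(containers()):
    print(name)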
@@ -423,18 +406,50 @@ class DremioCatalog:

         self.containers_populated = True

-    def get_containers(self) ->
-
-
+    def get_containers(self) -> Iterator[DremioContainer]:
+        """Get all containers (sources, spaces, folders) as an iterator."""
+        for container in self.dremio_api.get_all_containers():
+            container_type = container.get("container_type")
+            if container_type == DremioEntityContainerType.SOURCE:
+                yield DremioSourceContainer(
+                    container_name=container.get("name"),
+                    location_id=container.get("id"),
+                    path=[],
+                    api_operations=self.dremio_api,
+                    dremio_source_type=container.get("source_type") or "",
+                    root_path=container.get("root_path"),
+                    database_name=container.get("database_name"),
+                )
+            elif container_type == DremioEntityContainerType.SPACE:
+                yield DremioSpace(
+                    container_name=container.get("name"),
+                    location_id=container.get("id"),
+                    path=[],
+                    api_operations=self.dremio_api,
+                )
+            elif container_type == DremioEntityContainerType.FOLDER:
+                yield DremioFolder(
+                    container_name=container.get("name"),
+                    location_id=container.get("id"),
+                    path=container.get("path"),
+                    api_operations=self.dremio_api,
+                )
+
+    def get_sources(self) -> Iterator[DremioSourceContainer]:
+        """Get all Dremio source containers (external data connections) as an iterator."""
+        for container in self.get_containers():
+            if isinstance(container, DremioSourceContainer):
+                yield container

-    def
-
-
+    def get_glossary_terms(self) -> Iterator[DremioGlossaryTerm]:
+        """Get all unique glossary terms (tags) from datasets."""
+        glossary_terms_seen = set()

-
-
-
-
+        for dataset in self.get_datasets():
+            for glossary_term in dataset.glossary_terms:
+                if glossary_term not in glossary_terms_seen:
+                    glossary_terms_seen.add(glossary_term)
+                    yield glossary_term

     def is_valid_query(self, query: Dict[str, Any]) -> bool:
         required_fields = [
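`get_glossary_terms` above dedupes while streaming: a seen-set drops repeats without materializing the full sequence. The generic pattern, as a self-contained sketch:

from typing import Hashable, Iterable, Iterator, TypeVar

T = TypeVar("T", bound=Hashable)

def unique(items: Iterable[T]) -> Iterator[T]:
    seen = set()
    for item in items:
        if item not in seen:
            seen.add(item)  # remember it so later duplicates are skipped
            yield item

assert list(unique(["pii", "finance", "pii"])) == ["pii", "finance"]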
@@ -447,6 +462,7 @@ class DremioCatalog:
         return all(query.get(field) for field in required_fields)

     def get_queries(self) -> Deque[DremioQuery]:
+        """Get all valid Dremio queries for lineage analysis."""
         for query in self.dremio_api.extract_all_queries():
             if not self.is_valid_query(query):
                 continue
datahub/ingestion/source/dremio/dremio_profiling.py

@@ -17,6 +17,7 @@ from datahub.metadata.schema_classes import (
     DatasetProfileClass,
     QuantileClass,
 )
+from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)

@@ -64,8 +65,13 @@ class DremioProfiler:
             )
             return

-
-
+        with PerfTimer() as timer:
+            profile_data = self.profile_table(full_table_name, columns)
+            profile_aspect = self.populate_profile_aspect(profile_data)
+
+            logger.info(
+                f"Profiled table {full_table_name} with {len(columns)} columns in {timer.elapsed_seconds():.2f} seconds"
+            )

         if profile_aspect:
             self.report.report_entity_profiled(dataset.resource_name)
@@ -131,7 +137,12 @@ class DremioProfiler:
     def _profile_chunk(self, table_name: str, columns: List[Tuple[str, str]]) -> Dict:
         profile_sql = self._build_profile_sql(table_name, columns)
         try:
-
+            with PerfTimer() as timer:
+                results = self.api_operations.execute_query(profile_sql)
+
+            logger.debug(
+                f"Profiling query for {table_name} ({len(columns)} columns) completed in {timer.elapsed_seconds():.2f} seconds"
+            )
             return self._parse_profile_results(results, columns)
         except DremioAPIException as e:
             raise e
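`PerfTimer` (from `datahub.utilities.perf_timer`) is used above as a context-manager stopwatch with an `elapsed_seconds()` accessor. A minimal stand-in with the same shape, if you want the pattern without the dependency (sketch only, not the library's implementation):

import time
from typing import Optional

class SimpleTimer:
    """Minimal stand-in for datahub.utilities.perf_timer.PerfTimer."""

    def __enter__(self) -> "SimpleTimer":
        self._start = time.perf_counter()
        self._end: Optional[float] = None
        return self

    def __exit__(self, *exc: object) -> None:
        self._end = time.perf_counter()

    def elapsed_seconds(self) -> float:
        # Works both inside and after the with-block.
        end = self._end if self._end is not None else time.perf_counter()
        return end - self._start

with SimpleTimer() as timer:
    time.sleep(0.05)
print(f"took {timer.elapsed_seconds():.2f}s")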
datahub/ingestion/source/dremio/dremio_source.py

@@ -55,7 +55,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import (
     LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
-
+    PROFILING,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
@@ -201,7 +201,7 @@ class DremioSource(StatefulIngestionSourceBase):
         return "dremio"

     def _build_source_map(self) -> Dict[str, DremioSourceMapEntry]:
-        dremio_sources = self.dremio_catalog.get_sources()
+        dremio_sources = list(self.dremio_catalog.get_sources())
         source_mappings_config = self.config.source_mappings or []

         source_map = build_dremio_source_map(dremio_sources, source_mappings_config)
@@ -242,9 +242,7 @@ class DremioSource(StatefulIngestionSourceBase):
         )

         # Process Datasets
-
-
-        for dataset_info in datasets:
+        for dataset_info in self.dremio_catalog.get_datasets():
             try:
                 yield from self.process_dataset(dataset_info)
                 logger.info(
@@ -258,10 +256,8 @@ class DremioSource(StatefulIngestionSourceBase):
                     exc=exc,
                 )

-        # Process Glossary Terms
-
-
-        for glossary_term in glossary_terms:
+        # Process Glossary Terms using streaming
+        for glossary_term in self.dremio_catalog.get_glossary_terms():
             try:
                 yield from self.process_glossary_term(glossary_term)
             except Exception as exc:
@@ -283,14 +279,16 @@ class DremioSource(StatefulIngestionSourceBase):
         # Profiling
         if self.config.is_profiling_enabled():
             with (
-                self.report.
+                self.report.new_stage(PROFILING),
                 ThreadPoolExecutor(
                     max_workers=self.config.profiling.max_workers
                 ) as executor,
             ):
+                # Collect datasets for profiling
+                datasets_for_profiling = list(self.dremio_catalog.get_datasets())
                 future_to_dataset = {
                     executor.submit(self.generate_profiles, dataset): dataset
-                    for dataset in
+                    for dataset in datasets_for_profiling
                 }

                 for future in as_completed(future_to_dataset):