acryl-datahub 1.2.0.4rc3__py3-none-any.whl → 1.2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub might be problematic.
Files changed (43)
  1. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/METADATA +2454 -2454
  2. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/RECORD +43 -41
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +1 -1
  5. datahub/api/entities/external/external_entities.py +500 -15
  6. datahub/ingestion/source/aws/glue.py +18 -14
  7. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  8. datahub/ingestion/source/aws/tag_entities.py +82 -104
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/hex/api.py +2 -0
  11. datahub/ingestion/source/hex/mapper.py +16 -2
  12. datahub/ingestion/source/hex/model.py +2 -0
  13. datahub/ingestion/source/looker/looker_common.py +26 -0
  14. datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -1
  15. datahub/ingestion/source/snowflake/constants.py +1 -0
  16. datahub/ingestion/source/snowflake/snowflake_config.py +8 -1
  17. datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
  18. datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
  19. datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
  20. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
  21. datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
  22. datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
  23. datahub/ingestion/source/sql/mssql/source.py +2 -25
  24. datahub/ingestion/source/sql/mysql.py +54 -0
  25. datahub/ingestion/source/sql/postgres.py +5 -134
  26. datahub/ingestion/source/sql/sql_common.py +137 -0
  27. datahub/ingestion/source/superset.py +140 -56
  28. datahub/ingestion/source/unity/config.py +11 -0
  29. datahub/ingestion/source/unity/connection_test.py +1 -0
  30. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  31. datahub/ingestion/source/unity/proxy.py +20 -6
  32. datahub/ingestion/source/unity/report.py +9 -1
  33. datahub/ingestion/source/unity/source.py +51 -16
  34. datahub/ingestion/source/unity/tag_entities.py +49 -147
  35. datahub/metadata/_internal_schema_classes.py +514 -514
  36. datahub/metadata/_urns/urn_defs.py +1684 -1684
  37. datahub/metadata/schema.avsc +16680 -16281
  38. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  39. datahub/metadata/schemas/Operation.avsc +4 -2
  40. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/WHEEL +0 -0
  41. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/entry_points.txt +0 -0
  42. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/licenses/LICENSE +0 -0
  43. {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/postgres.py
@@ -36,14 +36,8 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
-from datahub.ingestion.source.sql.sql_utils import (
-    gen_database_key,
-    gen_schema_key,
-)
 from datahub.ingestion.source.sql.stored_procedures.base import (
     BaseProcedure,
-    generate_procedure_container_workunits,
-    generate_procedure_workunits,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayTypeClass,
@@ -132,10 +126,12 @@ class PostgresConfig(BasePostgresConfig):
             "Note: this is not used if `database` or `sqlalchemy_uri` are provided."
         ),
     )
+
     include_stored_procedures: bool = Field(
         default=True,
         description="Include ingest of stored procedures.",
     )
+
     procedure_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for stored procedures to filter in ingestion."
@@ -310,73 +306,6 @@ class PostgresSource(SQLAlchemySource):
         except Exception as e:
             logger.error(f"failed to fetch profile metadata: {e}")

-    def get_schema_level_workunits(
-        self,
-        inspector: Inspector,
-        schema: str,
-        database: str,
-    ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
-        yield from super().get_schema_level_workunits(
-            inspector=inspector,
-            schema=schema,
-            database=database,
-        )
-
-        if self.config.include_stored_procedures:
-            try:
-                yield from self.loop_stored_procedures(inspector, schema, self.config)
-            except Exception as e:
-                self.report.failure(
-                    title="Failed to list stored procedures for schema",
-                    message="An error occurred while listing procedures for the schema.",
-                    context=f"{database}.{schema}",
-                    exc=e,
-                )
-
-    def loop_stored_procedures(
-        self,
-        inspector: Inspector,
-        schema: str,
-        config: PostgresConfig,
-    ) -> Iterable[MetadataWorkUnit]:
-        """
-        Loop schema data for get stored procedures as dataJob-s.
-        """
-        db_name = self.get_db_name(inspector)
-
-        procedures = self.fetch_procedures_for_schema(inspector, schema, db_name)
-        if procedures:
-            yield from self._process_procedures(procedures, db_name, schema)
-
-    def fetch_procedures_for_schema(
-        self, inspector: Inspector, schema: str, db_name: str
-    ) -> List[BaseProcedure]:
-        try:
-            raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
-                inspector, schema, db_name
-            )
-            procedures: List[BaseProcedure] = []
-            for procedure in raw_procedures:
-                procedure_qualified_name = self.get_identifier(
-                    schema=schema,
-                    entity=procedure.name,
-                    inspector=inspector,
-                )
-
-                if not self.config.procedure_pattern.allowed(procedure_qualified_name):
-                    self.report.report_dropped(procedure_qualified_name)
-                else:
-                    procedures.append(procedure)
-            return procedures
-        except Exception as e:
-            self.report.warning(
-                title="Failed to get procedures for schema",
-                message="An error occurred while fetching procedures for the schema.",
-                context=f"{db_name}.{schema}",
-                exc=e,
-            )
-            return []
-
     def get_procedures_for_schema(
         self, inspector: Inspector, schema: str, db_name: str
     ) -> List[BaseProcedure]:
@@ -401,10 +330,9 @@ class PostgresSource(SQLAlchemySource):
                 pg_language l ON l.oid = p.prolang
             WHERE
                 p.prokind = 'p'
-                AND n.nspname = '"""
-            + schema
-            + """';
-            """
+                AND n.nspname = %s;
+            """,
+            (schema,),
         )

         procedure_rows = list(procedures)
@@ -423,60 +351,3 @@ class PostgresSource(SQLAlchemySource):
                 )
             )
         return base_procedures
-
-    def _process_procedures(
-        self,
-        procedures: List[BaseProcedure],
-        db_name: str,
-        schema: str,
-    ) -> Iterable[MetadataWorkUnit]:
-        if procedures:
-            yield from generate_procedure_container_workunits(
-                database_key=gen_database_key(
-                    database=db_name,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-                schema_key=gen_schema_key(
-                    db_name=db_name,
-                    schema=schema,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-            )
-        for procedure in procedures:
-            yield from self._process_procedure(procedure, schema, db_name)
-
-    def _process_procedure(
-        self,
-        procedure: BaseProcedure,
-        schema: str,
-        db_name: str,
-    ) -> Iterable[MetadataWorkUnit]:
-        try:
-            yield from generate_procedure_workunits(
-                procedure=procedure,
-                database_key=gen_database_key(
-                    database=db_name,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-                schema_key=gen_schema_key(
-                    db_name=db_name,
-                    schema=schema,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-                schema_resolver=self.get_schema_resolver(),
-            )
-        except Exception as e:
-            self.report.warning(
-                title="Failed to emit stored procedure",
-                message="An error occurred while emitting stored procedure",
-                context=procedure.name,
-                exc=e,
-            )
datahub/ingestion/source/sql/sql_common.py
@@ -27,6 +27,7 @@ from sqlalchemy.exc import ProgrammingError
 from sqlalchemy.sql import sqltypes as types
 from sqlalchemy.types import TypeDecorator, TypeEngine

+from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
@@ -71,6 +72,11 @@ from datahub.ingestion.source.sql.sql_utils import (
 from datahub.ingestion.source.sql.sqlalchemy_data_reader import (
     SqlAlchemyTableDataReader,
 )
+from datahub.ingestion.source.sql.stored_procedures.base import (
+    BaseProcedure,
+    generate_procedure_container_workunits,
+    generate_procedure_workunits,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
 )
@@ -531,6 +537,24 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self.config.include_views:
             yield from self.loop_views(inspector, schema, self.config)

+        if getattr(self.config, "include_stored_procedures", False):
+            try:
+                yield from self.loop_stored_procedures(inspector, schema, self.config)
+            except NotImplementedError as e:
+                self.report.warning(
+                    title="Stored procedures not supported",
+                    message="The current SQL dialect does not support stored procedures.",
+                    context=f"{database}.{schema}",
+                    exc=e,
+                )
+            except Exception as e:
+                self.report.failure(
+                    title="Failed to list stored procedures for schema",
+                    message="An error occurred while listing procedures for the schema.",
+                    context=f"{database}.{schema}",
+                    exc=e,
+                )
+
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
@@ -1437,3 +1461,116 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):

     def get_report(self):
         return self.report
+
+    def loop_stored_procedures(
+        self,
+        inspector: Inspector,
+        schema: str,
+        config: Union[SQLCommonConfig, Type[SQLCommonConfig]],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Loop schema data for get stored procedures as dataJob-s.
+        """
+        db_name = self.get_db_name(inspector)
+
+        procedures = self.fetch_procedures_for_schema(inspector, schema, db_name)
+        if procedures:
+            yield from self._process_procedures(procedures, db_name, schema)
+
+    def fetch_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        try:
+            raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
+                inspector, schema, db_name
+            )
+            procedures: List[BaseProcedure] = []
+            for procedure in raw_procedures:
+                procedure_qualified_name = self.get_identifier(
+                    schema=schema,
+                    entity=procedure.name,
+                    inspector=inspector,
+                )
+
+                procedure_pattern = getattr(
+                    self.config, "procedure_pattern", AllowDenyPattern.allow_all()
+                )
+                if not procedure_pattern.allowed(procedure_qualified_name):
+                    self.report.report_dropped(procedure_qualified_name)
+                else:
+                    procedures.append(procedure)
+            return procedures
+        except NotImplementedError:
+            raise
+        except Exception as e:
+            self.report.warning(
+                title="Failed to get procedures for schema",
+                message="An error occurred while fetching procedures for the schema.",
+                context=f"{db_name}.{schema}",
+                exc=e,
+            )
+            return []
+
+    def get_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        raise NotImplementedError(
+            "Subclasses must implement the 'get_procedures_for_schema' method."
+        )
+
+    def _process_procedures(
+        self,
+        procedures: List[BaseProcedure],
+        db_name: str,
+        schema: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if procedures:
+            yield from generate_procedure_container_workunits(
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+            )
+        for procedure in procedures:
+            yield from self._process_procedure(procedure, schema, db_name)
+
+    def _process_procedure(
+        self,
+        procedure: BaseProcedure,
+        schema: str,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        try:
+            yield from generate_procedure_workunits(
+                procedure=procedure,
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_resolver=self.get_schema_resolver(),
+            )
+        except Exception as e:
+            self.report.warning(
+                title="Failed to emit stored procedure",
+                message="An error occurred while emitting stored procedure",
+                context=procedure.name,
+                exc=e,
+            )
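The sql_common.py additions above turn stored-procedure ingestion into a shared hook on SQLAlchemySource: the base class handles pattern filtering, container creation, and work-unit emission, and a dialect only needs to override get_procedures_for_schema (the base implementation raises NotImplementedError, which the caller reports as "not supported"). A minimal illustrative sketch of such an override, using a hypothetical MyDialectSource and eliding the row-to-BaseProcedure mapping:

    from typing import List

    from sqlalchemy.engine.reflection import Inspector

    from datahub.ingestion.source.sql.sql_common import SQLAlchemySource
    from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure


    class MyDialectSource(SQLAlchemySource):  # hypothetical subclass, for illustration only
        def get_procedures_for_schema(
            self, inspector: Inspector, schema: str, db_name: str
        ) -> List[BaseProcedure]:
            # Run a dialect-specific catalog query (e.g. against information_schema.routines)
            # and map each row to a BaseProcedure; that mapping is elided in this sketch.
            procedures: List[BaseProcedure] = []
            return procedures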
datahub/ingestion/source/superset.py
@@ -8,6 +8,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import dateutil.parser as dp
 import requests
+import sqlglot
 from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field
@@ -75,6 +76,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaFieldDataType,
     SchemaMetadata,
     StringTypeClass,
+    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -131,8 +133,11 @@ FIELD_TYPE_MAPPING = {
     "STRING": StringTypeClass,
     "FLOAT": NumberTypeClass,
     "DATETIME": DateTypeClass,
+    "TIMESTAMP": TimeTypeClass,
     "BOOLEAN": BooleanTypeClass,
     "SQL": StringTypeClass,
+    "NUMERIC": NumberTypeClass,
+    "TEXT": StringTypeClass,
 }

@@ -633,74 +638,130 @@ class SupersetSource(StatefulIngestionSourceBase):

         return input_fields

-    def construct_chart_cll(
-        self,
-        chart_data: dict,
-        datasource_urn: Union[str, None],
-        datasource_id: Union[Any, int],
-    ) -> List[InputField]:
-        column_data: List[Union[str, dict]] = chart_data.get("form_data", {}).get(
-            "all_columns", []
-        )
+    def _extract_columns_from_sql(self, sql_expr: Optional[str]) -> List[str]:
+        if not sql_expr:
+            return []

-        # the second field represents whether its a SQL expression,
-        # false being just regular column and true being SQL col
-        chart_column_data: List[Tuple[str, bool]] = [
-            (column, False)
-            if isinstance(column, str)
-            else (column.get("label", ""), True)
-            for column in column_data
-        ]
+        try:
+            parsed_expr = sqlglot.parse_one(sql_expr)

-        dataset_columns: List[Tuple[str, str, str]] = []
+            column_refs = set()
+            for node in parsed_expr.walk():
+                if isinstance(node, sqlglot.exp.Column):
+                    column_name = node.name
+                    column_refs.add(column_name)

-        # parses the superset dataset's column info, to build type and description info
-        if datasource_id:
-            dataset_info = self.get_dataset_info(datasource_id).get("result", {})
-            dataset_column_info = dataset_info.get("columns", [])
-            dataset_metric_info = dataset_info.get("metrics", [])
+            return list(column_refs)
+        except Exception as e:
+            self.report.warning(f"Failed to parse SQL expression '{sql_expr}': {e}")
+            return []

-            for column in dataset_column_info:
-                col_name = column.get("column_name", "")
-                col_type = column.get("type", "")
-                col_description = column.get("description", "")
+    def _process_column_item(
+        self, item: Union[str, dict], unique_columns: Dict[str, bool]
+    ) -> None:
+        """Process a single column item and add to unique_columns."""

-                # if missing column name or column type, cannot construct the column,
-                # so we skip this column, missing description is fine
-                if col_name == "" or col_type == "":
-                    logger.info(f"could not construct column lineage for {column}")
-                    continue
+        def add_column(col_name: str, is_sql: bool) -> None:
+            if not col_name:
+                return
+            # Always set to False if any non-SQL seen, else keep as is_sql
+            unique_columns[col_name] = unique_columns.get(col_name, True) and is_sql
+
+        if isinstance(item, str):
+            add_column(item, False)
+        elif isinstance(item, dict):
+            if item.get("expressionType") == "SIMPLE":
+                # For metrics with SIMPLE expression type
+                add_column(item.get("column", {}).get("column_name", ""), False)
+            elif item.get("expressionType") == "SQL":
+                sql_expr = item.get("sqlExpression")
+                column_refs = self._extract_columns_from_sql(sql_expr)
+                for col in column_refs:
+                    add_column(col, False)
+                if not column_refs:
+                    add_column(item.get("label", ""), True)
+
+    def _collect_all_unique_columns(self, form_data: dict) -> Dict[str, bool]:
+        """Collect all unique column names from form_data, distinguishing SQL vs non-SQL."""
+        unique_columns: Dict[str, bool] = {}
+
+        # Process regular columns
+        for column in form_data.get("all_columns", []):
+            self._process_column_item(column, unique_columns)
+
+        # Process metrics
+        # For charts with a single metric, the metric is stored in the form_data as a string in the 'metric' key
+        # For charts with multiple metrics, the metrics are stored in the form_data as a list of strings in the 'metrics' key
+        if "metric" in form_data:
+            metrics_data = [form_data.get("metric")]
+        else:
+            metrics_data = form_data.get("metrics", [])

-                dataset_columns.append((col_name, col_type, col_description))
+        for metric in metrics_data:
+            if metric is not None:
+                self._process_column_item(metric, unique_columns)

-            for metric in dataset_metric_info:
-                metric_name = metric.get("metric_name", "")
-                metric_type = metric.get("metric_type", "")
-                metric_description = metric.get("description", "")
+        # Process group by columns
+        for group in form_data.get("groupby", []):
+            self._process_column_item(group, unique_columns)

-                if metric_name == "" or metric_type == "":
-                    logger.info(f"could not construct metric lineage for {metric}")
-                    continue
+        # Process x-axis columns
+        x_axis_data = form_data.get("x_axis")
+        if x_axis_data is not None:
+            self._process_column_item(x_axis_data, unique_columns)

-                dataset_columns.append((metric_name, metric_type, metric_description))
-        else:
-            # if no datasource id, cannot build cll, just return
+        return unique_columns
+
+    def _fetch_dataset_columns(
+        self, datasource_id: Union[Any, int]
+    ) -> List[Tuple[str, str, str]]:
+        """Fetch dataset columns and metrics from Superset API."""
+        if not datasource_id:
             logger.warning(
                 "no datasource id was found, cannot build column level lineage"
             )
             return []

+        dataset_info = self.get_dataset_info(datasource_id).get("result", {})
+        dataset_column_info = dataset_info.get("columns", [])
+        dataset_metric_info = dataset_info.get("metrics", [])
+
+        dataset_columns: List[Tuple[str, str, str]] = []
+        for column in dataset_column_info:
+            col_name = column.get("column_name", "")
+            col_type = column.get("type", "")
+            col_description = column.get("description", "")
+
+            if col_name == "" or col_type == "":
+                logger.info(f"could not construct column lineage for {column}")
+                continue
+
+            dataset_columns.append((col_name, col_type, col_description))
+
+        for metric in dataset_metric_info:
+            metric_name = metric.get("metric_name", "")
+            metric_type = metric.get("metric_type", "")
+            metric_description = metric.get("description", "")
+
+            if metric_name == "" or metric_type == "":
+                logger.info(f"could not construct metric lineage for {metric}")
+                continue
+
+            dataset_columns.append((metric_name, metric_type, metric_description))
+
+        return dataset_columns
+
+    def _match_chart_columns_with_dataset(
+        self,
+        unique_chart_columns: Dict[str, bool],
+        dataset_columns: List[Tuple[str, str, str]],
+    ) -> List[Tuple[str, str, str]]:
+        """Match chart columns with dataset columns, preserving SQL/non-SQL status."""
         chart_columns: List[Tuple[str, str, str]] = []
-        for chart_col in chart_column_data:
-            chart_col_name, is_sql = chart_col
+
+        for chart_col_name, is_sql in unique_chart_columns.items():
             if is_sql:
-                chart_columns.append(
-                    (
-                        chart_col_name,
-                        "SQL",
-                        "",
-                    )
-                )
+                chart_columns.append((chart_col_name, "SQL", ""))
                 continue

             # find matching upstream column
@@ -711,13 +772,36 @@ class SupersetSource(StatefulIngestionSourceBase):
                 if dataset_col_name == chart_col_name:
                     chart_columns.append(
                         (chart_col_name, dataset_col_type, dataset_col_description)
-                    )  # column name, column type, description
+                    )
                     break
-
-        # if no matching upstream column was found
-        if len(chart_columns) == 0 or chart_columns[-1][0] != chart_col_name:
+            else:
                 chart_columns.append((chart_col_name, "", ""))

+        return chart_columns
+
+    def construct_chart_cll(
+        self,
+        chart_data: dict,
+        datasource_urn: Union[str, None],
+        datasource_id: Union[Any, int],
+    ) -> List[InputField]:
+        """Construct column-level lineage for a chart."""
+        form_data = chart_data.get("form_data", {})
+
+        # Extract and process all columns in one go
+        unique_columns = self._collect_all_unique_columns(form_data)
+
+        # Fetch dataset columns
+        dataset_columns = self._fetch_dataset_columns(datasource_id)
+        if not dataset_columns:
+            return []
+
+        # Match chart columns with dataset columns
+        chart_columns = self._match_chart_columns_with_dataset(
+            unique_columns, dataset_columns
+        )
+
+        # Build input fields
         return self.build_input_fields(chart_columns, datasource_urn)

     def construct_chart_from_chart_data(
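For context on the _extract_columns_from_sql helper introduced above: sqlglot parses the chart's SQL expression, and every Column node found while walking the parse tree contributes a referenced column name. A standalone sketch of the same idea (the example expression and printed result are illustrative, and it assumes a sqlglot version whose walk() yields nodes, as the code above does):

    import sqlglot

    # Walk the parsed expression and collect every column reference it mentions.
    expr = sqlglot.parse_one("SUM(price * quantity) / NULLIF(COUNT(order_id), 0)")
    columns = sorted(
        {node.name for node in expr.walk() if isinstance(node, sqlglot.exp.Column)}
    )
    print(columns)  # ['order_id', 'price', 'quantity']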
datahub/ingestion/source/unity/config.py
@@ -275,6 +275,17 @@ class UnityCatalogSourceConfig(
         hidden_from_docs=True,
     )

+    databricks_api_page_size: int = pydantic.Field(
+        default=0,
+        ge=0,
+        description=(
+            "Page size for Databricks API calls when listing resources (catalogs, schemas, tables, etc.). "
+            "When set to 0 (default), uses server-side configured page length (recommended). "
+            "When set to a positive value, the page length is the minimum of this value and the server configured value. "
+            "Must be a non-negative integer."
+        ),
+    )
+
     include_usage_statistics: bool = Field(
         default=True,
         description="Generate usage statistics.",
datahub/ingestion/source/unity/connection_test.py
@@ -19,6 +19,7 @@ class UnityCatalogConnectionTest:
             self.config.token,
             self.config.profiling.warehouse_id,
             report=self.report,
+            databricks_api_page_size=self.config.databricks_api_page_size,
         )

     def get_connection_test(self) -> TestConnectionReport:
datahub/ingestion/source/unity/platform_resource_repository.py (new file)
@@ -0,0 +1,19 @@
+import logging
+
+from datahub.api.entities.external.external_entities import (
+    PlatformResourceRepository,
+)
+from datahub.ingestion.source.unity.tag_entities import (
+    UnityCatalogTagPlatformResource,
+    UnityCatalogTagPlatformResourceId,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class UnityCatalogPlatformResourceRepository(
+    PlatformResourceRepository[
+        UnityCatalogTagPlatformResourceId, UnityCatalogTagPlatformResource
+    ]
+):
+    """Unity Catalog-specific platform resource repository with tag-related operations."""