acryl-datahub 1.0.0.1rc4__py3-none-any.whl → 1.0.0.1rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (29)
  1. {acryl_datahub-1.0.0.1rc4.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/METADATA +2563 -2563
  2. {acryl_datahub-1.0.0.1rc4.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/RECORD +29 -29
  3. datahub/_version.py +1 -1
  4. datahub/cli/specific/dataset_cli.py +26 -10
  5. datahub/emitter/mce_builder.py +1 -3
  6. datahub/emitter/mcp_builder.py +8 -0
  7. datahub/emitter/response_helper.py +25 -18
  8. datahub/emitter/rest_emitter.py +21 -5
  9. datahub/errors.py +4 -0
  10. datahub/ingestion/graph/client.py +2 -2
  11. datahub/ingestion/sink/datahub_rest.py +2 -2
  12. datahub/ingestion/source/common/subtypes.py +1 -0
  13. datahub/ingestion/source/iceberg/iceberg.py +159 -102
  14. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  15. datahub/ingestion/source/powerbi/config.py +31 -4
  16. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  17. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  18. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  19. datahub/ingestion/source/powerbi/powerbi.py +12 -1
  20. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  21. datahub/ingestion/source/redshift/lineage_v2.py +2 -1
  22. datahub/ingestion/source/sigma/config.py +3 -4
  23. datahub/ingestion/source/sigma/sigma.py +10 -6
  24. datahub/ingestion/source/sql/oracle.py +51 -4
  25. datahub/ingestion/source/usage/usage_common.py +0 -65
  26. {acryl_datahub-1.0.0.1rc4.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/WHEEL +0 -0
  27. {acryl_datahub-1.0.0.1rc4.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/entry_points.txt +0 -0
  28. {acryl_datahub-1.0.0.1rc4.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/licenses/LICENSE +0 -0
  29. {acryl_datahub-1.0.0.1rc4.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10

@@ -30,7 +30,13 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     ReferencedTable,
 )
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
-from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
+from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
+from datahub.sql_parsing.sqlglot_lineage import (
+    ColumnLineageInfo,
+    ColumnRef,
+    DownstreamColumnRef,
+    SqlParsingResult,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -262,6 +268,33 @@ class AbstractLineage(ABC):
             ),
         )
 
+    def create_table_column_lineage(self, urn: str) -> List[ColumnLineageInfo]:
+        column_lineage = []
+
+        if self.table.columns is not None:
+            for column in self.table.columns:
+                downstream = DownstreamColumnRef(
+                    table=self.table.name,
+                    column=column.name,
+                    column_type=SchemaFieldDataTypeClass(type=column.datahubDataType),
+                    native_column_type=column.dataType or "UNKNOWN",
+                )
+
+                upstreams = [
+                    ColumnRef(
+                        table=urn,
+                        column=column.name.lower(),
+                    )
+                ]
+
+                column_lineage_info = ColumnLineageInfo(
+                    downstream=downstream, upstreams=upstreams
+                )
+
+                column_lineage.append(column_lineage_info)
+
+        return column_lineage
+
 
 class AmazonRedshiftLineage(AbstractLineage):
     def get_platform_pair(self) -> DataPlatformPair:
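For context, a minimal sketch of the objects the new create_table_column_lineage helper assembles, using an invented upstream URN and a single invented column. The class and field names come from the imports added above; the values are purely illustrative.

from datahub.metadata.schema_classes import SchemaFieldDataTypeClass, StringTypeClass
from datahub.sql_parsing.sqlglot_lineage import (
    ColumnLineageInfo,
    ColumnRef,
    DownstreamColumnRef,
)

# Hypothetical upstream dataset and PowerBI column.
upstream_urn = "urn:li:dataset:(urn:li:dataPlatform:mysql,sales_db.public.orders,PROD)"

lineage_entry = ColumnLineageInfo(
    downstream=DownstreamColumnRef(
        table="orders",  # PowerBI table name
        column="CUSTOMER_ID",  # PowerBI column name
        column_type=SchemaFieldDataTypeClass(type=StringTypeClass()),
        native_column_type="VARCHAR",
    ),
    # The helper lower-cases the column name for the upstream reference.
    upstreams=[ColumnRef(table=upstream_urn, column="customer_id")],
)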
@@ -299,6 +332,8 @@ class AmazonRedshiftLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )
 
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -306,7 +341,7 @@ class AmazonRedshiftLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=[],
+            column_lineage=column_lineage,
         )
 
 
@@ -364,6 +399,8 @@ class OracleLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )
 
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -371,7 +408,7 @@ class OracleLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=[],
+            column_lineage=column_lineage,
        )
 
 
@@ -449,6 +486,8 @@ class DatabricksLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )
 
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -456,7 +495,7 @@ class DatabricksLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=[],
+            column_lineage=column_lineage,
         )
 
         return Lineage.empty()
@@ -509,6 +548,9 @@ class TwoStepDataAccessPattern(AbstractLineage, ABC):
             server=server,
             qualified_table_name=qualified_table_name,
         )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -516,10 +558,62 @@ class TwoStepDataAccessPattern(AbstractLineage, ABC):
                     urn=urn,
                 )
             ],
-            column_lineage=[],
+            column_lineage=column_lineage,
         )
 
 
+class MySQLLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
+        )
+
+        server, db_name = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+        if server is None or db_name is None:
+            return Lineage.empty()  # Return an empty list
+
+        schema_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Schema"]
+
+        table_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Item"]
+
+        qualified_table_name: str = f"{schema_name}.{table_name}"
+
+        logger.debug(
+            f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=self.get_platform_pair(),
+            server=server,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.MYSQL.value
+
+
 class PostgresLineage(TwoStepDataAccessPattern):
     def create_lineage(
         self, data_access_func_detail: DataAccessFunctionDetail
@@ -671,6 +765,8 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
             qualified_table_name=qualified_table_name,
         )
 
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -678,7 +774,7 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
                     urn=urn,
                 )
             ],
-            column_lineage=[],
+            column_lineage=column_lineage,
         )
 
 
@@ -726,6 +822,7 @@ class NativeQueryLineage(AbstractLineage):
 
         tables: List[str] = native_sql_parser.get_tables(query)
 
+        column_lineage = []
         for qualified_table_name in tables:
             if len(qualified_table_name.split(".")) != 3:
                 logger.debug(
@@ -748,12 +845,11 @@ class NativeQueryLineage(AbstractLineage):
                 )
             )
 
+            column_lineage = self.create_table_column_lineage(urn)
+
         logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")
 
-        return Lineage(
-            upstreams=dataplatform_tables,
-            column_lineage=[],
-        )
+        return Lineage(upstreams=dataplatform_tables, column_lineage=column_lineage)
 
     def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
         if (
@@ -885,6 +981,11 @@ class SupportedPattern(Enum):
         FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
     )
 
+    MYSQL = (
+        MySQLLineage,
+        FunctionName.MYSQL_DATA_ACCESS,
+    )
+
     NATIVE_QUERY = (
         NativeQueryLineage,
         FunctionName.NATIVE_QUERY,
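For illustration, a rough sketch of how the new MYSQL entry would be resolved, based on the get_pattern_handler/handler calls visible in the resolver hunks below. The "MySQL.Database" function name (assumed to be the Power Query connector behind FunctionName.MYSQL_DATA_ACCESS) and the M expression in the comment are assumptions, not taken from this diff.

from datahub.ingestion.source.powerbi.m_query.pattern_handler import SupportedPattern

# An M expression of roughly this shape is what the handler targets (assumed example):
#   Source = MySQL.Database("mysql-host:3306", "sales_db"),
#   orders = Source{[Schema="public", Item="orders"]}[Data]

pattern = SupportedPattern.get_pattern_handler("MySQL.Database")
if pattern is not None:
    handler_cls = pattern.handler()  # expected to be MySQLLineage
    print(handler_cls.__name__)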
datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0

@@ -361,6 +361,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
         )
 
         if output_variable is None:
+            logger.debug(
+                f"Table: {self.table.full_name}: output-variable not found in tree"
+            )
             self.reporter.report_warning(
                 f"{self.table.full_name}-output-variable",
                 "output-variable not found in table expression",
@@ -374,6 +377,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
 
         # Each item is data-access function
         for f_detail in table_links:
+            logger.debug(
+                f"Processing data-access-function {f_detail.data_access_function_name}"
+            )
             # Get & Check if we support data-access-function available in M-Query
             supported_resolver = SupportedPattern.get_pattern_handler(
                 f_detail.data_access_function_name
@@ -390,6 +396,10 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
 
             # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
             # & also pass additional information that will be need to generate lineage
+            logger.debug(
+                f"Creating instance of {supported_resolver.handler().__name__} "
+                f"for data-access-function {f_detail.data_access_function_name}"
+            )
             pattern_handler: AbstractLineage = supported_resolver.handler()(
                 ctx=ctx,
                 table=self.table,
datahub/ingestion/source/powerbi/powerbi.py +12 -1

@@ -3,6 +3,7 @@
 # Meta Data Ingestion From the Power BI Source
 #
 #########################################################
+import functools
 import logging
 from datetime import datetime
 from typing import Iterable, List, Optional, Tuple, Union
@@ -24,6 +25,7 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.incremental_lineage_helper import (
+    auto_incremental_lineage,
     convert_dashboard_info_to_patch,
 )
 from datahub.ingestion.api.source import (
@@ -238,6 +240,10 @@ class Mapper:
         upstream: List[UpstreamClass] = []
         cll_lineage: List[FineGrainedLineage] = []
 
+        logger.debug(
+            f"Extracting lineage for table {table.full_name} in dataset {table.dataset.name if table.dataset else None}"
+        )
+
         upstream_lineage: List[
             datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
         ] = parser.get_upstream_tables(
@@ -1306,7 +1312,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
 
         allowed_workspaces = []
         for workspace in all_workspaces:
-            if not self.source_config.workspace_id_pattern.allowed(workspace.id):
+            if not self.source_config.workspace_id_pattern.allowed(
+                workspace.id
+            ) or not self.source_config.workspace_name_pattern.allowed(workspace.name):
                 self.reporter.filtered_workspace_names.append(
                     f"{workspace.id} - {workspace.name}"
                 )
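To make the new workspace filtering concrete: a workspace is now kept only if both its id and its name pass their respective patterns. The sketch below assumes the patterns are AllowDenyPattern instances (as DataHub source configs typically use) and invents the values.

from datahub.configuration.common import AllowDenyPattern

# Hypothetical configuration: any workspace id, but only names starting with "Finance".
workspace_id_pattern = AllowDenyPattern.allow_all()
workspace_name_pattern = AllowDenyPattern(allow=["Finance.*"])

def is_ingested(workspace_id: str, workspace_name: str) -> bool:
    # Mirrors the new condition: filtered out if either pattern rejects the workspace.
    return workspace_id_pattern.allowed(workspace_id) and workspace_name_pattern.allowed(
        workspace_name
    )

print(is_ingested("1234", "Finance KPIs"))  # True
print(is_ingested("1234", "Marketing"))  # False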
@@ -1522,6 +1530,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
         else:
             return [
                 *super().get_workunit_processors(),
+                functools.partial(
+                    auto_incremental_lineage, self.source_config.incremental_lineage
+                ),
                 self.stale_entity_removal_handler.workunit_processor,
             ]
 
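A rough sketch of why functools.partial fits here: a workunit processor is assumed to be a callable that takes the stream of workunits and yields a (possibly transformed) stream, so binding the incremental_lineage flag up front leaves a single-argument callable for the framework to invoke. The processor below is a stand-in, not the real auto_incremental_lineage.

import functools
from typing import Iterable, Iterator

def tag_stream(prefix: str, stream: Iterable[str]) -> Iterator[str]:
    # Stand-in for a workunit processor: consume a stream, yield a transformed one.
    for item in stream:
        yield f"{prefix}:{item}"

# Bind the configuration value now; the remaining positional argument is the stream.
processor = functools.partial(tag_stream, "incremental")

print(list(processor(["wu-1", "wu-2"])))  # ['incremental:wu-1', 'incremental:wu-2']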
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11

@@ -115,7 +115,7 @@ class PowerBiAPI:
         if scan_result is None:
             return results
 
-        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS, []):
+        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS) or []:
             # Iterate through response and create a list of PowerBiAPI.Dashboard
             dashboard_id = scanned_dashboard.get("id")
             tags = self._parse_endorsement(
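The repeated .get(key, []) to .get(key) or [] changes in this file guard against keys that are present but explicitly set to None in the scan response; the two-argument default only applies when the key is missing. A small standalone illustration:

scan_result = {"dashboards": None}  # key exists, but the API returned null

# The default is only used when the key is missing, so this still yields None:
print(scan_result.get("dashboards", []))  # None

# "or []" also covers the explicit-null case:
print(scan_result.get("dashboards") or [])  # []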
@@ -133,17 +133,17 @@ class PowerBiAPI:
         if scan_result is None:
             return results
 
-        reports: List[dict] = scan_result.get(Constant.REPORTS, [])
+        reports: List[dict] = scan_result.get(Constant.REPORTS) or []
 
         for report in reports:
-            report_id = report.get(Constant.ID, None)
+            report_id = report.get(Constant.ID)
             if report_id is None:
                 logger.warning(
                     f"Report id is none. Skipping endorsement tag for report instance {report}"
                 )
                 continue
             endorsements = self._parse_endorsement(
-                report.get(Constant.ENDORSEMENT_DETAIL, None)
+                report.get(Constant.ENDORSEMENT_DETAIL)
             )
             results[report_id] = endorsements
 
@@ -339,7 +339,7 @@ class PowerBiAPI:
         if not endorsements:
             return []
 
-        endorsement = endorsements.get(Constant.ENDORSEMENT, None)
+        endorsement = endorsements.get(Constant.ENDORSEMENT)
         if not endorsement:
             return []
 
@@ -396,7 +396,7 @@ class PowerBiAPI:
 
             if self.__config.extract_endorsements_to_tags:
                 dataset_instance.tags = self._parse_endorsement(
-                    dataset_dict.get(Constant.ENDORSEMENT_DETAIL, None)
+                    dataset_dict.get(Constant.ENDORSEMENT_DETAIL)
                 )
 
             dataset_map[dataset_instance.id] = dataset_instance
@@ -407,7 +407,7 @@ class PowerBiAPI:
                 else dataset_instance.id
             )
             logger.debug(f"dataset_dict = {dataset_dict}")
-            for table in dataset_dict.get(Constant.TABLES, []):
+            for table in dataset_dict.get(Constant.TABLES) or []:
                 expression: Optional[str] = (
                     table[Constant.SOURCE][0][Constant.EXPRESSION]
                     if table.get(Constant.SOURCE) is not None
@@ -430,10 +430,10 @@ class PowerBiAPI:
                             column["dataType"], FIELD_TYPE_MAPPING["Null"]
                         ),
                     )
-                    for column in table.get("columns", [])
+                    for column in table.get("columns") or []
                 ],
                 measures=[
-                    Measure(**measure) for measure in table.get("measures", [])
+                    Measure(**measure) for measure in table.get("measures") or []
                 ],
                 dataset=dataset_instance,
                 row_count=None,
@@ -480,7 +480,7 @@ class PowerBiAPI:
                 )
             )
             if app_id is None:  # In PowerBI one workspace can have one app
-                app_id = report.get(Constant.APP_ID)
+                app_id = report[Constant.APP_ID]
 
         raw_app_dashboards: List[Dict] = []
         # Filter app dashboards
@@ -488,7 +488,7 @@ class PowerBiAPI:
             if dashboard.get(Constant.APP_ID):
                 raw_app_dashboards.append(dashboard)
             if app_id is None:  # In PowerBI, one workspace contains one app
-                app_id = report[Constant.APP_ID]
+                app_id = dashboard[Constant.APP_ID]
 
         # workspace doesn't have an App. Above two loops can be avoided
         # if app_id is available at root level in workspace_metadata
datahub/ingestion/source/redshift/lineage_v2.py +2 -1

@@ -230,7 +230,8 @@ class RedshiftSqlLineageV2(Closeable):
         )
 
         # Populate lineage for external tables.
-        self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
+        if not self.config.skip_external_tables:
+            self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
 
     def _populate_lineage_agg(
         self,
datahub/ingestion/source/sigma/config.py +3 -4

@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.ingestion.api.report import EntityFilterReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
     StatefulStaleMetadataRemovalConfig,
@@ -54,16 +55,14 @@ class Constant:
 
 @dataclass
 class SigmaSourceReport(StaleEntityRemovalSourceReport):
-    number_of_workspaces: int = 0
+    workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")
+    number_of_workspaces: Optional[int] = None
     non_accessible_workspaces_count: int = 0
     shared_entities_count: int = 0
     number_of_datasets: int = 0
     number_of_workbooks: int = 0
     number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
 
-    def report_number_of_workspaces(self, number_of_workspaces: int) -> None:
-        self.number_of_workspaces = number_of_workspaces
-
 
 class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
     data_source_platform: str = pydantic.Field(
datahub/ingestion/source/sigma/sigma.py +10 -6

@@ -162,14 +162,17 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
     def _get_allowed_workspaces(self) -> List[Workspace]:
         all_workspaces = self.sigma_api.workspaces.values()
-        allowed_workspaces = [
-            workspace
-            for workspace in all_workspaces
-            if self.config.workspace_pattern.allowed(workspace.name)
-        ]
         logger.info(f"Number of workspaces = {len(all_workspaces)}")
-        self.reporter.report_number_of_workspaces(len(all_workspaces))
+        self.reporter.number_of_workspaces = len(all_workspaces)
+
+        allowed_workspaces = []
+        for workspace in all_workspaces:
+            if self.config.workspace_pattern.allowed(workspace.name):
+                allowed_workspaces.append(workspace)
+            else:
+                self.reporter.workspaces.dropped(workspace.workspaceId)
         logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")
+
         return allowed_workspaces
 
     def _gen_workspace_workunit(
@@ -658,6 +661,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield from self._gen_workbook_workunit(workbook)
 
         for workspace in self._get_allowed_workspaces():
+            self.reporter.workspaces.processed(workspace.workspaceId)
             yield from self._gen_workspace_workunit(workspace)
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()
 
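A rough sketch of how the new EntityFilterReport field is exercised, using only the calls visible in this diff (dropped for filtered workspaces, processed for ingested ones); the report class and workspace ids below are invented for illustration.

from dataclasses import dataclass

from datahub.ingestion.api.report import EntityFilterReport

@dataclass
class DemoReport:
    # Same declaration style as the new SigmaSourceReport field.
    workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")

report = DemoReport()
report.workspaces.processed("workspace-1")  # passed the workspace_pattern filter
report.workspaces.dropped("workspace-2")  # rejected by the pattern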
datahub/ingestion/source/sql/oracle.py +51 -4

@@ -31,7 +31,9 @@ from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     make_sqlalchemy_type,
 )
-from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
+from datahub.ingestion.source.sql.sql_config import (
+    BasicSQLAlchemyConfig,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -71,10 +73,12 @@ class OracleConfig(BasicSQLAlchemyConfig):
         description="Will be set automatically to default value.",
     )
     service_name: Optional[str] = Field(
-        default=None, description="Oracle service name. If using, omit `database`."
+        default=None,
+        description="Oracle service name. If using, omit `database`.",
     )
     database: Optional[str] = Field(
-        default=None, description="If using, omit `service_name`."
+        default=None,
+        description="If using, omit `service_name`.",
     )
     add_database_name_to_urn: Optional[bool] = Field(
         default=False,
@@ -631,7 +635,6 @@ class OracleSource(SQLAlchemySource):
     - Table, row, and column statistics via optional SQL profiling
 
     Using the Oracle source requires that you've also installed the correct drivers; see the [cx_Oracle docs](https://cx-oracle.readthedocs.io/en/latest/user_guide/installation.html). The easiest one is the [Oracle Instant Client](https://www.oracle.com/database/technologies/instant-client.html).
-
     """
 
     config: OracleConfig
@@ -661,6 +664,8 @@ class OracleSource(SQLAlchemySource):
         database name from Connection URL, which does not work when using
         service instead of database.
         In that case, it tries to retrieve the database name by sending a query to the DB.
+
+        Note: This is used as a fallback if database is not specified in the config.
         """
 
         # call default implementation first
@@ -687,7 +692,49 @@ class OracleSource(SQLAlchemySource):
         # To silent the mypy lint error
         yield cast(Inspector, inspector)
 
+    def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
+        """
+        Override the get_db_schema method to ensure proper schema name extraction.
+        This method is used during view lineage extraction to determine the default schema
+        for unqualified table names in view definitions.
+        """
+        try:
+            # Try to get the schema from the dataset identifier
+            parts = dataset_identifier.split(".")
+
+            # Handle the identifier format differently based on add_database_name_to_urn flag
+            if self.config.add_database_name_to_urn:
+                if len(parts) >= 3:
+                    # Format is: database.schema.view when add_database_name_to_urn=True
+                    db_name = parts[-3]
+                    schema_name = parts[-2]
+                    return db_name, schema_name
+                elif len(parts) >= 2:
+                    # Handle the case where database might be missing even with flag enabled
+                    # If we have a database in the config, use that
+                    db_name = str(self.config.database)
+                    schema_name = parts[-2]
+                    return db_name, schema_name
+            else:
+                # Format is: schema.view when add_database_name_to_urn=False
+                if len(parts) >= 2:
+                    # When add_database_name_to_urn is False, don't include database in the result
+                    db_name = None
+                    schema_name = parts[-2]
+                    return db_name, schema_name
+        except Exception as e:
+            logger.warning(
+                f"Error extracting schema from identifier {dataset_identifier}: {e}"
+            )
+
+        # Fall back to parent implementation if our approach fails
+        db_name, schema_name = super().get_db_schema(dataset_identifier)
+        return db_name, schema_name
+
     def get_workunits(self):
+        """
+        Override get_workunits to patch Oracle dialect for custom types.
+        """
         with patch.dict(
             "sqlalchemy.dialects.oracle.base.OracleDialect.ischema_names",
             {klass.__name__: klass for klass in extra_oracle_types},
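To make the identifier handling above concrete, here is a small standalone reproduction of the splitting logic with invented dataset identifiers; it mirrors only what the hunk shows, not the full source.

from typing import Optional, Tuple

def split_identifier(
    dataset_identifier: str,
    add_database_name_to_urn: bool,
    configured_database: Optional[str] = None,
) -> Tuple[Optional[str], str]:
    # Simplified mirror of the overridden OracleSource.get_db_schema above.
    parts = dataset_identifier.split(".")
    if add_database_name_to_urn:
        if len(parts) >= 3:
            return parts[-3], parts[-2]  # database.schema.view
        return str(configured_database), parts[-2]
    return None, parts[-2]  # schema.view

print(split_identifier("ORCL.HR.EMP_VIEW", add_database_name_to_urn=True))  # ('ORCL', 'HR')
print(split_identifier("HR.EMP_VIEW", add_database_name_to_urn=False))  # (None, 'HR')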
datahub/ingestion/source/usage/usage_common.py +0 -65

@@ -12,11 +12,9 @@ from typing import (
     Optional,
     Tuple,
     TypeVar,
-    Union,
 )
 
 import pydantic
-from deprecated import deprecated
 from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
@@ -28,19 +26,13 @@ from datahub.configuration.time_window_config import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetUsageStatistics
 from datahub.metadata.schema_classes import (
-    CalendarIntervalClass,
     DatasetFieldUsageCountsClass,
     DatasetUsageStatisticsClass,
     DatasetUserUsageCountsClass,
     TimeWindowSizeClass,
-    UsageAggregationClass,
-    WindowDurationClass,
 )
 from datahub.utilities.sql_formatter import format_sql_query, trim_query
-from datahub.utilities.urns.dataset_urn import DatasetUrn
-from datahub.utilities.urns.urn import guess_entity_type
 
 logger = logging.getLogger(__name__)
 
@@ -295,60 +287,3 @@ class UsageAggregator(Generic[ResourceType]):
             user_urn_builder=user_urn_builder,
             queries_character_limit=self.config.queries_character_limit,
         )
-
-
-@deprecated
-def convert_usage_aggregation_class(
-    obj: UsageAggregationClass,
-) -> MetadataChangeProposalWrapper:
-    # Legacy usage aggregation only supported dataset usage stats
-    if guess_entity_type(obj.resource) == DatasetUrn.ENTITY_TYPE:
-        aspect = DatasetUsageStatistics(
-            timestampMillis=obj.bucket,
-            eventGranularity=TimeWindowSizeClass(
-                unit=convert_window_to_interval(obj.duration)
-            ),
-            uniqueUserCount=obj.metrics.uniqueUserCount,
-            totalSqlQueries=obj.metrics.totalSqlQueries,
-            topSqlQueries=obj.metrics.topSqlQueries,
-            userCounts=(
-                [
-                    DatasetUserUsageCountsClass(
-                        user=u.user, count=u.count, userEmail=u.userEmail
-                    )
-                    for u in obj.metrics.users
-                    if u.user is not None
-                ]
-                if obj.metrics.users
-                else None
-            ),
-            fieldCounts=(
-                [
-                    DatasetFieldUsageCountsClass(fieldPath=f.fieldName, count=f.count)
-                    for f in obj.metrics.fields
-                ]
-                if obj.metrics.fields
-                else None
-            ),
-        )
-        return MetadataChangeProposalWrapper(entityUrn=obj.resource, aspect=aspect)
-    else:
-        raise Exception(
-            f"Skipping unsupported usage aggregation - invalid entity type: {obj}"
-        )
-
-
-@deprecated
-def convert_window_to_interval(window: Union[str, WindowDurationClass]) -> str:
-    if window == WindowDurationClass.YEAR:
-        return CalendarIntervalClass.YEAR
-    elif window == WindowDurationClass.MONTH:
-        return CalendarIntervalClass.MONTH
-    elif window == WindowDurationClass.WEEK:
-        return CalendarIntervalClass.WEEK
-    elif window == WindowDurationClass.DAY:
-        return CalendarIntervalClass.DAY
-    elif window == WindowDurationClass.HOUR:
-        return CalendarIntervalClass.HOUR
-    else:
-        raise Exception(f"Unsupported window duration: {window}")