acryl-datahub 1.0.0.1rc5__py3-none-any.whl → 1.0.0.1rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/METADATA +2578 -2578
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/RECORD +24 -24
- datahub/_version.py +1 -1
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/rest_emitter.py +13 -5
- datahub/ingestion/graph/client.py +2 -2
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/iceberg/iceberg.py +159 -102
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +12 -1
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/usage/usage_common.py +0 -65
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/m_query/pattern_handler.py

@@ -30,7 +30,13 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     ReferencedTable,
 )
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
-from datahub.
+from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
+from datahub.sql_parsing.sqlglot_lineage import (
+    ColumnLineageInfo,
+    ColumnRef,
+    DownstreamColumnRef,
+    SqlParsingResult,
+)

 logger = logging.getLogger(__name__)

@@ -262,6 +268,33 @@ class AbstractLineage(ABC):
             ),
         )

+    def create_table_column_lineage(self, urn: str) -> List[ColumnLineageInfo]:
+        column_lineage = []
+
+        if self.table.columns is not None:
+            for column in self.table.columns:
+                downstream = DownstreamColumnRef(
+                    table=self.table.name,
+                    column=column.name,
+                    column_type=SchemaFieldDataTypeClass(type=column.datahubDataType),
+                    native_column_type=column.dataType or "UNKNOWN",
+                )
+
+                upstreams = [
+                    ColumnRef(
+                        table=urn,
+                        column=column.name.lower(),
+                    )
+                ]
+
+                column_lineage_info = ColumnLineageInfo(
+                    downstream=downstream, upstreams=upstreams
+                )
+
+                column_lineage.append(column_lineage_info)
+
+        return column_lineage
+

 class AmazonRedshiftLineage(AbstractLineage):
     def get_platform_pair(self) -> DataPlatformPair:
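For orientation, here is a minimal sketch (not part of the diff) of the per-column object the new helper emits; the table name, urn, and types are illustrative:

    # Illustrative only: the structure built by create_table_column_lineage.
    from datahub.metadata.schema_classes import NumberTypeClass, SchemaFieldDataTypeClass
    from datahub.sql_parsing.sqlglot_lineage import (
        ColumnLineageInfo,
        ColumnRef,
        DownstreamColumnRef,
    )

    info = ColumnLineageInfo(
        downstream=DownstreamColumnRef(
            table="sales",                 # the Power BI table name
            column="Amount",
            column_type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
            native_column_type="DECIMAL",  # falls back to "UNKNOWN" when absent
        ),
        upstreams=[
            ColumnRef(
                table="urn:li:dataset:(urn:li:dataPlatform:oracle,db.sales,PROD)",
                column="amount",           # upstream column names are lower-cased
            )
        ],
    )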
@@ -299,6 +332,8 @@ class AmazonRedshiftLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )

+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -306,7 +341,7 @@ class AmazonRedshiftLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=
+            column_lineage=column_lineage,
         )

@@ -364,6 +399,8 @@ class OracleLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )

+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -371,7 +408,7 @@ class OracleLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=
+            column_lineage=column_lineage,
         )

@@ -449,6 +486,8 @@ class DatabricksLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )

+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -456,7 +495,7 @@ class DatabricksLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=
+            column_lineage=column_lineage,
         )

         return Lineage.empty()
@@ -509,6 +548,9 @@ class TwoStepDataAccessPattern(AbstractLineage, ABC):
             server=server,
             qualified_table_name=qualified_table_name,
         )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -516,10 +558,62 @@ class TwoStepDataAccessPattern(AbstractLineage, ABC):
                     urn=urn,
                 )
             ],
-            column_lineage=
+            column_lineage=column_lineage,
         )


+class MySQLLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
+        )
+
+        server, db_name = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+        if server is None or db_name is None:
+            return Lineage.empty()  # Return an empty list
+
+        schema_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Schema"]
+
+        table_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Item"]
+
+        qualified_table_name: str = f"{schema_name}.{table_name}"
+
+        logger.debug(
+            f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=self.get_platform_pair(),
+            server=server,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.MYSQL.value
+
+
 class PostgresLineage(TwoStepDataAccessPattern):
     def create_lineage(
         self, data_access_func_detail: DataAccessFunctionDetail
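For context, the new MySQLLineage handler targets M-Query expressions of roughly the following shape (a hypothetical sketch, shown as a Python string; create_lineage above reads the record accessor via identifier_accessor.items):

    # Hypothetical M-Query snippet: a MySQL.Database(server, database) data-access
    # call followed by a {[Schema=..., Item=...]} accessor, which create_lineage
    # reads as items["Schema"] and items["Item"] to build "schema.table".
    m_query_expression = """
    let
        Source = MySQL.Database("mysql-host:3306", "commerce"),
        orders = Source{[Schema="commerce", Item="orders"]}[Data]
    in
        orders
    """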
@@ -671,6 +765,8 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
             qualified_table_name=qualified_table_name,
         )

+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -678,7 +774,7 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
                     urn=urn,
                 )
             ],
-            column_lineage=
+            column_lineage=column_lineage,
         )

@@ -726,6 +822,7 @@ class NativeQueryLineage(AbstractLineage):

         tables: List[str] = native_sql_parser.get_tables(query)

+        column_lineage = []
         for qualified_table_name in tables:
             if len(qualified_table_name.split(".")) != 3:
                 logger.debug(
@@ -748,12 +845,11 @@ class NativeQueryLineage(AbstractLineage):
                 )
             )

+            column_lineage = self.create_table_column_lineage(urn)
+
         logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")

-        return Lineage(
-            upstreams=dataplatform_tables,
-            column_lineage=[],
-        )
+        return Lineage(upstreams=dataplatform_tables, column_lineage=column_lineage)

     def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
         if (
@@ -885,6 +981,11 @@ class SupportedPattern(Enum):
         FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
     )

+    MYSQL = (
+        MySQLLineage,
+        FunctionName.MYSQL_DATA_ACCESS,
+    )
+
     NATIVE_QUERY = (
         NativeQueryLineage,
         FunctionName.NATIVE_QUERY,
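A rough sketch of how the new enum member is picked up; the call shape mirrors the resolver code later in this diff, but the literal function-name string is an assumption, since the diff does not show the value of FunctionName.MYSQL_DATA_ACCESS:

    # Assumption: "MySQL.Database" is the string behind FunctionName.MYSQL_DATA_ACCESS.
    supported_resolver = SupportedPattern.get_pattern_handler("MySQL.Database")
    if supported_resolver:
        handler_cls = supported_resolver.handler()  # -> MySQLLineage
        print(handler_cls.__name__)                 # "MySQLLineage"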
datahub/ingestion/source/powerbi/m_query/resolver.py

@@ -361,6 +361,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
         )

         if output_variable is None:
+            logger.debug(
+                f"Table: {self.table.full_name}: output-variable not found in tree"
+            )
             self.reporter.report_warning(
                 f"{self.table.full_name}-output-variable",
                 "output-variable not found in table expression",
@@ -374,6 +377,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):

         # Each item is data-access function
         for f_detail in table_links:
+            logger.debug(
+                f"Processing data-access-function {f_detail.data_access_function_name}"
+            )
             # Get & Check if we support data-access-function available in M-Query
             supported_resolver = SupportedPattern.get_pattern_handler(
                 f_detail.data_access_function_name
@@ -390,6 +396,10 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):

             # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
             # & also pass additional information that will be need to generate lineage
+            logger.debug(
+                f"Creating instance of {supported_resolver.handler().__name__} "
+                f"for data-access-function {f_detail.data_access_function_name}"
+            )
             pattern_handler: AbstractLineage = supported_resolver.handler()(
                 ctx=ctx,
                 table=self.table,
datahub/ingestion/source/powerbi/powerbi.py

@@ -3,6 +3,7 @@
 # Meta Data Ingestion From the Power BI Source
 #
 #########################################################
+import functools
 import logging
 from datetime import datetime
 from typing import Iterable, List, Optional, Tuple, Union
@@ -24,6 +25,7 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.incremental_lineage_helper import (
+    auto_incremental_lineage,
     convert_dashboard_info_to_patch,
 )
 from datahub.ingestion.api.source import (
@@ -238,6 +240,10 @@ class Mapper:
         upstream: List[UpstreamClass] = []
         cll_lineage: List[FineGrainedLineage] = []

+        logger.debug(
+            f"Extracting lineage for table {table.full_name} in dataset {table.dataset.name if table.dataset else None}"
+        )
+
         upstream_lineage: List[
             datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
         ] = parser.get_upstream_tables(
@@ -1306,7 +1312,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):

         allowed_workspaces = []
         for workspace in all_workspaces:
-            if not self.source_config.workspace_id_pattern.allowed(
+            if not self.source_config.workspace_id_pattern.allowed(
+                workspace.id
+            ) or not self.source_config.workspace_name_pattern.allowed(workspace.name):
                 self.reporter.filtered_workspace_names.append(
                     f"{workspace.id} - {workspace.name}"
                 )
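The effect of the widened condition: a workspace must now pass both the id pattern and the name pattern. A minimal sketch of the semantics, assuming both patterns are datahub's AllowDenyPattern (the pattern values are illustrative):

    from datahub.configuration.common import AllowDenyPattern

    workspace_id_pattern = AllowDenyPattern(deny=["00000000-.*"])   # illustrative
    workspace_name_pattern = AllowDenyPattern(allow=["Finance.*"])  # illustrative

    def is_kept(workspace_id: str, workspace_name: str) -> bool:
        # Mirrors the new filter: dropped if either pattern rejects the workspace.
        return workspace_id_pattern.allowed(
            workspace_id
        ) and workspace_name_pattern.allowed(workspace_name)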
@@ -1522,6 +1530,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
         else:
             return [
                 *super().get_workunit_processors(),
+                functools.partial(
+                    auto_incremental_lineage, self.source_config.incremental_lineage
+                ),
                 self.stale_entity_removal_handler.workunit_processor,
             ]

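The functools.partial call pre-binds the config flag so the result matches the single-argument workunit-processor shape expected by get_workunit_processors. A sketch of the idea (the signature is assumed from this call site):

    import functools

    # Assumed shape: auto_incremental_lineage(incremental_lineage, stream) -> stream.
    processor = functools.partial(auto_incremental_lineage, True)
    # processor now only needs the workunit stream:
    #   processor(workunit_stream) -> stream with lineage aspects emitted as patches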
datahub/ingestion/source/sigma/config.py

@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.ingestion.api.report import EntityFilterReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
     StatefulStaleMetadataRemovalConfig,
@@ -54,16 +55,14 @@ class Constant:

 @dataclass
 class SigmaSourceReport(StaleEntityRemovalSourceReport):
-
+    workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")
+    number_of_workspaces: Optional[int] = None
     non_accessible_workspaces_count: int = 0
     shared_entities_count: int = 0
     number_of_datasets: int = 0
     number_of_workbooks: int = 0
     number_of_files_metadata: Dict[str, int] = field(default_factory=dict)

-    def report_number_of_workspaces(self, number_of_workspaces: int) -> None:
-        self.number_of_workspaces = number_of_workspaces
-

 class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
     data_source_platform: str = pydantic.Field(
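EntityFilterReport keeps per-entity processed/dropped lists, so the source report can show exactly which workspaces were filtered. A minimal usage sketch based on the calls in this diff:

    from dataclasses import dataclass

    from datahub.ingestion.api.report import EntityFilterReport

    @dataclass
    class DemoReport:
        workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")

    report = DemoReport()
    report.workspaces.processed("workspace-id-1")  # ingested
    report.workspaces.dropped("workspace-id-2")    # filtered out by a pattern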
datahub/ingestion/source/sigma/sigma.py

@@ -162,14 +162,17 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):

     def _get_allowed_workspaces(self) -> List[Workspace]:
         all_workspaces = self.sigma_api.workspaces.values()
-        allowed_workspaces = [
-            workspace
-            for workspace in all_workspaces
-            if self.config.workspace_pattern.allowed(workspace.name)
-        ]
         logger.info(f"Number of workspaces = {len(all_workspaces)}")
-        self.reporter.
+        self.reporter.number_of_workspaces = len(all_workspaces)
+
+        allowed_workspaces = []
+        for workspace in all_workspaces:
+            if self.config.workspace_pattern.allowed(workspace.name):
+                allowed_workspaces.append(workspace)
+            else:
+                self.reporter.workspaces.dropped(workspace.workspaceId)
         logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")
+
         return allowed_workspaces

     def _gen_workspace_workunit(
@@ -658,6 +661,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield from self._gen_workbook_workunit(workbook)

         for workspace in self._get_allowed_workspaces():
+            self.reporter.workspaces.processed(workspace.workspaceId)
             yield from self._gen_workspace_workunit(workspace)
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()

datahub/ingestion/source/sql/oracle.py

@@ -31,7 +31,9 @@ from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     make_sqlalchemy_type,
 )
-from datahub.ingestion.source.sql.sql_config import
+from datahub.ingestion.source.sql.sql_config import (
+    BasicSQLAlchemyConfig,
+)

 logger = logging.getLogger(__name__)

@@ -71,10 +73,12 @@ class OracleConfig(BasicSQLAlchemyConfig):
         description="Will be set automatically to default value.",
     )
     service_name: Optional[str] = Field(
-        default=None,
+        default=None,
+        description="Oracle service name. If using, omit `database`.",
     )
     database: Optional[str] = Field(
-        default=None,
+        default=None,
+        description="If using, omit `service_name`.",
     )
     add_database_name_to_urn: Optional[bool] = Field(
         default=False,
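Per the new field descriptions, service_name and database are mutually exclusive. A hypothetical config sketch:

    # Hypothetical values; connects via an Oracle service name, so "database" is omitted.
    config = OracleConfig.parse_obj(
        {
            "host_port": "oracle-host:1521",
            "service_name": "ORCLPDB1",  # omit "database" when this is set
        }
    )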
@@ -631,7 +635,6 @@ class OracleSource(SQLAlchemySource):
     - Table, row, and column statistics via optional SQL profiling

     Using the Oracle source requires that you've also installed the correct drivers; see the [cx_Oracle docs](https://cx-oracle.readthedocs.io/en/latest/user_guide/installation.html). The easiest one is the [Oracle Instant Client](https://www.oracle.com/database/technologies/instant-client.html).
-
     """

     config: OracleConfig
@@ -661,6 +664,8 @@ class OracleSource(SQLAlchemySource):
         database name from Connection URL, which does not work when using
         service instead of database.
         In that case, it tries to retrieve the database name by sending a query to the DB.
+
+        Note: This is used as a fallback if database is not specified in the config.
         """

         # call default implementation first
@@ -687,7 +692,49 @@ class OracleSource(SQLAlchemySource):
         # To silent the mypy lint error
         yield cast(Inspector, inspector)

+    def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
+        """
+        Override the get_db_schema method to ensure proper schema name extraction.
+        This method is used during view lineage extraction to determine the default schema
+        for unqualified table names in view definitions.
+        """
+        try:
+            # Try to get the schema from the dataset identifier
+            parts = dataset_identifier.split(".")
+
+            # Handle the identifier format differently based on add_database_name_to_urn flag
+            if self.config.add_database_name_to_urn:
+                if len(parts) >= 3:
+                    # Format is: database.schema.view when add_database_name_to_urn=True
+                    db_name = parts[-3]
+                    schema_name = parts[-2]
+                    return db_name, schema_name
+                elif len(parts) >= 2:
+                    # Handle the case where database might be missing even with flag enabled
+                    # If we have a database in the config, use that
+                    db_name = str(self.config.database)
+                    schema_name = parts[-2]
+                    return db_name, schema_name
+            else:
+                # Format is: schema.view when add_database_name_to_urn=False
+                if len(parts) >= 2:
+                    # When add_database_name_to_urn is False, don't include database in the result
+                    db_name = None
+                    schema_name = parts[-2]
+                    return db_name, schema_name
+        except Exception as e:
+            logger.warning(
+                f"Error extracting schema from identifier {dataset_identifier}: {e}"
+            )
+
+        # Fall back to parent implementation if our approach fails
+        db_name, schema_name = super().get_db_schema(dataset_identifier)
+        return db_name, schema_name
+
     def get_workunits(self):
+        """
+        Override get_workunits to patch Oracle dialect for custom types.
+        """
         with patch.dict(
             "sqlalchemy.dialects.oracle.base.OracleDialect.ischema_names",
             {klass.__name__: klass for klass in extra_oracle_types},
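The override's behavior, in short: with add_database_name_to_urn=True a three-part identifier yields (database, schema); otherwise only the schema is returned. A worked example (the identifier is illustrative):

    # "ORCL.HR.EMP_VIEW" with add_database_name_to_urn=True -> ("ORCL", "HR")
    parts = "ORCL.HR.EMP_VIEW".split(".")
    assert (parts[-3], parts[-2]) == ("ORCL", "HR")

    # "HR.EMP_VIEW" with add_database_name_to_urn=False -> (None, "HR")
    parts = "HR.EMP_VIEW".split(".")
    assert (None, parts[-2]) == (None, "HR")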
datahub/ingestion/source/usage/usage_common.py

@@ -12,11 +12,9 @@ from typing import (
     Optional,
     Tuple,
     TypeVar,
-    Union,
 )

 import pydantic
-from deprecated import deprecated
 from pydantic.fields import Field

 import datahub.emitter.mce_builder as builder
@@ -28,19 +26,13 @@ from datahub.configuration.time_window_config import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetUsageStatistics
 from datahub.metadata.schema_classes import (
-    CalendarIntervalClass,
     DatasetFieldUsageCountsClass,
     DatasetUsageStatisticsClass,
     DatasetUserUsageCountsClass,
     TimeWindowSizeClass,
-    UsageAggregationClass,
-    WindowDurationClass,
 )
 from datahub.utilities.sql_formatter import format_sql_query, trim_query
-from datahub.utilities.urns.dataset_urn import DatasetUrn
-from datahub.utilities.urns.urn import guess_entity_type

 logger = logging.getLogger(__name__)

@@ -295,60 +287,3 @@ class UsageAggregator(Generic[ResourceType]):
             user_urn_builder=user_urn_builder,
             queries_character_limit=self.config.queries_character_limit,
         )
-
-
-@deprecated
-def convert_usage_aggregation_class(
-    obj: UsageAggregationClass,
-) -> MetadataChangeProposalWrapper:
-    # Legacy usage aggregation only supported dataset usage stats
-    if guess_entity_type(obj.resource) == DatasetUrn.ENTITY_TYPE:
-        aspect = DatasetUsageStatistics(
-            timestampMillis=obj.bucket,
-            eventGranularity=TimeWindowSizeClass(
-                unit=convert_window_to_interval(obj.duration)
-            ),
-            uniqueUserCount=obj.metrics.uniqueUserCount,
-            totalSqlQueries=obj.metrics.totalSqlQueries,
-            topSqlQueries=obj.metrics.topSqlQueries,
-            userCounts=(
-                [
-                    DatasetUserUsageCountsClass(
-                        user=u.user, count=u.count, userEmail=u.userEmail
-                    )
-                    for u in obj.metrics.users
-                    if u.user is not None
-                ]
-                if obj.metrics.users
-                else None
-            ),
-            fieldCounts=(
-                [
-                    DatasetFieldUsageCountsClass(fieldPath=f.fieldName, count=f.count)
-                    for f in obj.metrics.fields
-                ]
-                if obj.metrics.fields
-                else None
-            ),
-        )
-        return MetadataChangeProposalWrapper(entityUrn=obj.resource, aspect=aspect)
-    else:
-        raise Exception(
-            f"Skipping unsupported usage aggregation - invalid entity type: {obj}"
-        )
-
-
-@deprecated
-def convert_window_to_interval(window: Union[str, WindowDurationClass]) -> str:
-    if window == WindowDurationClass.YEAR:
-        return CalendarIntervalClass.YEAR
-    elif window == WindowDurationClass.MONTH:
-        return CalendarIntervalClass.MONTH
-    elif window == WindowDurationClass.WEEK:
-        return CalendarIntervalClass.WEEK
-    elif window == WindowDurationClass.DAY:
-        return CalendarIntervalClass.DAY
-    elif window == WindowDurationClass.HOUR:
-        return CalendarIntervalClass.HOUR
-    else:
-        raise Exception(f"Unsupported window duration: {window}")
The remaining files (WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt) are unchanged between rc5 and rc6.
|