acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/datajob/datajob.py +7 -4
- datahub/api/entities/dataset/dataset.py +9 -11
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/common.py +5 -0
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/errors.py +4 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/api/source_helpers.py +1 -0
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +33 -8
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/graph/links.py +53 -0
- datahub/ingestion/run/pipeline.py +9 -6
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
- datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
- datahub/ingestion/source/ge_data_profiler.py +27 -1
- datahub/ingestion/source/hex/api.py +1 -20
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/iceberg/iceberg.py +20 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +17 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +34 -5
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mlflow.py +19 -6
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +13 -1
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sigma/config.py +74 -6
- datahub/ingestion/source/sigma/sigma.py +16 -1
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +4 -52
- datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/hive.py +7 -2
- datahub/ingestion/source/sql/hive_metastore.py +5 -5
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +31 -6
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +2 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/ingestion/source/vertexai/vertexai.py +316 -4
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
- datahub/metadata/_urns/urn_defs.py +1819 -1763
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +17296 -16883
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/_all_entities.py +4 -0
- datahub/sdk/_shared.py +142 -4
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/dataset.py +2 -2
- datahub/sdk/entity_client.py +8 -0
- datahub/sdk/lineage_client.py +235 -0
- datahub/sdk/main_client.py +6 -3
- datahub/sdk/mlmodel.py +301 -0
- datahub/sdk/mlmodelgroup.py +233 -0
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/specific/dataset.py +12 -0
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
- datahub/sql_parsing/sqlglot_utils.py +18 -14
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/testing/mcp_diff.py +15 -2
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +350 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/config.py

@@ -192,6 +192,11 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="mysql",
     )
 
+    ODBC = DataPlatformPair(
+        powerbi_data_platform_name="Odbc",
+        datahub_data_platform_name="odbc",
+    )
+
 
 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -341,6 +346,13 @@ class PowerBiDashboardSourceConfig(
         "For Google BigQuery the datasource's server is google bigquery project name. "
         "For Databricks Unity Catalog the datasource's server is workspace FQDN.",
     )
+    # ODBC DSN to platform mapping
+    dsn_to_platform_name: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to DataHub data platform name. "
+        "For example with an ODBC connection string 'DSN=database' where the database type "
+        "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
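For context, a minimal sketch of how the new dsn_to_platform_name option could be set in a recipe, written as the Python dict equivalent of the YAML config. Only dsn_to_platform_name comes from this release; the DSN name and surrounding keys are invented for illustration:

# Hypothetical Power BI recipe fragment (dict form of a YAML recipe).
# Only "dsn_to_platform_name" is introduced by this release; the DSN name
# "sales_dw" and the other keys are illustrative.
powerbi_recipe = {
    "source": {
        "type": "powerbi",
        "config": {
            # Tables reached via "DSN=sales_dw" resolve to PostgreSQL.
            "dsn_to_platform_name": {"sales_dw": "postgres"},
        },
    },
}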
@@ -501,7 +513,7 @@ class PowerBiDashboardSourceConfig(
     include_workspace_name_in_dataset_urn: bool = pydantic.Field(
         default=False,
         description="It is recommended to set this to true, as it helps prevent the overwriting of datasets."
-        "Read section #11560 at https://
+        "Read section #11560 at https://docs.datahub.com/docs/how/updating-datahub/ before enabling this option."
         "To maintain backward compatibility, this is set to False.",
     )
 
datahub/ingestion/source/powerbi/m_query/odbc.py (new file)

@@ -0,0 +1,185 @@
+import re
+from typing import Optional, Tuple, Union
+
+server_patterns = [
+    r"Server=([^:]+)[:][0-9]+/.*",
+    r"SERVER=\{([^}]*)\}",
+    r"SERVER=([^;]*)",
+    r"HOST=\{([^}]*)\}",
+    r"HOST=([^;]*)",
+    r"DATA SOURCE=\{([^}]*)\}",
+    r"DATA SOURCE=([^;]*)",
+    r"DSN=\{([^}]*)\}",
+    r"DSN=([^;]*)",
+    r"Server=([^;]*)",
+    r"S3OutputLocation=([^;]*)",
+    r"HTTPPath=([^;]*)",
+    r"Host=([^;]*)",
+]
+
+dsn_patterns = [
+    r"DSN\s*=\s*\"([^\"]+)\"",
+    r"DSN\s*=\s*\'([^\']+)\'",
+    r"DSN\s*=\s*([^;]+)",
+]
+
+platform_patterns = {
+    "mysql": r"mysql",
+    "postgres": r"post(gre(s|sql)?|gres)",
+    "mssql": r"(sql\s*server|mssql|sqlncli)",
+    "oracle": r"oracle",
+    "db2": r"db2",
+    "sqlite": r"sqlite",
+    "access": r"(access|\.mdb|\.accdb)",
+    "excel": r"(excel|\.xls)",
+    "firebird": r"firebird",
+    "informix": r"informix",
+    "sybase": r"sybase",
+    "teradata": r"teradata",
+    "hadoop": r"(hadoop|hive)",
+    "snowflake": r"snowflake",
+    "redshift": r"redshift",
+    "bigquery": r"bigquery",
+    "athena": r"(athena|aws\s*athena)",
+    "databricks": r"(databricks|spark)",
+}
+
+powerbi_platform_names = {
+    "mysql": "MySQL",
+    "postgres": "PostgreSQL",
+    "mssql": "SQL Server",
+    "oracle": "Oracle",
+    "db2": "IBM DB2",
+    "sqlite": "SQLite",
+    "access": "Microsoft Access",
+    "excel": "Microsoft Excel",
+    "firebird": "Firebird",
+    "informix": "IBM Informix",
+    "sybase": "SAP Sybase",
+    "teradata": "Teradata",
+    "hadoop": "Hadoop",
+    "snowflake": "Snowflake",
+    "redshift": "Amazon Redshift",
+    "bigquery": "Google BigQuery",
+    "athena": "Amazon Athena",
+    "databricks": "Databricks",
+}
+
+
+def extract_driver(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the driver name.
+    Handles whitespace in driver names and various connection string formats.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted driver name, or None if not found
+    """
+    # Match DRIVER={driver name} pattern
+    driver_match = re.search(r"DRIVER=\{([^}]*)}", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    # Alternative pattern for DRIVER=driver
+    driver_match = re.search(r"DRIVER=([^;]*)", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    return None
+
+
+def extract_dsn(connection_string: str) -> Union[str, None]:
+    """
+    Extract the DSN value from an ODBC connection string.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str or None: The extracted DSN value, or None if not found
+    """
+    for pattern in dsn_patterns:
+        match = re.search(pattern, connection_string, re.IGNORECASE)
+        if match:
+            return match.group(1).strip()
+
+    return None
+
+
+def extract_server(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the server name.
+    Handles various parameter names for server (SERVER, Host, Data Source, etc.)
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted server name, or None if not found
+    """
+    for pattern in server_patterns:
+        server_match = re.search(pattern, connection_string, re.IGNORECASE)
+        if server_match:
+            return server_match.group(1).strip()
+
+    # Special case for Athena: extract from AwsRegion if no server found
+    region_match = re.search(r"AwsRegion=([^;]*)", connection_string, re.IGNORECASE)
+    if region_match:
+        return f"aws-athena-{region_match.group(1).strip()}"
+
+    # Special case for Databricks: try to extract hostname from JDBC URL
+    jdbc_match = re.search(r"jdbc:spark://([^:;/]+)", connection_string, re.IGNORECASE)
+    if jdbc_match:
+        return jdbc_match.group(1).strip()
+
+    return None
+
+
+def extract_platform(connection_string: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Extract the database platform name from the ODBC driver name.
+    Returns the lowercase platform name.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+               Power BI platform name, or None if not recognized.
+    """
+    driver_name = extract_driver(connection_string)
+    if not driver_name:
+        return None, None
+
+    driver_lower = driver_name.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, driver_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
+
+
+def normalize_platform_name(platform: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Normalizes the platform name by matching it with predefined patterns and maps it to
+    a corresponding Power BI platform name.
+
+    Args:
+        platform (str): The platform name to normalize
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+               Power BI platform name, or None if not recognized.
+    """
+    platform_lower = platform.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, platform_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
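Assuming the module is importable at the path added above, a quick sanity check of the extraction helpers might look like this; the connection string is an invented example:

# Illustrative use of the new helpers; the connection string is made up.
from datahub.ingestion.source.powerbi.m_query.odbc import (
    extract_driver,
    extract_platform,
    extract_server,
)

conn = "DRIVER={PostgreSQL Unicode};SERVER=db.internal;DATABASE=sales"

print(extract_driver(conn))    # "PostgreSQL Unicode"
print(extract_platform(conn))  # ("postgres", "PostgreSQL")
print(extract_server(conn))    # "db.internal"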
datahub/ingestion/source/powerbi/m_query/pattern_handler.py

@@ -29,6 +29,12 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     Lineage,
     ReferencedTable,
 )
+from datahub.ingestion.source.powerbi.m_query.odbc import (
+    extract_dsn,
+    extract_platform,
+    extract_server,
+    normalize_platform_name,
+)
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
 from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
 from datahub.sql_parsing.sqlglot_lineage import (
@@ -155,6 +161,7 @@ class AbstractLineage(ABC):
                 tree_function.token_values(arg_list)
             ),
         )
+        logger.debug(f"DB Details: {arguments}")
 
         if len(arguments) < 2:
             logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
@@ -940,6 +947,147 @@ class NativeQueryLineage(AbstractLineage):
         )
 
 
+class OdbcLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} "
+            f"data-access function detail {data_access_func_detail}"
+        )
+
+        connect_string, _ = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+
+        if not connect_string:
+            self.reporter.warning(
+                title="Can not extract ODBC connect string",
+                message="Can not extract ODBC connect string from data access function. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-access-func-detail={data_access_func_detail}",
+            )
+            return Lineage.empty()
+
+        logger.debug(f"ODBC connect string: {connect_string}")
+        data_platform, powerbi_platform = extract_platform(connect_string)
+        server_name = extract_server(connect_string)
+
+        if not data_platform:
+            dsn = extract_dsn(connect_string)
+            if dsn:
+                logger.debug(f"Extracted DSN: {dsn}")
+                server_name = dsn
+            if dsn and self.config.dsn_to_platform_name:
+                logger.debug(f"Attempting to map DSN {dsn} to platform")
+                name = self.config.dsn_to_platform_name.get(dsn)
+                if name:
+                    logger.debug(f"Found DSN {dsn} mapped to platform {name}")
+                    data_platform, powerbi_platform = normalize_platform_name(name)
+
+        if not data_platform or not powerbi_platform:
+            self.reporter.warning(
+                title="Can not determine ODBC platform",
+                message="Can not determine platform from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+
+        platform_pair: DataPlatformPair = self.create_platform_pair(
+            data_platform, powerbi_platform
+        )
+
+        if not server_name and self.config.server_to_platform_instance:
+            self.reporter.warning(
+                title="Can not determine ODBC server name",
+                message="Can not determine server name with server_to_platform_instance mapping. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+        elif not server_name:
+            server_name = "unknown"
+
+        database_name = None
+        schema_name = None
+        table_name = None
+        qualified_table_name = None
+
+        temp_accessor: Optional[IdentifierAccessor] = (
+            data_access_func_detail.identifier_accessor
+        )
+
+        while temp_accessor:
+            logger.debug(
+                f"identifier = {temp_accessor.identifier} items = {temp_accessor.items}"
+            )
+            if temp_accessor.items.get("Kind") == "Database":
+                database_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Schema":
+                schema_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Table":
+                table_name = temp_accessor.items["Name"]
+
+            if temp_accessor.next is not None:
+                temp_accessor = temp_accessor.next
+            else:
+                break
+
+        if (
+            database_name is not None
+            and schema_name is not None
+            and table_name is not None
+        ):
+            qualified_table_name = f"{database_name}.{schema_name}.{table_name}"
+        elif database_name is not None and table_name is not None:
+            qualified_table_name = f"{database_name}.{table_name}"
+
+        if not qualified_table_name:
+            self.reporter.warning(
+                title="Can not determine qualified table name",
+                message="Can not determine qualified table name for ODBC data source. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-platform={data_platform}",
+            )
+            logger.warning(
+                f"Can not determine qualified table name for ODBC data source {data_platform} "
+                f"table {self.table.full_name}."
+            )
+            return Lineage.empty()
+
+        logger.debug(
+            f"ODBC Platform {data_platform} found qualified table name {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=platform_pair,
+            server=server_name,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=platform_pair,
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    @staticmethod
+    def create_platform_pair(
+        data_platform: str, powerbi_platform: str
+    ) -> DataPlatformPair:
+        return DataPlatformPair(data_platform, powerbi_platform)
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.ODBC.value
+
+
 class SupportedPattern(Enum):
     DATABRICKS_QUERY = (
         DatabricksLineage,
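The qualified-name assembly in OdbcLineage.create_lineage walks the chain of IdentifierAccessor nodes and keeps the Database, Schema, and Table entries it encounters. A standalone sketch of that walk, using a stand-in node type rather than DataHub's IdentifierAccessor:

# Stand-in node type; DataHub's IdentifierAccessor carries the same
# {"Kind": ..., "Name": ...} items plus a "next" pointer.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Node:
    items: dict
    next: Optional["Node"] = None

chain = Node(
    {"Kind": "Database", "Name": "sales"},
    Node({"Kind": "Schema", "Name": "public"}, Node({"Kind": "Table", "Name": "orders"})),
)

database = schema = table = None
node: Optional[Node] = chain
while node:
    kind = node.items.get("Kind")
    if kind == "Database":
        database = node.items["Name"]
    elif kind == "Schema":
        schema = node.items["Name"]
    elif kind == "Table":
        table = node.items["Name"]
    node = node.next

# Same fallback order as the diff: db.schema.table, else db.table, else skip.
if database and schema and table:
    print(f"{database}.{schema}.{table}")  # sales.public.orders
elif database and table:
    print(f"{database}.{table}")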
@@ -991,6 +1139,11 @@ class SupportedPattern(Enum):
         FunctionName.NATIVE_QUERY,
     )
 
+    ODBC = (
+        OdbcLineage,
+        FunctionName.ODBC_DATA_ACCESS,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]
 
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py

@@ -63,10 +63,10 @@ class SessionWithTimeout(requests.Session):
         super().__init__(*args, **kwargs)
         self.timeout = timeout
 
-    def request(self, method, url, **kwargs):
+    def request(self, method, url, *args, **kwargs):
         # Set the default timeout if none is provided
         kwargs.setdefault("timeout", self.timeout)
-        return super().request(method, url, **kwargs)
+        return super().request(method, url, *args, **kwargs)
 
 
 class DataResolverBase(ABC):
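The *args fix matters because requests' Session.request accepts most of its parameters positionally; with the old def request(self, method, url, **kwargs) signature, a caller passing e.g. params positionally got a TypeError. A minimal reproduction of the fixed pattern, with the class shape taken from the hunk above:

import requests

class SessionWithTimeout(requests.Session):
    # Same shape as the class in the hunk above.
    def __init__(self, timeout, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.timeout = timeout

    def request(self, method, url, *args, **kwargs):
        # Apply the default timeout, then forward positionals too.
        kwargs.setdefault("timeout", self.timeout)
        return super().request(method, url, *args, **kwargs)

session = SessionWithTimeout(timeout=10)
# The third positional argument binds to `params` in requests' signature;
# before the fix this call raised a TypeError.
response = session.request("GET", "https://example.com", {"q": "1"})
print(response.status_code)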
datahub/ingestion/source/redshift/usage.py

@@ -182,15 +182,16 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            with self.report.new_stage(
-
-
-
-
-
-
-
-
+            with self.report.new_stage(
+                USAGE_EXTRACTION_OPERATIONAL_STATS
+            ), PerfTimer() as timer:
+                # Generate operation aspect workunits
+                yield from self._gen_operation_aspect_workunits(
+                    self.connection, all_tables
+                )
+                self.report.operational_metadata_extraction_sec[
+                    self.config.database
+                ] = timer.elapsed_seconds(digits=2)
 
         # Generate aggregate events
         with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
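The rewritten block times the operation-aspect generation with PerfTimer and records the elapsed seconds per database in the report. A minimal sketch of the timer pattern; the import path is assumed from the DataHub codebase, and elapsed_seconds(digits=2) is used exactly as in the hunk:

# Minimal PerfTimer usage; the timed body is a stand-in for
# _gen_operation_aspect_workunits.
from datahub.utilities.perf_timer import PerfTimer

with PerfTimer() as timer:
    total = sum(range(1_000_000))  # stand-in workload

print(timer.elapsed_seconds(digits=2))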
datahub/ingestion/source/sigma/config.py

@@ -1,8 +1,9 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 import pydantic
+from pydantic import BaseModel, Field
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
@@ -53,15 +54,82 @@ class Constant:
     DEFAULT_API_URL = "https://aws-api.sigmacomputing.com/v2"
 
 
+class WorkspaceCounts(BaseModel):
+    workbooks_count: int = 0
+    datasets_count: int = 0
+    elements_count: int = 0
+    pages_count: int = 0
+
+    def is_empty(self) -> bool:
+        return (
+            self.workbooks_count == 0
+            and self.datasets_count == 0
+            and self.elements_count == 0
+            and self.pages_count == 0
+        )
+
+    def as_obj(self) -> dict:
+        return {
+            "workbooks_count": self.workbooks_count,
+            "datasets_count": self.datasets_count,
+            "elements_count": self.elements_count,
+            "pages_count": self.pages_count,
+        }
+
+
+class SigmaWorkspaceEntityFilterReport(EntityFilterReport):
+    type: str = "workspace"
+
+    workspace_counts: Dict[str, WorkspaceCounts] = Field(
+        default_factory=dict,
+        description="Counts of workbooks, datasets, elements and pages in each workspace.",
+    )
+
+    def increment_workbooks_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].workbooks_count += 1
+
+    def increment_datasets_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].datasets_count += 1
+
+    def increment_elements_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].elements_count += 1
+
+    def increment_pages_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].pages_count += 1
+
+    def as_obj(self) -> dict:
+        return {
+            "filtered": self.dropped_entities.as_obj(),
+            "processed": self.processed_entities.as_obj(),
+            "workspace_counts": {
+                key: item.as_obj() for key, item in self.workspace_counts.items()
+            },
+        }
+
+
 @dataclass
 class SigmaSourceReport(StaleEntityRemovalSourceReport):
-    workspaces:
-
+    workspaces: SigmaWorkspaceEntityFilterReport = field(
+        default_factory=SigmaWorkspaceEntityFilterReport
+    )
     non_accessible_workspaces_count: int = 0
-
-
-
+
+    datasets: EntityFilterReport = EntityFilterReport.field(type="dataset")
+    datasets_without_workspace: int = 0
+
+    workbooks: EntityFilterReport = EntityFilterReport.field(type="workbook")
+    workbooks_without_workspace: int = 0
+
     number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
+    empty_workspaces: List[str] = field(default_factory=list)
 
 
 class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
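These per-workspace counters feed the empty-workspace warning added in the final sigma.py hunk below. A small sketch of how the counts accumulate and how is_empty is evaluated, assuming WorkspaceCounts is importable as added above; the workspace ids are invented:

# Sketch of the counting pattern used by SigmaWorkspaceEntityFilterReport.
from datahub.ingestion.source.sigma.config import WorkspaceCounts

counts = {}
for workspace_id in ("ws-marketing", "ws-finance"):
    counts.setdefault(workspace_id, WorkspaceCounts())

counts["ws-marketing"].workbooks_count += 1
counts["ws-marketing"].pages_count += 1

for workspace_id, c in counts.items():
    if c.is_empty():
        print(f"{workspace_id} is empty")  # only ws-finance prints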
datahub/ingestion/source/sigma/sigma.py

@@ -35,6 +35,7 @@ from datahub.ingestion.source.sigma.config import (
     PlatformDetail,
     SigmaSourceConfig,
     SigmaSourceReport,
+    WorkspaceCounts,
 )
 from datahub.ingestion.source.sigma.data_classes import (
     Element,
@@ -163,7 +164,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
     def _get_allowed_workspaces(self) -> List[Workspace]:
         all_workspaces = self.sigma_api.workspaces.values()
         logger.info(f"Number of workspaces = {len(all_workspaces)}")
-        self.reporter.number_of_workspaces = len(all_workspaces)
 
         allowed_workspaces = []
         for workspace in all_workspaces:
@@ -285,6 +285,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         yield self._gen_dataset_properties(dataset_urn, dataset)
 
         if dataset.workspaceId:
+            self.reporter.workspaces.increment_datasets_count(dataset.workspaceId)
             yield from add_entity_to_container(
                 container_key=self._gen_workspace_key(dataset.workspaceId),
                 entity_type="dataset",
@@ -468,6 +469,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         ).as_workunit()
 
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_elements_count(workbook.workspaceId)
+
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=chart_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -525,6 +528,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         all_input_fields: List[InputFieldClass] = []
 
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_pages_count(workbook.workspaceId)
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -614,6 +618,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
         paths = workbook.path.split("/")[1:]
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_workbooks_count(workbook.workspaceId)
+
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -667,6 +673,15 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                 f"{workspace.name} ({workspace.workspaceId})"
             )
             yield from self._gen_workspace_workunit(workspace)
+            if self.reporter.workspaces.workspace_counts.get(
+                workspace.workspaceId, WorkspaceCounts()
+            ).is_empty():
+                logger.warning(
+                    f"Workspace {workspace.name} ({workspace.workspaceId}) is empty. If this is not expected, add the user associated with the Client ID/Secret to each workspace with missing metadata"
+                )
+                self.reporter.empty_workspaces.append(
+                    f"{workspace.name} ({workspace.workspaceId})"
+                )
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()
 
     def get_report(self) -> SourceReport: