acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2528 -2530
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +156 -138
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/cli/check_cli.py +65 -11
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +3 -4
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +41 -8
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +47 -45
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +73 -30
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +12 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/glue.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +49 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
- datahub/ingestion/source/dbt/dbt_common.py +3 -1
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/redshift/redshift.py +17 -0
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -12
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/mssql/source.py +24 -15
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +11 -0
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +57 -2
- datahub/ingestion/source/tableau/tableau.py +57 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +56 -30
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1253 -536
- datahub/metadata/_urns/urn_defs.py +1797 -1685
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +16614 -16538
- datahub/metadata/schemas/ContainerProperties.avsc +2 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataJobInfo.avsc +2 -0
- datahub/metadata/schemas/DataProcessKey.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +4 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
- datahub/metadata/schemas/MLModelKey.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/datajob.py +39 -15
- datahub/sdk/lineage_client.py +2 -0
- datahub/sdk/main_client.py +14 -2
- datahub/sdk/search_client.py +4 -3
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/dremio/dremio_api.py

@@ -7,7 +7,7 @@ from collections import defaultdict
 from enum import Enum
 from itertools import product
 from time import sleep, time
-from typing import Any, Deque, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
 from urllib.parse import quote

 import requests
@@ -15,12 +15,17 @@ from requests.adapters import HTTPAdapter
 from urllib3 import Retry
 from urllib3.exceptions import InsecureRequestWarning

+from datahub.emitter.request_helper import make_curl_command
 from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
 from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
     DremioToDataHubSourceTypeMapping,
 )
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
 from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
+from datahub.utilities.perf_timer import PerfTimer
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.dremio.dremio_entities import DremioContainer

 logger = logging.getLogger(__name__)

@@ -54,6 +59,8 @@ class DremioAPIOperations:
         self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
         self._max_workers: int = connection_args.max_workers
         self.is_dremio_cloud = connection_args.is_dremio_cloud
+        self.start_time = connection_args.start_time
+        self.end_time = connection_args.end_time
         self.report = report
         self.session = requests.Session()
         if connection_args.is_dremio_cloud:
@@ -178,6 +185,7 @@ class DremioAPIOperations:
             self.session.headers.update(
                 {"Authorization": f"Bearer {connection_args.password}"}
             )
+            logger.debug("Configured Dremio cloud API session to use PAT")
             return

         # On-prem Dremio authentication (PAT or Basic Auth)
@@ -189,6 +197,7 @@ class DremioAPIOperations:
                     "Authorization": f"Bearer {connection_args.password}",
                 }
             )
+            logger.debug("Configured Dremio API session to use PAT")
             return
         else:
             assert connection_args.username and connection_args.password, (
@@ -212,10 +221,10 @@ class DremioAPIOperations:
         response.raise_for_status()
         token = response.json().get("token")
         if token:
+            logger.debug("Exchanged username and password for Dremio token")
             self.session.headers.update(
                 {"Authorization": f"_dremio{token}"}
             )
-
             return
         else:
             self.report.failure("Failed to authenticate", login_url)
@@ -231,49 +240,76 @@ class DremioAPIOperations:
                 "Credentials cannot be refreshed. Please check your username and password."
             )

+    def _request(self, method: str, url: str, data: Union[str, None] = None) -> Dict:
+        """Send a request to the Dremio API."""
+
+        logger.debug(f"{method} request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path[f"{method} {url}"] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.request(
+                method=method,
+                url=(self.base_url + url),
+                data=data,
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+        self.report.api_call_secs_by_method_and_path[f"{method} {url}"] += (
+            timer.elapsed_seconds()
+        )
+        # response.raise_for_status() # Enabling this line, makes integration tests to fail
+        try:
+            return response.json()
+        except requests.exceptions.JSONDecodeError as e:
+            logger.info(
+                f"On {method} request to {url}, failed to parse JSON from response (status {response.status_code}): {response.text}"
+            )
+            logger.debug(
+                f"Request curl equivalent: {make_curl_command(self.session, method, url, data)}"
+            )
+            raise DremioAPIException(
+                f"Failed to parse JSON from response (status {response.status_code}): {response.text}"
+            ) from e
+
     def get(self, url: str) -> Dict:
-        """
-
-            url=(self.base_url + url),
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        """Send a GET request to the Dremio API."""
+        return self._request("GET", url)

     def post(self, url: str, data: str) -> Dict:
-        """
-
-            url=(self.base_url + url),
-            data=data,
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        """Send a POST request to the Dremio API."""
+        return self._request("POST", url, data=data)

     def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
         """Execute SQL query with timeout and error handling"""
         try:
-
+            with PerfTimer() as timer:
+                logger.info(f"Executing query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))

-
-
-
-
-
+                if "errorMessage" in response:
+                    self.report.failure(
+                        message="SQL Error", context=f"{response['errorMessage']}"
+                    )
+                    raise DremioAPIException(f"SQL Error: {response['errorMessage']}")

-
+                job_id = response["id"]

-
-
-
-
-
-
-
-
-
-
-
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(self.fetch_results, job_id)
+                    try:
+                        result = future.result(timeout=timeout)
+                        logger.info(
+                            f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
+                        )
+                        return result
+                    except concurrent.futures.TimeoutError:
+                        self.cancel_query(job_id)
+                        raise DremioAPIException(
+                            f"Query execution timed out after {timeout} seconds"
+                        ) from None
+                    except RuntimeError as e:
+                        raise DremioAPIException() from e

         except requests.RequestException as e:
             raise DremioAPIException("Error executing query") from e
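
Aside: the new `_request` helper above centralizes per-endpoint accounting for every Dremio API call. The following sketch is illustrative only (not DataHub code): plain `defaultdict`s stand in for the `TopKDict` counters that `DremioSourceReport` gains in this release, and a `time.sleep` stands in for the HTTP request.

```python
import time
from collections import defaultdict

# Stand-ins for report.api_calls_by_method_and_path / api_call_secs_by_method_and_path.
api_calls_by_method_and_path: defaultdict = defaultdict(int)
api_call_secs_by_method_and_path: defaultdict = defaultdict(float)


def timed_call(method: str, path: str) -> None:
    # Count the call and accumulate its wall-clock time under a "METHOD path" key.
    key = f"{method} {path}"
    api_calls_by_method_and_path[key] += 1
    start = time.perf_counter()
    time.sleep(0.01)  # stand-in for session.request(...)
    api_call_secs_by_method_and_path[key] += time.perf_counter() - start


timed_call("GET", "/catalog")
timed_call("GET", "/catalog")
timed_call("POST", "/sql")
print(dict(api_calls_by_method_and_path))  # {'GET /catalog': 2, 'POST /sql': 1}
```
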
@@ -462,7 +498,9 @@ class DremioAPIOperations:
         pattern_str = "|".join(f"({p})" for p in patterns)
         return f"AND {operator}({field}, '{pattern_str}')"

-    def get_all_tables_and_columns(
+    def get_all_tables_and_columns(
+        self, containers: Deque["DremioContainer"]
+    ) -> List[Dict]:
         if self.edition == DremioEdition.ENTERPRISE:
             query_template = DremioSQLQueries.QUERY_DATASETS_EE
         elif self.edition == DremioEdition.CLOUD:
@@ -603,10 +641,25 @@ class DremioAPIOperations:
         return parents_list

     def extract_all_queries(self) -> List[Dict[str, Any]]:
+        # Convert datetime objects to string format for SQL queries
+        start_timestamp_str = None
+        end_timestamp_str = None
+
+        if self.start_time:
+            start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        if self.end_time:
+            end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
         if self.edition == DremioEdition.CLOUD:
-            jobs_query = DremioSQLQueries.
+            jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
         else:
-            jobs_query = DremioSQLQueries.
+            jobs_query = DremioSQLQueries.get_query_all_jobs(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )

         return self.execute_query(query=jobs_query)

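
Aside: the `strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]` expression used in `extract_all_queries` above renders six microsecond digits and then drops three, leaving millisecond precision. A quick check with an arbitrary example timestamp:

```python
from datetime import datetime

example = datetime(2024, 1, 2, 3, 4, 5, 678901)  # arbitrary illustrative value
# %f renders six digits of microseconds; slicing off the last three keeps milliseconds.
print(example.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])  # 2024-01-02 03:04:05.678
```
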
@@ -685,6 +738,27 @@ class DremioAPIOperations:

         return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)

+    def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
+        """
+        Check if a container path could potentially match a schema pattern.
+        This handles hierarchical path matching for container filtering.
+        """
+        if pattern == ".*":
+            return True
+
+        current_path = ".".join(path_components)
+
+        # Handle simple .* patterns (like "a.b.c.*")
+        if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
+            # Simple dotstar pattern - check prefix matching
+            pattern_prefix = pattern[:-2]  # Remove ".*"
+            return current_path.lower().startswith(
+                pattern_prefix.lower()
+            ) or pattern_prefix.lower().startswith(current_path.lower())
+        else:
+            # Complex regex pattern - use existing regex matching logic
+            return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
+
     def should_include_container(self, path: List[str], name: str) -> bool:
         """
         Helper method to check if a container should be included based on schema patterns.
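
Aside: to make the new prefix check concrete, here is the simple `".*"`-suffix branch of `_could_match_pattern` pulled out as a standalone function with hypothetical pattern and path values. A container that is an ancestor of the pattern prefix, or sits under it, is kept; unrelated paths are filtered.

```python
from typing import List


def could_match_simple_pattern(pattern: str, path_components: List[str]) -> bool:
    # Simplified sketch of the "<prefix>.*" branch shown in the diff above.
    current_path = ".".join(path_components)
    pattern_prefix = pattern[:-2]  # strip the trailing ".*"
    return current_path.lower().startswith(
        pattern_prefix.lower()
    ) or pattern_prefix.lower().startswith(current_path.lower())


# Hypothetical pattern and container paths:
print(could_match_simple_pattern("sales.marts.*", ["sales"]))                   # True: ancestor of the prefix
print(could_match_simple_pattern("sales.marts.*", ["sales", "marts", "kpis"]))  # True: under the prefix
print(could_match_simple_pattern("sales.marts.*", ["finance"]))                 # False: unrelated
```
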
@@ -711,41 +785,8 @@ class DremioAPIOperations:

         # Check allow patterns
         for pattern in self.allow_schema_pattern:
-            #
-            if
-                pattern_parts = pattern.split(".")
-                path_parts = path_components
-
-                # If pattern has exact same number of parts, check each component
-                if len(pattern_parts) == len(path_parts):
-                    matches = True
-                    for p_part, c_part in zip(pattern_parts, path_parts):
-                        if p_part != "*" and p_part.lower() != c_part.lower():
-                            matches = False
-                            break
-                    if matches:
-                        self.report.report_container_scanned(full_path)
-                        return True
-                # Otherwise check if current path is prefix match
-                else:
-                    # Remove the trailing wildcard if present
-                    if pattern_parts[-1] == "*":
-                        pattern_parts = pattern_parts[:-1]
-
-                    for i in range(len(path_parts)):
-                        current_path = ".".join(path_parts[: i + 1])
-                        pattern_prefix = ".".join(pattern_parts[: i + 1])
-
-                        if pattern_prefix.startswith(current_path):
-                            self.report.report_container_scanned(full_path)
-                            return True
-
-            # Direct pattern matching
-            if self._check_pattern_match(
-                pattern=pattern,
-                paths=[full_path],
-                allow_prefix=True,
-            ):
+            # Check if current path could potentially match this pattern
+            if self._could_match_pattern(pattern, path_components):
                 self.report.report_container_scanned(full_path)
                 return True

datahub/ingestion/source/dremio/dremio_config.py

@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
@@ -118,6 +119,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
 class DremioSourceConfig(
     DremioConnectionConfig,
     StatefulIngestionConfigBase,
+    BaseTimeWindowConfig,
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 ):
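
Aside: mixing `BaseTimeWindowConfig` into `DremioSourceConfig` is what adds the `start_time` and `end_time` fields that `DremioAPIOperations` reads above. A minimal sketch of the mixin on its own, with illustrative timestamps and assuming the usual pydantic-style parsing of DataHub config models:

```python
from datahub.configuration.time_window_config import BaseTimeWindowConfig

# Illustrative window only; in a recipe these fields live under source.config
# alongside the rest of the Dremio connection settings.
window = BaseTimeWindowConfig.parse_obj(
    {"start_time": "2024-01-01T00:00:00Z", "end_time": "2024-01-02T00:00:00Z"}
)
print(window.start_time, window.end_time)
```
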
datahub/ingestion/source/dremio/dremio_reporting.py

@@ -1,22 +1,43 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Optional

 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.stats_collections import (
+    TopKDict,
+    float_top_k_dict,
+    int_top_k_dict,
+)


 @dataclass
 class DremioSourceReport(
-    SQLSourceReport,
+    SQLSourceReport,
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+    BaseTimeWindowReport,
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
     containers_scanned: int = 0
     containers_filtered: int = 0

+    api_calls_total: int = 0
+    api_calls_by_method_and_path: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    api_call_secs_by_method_and_path: TopKDict[str, float] = field(
+        default_factory=float_top_k_dict
+    )
+
+    sql_aggregator: Optional[SqlAggregatorReport] = None
+
     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
datahub/ingestion/source/dremio/dremio_source.py

@@ -51,13 +51,17 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.ingestion.source_report.ingestion_stage import
+from datahub.ingestion.source_report.ingestion_stage import (
+    LINEAGE_EXTRACTION,
+    METADATA_EXTRACTION,
+    PROFILING,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     UpstreamClass,
     UpstreamLineage,
 )
-from datahub.metadata.schema_classes import
+from datahub.metadata.schema_classes import SchemaMetadataClass
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownQueryLineageInfo,
@@ -89,6 +93,7 @@ class DremioSourceMapEntry:
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 class DremioSource(StatefulIngestionSourceBase):
     """
     This plugin integrates with Dremio to extract and ingest metadata into DataHub.
@@ -126,6 +131,13 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
+
+        # Set time window for query lineage extraction
+        self.report.window_start_time, self.report.window_end_time = (
+            self.config.start_time,
+            self.config.end_time,
+        )
+
         self.source_map: Dict[str, DremioSourceMapEntry] = dict()

         # Initialize API operations
@@ -154,6 +166,7 @@ class DremioSource(StatefulIngestionSourceBase):
             generate_operations=True,
             usage_config=self.config.usage,
         )
+        self.report.sql_aggregator = self.sql_parsing_aggregator.report

         # For profiling
         self.profiler = DremioProfiler(config, self.report, dremio_api)
@@ -190,84 +203,88 @@ class DremioSource(StatefulIngestionSourceBase):

         self.source_map = self._build_source_map()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage(METADATA_EXTRACTION):
+            # Process Containers
+            containers = self.dremio_catalog.get_containers()
+            for container in containers:
+                try:
+                    yield from self.process_container(container)
+                    logger.info(
+                        f"Dremio container {container.container_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_containers_failed += 1
+                    self.report.report_failure(
+                        message="Failed to process Dremio container",
+                        context=f"{'.'.join(container.path)}.{container.container_name}",
+                        exc=exc,
+                    )

-
-
+            # Process Datasets
+            datasets = self.dremio_catalog.get_datasets()

-
-
-
-
-
-
-
-
-
-
-
-
-
+            for dataset_info in datasets:
+                try:
+                    yield from self.process_dataset(dataset_info)
+                    logger.info(
+                        f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_datasets_failed += 1  # Increment failed datasets
+                    self.report.report_failure(
+                        message="Failed to process Dremio dataset",
+                        context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                        exc=exc,
+                    )

-
-
-        self.get_query_lineage_workunits()
-
-        # Process Glossary Terms
-        glossary_terms = self.dremio_catalog.get_glossary_terms()
-
-        for glossary_term in glossary_terms:
-            try:
-                yield from self.process_glossary_term(glossary_term)
-            except Exception as exc:
-                self.report.report_failure(
-                    message="Failed to process Glossary terms",
-                    context=f"{glossary_term.glossary_term}",
-                    exc=exc,
-                )
+            # Process Glossary Terms
+            glossary_terms = self.dremio_catalog.get_glossary_terms()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            for glossary_term in glossary_terms:
+                try:
+                    yield from self.process_glossary_term(glossary_term)
+                except Exception as exc:
+                    self.report.report_failure(
+                        message="Failed to process Glossary terms",
+                        context=f"{glossary_term.glossary_term}",
+                        exc=exc,
+                    )
+
+        # Optionally Process Query Lineage
+        if self.config.include_query_lineage:
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                self.get_query_lineage_workunits()
+
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.sql_parsing_aggregator.gen_metadata():
+            yield mcp.as_workunit()
+
+        # Profiling
+        if self.config.is_profiling_enabled():
+            with (
+                self.report.new_stage(PROFILING),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
+                future_to_dataset = {
+                    executor.submit(self.generate_profiles, dataset): dataset
+                    for dataset in datasets
+                }
+
+                for future in as_completed(future_to_dataset):
+                    dataset_info = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[
+                            dataset_info.resource_name
+                        ] += 1
+                        self.report.report_failure(
+                            message="Failed to profile dataset",
+                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                            exc=exc,
+                        )

     def process_container(
         self, container_info: DremioContainer
@@ -388,8 +405,7 @@ class DremioSource(StatefulIngestionSourceBase):
             env=self.config.env,
             platform_instance=self.config.platform_instance,
         )
-
-        yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+        yield from self.profiler.get_workunits(dataset_info, dataset_urn)

     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
@@ -417,11 +433,8 @@ class DremioSource(StatefulIngestionSourceBase):
             ]
         )
         mcp = MetadataChangeProposalWrapper(
-            entityType="dataset",
             entityUrn=dataset_urn,
-            aspectName=lineage.ASPECT_NAME,
             aspect=lineage,
-            changeType=ChangeTypeClass.UPSERT,
         )

         for upstream_urn in upstream_urns: