acryl-datahub 1.1.0.5rc8__py3-none-any.whl → 1.1.0.5rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/METADATA +2465 -2465
- {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/RECORD +47 -47
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +45 -1
- datahub/cli/cli_utils.py +0 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +5 -0
- datahub/cli/docker_cli.py +2 -0
- datahub/cli/exists_cli.py +2 -0
- datahub/cli/get_cli.py +2 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +7 -0
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +3 -0
- datahub/cli/specific/assertions_cli.py +2 -0
- datahub/cli/specific/datacontract_cli.py +3 -0
- datahub/cli/specific/dataproduct_cli.py +11 -0
- datahub/cli/specific/dataset_cli.py +4 -0
- datahub/cli/specific/forms_cli.py +2 -0
- datahub/cli/specific/group_cli.py +2 -0
- datahub/cli/specific/structuredproperties_cli.py +4 -0
- datahub/cli/specific/user_cli.py +2 -0
- datahub/cli/state_cli.py +2 -0
- datahub/cli/timeline_cli.py +2 -0
- datahub/emitter/rest_emitter.py +24 -8
- datahub/ingestion/api/report.py +72 -12
- datahub/ingestion/autogenerated/capability_summary.json +19 -1
- datahub/ingestion/autogenerated/lineage_helper.py +101 -19
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/dremio/dremio_api.py +38 -27
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +127 -0
- datahub/ingestion/source/sql/sql_common.py +4 -0
- datahub/ingestion/source/sql/teradata.py +993 -234
- datahub/ingestion/source/tableau/tableau.py +11 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -2
- datahub/metadata/_internal_schema_classes.py +528 -529
- datahub/metadata/_urns/urn_defs.py +1803 -1803
- datahub/metadata/schema.avsc +16720 -17109
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +1 -3
- datahub/sdk/main_client.py +14 -2
- datahub/sdk/search_client.py +4 -3
- datahub/telemetry/telemetry.py +17 -11
- {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc8.dist-info → acryl_datahub-1.1.0.5rc10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_api.py

@@ -7,7 +7,7 @@ from collections import defaultdict
 from enum import Enum
 from itertools import product
 from time import sleep, time
-from typing import Any, Deque, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
 from urllib.parse import quote
 
 import requests
@@ -15,6 +15,7 @@ from requests.adapters import HTTPAdapter
 from urllib3 import Retry
 from urllib3.exceptions import InsecureRequestWarning
 
+from datahub.emitter.request_helper import make_curl_command
 from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
 from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
     DremioToDataHubSourceTypeMapping,
@@ -23,6 +24,9 @@ from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
 from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from datahub.ingestion.source.dremio.dremio_entities import DremioContainer
+
 logger = logging.getLogger(__name__)
 
 
@@ -181,6 +185,7 @@ class DremioAPIOperations:
             self.session.headers.update(
                 {"Authorization": f"Bearer {connection_args.password}"}
             )
+            logger.debug("Configured Dremio cloud API session to use PAT")
             return
 
         # On-prem Dremio authentication (PAT or Basic Auth)
@@ -192,6 +197,7 @@ class DremioAPIOperations:
                     "Authorization": f"Bearer {connection_args.password}",
                 }
             )
+            logger.debug("Configured Dremio API session to use PAT")
             return
         else:
             assert connection_args.username and connection_args.password, (
@@ -215,10 +221,10 @@ class DremioAPIOperations:
             response.raise_for_status()
             token = response.json().get("token")
             if token:
+                logger.debug("Exchanged username and password for Dremio token")
                 self.session.headers.update(
                     {"Authorization": f"_dremio{token}"}
                 )
-
                 return
             else:
                 self.report.failure("Failed to authenticate", login_url)
@@ -234,42 +240,45 @@ class DremioAPIOperations:
                 "Credentials cannot be refreshed. Please check your username and password."
             )
 
-    def get(self, url: str) -> Dict:
-        """execute a get request on dremio"""
-        logger.debug(f"GET request to {self.base_url + url}")
-        self.report.api_calls_total += 1
-        self.report.api_calls_by_method_and_path["GET " + url] += 1
-
-        with PerfTimer() as timer:
-            response = self.session.get(
-                url=(self.base_url + url),
-                verify=self._verify,
-                timeout=self._timeout,
-            )
-            self.report.api_call_secs_by_method_and_path["GET " + url] += (
-                timer.elapsed_seconds()
-            )
-        # response.raise_for_status() # Enabling this line, makes integration tests to fail
-        return response.json()
+    def _request(self, method: str, url: str, data: Union[str, None] = None) -> Dict:
+        """Send a request to the Dremio API."""
 
-    def post(self, url: str, data: str) -> Dict:
-        """execute a get request on dremio"""
-        logger.debug(f"POST request to {self.base_url + url}")
+        logger.debug(f"{method} request to {self.base_url + url}")
         self.report.api_calls_total += 1
-        self.report.api_calls_by_method_and_path["POST " + url] += 1
+        self.report.api_calls_by_method_and_path[f"{method} {url}"] += 1
 
         with PerfTimer() as timer:
-            response = self.session.post(
+            response = self.session.request(
+                method=method,
                 url=(self.base_url + url),
                 data=data,
                 verify=self._verify,
                 timeout=self._timeout,
             )
-            self.report.api_call_secs_by_method_and_path["POST " + url] += (
+            self.report.api_call_secs_by_method_and_path[f"{method} {url}"] += (
                 timer.elapsed_seconds()
             )
         # response.raise_for_status() # Enabling this line, makes integration tests to fail
-        return response.json()
+        try:
+            return response.json()
+        except requests.exceptions.JSONDecodeError as e:
+            logger.info(
+                f"On {method} request to {url}, failed to parse JSON from response (status {response.status_code}): {response.text}"
+            )
+            logger.debug(
+                f"Request curl equivalent: {make_curl_command(self.session, method, url, data)}"
+            )
+            raise DremioAPIException(
+                f"Failed to parse JSON from response (status {response.status_code}): {response.text}"
+            ) from e
+
+    def get(self, url: str) -> Dict:
+        """Send a GET request to the Dremio API."""
+        return self._request("GET", url)
+
+    def post(self, url: str, data: str) -> Dict:
+        """Send a POST request to the Dremio API."""
+        return self._request("POST", url, data=data)
 
     def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
         """Execute SQL query with timeout and error handling"""
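For readers who want the consolidated request pattern outside the DataHub codebase, the following is a minimal standalone sketch (hypothetical class and names, not the DataHub implementation; assumes requests 2.27+ where JSONDecodeError is available): one helper drives both GET and POST through requests.Session.request() and surfaces non-JSON responses with the status code and body.

# Hedged sketch of the unified-request pattern introduced above; illustrative only.
from typing import Dict, Optional

import requests


class TinyApiClient:
    def __init__(self, base_url: str, timeout: int = 30) -> None:
        self.base_url = base_url
        self.timeout = timeout
        self.session = requests.Session()

    def _request(self, method: str, url: str, data: Optional[str] = None) -> Dict:
        # One code path for every HTTP verb, so logging/error handling stays in one place.
        response = self.session.request(
            method=method,
            url=self.base_url + url,
            data=data,
            timeout=self.timeout,
        )
        try:
            return response.json()
        except requests.exceptions.JSONDecodeError as e:
            # Keep the status code and body so the failure is diagnosable.
            raise RuntimeError(
                f"Failed to parse JSON (status {response.status_code}): {response.text}"
            ) from e

    def get(self, url: str) -> Dict:
        return self._request("GET", url)

    def post(self, url: str, data: str) -> Dict:
        return self._request("POST", url, data=data)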
@@ -489,7 +498,9 @@ class DremioAPIOperations:
         pattern_str = "|".join(f"({p})" for p in patterns)
         return f"AND {operator}({field}, '{pattern_str}')"
 
-    def get_all_tables_and_columns(self, containers: Deque) -> List[Dict]:
+    def get_all_tables_and_columns(
+        self, containers: Deque["DremioContainer"]
+    ) -> List[Dict]:
         if self.edition == DremioEdition.ENTERPRISE:
             query_template = DremioSQLQueries.QUERY_DATASETS_EE
         elif self.edition == DremioEdition.CLOUD:
datahub/ingestion/source/mlflow.py

@@ -33,7 +33,10 @@ from datahub.ingestion.api.source import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
-from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
+from datahub.ingestion.source.common.subtypes import (
+    MLAssetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -138,6 +141,13 @@ class MLflowRegisteredModelStageInfo:
     SourceCapability.DESCRIPTIONS,
     "Extract descriptions for MLflow Registered Models and Model Versions",
 )
+@capability(
+    SourceCapability.CONTAINERS,
+    "Extract ML experiments",
+    subtype_modifier=[
+        SourceCapabilityModifier.MLFLOW_EXPERIMENT,
+    ],
+)
 @capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
 class MLflowSource(StatefulIngestionSourceBase):
     platform = "mlflow"
datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -119,6 +119,20 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
     include_query_usage_statistics: bool = True
     include_operations: bool = True
 
+    push_down_database_pattern_access_history: bool = pydantic.Field(
+        default=False,
+        description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
+        "This filters on the accessed objects in access_history.",
+    )
+
+    additional_database_names_allowlist: List[str] = pydantic.Field(
+        default=[],
+        description="Additional database names (no pattern matching) to be included in the access_history filter. "
+        "Only applies if push_down_database_pattern_access_history=True. "
+        "These databases will be included in the filter being pushed down regardless of database_pattern settings."
+        "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.",
+    )
+
     query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD
 
 
@@ -383,6 +397,12 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 bucket_duration=self.config.window.bucket_duration,
                 deny_usernames=self.config.pushdown_deny_usernames,
                 dedup_strategy=self.config.query_dedup_strategy,
+                database_pattern=self.filters.filter_config.database_pattern
+                if self.config.push_down_database_pattern_access_history
+                else None,
+                additional_database_names=self.config.additional_database_names_allowlist
+                if self.config.push_down_database_pattern_access_history
+                else None,
             ).build_enriched_query_log_query()
 
             with self.structured_reporter.report_exc(
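As a rough illustration of how the two new options interact (a hedged sketch with hypothetical names, not DataHub code): the additional allowlist only takes effect when the push-down flag is enabled, mirroring the conditional wiring in SnowflakeQueriesExtractor above.

# Hedged sketch: gating of the pushed-down access_history filter inputs.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class PushdownOptions:
    push_down_database_pattern_access_history: bool = False
    additional_database_names_allowlist: List[str] = field(default_factory=list)


def effective_pushdown(
    opts: PushdownOptions, database_allow_patterns: List[str]
) -> Optional[dict]:
    """Return the filter inputs that would be pushed into access_history, or None."""
    if not opts.push_down_database_pattern_access_history:
        return None  # Nothing is pushed down; query-log extraction behaves as before.
    return {
        "allow_patterns": database_allow_patterns,
        "extra_databases": opts.additional_database_names_allowlist,
    }


print(effective_pushdown(PushdownOptions(), ["ANALYTICS.*"]))  # None
print(effective_pushdown(PushdownOptions(True, ["TEMP_DB"]), ["ANALYTICS.*"]))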
@@ -723,6 +743,8 @@ class QueryLogQueryBuilder:
         deny_usernames: Optional[List[str]],
         max_tables_per_query: int = 20,
         dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
+        database_pattern: Optional[AllowDenyPattern] = None,
+        additional_database_names: Optional[List[str]] = None,
     ):
         self.start_time = start_time
         self.end_time = end_time
@@ -736,9 +758,113 @@ class QueryLogQueryBuilder:
             user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
             self.users_filter = f"user_name NOT IN ({user_not_in})"
 
+        self.access_history_database_filter = (
+            self._build_access_history_database_filter_condition(
+                database_pattern, additional_database_names
+            )
+        )
+
         self.time_bucket_size = bucket_duration.value
         assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
 
+    def _build_access_history_database_filter_condition(
+        self,
+        database_pattern: Optional[AllowDenyPattern],
+        additional_database_names: Optional[List[str]] = None,
+    ) -> str:
+        """
+        Build a SQL WHERE condition for database filtering in access_history based on AllowDenyPattern.
+
+        IMPORTANT: This function handles the fundamental difference between DML and DDL operations in Snowflake's
+        access_history table:
+
+        - DML Operations (SELECT, INSERT, UPDATE, DELETE, etc.): Store accessed/modified objects in the
+          `direct_objects_accessed` and `objects_modified` arrays
+        - DDL Operations (CREATE, ALTER, DROP, RENAME, etc.): Store modified objects in the
+          `object_modified_by_ddl` field (single object, not an array)
+
+        Without checking `object_modified_by_ddl`, DDL operations like "ALTER TABLE person_info RENAME TO person_info_final"
+        would be incorrectly filtered out because they don't populate the DML arrays, causing missing lineage
+        and operational metadata.
+
+        Args:
+            database_pattern: The AllowDenyPattern configuration for database filtering
+            additional_database_names: Additional database names to always include (no pattern matching)
+
+        Returns:
+            A SQL WHERE condition string, or "TRUE" if no filtering should be applied
+        """
+        if not database_pattern and not additional_database_names:
+            return "TRUE"
+
+        # Build the database filter conditions for pattern matching
+        # Note: Using UPPER() + RLIKE for case-insensitive matching is more performant than REGEXP_LIKE with 'i' flag
+        database_filter_parts = []
+
+        if database_pattern:
+            allow_patterns = database_pattern.allow
+            deny_patterns = database_pattern.deny
+
+            # Add allow patterns (if not the default "allow all")
+            if allow_patterns and allow_patterns != [".*"]:
+                allow_conditions = []
+                for pattern in allow_patterns:
+                    # Escape single quotes that might be present in the regex pattern
+                    escaped_pattern = pattern.replace("'", "''")
+                    allow_conditions.append(
+                        f"SPLIT_PART(UPPER(o:objectName), '.', 1) RLIKE '{escaped_pattern}'"
+                    )
+                if allow_conditions:
+                    database_filter_parts.append(f"({' OR '.join(allow_conditions)})")
+
+            # Add deny patterns
+            if deny_patterns:
+                deny_conditions = []
+                for pattern in deny_patterns:
+                    # Escape single quotes that might be present in the regex pattern
+                    escaped_pattern = pattern.replace("'", "''")
+                    deny_conditions.append(
+                        f"SPLIT_PART(UPPER(o:objectName), '.', 1) NOT RLIKE '{escaped_pattern}'"
+                    )
+                if deny_conditions:
+                    database_filter_parts.append(f"({' AND '.join(deny_conditions)})")
+
+        # Add additional database names (exact matches)
+        if additional_database_names:
+            additional_db_conditions = []
+            for db_name in additional_database_names:
+                # Escape single quotes
+                escaped_db_name = db_name.replace("'", "''")
+                additional_db_conditions.append(
+                    f"SPLIT_PART(UPPER(o:objectName), '.', 1) = '{escaped_db_name.upper()}'"
+                )
+            if additional_db_conditions:
+                database_filter_parts.append(
+                    f"({' OR '.join(additional_db_conditions)})"
+                )
+
+        if database_filter_parts:
+            database_filter_condition = " AND ".join(database_filter_parts)
+
+            # Build a condition that checks if any objects in the arrays match the database pattern
+            # This implements "at least one" matching behavior: queries are allowed if they touch
+            # at least one database that matches the pattern, even if they also touch other databases
+            # Use ARRAY_SIZE with FILTER which is more compatible with Snowflake
+            direct_objects_condition = f"ARRAY_SIZE(FILTER(direct_objects_accessed, o -> {database_filter_condition})) > 0"
+            objects_modified_condition = f"ARRAY_SIZE(FILTER(objects_modified, o -> {database_filter_condition})) > 0"
+
+            # CRITICAL: Handle DDL operations by checking object_modified_by_ddl field
+            # DDL operations like ALTER TABLE RENAME store their data here instead of in the arrays
+            # We need to adapt the filter condition for a single object rather than an array
+            ddl_filter_condition = database_filter_condition.replace(
+                "o:objectName", "object_modified_by_ddl:objectName"
+            )
+            object_modified_by_ddl_condition = f"({ddl_filter_condition})"
+
+            return f"({direct_objects_condition} OR {objects_modified_condition} OR {object_modified_by_ddl_condition})"
+        else:
+            return "TRUE"
+
     def _query_fingerprinted_queries(self):
         if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
             secondary_fingerprint_sql = """
@@ -828,6 +954,7 @@ fingerprinted_queries as (
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
+        AND {self.access_history_database_filter}
     )
     , filtered_access_history AS (
         -- TODO: Add table filter clause.
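To see what the pushed-down condition looks like in practice, here is a simplified standalone re-implementation of the allow-pattern branch of _build_access_history_database_filter_condition (deny patterns, the additional-names list, and the default ".*" allow-all check are omitted); the printed string shows the three-way OR across direct_objects_accessed, objects_modified, and object_modified_by_ddl that keeps DDL-only statements from being filtered out. This is an illustrative sketch, not the DataHub code.

# Hedged sketch: what the access_history filter SQL roughly looks like for allow patterns.
from typing import List


def access_history_allow_filter(allow_patterns: List[str]) -> str:
    # One RLIKE per pattern on the database part of the fully qualified object name,
    # made case-insensitive via UPPER().
    conditions = []
    for pattern in allow_patterns:
        escaped = pattern.replace("'", "''")  # escape single quotes for SQL
        conditions.append(f"SPLIT_PART(UPPER(o:objectName), '.', 1) RLIKE '{escaped}'")
    if not conditions:
        return "TRUE"
    cond = f"({' OR '.join(conditions)})"
    # DML statements record objects in the two arrays; DDL statements record a single
    # object in object_modified_by_ddl, so the same condition is rewritten for it.
    ddl_cond = cond.replace("o:objectName", "object_modified_by_ddl:objectName")
    return (
        f"(ARRAY_SIZE(FILTER(direct_objects_accessed, o -> {cond})) > 0"
        f" OR ARRAY_SIZE(FILTER(objects_modified, o -> {cond})) > 0"
        f" OR {ddl_cond})"
    )


print(access_history_allow_filter(["ANALYTICS_.*", "SALES_DB"]))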
datahub/ingestion/source/sql/sql_common.py

@@ -593,6 +593,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         )
 
         # Generate workunit for aggregated SQL parsing results
+        yield from self._generate_aggregator_workunits()
+
+    def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
+        """Generate work units from SQL parsing aggregator. Can be overridden by subclasses."""
         for mcp in self.aggregator.gen_metadata():
             yield mcp.as_workunit()
 
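Since the new _generate_aggregator_workunits hook is explicitly meant to be overridden, a SQL source subclass could extend it roughly as follows (a hedged sketch: MyDialectSource and _my_extra_workunits are hypothetical; only the hook name, signature, and import paths come from the diff above).

# Hedged sketch of overriding the new aggregator-workunit hook in a custom SQL source.
from typing import Iterable

from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.sql.sql_common import SQLAlchemySource


class MyDialectSource(SQLAlchemySource):
    def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
        # Keep the default aggregator output...
        yield from super()._generate_aggregator_workunits()
        # ...then append any dialect-specific work units (placeholder hook).
        yield from self._my_extra_workunits()

    def _my_extra_workunits(self) -> Iterable[MetadataWorkUnit]:
        return []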