acryl-datahub 1.1.0.5rc9-py3-none-any.whl → 1.1.0.5rc11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic by the registry.

Files changed (45)
  1. {acryl_datahub-1.1.0.5rc9.dist-info → acryl_datahub-1.1.0.5rc11.dist-info}/METADATA +2517 -2517
  2. {acryl_datahub-1.1.0.5rc9.dist-info → acryl_datahub-1.1.0.5rc11.dist-info}/RECORD +45 -45
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +45 -1
  5. datahub/cli/cli_utils.py +0 -10
  6. datahub/cli/container_cli.py +5 -0
  7. datahub/cli/delete_cli.py +5 -0
  8. datahub/cli/docker_cli.py +2 -0
  9. datahub/cli/exists_cli.py +2 -0
  10. datahub/cli/get_cli.py +2 -0
  11. datahub/cli/iceberg_cli.py +5 -0
  12. datahub/cli/ingest_cli.py +7 -0
  13. datahub/cli/migrate.py +2 -0
  14. datahub/cli/put_cli.py +3 -0
  15. datahub/cli/specific/assertions_cli.py +2 -0
  16. datahub/cli/specific/datacontract_cli.py +3 -0
  17. datahub/cli/specific/dataproduct_cli.py +11 -0
  18. datahub/cli/specific/dataset_cli.py +4 -0
  19. datahub/cli/specific/forms_cli.py +2 -0
  20. datahub/cli/specific/group_cli.py +2 -0
  21. datahub/cli/specific/structuredproperties_cli.py +4 -0
  22. datahub/cli/specific/user_cli.py +2 -0
  23. datahub/cli/state_cli.py +2 -0
  24. datahub/cli/timeline_cli.py +2 -0
  25. datahub/emitter/rest_emitter.py +24 -8
  26. datahub/ingestion/api/report.py +72 -12
  27. datahub/ingestion/autogenerated/capability_summary.json +19 -1
  28. datahub/ingestion/autogenerated/lineage_helper.py +101 -19
  29. datahub/ingestion/source/common/subtypes.py +2 -0
  30. datahub/ingestion/source/dremio/dremio_api.py +38 -27
  31. datahub/ingestion/source/mlflow.py +11 -1
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +127 -0
  33. datahub/ingestion/source/tableau/tableau.py +11 -2
  34. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  35. datahub/metadata/_internal_schema_classes.py +528 -529
  36. datahub/metadata/_urns/urn_defs.py +1803 -1803
  37. datahub/metadata/schema.avsc +16720 -17109
  38. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +1 -3
  39. datahub/sdk/main_client.py +14 -2
  40. datahub/sdk/search_client.py +4 -3
  41. datahub/telemetry/telemetry.py +17 -11
  42. {acryl_datahub-1.1.0.5rc9.dist-info → acryl_datahub-1.1.0.5rc11.dist-info}/WHEEL +0 -0
  43. {acryl_datahub-1.1.0.5rc9.dist-info → acryl_datahub-1.1.0.5rc11.dist-info}/entry_points.txt +0 -0
  44. {acryl_datahub-1.1.0.5rc9.dist-info → acryl_datahub-1.1.0.5rc11.dist-info}/licenses/LICENSE +0 -0
  45. {acryl_datahub-1.1.0.5rc9.dist-info → acryl_datahub-1.1.0.5rc11.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/dremio/dremio_api.py
@@ -7,7 +7,7 @@ from collections import defaultdict
 from enum import Enum
 from itertools import product
 from time import sleep, time
-from typing import Any, Deque, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
 from urllib.parse import quote
 
 import requests
@@ -15,6 +15,7 @@ from requests.adapters import HTTPAdapter
 from urllib3 import Retry
 from urllib3.exceptions import InsecureRequestWarning
 
+from datahub.emitter.request_helper import make_curl_command
 from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
 from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
     DremioToDataHubSourceTypeMapping,
@@ -23,6 +24,9 @@ from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
 from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from datahub.ingestion.source.dremio.dremio_entities import DremioContainer
+
 logger = logging.getLogger(__name__)
 
 
@@ -181,6 +185,7 @@ class DremioAPIOperations:
             self.session.headers.update(
                 {"Authorization": f"Bearer {connection_args.password}"}
             )
+            logger.debug("Configured Dremio cloud API session to use PAT")
             return
 
         # On-prem Dremio authentication (PAT or Basic Auth)
@@ -192,6 +197,7 @@ class DremioAPIOperations:
                     "Authorization": f"Bearer {connection_args.password}",
                 }
             )
+            logger.debug("Configured Dremio API session to use PAT")
             return
         else:
             assert connection_args.username and connection_args.password, (
@@ -215,10 +221,10 @@ class DremioAPIOperations:
         response.raise_for_status()
         token = response.json().get("token")
         if token:
+            logger.debug("Exchanged username and password for Dremio token")
             self.session.headers.update(
                 {"Authorization": f"_dremio{token}"}
             )
-
             return
         else:
             self.report.failure("Failed to authenticate", login_url)
@@ -234,42 +240,45 @@ class DremioAPIOperations:
                 "Credentials cannot be refreshed. Please check your username and password."
             )
 
-    def get(self, url: str) -> Dict:
-        """execute a get request on dremio"""
-        logger.debug(f"GET request to {self.base_url + url}")
-        self.report.api_calls_total += 1
-        self.report.api_calls_by_method_and_path["GET " + url] += 1
-
-        with PerfTimer() as timer:
-            response = self.session.get(
-                url=(self.base_url + url),
-                verify=self._verify,
-                timeout=self._timeout,
-            )
-            self.report.api_call_secs_by_method_and_path["GET " + url] += (
-                timer.elapsed_seconds()
-            )
-            # response.raise_for_status() # Enabling this line, makes integration tests to fail
-            return response.json()
+    def _request(self, method: str, url: str, data: Union[str, None] = None) -> Dict:
+        """Send a request to the Dremio API."""
 
-    def post(self, url: str, data: str) -> Dict:
-        """execute a get request on dremio"""
-        logger.debug(f"POST request to {self.base_url + url}")
+        logger.debug(f"{method} request to {self.base_url + url}")
         self.report.api_calls_total += 1
-        self.report.api_calls_by_method_and_path["POST " + url] += 1
+        self.report.api_calls_by_method_and_path[f"{method} {url}"] += 1
 
         with PerfTimer() as timer:
-            response = self.session.post(
+            response = self.session.request(
+                method=method,
                 url=(self.base_url + url),
                 data=data,
                 verify=self._verify,
                 timeout=self._timeout,
            )
-            self.report.api_call_secs_by_method_and_path["POST " + url] += (
+            self.report.api_call_secs_by_method_and_path[f"{method} {url}"] += (
                 timer.elapsed_seconds()
             )
             # response.raise_for_status() # Enabling this line, makes integration tests to fail
-            return response.json()
+            try:
+                return response.json()
+            except requests.exceptions.JSONDecodeError as e:
+                logger.info(
+                    f"On {method} request to {url}, failed to parse JSON from response (status {response.status_code}): {response.text}"
+                )
+                logger.debug(
+                    f"Request curl equivalent: {make_curl_command(self.session, method, url, data)}"
+                )
+                raise DremioAPIException(
+                    f"Failed to parse JSON from response (status {response.status_code}): {response.text}"
+                ) from e
+
+    def get(self, url: str) -> Dict:
+        """Send a GET request to the Dremio API."""
+        return self._request("GET", url)
+
+    def post(self, url: str, data: str) -> Dict:
+        """Send a POST request to the Dremio API."""
+        return self._request("POST", url, data=data)
 
     def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
         """Execute SQL query with timeout and error handling"""
@@ -489,7 +498,9 @@ class DremioAPIOperations:
         pattern_str = "|".join(f"({p})" for p in patterns)
         return f"AND {operator}({field}, '{pattern_str}')"
 
-    def get_all_tables_and_columns(self, containers: Deque) -> List[Dict]:
+    def get_all_tables_and_columns(
+        self, containers: Deque["DremioContainer"]
+    ) -> List[Dict]:
         if self.edition == DremioEdition.ENTERPRISE:
             query_template = DremioSQLQueries.QUERY_DATASETS_EE
         elif self.edition == DremioEdition.CLOUD:
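
With get() and post() now funneled through _request, a Dremio response body that is not valid JSON surfaces as a DremioAPIException (with the HTTP status and raw body in the message) instead of an unhandled decode error, and a curl equivalent of the failed call is logged at DEBUG level. A minimal caller-side sketch, assuming DremioAPIException is importable from the same module (its import location is not shown in this diff):

    # Hedged sketch, not part of the package: reacting to the exception that
    # _request now raises when Dremio returns a non-JSON body.
    import logging

    from datahub.ingestion.source.dremio.dremio_api import (  # import path assumed
        DremioAPIException,
        DremioAPIOperations,
    )

    logger = logging.getLogger(__name__)

    def fetch_catalog_safely(api: DremioAPIOperations) -> dict:
        try:
            # get() delegates to _request("GET", ...), which logs the request and
            # records per-path call counts and timings on the report.
            return api.get(url="/catalog")
        except DremioAPIException as e:
            # The message carries the HTTP status and raw body; a curl equivalent
            # of the failed request is logged at DEBUG level by _request itself.
            logger.warning("Dremio returned a non-JSON response: %s", e)
            return {}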

datahub/ingestion/source/mlflow.py
@@ -33,7 +33,10 @@ from datahub.ingestion.api.source import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
-from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
+from datahub.ingestion.source.common.subtypes import (
+    MLAssetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -138,6 +141,13 @@ class MLflowRegisteredModelStageInfo:
     SourceCapability.DESCRIPTIONS,
     "Extract descriptions for MLflow Registered Models and Model Versions",
 )
+@capability(
+    SourceCapability.CONTAINERS,
+    "Extract ML experiments",
+    subtype_modifier=[
+        SourceCapabilityModifier.MLFLOW_EXPERIMENT,
+    ],
+)
 @capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
 class MLflowSource(StatefulIngestionSourceBase):
     platform = "mlflow"

datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -119,6 +119,20 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
     include_query_usage_statistics: bool = True
     include_operations: bool = True
 
+    push_down_database_pattern_access_history: bool = pydantic.Field(
+        default=False,
+        description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
+        "This filters on the accessed objects in access_history.",
+    )
+
+    additional_database_names_allowlist: List[str] = pydantic.Field(
+        default=[],
+        description="Additional database names (no pattern matching) to be included in the access_history filter. "
+        "Only applies if push_down_database_pattern_access_history=True. "
+        "These databases will be included in the filter being pushed down regardless of database_pattern settings."
+        "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.",
+    )
+
     query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD
 
 
@@ -383,6 +397,12 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
             dedup_strategy=self.config.query_dedup_strategy,
+            database_pattern=self.filters.filter_config.database_pattern
+            if self.config.push_down_database_pattern_access_history
+            else None,
+            additional_database_names=self.config.additional_database_names_allowlist
+            if self.config.push_down_database_pattern_access_history
+            else None,
         ).build_enriched_query_log_query()
 
         with self.structured_reporter.report_exc(
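
The two hunks above work together: when push_down_database_pattern_access_history is enabled, the extractor hands the source's database_pattern (plus any extra allowlisted names) to QueryLogQueryBuilder, which turns them into the access_history filter shown further below. A minimal sketch of enabling the new options, assuming the remaining config fields keep usable defaults and that "TEMP_DB" is only an illustrative name:

    # Hedged sketch, not an official recipe: the field names come from the hunk
    # above; everything else about the config is assumed to default sensibly.
    from datahub.ingestion.source.snowflake.snowflake_queries import (
        SnowflakeQueriesExtractorConfig,
    )

    config = SnowflakeQueriesExtractorConfig(
        # Push database_pattern filtering down into the access_history scan.
        push_down_database_pattern_access_history=True,
        # Databases to always keep in the pushed-down filter, e.g. where
        # temporary tables land outside the configured database_pattern.
        additional_database_names_allowlist=["TEMP_DB"],
    )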
@@ -723,6 +743,8 @@ class QueryLogQueryBuilder:
         deny_usernames: Optional[List[str]],
         max_tables_per_query: int = 20,
         dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
+        database_pattern: Optional[AllowDenyPattern] = None,
+        additional_database_names: Optional[List[str]] = None,
     ):
         self.start_time = start_time
         self.end_time = end_time
@@ -736,9 +758,113 @@ class QueryLogQueryBuilder:
            user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
            self.users_filter = f"user_name NOT IN ({user_not_in})"
 
+        self.access_history_database_filter = (
+            self._build_access_history_database_filter_condition(
+                database_pattern, additional_database_names
+            )
+        )
+
         self.time_bucket_size = bucket_duration.value
         assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
 
+    def _build_access_history_database_filter_condition(
+        self,
+        database_pattern: Optional[AllowDenyPattern],
+        additional_database_names: Optional[List[str]] = None,
+    ) -> str:
+        """
+        Build a SQL WHERE condition for database filtering in access_history based on AllowDenyPattern.
+
+        IMPORTANT: This function handles the fundamental difference between DML and DDL operations in Snowflake's
+        access_history table:
+
+        - DML Operations (SELECT, INSERT, UPDATE, DELETE, etc.): Store accessed/modified objects in the
+          `direct_objects_accessed` and `objects_modified` arrays
+        - DDL Operations (CREATE, ALTER, DROP, RENAME, etc.): Store modified objects in the
+          `object_modified_by_ddl` field (single object, not an array)
+
+        Without checking `object_modified_by_ddl`, DDL operations like "ALTER TABLE person_info RENAME TO person_info_final"
+        would be incorrectly filtered out because they don't populate the DML arrays, causing missing lineage
+        and operational metadata.
+
+        Args:
+            database_pattern: The AllowDenyPattern configuration for database filtering
+            additional_database_names: Additional database names to always include (no pattern matching)
+
+        Returns:
+            A SQL WHERE condition string, or "TRUE" if no filtering should be applied
+        """
+        if not database_pattern and not additional_database_names:
+            return "TRUE"
+
+        # Build the database filter conditions for pattern matching
+        # Note: Using UPPER() + RLIKE for case-insensitive matching is more performant than REGEXP_LIKE with 'i' flag
+        database_filter_parts = []
+
+        if database_pattern:
+            allow_patterns = database_pattern.allow
+            deny_patterns = database_pattern.deny
+
+            # Add allow patterns (if not the default "allow all")
+            if allow_patterns and allow_patterns != [".*"]:
+                allow_conditions = []
+                for pattern in allow_patterns:
+                    # Escape single quotes that might be present in the regex pattern
+                    escaped_pattern = pattern.replace("'", "''")
+                    allow_conditions.append(
+                        f"SPLIT_PART(UPPER(o:objectName), '.', 1) RLIKE '{escaped_pattern}'"
+                    )
+                if allow_conditions:
+                    database_filter_parts.append(f"({' OR '.join(allow_conditions)})")
+
+            # Add deny patterns
+            if deny_patterns:
+                deny_conditions = []
+                for pattern in deny_patterns:
+                    # Escape single quotes that might be present in the regex pattern
+                    escaped_pattern = pattern.replace("'", "''")
+                    deny_conditions.append(
+                        f"SPLIT_PART(UPPER(o:objectName), '.', 1) NOT RLIKE '{escaped_pattern}'"
+                    )
+                if deny_conditions:
+                    database_filter_parts.append(f"({' AND '.join(deny_conditions)})")
+
+        # Add additional database names (exact matches)
+        if additional_database_names:
+            additional_db_conditions = []
+            for db_name in additional_database_names:
+                # Escape single quotes
+                escaped_db_name = db_name.replace("'", "''")
+                additional_db_conditions.append(
+                    f"SPLIT_PART(UPPER(o:objectName), '.', 1) = '{escaped_db_name.upper()}'"
+                )
+            if additional_db_conditions:
+                database_filter_parts.append(
+                    f"({' OR '.join(additional_db_conditions)})"
+                )
+
+        if database_filter_parts:
+            database_filter_condition = " AND ".join(database_filter_parts)
+
+            # Build a condition that checks if any objects in the arrays match the database pattern
+            # This implements "at least one" matching behavior: queries are allowed if they touch
+            # at least one database that matches the pattern, even if they also touch other databases
+            # Use ARRAY_SIZE with FILTER which is more compatible with Snowflake
+            direct_objects_condition = f"ARRAY_SIZE(FILTER(direct_objects_accessed, o -> {database_filter_condition})) > 0"
+            objects_modified_condition = f"ARRAY_SIZE(FILTER(objects_modified, o -> {database_filter_condition})) > 0"
+
+            # CRITICAL: Handle DDL operations by checking object_modified_by_ddl field
+            # DDL operations like ALTER TABLE RENAME store their data here instead of in the arrays
+            # We need to adapt the filter condition for a single object rather than an array
+            ddl_filter_condition = database_filter_condition.replace(
+                "o:objectName", "object_modified_by_ddl:objectName"
+            )
+            object_modified_by_ddl_condition = f"({ddl_filter_condition})"
+
+            return f"({direct_objects_condition} OR {objects_modified_condition} OR {object_modified_by_ddl_condition})"
+        else:
+            return "TRUE"
+
     def _query_fingerprinted_queries(self):
         if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
             secondary_fingerprint_sql = """
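
To make the docstring above concrete, the standalone sketch below prints the approximate shape of the pushed-down condition for a single allow pattern. It is an illustration of the generated string only (exact parenthesization may differ slightly) and does not call into DataHub; the pattern 'SALES_.*' is invented for the example:

    # Illustrative sketch of the condition produced for database_pattern.allow
    # = ["SALES_.*"], following the construction in the diff above.
    db_match = "SPLIT_PART(UPPER(o:objectName), '.', 1) RLIKE 'SALES_.*'"
    ddl_match = db_match.replace("o:objectName", "object_modified_by_ddl:objectName")

    condition = (
        # DML side: at least one accessed or modified object is in an allowed database.
        f"(ARRAY_SIZE(FILTER(direct_objects_accessed, o -> ({db_match}))) > 0"
        f" OR ARRAY_SIZE(FILTER(objects_modified, o -> ({db_match}))) > 0"
        # DDL side: CREATE/ALTER/DROP/RENAME only populate object_modified_by_ddl.
        f" OR (({ddl_match})))"
    )
    print(condition)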
@@ -828,6 +954,7 @@ fingerprinted_queries as (
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
+        AND {self.access_history_database_filter}
     )
 , filtered_access_history AS (
     -- TODO: Add table filter clause.

datahub/ingestion/source/tableau/tableau.py
@@ -869,6 +869,15 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
 @platform_name("Tableau")
 @config_class(TableauConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLEAU_PROJECT,
+        SourceCapabilityModifier.TABLEAU_SITE,
+        SourceCapabilityModifier.TABLEAU_WORKBOOK,
+    ],
+)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Requires transformer", supported=False)
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
@@ -3671,7 +3680,7 @@ class TableauSiteSource:
             container_key=project_key,
             name=project_.name,
             description=project_.description,
-            sub_types=[c.PROJECT],
+            sub_types=[BIContainerSubTypes.TABLEAU_PROJECT],
             parent_container_key=parent_project_key,
         )
 
@@ -3689,7 +3698,7 @@ class TableauSiteSource:
         yield from gen_containers(
             container_key=self.gen_site_key(self.site_id),
             name=self.site.name or "Default",
-            sub_types=[c.SITE],
+            sub_types=[BIContainerSubTypes.TABLEAU_SITE],
         )
 
     def _fetch_groups(self):

datahub/ingestion/source/tableau/tableau_constant.py
@@ -76,8 +76,6 @@ CHART = "chart"
 DASHBOARD = "dashboard"
 DASHBOARDS_CONNECTION = "dashboardsConnection"
 EMBEDDED_DATA_SOURCES_CONNECTION = "embeddedDatasourcesConnection"
-PROJECT = "Project"
-SITE = "Site"
 IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
 SITE_PERMISSION = "sitePermission"
 ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer"
  ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer"