acryl-datahub 1.2.0.4rc1__py3-none-any.whl → 1.2.0.4rc3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.
Files changed (42)
  1. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/METADATA +2397 -2396
  2. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/RECORD +42 -41
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +3 -3
  5. datahub/api/entities/external/restricted_text.py +3 -3
  6. datahub/api/entities/forms/forms.py +3 -3
  7. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  8. datahub/cli/quickstart_versioning.py +1 -1
  9. datahub/cli/specific/assertions_cli.py +37 -2
  10. datahub/cli/specific/datacontract_cli.py +54 -4
  11. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
  12. datahub/ingestion/api/report.py +21 -2
  13. datahub/ingestion/source/abs/config.py +1 -1
  14. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  15. datahub/ingestion/source/aws/tag_entities.py +2 -2
  16. datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
  17. datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
  18. datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
  19. datahub/ingestion/source/grafana/models.py +6 -0
  20. datahub/ingestion/source/hex/hex.py +1 -1
  21. datahub/ingestion/source/iceberg/iceberg.py +4 -4
  22. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  23. datahub/ingestion/source/redshift/datashares.py +1 -1
  24. datahub/ingestion/source/slack/slack.py +7 -14
  25. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
  26. datahub/ingestion/source/tableau/tableau.py +1 -1
  27. datahub/ingestion/source/unity/config.py +36 -1
  28. datahub/ingestion/source/unity/proxy.py +332 -46
  29. datahub/ingestion/source/unity/proxy_types.py +12 -2
  30. datahub/ingestion/source/unity/source.py +91 -34
  31. datahub/ingestion/source/unity/tag_entities.py +2 -2
  32. datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
  33. datahub/ingestion/transformer/base_transformer.py +8 -5
  34. datahub/sdk/search_client.py +3 -0
  35. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  36. datahub/specific/datajob.py +15 -1
  37. datahub/specific/dataset.py +37 -59
  38. datahub/utilities/server_config_util.py +2 -1
  39. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/WHEEL +0 -0
  40. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/entry_points.txt +0 -0
  41. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/licenses/LICENSE +0 -0
  42. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/data_lake_common/path_spec.py
@@ -62,7 +62,6 @@ class SortKey(ConfigModel):
 
     date_format: Optional[str] = Field(
         default=None,
-        type=str,
         description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
     )
 
@@ -260,7 +259,7 @@ class PathSpec(ConfigModel):
     ) -> Union[None, parse.Result, parse.Match]:
         return self.compiled_folder_include.parse(path)
 
-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def validate_no_double_stars(cls, values: Dict) -> Dict:
         if "include" not in values:
             return values
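
Note on the validator change above: under pydantic v1 a bare `@root_validator()` still runs when a field validator has already failed, so `values` may be missing keys, and pydantic v2 refuses to define the class at all unless `skip_on_failure=True` is passed. A minimal standalone sketch of the pattern (hypothetical `PathConfig` model, not DataHub code):

# Sketch only: shows the skip_on_failure pattern on a toy model.
from typing import Dict

import pydantic


class PathConfig(pydantic.BaseModel):
    include: str

    @pydantic.root_validator(skip_on_failure=True)
    def no_double_stars(cls, values: Dict) -> Dict:
        # Only runs when field validation succeeded, so "include" is present.
        if "**" in values.get("include", ""):
            raise ValueError("double stars ('**') are not supported")
        return values


print(PathConfig(include="s3://bucket/*/table/{table}/"))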
@@ -456,7 +455,11 @@ class PathSpec(ConfigModel):
             partition = partition.rsplit("/", 1)[0]
             for partition_key in partition.split("/"):
                 if partition_key.find("=") != -1:
-                    partition_keys.append(tuple(partition_key.split("=")))
+                    key_value = partition_key.split(
+                        "=", 1
+                    )  # Split into at most 2 parts
+                    if len(key_value) == 2:
+                        partition_keys.append((key_value[0], key_value[1]))
                 else:
                     partition_split = partition.rsplit("/", 1)
                     if len(partition_split) == 1:
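
The switch from `split("=")` to `split("=", 1)` above guards against partition values that themselves contain an equals sign, which previously produced tuples with more than two elements. A quick illustration with a made-up partition folder name:

# Hive-style partition segment whose value contains '='.
partition_key = "filter=foo=bar"

# Old behaviour: an unexpected 3-tuple.
print(tuple(partition_key.split("=")))   # ('filter', 'foo', 'bar')

# New behaviour: split at most once and keep only well-formed (key, value) pairs.
key_value = partition_key.split("=", 1)
if len(key_value) == 2:
    print((key_value[0], key_value[1]))  # ('filter', 'foo=bar')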

datahub/ingestion/source/dbt/dbt_cloud.py
@@ -370,9 +370,12 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
             name = node["alias"]
 
         comment = node.get("comment", "")
-        description = node["description"]
-        if node.get("sourceDescription"):
-            description = node["sourceDescription"]
+
+        # In dbt sources, there are two types of descriptions:
+        # - description: table-level description (specific to the source table)
+        # - sourceDescription: schema-level description (describes the overall source schema)
+        # The table-level description should take precedence since it's more specific.
+        description = node["description"] or node.get("sourceDescription", "")
 
         if node["resourceType"] == "model":
             materialization = node["materializedType"]
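
The rewritten block makes the table-level `description` win over the schema-level `sourceDescription`, reversing the old precedence. A small sketch of the new fallback behaviour with illustrative node dicts:

def pick_description(node: dict) -> str:
    # Table-level description first, schema-level sourceDescription as fallback.
    return node["description"] or node.get("sourceDescription", "")


print(pick_description({"description": "orders table", "sourceDescription": "raw schema"}))  # 'orders table'
print(pick_description({"description": "", "sourceDescription": "raw schema"}))              # 'raw schema'
print(pick_description({"description": ""}))                                                 # ''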

datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -69,9 +69,10 @@ class FivetranLogAPI:
             fivetran_log_query.set_schema(bigquery_destination_config.dataset)
 
             # The "database" should be the BigQuery project name.
-            fivetran_log_database = engine.execute(
-                "SELECT @@project_id"
-            ).fetchone()[0]
+            result = engine.execute("SELECT @@project_id").fetchone()
+            if result is None:
+                raise ValueError("Failed to retrieve BigQuery project ID")
+            fivetran_log_database = result[0]
         else:
             raise ConfigurationError(
                 f"Destination platform '{destination_platform}' is not yet supported."

datahub/ingestion/source/grafana/models.py
@@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional
 
 from pydantic import BaseModel, Field
 
+from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.emitter.mcp_builder import ContainerKey
 
 # Grafana-specific type definitions for better type safety
@@ -106,6 +107,11 @@ class Folder(BaseModel):
     title: str
     description: Optional[str] = ""
 
+    if PYDANTIC_VERSION_2:
+        from pydantic import ConfigDict
+
+        model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
+
 
 class FolderKey(ContainerKey):
     """Key for identifying a Grafana folder."""

datahub/ingestion/source/hex/hex.py
@@ -69,7 +69,7 @@ class HexSourceConfig(
     )
     include_components: bool = Field(
         default=True,
-        desciption="Include Hex Components in the ingestion",
+        description="Include Hex Components in the ingestion",
     )
     page_size: int = Field(
         default=HEX_API_PAGE_SIZE_DEFAULT,

datahub/ingestion/source/iceberg/iceberg.py
@@ -524,11 +524,11 @@ class IcebergSource(StatefulIngestionSourceBase):
         custom_properties["format-version"] = str(table.metadata.format_version)
         custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
         last_modified: Optional[int] = table.metadata.last_updated_ms
-        if table.current_snapshot():
-            custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
-            custom_properties["manifest-list"] = table.current_snapshot().manifest_list
+        if current_snapshot := table.current_snapshot():
+            custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id)
+            custom_properties["manifest-list"] = current_snapshot.manifest_list
             if not last_modified:
-                last_modified = int(table.current_snapshot().timestamp_ms)
+                last_modified = int(current_snapshot.timestamp_ms)
         if "created-at" in custom_properties:
             try:
                 dt = dateutil_parser.isoparse(custom_properties["created-at"])

datahub/ingestion/source/powerbi_report_server/report_server_domain.py
@@ -27,10 +27,8 @@ class CatalogItem(BaseModel):
     is_favorite: bool = Field(alias="IsFavorite")
     user_info: Any = Field(None, alias="UserInfo")
    display_name: Optional[str] = Field(None, alias="DisplayName")
-    has_data_sources: bool = Field(default=False, alias="HasDataSources")
-    data_sources: Optional[List["DataSource"]] = Field(
-        default_factory=list, alias="DataSources"
-    )
+    has_data_sources: bool = Field(False, alias="HasDataSources")
+    data_sources: Optional[List["DataSource"]] = Field(None, alias="DataSources")
 
     @validator("display_name", always=True)
     def validate_diplay_name(cls, value, values):

datahub/ingestion/source/redshift/datashares.py
@@ -26,7 +26,7 @@ from datahub.utilities.search_utils import LogicalOperator
 
 class OutboundSharePlatformResource(BaseModel):
     namespace: str
-    platform_instance: Optional[str]
+    platform_instance: Optional[str] = None
     env: str
     source_database: str
     share_name: str
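
This explicit `= None` default, like the identical changes in the Snowflake lineage models further down, matters for pydantic v2: v1 treats a bare `Optional[str]` annotation as optional with an implicit `None` default, while v2 treats it as a required field that merely accepts `None`. A minimal illustration, assuming pydantic v2 is installed:

from typing import Optional

from pydantic import BaseModel


class WithoutDefault(BaseModel):
    platform_instance: Optional[str]          # required under pydantic v2


class WithDefault(BaseModel):
    platform_instance: Optional[str] = None   # optional under both v1 and v2


print(WithDefault())          # fine on both versions
try:
    WithoutDefault()          # pydantic v2 raises ValidationError ("Field required")
except Exception as exc:
    print(exc)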

datahub/ingestion/source/slack/slack.py
@@ -203,38 +203,31 @@ class SlackSourceConfig(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email`, `users.profile:read`, and `team:read` scopes.",
     )
     enrich_user_metadata: bool = Field(
-        type=bool,
-        default=True,
+        True,
         description="When enabled, will enrich provisioned DataHub users' metadata with information from Slack.",
     )
     ingest_users: bool = Field(
-        type=bool,
-        default=True,
+        True,
         description="Whether to ingest users. When set to true, will ingest all users in the Slack workspace (as platform resources) to simplify user enrichment after they are provisioned on DataHub.",
     )
     api_requests_per_min: int = Field(
-        type=int,
-        default=10,
+        10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
     ingest_public_channels: bool = Field(
-        type=bool,
-        default=False,
+        False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
     )
     channels_iteration_limit: int = Field(
-        type=int,
-        default=200,
+        200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
     channel_min_members: int = Field(
-        type=int,
-        default=2,
+        2,
         description="Ingest channels with at least this many members.",
     )
     should_ingest_archived_channels: bool = Field(
-        type=bool,
-        default=False,
+        False,
         description="Whether to ingest archived channels.",
     )
 
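
The Slack config cleanup above drops the `type=...` keyword, which `Field()` never uses to determine the field type (the annotation does that; unknown keywords are merely stashed as extra metadata in pydantic v1 and trigger deprecation warnings in v2), and passes the default positionally. Both spellings below are equivalent (illustrative model, not the real config class):

from pydantic import BaseModel, Field


class SlackishConfig(BaseModel):
    # The annotation defines the type; the default can be positional or keyword.
    ingest_users: bool = Field(True, description="Whether to ingest users.")
    api_requests_per_min: int = Field(
        default=10,
        description="Number of API requests per minute.",
    )


print(SlackishConfig())  # ingest_users=True api_requests_per_min=10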

datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -72,7 +72,7 @@ class ColumnUpstreamJob(BaseModel):
 
 
 class ColumnUpstreamLineage(BaseModel):
-    column_name: Optional[str]
+    column_name: Optional[str] = None
     upstreams: List[ColumnUpstreamJob] = Field(default_factory=list)
 
 
@@ -91,9 +91,9 @@ class Query(BaseModel):
 class UpstreamLineageEdge(BaseModel):
     DOWNSTREAM_TABLE_NAME: str
     DOWNSTREAM_TABLE_DOMAIN: str
-    UPSTREAM_TABLES: Optional[List[UpstreamTableNode]]
-    UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]]
-    QUERIES: Optional[List[Query]]
+    UPSTREAM_TABLES: Optional[List[UpstreamTableNode]] = None
+    UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]] = None
+    QUERIES: Optional[List[Query]] = None
 
     _json_upstream_tables = pydantic_parse_json("UPSTREAM_TABLES")
     _json_upstream_columns = pydantic_parse_json("UPSTREAM_COLUMNS")

datahub/ingestion/source/tableau/tableau.py
@@ -1184,7 +1184,7 @@ class TableauSiteSource:
             self.report.warning(
                 title="Incomplete project hierarchy",
                 message="Project details missing. Child projects will be ingested without reference to their parent project. We generally need Site Administrator Explorer permissions to extract the complete project hierarchy.",
-                context=f"Missing {project.parent_id}, referenced by {project.id} {project.project_name}",
+                context=f"Missing {project.parent_id}, referenced by {project.id} {project.name}",
             )
             project.parent_id = None
 

datahub/ingestion/source/unity/config.py
@@ -8,7 +8,7 @@ import pydantic
 from pydantic import Field
 from typing_extensions import Literal
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigEnum, ConfigModel
 from datahub.configuration.source_common import (
     DatasetSourceConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -36,6 +36,12 @@ from datahub.utilities.global_warning_util import add_global_warning
 logger = logging.getLogger(__name__)
 
 
+class LineageDataSource(ConfigEnum):
+    AUTO = "AUTO"
+    SYSTEM_TABLES = "SYSTEM_TABLES"
+    API = "API"
+
+
 class UnityCatalogProfilerConfig(ConfigModel):
     method: str = Field(
         description=(
@@ -243,6 +249,21 @@ class UnityCatalogSourceConfig(
         description="Option to enable/disable lineage generation. Currently we have to call a rest call per column to get column level lineage due to the Databrick api which can slow down ingestion. ",
     )
 
+    lineage_data_source: LineageDataSource = pydantic.Field(
+        default=LineageDataSource.AUTO,
+        description=(
+            "Source for lineage data extraction. Options: "
+            f"'{LineageDataSource.AUTO.value}' - Use system tables when SQL warehouse is available, fallback to API; "
+            f"'{LineageDataSource.SYSTEM_TABLES.value}' - Force use of system.access.table_lineage and system.access.column_lineage tables (requires SQL warehouse); "
+            f"'{LineageDataSource.API.value}' - Force use of REST API endpoints for lineage data"
+        ),
+    )
+
+    ignore_start_time_lineage: bool = pydantic.Field(
+        default=False,
+        description="Option to ignore the start_time and retrieve all available lineage. When enabled, the start_time filter will be set to zero to extract all lineage events regardless of the configured time window.",
+    )
+
     column_lineage_column_limit: int = pydantic.Field(
         default=300,
         description="Limit the number of columns to get column level lineage. ",
@@ -362,6 +383,20 @@ class UnityCatalogSourceConfig(
 
         return values
 
+    @pydantic.root_validator(skip_on_failure=True)
+    def validate_lineage_data_source_with_warehouse(
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        lineage_data_source = values.get("lineage_data_source", LineageDataSource.AUTO)
+        warehouse_id = values.get("warehouse_id")
+
+        if lineage_data_source == LineageDataSource.SYSTEM_TABLES and not warehouse_id:
+            raise ValueError(
+                f"lineage_data_source='{LineageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
+            )
+
+        return values
+
     @pydantic.validator("schema_pattern", always=True)
     def schema_pattern_should__always_deny_information_schema(
         cls, v: AllowDenyPattern
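
The new validator fails fast at config-parse time when `SYSTEM_TABLES` is selected without a `warehouse_id`, instead of failing later during lineage extraction. A standalone sketch of the same guard using simplified stand-in models (not the actual `UnityCatalogSourceConfig`):

from enum import Enum
from typing import Any, Dict, Optional

import pydantic


class LineageDataSource(str, Enum):
    AUTO = "AUTO"
    SYSTEM_TABLES = "SYSTEM_TABLES"
    API = "API"


class MiniUnityConfig(pydantic.BaseModel):
    warehouse_id: Optional[str] = None
    lineage_data_source: LineageDataSource = LineageDataSource.AUTO

    @pydantic.root_validator(skip_on_failure=True)
    def require_warehouse_for_system_tables(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # SYSTEM_TABLES needs a SQL warehouse to query system.access.* tables.
        if (
            values.get("lineage_data_source") == LineageDataSource.SYSTEM_TABLES
            and not values.get("warehouse_id")
        ):
            raise ValueError("lineage_data_source='SYSTEM_TABLES' requires warehouse_id to be set")
        return values


print(MiniUnityConfig(lineage_data_source="AUTO"))      # ok, warehouse not required
try:
    MiniUnityConfig(lineage_data_source="SYSTEM_TABLES")
except pydantic.ValidationError as exc:
    print(exc)                                           # rejected: warehouse_id missing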