acryl-datahub 1.2.0.9rc1__py3-none-any.whl → 1.2.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (120)
  1. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2568 -2626
  2. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +120 -113
  3. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  8. datahub/api/entities/dataset/dataset.py +9 -18
  9. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  10. datahub/api/graphql/operation.py +10 -6
  11. datahub/cli/docker_check.py +2 -2
  12. datahub/configuration/common.py +29 -1
  13. datahub/configuration/connection_resolver.py +5 -2
  14. datahub/configuration/import_resolver.py +7 -4
  15. datahub/configuration/pydantic_migration_helpers.py +0 -9
  16. datahub/configuration/source_common.py +3 -2
  17. datahub/configuration/validate_field_deprecation.py +5 -2
  18. datahub/configuration/validate_field_removal.py +5 -2
  19. datahub/configuration/validate_field_rename.py +6 -5
  20. datahub/configuration/validate_multiline_string.py +5 -2
  21. datahub/ingestion/autogenerated/capability_summary.json +45 -1
  22. datahub/ingestion/run/pipeline_config.py +2 -2
  23. datahub/ingestion/source/azure/azure_common.py +1 -1
  24. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  25. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  26. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -0
  27. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  28. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  29. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  30. datahub/ingestion/source/datahub/config.py +8 -9
  31. datahub/ingestion/source/dbt/dbt_common.py +65 -5
  32. datahub/ingestion/source/delta_lake/config.py +1 -1
  33. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  34. datahub/ingestion/source/feast.py +8 -10
  35. datahub/ingestion/source/fivetran/config.py +1 -1
  36. datahub/ingestion/source/gcs/gcs_source.py +19 -2
  37. datahub/ingestion/source/ge_data_profiler.py +15 -2
  38. datahub/ingestion/source/ge_profiling_config.py +26 -22
  39. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  40. datahub/ingestion/source/grafana/models.py +12 -14
  41. datahub/ingestion/source/hex/hex.py +6 -1
  42. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  43. datahub/ingestion/source/kafka_connect/common.py +2 -2
  44. datahub/ingestion/source/looker/looker_common.py +76 -75
  45. datahub/ingestion/source/looker/looker_config.py +15 -4
  46. datahub/ingestion/source/looker/looker_source.py +493 -547
  47. datahub/ingestion/source/looker/lookml_config.py +1 -1
  48. datahub/ingestion/source/looker/lookml_source.py +46 -88
  49. datahub/ingestion/source/metabase.py +9 -2
  50. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  51. datahub/ingestion/source/metadata/lineage.py +1 -1
  52. datahub/ingestion/source/mode.py +13 -5
  53. datahub/ingestion/source/nifi.py +1 -1
  54. datahub/ingestion/source/powerbi/config.py +14 -21
  55. datahub/ingestion/source/preset.py +1 -1
  56. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  57. datahub/ingestion/source/redash.py +1 -1
  58. datahub/ingestion/source/redshift/config.py +6 -3
  59. datahub/ingestion/source/redshift/query.py +23 -19
  60. datahub/ingestion/source/s3/source.py +26 -24
  61. datahub/ingestion/source/salesforce.py +13 -9
  62. datahub/ingestion/source/schema/json_schema.py +14 -14
  63. datahub/ingestion/source/sigma/data_classes.py +3 -0
  64. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  65. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  66. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  67. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  68. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  69. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  70. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  71. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  72. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  73. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  74. datahub/ingestion/source/sql/athena.py +2 -1
  75. datahub/ingestion/source/sql/clickhouse.py +12 -7
  76. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  77. datahub/ingestion/source/sql/druid.py +2 -2
  78. datahub/ingestion/source/sql/hive.py +4 -3
  79. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  80. datahub/ingestion/source/sql/mssql/source.py +2 -2
  81. datahub/ingestion/source/sql/mysql.py +2 -2
  82. datahub/ingestion/source/sql/oracle.py +3 -3
  83. datahub/ingestion/source/sql/presto.py +2 -1
  84. datahub/ingestion/source/sql/teradata.py +4 -4
  85. datahub/ingestion/source/sql/trino.py +2 -1
  86. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  87. datahub/ingestion/source/sql/vertica.py +1 -1
  88. datahub/ingestion/source/sql_queries.py +6 -6
  89. datahub/ingestion/source/state/checkpoint.py +5 -1
  90. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  91. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  92. datahub/ingestion/source/superset.py +122 -15
  93. datahub/ingestion/source/tableau/tableau.py +68 -14
  94. datahub/ingestion/source/tableau/tableau_common.py +5 -0
  95. datahub/ingestion/source/tableau/tableau_constant.py +1 -0
  96. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  97. datahub/ingestion/source/unity/config.py +7 -3
  98. datahub/ingestion/source/usage/usage_common.py +3 -3
  99. datahub/ingestion/source_config/pulsar.py +3 -1
  100. datahub/ingestion/transformer/set_browse_path.py +112 -0
  101. datahub/metadata/_internal_schema_classes.py +728 -528
  102. datahub/metadata/_urns/urn_defs.py +1702 -1702
  103. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  104. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  105. datahub/metadata/schema.avsc +17434 -17732
  106. datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
  107. datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
  108. datahub/metadata/schemas/LogicalParent.avsc +2 -1
  109. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  110. datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
  111. datahub/sdk/_shared.py +126 -0
  112. datahub/sdk/chart.py +87 -30
  113. datahub/sdk/dashboard.py +79 -34
  114. datahub/sdk/entity_client.py +11 -4
  115. datahub/sdk/lineage_client.py +3 -3
  116. datahub/sdk/search_filters.py +1 -7
  117. datahub/sql_parsing/split_statements.py +13 -0
  118. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
  119. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
  120. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -1,12 +1,13 @@
 import logging
 import os
 import re
+from copy import deepcopy
 from datetime import timedelta
 from typing import Dict, List, Optional, Union

 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -73,8 +74,10 @@ class BigQueryBaseConfig(ConfigModel):
             ) from e
         return v

-    @root_validator(pre=True, skip_on_failure=True)
+    @root_validator(pre=True)
     def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         project_id = values.pop("project_id", None)
         project_ids = values.get("project_ids")

@@ -182,13 +185,14 @@ class BigQueryFilterConfig(SQLFilterConfig):
     )

     # NOTE: `schema_pattern` is added here only to hide it from docs.
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
     )

     @root_validator(pre=False, skip_on_failure=True)
     def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
         schema_pattern = values.get("schema_pattern")
         if (
@@ -320,8 +324,7 @@ class BigQueryV2Config(
         description="Include full payload into events. It is only for debugging and internal use.",
     )

-    number_of_datasets_process_in_batch: int = Field(
-        hidden_from_docs=True,
+    number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(
         default=10000,
         description="Number of table queried in batch when getting metadata. This is a low level config property "
         "which should be touched with care.",
@@ -436,17 +439,15 @@ class BigQueryV2Config(

     upstream_lineage_in_report: bool = Field(
         default=False,
-        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.",
+        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally. Only works with legacy approach (`use_queries_v2: False`).",
     )

-    run_optimized_column_query: bool = Field(
-        hidden_from_docs=True,
+    run_optimized_column_query: HiddenFromDocs[bool] = Field(
         default=False,
         description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.",
     )

-    file_backed_cache_size: int = Field(
-        hidden_from_docs=True,
+    file_backed_cache_size: HiddenFromDocs[int] = Field(
         default=2000,
         description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
     )
@@ -456,10 +457,9 @@ class BigQueryV2Config(
         description="Option to exclude empty projects from being ingested.",
     )

-    schema_resolution_batch_size: int = Field(
+    schema_resolution_batch_size: HiddenFromDocs[int] = Field(
         default=100,
         description="The number of tables to process in a batch when resolving schema from DataHub.",
-        hidden_from_schema=True,
     )

     max_threads_dataset_parallelism: int = Field(
@@ -480,6 +480,8 @@ class BigQueryV2Config(

     @root_validator(pre=True)
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Historically this is used to disable schema ingestion
         if (
             "include_tables" in values
@@ -498,6 +500,8 @@ class BigQueryV2Config(

     @root_validator(skip_on_failure=True)
     def profile_default_settings(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Extra default SQLAlchemy option for better connection pooling and threading.
         # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
         values["options"].setdefault("max_overflow", -1)
@@ -515,9 +519,19 @@ class BigQueryV2Config(

         return v

+    @validator("upstream_lineage_in_report")
+    def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
+        if v and values.get("use_queries_v2", True):
+            logging.warning(
+                "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
+                "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
+            )
+
+        return v
+
     def get_table_pattern(self, pattern: List[str]) -> str:
         return "|".join(pattern) if pattern else ""

-    platform_instance_not_supported_for_bigquery = pydantic_removed_field(
+    _platform_instance_not_supported_for_bigquery = pydantic_removed_field(
         "platform_instance"
     )
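
The recurring `values = deepcopy(values)` lines above exist because a pydantic root validator receives the caller's dict, so popping or rewriting keys in place leaks back into the recipe dict the user passed in (and across tests that reuse it). A minimal sketch of the failure mode and the fix, using a hypothetical pydantic v1-style model rather than the actual DataHub config classes:

from copy import deepcopy
from typing import List

from pydantic import BaseModel, root_validator


class LegacyAliasConfig(BaseModel):  # hypothetical stand-in, not a DataHub class
    project_ids: List[str] = []

    @root_validator(pre=True)
    def migrate_project_id(cls, values: dict) -> dict:
        values = deepcopy(values)  # without this, the caller's dict loses "project_id"
        project_id = values.pop("project_id", None)
        if project_id and not values.get("project_ids"):
            values["project_ids"] = [project_id]
        return values


raw = {"project_id": "my-project"}
LegacyAliasConfig.parse_obj(raw)
assert "project_id" in raw  # the input dict is left untouched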

datahub/ingestion/source/bigquery_v2/bigquery_schema.py

@@ -283,23 +283,30 @@ class BigQuerySchemaApi:
         with self.report.list_datasets_timer:
             self.report.num_list_datasets_api_requests += 1
             datasets = self.bq_client.list_datasets(project_id, max_results=maxResults)
-            return [
-                BigqueryDataset(
-                    name=d.dataset_id,
-                    labels=d.labels,
-                    location=(
-                        d._properties.get("location")
-                        if hasattr(d, "_properties") and isinstance(d._properties, dict)
-                        else None
-                    ),
-                    # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
-                    # TODO: Given we are calling get_dataset for each dataset, we may consume and publish other fields too, such as created, modified, etc...
-                    # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
-                    # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
-                    comment=self.bq_client.get_dataset(d.reference).description,
+            result = []
+            for d in datasets:
+                # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
+                dataset = self.bq_client.get_dataset(d.reference)
+
+                location = (
+                    d._properties.get("location")
+                    if hasattr(d, "_properties") and isinstance(d._properties, dict)
+                    else None
+                )
+
+                result.append(
+                    BigqueryDataset(
+                        name=d.dataset_id,
+                        labels=d.labels,
+                        location=location,
+                        comment=dataset.description,
+                        created=dataset.created,
+                        last_altered=dataset.modified,
+                    )
                 )
-                for d in datasets
-            ]
+            return result

     # This is not used anywhere
     def get_datasets_for_project_id_with_information_schema(

datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

@@ -12,6 +12,7 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
     make_tag_urn,
+    make_ts_millis,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey
@@ -300,6 +301,8 @@ class BigQuerySchemaGenerator:
         description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None,
         extra_properties: Optional[Dict[str, str]] = None,
+        created: Optional[int] = None,
+        last_modified: Optional[int] = None,
     ) -> Iterable[MetadataWorkUnit]:
         schema_container_key = self.gen_dataset_key(project_id, dataset)

@@ -349,6 +352,8 @@ class BigQuerySchemaGenerator:
             ),
             tags=tags_joined,
             extra_properties=extra_properties,
+            created=created,
+            last_modified=last_modified,
         )

     def _process_project(
@@ -484,6 +489,12 @@ class BigQuerySchemaGenerator:
                 else None
             ),
             description=bigquery_dataset.comment,
+            created=make_ts_millis(bigquery_dataset.created)
+            if bigquery_dataset.created
+            else None,
+            last_modified=make_ts_millis(bigquery_dataset.last_altered)
+            if bigquery_dataset.last_altered
+            else None,
         )

         columns = None
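
The created and last_modified values threaded through the hunks above are epoch milliseconds; make_ts_millis (imported from datahub.emitter.mce_builder in this diff) is what converts the BigQuery datetimes. Conceptually the conversion is just seconds to milliseconds, roughly as in this sketch (the helper name here is made up, not the DataHub function):

from datetime import datetime, timezone


def ts_millis(dt: datetime) -> int:
    # assumes a timezone-aware datetime, as the BigQuery client returns
    return int(dt.timestamp() * 1000)


assert ts_millis(datetime(2024, 1, 1, tzinfo=timezone.utc)) == 1704067200000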

datahub/ingestion/source/bigquery_v2/queries_extractor.py

@@ -8,7 +8,7 @@ from typing import Collection, Dict, Iterable, List, Optional, TypedDict
 from google.cloud.bigquery import Client
 from pydantic import Field, PositiveInt

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     get_time_bucket,
@@ -86,12 +86,11 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
     # TODO: Support stateful ingestion for the time windows.
     window: BaseTimeWindowConfig = BaseTimeWindowConfig()

-    local_temp_path: Optional[pathlib.Path] = Field(
-        default=None,
-        description="Local path to store the audit log.",
+    local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = Field(
         # TODO: For now, this is simply an advanced config to make local testing easier.
         # Eventually, we will want to store date-specific files in the directory and use it as a cache.
-        hidden_from_docs=True,
+        default=None,
+        description="Local path to store the audit log.",
     )

     user_email_pattern: AllowDenyPattern = Field(
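
A pattern that repeats across these config files is replacing the hidden_from_docs=True Field kwarg with a HiddenFromDocs[...] type annotation. The real marker is defined in datahub/configuration/common.py (also changed in this release, +29 -1, but not shown in this diff); purely as an illustration of how such an Annotated-based marker can work, under the assumption that doc generation inspects field metadata:

from typing import Annotated, Any


class _HiddenFromDocsMarker:
    """Sentinel stored in a field's metadata so docs generation can skip the field."""


class _HiddenFromDocs:
    # Hypothetical: HiddenFromDocs[int] -> Annotated[int, _HiddenFromDocsMarker]
    def __getitem__(self, item: Any) -> Any:
        return Annotated[item, _HiddenFromDocsMarker]


HiddenFromDocs = _HiddenFromDocs()

# Usage then mirrors the diff, e.g.:
#   file_backed_cache_size: HiddenFromDocs[int] = Field(default=2000, ...)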

datahub/ingestion/source/common/gcp_credentials_config.py

@@ -9,7 +9,9 @@ from datahub.configuration.validate_multiline_string import pydantic_multiline_s


 class GCPCredential(ConfigModel):
-    project_id: Optional[str] = Field(description="Project id to set the credentials")
+    project_id: Optional[str] = Field(
+        None, description="Project id to set the credentials"
+    )
     private_key_id: str = Field(description="Private key id")
     private_key: str = Field(
         description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"

datahub/ingestion/source/data_lake_common/path_spec.py

@@ -11,7 +11,7 @@ from cached_property import cached_property
 from pydantic.fields import Field
 from wcmatch import pathlib

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.azure.abs_utils import is_abs_uri
 from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
@@ -89,63 +89,62 @@ class PathSpec(ConfigModel):
         description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
     )
     exclude: Optional[List[str]] = Field(
-        default=[],
+        [],
         description="list of paths in glob pattern which will be excluded while scanning for the datasets",
     )
     file_types: List[str] = Field(
-        default=SUPPORTED_FILE_TYPES,
+        SUPPORTED_FILE_TYPES,
         description="Files with extenstions specified here (subset of default value) only will be scanned to create dataset. Other files will be omitted.",
     )

     default_extension: Optional[str] = Field(
-        default=None,
+        None,
         description="For files without extension it will assume the specified file type. If it is not set the files without extensions will be skipped.",
     )

     table_name: Optional[str] = Field(
-        default=None,
+        None,
         description="Display name of the dataset.Combination of named variables from include path and strings",
     )

     # This is not used yet, but will be used in the future to sort the partitions
-    sort_key: Optional[SortKey] = Field(
-        hidden_from_docs=True,
-        default=None,
+    sort_key: HiddenFromDocs[Optional[SortKey]] = Field(
+        None,
         description="Sort key to use when sorting the partitions. This is useful when the partitions are not sorted in the order of the data. The key can be a compound key based on the path_spec variables.",
     )

     enable_compression: bool = Field(
-        default=True,
+        True,
         description="Enable or disable processing compressed files. Currently .gz and .bz files are supported.",
     )

     sample_files: bool = Field(
-        default=True,
+        True,
         description="Not listing all the files but only taking a handful amount of sample file to infer the schema. File count and file size calculation will be disabled. This can affect performance significantly if enabled",
     )

     allow_double_stars: bool = Field(
-        default=False,
+        False,
         description="Allow double stars in the include path. This can affect performance significantly if enabled",
     )

     autodetect_partitions: bool = Field(
-        default=True,
+        True,
         description="Autodetect partition(s) from the path. If set to true, it will autodetect partition key/value if the folder format is {partition_key}={partition_value} for example `year=2024`",
     )

     traversal_method: FolderTraversalMethod = Field(
-        default=FolderTraversalMethod.MAX,
+        FolderTraversalMethod.MAX,
         description="Method to traverse the folder. ALL: Traverse all the folders, MIN_MAX: Traverse the folders by finding min and max value, MAX: Traverse the folder with max value",
     )

     include_hidden_folders: bool = Field(
-        default=False,
+        False,
         description="Include hidden folders in the traversal (folders starting with . or _",
     )

     tables_filter_pattern: AllowDenyPattern = Field(
-        default=AllowDenyPattern.allow_all(),
+        AllowDenyPattern.allow_all(),
         description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
     )

@@ -479,7 +478,8 @@ class PathSpec(ConfigModel):
         return glob_include

     @pydantic.root_validator(skip_on_failure=True)
-    def validate_path_spec(cls, values: Dict) -> Dict[str, Any]:
+    @staticmethod
+    def validate_path_spec(values: Dict) -> Dict[str, Any]:
         # validate that main fields are populated
         required_fields = ["include", "file_types", "default_extension"]
         for f in required_fields:

datahub/ingestion/source/datahub/config.py

@@ -4,7 +4,7 @@ from typing import Optional, Set
 import pydantic
 from pydantic import Field, root_validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -98,16 +98,14 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         ),
     )

-    pull_from_datahub_api: bool = Field(
+    pull_from_datahub_api: HiddenFromDocs[bool] = Field(
         default=False,
         description="Use the DataHub API to fetch versioned aspects.",
-        hidden_from_docs=True,
     )

-    max_workers: int = Field(
+    max_workers: HiddenFromDocs[int] = Field(
         default=5 * (os.cpu_count() or 4),
         description="Number of worker threads to use for datahub api ingestion.",
-        hidden_from_docs=True,
     )

     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
@@ -118,10 +116,11 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
     )

-    structured_properties_template_cache_invalidation_interval: int = Field(
-        hidden_from_docs=True,
-        default=60,
-        description="Interval in seconds to invalidate the structured properties template cache.",
+    structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+        Field(
+            default=60,
+            description="Interval in seconds to invalidate the structured properties template cache.",
+        )
     )

     query_timeout: Optional[int] = Field(

datahub/ingestion/source/dbt/dbt_common.py

@@ -246,6 +246,23 @@ class DBTEntitiesEnabled(ConfigModel):
         return self.model_performance == EmitDirective.YES


+class MaterializedNodePatternConfig(ConfigModel):
+    """Configuration for filtering materialized nodes based on their physical location"""
+
+    database_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for database names to filter materialized nodes.",
+    )
+    schema_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for schema names in format '{database}.{schema}' to filter materialized nodes.",
+    )
+    table_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for table/view names in format '{database}.{schema}.{table}' to filter materialized nodes.",
+    )
+
+
 class DBTCommonConfig(
     StatefulIngestionConfigBase,
     PlatformInstanceConfigMixin,
@@ -294,6 +311,11 @@ class DBTCommonConfig(
         default=AllowDenyPattern.allow_all(),
         description="regex patterns for dbt model names to filter in ingestion.",
     )
+    materialized_node_pattern: MaterializedNodePatternConfig = Field(
+        default=MaterializedNodePatternConfig(),
+        description="Advanced filtering for materialized nodes based on their physical database location. "
+        "Provides fine-grained control over database.schema.table patterns for catalog consistency.",
+    )
     meta_mapping: Dict = Field(
         default={},
         description="mapping rules that will be executed against dbt meta properties. Refer to the section below on dbt meta automated mappings.",
@@ -1018,15 +1040,53 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             all_nodes_map,
         )

-    def _is_allowed_node(self, key: str) -> bool:
-        return self.config.node_name_pattern.allowed(key)
+    def _is_allowed_node(self, node: DBTNode) -> bool:
+        """
+        Check whether a node should be processed, using multi-layer rules. Checks for materialized nodes might need to be restricted in the future to some cases
+        """
+        if not self.config.node_name_pattern.allowed(node.dbt_name):
+            return False
+
+        if not self._is_allowed_materialized_node(node):
+            return False
+
+        return True
+
+    def _is_allowed_materialized_node(self, node: DBTNode) -> bool:
+        """Filter nodes based on their materialized database location for catalog consistency"""
+
+        # Database level filtering
+        if not node.database:
+            return True
+        if not self.config.materialized_node_pattern.database_pattern.allowed(
+            node.database
+        ):
+            return False
+
+        # Schema level filtering: {database}.{schema}
+        if not node.schema:
+            return True
+        if not self.config.materialized_node_pattern.schema_pattern.allowed(
+            node._join_parts([node.database, node.schema])
+        ):
+            return False
+
+        # Table level filtering: {database}.{schema}.{table}
+        if not node.name:
+            return True
+        if not self.config.materialized_node_pattern.table_pattern.allowed(
+            node.get_db_fqn()
+        ):
+            return False
+
+        return True

     def _filter_nodes(self, all_nodes: List[DBTNode]) -> List[DBTNode]:
         nodes: List[DBTNode] = []
         for node in all_nodes:
             key = node.dbt_name

-            if not self._is_allowed_node(key):
+            if not self._is_allowed_node(node):
                 self.report.nodes_filtered.append(key)
                 continue

@@ -1118,8 +1178,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             cll_nodes.add(dbt_name)
             schema_nodes.add(dbt_name)

-        for dbt_name in all_nodes_map:
-            if self._is_allowed_node(dbt_name):
+        for dbt_name, dbt_node in all_nodes_map.items():
+            if self._is_allowed_node(dbt_node):
                 add_node_to_cll_list(dbt_name)

         return schema_nodes, cll_nodes
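
The new materialized_node_pattern block above filters dbt nodes by where they are materialized, checking the database, then "{database}.{schema}", then "{database}.{schema}.{table}" against the three AllowDenyPattern fields. A rough sketch of how those patterns behave, using made-up names (the pattern values here are examples, not defaults):

from datahub.configuration.common import AllowDenyPattern

# Deny anything materialized into a staging schema; everything else stays allowed.
schema_pattern = AllowDenyPattern(deny=[r"analytics\.staging_.*"])

# _is_allowed_materialized_node checks "{database}.{schema}" against this pattern:
assert schema_pattern.allowed("analytics.marts")
assert not schema_pattern.allowed("analytics.staging_tmp")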

datahub/ingestion/source/delta_lake/config.py

@@ -78,7 +78,7 @@ class DeltaLakeSourceConfig(
         "When set to `False`, number_of_files in delta table can not be reported.",
     )

-    s3: Optional[S3] = Field()
+    s3: Optional[S3] = Field(None)

     @cached_property
     def is_s3(self):

datahub/ingestion/source/dremio/dremio_config.py

@@ -4,7 +4,7 @@ from typing import List, Literal, Optional
 import certifi
 from pydantic import Field, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -100,10 +100,9 @@ class ProfileConfig(GEProfilingBaseConfig):
     query_timeout: int = Field(
         default=300, description="Time before cancelling Dremio profiling query"
     )
-    include_field_median_value: bool = Field(
+    include_field_median_value: HiddenFromDocs[bool] = Field(
+        # Hidden because median causes a number of issues in Dremio.
         default=False,
-        hidden_from_docs=True,
-        description="Median causes a number of issues in Dremio.",
     )


datahub/ingestion/source/feast.py

@@ -1,5 +1,6 @@
+import pathlib
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Union

 import feast.types
 from feast import (
@@ -97,7 +98,7 @@ class FeastRepositorySourceConfig(
     StatefulIngestionConfigBase,
 ):
     path: str = Field(description="Path to Feast repository")
-    fs_yaml_file: Optional[str] = Field(
+    fs_yaml_file: Optional[pathlib.Path] = Field(
         default=None,
         description="Path to the `feature_store.yaml` file used to configure the feature store",
     )
@@ -142,17 +143,14 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     - Column types associated with each entity and feature
     """

-    platform = "feast"
-    source_config: FeastRepositorySourceConfig
-    report: StaleEntityRemovalSourceReport
-    feature_store: FeatureStore
+    platform: ClassVar[str] = "feast"

     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
-        self.source_config = config
-        self.ctx = ctx
-        self.report = StaleEntityRemovalSourceReport()
-        self.feature_store = FeatureStore(
+        self.source_config: FeastRepositorySourceConfig = config
+        self.ctx: PipelineContext = ctx
+        self.report: StaleEntityRemovalSourceReport = StaleEntityRemovalSourceReport()
+        self.feature_store: FeatureStore = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
         )

datahub/ingestion/source/fivetran/config.py

@@ -102,7 +102,7 @@ class FivetranLogConfig(ConfigModel):
         "destination_config", "snowflake_destination_config"
     )

-    @root_validator(pre=True)
+    @root_validator(skip_on_failure=True)
     def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict:
         destination_platform = values["destination_platform"]
         if destination_platform == "snowflake":

datahub/ingestion/source/gcs/gcs_source.py

@@ -37,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (

 logger: logging.Logger = logging.getLogger(__name__)

+GCS_ENDPOINT_URL = "https://storage.googleapis.com"
+

 class HMACKey(ConfigModel):
     hmac_access_id: str = Field(description="Access ID")
@@ -112,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
         s3_config = DataLakeSourceConfig(
             path_specs=s3_path_specs,
             aws_config=AwsConnectionConfig(
-                aws_endpoint_url="https://storage.googleapis.com",
+                aws_endpoint_url=GCS_ENDPOINT_URL,
                 aws_access_key_id=self.config.credential.hmac_access_id,
                 aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
                 aws_region="auto",
@@ -121,15 +123,25 @@ class GCSSource(StatefulIngestionSourceBase):
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
             platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
+            platform_instance=self.config.platform_instance,
         )
         return s3_config

     def create_equivalent_s3_path_specs(self):
         s3_path_specs = []
         for path_spec in self.config.path_specs:
+            # PathSpec modifies the passed-in include to add /** to the end if
+            # autodetecting partitions. Remove that, otherwise creating a new
+            # PathSpec will complain.
+            # TODO: this should be handled inside PathSpec, which probably shouldn't
+            # modify its input.
+            include = path_spec.include
+            if include.endswith("{table}/**") and not path_spec.allow_double_stars:
+                include = include.removesuffix("**")
+
             s3_path_specs.append(
                 PathSpec(
-                    include=path_spec.include.replace("gs://", "s3://"),
+                    include=include.replace("gs://", "s3://"),
                     exclude=(
                         [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
                         if path_spec.exclude
@@ -140,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
                     table_name=path_spec.table_name,
                     enable_compression=path_spec.enable_compression,
                     sample_files=path_spec.sample_files,
+                    allow_double_stars=path_spec.allow_double_stars,
+                    autodetect_partitions=path_spec.autodetect_partitions,
+                    include_hidden_folders=path_spec.include_hidden_folders,
+                    tables_filter_pattern=path_spec.tables_filter_pattern,
+                    traversal_method=path_spec.traversal_method,
                 )
             )
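
The include-trimming added in create_equivalent_s3_path_specs undoes the "/**" suffix that PathSpec appends when autodetecting partitions, so the path spec can be rebuilt for S3 without tripping validation. The string handling amounts to this (bucket name made up; removesuffix requires Python 3.9+):

include = "gs://my-bucket/data/{table}/**"
if include.endswith("{table}/**"):
    include = include.removesuffix("**")
assert include == "gs://my-bucket/data/{table}/"
assert include.replace("gs://", "s3://") == "s3://my-bucket/data/{table}/"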