acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84)
  1. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2562 -2476
  2. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
  3. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +46 -6
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  11. datahub/ingestion/source/common/subtypes.py +3 -0
  12. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  13. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  14. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  15. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  16. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  17. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  18. datahub/ingestion/source/excel/__init__.py +0 -0
  19. datahub/ingestion/source/excel/config.py +92 -0
  20. datahub/ingestion/source/excel/excel_file.py +539 -0
  21. datahub/ingestion/source/excel/profiling.py +308 -0
  22. datahub/ingestion/source/excel/report.py +49 -0
  23. datahub/ingestion/source/excel/source.py +662 -0
  24. datahub/ingestion/source/excel/util.py +18 -0
  25. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  26. datahub/ingestion/source/openapi.py +1 -1
  27. datahub/ingestion/source/powerbi/config.py +33 -0
  28. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  29. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  30. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  31. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  32. datahub/ingestion/source/redshift/config.py +9 -6
  33. datahub/ingestion/source/redshift/lineage.py +386 -687
  34. datahub/ingestion/source/redshift/redshift.py +19 -106
  35. datahub/ingestion/source/s3/source.py +65 -59
  36. datahub/ingestion/source/snowflake/constants.py +2 -0
  37. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  38. datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
  39. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  40. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  41. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  42. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
  43. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  44. datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
  45. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
  46. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  47. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  48. datahub/ingestion/source/sql/mssql/source.py +62 -3
  49. datahub/ingestion/source/sql_queries.py +24 -2
  50. datahub/ingestion/source/state/checkpoint.py +3 -28
  51. datahub/ingestion/source/unity/config.py +74 -9
  52. datahub/ingestion/source/unity/proxy.py +167 -5
  53. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  54. datahub/ingestion/source/unity/proxy_types.py +24 -0
  55. datahub/ingestion/source/unity/report.py +5 -0
  56. datahub/ingestion/source/unity/source.py +111 -1
  57. datahub/ingestion/source/usage/usage_common.py +1 -0
  58. datahub/metadata/_internal_schema_classes.py +573 -517
  59. datahub/metadata/_urns/urn_defs.py +1748 -1748
  60. datahub/metadata/schema.avsc +18564 -18484
  61. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  62. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  63. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  64. datahub/metadata/schemas/LogicalParent.avsc +104 -100
  65. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  66. datahub/metadata/schemas/Ownership.avsc +69 -0
  67. datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
  68. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  69. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  70. datahub/metadata/schemas/__init__.py +3 -3
  71. datahub/sdk/chart.py +36 -22
  72. datahub/sdk/dashboard.py +38 -62
  73. datahub/sdk/lineage_client.py +6 -26
  74. datahub/sdk/main_client.py +7 -3
  75. datahub/sdk/search_filters.py +16 -0
  76. datahub/specific/aspect_helpers/siblings.py +73 -0
  77. datahub/specific/dataset.py +2 -0
  78. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  79. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  80. datahub/upgrade/upgrade.py +14 -2
  81. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  82. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
  83. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
  84. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0

datahub/ingestion/autogenerated/lineage.json
@@ -192,7 +192,8 @@
       "relationship": {
         "name": "Consumes",
         "entityTypes": [
-          "dataset"
+          "dataset",
+          "chart"
         ],
         "isLineage": true
       }
@@ -397,5 +398,5 @@
     }
   },
   "generated_by": "metadata-ingestion/scripts/modeldocgen.py",
-  "generated_at": "2025-07-01T10:49:03.713749+00:00"
+  "generated_at": "2025-08-05T19:29:49.306404+00:00"
 }

datahub/ingestion/run/pipeline.py
@@ -639,6 +639,7 @@ class Pipeline:
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []
                 ],
+                "extractor_type": self.config.source.extractor,
                 "records_written": stats.discretize(
                     self.sink.get_report().total_records_written
                 ),

datahub/ingestion/source/aws/s3_boto_utils.py
@@ -1,5 +1,6 @@
 import logging
-from typing import Iterable, Optional, Union
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Iterable, Optional, Union
 
 from datahub.emitter.mce_builder import make_tag_urn
 from datahub.ingestion.api.common import PipelineContext
@@ -11,9 +12,14 @@ from datahub.ingestion.source.aws.s3_util import (
 )
 from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
 
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
+
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
+LIST_OBJECTS_PAGE_SIZE = 1000
+
 
 def get_s3_tags(
     bucket_name: str,
@@ -74,16 +80,79 @@ def get_s3_tags(
     return new_tags
 
 
+@dataclass
+class DirEntry:
+    """
+    Intended to be similar to os.DirEntry, which contains a name, full path, and possibly
+    other attributes of a directory entry. Currently only used to represent S3 folder-like
+    paths.
+    """
+
+    name: str
+    path: str
+
+
 def list_folders_path(
-    s3_uri: str, aws_config: Optional[AwsConnectionConfig]
-) -> Iterable[str]:
+    s3_uri: str,
+    *,
+    startswith: str = "",
+    aws_config: Optional[AwsConnectionConfig] = None,
+) -> Iterable[DirEntry]:
+    """
+    Given an S3 URI to a folder or bucket, return all sub-folders underneath that URI,
+    optionally filtering by startswith. Returned entries never contain a trailing slash.
+    """
+
+    if not is_s3_uri(s3_uri):
+        raise ValueError("Not a s3 URI: " + s3_uri)
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+
+    if not s3_uri.endswith("/"):
+        s3_uri += "/"
+
+    bucket_name = get_bucket_name(s3_uri)
+    if not bucket_name:
+        # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
+        # prefix.
+        for folder in list_buckets(startswith, aws_config):
+            yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
+        return
+
+    prefix = get_bucket_relative_path(s3_uri) + startswith
+    for folder in list_folders(bucket_name, prefix, aws_config):
+        folder = folder.removesuffix("/").split("/")[-1]
+        yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
+
+
+def list_objects_recursive_path(
+    s3_uri: str, *, startswith: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable["ObjectSummary"]:
+    """
+    Given an S3 URI to a folder or bucket, return all objects underneath that URI, optionally
+    filtering by startswith.
+    """
+
     if not is_s3_uri(s3_uri):
         raise ValueError("Not a s3 URI: " + s3_uri)
     if aws_config is None:
         raise ValueError("aws_config not set. Cannot browse s3")
+    if startswith and "/" in startswith:
+        raise ValueError(f"startswith contains forward slash: {repr(startswith)}")
+
+    if not s3_uri.endswith("/"):
+        s3_uri += "/"
+
     bucket_name = get_bucket_name(s3_uri)
-    prefix = get_bucket_relative_path(s3_uri)
-    yield from list_folders(bucket_name, prefix, aws_config)
+    if not bucket_name:
+        # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
+        # prefix.
+        for bucket_name in list_buckets(startswith, aws_config):
+            yield from list_objects_recursive(bucket_name, "", aws_config)
+        return
+
+    prefix = get_bucket_relative_path(s3_uri) + startswith
+    yield from list_objects_recursive(bucket_name, prefix, aws_config)
 
 
 def list_folders(
@@ -99,3 +168,26 @@ def list_folders(
         if folder.endswith("/"):
             folder = folder[:-1]
         yield f"{folder}"
+
+
+def list_buckets(
+    prefix: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable[str]:
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+    s3_client = aws_config.get_s3_client()
+    paginator = s3_client.get_paginator("list_buckets")
+    for page in paginator.paginate(Prefix=prefix):
+        for o in page.get("Buckets", []):
+            yield str(o.get("Name"))
+
+
+def list_objects_recursive(
+    bucket_name: str, prefix: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable["ObjectSummary"]:
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+    s3_resource = aws_config.get_s3_resource()
+    bucket = s3_resource.Bucket(bucket_name)
+    for obj in bucket.objects.filter(Prefix=prefix).page_size(LIST_OBJECTS_PAGE_SIZE):
+        yield obj
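
As a usage note, here is a minimal sketch of how these new helpers might be driven. The bucket URI and region below are placeholders, not values from this release; DirEntry, list_folders_path, and list_objects_recursive_path are the symbols added above.

    from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
    from datahub.ingestion.source.aws.s3_boto_utils import (
        list_folders_path,
        list_objects_recursive_path,
    )

    # Placeholder connection config; any valid AwsConnectionConfig works here.
    aws_config = AwsConnectionConfig(aws_region="us-east-1")

    # Enumerate folder-like prefixes one level below the URI; each DirEntry carries
    # a short name plus the full s3:// path, with no trailing slash.
    for entry in list_folders_path("s3://example-bucket/raw/", aws_config=aws_config):
        print(entry.name, entry.path)

    # Walk every object under the URI, optionally narrowing by a name prefix.
    for obj in list_objects_recursive_path(
        "s3://example-bucket/raw/", startswith="events", aws_config=aws_config
    ):
        print(obj.key, obj.size)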

datahub/ingestion/source/bigquery_v2/bigquery_connection.py
@@ -2,16 +2,23 @@ import logging
 import os
 from typing import Any, Dict, Optional
 
+from google.api_core.client_info import ClientInfo
 from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
 from google.cloud.logging_v2.client import Client as GCPLoggingClient
 from pydantic import Field, PrivateAttr
 
+from datahub._version import __version__
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
 
 logger = logging.getLogger(__name__)
 
 
+def _get_bigquery_client_info() -> ClientInfo:
+    """Get ClientInfo with DataHub user-agent for BigQuery client identification"""
+    return ClientInfo(user_agent=f"datahub/{__version__}")
+
+
 class BigQueryConnectionConfig(ConfigModel):
     credential: Optional[GCPCredential] = Field(
         default=None, description="BigQuery credential informations"
@@ -41,7 +48,11 @@ class BigQueryConnectionConfig(ConfigModel):
 
     def get_bigquery_client(self) -> bigquery.Client:
         client_options = self.extra_client_options
-        return bigquery.Client(self.project_on_behalf, **client_options)
+        return bigquery.Client(
+            self.project_on_behalf,
+            client_info=_get_bigquery_client_info(),
+            **client_options,
+        )
 
     def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
         return resourcemanager_v3.ProjectsClient()
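
In effect, every BigQuery call made by the connector now carries a datahub/<version> user agent. A minimal sketch of the same pattern outside the config class, with an illustrative project id and version string:

    from google.api_core.client_info import ClientInfo
    from google.cloud import bigquery

    # Illustrative values; the connector derives the version from datahub._version.
    client_info = ClientInfo(user_agent="datahub/1.2.0.7")
    client = bigquery.Client("my-gcp-project", client_info=client_info)
    # Requests issued through `client` now identify themselves as DataHub traffic.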

datahub/ingestion/source/common/subtypes.py
@@ -54,6 +54,8 @@ class DatasetContainerSubTypes(StrEnum):
     ABS_CONTAINER = "ABS container"
     KEYSPACE = "Keyspace"  # Cassandra
     NAMESPACE = "Namespace"  # Iceberg
+    DREMIO_SPACE = "Dremio Space"
+    DREMIO_SOURCE = "Dremio Source"
 
 
 class BIContainerSubTypes(StrEnum):
@@ -63,6 +65,7 @@ class BIContainerSubTypes(StrEnum):
     TABLEAU_SITE = "Site"
     TABLEAU_PROJECT = "Project"
     TABLEAU_WORKBOOK = "Workbook"
+    POWERBI_WORKSPACE = "Workspace"
     POWERBI_DATASET = "Semantic Model"
     POWERBI_DATASET_TABLE = "Table"
     QLIK_SPACE = "Qlik Space"

datahub/ingestion/source/data_lake_common/path_spec.py
@@ -563,7 +563,7 @@ class PathSpec(ConfigModel):
     def extract_table_name_and_path(self, path: str) -> Tuple[str, str]:
         parsed_vars = self.get_named_vars(path)
         if parsed_vars is None or "table" not in parsed_vars.named:
-            return os.path.basename(path), path
+            return os.path.basename(path.removesuffix("/")), path
         else:
             include = self.include
             depth = include.count("/", 0, include.find("{table}"))
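
The reason for the removesuffix change: os.path.basename returns an empty string for folder-style paths that end in a slash, so table names derived from such paths came back blank. A quick illustration:

    import os

    # Trailing slash: basename of ".../events/" is "", so the table name was empty.
    print(os.path.basename("s3://bucket/events/"))                     # ""
    # Stripping the slash first recovers the folder name as the table name.
    print(os.path.basename("s3://bucket/events/".removesuffix("/")))   # "events"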

datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -104,6 +104,22 @@ class DataHubDatabaseReader:
         ORDER BY mav.urn
     """
 
+    def _get_json_extract_expression(self) -> str:
+        """
+        Returns the appropriate JSON extraction expression based on the database dialect.
+
+        Returns:
+            Database-specific JSON extraction expression
+        """
+        # Return the correct JSON extraction expression for the "removed" field,
+        # depending on the database dialect.
+        if self.engine.dialect.name == "postgresql":
+            # For PostgreSQL, cast the metadata column to JSON and extract the 'removed' key as boolean.
+            return "((metadata::json)->>'removed')::boolean"
+        else:
+            # For other databases (e.g., MySQL), use JSON_EXTRACT.
+            return "JSON_EXTRACT(metadata, '$.removed')"
+
     def query(self, set_structured_properties_filter: bool) -> str:
         """
         Main query that gets data for specified date range with appropriate filters.
@@ -125,7 +141,7 @@ class DataHubDatabaseReader:
         LEFT JOIN (
             SELECT
                 *,
-                JSON_EXTRACT(metadata, '$.removed') as removed
+                {self._get_json_extract_expression()} as removed
             FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)}
             WHERE aspect = 'status'
                 AND version = 0
@@ -241,15 +257,10 @@ class DataHubDatabaseReader:
             "end_createdon": end_date.strftime(DATETIME_FORMAT),
             "limit": limit,
             "offset": offset,
+            # Always pass exclude_aspects as a tuple, postgres doesn't support lists
+            "exclude_aspects": tuple(self.config.exclude_aspects),
         }
 
-        # Add exclude_aspects if needed
-        if (
-            hasattr(self.config, "exclude_aspects")
-            and self.config.exclude_aspects
-        ):
-            params["exclude_aspects"] = tuple(self.config.exclude_aspects)
-
         logger.info(
             f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
             f"with limit {limit} and offset {offset} (inclusive range)"

datahub/ingestion/source/dbt/dbt_common.py
@@ -91,6 +91,7 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipSourceTypeClass,
     OwnershipTypeClass,
+    SiblingsClass,
     StatusClass,
     SubTypesClass,
     TagAssociationClass,
@@ -98,6 +99,7 @@ from datahub.metadata.schema_classes import (
     ViewPropertiesClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
 from datahub.sql_parsing.sqlglot_lineage import (
     SqlParsingDebugInfo,
@@ -374,6 +376,14 @@ class DBTCommonConfig(
         "Set to False to skip it for engines like AWS Athena where it's not required.",
     )
 
+    dbt_is_primary_sibling: bool = Field(
+        default=True,
+        description="Experimental: Controls sibling relationship primary designation between dbt entities and target platform entities. "
+        "When True (default), dbt entities are primary and target platform entities are secondary. "
+        "When False, target platform entities are primary and dbt entities are secondary. "
+        "Uses aspect patches for precise control. Requires DataHub server 1.3.0+.",
+    )
+
     drop_duplicate_sources: bool = Field(
         default=True,
         description="When enabled, drops sources that have the same name in the target platform as a model. "
@@ -1476,6 +1486,23 @@ class DBTSourceBase(StatefulIngestionSourceBase):
         dataset_snapshot = DatasetSnapshot(
             urn=node_datahub_urn, aspects=list(snapshot_aspects)
         )
+        # Emit sibling aspect for dbt entity (dbt is authoritative source for sibling relationships)
+        if self._should_create_sibling_relationships(node):
+            # Get the target platform URN
+            target_platform_urn = node.get_urn(
+                self.config.target_platform,
+                self.config.env,
+                self.config.target_platform_instance,
+            )
+
+            yield MetadataChangeProposalWrapper(
+                entityUrn=node_datahub_urn,
+                aspect=SiblingsClass(
+                    siblings=[target_platform_urn],
+                    primary=self.config.dbt_is_primary_sibling,
+                ),
+            ).as_workunit()
+
         mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
         if self.config.write_semantics == "PATCH":
             mce = self.get_patched_mce(mce)
@@ -1579,6 +1606,31 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             if not node.exists_in_target_platform:
                 continue
 
+            # Emit sibling patch for target platform entity BEFORE any other aspects.
+            # This ensures the hook can detect explicit primary settings when processing later aspects.
+            if self._should_create_sibling_relationships(node):
+                # Get the dbt platform URN
+                dbt_platform_urn = node.get_urn(
+                    DBT_PLATFORM,
+                    self.config.env,
+                    self.config.platform_instance,
+                )
+
+                # Create patch for target platform entity (make it primary when dbt_is_primary_sibling=False)
+                target_patch = DatasetPatchBuilder(node_datahub_urn)
+                target_patch.add_sibling(
+                    dbt_platform_urn, primary=not self.config.dbt_is_primary_sibling
+                )
+
+                yield from auto_workunit(
+                    MetadataWorkUnit(
+                        id=MetadataWorkUnit.generate_workunit_id(mcp),
+                        mcp_raw=mcp,
+                        is_primary_source=False,  # Not authoritative over warehouse metadata
+                    )
+                    for mcp in target_patch.build()
+                )
+
             # This code block is run when we are generating entities of platform type.
             # We will not link the platform not to the dbt node for type "source" because
             # in this case the platform table existed first.
@@ -2134,5 +2186,27 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                 term_id_set.add(existing_term.urn)
         return [GlossaryTermAssociation(term_urn) for term_urn in sorted(term_id_set)]
 
+    def _should_create_sibling_relationships(self, node: DBTNode) -> bool:
+        """
+        Determines whether to emit sibling relationships for a dbt node.
+
+        Sibling relationships (both dbt entity's aspect and target entity's patch) are only
+        emitted when dbt_is_primary_sibling=False to establish explicit primary/secondary
+        relationships. When dbt_is_primary_sibling=True,
+        the SiblingAssociationHook handles sibling creation automatically.
+
+        Args:
+            node: The dbt node to evaluate
+
+        Returns:
+            True if sibling patches should be emitted for this node
+        """
+        # Only create siblings for entities that exist in target platform
+        if not node.exists_in_target_platform:
+            return False
+
+        # Only emit patches when explicit primary/secondary control is needed
+        return self.config.dbt_is_primary_sibling is False
+
     def get_report(self):
         return self.report
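
The patch path relies on the DatasetPatchBuilder.add_sibling helper added elsewhere in this release (datahub/specific/aspect_helpers/siblings.py in the file list). A minimal sketch of emitting the same kind of patch directly, with hypothetical URNs:

    from datahub.emitter.mce_builder import make_dataset_urn
    from datahub.specific.dataset import DatasetPatchBuilder

    # Hypothetical dataset URNs for illustration.
    warehouse_urn = make_dataset_urn("snowflake", "analytics.public.orders")
    dbt_urn = make_dataset_urn("dbt", "analytics.public.orders")

    # Mark the warehouse entity as the primary sibling of its dbt counterpart,
    # mirroring what the source does when dbt_is_primary_sibling is False.
    patch = DatasetPatchBuilder(warehouse_urn)
    patch.add_sibling(dbt_urn, primary=True)

    for mcp in patch.build():
        print(mcp.entityUrn, mcp.aspectName)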

datahub/ingestion/source/dremio/dremio_aspects.py
@@ -14,6 +14,7 @@ from datahub.emitter.mce_builder import (
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
 from datahub.ingestion.source.dremio.dremio_entities import (
     DremioContainer,
     DremioDataset,
@@ -364,9 +365,9 @@ class DremioAspects:
     ) -> Optional[BrowsePathsV2Class]:
         paths = []
 
-        if entity.subclass == "Dremio Space":
+        if entity.subclass == DatasetContainerSubTypes.DREMIO_SPACE.value:
             paths.append(BrowsePathEntryClass(id="Spaces"))
-        elif entity.subclass == "Dremio Source":
+        elif entity.subclass == DatasetContainerSubTypes.DREMIO_SOURCE.value:
             paths.append(BrowsePathEntryClass(id="Sources"))
         if paths:
             return BrowsePathsV2Class(path=paths)

datahub/ingestion/source/dremio/dremio_source.py
@@ -90,6 +90,10 @@ class DremioSourceMapEntry:
 @capability(
     SourceCapability.CONTAINERS,
     "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DREMIO_SPACE,
+        SourceCapabilityModifier.DREMIO_SOURCE,
+    ],
 )
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")

datahub/ingestion/source/dynamodb/dynamodb.py
@@ -12,7 +12,7 @@ from typing import (
     Union,
 )
 
-from pydantic.fields import Field
+from pydantic import Field, PositiveInt
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -73,7 +73,6 @@ from datahub.utilities.registries.domain_registry import DomainRegistry
 
 MAX_ITEMS_TO_RETRIEVE = 100
 PAGE_SIZE = 100
-MAX_SCHEMA_SIZE = 300
 MAX_PRIMARY_KEYS_SIZE = 100
 FIELD_DELIMITER = "."
 
@@ -107,6 +106,10 @@ class DynamoDBConfig(
         'Refer "Advanced Configurations" section for more details',
     )
 
+    max_schema_size: PositiveInt = Field(
+        default=300, description="Maximum number of fields to include in the schema."
+    )
+
     table_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for tables to filter in ingestion. The table name format is 'region.table'",
@@ -455,25 +458,25 @@ class DynamoDBSource(StatefulIngestionSourceBase):
     ) -> SchemaMetadataClass:
         """ "
         To construct the schema metadata, it will first sort the schema by the occurrence of attribute names
-        in descending order and truncate the schema by MAX_SCHEMA_SIZE, and then start to construct the
+        in descending order and truncate the schema by max_schema_size, and then start to construct the
         schema metadata sorted by attribute name
         """
 
         canonical_schema: List[SchemaField] = []
         schema_size = len(schema.values())
         table_fields = list(schema.values())
-        if schema_size > MAX_SCHEMA_SIZE:
+        if schema_size > self.config.max_schema_size:
             # downsample the schema, using frequency as the sort key
             self.report.report_warning(
                 title="Schema Size Too Large",
-                message=f"Downsampling the table schema because MAX_SCHEMA_SIZE threshold is {MAX_SCHEMA_SIZE}",
+                message=f"Downsampling the table schema because `max_schema_size` threshold is {self.config.max_schema_size}",
                 context=f"Collection: {dataset_urn}",
             )
 
             # Add this information to the custom properties so user can know they are looking at down sampled schema
             dataset_properties.customProperties["schema.downsampled"] = "True"
             dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
-        # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include MAX_SCHEMA_SIZE items
+        # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include max_schema_size items
         primary_keys = []
         for schema_field in sorted(
             table_fields,
@@ -481,7 +484,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
                 -x["count"],
                 x["delimited_name"],
             ),  # Negate `count` for descending order, `delimited_name` stays the same for ascending
-        )[0:MAX_SCHEMA_SIZE]:
+        )[: self.config.max_schema_size]:
             field_path = schema_field["delimited_name"]
             native_data_type = self.get_native_type(schema_field["type"], table_name)
             type = self.get_field_type(schema_field["type"], table_name)
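
For clarity, the downsampling rule that max_schema_size now controls is a sort by attribute frequency (descending), then name (ascending), followed by a slice. A small self-contained sketch with made-up field counts:

    # Stand-in for DynamoDBConfig.max_schema_size (default 300).
    max_schema_size = 3
    table_fields = [
        {"delimited_name": "id", "count": 100},
        {"delimited_name": "name", "count": 90},
        {"delimited_name": "price", "count": 40},
        {"delimited_name": "notes", "count": 5},
    ]

    # Most frequent attributes are kept; ties fall back to alphabetical order.
    kept = sorted(table_fields, key=lambda x: (-x["count"], x["delimited_name"]))
    kept = kept[:max_schema_size]
    print([f["delimited_name"] for f in kept])  # ['id', 'name', 'price']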

datahub/ingestion/source/excel/__init__.py
File without changes (new empty file)

datahub/ingestion/source/excel/config.py (new file)
@@ -0,0 +1,92 @@
+from typing import List, Optional, Union
+
+from pydantic.fields import Field
+
+from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.azure.azure_common import AzureConnectionConfig
+from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)
+from datahub.ingestion.source_config.operation_config import is_profiling_enabled
+
+
+class ExcelSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
+    path_list: List[str] = Field(
+        description="List of paths to Excel files or folders to ingest."
+    )
+
+    path_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for file paths to filter in ingestion.",
+    )
+
+    aws_config: Optional[AwsConnectionConfig] = Field(
+        default=None, description="AWS configuration"
+    )
+
+    use_s3_bucket_tags: Optional[bool] = Field(
+        default=False,
+        description="Whether or not to create tags in datahub from the s3 bucket",
+    )
+
+    use_s3_object_tags: Optional[bool] = Field(
+        default=False,
+        description="Whether or not to create tags in datahub from the s3 object",
+    )
+
+    verify_ssl: Union[bool, str] = Field(
+        default=True,
+        description="Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use.",
+    )
+
+    azure_config: Optional[AzureConnectionConfig] = Field(
+        default=None, description="Azure configuration"
+    )
+
+    use_abs_blob_tags: Optional[bool] = Field(
+        default=False,
+        description="Whether to create tags in datahub from the abs blob tags",
+    )
+
+    convert_urns_to_lowercase: bool = Field(
+        default=False,
+        description="Enable to convert the Excel asset urns to lowercase",
+    )
+
+    active_sheet_only: bool = Field(
+        default=False,
+        description="Enable to only ingest the active sheet of the workbook. If not set, all sheets will be ingested.",
+    )
+
+    worksheet_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for worksheets to ingest. Worksheets are specified as 'filename_without_extension.worksheet_name'. "
+        "For example to allow the worksheet Sheet1 from file report.xlsx, use the pattern: 'report.Sheet1'.",
+    )
+
+    profile_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for worksheets to profile. Worksheets are specified as 'filename_without_extension.worksheet_name'. "
+        "For example to allow the worksheet Sheet1 from file report.xlsx, use the pattern: 'report.Sheet1'.",
+    )
+
+    profiling: GEProfilingConfig = Field(
+        default=GEProfilingConfig(),
+        description="Configuration for profiling",
+    )
+
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
+        default=None,
+        description="Configuration for stateful ingestion and stale metadata removal.",
+    )
+
+    def is_profiling_enabled(self) -> bool:
+        return self.profiling.enabled and is_profiling_enabled(
+            self.profiling.operation_config
+        )