acryl-datahub 0.15.0.4rc2__py3-none-any.whl → 0.15.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (96)
  1. acryl_datahub-0.15.0.5.dist-info/LICENSE +202 -0
  2. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2444 -2404
  3. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +96 -86
  4. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
  5. datahub/__init__.py +1 -25
  6. datahub/_version.py +13 -0
  7. datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
  8. datahub/cli/check_cli.py +1 -1
  9. datahub/cli/cli_utils.py +3 -3
  10. datahub/cli/container_cli.py +1 -64
  11. datahub/cli/iceberg_cli.py +707 -0
  12. datahub/cli/ingest_cli.py +2 -2
  13. datahub/emitter/composite_emitter.py +36 -0
  14. datahub/emitter/rest_emitter.py +1 -1
  15. datahub/entrypoints.py +26 -5
  16. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  17. datahub/ingestion/api/registry.py +4 -2
  18. datahub/ingestion/glossary/classification_mixin.py +6 -0
  19. datahub/ingestion/glossary/classifier.py +3 -2
  20. datahub/ingestion/graph/client.py +2 -1
  21. datahub/ingestion/graph/entity_versioning.py +201 -0
  22. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  23. datahub/ingestion/run/connection.py +1 -1
  24. datahub/ingestion/run/pipeline.py +3 -3
  25. datahub/ingestion/source/abs/report.py +2 -2
  26. datahub/ingestion/source/apply/__init__.py +0 -0
  27. datahub/ingestion/source/apply/datahub_apply.py +223 -0
  28. datahub/ingestion/source/aws/glue.py +15 -6
  29. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  30. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  31. datahub/ingestion/source/dbt/dbt_core.py +1 -1
  32. datahub/ingestion/source/delta_lake/report.py +2 -2
  33. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  34. datahub/ingestion/source/elastic_search.py +2 -1
  35. datahub/ingestion/source/ge_profiling_config.py +11 -7
  36. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  37. datahub/ingestion/source/identity/azure_ad.py +6 -14
  38. datahub/ingestion/source/identity/okta.py +2 -1
  39. datahub/ingestion/source/kafka/kafka.py +2 -1
  40. datahub/ingestion/source/kafka_connect/common.py +2 -1
  41. datahub/ingestion/source/ldap.py +2 -1
  42. datahub/ingestion/source/looker/looker_config.py +3 -1
  43. datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
  44. datahub/ingestion/source/looker/looker_file_loader.py +14 -3
  45. datahub/ingestion/source/looker/looker_template_language.py +104 -14
  46. datahub/ingestion/source/looker/lookml_config.py +29 -8
  47. datahub/ingestion/source/looker/lookml_source.py +110 -22
  48. datahub/ingestion/source/mode.py +2 -4
  49. datahub/ingestion/source/mongodb.py +2 -1
  50. datahub/ingestion/source/nifi.py +2 -1
  51. datahub/ingestion/source/powerbi/config.py +2 -2
  52. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  53. datahub/ingestion/source/redash.py +5 -5
  54. datahub/ingestion/source/salesforce.py +4 -1
  55. datahub/ingestion/source/slack/slack.py +6 -0
  56. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  57. datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
  58. datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
  59. datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
  60. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
  61. datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
  62. datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
  63. datahub/ingestion/source/sql/clickhouse.py +5 -43
  64. datahub/ingestion/source/sql/mssql/job_models.py +37 -8
  65. datahub/ingestion/source/sql/mssql/source.py +17 -0
  66. datahub/ingestion/source/sql/sql_config.py +0 -10
  67. datahub/ingestion/source/tableau/tableau.py +16 -13
  68. datahub/ingestion/source/tableau/tableau_common.py +1 -1
  69. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  70. datahub/ingestion/source/unity/proxy.py +2 -2
  71. datahub/ingestion/source/unity/report.py +1 -0
  72. datahub/ingestion/source_config/operation_config.py +9 -0
  73. datahub/ingestion/source_report/pulsar.py +5 -4
  74. datahub/metadata/_schema_classes.py +304 -6
  75. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  76. datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
  77. datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
  78. datahub/metadata/schema.avsc +211 -12
  79. datahub/metadata/schemas/AssertionInfo.avsc +2 -2
  80. datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
  81. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  82. datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
  83. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  84. datahub/metadata/schemas/Deprecation.avsc +12 -0
  85. datahub/metadata/schemas/DisplayProperties.avsc +62 -0
  86. datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
  87. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
  88. datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
  89. datahub/metadata/schemas/PostInfo.avsc +28 -2
  90. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  91. datahub/specific/dashboard.py +43 -1
  92. datahub/telemetry/telemetry.py +4 -4
  93. datahub/testing/check_imports.py +28 -0
  94. datahub/upgrade/upgrade.py +17 -9
  95. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
  96. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/apply/datahub_apply.py (new file)
@@ -0,0 +1,223 @@
+import logging
+from functools import partial
+from typing import Any, Iterable, List, Optional, Union
+
+import progressbar
+from pydantic import Field
+
+from datahub.configuration.common import ConfigModel
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.metadata.schema_classes import (
+    DomainsClass,
+    GlossaryTermAssociationClass,
+    MetadataChangeProposalClass,
+    OwnerClass,
+    OwnershipTypeClass,
+    TagAssociationClass,
+)
+from datahub.specific.dataset import DatasetPatchBuilder
+
+logger = logging.getLogger(__name__)
+
+
+def apply_association_to_container(
+    container_urn: str,
+    association_urn: str,
+    association_type: str,
+    emit: bool = True,
+    graph: Optional[DataHubGraph] = None,
+) -> Optional[List[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]]:
+    """
+    Common function to add either tags, terms, domains, or owners to child datasets (for now).
+
+    Args:
+        container_urn: The URN of the container
+        association_urn: The URN of the tag, term, or user to apply
+        association_type: One of 'tag', 'term', 'domain' or 'owner'
+    """
+    urns: List[str] = [container_urn]
+    if not graph:
+        graph = get_default_graph()
+    logger.info(f"Using {graph}")
+    urns.extend(
+        graph.get_urns_by_filter(
+            container=container_urn,
+            batch_size=1000,
+            entity_types=["dataset", "container"],
+        )
+    )
+
+    all_patches: List[Any] = []
+    for urn in urns:
+        builder = DatasetPatchBuilder(urn)
+        patches: List[Any] = []
+        if association_type == "tag":
+            patches = builder.add_tag(TagAssociationClass(association_urn)).build()
+        elif association_type == "term":
+            patches = builder.add_term(
+                GlossaryTermAssociationClass(association_urn)
+            ).build()
+        elif association_type == "owner":
+            patches = builder.add_owner(
+                OwnerClass(
+                    owner=association_urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+            ).build()
+        elif association_type == "domain":
+            patches = [
+                MetadataChangeProposalWrapper(
+                    entityUrn=urn,
+                    aspect=DomainsClass(domains=[association_urn]),
+                )
+            ]
+        all_patches.extend(patches)
+    if emit:
+        mcps_iter = progressbar.progressbar(all_patches, redirect_stdout=True)
+        for mcp in mcps_iter:
+            graph.emit(mcp)
+        return None
+    else:
+        return all_patches
+
+
+class DomainApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply domain hierarchichaly. Currently only containers and datasets are supported",
+    )
+    domain_urn: str = Field(default="")
+
+
+class TagApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply tag hierarchichaly. Currently only containers and datasets are supported",
+    )
+    tag_urn: str = Field(default="")
+
+
+class TermApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply term hierarchichaly. Currently only containers and datasets are supported",
+    )
+    term_urn: str = Field(default="")
+
+
+class OwnerApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply owner hierarchichaly. Currently only containers and datasets are supported",
+    )
+    owner_urn: str = Field(default="")
+
+
+class DataHubApplyConfig(ConfigModel):
+    domain_apply: Optional[List[DomainApplyConfig]] = Field(
+        default=None,
+        description="List to apply domains to assets",
+    )
+    tag_apply: Optional[List[TagApplyConfig]] = Field(
+        default=None,
+        description="List to apply tags to assets",
+    )
+    term_apply: Optional[List[TermApplyConfig]] = Field(
+        default=None,
+        description="List to apply terms to assets",
+    )
+    owner_apply: Optional[List[OwnerApplyConfig]] = Field(
+        default=None,
+        description="List to apply owners to assets",
+    )
+
+
+@platform_name("DataHubApply")
+@config_class(DataHubApplyConfig)
+@support_status(SupportStatus.TESTING)
+class DataHubApplySource(Source):
+    """
+    This source is a helper over CLI
+    so people can use the helper to apply various metadata changes to DataHub
+    via Managed Ingestion
+    """
+
+    def __init__(self, ctx: PipelineContext, config: DataHubApplyConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = SourceReport()
+        self.graph = ctx.require_graph()
+
+    def _yield_workunits(
+        self,
+        proposals: List[
+            Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]
+        ],
+    ) -> Iterable[MetadataWorkUnit]:
+        for proposal in proposals:
+            if isinstance(proposal, MetadataChangeProposalWrapper):
+                yield proposal.as_workunit()
+            else:
+                yield MetadataWorkUnit(
+                    id=MetadataWorkUnit.generate_workunit_id(proposal),
+                    mcp_raw=proposal,
+                )
+
+    def _handle_assets(
+        self, assets: List[str], apply_urn: str, apply_type: str
+    ) -> Iterable[MetadataWorkUnit]:
+        for asset in assets:
+            change_proposals = apply_association_to_container(
+                asset, apply_urn, apply_type, emit=False, graph=self.graph
+            )
+            assert change_proposals is not None
+            yield from self._yield_workunits(change_proposals)
+
+    def _yield_domain(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.domain_apply:
+            return
+        for apply in self.config.domain_apply:
+            yield from self._handle_assets(apply.assets, apply.domain_urn, "domain")
+
+    def _yield_tag(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.tag_apply:
+            return
+        for apply in self.config.tag_apply:
+            yield from self._handle_assets(apply.assets, apply.tag_urn, "tag")
+
+    def _yield_term(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.term_apply:
+            return
+        for apply in self.config.term_apply:
+            yield from self._handle_assets(apply.assets, apply.term_urn, "term")
+
+    def _yield_owner(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.owner_apply:
+            return
+        for apply in self.config.owner_apply:
+            yield from self._handle_assets(apply.assets, apply.owner_urn, "owner")
+
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[MetadataWorkUnit]:
+        yield from self._yield_domain()
+        yield from self._yield_tag()
+        yield from self._yield_term()
+        yield from self._yield_owner()
+
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [partial(auto_workunit_reporter, self.get_report())]
+
+    def get_report(self) -> SourceReport:
+        return self.report
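
Note on the new module above: DataHubApplySource is a thin wrapper around apply_association_to_container so the same bulk-apply logic can run through Managed Ingestion. The helper can also be called directly; a minimal sketch (not part of the diff, URNs are placeholders) might look like this:

# Illustrative sketch only, not shipped code. Assumes a DataHub graph
# connection is already configured (e.g. via `datahub init`).
from datahub.ingestion.graph.client import get_default_graph
from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container

graph = get_default_graph()
apply_association_to_container(
    container_urn="urn:li:container:PLACEHOLDER",  # placeholder URN
    association_urn="urn:li:tag:PLACEHOLDER",      # placeholder URN
    association_type="tag",  # one of "tag", "term", "domain", "owner"
    emit=True,               # emit the generated patches via the graph client
    graph=graph,
)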

datahub/ingestion/source/aws/glue.py
@@ -113,6 +113,7 @@ from datahub.metadata.schema_classes import (
 )
 from datahub.utilities.delta import delta_type_to_hive_type
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -218,8 +219,9 @@ class GlueSourceConfig(

 @dataclass
 class GlueSourceReport(StaleEntityRemovalSourceReport):
+    catalog_id: Optional[str] = None
     tables_scanned = 0
-    filtered: List[str] = dataclass_field(default_factory=list)
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
     databases: EntityFilterReport = EntityFilterReport.field(type="database")

     num_job_script_location_missing: int = 0
@@ -315,6 +317,7 @@ class GlueSource(StatefulIngestionSourceBase):
         self.extract_owners = config.extract_owners
         self.source_config = config
         self.report = GlueSourceReport()
+        self.report.catalog_id = self.source_config.catalog_id
         self.glue_client = config.glue_client
         self.s3_client = config.s3_client
         self.extract_transforms = config.extract_transforms
@@ -738,11 +741,17 @@ class GlueSource(StatefulIngestionSourceBase):
         self,
     ) -> Tuple[List[Mapping[str, Any]], List[Dict]]:
         all_databases = [*self.get_all_databases()]
-        all_tables = [
-            tables
-            for database in all_databases
-            for tables in self.get_tables_from_database(database)
-        ]
+        all_tables = []
+        for database in all_databases:
+            try:
+                for tables in self.get_tables_from_database(database):
+                    all_tables.append(tables)
+            except Exception as e:
+                self.report.warning(
+                    message="Failed to get tables from database",
+                    context=database["Name"],
+                    exc=e,
+                )
         return all_databases, all_tables

     def get_lineage_if_enabled(
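
A change that repeats across many of the source reports in this release (Glue above, then SageMaker, BigQuery, Delta Lake, DynamoDB, Elasticsearch, Iceberg, Azure AD, Okta, Kafka, Kafka Connect, and LDAP below) is swapping plain List[str] report fields for LossyList[str] from datahub.utilities.lossy_collections, so reports that collect filtered entity names stay bounded in size. As a rough illustration of the idea only (the real LossyList implementation differs in its details), a lossy list accepts every append but retains only a sample:

# Conceptual sketch of a "lossy" list, for illustration only; the actual
# class is datahub.utilities.lossy_collections.LossyList.
from typing import Generic, List, TypeVar

T = TypeVar("T")


class BoundedSampleList(Generic[T]):  # hypothetical name
    def __init__(self, max_elements: int = 10) -> None:
        self.max_elements = max_elements
        self.sampled: List[T] = []
        self.total_count = 0

    def append(self, item: T) -> None:
        # Count every item, but retain at most max_elements of them.
        self.total_count += 1
        if len(self.sampled) < self.max_elements:
            self.sampled.append(item)

    def __repr__(self) -> str:
        hidden = self.total_count - len(self.sampled)
        return f"{self.sampled!r} (+{hidden} more)" if hidden else repr(self.sampled)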

datahub/ingestion/source/aws/sagemaker_processors/common.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import Dict, List, Optional, Union
+from typing import Dict, Optional, Union

 from pydantic.fields import Field

@@ -9,6 +9,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulIngestionConfigBase,
     StatefulStaleMetadataRemovalConfig,
 )
+from datahub.utilities.lossy_collections import LossyList


 class SagemakerSourceConfig(
@@ -42,7 +43,7 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
     jobs_scanned = 0
     jobs_processed = 0
     datasets_scanned = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
     model_endpoint_lineage = 0
     model_group_lineage = 0


datahub/ingestion/source/bigquery_v2/bigquery_report.py
@@ -141,7 +141,7 @@ class BigQueryV2Report(
     profiling_skipped_invalid_partition_type: Dict[str, str] = field(
         default_factory=TopKDict
     )
-    profiling_skipped_partition_profiling_disabled: List[str] = field(
+    profiling_skipped_partition_profiling_disabled: LossyList[str] = field(
         default_factory=LossyList
     )
     allow_pattern: Optional[str] = None

datahub/ingestion/source/dbt/dbt_core.py
@@ -488,7 +488,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
    ) -> Dict:
        if re.match("^https?://", uri):
            return json.loads(requests.get(uri).text)
-        elif re.match("^s3://", uri):
+        elif is_s3_uri(uri):
            u = urlparse(uri)
            assert aws_connection
            response = aws_connection.get_s3_client().get_object(

datahub/ingestion/source/delta_lake/report.py
@@ -1,14 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field
-from typing import List

 from datahub.ingestion.api.source import SourceReport
+from datahub.utilities.lossy_collections import LossyList


 @dataclasses.dataclass
 class DeltaLakeSourceReport(SourceReport):
     files_scanned = 0
-    filtered: List[str] = dataclass_field(default_factory=list)
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

     def report_file_scanned(self) -> None:
         self.files_scanned += 1

datahub/ingestion/source/dynamodb/dynamodb.py
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     UnionTypeClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry

 MAX_ITEMS_TO_RETRIEVE = 100
@@ -120,7 +121,7 @@ class DynamoDBConfig(

 @dataclass
 class DynamoDBSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/elastic_search.py
@@ -62,6 +62,7 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
 )
 from datahub.utilities.config_clean import remove_protocol
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.urns.dataset_urn import DatasetUrn

 logger = logging.getLogger(__name__)
@@ -189,7 +190,7 @@ class ElasticToSchemaFieldConverter:
 @dataclass
 class ElasticsearchSourceReport(SourceReport):
     index_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_index_scanned(self, index: str) -> None:
         self.index_scanned += 1

datahub/ingestion/source/ge_profiling_config.py
@@ -115,26 +115,30 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     )
     max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
         default=None,
-        description="A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.",
+        description="A positive integer that specifies the maximum number of columns to profile for "
+        "any table. `None` implies all columns. The cost of profiling goes up significantly as the "
+        "number of columns to profile goes up.",
     )

     profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
         default=None,
-        description="Profile table only if it has been updated since these many number of days. If set to `null`, no constraint of last modified time for tables to profile. Supported only in `snowflake` and `BigQuery`.",
+        description="Profile table only if it has been updated since these many number of days. "
+        "If set to `null`, no constraint of last modified time for tables to profile. "
+        "Supported only in `snowflake` and `BigQuery`.",
     )

     profile_table_size_limit: Optional[int] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
-        "no limit on the size of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on calculated size from gathered stats.",
+        "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
+        "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )

     profile_table_row_limit: Optional[int] = Field(
         default=5000000,
-        description="Profile tables only if their row count is less than specified count. If set to `null`, "
-        "no limit on the row count of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on gathered stats.",
+        description="Profile tables only if their row count is less than specified count. "
+        "If set to `null`, no limit on the row count of tables to profile. Supported only in "
+        "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )

     profile_table_row_count_estimate_only: bool = Field(

datahub/ingestion/source/iceberg/iceberg_common.py
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional

 from humanfriendly import format_timespan
 from pydantic import Field, validator
@@ -20,6 +20,7 @@ from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict

 logger = logging.getLogger(__name__)
@@ -198,7 +199,7 @@ class TimingClass:
 class IcebergSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned: int = 0
     entities_profiled: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)

datahub/ingestion/source/identity/azure_ad.py
@@ -13,6 +13,7 @@ from requests.adapters import HTTPAdapter, Retry

 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mce_builder import make_group_urn, make_user_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -51,6 +52,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -132,11 +134,7 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
         description="regex patterns for groups to include in ingestion.",
     )

-    # If enabled, report will contain names of filtered users and groups.
-    filtered_tracking: bool = Field(
-        default=True,
-        description="If enabled, report will contain names of filtered users and groups.",
-    )
+    _remove_filtered_tracking = pydantic_removed_field("filtered_tracking")

     # Optional: Whether to mask sensitive information from workunit ID's. On by default.
     mask_group_id: bool = Field(
@@ -156,14 +154,10 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

 @dataclass
 class AzureADSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
-    filtered_tracking: bool = field(default=True, repr=False)
-    filtered_count: int = field(default=0)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_filtered(self, name: str) -> None:
-        self.filtered_count += 1
-        if self.filtered_tracking:
-            self.filtered.append(name)
+        self.filtered.append(name)


 # Source that extracts Azure AD users, groups and group memberships using Microsoft Graph REST API
@@ -266,9 +260,7 @@ class AzureADSource(StatefulIngestionSourceBase):
     def __init__(self, config: AzureADConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.config = config
-        self.report = AzureADSourceReport(
-            filtered_tracking=self.config.filtered_tracking
-        )
+        self.report = AzureADSourceReport()
         session = requests.Session()
         retries = Retry(
             total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]
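
In the Azure AD changes above, the filtered_tracking option is removed from AzureADConfig and replaced by a pydantic_removed_field("filtered_tracking") shim, so existing recipes that still set the flag keep validating instead of erroring out. The sketch below illustrates the general removed-field pattern only (hypothetical ExampleConfig model, pydantic v1-style validator); it is not the datahub helper itself:

# Illustration of the removed-field pattern, not datahub's implementation.
import logging

from pydantic import BaseModel, root_validator

logger = logging.getLogger(__name__)


class ExampleConfig(BaseModel):  # hypothetical model for demonstration
    groups_pattern: str = ".*"

    @root_validator(pre=True)
    def _drop_removed_fields(cls, values: dict) -> dict:
        # Accept but ignore the removed option so old recipes keep working.
        if "filtered_tracking" in values:
            logger.warning("filtered_tracking was removed and is now ignored")
            values.pop("filtered_tracking")
        return values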

datahub/ingestion/source/identity/okta.py
@@ -50,6 +50,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)
 nest_asyncio.apply()
@@ -173,7 +174,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):

 @dataclass
 class OktaSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_filtered(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/kafka/kafka.py
@@ -73,6 +73,7 @@ from datahub.metadata.schema_classes import (
     OwnershipSourceTypeClass,
     SubTypesClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.mapping import Constants, OperationProcessor
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.str_enum import StrEnum
@@ -190,7 +191,7 @@ def get_kafka_admin_client(
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_topic_scanned(self, topic: str) -> None:
         self.topics_scanned += 1

datahub/ingestion/source/kafka_connect/common.py
@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(
 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1

datahub/ingestion/source/ldap.py
@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}
@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns: List[str] = dataclasses.field(default_factory=list)
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)

     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)

datahub/ingestion/source/looker/looker_config.py
@@ -177,7 +177,9 @@ def _get_generic_definition(
 class LookerConnectionDefinition(ConfigModel):
     platform: str
     default_db: str
-    default_schema: Optional[str]  # Optional since some sources are two-level only
+    default_schema: Optional[str] = (
+        None  # Optional since some sources are two-level only
+    )
     platform_instance: Optional[str] = None
     platform_env: Optional[str] = Field(
         default=None,
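
For context on the default_schema change just above: recent pydantic versions no longer treat a bare Optional[...] annotation as implicitly defaulting to None, so the field now carries an explicit None default. A minimal, hypothetical illustration:

# Hypothetical model, illustrating the Optional-default behavior only.
from typing import Optional

from pydantic import BaseModel


class ConnectionExample(BaseModel):
    default_db: str
    # Without "= None", newer pydantic treats this field as required even
    # though its type is Optional; the explicit default keeps it optional.
    default_schema: Optional[str] = None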

datahub/ingestion/source/looker/looker_dataclasses.py
@@ -32,6 +32,12 @@ class LookerField:
     sql: Optional[str]


+@dataclass
+class LookerConstant:
+    name: str
+    value: str
+
+
 @dataclass
 class LookerModel:
     connection: str
@@ -75,6 +81,7 @@ class LookerModel:
             try:
                 parsed = load_and_preprocess_file(
                     path=included_file,
+                    reporter=reporter,
                     source_config=source_config,
                 )
                 included_explores = parsed.get("explores", [])
@@ -217,6 +224,7 @@ class LookerModel:
             try:
                 parsed = load_and_preprocess_file(
                     path=included_file,
+                    reporter=reporter,
                     source_config=source_config,
                 )
                 seen_so_far.add(included_file)

datahub/ingestion/source/looker/looker_file_loader.py
@@ -4,7 +4,10 @@ from dataclasses import replace
 from typing import Dict, Optional

 from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
-from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile
+from datahub.ingestion.source.looker.looker_dataclasses import (
+    LookerConstant,
+    LookerViewFile,
+)
 from datahub.ingestion.source.looker.looker_template_language import (
     load_and_preprocess_file,
 )
@@ -30,12 +33,14 @@ class LookerViewFileLoader:
         base_projects_folder: Dict[str, pathlib.Path],
         reporter: LookMLSourceReport,
         source_config: LookMLSourceConfig,
+        manifest_constants: Dict[str, LookerConstant] = {},
     ) -> None:
         self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
         self._root_project_name = root_project_name
         self._base_projects_folder = base_projects_folder
         self.reporter = reporter
         self.source_config = source_config
+        self.manifest_constants = manifest_constants

     def _load_viewfile(
         self, project_name: str, path: str, reporter: LookMLSourceReport
@@ -60,7 +65,7 @@ class LookerViewFileLoader:
             with open(path) as file:
                 raw_file_content = file.read()
         except Exception as e:
-            self.reporter.failure(
+            self.reporter.report_warning(
                 title="LKML File Loading Error",
                 message="A lookml file is not present on local storage or GitHub",
                 context=f"file path: {path}",
@@ -71,9 +76,15 @@ class LookerViewFileLoader:
         try:
             logger.debug(f"Loading viewfile {path}")

+            # load_and preprocess_file is called multiple times for loading view file from multiple flows.
+            # Flag resolve_constants is a hack to avoid passing around manifest_constants from all of the flows.
+            # This is fine as rest of flows do not need resolution of constants.
             parsed = load_and_preprocess_file(
                 path=path,
+                reporter=self.reporter,
                 source_config=self.source_config,
+                resolve_constants=True,
+                manifest_constants=self.manifest_constants,
             )

             looker_viewfile = LookerViewFile.from_looker_dict(
@@ -90,7 +101,7 @@ class LookerViewFileLoader:
             self.viewfile_cache[path] = looker_viewfile
             return looker_viewfile
         except Exception as e:
-            self.reporter.failure(
+            self.reporter.report_warning(
                 title="LKML File Parsing Error",
                 message="The input file is not lookml file",
                 context=f"file path: {path}",