acryl-datahub 1.2.0.3rc1__py3-none-any.whl → 1.2.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/METADATA +2609 -2608
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/RECORD +74 -73
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +3 -3
- datahub/api/entities/external/external_tag.py +6 -4
- datahub/api/entities/external/lake_formation_external_entites.py +50 -49
- datahub/api/entities/external/restricted_text.py +105 -180
- datahub/api/entities/external/unity_catalog_external_entites.py +51 -52
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/quickstart_versioning.py +1 -1
- datahub/cli/specific/assertions_cli.py +37 -2
- datahub/cli/specific/datacontract_cli.py +54 -4
- datahub/emitter/rest_emitter.py +18 -5
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
- datahub/ingestion/api/report.py +21 -2
- datahub/ingestion/api/source.py +81 -7
- datahub/ingestion/autogenerated/capability_summary.json +47 -19
- datahub/ingestion/graph/client.py +19 -3
- datahub/ingestion/sink/datahub_rest.py +2 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +9 -0
- datahub/ingestion/source/aws/glue.py +18 -2
- datahub/ingestion/source/aws/tag_entities.py +4 -4
- datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
- datahub/ingestion/source/dbt/dbt_common.py +10 -0
- datahub/ingestion/source/delta_lake/source.py +8 -1
- datahub/ingestion/source/dremio/dremio_source.py +19 -2
- datahub/ingestion/source/fivetran/fivetran.py +9 -3
- datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
- datahub/ingestion/source/ge_data_profiler.py +8 -0
- datahub/ingestion/source/grafana/models.py +6 -0
- datahub/ingestion/source/hex/hex.py +1 -1
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +4 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/mock_data/datahub_mock_data.py +26 -10
- datahub/ingestion/source/powerbi/powerbi.py +4 -1
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/redshift.py +1 -0
- datahub/ingestion/source/salesforce.py +8 -0
- datahub/ingestion/source/slack/slack.py +7 -14
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
- datahub/ingestion/source/sql/athena_properties_extractor.py +2 -2
- datahub/ingestion/source/sql/hive_metastore.py +8 -0
- datahub/ingestion/source/sql/teradata.py +8 -1
- datahub/ingestion/source/sql/trino.py +9 -0
- datahub/ingestion/source/tableau/tableau.py +1 -1
- datahub/ingestion/source/unity/config.py +36 -1
- datahub/ingestion/source/unity/proxy.py +332 -46
- datahub/ingestion/source/unity/proxy_types.py +12 -2
- datahub/ingestion/source/unity/source.py +91 -34
- datahub/ingestion/source/unity/tag_entities.py +5 -5
- datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/metadata/_internal_schema_classes.py +513 -513
- datahub/metadata/_urns/urn_defs.py +1684 -1684
- datahub/metadata/schema.avsc +16745 -16348
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/sdk/entity_client.py +22 -7
- datahub/sdk/search_client.py +3 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataset.py +37 -59
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/server_config_util.py +2 -1
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/abs/datalake_profiler_config.py

@@ -72,7 +72,7 @@ class DataLakeProfilerConfig(ConfigModel):
         description="Whether to profile for the sample values for all columns.",
     )

-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def ensure_field_level_settings_are_normalized(
         cls: "DataLakeProfilerConfig", values: Dict[str, Any]
     ) -> Dict[str, Any]:
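Several files in this release swap @pydantic.root_validator() for @pydantic.root_validator(skip_on_failure=True). A minimal sketch of why that flag matters (the model below is illustrative, not taken from the package): without it, the root validator still runs after a field validator has failed and sees a partial values dict, and pydantic v2's compatibility shim rejects the bare form outright.

import pydantic


class ProfileConfigSketch(pydantic.BaseModel):
    row_limit: int
    include_field_sample_values: bool = True

    # skip_on_failure=True: the root validator only runs if all field
    # validators passed, so `values` contains every validated field.
    # pydantic v2's deprecated root_validator shim also requires this flag.
    @pydantic.root_validator(skip_on_failure=True)
    def normalize(cls, values):
        values["include_field_sample_values"] = bool(
            values.get("include_field_sample_values", True)
        )
        return values


print(ProfileConfigSketch(row_limit=10).include_field_sample_values)  # True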
datahub/ingestion/source/abs/source.py

@@ -44,6 +44,7 @@ from datahub.ingestion.source.azure.abs_utils import (
     get_key_prefix,
     strip_abs_prefix,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.data_lake_utils import (
     ContainerWUCreator,
     add_partition_columns_to_schema,

@@ -128,6 +129,14 @@ class TableData:
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.TAGS, "Can extract ABS object/container tags if enabled")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Extract ABS containers and folders",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+        SourceCapabilityModifier.ABS_CONTAINER,
+    ],
+)
 class ABSSource(StatefulIngestionSourceBase):
     source_config: DataLakeSourceConfig
     report: DataLakeSourceReport
datahub/ingestion/source/aws/glue.py

@@ -395,7 +395,7 @@ class GlueSource(StatefulIngestionSourceBase):
             t = LakeFormationTag(
                 key=tag_key,
                 value=tag_value,
-
+                catalog=catalog_id,
             )
             tags.append(t)
         return tags

@@ -438,7 +438,7 @@ class GlueSource(StatefulIngestionSourceBase):
             t = LakeFormationTag(
                 key=tag_key,
                 value=tag_value,
-
+                catalog=catalog_id,
             )
             tags.append(t)
         return tags

@@ -522,6 +522,14 @@ class GlueSource(StatefulIngestionSourceBase):
         bucket = url.netloc
         key = url.path[1:]

+        # validate that we have a non-empty key
+        if not key:
+            self.report.num_job_script_location_invalid += 1
+            logger.warning(
+                f"Error parsing DAG for Glue job. The script {script_path} is not a valid S3 path for flow urn: {flow_urn}."
+            )
+            return None
+
         # download the script contents
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object
         try:

@@ -533,6 +541,14 @@ class GlueSource(StatefulIngestionSourceBase):
             )
             self.report.num_job_script_failed_download += 1
             return None
+        except botocore.exceptions.ParamValidationError as e:
+            self.report_warning(
+                flow_urn,
+                f"Invalid S3 path for Glue job script {script_path}: {e}",
+            )
+            self.report.num_job_script_location_invalid += 1
+            return None
+
         script = obj["Body"].read().decode("utf-8")

         try:
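The two new guards above cover a Glue job whose script location points at a bucket with no object key. A small, self-contained illustration (the helper name is mine, not from glue.py) of how urlparse yields an empty key in that case:

from urllib.parse import urlparse


def split_s3_script_path(script_path: str):
    """Return (bucket, key), or None if the path has no object key."""
    url = urlparse(script_path)
    bucket, key = url.netloc, url.path[1:]
    if not key:  # e.g. "s3://my-bucket" or "s3://my-bucket/"
        return None
    return bucket, key


print(split_s3_script_path("s3://my-bucket"))                     # None
print(split_s3_script_path("s3://my-bucket/scripts/etl_job.py"))  # ('my-bucket', 'scripts/etl_job.py')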
datahub/ingestion/source/aws/tag_entities.py

@@ -37,7 +37,7 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):

     tag_key: str
     tag_value: Optional[str] = None
-    platform_instance: Optional[str]
+    platform_instance: Optional[str] = None
     catalog: Optional[str] = None
     exists_in_lake_formation: bool = False
     persisted: bool = False

@@ -88,8 +88,8 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
            return existing_platform_resource

        return LakeFormationTagPlatformResourceId(
-            tag_key=tag.key,
-            tag_value=tag.value if tag.value is not None else None,
+            tag_key=str(tag.key),
+            tag_value=str(tag.value) if tag.value is not None else None,
            platform_instance=platform_instance,
            exists_in_lake_formation=exists_in_lake_formation,
            catalog=catalog,

@@ -227,7 +227,7 @@ class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
     datahub_urns: LinkedResourceSet
     managed_by_datahub: bool
     id: LakeFormationTagPlatformResourceId
-    allowed_values: Optional[List[str]]
+    allowed_values: Optional[List[str]] = None

     def get_id(self) -> ExternalEntityId:
         return self.id
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -62,7 +62,6 @@ class SortKey(ConfigModel):

     date_format: Optional[str] = Field(
         default=None,
-        type=str,
         description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
     )

@@ -260,7 +259,7 @@ class PathSpec(ConfigModel):
     ) -> Union[None, parse.Result, parse.Match]:
         return self.compiled_folder_include.parse(path)

-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def validate_no_double_stars(cls, values: Dict) -> Dict:
         if "include" not in values:
             return values

@@ -456,7 +455,11 @@ class PathSpec(ConfigModel):
                 partition = partition.rsplit("/", 1)[0]
                 for partition_key in partition.split("/"):
                     if partition_key.find("=") != -1:
-
+                        key_value = partition_key.split(
+                            "=", 1
+                        )  # Split into at most 2 parts
+                        if len(key_value) == 2:
+                            partition_keys.append((key_value[0], key_value[1]))
             else:
                 partition_split = partition.rsplit("/", 1)
                 if len(partition_split) == 1:
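The new split("=", 1) keeps partition values that themselves contain "=" intact. A standalone restatement of that parsing step (the function name is mine):

def parse_partition_keys(partition: str) -> list[tuple[str, str]]:
    partition_keys: list[tuple[str, str]] = []
    for partition_key in partition.split("/"):
        if "=" in partition_key:
            key_value = partition_key.split("=", 1)  # split on the first "=" only
            if len(key_value) == 2:
                partition_keys.append((key_value[0], key_value[1]))
    return partition_keys


print(parse_partition_keys("year=2024/month=05/token=abc=def"))
# [('year', '2024'), ('month', '05'), ('token', 'abc=def')]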
datahub/ingestion/source/datahub/datahub_source.py

@@ -19,6 +19,7 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit_reporter,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
 from datahub.ingestion.source.datahub.datahub_database_reader import (

@@ -39,7 +40,13 @@ logger = logging.getLogger(__name__)
 @platform_name("DataHub")
 @config_class(DataHubSourceConfig)
 @support_status(SupportStatus.TESTING)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+    ],
+)
 class DataHubSource(StatefulIngestionSourceBase):
     platform: str = "datahub"
datahub/ingestion/source/dbt/dbt_cloud.py

@@ -370,9 +370,12 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
         name = node["alias"]

         comment = node.get("comment", "")
-
-
-
+
+        # In dbt sources, there are two types of descriptions:
+        # - description: table-level description (specific to the source table)
+        # - sourceDescription: schema-level description (describes the overall source schema)
+        # The table-level description should take precedence since it's more specific.
+        description = node["description"] or node.get("sourceDescription", "")

         if node["resourceType"] == "model":
             materialization = node["materializedType"]
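Restating the precedence rule the new comment block describes as a tiny self-contained check (the helper below is illustrative, not a function in dbt_cloud.py):

def pick_source_description(node: dict) -> str:
    # Table-level `description` wins; schema-level `sourceDescription` is the fallback.
    return node["description"] or node.get("sourceDescription", "")


assert pick_source_description(
    {"description": "Orders fact table", "sourceDescription": "Raw ecommerce schema"}
) == "Orders fact table"
assert pick_source_description(
    {"description": "", "sourceDescription": "Raw ecommerce schema"}
) == "Raw ecommerce schema"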
datahub/ingestion/source/dbt/dbt_common.py

@@ -120,6 +120,7 @@ logger = logging.getLogger(__name__)
 DBT_PLATFORM = "dbt"

 _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
+_DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024  # 1MB


 @dataclass

@@ -1684,6 +1685,12 @@ class DBTSourceBase(StatefulIngestionSourceBase):
     def get_external_url(self, node: DBTNode) -> Optional[str]:
         pass

+    @staticmethod
+    def _truncate_code(code: str, max_length: int) -> str:
+        if len(code) > max_length:
+            return code[:max_length] + "..."
+        return code
+
     def _create_view_properties_aspect(
         self, node: DBTNode
     ) -> Optional[ViewPropertiesClass]:

@@ -1695,6 +1702,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             compiled_code = try_format_query(
                 node.compiled_code, platform=self.config.target_platform
             )
+            compiled_code = self._truncate_code(
+                compiled_code, _DBT_MAX_COMPILED_CODE_LENGTH
+            )

         materialized = node.materialization in {"table", "incremental", "snapshot"}
         view_properties = ViewPropertiesClass(
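A standalone restatement of the new truncation behaviour, using the same 1 MiB cap the diff introduces (the constant and helper are copied here for illustration only):

_DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024  # 1MB


def truncate_code(code: str, max_length: int = _DBT_MAX_COMPILED_CODE_LENGTH) -> str:
    # Compiled SQL longer than the cap is cut and suffixed with "..." so the
    # view-properties aspect stays a reasonable size.
    if len(code) > max_length:
        return code[:max_length] + "..."
    return code


short_sql = "select 1"
huge_sql = "x" * (2 * 1024 * 1024)
assert truncate_code(short_sql) == short_sql
assert len(truncate_code(huge_sql)) == _DBT_MAX_COMPILED_CODE_LENGTH + 3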
datahub/ingestion/source/delta_lake/source.py

@@ -29,6 +29,7 @@ from datahub.ingestion.source.aws.s3_util import (
     get_key_prefix,
     strip_s3_prefix,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
 from datahub.ingestion.source.delta_lake.config import DeltaLakeSourceConfig
 from datahub.ingestion.source.delta_lake.delta_lake_utils import (

@@ -85,7 +86,13 @@ OPERATION_STATEMENT_TYPES = {
 @config_class(DeltaLakeSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.TAGS, "Can extract S3 object/bucket tags if enabled")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 class DeltaLakeSource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
datahub/ingestion/source/dremio/dremio_source.py

@@ -22,6 +22,7 @@ from datahub.ingestion.api.source import (
     SourceReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.dremio.dremio_api import (
     DremioAPIOperations,
     DremioEdition,

@@ -86,11 +87,27 @@ class DremioSourceMapEntry:
 @platform_name("Dremio")
 @config_class(DremioSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLE,
+    ],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Extract column-level lineage",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLE,
+    ],
+)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
datahub/ingestion/source/fivetran/fivetran.py

@@ -16,7 +16,11 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+    StructuredLogCategory,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,

@@ -96,8 +100,10 @@ class FivetranSource(StatefulIngestionSourceBase):
            self.report.info(
                title="Guessing source platform for lineage",
                message="We encountered a connector type that we don't fully support yet. "
-                "We will attempt to guess the platform based on the connector type."
-
+                "We will attempt to guess the platform based on the connector type. "
+                "Note that we use connector_id as the key not connector_name which you may see in the UI of Fivetran. ",
+                context=f"connector_name: {connector.connector_name} (connector_id: {connector.connector_id}, connector_type: {connector.connector_type})",
+                log_category=StructuredLogCategory.LINEAGE,
            )
            source_details.platform = connector.connector_type
datahub/ingestion/source/fivetran/fivetran_log_api.py

@@ -69,9 +69,10 @@ class FivetranLogAPI:
            fivetran_log_query.set_schema(bigquery_destination_config.dataset)

            # The "database" should be the BigQuery project name.
-
-
-
+            result = engine.execute("SELECT @@project_id").fetchone()
+            if result is None:
+                raise ValueError("Failed to retrieve BigQuery project ID")
+            fivetran_log_database = result[0]
        else:
            raise ConfigurationError(
                f"Destination platform '{destination_platform}' is not yet supported."
datahub/ingestion/source/ge_data_profiler.py

@@ -216,6 +216,14 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
                )
            ).scalar()
        )
+    elif self.engine.dialect.name.lower() == DATABRICKS:
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_count_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
    return convert_to_json_serializable(
        self.engine.execute(
            sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(
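The Databricks branch swaps an exact COUNT(DISTINCT ...) for approx_count_distinct, which is much cheaper on large Delta tables. A hedged sketch of the SQL that expression builds (requires SQLAlchemy 1.4+; the table and column names are made up):

import sqlalchemy as sa

user_id = sa.column("user_id")
events = sa.table("events", user_id)

stmt = sa.select(sa.func.approx_count_distinct(user_id)).select_from(events)
print(stmt)
# Approximately:
# SELECT approx_count_distinct(user_id) AS approx_count_distinct_1
# FROM events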
datahub/ingestion/source/grafana/models.py

@@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional

 from pydantic import BaseModel, Field

+from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.emitter.mcp_builder import ContainerKey

 # Grafana-specific type definitions for better type safety

@@ -106,6 +107,11 @@ class Folder(BaseModel):
     title: str
     description: Optional[str] = ""

+    if PYDANTIC_VERSION_2:
+        from pydantic import ConfigDict
+
+        model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
+

 class FolderKey(ContainerKey):
     """Key for identifying a Grafana folder."""
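On pydantic v2, coerce_numbers_to_str lets string-typed fields accept numeric JSON values instead of failing validation, which helps when an API returns ids as numbers. A small illustration (the model and field names are mine, and this only runs on pydantic v2):

from pydantic import BaseModel, ConfigDict


class FolderSketch(BaseModel):
    model_config = ConfigDict(coerce_numbers_to_str=True)

    uid: str
    title: str


folder = FolderSketch(uid=42, title="Ops dashboards")
print(repr(folder.uid))  # '42' -- the integer is coerced to a string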
datahub/ingestion/source/hex/hex.py

@@ -69,7 +69,7 @@ class HexSourceConfig(
     )
     include_components: bool = Field(
         default=True,
-
+        description="Include Hex Components in the ingestion",
     )
     page_size: int = Field(
         default=HEX_API_PAGE_SIZE_DEFAULT,
datahub/ingestion/source/hex/query_fetcher.py

@@ -97,7 +97,7 @@ class HexQueryFetcher:
        if not query_urns or not entities_by_urn:
            self.report.warning(
                title="No Queries found with Hex as origin",
-                message="No lineage because of no Queries found with Hex as origin in the given time range
+                message="No lineage because of no Queries found with Hex as origin in the given time range. You may need to set use_queries_v2: true on your warehouse ingestion or you may consider extending the time range to fetch more queries.",
                context=str(
                    dict(
                        workspace_name=self.workspace_name,
datahub/ingestion/source/iceberg/iceberg.py

@@ -524,11 +524,11 @@ class IcebergSource(StatefulIngestionSourceBase):
         custom_properties["format-version"] = str(table.metadata.format_version)
         custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
         last_modified: Optional[int] = table.metadata.last_updated_ms
-        if table.current_snapshot():
-            custom_properties["snapshot-id"] = str(
-            custom_properties["manifest-list"] =
+        if current_snapshot := table.current_snapshot():
+            custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id)
+            custom_properties["manifest-list"] = current_snapshot.manifest_list
             if not last_modified:
-                last_modified = int(
+                last_modified = int(current_snapshot.timestamp_ms)
         if "created-at" in custom_properties:
             try:
                 dt = dateutil_parser.isoparse(custom_properties["created-at"])
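The walrus form binds the snapshot once and reuses it for the snapshot id, the manifest list, and the last-modified fallback, instead of re-querying the table. A minimal sketch with a stand-in table object (the names below are illustrative, not the pyiceberg API):

from types import SimpleNamespace
from typing import Optional


def snapshot_properties(table) -> dict:
    custom_properties: dict = {}
    last_modified: Optional[int] = None
    if current_snapshot := table.current_snapshot():
        custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id)
        custom_properties["manifest-list"] = current_snapshot.manifest_list
        if not last_modified:
            last_modified = int(current_snapshot.timestamp_ms)
    custom_properties["last-modified"] = str(last_modified)
    return custom_properties


fake_table = SimpleNamespace(
    current_snapshot=lambda: SimpleNamespace(
        snapshot_id=42,
        manifest_list="s3://warehouse/db/tbl/metadata/snap-42.avro",
        timestamp_ms=1700000000000,
    )
)
print(snapshot_properties(fake_table))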
datahub/ingestion/source/looker/looker_liquid_tag.py

@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import ClassVar, Optional, TextIO
+from typing import ClassVar, Optional, TextIO, Type

 from liquid import Environment
 from liquid.ast import Node

@@ -20,16 +20,27 @@ class CustomTagException(Exception):
 class ConditionNode(Node):
     def __init__(self, tok: Token, sql_or_lookml_reference: str, filter_name: str):
         self.tok = tok
-
         self.sql_or_lookml_reference = sql_or_lookml_reference
-
         self.filter_name = filter_name

     def render_to_output(self, context: Context, buffer: TextIO) -> Optional[bool]:
         # This implementation will make sure that sql parse work correctly if looker condition tag
         # is used in lookml sql field
         buffer.write(f"{self.sql_or_lookml_reference}='dummy_value'")
+        return True

+
+class IncrementConditionNode(Node):
+    def __init__(self, tok: Token, sql_or_lookml_reference: str):
+        self.tok = tok
+        self.sql_or_lookml_reference = sql_or_lookml_reference
+
+    def render_to_output(self, context: Context, buffer: TextIO) -> Optional[bool]:
+        # For incrementcondition, we need to generate a condition that would be used
+        # in incremental PDT updates. This typically involves date/time comparisons.
+        # We'll render it as a date comparison with a placeholder value
+        # See details in Looker documentation for incrementcondition tag -> cloud.google.com/looker/docs/reference/param-view-increment-key
+        buffer.write(f"{self.sql_or_lookml_reference} > '2023-01-01'")
         return True


@@ -44,7 +55,6 @@ class ConditionTag(Tag):
     This class render the below tag as order.region='ap-south-1' if order_region is provided in config.liquid_variables
     as order_region: 'ap-south-1'
     {% condition order_region %} order.region {% endcondition %}
-
     """

     TAG_START: ClassVar[str] = "condition"

@@ -79,7 +89,48 @@ class ConditionTag(Tag):
         )


-
+class IncrementConditionTag(Tag):
+    """
+    IncrementConditionTag is the equivalent implementation of looker's custom liquid tag "incrementcondition".
+    Refer doc: https://cloud.google.com/looker/docs/incremental-pdts#using_the_incrementcondition_tag
+
+    This tag is used for incremental PDTs to determine which records should be updated.
+    It typically works with date/time fields to filter data that has changed since the last update.
+
+    Example usage in Looker:
+        {% incrementcondition created_at %} order.created_at {% endincrementcondition %}
+
+    This would generate SQL like: order.created_at > '2023-01-01 00:00:00'
+    """
+
+    TAG_START: ClassVar[str] = "incrementcondition"
+    TAG_END: ClassVar[str] = "endincrementcondition"
+    name: str = "incrementcondition"
+
+    def __init__(self, env: Environment):
+        super().__init__(env)
+        self.parser = get_parser(self.env)
+
+    def parse(self, stream: TokenStream) -> Node:
+        expect(stream, TOKEN_TAG, value=IncrementConditionTag.TAG_START)
+
+        start_token = stream.current
+
+        stream.next_token()
+        expect(stream, TOKEN_LITERAL)
+        sql_or_lookml_reference: str = stream.current.value.strip()
+
+        stream.next_token()
+        expect(stream, TOKEN_TAG, value=IncrementConditionTag.TAG_END)
+
+        return IncrementConditionNode(
+            tok=start_token,
+            sql_or_lookml_reference=sql_or_lookml_reference,
+        )
+
+
+# Updated custom_tags list to include both tags
+custom_tags: list[Type[Tag]] = [ConditionTag, IncrementConditionTag]


 @string_filter
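For a quick feel of what the new tag does to LookML SQL before it reaches the SQL parser, here is a rough regex-based simulation of the rendering. It does not use the python-liquid runtime that looker_liquid_tag.py builds on; the placeholder date mirrors the one IncrementConditionNode writes.

import re

INCREMENT_PATTERN = re.compile(
    r"\{%\s*incrementcondition\s+\w+\s*%\}(.*?)\{%\s*endincrementcondition\s*%\}",
    re.DOTALL,
)


def simulate_increment_condition(sql: str) -> str:
    # Replace the tag block with "<referenced field> > '2023-01-01'", which is
    # what the new node emits so downstream SQL parsing keeps working.
    return INCREMENT_PATTERN.sub(lambda m: f"{m.group(1).strip()} > '2023-01-01'", sql)


print(simulate_increment_condition(
    "select * from orders where "
    "{% incrementcondition created_at %} order.created_at {% endincrementcondition %}"
))
# select * from orders where order.created_at > '2023-01-01'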
datahub/ingestion/source/mock_data/datahub_mock_data.py

@@ -13,7 +13,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import Source, SourceReport, StructuredLogCategory
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.mock_data.datahub_mock_data_report import (

@@ -35,6 +35,8 @@ from datahub.utilities.str_enum import StrEnum

 logger = logging.getLogger(__name__)

+PLATFORM_NAME = "fake"
+

 class SubTypePattern(StrEnum):
     ALTERNATING = "alternating"

@@ -137,6 +139,10 @@ class DataHubMockDataConfig(ConfigModel):
         default=0,
         description="Number of warnings to add in report for testing",
     )
+    num_info: int = Field(
+        default=0,
+        description="Number of info to add in report for testing",
+    )

     gen_1: LineageConfigGen1 = Field(
         default_factory=LineageConfigGen1,

@@ -144,7 +150,7 @@
     )


-@platform_name(
+@platform_name(PLATFORM_NAME)
 @config_class(DataHubMockDataConfig)
 @support_status(SupportStatus.TESTING)
 class DataHubMockDataSource(Source):

@@ -159,6 +165,9 @@ class DataHubMockDataSource(Source):
         self.report = DataHubMockDataReport()

     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        # We don't want any implicit aspects to be produced
+        # so we are not using get_workunits_internal
+
         if self.config.throw_uncaught_exceptions:
             raise Exception("This is a test exception")

@@ -176,10 +185,17 @@ class DataHubMockDataSource(Source):
                    message="This is test warning",
                    title="Test Warning",
                    context=f"This is test warning {i}",
+                    log_category=StructuredLogCategory.LINEAGE,
+                )
+
+        if self.config.num_info > 0:
+            for i in range(self.config.num_info):
+                self.report.info(
+                    message="This is test info",
+                    title="Test Info",
+                    context=f"This is test info {i}",
                )

-        # We don't want any implicit aspects to be produced
-        # so we are not using get_workunits_internal
        if self.config.gen_1.enabled:
            for wu in self._data_gen_1():
                if self.report.first_urn_seen is None:

@@ -309,7 +325,7 @@
                table_level, table_index, subtype_pattern, subtype_types, level_subtypes
            )

-            urn = make_dataset_urn(platform=
+            urn = make_dataset_urn(platform=PLATFORM_NAME, name=table_name)
            mcp = MetadataChangeProposalWrapper(
                entityUrn=urn,
                entityType="dataset",

@@ -433,7 +449,7 @@

     def _get_status_aspect(self, table: str) -> MetadataWorkUnit:
         urn = make_dataset_urn(
-            platform=
+            platform=PLATFORM_NAME,
             name=table,
         )
         mcp = MetadataChangeProposalWrapper(

@@ -448,7 +464,7 @@
     ) -> MetadataWorkUnit:
         mcp = MetadataChangeProposalWrapper(
             entityUrn=make_dataset_urn(
-                platform=
+                platform=PLATFORM_NAME,
                 name=downstream_table,
             ),
             entityType="dataset",

@@ -456,7 +472,7 @@
             upstreams=[
                 UpstreamClass(
                     dataset=make_dataset_urn(
-                        platform=
+                        platform=PLATFORM_NAME,
                         name=upstream_table,
                     ),
                     type=DatasetLineageTypeClass.TRANSFORMED,

@@ -468,7 +484,7 @@

     def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
         urn = make_dataset_urn(
-            platform=
+            platform=PLATFORM_NAME,
             name=table,
         )
         mcp = MetadataChangeProposalWrapper(

@@ -485,7 +501,7 @@

     def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
         urn = make_dataset_urn(
-            platform=
+            platform=PLATFORM_NAME,
             name=table,
         )
         mcp = MetadataChangeProposalWrapper(
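All of the mock-data URNs above are now built from the single PLATFORM_NAME constant ("fake"). Assuming the usual mce_builder helper (the import path and table name here are my assumptions for illustration), the resulting URNs look like this:

from datahub.emitter.mce_builder import make_dataset_urn

PLATFORM_NAME = "fake"

print(make_dataset_urn(platform=PLATFORM_NAME, name="table_1"))
# urn:li:dataset:(urn:li:dataPlatform:fake,table_1,PROD)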
datahub/ingestion/source/powerbi/powerbi.py

@@ -1226,7 +1226,10 @@ class Mapper:
 @platform_name("PowerBI")
 @config_class(PowerBiDashboardSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+)
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
datahub/ingestion/source/powerbi_report_server/report_server_domain.py

@@ -27,10 +27,8 @@ class CatalogItem(BaseModel):
     is_favorite: bool = Field(alias="IsFavorite")
     user_info: Any = Field(None, alias="UserInfo")
     display_name: Optional[str] = Field(None, alias="DisplayName")
-    has_data_sources: bool = Field(
-    data_sources: Optional[List["DataSource"]] = Field(
-        default_factory=list, alias="DataSources"
-    )
+    has_data_sources: bool = Field(False, alias="HasDataSources")
+    data_sources: Optional[List["DataSource"]] = Field(None, alias="DataSources")

     @validator("display_name", always=True)
     def validate_diplay_name(cls, value, values):
datahub/ingestion/source/redshift/datashares.py

@@ -26,7 +26,7 @@ from datahub.utilities.search_utils import LogicalOperator

 class OutboundSharePlatformResource(BaseModel):
     namespace: str
-    platform_instance: Optional[str]
+    platform_instance: Optional[str] = None
     env: str
     source_database: str
     share_name: str
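Several of the model tweaks in this release (here and in the Lake Formation entities) add an explicit "= None" to Optional fields. On pydantic v2 an Optional annotation no longer implies a default, so without it these fields would become required. A tiny sketch of the behaviour being preserved (the model below is illustrative):

from typing import Optional

from pydantic import BaseModel


class ShareSketch(BaseModel):
    namespace: str
    # Explicit default keeps the field optional on pydantic v2;
    # pydantic v1 treated Optional[...] as defaulting to None automatically.
    platform_instance: Optional[str] = None


print(ShareSketch(namespace="demo").platform_instance)  # None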
datahub/ingestion/source/redshift/redshift.py

@@ -132,6 +132,7 @@ logger: logging.Logger = logging.getLogger(__name__)
     "Enabled by default",
     subtype_modifier=[
         SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
     ],
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")