acryl-datahub 1.2.0.9rc2__py3-none-any.whl → 1.2.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2553 -2611
- {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +118 -111
- {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +6 -3
- datahub/api/entities/dataset/dataset.py +9 -18
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/docker_check.py +2 -2
- datahub/configuration/common.py +29 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/pydantic_migration_helpers.py +0 -9
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +5 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/ingestion/autogenerated/capability_summary.json +45 -1
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/source/azure/azure_common.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
- datahub/ingestion/source/datahub/config.py +8 -9
- datahub/ingestion/source/dbt/dbt_common.py +65 -5
- datahub/ingestion/source/delta_lake/config.py +1 -1
- datahub/ingestion/source/dremio/dremio_config.py +3 -4
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +19 -2
- datahub/ingestion/source/ge_data_profiler.py +15 -2
- datahub/ingestion/source/ge_profiling_config.py +26 -22
- datahub/ingestion/source/grafana/grafana_config.py +2 -2
- datahub/ingestion/source/grafana/models.py +12 -14
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/looker/looker_common.py +76 -75
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_source.py +493 -547
- datahub/ingestion/source/looker/lookml_config.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +46 -88
- datahub/ingestion/source/metabase.py +9 -2
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +1 -1
- datahub/ingestion/source/mode.py +13 -5
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +14 -21
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +6 -3
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/s3/source.py +26 -24
- datahub/ingestion/source/salesforce.py +13 -9
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
- datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +12 -7
- datahub/ingestion/source/sql/cockroachdb.py +5 -3
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +7 -9
- datahub/ingestion/source/sql/mssql/source.py +2 -2
- datahub/ingestion/source/sql/mysql.py +2 -2
- datahub/ingestion/source/sql/oracle.py +3 -3
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -4
- datahub/ingestion/source/sql/trino.py +2 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +1 -1
- datahub/ingestion/source/sql_queries.py +6 -6
- datahub/ingestion/source/state/checkpoint.py +5 -1
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
- datahub/ingestion/source/superset.py +122 -15
- datahub/ingestion/source/tableau/tableau.py +68 -14
- datahub/ingestion/source/tableau/tableau_common.py +5 -0
- datahub/ingestion/source/tableau/tableau_constant.py +1 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +7 -3
- datahub/ingestion/source/usage/usage_common.py +3 -3
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/metadata/_internal_schema_classes.py +728 -528
- datahub/metadata/_urns/urn_defs.py +1702 -1702
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/schema.avsc +17434 -17732
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
- datahub/metadata/schemas/LogicalParent.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
- datahub/sdk/_shared.py +126 -0
- datahub/sdk/chart.py +87 -30
- datahub/sdk/dashboard.py +79 -34
- datahub/sdk/entity_client.py +11 -4
- datahub/sdk/lineage_client.py +3 -3
- datahub/sdk/search_filters.py +1 -7
- datahub/sql_parsing/split_statements.py +13 -0
- {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/query.py

@@ -89,7 +89,7 @@ class RedshiftCommonQuery:
     ) -> str:
         # NOTE: it looks like description is available only in pg_description
         # So this remains preferrred way
-        tables_query = """
+        tables_query = f"""
         SELECT CASE c.relkind
             WHEN 'r' THEN 'TABLE'
             WHEN 'v' THEN 'VIEW'
@@ -120,6 +120,7 @@ class RedshiftCommonQuery:
         LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
         LEFT JOIN pg_class_info as ci on c.oid = ci.reloid
         LEFT JOIN pg_catalog.pg_description pgd ON pgd.objsubid = 0 AND pgd.objoid = c.oid
+        JOIN svv_redshift_schemas rs ON rs.schema_name = n.nspname AND rs.database_name = '{database}'
         WHERE c.relkind IN ('r','v','m','S','f')
         AND n.nspname !~ '^pg_'
         AND n.nspname != 'information_schema'
@@ -128,23 +129,24 @@ class RedshiftCommonQuery:
         external_tables_query = f"""
         SELECT 'EXTERNAL_TABLE' as tabletype,
             NULL AS "schema_oid",
-            schemaname AS "schema",
+            t.schemaname AS "schema",
             NULL AS "rel_oid",
-            tablename AS "relname",
+            t.tablename AS "relname",
             NULL as "creation_time",
             NULL AS "diststyle",
             NULL AS "owner_id",
             NULL AS "owner_name",
             NULL AS "view_definition",
             NULL AS "privileges",
-            "location",
-            parameters,
-            input_format,
-            output_format,
-            serde_parameters,
+            t."location",
+            t.parameters,
+            t.input_format,
+            t.output_format,
+            t.serde_parameters,
             NULL as table_description
-        FROM pg_catalog.svv_external_tables
-
+        FROM pg_catalog.svv_external_tables t
+        JOIN SVV_EXTERNAL_SCHEMAS s ON t.schemaname = s.schemaname
+        WHERE t.redshift_database_name='{database}'
         ORDER BY "schema",
                  "relname"
         """
@@ -232,11 +234,12 @@ class RedshiftCommonQuery:
             ON att.attrelid = c.oid
         LEFT JOIN pg_catalog.pg_attrdef ad
             ON (att.attrelid, att.attnum) = (ad.adrelid, ad.adnum)
+        JOIN svv_redshift_schemas rs ON rs.schema_name = n.nspname AND rs.database_name = '{database_name}'
         WHERE n.nspname !~ '^pg_'
             AND n.nspname != 'information_schema'
             AND att.attnum > 0
             AND NOT att.attisdropped
-            and
+            and n.nspname = '{schema_name}'
         UNION
         SELECT
             view_schema as "schema",
@@ -263,26 +266,27 @@ class RedshiftCommonQuery:
         WHERE 1 and schema = '{schema_name}'
         UNION
         SELECT
-            schemaname as "schema",
-            tablename as "table_name",
-            columnname as "name",
+            c.schemaname as "schema",
+            c.tablename as "table_name",
+            c.columnname as "name",
             null as "encode",
             -- Spectrum represents data types differently.
             -- Standardize, so we can infer types.
-            external_type AS "type",
+            c.external_type AS "type",
             null as "distkey",
             0 as "sortkey",
             null as "notnull",
             null as "comment",
             null as "adsrc",
             null as "attnum",
-            external_type AS "format_type",
+            c.external_type AS "format_type",
             null as "default",
             null as "schema_oid",
             null as "table_oid"
-        FROM SVV_EXTERNAL_COLUMNS
-
-
+        FROM SVV_EXTERNAL_COLUMNS c
+        JOIN SVV_EXTERNAL_SCHEMAS s ON c.schemaname = s.schemaname
+        WHERE c.schemaname = '{schema_name}'
+            AND c.redshift_database_name = '{database_name}'
         ORDER BY "schema", "table_name", "attnum"
         """

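Note on the redshift/query.py hunks above: the internal-catalog and Spectrum queries are now rendered as f-strings and joined against svv_redshift_schemas / SVV_EXTERNAL_SCHEMAS, so results are scoped to the connected database and schema. A minimal sketch of what that interpolation implies (simplified query and an illustrative render_tables_query helper, not the DataHub code):

```python
# Sketch only: render a database-scoped tables query the way the f-string above implies.
tables_query_template = """
SELECT c.relname
FROM pg_catalog.pg_class c
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
JOIN svv_redshift_schemas rs
  ON rs.schema_name = n.nspname AND rs.database_name = '{database}'
WHERE c.relkind IN ('r', 'v', 'm', 'S', 'f')
"""

def render_tables_query(database: str) -> str:
    # The join keeps only schemas registered for the requested database,
    # which is what scopes results in cross-database / datashare setups.
    return tables_query_template.format(database=database)

print(render_tables_query("dev"))
```

Interpolating the database name directly into SQL like this assumes it comes from trusted configuration rather than user input.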
datahub/ingestion/source/s3/source.py

@@ -115,14 +115,7 @@ profiling_flags_to_report = [
     "include_field_sample_values",
 ]

-
-# LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG = AddDatasetBrowsePathConfig(
-#     path_templates=["/ENV/PLATFORMDATASET_PARTS"], replace_existing=True
-# )
-#
-# LOCAL_BROWSE_PATH_TRANSFORMER = AddDatasetBrowsePathTransformer(
-#     ctx=None, config=LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG
-# )
+URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")


 def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
@@ -448,9 +441,8 @@ class S3Source(StatefulIngestionSourceBase):
                 self.source_config.verify_ssl
             )

-
-
-            )
+            path = re.sub(URI_SCHEME_REGEX, "s3://", table_data.full_path)
+            file = smart_open(path, "rb", transport_params={"client": s3_client})
         else:
             # We still use smart_open here to take advantage of the compression
             # capabilities of smart_open.
@@ -668,11 +660,9 @@ class S3Source(StatefulIngestionSourceBase):
         aspects: List[Optional[_Aspect]] = []

         logger.info(f"Extracting table schema from file: {table_data.full_path}")
-
-
-
-            else table_data.table_path.strip("/")
-        )
+
+        # remove protocol and any leading or trailing slashes
+        browse_path = re.sub(URI_SCHEME_REGEX, "", table_data.table_path).strip("/")

         data_platform_urn = make_data_platform_urn(self.source_config.platform)
         logger.info(f"Creating dataset urn with name: {browse_path}")
@@ -806,10 +796,20 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return relative_path

-    def
-
-
-
+    def extract_table_name_and_path(
+        self, path_spec: PathSpec, path: str
+    ) -> Tuple[str, str]:
+        # Extract the table name and base path from a path that's been normalized back to the
+        # "s3://" scheme that matches the path_spec
+        table_name, table_path = path_spec.extract_table_name_and_path(
+            self._normalize_uri_for_pattern_matching(path)
+        )
+        # Then convert the table base path back to the original scheme
+        scheme = re.match(URI_SCHEME_REGEX, path)
+        if scheme:
+            table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
+
+        return table_name, table_path

     def extract_table_data(
         self,
@@ -819,7 +819,7 @@ class S3Source(StatefulIngestionSourceBase):
         path = browse_path.file
         partitions = browse_path.partitions
         logger.debug(f"Getting table data for path: {path}")
-        table_name, table_path =
+        table_name, table_path = self.extract_table_name_and_path(path_spec, path)
         return TableData(
             display_name=table_name,
             is_s3=self.is_s3_platform(),
@@ -992,7 +992,9 @@ class S3Source(StatefulIngestionSourceBase):
             )

             # If partition_id is None, it means the folder is not a partition
-            partition_id = path_spec.get_partition_from_path(
+            partition_id = path_spec.get_partition_from_path(
+                self._normalize_uri_for_pattern_matching(max_file_s3_path)
+            )

             yield Folder(
                 partition_id=partition_id,
@@ -1143,8 +1145,8 @@ class S3Source(StatefulIngestionSourceBase):

             # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
             # This uses the compiled regex pattern to extract the table name from the full path
-            table_name,
-                table_s3_path
+            table_name, _ = self.extract_table_name_and_path(
+                path_spec, table_s3_path
             )

             # Apply table name filtering if configured
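Note on the s3/source.py hunks above: URI_SCHEME_REGEX and the new extract_table_name_and_path helper normalize any object-store scheme (s3a://, gs://, abfss://, ...) to s3:// before path-spec matching, then restore the original scheme afterwards. A rough sketch of that round trip, with illustrative helper names rather than the source's API:

```python
import re

# Matches any "<scheme>://" prefix at the start of a URI.
URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

def normalize_for_matching(path: str) -> str:
    # Rewrite the scheme to "s3://" so a path spec written for s3:// still matches.
    return re.sub(URI_SCHEME_REGEX, "s3://", path)

def restore_scheme(original: str, normalized: str) -> str:
    # Put the original scheme back on a path that was normalized for matching.
    scheme = re.match(URI_SCHEME_REGEX, original)
    return re.sub(URI_SCHEME_REGEX, scheme[0], normalized) if scheme else normalized

p = "s3a://my-bucket/tables/orders/year=2024/part-0.parquet"
assert normalize_for_matching(p).startswith("s3://")
assert restore_scheme(p, normalize_for_matching(p)) == p
```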
datahub/ingestion/source/salesforce.py

@@ -110,30 +110,33 @@ class SalesforceConfig(
     auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD

     # Username, Password Auth
-    username: Optional[str] = Field(description="Salesforce username")
-    password: Optional[str] = Field(description="Password for Salesforce user")
+    username: Optional[str] = Field(None, description="Salesforce username")
+    password: Optional[str] = Field(None, description="Password for Salesforce user")
     consumer_key: Optional[str] = Field(
-        description="Consumer key for Salesforce JSON web token access"
+        None, description="Consumer key for Salesforce JSON web token access"
     )
     private_key: Optional[str] = Field(
-        description="Private key as a string for Salesforce JSON web token access"
+        None, description="Private key as a string for Salesforce JSON web token access"
     )
     security_token: Optional[str] = Field(
-        description="Security token for Salesforce username"
+        None, description="Security token for Salesforce username"
    )
     # client_id, client_secret not required

     # Direct - Instance URL, Access Token Auth
     instance_url: Optional[str] = Field(
-
+        None,
+        description="Salesforce instance url. e.g. https://MyDomainName.my.salesforce.com",
     )
     # Flag to indicate whether the instance is production or sandbox
     is_sandbox: bool = Field(
         default=False, description="Connect to Sandbox instance of your Salesforce"
     )
-    access_token: Optional[str] = Field(
+    access_token: Optional[str] = Field(
+        None, description="Access token for instance url"
+    )

-    ingest_tags:
+    ingest_tags: bool = Field(
         default=False,
         description="Ingest Tags from source. This will override Tags entered from UI",
     )
@@ -147,7 +150,8 @@ class SalesforceConfig(
         description='Regex patterns for tables/schemas to describe domain_key domain key (domain_key can be any string like "sales".) There can be multiple domain keys specified.',
     )
     api_version: Optional[str] = Field(
-
+        None,
+        description="If specified, overrides default version used by the Salesforce package. Example value: '59.0'",
     )

     profiling: SalesforceProfilingConfig = SalesforceProfilingConfig()
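Note on the salesforce.py hunks above: under pydantic v2, an Optional annotation no longer implies a default, so `Field(description=...)` without a default makes the field required; passing `None` as the first argument restores the optional behaviour these credentials need. A minimal sketch of the difference, using a toy AuthConfig model (assuming pydantic v2, not the DataHub config class):

```python
from typing import Optional
from pydantic import BaseModel, Field, ValidationError

class AuthConfig(BaseModel):
    # Required despite the Optional annotation: no default was given.
    required_token: Optional[str] = Field(description="token")
    # Truly optional: None is the explicit default.
    optional_token: Optional[str] = Field(None, description="token")

try:
    AuthConfig()
except ValidationError as e:
    print(e)  # reports that required_token is missing

print(AuthConfig(required_token=None).optional_token)  # None
```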
datahub/ingestion/source/schema/json_schema.py

@@ -4,7 +4,6 @@ import logging
 import os
 import tempfile
 import unittest
-import urllib.request
 from dataclasses import dataclass
 from os.path import basename, dirname
 from pathlib import Path
@@ -12,6 +11,7 @@ from typing import Any, Iterable, List, Optional, Union
 from urllib.parse import urlparse

 import jsonref
+import requests
 from pydantic import AnyHttpUrl, DirectoryPath, FilePath, validator
 from pydantic.fields import Field

@@ -91,19 +91,18 @@ class JsonSchemaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin
     )

     @validator("path")
-    def download_http_url_to_temp_file(v):
+    def download_http_url_to_temp_file(cls, v):
         if isinstance(v, AnyHttpUrl):
             try:
-
-
-
-
-
-
-                )
-
-
-                return tmp_file.name
+                response = requests.get(str(v))
+                response.raise_for_status()
+                schema_dict = response.json()
+                if not JsonSchemaTranslator._get_id_from_any_schema(schema_dict):
+                    schema_dict["$id"] = str(v)
+                with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
+                    tmp_file.write(json.dumps(schema_dict))
+                    tmp_file.flush()
+                    return tmp_file.name
             except Exception as e:
                 logger.error(
                     f"Failed to localize url {v} due to {e}. Run with --debug to get full stacktrace"
@@ -353,7 +352,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
         if self.config.platform_instance:
             browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"

-        if
+        if isinstance(self.config.path, Path) and self.config.path.is_dir():
             for root, _, files in os.walk(self.config.path, topdown=False):
                 for file_name in [f for f in files if f.endswith(".json")]:
                     try:
@@ -373,10 +372,11 @@ class JsonSchemaSource(StatefulIngestionSourceBase):

         else:
             try:
+                assert isinstance(self.config.path, Path)
                 yield from self._load_one_file(
                     ref_loader,
                     browse_prefix=browse_prefix,
-                    root_dir=
+                    root_dir=self.config.path.parent,
                     file_name=str(self.config.path),
                 )
             except Exception as e:
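Note on the json_schema.py hunks above: the path validator gains the implicit `cls` argument and now downloads remote schemas with requests, injects an `$id` when the schema lacks one, and writes the result to a temporary file for later parsing. A standalone sketch of that localize step, using a hypothetical localize_json_schema helper that is not part of the source:

```python
import json
import tempfile
import requests

def localize_json_schema(url: str) -> str:
    # Fetch a JSON Schema over HTTP and persist it locally for offline parsing.
    response = requests.get(url)
    response.raise_for_status()
    schema = response.json()
    # Record where the schema came from so $ref resolution stays stable.
    schema.setdefault("$id", url)
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
        json.dump(schema, tmp)
        return tmp.name
```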
datahub/ingestion/source/sigma/data_classes.py

@@ -1,3 +1,4 @@
+from copy import deepcopy
 from datetime import datetime
 from typing import Dict, List, Optional

@@ -23,6 +24,8 @@ class Workspace(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Update name if presonal workspace
         if values["name"] == "User Folder":
             values["name"] = "My documents"
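Note on the sigma/data_classes.py hunk above: the pre root_validator now works on a deep copy of its input, so renaming the personal workspace can never leak back into the dict the caller passed in. A toy sketch of the behaviour (illustrative model, not the source classes):

```python
from copy import deepcopy
from typing import Dict
from pydantic import BaseModel, root_validator

class Workspace(BaseModel):
    name: str

    @root_validator(pre=True)
    def update_values(cls, values: Dict) -> Dict:
        # Work on a copy so the caller's dict is guaranteed to stay untouched.
        values = deepcopy(values)
        if values["name"] == "User Folder":
            values["name"] = "My documents"
        return values

raw = {"name": "User Folder"}
ws = Workspace.parse_obj(raw)
print(ws.name)  # "My documents"
print(raw)      # still {"name": "User Folder"}
```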