acryl-datahub 0.15.0rc4__py3-none-any.whl → 0.15.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0rc4.dist-info → acryl_datahub-0.15.0rc6.dist-info}/METADATA +2504 -2474
- {acryl_datahub-0.15.0rc4.dist-info → acryl_datahub-0.15.0rc6.dist-info}/RECORD +24 -22
- {acryl_datahub-0.15.0rc4.dist-info → acryl_datahub-0.15.0rc6.dist-info}/entry_points.txt +1 -0
- datahub/__init__.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +11 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_source.py +3 -0
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +21 -8
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +1 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/sql_types.py +72 -7
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/utilities/urn_encoder.py +2 -1
- {acryl_datahub-0.15.0rc4.dist-info → acryl_datahub-0.15.0rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc4.dist-info → acryl_datahub-0.15.0rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/neo4j/neo4j_source.py ADDED

@@ -0,0 +1,331 @@
import logging
import time
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional, Type, Union

import pandas as pd
from neo4j import GraphDatabase
from pydantic.fields import Field

from datahub.configuration.source_common import EnvConfigMixin
from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
    SupportStatus,
    config_class,
    platform_name,
    support_status,
)
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
from datahub.metadata.schema_classes import (
    AuditStampClass,
    BooleanTypeClass,
    DatasetPropertiesClass,
    DateTypeClass,
    NullTypeClass,
    NumberTypeClass,
    OtherSchemaClass,
    SchemaFieldClass,
    SchemaMetadataClass,
    StringTypeClass,
    SubTypesClass,
    UnionTypeClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

_type_mapping: Dict[Union[Type, str], Type] = {
    "list": UnionTypeClass,
    "boolean": BooleanTypeClass,
    "integer": NumberTypeClass,
    "local_date_time": DateTypeClass,
    "float": NumberTypeClass,
    "string": StringTypeClass,
    "date": DateTypeClass,
    "node": StringTypeClass,
    "relationship": StringTypeClass,
}


class Neo4jConfig(EnvConfigMixin):
    username: str = Field(description="Neo4j Username")
    password: str = Field(description="Neo4j Password")
    uri: str = Field(description="The URI for the Neo4j server")
    env: str = Field(description="Neo4j env")


@dataclass
class Neo4jSourceReport(SourceReport):
    obj_failures: int = 0
    obj_created: int = 0


@platform_name("Neo4j", id="neo4j")
@config_class(Neo4jConfig)
@support_status(SupportStatus.CERTIFIED)
class Neo4jSource(Source):
    NODE = "node"
    RELATIONSHIP = "relationship"
    PLATFORM = "neo4j"

    def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
        self.ctx = ctx
        self.config = config
        self.report = Neo4jSourceReport()

    @classmethod
    def create(cls, config_dict, ctx):
        config = Neo4jConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
        type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
        return SchemaFieldDataType(type=type_class())

    def get_schema_field_class(
        self, col_name: str, col_type: str, **kwargs: Any
    ) -> SchemaFieldClass:
        if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP:
            col_type = self.NODE
        else:
            col_type = col_type
        return SchemaFieldClass(
            fieldPath=col_name,
            type=self.get_field_type(col_type),
            nativeDataType=col_type,
            description=col_type.upper()
            if col_type in (self.NODE, self.RELATIONSHIP)
            else col_type,
            lastModified=AuditStampClass(
                time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
            ),
        )

    def add_properties(
        self,
        dataset: str,
        description: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
    ) -> MetadataChangeProposalWrapper:
        dataset_properties = DatasetPropertiesClass(
            description=description,
            customProperties=custom_properties,
        )
        return MetadataChangeProposalWrapper(
            entityUrn=make_dataset_urn(
                platform=self.PLATFORM, name=dataset, env=self.config.env
            ),
            aspect=dataset_properties,
        )

    def generate_neo4j_object(
        self, dataset: str, columns: list, obj_type: Optional[str] = None
    ) -> MetadataChangeProposalWrapper:
        try:
            fields = [
                self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
                for d in columns
                for key, value in d.items()
            ]
            mcp = MetadataChangeProposalWrapper(
                entityUrn=make_dataset_urn(
                    platform=self.PLATFORM, name=dataset, env=self.config.env
                ),
                aspect=SchemaMetadataClass(
                    schemaName=dataset,
                    platform=make_data_platform_urn(self.PLATFORM),
                    version=0,
                    hash="",
                    platformSchema=OtherSchemaClass(rawSchema=""),
                    lastModified=AuditStampClass(
                        time=round(time.time() * 1000),
                        actor="urn:li:corpuser:ingestion",
                    ),
                    fields=fields,
                ),
            )
            self.report.obj_created += 1
        except Exception as e:
            log.error(e)
            self.report.obj_failures += 1
        return mcp

    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
        driver = GraphDatabase.driver(
            self.config.uri, auth=(self.config.username, self.config.password)
        )
        """
        This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary
        with two columns: key and value. The key represents the Neo4j object, while the value contains the
        corresponding metadata.

        When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's
        metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for
        relationships.

        In the node dataframe, several fields are extracted and added as new columns. Similarly, in the relationship
        dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe.

        Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a
        single dataframe, which will be used to create the DataHub objects.

        See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md
        """
        try:
            log.info(f"{query}")
            with driver.session() as session:
                result = session.run(query)
                data = [record for record in result]
                log.info("Closing Neo4j driver")
                driver.close()

                node_df = self.process_nodes(data)
                rel_df = self.process_relationships(data, node_df)

                union_cols = ["key", "obj_type", "property_data_types", "description"]
                df = pd.concat([node_df[union_cols], rel_df[union_cols]])
        except Exception as e:
            self.report.failure(
                message="Failed to get neo4j metadata",
                exc=e,
            )

        return df

    def process_nodes(self, data: list) -> pd.DataFrame:
        nodes = [record for record in data if record["value"]["type"] == self.NODE]
        node_df = pd.DataFrame(
            nodes,
            columns=["key", "value"],
        )
        node_df["obj_type"] = node_df["value"].apply(
            lambda record: self.get_obj_type(record)
        )
        node_df["relationships"] = node_df["value"].apply(
            lambda record: self.get_relationships(record)
        )
        node_df["properties"] = node_df["value"].apply(
            lambda record: self.get_properties(record)
        )
        node_df["property_data_types"] = node_df["properties"].apply(
            lambda record: self.get_property_data_types(record)
        )
        node_df["description"] = node_df.apply(
            lambda record: self.get_node_description(record, node_df), axis=1
        )
        return node_df

    def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
        rels = [
            record for record in data if record["value"]["type"] == self.RELATIONSHIP
        ]
        rel_df = pd.DataFrame(rels, columns=["key", "value"])
        rel_df["obj_type"] = rel_df["value"].apply(
            lambda record: self.get_obj_type(record)
        )
        rel_df["properties"] = rel_df["value"].apply(
            lambda record: self.get_properties(record)
        )
        rel_df["property_data_types"] = rel_df["properties"].apply(
            lambda record: self.get_property_data_types(record)
        )
        rel_df["description"] = rel_df.apply(
            lambda record: self.get_rel_descriptions(record, node_df), axis=1
        )
        return rel_df

    def get_obj_type(self, record: dict) -> str:
        return record["type"]

    def get_rel_descriptions(self, record: dict, df: pd.DataFrame) -> str:
        descriptions = []
        for _, row in df.iterrows():
            relationships = row.get("relationships", {})
            for relationship, props in relationships.items():
                if record["key"] == relationship:
                    if props["direction"] == "in":
                        for prop in props["labels"]:
                            descriptions.append(
                                f"({row['key']})-[{record['key']}]->({prop})"
                            )
        return "\n".join(descriptions)

    def get_node_description(self, record: dict, df: pd.DataFrame) -> str:
        descriptions = []
        for _, row in df.iterrows():
            if record["key"] == row["key"]:
                for relationship, props in row["relationships"].items():
                    direction = props["direction"]
                    for node in set(props["labels"]):
                        if direction == "in":
                            descriptions.append(
                                f"({row['key']})<-[{relationship}]-({node})"
                            )
                        elif direction == "out":
                            descriptions.append(
                                f"({row['key']})-[{relationship}]->({node})"
                            )

        return "\n".join(descriptions)

    def get_property_data_types(self, record: dict) -> List[dict]:
        return [{k: v["type"]} for k, v in record.items()]

    def get_properties(self, record: dict) -> str:
        return record["properties"]

    def get_relationships(self, record: dict) -> dict:
        return record.get("relationships", None)

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        df = self.get_neo4j_metadata(
            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
        )
        for index, row in df.iterrows():
            try:
                yield MetadataWorkUnit(
                    id=row["key"],
                    mcp=self.generate_neo4j_object(
                        columns=row["property_data_types"],
                        dataset=row["key"],
                    ),
                    is_primary_source=True,
                )

                yield MetadataWorkUnit(
                    id=row["key"],
                    mcp=MetadataChangeProposalWrapper(
                        entityUrn=make_dataset_urn(
                            platform=self.PLATFORM,
                            name=row["key"],
                            env=self.config.env,
                        ),
                        aspect=SubTypesClass(
                            typeNames=[
                                DatasetSubTypes.NEO4J_NODE
                                if row["obj_type"] == self.NODE
                                else DatasetSubTypes.NEO4J_RELATIONSHIP
                            ]
                        ),
                    ),
                )

                yield MetadataWorkUnit(
                    id=row["key"],
                    mcp=self.add_properties(
                        dataset=row["key"],
                        custom_properties=None,
                        description=row["description"],
                    ),
                )

            except Exception as e:
                raise e

    def get_report(self):
        return self.report
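For orientation, here is a minimal usage sketch (not part of the diff) showing how the new source could be created and run programmatically. The URI and credentials are placeholders, and iterating the work units requires a reachable Neo4j instance with the APOC procedures installed, since the source runs `CALL apoc.meta.schema()`.

# Minimal sketch, assuming a reachable Neo4j instance with APOC installed.
# The connection details below are placeholders, not values from the release.
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.neo4j.neo4j_source import Neo4jSource

source = Neo4jSource.create(
    config_dict={
        "uri": "bolt://localhost:7687",  # placeholder
        "username": "neo4j",             # placeholder
        "password": "password",          # placeholder
        "env": "PROD",
    },
    ctx=PipelineContext(run_id="neo4j-metadata-demo"),
)

# Each node label / relationship type becomes a dataset with schema,
# subtype, and dataset-properties aspects.
for wu in source.get_workunits_internal():
    print(wu.id)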
datahub/ingestion/source/snowflake/snowflake_schema_gen.py CHANGED

@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 logger = logging.getLogger(__name__)
 
 # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+# TODO: Move to the standardized types in sql_types.py
 SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
     "DATE": DateType,
     "BIGINT": NumberType,
datahub/ingestion/source/sql/athena.py CHANGED

@@ -26,6 +26,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
 from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
@@ -48,6 +50,15 @@ from datahub.utilities.sqlalchemy_type_converter import (
     get_schema_fields_for_sqlalchemy_column,
 )
 
+try:
+    from typing_extensions import override
+except ImportError:
+    _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+    def override(f: _F, /) -> _F:  # noqa: F811
+        return f
+
+
 logger = logging.getLogger(__name__)
 
 assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
         - Profiling when enabled.
     """
 
-
+    config: AthenaConfig
+    report: SQLSourceReport
 
     def __init__(self, config, ctx):
        super().__init__(config, ctx, "athena")
        self.cursor: Optional[BaseCursor] = None
 
+        self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = AthenaConfig.parse_obj(config_dict)
@@ -452,6 +466,7 @@ class AthenaSource(SQLAlchemySource):
         )
 
     # It seems like database/schema filter in the connection string does not work and this to work around that
+    @override
     def get_schema_names(self, inspector: Inspector) -> List[str]:
         athena_config = typing.cast(AthenaConfig, self.config)
         schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ class AthenaSource(SQLAlchemySource):
             return [schema for schema in schemas if schema == athena_config.database]
         return schemas
 
-
+    @classmethod
+    def _casted_partition_key(cls, key: str) -> str:
+        # We need to cast the partition keys to a VARCHAR, since otherwise
+        # Athena may throw an error during concatenation / comparison.
+        return f"CAST({key} as VARCHAR)"
+
+    @override
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
-    ) -> List[str]:
-
-
-        athena_config = typing.cast(AthenaConfig, self.config)
-
-        if not athena_config.extract_partitions:
-            return []
+    ) -> Optional[List[str]]:
+        if not self.config.extract_partitions:
+            return None
 
         if not self.cursor:
-            return
+            return None
 
         metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
             table_name=table, schema_name=schema
         )
 
-
-
-
-
-
-        return []
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        if not partitions:
+            return []
 
-
-
+        with self.report.report_exc(
+            message="Failed to extract partition details",
+            context=f"{schema}.{table}",
+            level=StructuredLogLevel.WARN,
+        ):
+            # We create an artifical concatenated partition key to be able to query max partition easier
+            part_concat = " || '-' || ".join(
+                self._casted_partition_key(key) for key in partitions
+            )
            max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
            ret = self.cursor.execute(max_partition_query)
            max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ class AthenaSource(SQLAlchemySource):
                partitions=partitions,
                max_partition=max_partition,
            )
-            return partitions
 
-        return
+        return partitions
 
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ class AthenaSource(SQLAlchemySource):
         if partition and partition.max_partition:
             max_partition_filters = []
             for key, value in partition.max_partition.items():
-                max_partition_filters.append(
+                max_partition_filters.append(
+                    f"{self._casted_partition_key(key)} = '{value}'"
+                )
             max_partition = str(partition.max_partition)
             return (
                 max_partition,
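To make the new partition handling concrete, the following sketch (not part of the diff) reproduces the string-building logic with a hypothetical table partitioned by year and month; the schema, table, and key names are made up, and only the formatting shown above is exercised, not a live Athena connection.

# Illustrative only: hypothetical schema, table, and partition keys.
def _casted_partition_key(key: str) -> str:
    # Same casting rule as AthenaSource._casted_partition_key above.
    return f"CAST({key} as VARCHAR)"

schema, table = "web", "events"   # hypothetical
partitions = ["year", "month"]    # hypothetical partition keys

# Concatenated, VARCHAR-cast partition key used to find the max partition.
part_concat = " || '-' || ".join(_casted_partition_key(k) for k in partitions)
max_partition_query = (
    f'select {",".join(partitions)} from "{schema}"."{table}$partitions" '
    f"where {part_concat} = (select max({part_concat}) from "
    f'"{schema}"."{table}$partitions")'
)

print(part_concat)
# CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR)
print(max_partition_query)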
datahub/ingestion/source/sql/sql_types.py CHANGED

@@ -1,5 +1,5 @@
 import re
-from typing import Any, Dict, ValuesView
+from typing import Any, Dict, Optional, Type, Union, ValuesView
 
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayType,
@@ -16,14 +16,28 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     UnionType,
 )
 
-
-
-
+DATAHUB_FIELD_TYPE = Union[
+    ArrayType,
+    BooleanType,
+    BytesType,
+    DateType,
+    EnumType,
+    MapType,
+    NullType,
+    NumberType,
+    RecordType,
+    StringType,
+    TimeType,
+    UnionType,
+]
 
-# we map from format_type since this is what dbt uses
-# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
 
-#
+# These can be obtained by running `select format_type(oid, null),* from pg_type;`
+# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
+# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+# We map from format_type since this is what dbt uses.
+# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
+# See https://www.npgsql.org/dev/types.html for helpful type annotations
 POSTGRES_TYPES_MAP: Dict[str, Any] = {
     "boolean": BooleanType,
     "bytea": BytesType,
@@ -430,3 +444,54 @@ VERTICA_SQL_TYPES_MAP: Dict[str, Any] = {
     "geography": None,
     "uuid": StringType,
 }
+
+
+_merged_mapping = {
+    "boolean": BooleanType,
+    "date": DateType,
+    "time": TimeType,
+    "numeric": NumberType,
+    "text": StringType,
+    "timestamp with time zone": DateType,
+    "timestamp without time zone": DateType,
+    "integer": NumberType,
+    "float8": NumberType,
+    "struct": RecordType,
+    **POSTGRES_TYPES_MAP,
+    **SNOWFLAKE_TYPES_MAP,
+    **BIGQUERY_TYPES_MAP,
+    **SPARK_SQL_TYPES_MAP,
+    **TRINO_SQL_TYPES_MAP,
+    **ATHENA_SQL_TYPES_MAP,
+    **VERTICA_SQL_TYPES_MAP,
+}
+
+
+def resolve_sql_type(
+    column_type: Optional[str],
+    platform: Optional[str] = None,
+) -> Optional[DATAHUB_FIELD_TYPE]:
+    # In theory, we should use the platform-specific mapping where available.
+    # However, the types don't ever conflict, so the merged mapping is fine.
+    TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
+        _merged_mapping.get(column_type) if column_type else None
+    )
+
+    if TypeClass is None and column_type:
+        # resolve a modified type
+        if platform == "trino":
+            TypeClass = resolve_trino_modified_type(column_type)
+        elif platform == "athena":
+            TypeClass = resolve_athena_modified_type(column_type)
+        elif platform == "postgres" or platform == "redshift":
+            # Redshift uses a variant of Postgres, so we can use the same logic.
+            TypeClass = resolve_postgres_modified_type(column_type)
+        elif platform == "vertica":
+            TypeClass = resolve_vertica_modified_type(column_type)
+        elif platform == "snowflake":
+            # Snowflake types are uppercase, so we check that.
+            TypeClass = _merged_mapping.get(column_type.upper())
+
+    if TypeClass:
+        return TypeClass()
+    return None
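A brief, hedged sketch of how the new resolve_sql_type helper behaves, using only keys that are visible in the mapping above; the commented results follow directly from that code.

# Minimal sketch of the new helper; only keys visible in the diff are used.
from datahub.ingestion.source.sql.sql_types import resolve_sql_type

print(type(resolve_sql_type("boolean")))      # BooleanType, via _merged_mapping
print(type(resolve_sql_type("struct")))       # RecordType, via _merged_mapping
print(resolve_sql_type(None))                 # None: no column type given
print(resolve_sql_type("not-a-real-type"))    # None: falls through every branch
# Platform hints only kick in for types missing from the merged mapping, e.g.
# resolve_sql_type("varchar(16)", platform="trino") delegates to the existing
# resolve_trino_modified_type resolver (behaviour depends on that resolver).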
datahub/ingestion/source/unity/proxy_types.py CHANGED

@@ -33,6 +33,7 @@ from datahub.metadata.schema_classes import (
 
 logger = logging.getLogger(__name__)
 
+# TODO: (maybe) Replace with standardized types in sql_types.py
 DATA_TYPE_REGISTRY: dict = {
     ColumnTypeName.BOOLEAN: BooleanTypeClass,
     ColumnTypeName.BYTE: BytesTypeClass,
datahub/utilities/urn_encoder.py CHANGED

@@ -4,7 +4,8 @@ from typing import List
 # NOTE: Frontend relies on encoding these three characters. Specifically, we decode and encode schema fields for column level lineage.
 # If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts
 # We also rely on encoding these exact three characters when generating schemaField urns in our graphQL layer. Update SchemaFieldUtils if this changes.
-
+# Also see https://datahubproject.io/docs/what/urn/#restrictions
+RESERVED_CHARS = {",", "(", ")", "␟"}
 RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"})
 
 
File without changes

File without changes