acryl-datahub 1.1.0.4rc3__py3-none-any.whl → 1.1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub was flagged as potentially problematic by the registry.
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2499 -2501
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +149 -131
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/cli/check_cli.py +65 -11
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +3 -4
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +24 -8
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +47 -45
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +73 -30
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +12 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/glue.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
- datahub/ingestion/source/dbt/dbt_common.py +3 -1
- datahub/ingestion/source/dremio/dremio_api.py +38 -27
- datahub/ingestion/source/dremio/dremio_source.py +7 -7
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +28 -20
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -12
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/mssql/source.py +24 -15
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +11 -0
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +57 -2
- datahub/ingestion/source/tableau/tableau.py +57 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +56 -30
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1253 -536
- datahub/metadata/_urns/urn_defs.py +1797 -1685
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +16614 -16538
- datahub/metadata/schemas/ContainerProperties.avsc +2 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataJobInfo.avsc +2 -0
- datahub/metadata/schemas/DataProcessKey.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +4 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
- datahub/metadata/schemas/MLModelKey.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/datajob.py +39 -15
- datahub/sdk/lineage_client.py +2 -0
- datahub/sdk/main_client.py +14 -2
- datahub/sdk/search_client.py +4 -3
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
datahub/metadata/schemas/LogicalParent.avsc
@@ -0,0 +1,140 @@
+{
+  "type": "record",
+  "Aspect": {
+    "name": "logicalParent"
+  },
+  "name": "LogicalParent",
+  "namespace": "com.linkedin.pegasus2avro.logical",
+  "fields": [
+    {
+      "Relationship": {
+        "/destinationUrn": {
+          "createdActor": "parent/created/actor",
+          "createdOn": "parent/created/time",
+          "entityTypes": [
+            "dataset",
+            "schemaField"
+          ],
+          "name": "PhysicalInstanceOf",
+          "properties": "parent/properties",
+          "updatedActor": "parent/lastModified/actor",
+          "updatedOn": "parent/lastModified/time"
+        }
+      },
+      "Searchable": {
+        "/destinationUrn": {
+          "addToFilters": true,
+          "fieldName": "logicalParent",
+          "fieldType": "URN",
+          "filterNameOverride": "Physical Instance Of",
+          "hasValuesFieldName": "hasLogicalParent",
+          "queryByDefault": false
+        }
+      },
+      "type": {
+        "type": "record",
+        "name": "Edge",
+        "namespace": "com.linkedin.pegasus2avro.common",
+        "fields": [
+          {
+            "java": {
+              "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+            },
+            "type": [
+              "null",
+              "string"
+            ],
+            "name": "sourceUrn",
+            "default": null,
+            "doc": "Urn of the source of this relationship edge.\nIf not specified, assumed to be the entity that this aspect belongs to.",
+            "Urn": "Urn"
+          },
+          {
+            "java": {
+              "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+            },
+            "type": "string",
+            "name": "destinationUrn",
+            "doc": "Urn of the destination of this relationship edge.",
+            "Urn": "Urn"
+          },
+          {
+            "type": [
+              "null",
+              {
+                "type": "record",
+                "name": "AuditStamp",
+                "namespace": "com.linkedin.pegasus2avro.common",
+                "fields": [
+                  {
+                    "type": "long",
+                    "name": "time",
+                    "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
+                  },
+                  {
+                    "java": {
+                      "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+                    },
+                    "type": "string",
+                    "name": "actor",
+                    "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
+                    "Urn": "Urn"
+                  },
+                  {
+                    "java": {
+                      "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+                    },
+                    "type": [
+                      "null",
+                      "string"
+                    ],
+                    "name": "impersonator",
+                    "default": null,
+                    "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
+                    "Urn": "Urn"
+                  },
+                  {
+                    "type": [
+                      "null",
+                      "string"
+                    ],
+                    "name": "message",
+                    "default": null,
+                    "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
+                  }
+                ],
+                "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
+              }
+            ],
+            "name": "created",
+            "default": null,
+            "doc": "Audit stamp containing who created this relationship edge and when"
+          },
+          {
+            "type": [
+              "null",
+              "com.linkedin.pegasus2avro.common.AuditStamp"
+            ],
+            "name": "lastModified",
+            "default": null,
+            "doc": "Audit stamp containing who last modified this relationship edge and when"
+          },
+          {
+            "type": [
+              "null",
+              {
+                "type": "map",
+                "values": "string"
+              }
+            ],
+            "name": "properties",
+            "default": null,
+            "doc": "A generic properties bag that allows us to store specific information on this graph edge."
+          }
+        ],
+        "doc": "A common structure to represent all edges to entities when used inside aspects as collections\nThis ensures that all edges have common structure around audit-stamps and will support PATCH, time-travel automatically."
+      },
+      "name": "parent"
+    }
+  ]
+}
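The new logicalParent aspect wraps a single Edge pointing at the entity's logical parent. A minimal sketch of emitting it, assuming the generated Python class follows the usual <AspectName>Class naming (LogicalParentClass is not confirmed here) and using placeholder URNs:

    import datahub.metadata.schema_classes as models
    from datahub.emitter.mce_builder import make_dataset_urn
    from datahub.emitter.mcp import MetadataChangeProposalWrapper

    # Placeholder URNs: a physical table and the logical dataset it instantiates.
    physical = make_dataset_urn("snowflake", "prod_db.schema.orders_us_east")
    logical = make_dataset_urn("snowflake", "prod_db.schema.orders")

    # LogicalParentClass is assumed from the aspect name; verify against the release.
    mcp = MetadataChangeProposalWrapper(
        entityUrn=physical,
        aspect=models.LogicalParentClass(
            parent=models.EdgeClass(destinationUrn=logical),
        ),
    )
    # Any DataHub emitter (REST or Kafka) can then emit this proposal.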
@@ -60,6 +60,7 @@
   "QA": "Designates quality assurance fabrics",
   "RVW": "Designates review fabrics",
   "SANDBOX": "Designates sandbox fabrics",
+  "SBX": "Alternative spelling for sandbox",
   "SIT": "System Integration Testing",
   "STG": "Designates staging fabrics",
   "TEST": "Designates testing fabrics",

@@ -83,6 +84,7 @@
   "PRD",
   "TST",
   "SIT",
+  "SBX",
   "SANDBOX"
 ],
 "doc": "Fabric group type"

@@ -67,6 +67,7 @@
   "QA": "Designates quality assurance fabrics",
   "RVW": "Designates review fabrics",
   "SANDBOX": "Designates sandbox fabrics",
+  "SBX": "Alternative spelling for sandbox",
   "SIT": "System Integration Testing",
   "STG": "Designates staging fabrics",
   "TEST": "Designates testing fabrics",

@@ -90,6 +91,7 @@
   "PRD",
   "TST",
   "SIT",
+  "SBX",
   "SANDBOX"
 ],
 "doc": "Fabric group type"

@@ -81,6 +81,7 @@
   "QA": "Designates quality assurance fabrics",
   "RVW": "Designates review fabrics",
   "SANDBOX": "Designates sandbox fabrics",
+  "SBX": "Alternative spelling for sandbox",
   "SIT": "System Integration Testing",
   "STG": "Designates staging fabrics",
   "TEST": "Designates testing fabrics",

@@ -104,6 +105,7 @@
   "PRD",
   "TST",
   "SIT",
+  "SBX",
   "SANDBOX"
 ],
 "doc": "Fabric group type"

@@ -2430,6 +2430,7 @@
   "QA": "Designates quality assurance fabrics",
   "RVW": "Designates review fabrics",
   "SANDBOX": "Designates sandbox fabrics",
+  "SBX": "Alternative spelling for sandbox",
   "SIT": "System Integration Testing",
   "STG": "Designates staging fabrics",
   "TEST": "Designates testing fabrics",

@@ -2453,6 +2454,7 @@
   "PRD",
   "TST",
   "SIT",
+  "SBX",
   "SANDBOX"
 ],
 "doc": "Fabric group type"
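These hunks add the new SBX fabric symbol ("Alternative spelling for sandbox") to the FabricType enum embedded in several of the bundled Avro schemas listed above. A small sketch, with a placeholder dataset name, of building a URN in the new environment:

    from datahub.emitter.mce_builder import make_dataset_urn

    # After this release, "SBX" is an accepted fabric/env value.
    urn = make_dataset_urn(platform="postgres", name="sandbox_db.public.events", env="SBX")
    # urn:li:dataset:(urn:li:dataPlatform:postgres,sandbox_db.public.events,SBX)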
datahub/metadata/schemas/QuerySubjects.avsc
@@ -15,13 +15,6 @@
 "namespace": "com.linkedin.pegasus2avro.query",
 "fields": [
   {
-    "Relationship": {
-      "entityTypes": [
-        "dataset",
-        "schemaField"
-      ],
-      "name": "IsAssociatedWith"
-    },
     "Searchable": {
       "fieldName": "entities",
       "fieldType": "URN"

@@ -32,11 +25,7 @@
     "type": "string",
     "name": "entity",
     "doc": "An entity which is the subject of a query.",
-    "Urn": "Urn"
-    "entityTypes": [
-      "dataset",
-      "schemaField"
-    ]
+    "Urn": "Urn"
   }
 ],
 "doc": "A single subject of a particular query.\nIn the future, we may evolve this model to include richer details\nabout the Query Subject in relation to the query."
datahub/sdk/datajob.py CHANGED
@@ -6,6 +6,7 @@ from typing import Dict, List, Optional, Type
 
 from typing_extensions import Self
 
+import datahub.emitter.mce_builder as builder
 import datahub.metadata.schema_classes as models
 from datahub.cli.cli_utils import first_non_null
 from datahub.errors import IngestionAttributionWarning

@@ -64,7 +65,7 @@ class DataJob(
         """Get the URN type for data jobs."""
         return DataJobUrn
 
-    def __init__(
+    def __init__(  # noqa: C901
         self,
         *,
         name: str,

@@ -86,6 +87,7 @@ class DataJob(
         domain: Optional[DomainInputType] = None,
         inlets: Optional[List[DatasetUrnOrStr]] = None,
         outlets: Optional[List[DatasetUrnOrStr]] = None,
+        fine_grained_lineages: Optional[List[models.FineGrainedLineageClass]] = None,
         structured_properties: Optional[StructuredPropertyInputType] = None,
         extra_aspects: ExtraAspectsType = None,
     ):

@@ -103,12 +105,14 @@ class DataJob(
             ValueError: If neither flow nor (flow_urn and platform_instance) are provided
         """
         if flow is None:
-            if flow_urn is None
+            if flow_urn is None:
                 raise ValueError(
                     "You must provide either: 1. a DataFlow object, or 2. a DataFlowUrn (and a platform_instance config if required)"
                 )
             flow_urn = DataFlowUrn.from_string(flow_urn)
-            if flow_urn.flow_id.startswith(
+            if platform_instance and flow_urn.flow_id.startswith(
+                f"{platform_instance}."
+            ):
                 flow_name = flow_urn.flow_id[len(platform_instance) + 1 :]
             else:
                 flow_name = flow_urn.flow_id

@@ -133,8 +137,6 @@ class DataJob(
         )
         self._setdefault_aspect(job_info)
         self._ensure_datajob_props().flowUrn = str(flow.urn)
-
-        # Set properties if provided
         if description is not None:
             self.set_description(description)
         if external_url is not None:

@@ -145,8 +147,6 @@ class DataJob(
             self.set_created(created)
         if last_modified is not None:
             self.set_last_modified(last_modified)
-
-        # Set standard aspects
         if subtype is not None:
             self.set_subtype(subtype)
         if owners is not None:

@@ -159,13 +159,19 @@ class DataJob(
             self.set_terms(terms)
         if domain is not None:
             self.set_domain(domain)
+        if structured_properties is not None:
+            for key, value in structured_properties.items():
+                self.set_structured_property(property_urn=key, values=value)
         if inlets is not None:
             self.set_inlets(inlets)
         if outlets is not None:
             self.set_outlets(outlets)
-        if
-
-
+        if fine_grained_lineages is not None:
+            self.set_fine_grained_lineages(fine_grained_lineages)
+
+        if self.flow_urn.cluster.upper() in builder.ALL_ENV_TYPES:
+            env = self.flow_urn.cluster.upper()
+            self._ensure_datajob_props().env = env
 
     @classmethod
     def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:

@@ -201,9 +207,7 @@ class DataJob(
     ) -> Optional[models.DataJobInputOutputClass]:
         return self._get_aspect(models.DataJobInputOutputClass)
 
-    def _ensure_datajob_inputoutput_props(
-        self,
-    ) -> models.DataJobInputOutputClass:
+    def _ensure_datajob_inputoutput_props(self) -> models.DataJobInputOutputClass:
         return self._setdefault_aspect(
             models.DataJobInputOutputClass(inputDatasets=[], outputDatasets=[])
         )

@@ -307,8 +311,6 @@ class DataJob(
             browse_path.append(
                 models.BrowsePathEntryClass(id=entry.id, urn=entry.urn)
             )
-
-        # Add the job itself to the path
         browse_path.append(models.BrowsePathEntryClass(id=flow.name, urn=str(flow.urn)))
         # Set the browse path aspect
         self._set_aspect(models.BrowsePathsV2Class(path=browse_path))

@@ -341,3 +343,25 @@ class DataJob(
         self._ensure_datajob_inputoutput_props().outputDatasets.append(
             str(outlet_urn)
         )
+
+    @property
+    def fine_grained_lineages(self) -> List[models.FineGrainedLineageClass]:
+        io_aspect = self._get_datajob_inputoutput_props()
+        return (
+            io_aspect.fineGrainedLineages
+            if io_aspect and io_aspect.fineGrainedLineages
+            else []
+        )
+
+    def set_fine_grained_lineages(
+        self, lineages: List[models.FineGrainedLineageClass]
+    ) -> None:
+        io_aspect = self._ensure_datajob_inputoutput_props()
+        if io_aspect.fineGrainedLineages is None:
+            io_aspect.fineGrainedLineages = []
+        io_aspect.fineGrainedLineages.extend(lineages)
+
+    @property
+    def env(self) -> Optional[str]:
+        """Get the environment of the data job."""
+        return str(self._ensure_datajob_props().env)
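The DataJob SDK entity now accepts column-level lineage at construction time and exposes a fine_grained_lineages property plus set_fine_grained_lineages. A hedged sketch with placeholder names (the DataFlow/DataJob arguments beyond those visible in this diff are not verified here):

    import datahub.metadata.schema_classes as models
    from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn
    from datahub.sdk import DataFlow, DataJob

    upstream = DatasetUrn(platform="snowflake", name="db.schema.src_orders")
    downstream = DatasetUrn(platform="snowflake", name="db.schema.dst_orders")

    # One column-level edge: dst_orders.id is derived from src_orders.id.
    cll = models.FineGrainedLineageClass(
        upstreamType=models.FineGrainedLineageUpstreamTypeClass.FIELD_SET,
        downstreamType=models.FineGrainedLineageDownstreamTypeClass.FIELD,
        upstreams=[str(SchemaFieldUrn(str(upstream), "id"))],
        downstreams=[str(SchemaFieldUrn(str(downstream), "id"))],
    )

    flow = DataFlow(platform="airflow", name="daily_etl")
    job = DataJob(
        name="copy_orders",
        flow=flow,
        inlets=[upstream],
        outlets=[downstream],
        fine_grained_lineages=[cll],  # new keyword in this release
    )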
datahub/sdk/lineage_client.py CHANGED
@@ -478,6 +478,7 @@ class LineageClient:
         env: str = "PROD",
         default_db: Optional[str] = None,
         default_schema: Optional[str] = None,
+        override_dialect: Optional[str] = None,
     ) -> None:
         """Add lineage by parsing a SQL query."""
         from datahub.sql_parsing.sqlglot_lineage import (

@@ -493,6 +494,7 @@ class LineageClient:
             platform_instance=platform_instance,
             env=env,
             graph=self._client._graph,
+            override_dialect=override_dialect,
         )
 
         if parsed_result.debug_info.table_error:
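The new override_dialect option lets callers force a specific sqlglot dialect instead of relying on the platform's default guess. A hedged sketch against the lower-level parser helper this client imports (the exact keyword set of create_lineage_sql_parsed_result, and override_dialect being a plain dialect string, are assumptions):

    from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

    result = create_lineage_sql_parsed_result(
        query="INSERT INTO analytics.daily_orders SELECT * FROM raw.orders",
        default_db="WAREHOUSE",
        platform="snowflake",
        platform_instance=None,
        env="PROD",
        override_dialect="snowflake",  # assumed: force the dialect rather than inferring it
    )
    print(result.in_tables, result.out_tables)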
datahub/sdk/main_client.py CHANGED
@@ -66,7 +66,12 @@ class DataHubClient:
         self._graph.test_connection()
 
     @classmethod
-    def from_env(
+    def from_env(
+        cls,
+        *,
+        client_mode: ClientMode = ClientMode.SDK,
+        datahub_component: Optional[str] = None,
+    ) -> "DataHubClient":
         """Initialize a DataHubClient from the environment variables or ~/.datahubenv file.
 
         This will first check DATAHUB_GMS_URL and DATAHUB_GMS_TOKEN. If not present,

@@ -76,6 +81,10 @@ class DataHubClient:
         If you're looking to specify the server/token in code, use the
         DataHubClient(server=..., token=...) constructor instead.
 
+        Args:
+            client_mode: [internal] The client mode to use. Defaults to "SDK".
+            datahub_component: [internal] The DataHub component name to include in the user agent.
+
         Returns:
             A DataHubClient instance.
         """

@@ -83,7 +92,10 @@ class DataHubClient:
         # Inspired by the DockerClient.from_env() method.
         # TODO: This one also reads from ~/.datahubenv, so the "from_env" name might be a bit confusing.
         # That file is part of the "environment", but is not a traditional "env variable".
-        graph = get_default_graph(
+        graph = get_default_graph(
+            client_mode=client_mode,
+            datahub_component=datahub_component,
+        )
 
         return cls(graph=graph)
 
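from_env() is now keyword-only, and both new parameters are marked internal, so typical SDK usage is unchanged:

    from datahub.sdk import DataHubClient

    # Reads DATAHUB_GMS_URL / DATAHUB_GMS_TOKEN, falling back to ~/.datahubenv.
    client = DataHubClient.from_env()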
datahub/sdk/search_client.py CHANGED
@@ -19,6 +19,7 @@ from datahub.sdk.search_filters import (
     _OrFilters,
     _StatusFilter,
 )
+from datahub.utilities.ordered_set import OrderedSet
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient

@@ -80,7 +81,7 @@ def compute_entity_types(
 ) -> Optional[List[str]]:
     found_filters = False
     found_positive_filters = False
-    entity_types:
+    entity_types: OrderedSet[str] = OrderedSet()
     for ands in filters:
         for clause in ands["and"]:
             if clause.field == _EntityTypeFilter.ENTITY_TYPE_FIELD:

@@ -88,7 +89,7 @@ def compute_entity_types(
                 if not clause.negated:
                     found_positive_filters = True
 
-                entity_types.
+                entity_types.update(clause.values)
 
     if not found_filters:
         # If we didn't find any filters, use None so we use the default set.

@@ -100,7 +101,7 @@ def compute_entity_types(
         # still want to use the default set.
         return None
 
-    return entity_types
+    return list(entity_types)
 
 
 class SearchClient:
datahub/specific/dataproduct.py CHANGED
@@ -9,6 +9,9 @@ from datahub.metadata.schema_classes import (
 )
 from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
 from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
+from datahub.specific.aspect_helpers.structured_properties import (
+    HasStructuredPropertiesPatch,
+)
 from datahub.specific.aspect_helpers.tags import HasTagsPatch
 from datahub.specific.aspect_helpers.terms import HasTermsPatch
 

@@ -16,6 +19,7 @@ from datahub.specific.aspect_helpers.terms import HasTermsPatch
 class DataProductPatchBuilder(
     HasOwnershipPatch,
     HasCustomPropertiesPatch,
+    HasStructuredPropertiesPatch,
     HasTagsPatch,
     HasTermsPatch,
     MetadataPatchProposal,
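DataProductPatchBuilder now mixes in the structured-properties patch helper. A hedged sketch, assuming the mixin exposes set_structured_property like the other patch builders and using placeholder URNs:

    from datahub.specific.dataproduct import DataProductPatchBuilder

    patch = DataProductPatchBuilder("urn:li:dataProduct:customer_360")
    # set_structured_property and the property URN below are illustrative assumptions.
    patch.set_structured_property(
        "urn:li:structuredProperty:io.acryl.dataManagement.certifier",
        "urn:li:corpuser:jdoe",
    )
    for mcp in patch.build():
        print(mcp.aspectName)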
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED
@@ -58,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
     ToolMetaExtractorReport,
 )
 from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
+from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.file_backed_collections import (
     ConnectionWrapper,
     FileBackedDict,

@@ -140,6 +141,7 @@ class QueryMetadata:
 
     used_temp_tables: bool = True
 
+    extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 
     def make_created_audit_stamp(self) -> models.AuditStampClass:

@@ -263,7 +265,7 @@ class PreparsedQuery:
     query_type_props: QueryTypeProps = dataclasses.field(
         default_factory=lambda: QueryTypeProps()
     )
-    # Use this to store
+    # Use this to store additional key-value information about the query for debugging.
     extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 

@@ -948,6 +950,7 @@ class SqlParsingAggregator(Closeable):
                 column_usage=parsed.column_usage or {},
                 confidence_score=parsed.confidence_score,
                 used_temp_tables=session_has_temp_tables,
+                extra_info=parsed.extra_info,
                 origin=parsed.origin,
             )
         )

@@ -1491,9 +1494,9 @@ class SqlParsingAggregator(Closeable):
            return
 
        # If a query doesn't involve any allowed tables, skip it.
-       if
-       self.is_allowed_table(
-       ):
+       if (
+           downstream_urn is None or not self.is_allowed_table(downstream_urn)
+       ) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
            self.report.num_queries_skipped_due_to_filters += 1
            return
 

@@ -1574,27 +1577,33 @@ class SqlParsingAggregator(Closeable):
 
         @dataclasses.dataclass
         class QueryLineageInfo:
-            upstreams:
-
+            upstreams: OrderedSet[
+                UrnStr
+            ]  # this is direct upstreams, with *no temp tables*
+            column_lineage: OrderedSet[ColumnLineageInfo]
             confidence_score: float
 
             def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
-                self.upstreams
-                self.column_lineage
+                self.upstreams.update(other_query.upstreams)
+                self.column_lineage.update(other_query.column_lineage)
                 self.confidence_score = min(
                     self.confidence_score, other_query.confidence_score
                 )
 
+        cache: Dict[str, QueryLineageInfo] = {}
+
         def _recurse_into_query(
             query: QueryMetadata, recursion_path: List[QueryId]
         ) -> QueryLineageInfo:
             if query.query_id in recursion_path:
                 # This is a cycle, so we just return the query as-is.
                 return QueryLineageInfo(
-                    upstreams=query.upstreams,
-                    column_lineage=query.column_lineage,
+                    upstreams=OrderedSet(query.upstreams),
+                    column_lineage=OrderedSet(query.column_lineage),
                     confidence_score=query.confidence_score,
                 )
+            if query.query_id in cache:
+                return cache[query.query_id]
             recursion_path = [*recursion_path, query.query_id]
             composed_of_queries.add(query.query_id)
 

@@ -1609,7 +1618,7 @@ class SqlParsingAggregator(Closeable):
                 upstream_query = self._query_map.get(upstream_query_id)
                 if (
                     upstream_query
-                    and upstream_query.query_id not in
+                    and upstream_query.query_id not in recursion_path
                 ):
                     temp_query_lineage_info = _recurse_into_query(
                         upstream_query, recursion_path

@@ -1669,11 +1678,14 @@ class SqlParsingAggregator(Closeable):
                 ]
             )
 
-
-            upstreams=
-            column_lineage=new_cll,
+            ret = QueryLineageInfo(
+                upstreams=new_upstreams,
+                column_lineage=OrderedSet(new_cll),
                 confidence_score=new_confidence_score,
             )
+            cache[query.query_id] = ret
+
+            return ret
 
         resolved_lineage_info = _recurse_into_query(base_query, [])
 

@@ -1706,15 +1718,15 @@ class SqlParsingAggregator(Closeable):
         )
 
         merged_query_text = ";\n\n".join(
-            [q.formatted_query_string for q in ordered_queries]
+            deduplicate_list([q.formatted_query_string for q in ordered_queries])
         )
 
         resolved_query = dataclasses.replace(
             base_query,
             query_id=composite_query_id,
             formatted_query_string=merged_query_text,
-            upstreams=resolved_lineage_info.upstreams,
-            column_lineage=resolved_lineage_info.column_lineage,
+            upstreams=list(resolved_lineage_info.upstreams),
+            column_lineage=list(resolved_lineage_info.column_lineage),
             confidence_score=resolved_lineage_info.confidence_score,
         )
 
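Two of the aggregator changes tighten composite-query handling: recursive lineage resolution is now memoized in a per-call cache, and repeated query strings are deduplicated before being joined into the merged query text. The helper it imports is a small order-preserving de-duplication utility:

    from datahub.utilities.dedup_list import deduplicate_list

    queries = [
        "CREATE TEMP TABLE t AS SELECT 1",
        "INSERT INTO x SELECT * FROM t",
        "CREATE TEMP TABLE t AS SELECT 1",
    ]
    # Keeps first occurrences in order: the duplicate CREATE statement is dropped.
    print(";\n\n".join(deduplicate_list(queries)))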