PyPI - acryl-datahub - Versions diffs - 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl - Mend

acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic. Click here for more details.

Files changed (78)

{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
datahub/_version.py +1 -1
datahub/cli/check_cli.py +0 -7
datahub/cli/cli_utils.py +73 -0
datahub/cli/delete_cli.py +0 -6
datahub/cli/docker_check.py +107 -12
datahub/cli/docker_cli.py +148 -228
datahub/cli/exists_cli.py +0 -4
datahub/cli/get_cli.py +0 -4
datahub/cli/ingest_cli.py +1 -20
datahub/cli/put_cli.py +0 -6
datahub/cli/quickstart_versioning.py +50 -5
datahub/cli/specific/assertions_cli.py +0 -6
datahub/cli/specific/datacontract_cli.py +0 -6
datahub/cli/specific/dataproduct_cli.py +0 -22
datahub/cli/specific/dataset_cli.py +0 -11
datahub/cli/specific/forms_cli.py +0 -6
datahub/cli/specific/group_cli.py +0 -4
datahub/cli/specific/structuredproperties_cli.py +0 -7
datahub/cli/specific/user_cli.py +0 -4
datahub/cli/state_cli.py +0 -4
datahub/cli/timeline_cli.py +0 -4
datahub/entrypoints.py +4 -3
datahub/ingestion/api/report.py +183 -35
datahub/ingestion/autogenerated/capability_summary.json +3431 -0
datahub/ingestion/autogenerated/lineage.json +401 -0
datahub/ingestion/autogenerated/lineage_helper.py +30 -128
datahub/ingestion/extractor/schema_util.py +13 -4
datahub/ingestion/graph/client.py +2 -2
datahub/ingestion/run/pipeline.py +47 -1
datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
datahub/ingestion/source/common/subtypes.py +1 -1
datahub/ingestion/source/data_lake_common/object_store.py +40 -0
datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
datahub/ingestion/source/dremio/dremio_source.py +7 -7
datahub/ingestion/source/gcs/gcs_source.py +13 -2
datahub/ingestion/source/ge_data_profiler.py +28 -20
datahub/ingestion/source/identity/okta.py +0 -13
datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
datahub/ingestion/source/powerbi/powerbi.py +0 -5
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
datahub/ingestion/source/redshift/usage.py +4 -3
datahub/ingestion/source/s3/source.py +19 -3
datahub/ingestion/source/sigma/sigma.py +6 -1
datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
datahub/ingestion/source/sql/hive_metastore.py +0 -10
datahub/ingestion/source/sql/sql_common.py +4 -0
datahub/ingestion/source/sql/vertica.py +0 -4
datahub/ingestion/source/sql_queries.py +2 -2
datahub/ingestion/source/superset.py +56 -1
datahub/ingestion/source/tableau/tableau.py +40 -34
datahub/ingestion/source/tableau/tableau_constant.py +0 -2
datahub/ingestion/source/unity/proxy.py +4 -3
datahub/ingestion/source/unity/source.py +19 -9
datahub/integrations/assertion/snowflake/compiler.py +4 -3
datahub/metadata/_internal_schema_classes.py +85 -4
datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
datahub/metadata/schema.avsc +54 -1
datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
datahub/sdk/lineage_client.py +2 -0
datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
datahub/sql_parsing/sqlglot_lineage.py +40 -13
datahub/upgrade/upgrade.py +46 -13
datahub/utilities/server_config_util.py +8 -0
datahub/utilities/sqlalchemy_query_combiner.py +5 -2
{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0

datahub/ingestion/autogenerated/lineage.json ADDED Viewed

@@ -0,0 +1,401 @@
+{
+  "entities": {
+    "dataJob": {
+      "dataJobInputOutput": {
+        "aspect": "dataJobInputOutput",
+        "fields": [
+          {
+            "name": "inputDatasets",
+            "path": "inputDatasets",
+            "isLineage": true,
+            "relationship": {
+              "name": "Consumes",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "inputDatasetEdges",
+            "path": "inputDatasetEdges",
+            "isLineage": true,
+            "relationship": {
+              "name": "Consumes",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "outputDatasets",
+            "path": "outputDatasets",
+            "isLineage": true,
+            "relationship": {
+              "name": "Produces",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "outputDatasetEdges",
+            "path": "outputDatasetEdges",
+            "isLineage": true,
+            "relationship": {
+              "name": "Produces",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "inputDatajobs",
+            "path": "inputDatajobs",
+            "isLineage": true,
+            "relationship": {
+              "name": "DownstreamOf",
+              "entityTypes": [
+                "dataJob"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "inputDatajobEdges",
+            "path": "inputDatajobEdges",
+            "isLineage": true,
+            "relationship": {
+              "name": "DownstreamOf",
+              "entityTypes": [
+                "dataJob"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      }
+    },
+    "dataProcessInstance": {
+      "dataProcessInstanceOutput": {
+        "aspect": "dataProcessInstanceOutput",
+        "fields": [
+          {
+            "name": "outputEdges",
+            "path": "outputEdges",
+            "isLineage": true,
+            "relationship": {
+              "name": "DataProcessInstanceProduces",
+              "entityTypes": [
+                "dataset",
+                "mlModel",
+                "dataProcessInstance"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      },
+      "dataProcessInstanceInput": {
+        "aspect": "dataProcessInstanceInput",
+        "fields": [
+          {
+            "name": "inputEdges",
+            "path": "inputEdges",
+            "isLineage": true,
+            "relationship": {
+              "name": "DataProcessInstanceConsumes",
+              "entityTypes": [
+                "dataset",
+                "mlModel",
+                "dataProcessInstance"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      }
+    },
+    "dataProcess": {
+      "dataProcessInfo": {
+        "aspect": "dataProcessInfo",
+        "fields": [
+          {
+            "name": "inputs",
+            "path": "inputs",
+            "isLineage": true,
+            "relationship": {
+              "name": "Consumes",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "outputs",
+            "path": "outputs",
+            "isLineage": true,
+            "relationship": {
+              "name": "Consumes",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      }
+    },
+    "dataset": {
+      "upstreamLineage": {
+        "aspect": "upstreamLineage",
+        "fields": [
+          {
+            "name": "dataset",
+            "path": "upstreams.dataset",
+            "isLineage": true,
+            "relationship": {
+              "name": "DownstreamOf",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      }
+    },
+    "chart": {
+      "chartInfo": {
+        "aspect": "chartInfo",
+        "fields": [
+          {
+            "name": "inputs",
+            "path": "inputs",
+            "isLineage": true,
+            "relationship": {
+              "name": "Consumes",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "inputEdges",
+            "path": "inputEdges",
+            "isLineage": true,
+            "relationship": {
+              "name": "Consumes",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      }
+    },
+    "dashboard": {
+      "dashboardInfo": {
+        "aspect": "dashboardInfo",
+        "fields": [
+          {
+            "name": "charts",
+            "path": "charts",
+            "isLineage": true,
+            "relationship": {
+              "name": "Contains",
+              "entityTypes": [
+                "chart"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "chartEdges",
+            "path": "chartEdges",
+            "isLineage": true,
+            "relationship": {
+              "name": "Contains",
+              "entityTypes": [
+                "chart"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "datasets",
+            "path": "datasets",
+            "isLineage": true,
+            "relationship": {
+              "name": "Consumes",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "datasetEdges",
+            "path": "datasetEdges",
+            "isLineage": true,
+            "relationship": {
+              "name": "Consumes",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "dashboards",
+            "path": "dashboards",
+            "isLineage": true,
+            "relationship": {
+              "name": "DashboardContainsDashboard",
+              "entityTypes": [
+                "dashboard"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      }
+    },
+    "mlModelGroup": {
+      "mlModelGroupProperties": {
+        "aspect": "mlModelGroupProperties",
+        "fields": [
+          {
+            "name": "trainingJobs",
+            "path": "trainingJobs",
+            "isLineage": true,
+            "relationship": {
+              "name": "TrainedBy",
+              "entityTypes": [
+                "dataJob",
+                "dataProcessInstance"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "downstreamJobs",
+            "path": "downstreamJobs",
+            "isLineage": true,
+            "relationship": {
+              "name": "UsedBy",
+              "entityTypes": [
+                "dataJob",
+                "dataProcessInstance"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      }
+    },
+    "mlFeature": {
+      "mlFeatureProperties": {
+        "aspect": "mlFeatureProperties",
+        "fields": [
+          {
+            "name": "sources",
+            "path": "sources",
+            "isLineage": true,
+            "relationship": {
+              "name": "DerivedFrom",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      }
+    },
+    "mlPrimaryKey": {
+      "mlPrimaryKeyProperties": {
+        "aspect": "mlPrimaryKeyProperties",
+        "fields": [
+          {
+            "name": "sources",
+            "path": "sources",
+            "isLineage": true,
+            "relationship": {
+              "name": "DerivedFrom",
+              "entityTypes": [
+                "dataset"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      }
+    },
+    "mlModel": {
+      "mlModelProperties": {
+        "aspect": "mlModelProperties",
+        "fields": [
+          {
+            "name": "trainingJobs",
+            "path": "trainingJobs",
+            "isLineage": true,
+            "relationship": {
+              "name": "TrainedBy",
+              "entityTypes": [
+                "dataJob",
+                "dataProcessInstance"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "downstreamJobs",
+            "path": "downstreamJobs",
+            "isLineage": true,
+            "relationship": {
+              "name": "UsedBy",
+              "entityTypes": [
+                "dataJob",
+                "dataProcessInstance"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "mlFeatures",
+            "path": "mlFeatures",
+            "isLineage": true,
+            "relationship": {
+              "name": "Consumes",
+              "entityTypes": [
+                "mlFeature"
+              ],
+              "isLineage": true
+            }
+          },
+          {
+            "name": "groups",
+            "path": "groups",
+            "isLineage": true,
+            "relationship": {
+              "name": "MemberOf",
+              "entityTypes": [
+                "mlModelGroup"
+              ],
+              "isLineage": true
+            }
+          }
+        ]
+      }
+    }
+  },
+  "generated_by": "metadata-ingestion/scripts/modeldocgen.py",
+  "generated_at": "2025-07-01T10:49:03.713749+00:00"
+}

datahub/ingestion/autogenerated/lineage_helper.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import json
 import logging
+from functools import lru_cache
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set
-from datahub.utilities.urns.urn import guess_entity_type
+from typing import Dict, List, Optional
 logger = logging.getLogger(__name__)
@@ -18,10 +17,9 @@ def _load_lineage_data() -> Dict:
     Load lineage data from the autogenerated lineage.json file.
     Returns:
-        Dict containing the lineage information
+        Dict containing the lineage information, or empty dict if file doesn't exist
     Raises:
-        FileNotFoundError: If lineage.json doesn't exist
         json.JSONDecodeError: If lineage.json is malformed
     """
     global _lineage_data
@@ -34,151 +32,55 @@ def _load_lineage_data() -> Dict:
     lineage_file = current_file.parent / "lineage.json"
     if not lineage_file.exists():
-        raise FileNotFoundError(f"Lineage file not found: {lineage_file}")
+        logger.warning(
+            f"Lineage file not found: {lineage_file}. "
+            "This may indicate a packaging issue. Lineage detection will be disabled."
+        )
+        _lineage_data = {}
+        return _lineage_data
     try:
         with open(lineage_file, "r") as f:
             _lineage_data = json.load(f)
         return _lineage_data
     except json.JSONDecodeError as e:
-        raise json.JSONDecodeError(
-            f"Failed to parse lineage.json: {e}", e.doc, e.pos
-        ) from e
-def get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
-    """
-    This is experimental internal API subject to breaking changes without prior notice.
-    Get lineage fields for a specific entity type and aspect.
-    Args:
-        entity_type: The entity type (e.g., 'dataset', 'dataJob')
-        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
-    Returns:
-        List of lineage field dictionaries, each containing:
-        - name: field name
-        - path: dot-notation path to the field
-        - isLineage: boolean indicating if it's lineage
-        - relationship: relationship information
-    Raises:
-        FileNotFoundError: If lineage.json doesn't exist
-        json.JSONDecodeError: If lineage.json is malformed
-    """
-    lineage_data = _load_lineage_data()
-    entity_data = lineage_data.get("entities", {}).get(entity_type, {})
-    aspect_data = entity_data.get(aspect_name, {})
-    return aspect_data.get("fields", [])
-def is_lineage_field(urn: str, aspect_name: str, field_path: str) -> bool:
-    """
-    This is experimental internal API subject to breaking changes without prior notice.
-    Check if a specific field path is lineage-related.
-    Args:
-        urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
-        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
-        field_path: The dot-notation path to the field (e.g., 'upstreams.dataset')
-    Returns:
-        True if the field is lineage-related, False otherwise
-    Raises:
-        FileNotFoundError: If lineage.json doesn't exist
-        json.JSONDecodeError: If lineage.json is malformed
-        AssertionError: If URN doesn't start with 'urn:li:'
-    """
-    entity_type = guess_entity_type(urn)
-    lineage_fields = get_lineage_fields(entity_type, aspect_name)
-    for field in lineage_fields:
-        if field.get("path") == field_path:
-            return field.get("isLineage", False)
-    return False
+        logger.error(
+            f"Failed to parse lineage.json: {e}. Lineage detection will be disabled."
+        )
+        _lineage_data = {}
+        return _lineage_data
-def has_lineage(urn: str, aspect: Any) -> bool:
+def _get_fields(entity_type: str, aspect_name: str) -> List[Dict]:
     """
     This is experimental internal API subject to breaking changes without prior notice.
-    Check if an aspect has any lineage fields.
-    Args:
-        urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
-        aspect: The aspect object
-    Returns:
-        True if the aspect has lineage fields, False otherwise
-    Raises:
-        FileNotFoundError: If lineage.json doesn't exist
-        json.JSONDecodeError: If lineage.json is malformed
-        AssertionError: If URN doesn't start with 'urn:li:'
     """
-    entity_type = guess_entity_type(urn)
-    aspect_class = getattr(aspect, "__class__", None)
-    aspect_name = (
-        aspect_class.__name__ if aspect_class is not None else str(type(aspect))
+    return (
+        _load_lineage_data()
+        .get("entities", {})
+        .get(entity_type, {})
+        .get(aspect_name, {})
+        .get("fields", [])
     )
-    lineage_fields = get_lineage_fields(entity_type, aspect_name)
-    return len(lineage_fields) > 0
-def has_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
+def _get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
     """
     This is experimental internal API subject to breaking changes without prior notice.
-    Check if an aspect has any lineage fields.
-    Args:
-        entity_type: The entity type (e.g., 'dataset', 'dataJob')
-        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
-    Returns:
-        True if the aspect has lineage fields, False otherwise
-    Raises:
-        FileNotFoundError: If lineage.json doesn't exist
-        json.JSONDecodeError: If lineage.json is malformed
     """
-    lineage_fields = get_lineage_fields(entity_type, aspect_name)
-    return len(lineage_fields) > 0
+    return [
+        field
+        for field in _get_fields(entity_type, aspect_name)
+        if field.get("isLineage", False)
+    ]
-def get_all_lineage_aspects(entity_type: str) -> Set[str]:
+@lru_cache(maxsize=128)
+def is_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
     """
     This is experimental internal API subject to breaking changes without prior notice.
-    Get all aspects that have lineage fields for a given entity type.
-    Args:
-        entity_type: The entity type (e.g., 'dataset', 'dataJob')
-    Returns:
-        Set of aspect names that have lineage fields
-    Raises:
-        FileNotFoundError: If lineage.json doesn't exist
-        json.JSONDecodeError: If lineage.json is malformed
     """
-    lineage_data = _load_lineage_data()
-    entity_data = lineage_data.get("entities", {}).get(entity_type, {})
-    lineage_aspects = set()
-    for aspect_name, aspect_data in entity_data.items():
-        if aspect_data.get("fields"):
-            lineage_aspects.add(aspect_name)
-    return lineage_aspects
+    return len(_get_lineage_fields(entity_type, aspect_name)) > 0
 def clear_cache() -> None:

datahub/ingestion/extractor/schema_util.py CHANGED Viewed

@@ -125,7 +125,7 @@ class AvroToMceSchemaConverter:
         self._prefix_name_stack: PrefixNameStack = [self.version_string]
         # Tracks the fields on the current path.
         self._fields_stack: FieldStack = []
-        # Tracks the record types seen so far. Used to prevent infinite recursion with recursive types.
+        # Stack of record types currently being processed. Used to prevent infinite recursion with recursive types.
         self._record_types_seen: List[str] = []
         # If part of the key-schema or value-schema.
         self._is_key_schema = is_key_schema
@@ -522,10 +522,12 @@ class AvroToMceSchemaConverter:
         # Handle recursive record definitions
         recurse: bool = True
         if isinstance(schema, avro.schema.RecordSchema):
-            if schema.fullname not in self._record_types_seen:
-                self._record_types_seen.append(schema.fullname)
-            else:
+            # Only prevent recursion if we're currently processing this record type (true recursion)
+            # Allow reuse of the same record type in different contexts
+            if schema.fullname in self._record_types_seen:
                 recurse = False
+            else:
+                self._record_types_seen.append(schema.fullname)
         # Adjust actual schema if needed
         actual_schema = self._get_underlying_type_if_option_as_union(schema, schema)
@@ -559,6 +561,13 @@ class AvroToMceSchemaConverter:
                     for sub_schema in self._get_sub_schemas(actual_schema):
                         yield from self._to_mce_fields(sub_schema)
+        # Clean up the processing stack
+        if (
+            isinstance(schema, avro.schema.RecordSchema)
+            and schema.fullname in self._record_types_seen
+        ):
+            self._record_types_seen.remove(schema.fullname)
     def _gen_non_nested_to_mce_fields(
         self, schema: SchemaOrField
     ) -> Iterable[SchemaField]:

datahub/ingestion/graph/client.py CHANGED Viewed

@@ -1576,7 +1576,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         env: str = DEFAULT_ENV,
         default_db: Optional[str] = None,
         default_schema: Optional[str] = None,
-        default_dialect: Optional[str] = None,
+        override_dialect: Optional[str] = None,
     ) -> "SqlParsingResult":
         from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
@@ -1590,7 +1590,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-            default_dialect=default_dialect,
+            override_dialect=override_dialect,
         )
     def create_tag(self, tag_name: str) -> str:

acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

Potentially problematic release.

acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl