acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
- datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +46 -9
- datahub/ingestion/source/ge_profiling_config.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/sigma/data_classes.py +1 -0
- datahub/ingestion/source/sigma/sigma.py +101 -43
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +18 -6
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/cli/specific/dataproduct_cli.py
CHANGED

@@ -45,7 +45,7 @@ def _get_owner_urn(maybe_urn: str) -> str:
 
 
 def _abort_if_non_existent_urn(graph: DataHubGraph, urn: str, operation: str) -> None:
     try:
-        parsed_urn: Urn = Urn.
+        parsed_urn: Urn = Urn.from_string(urn)
         entity_type = parsed_urn.get_type()
     except Exception:
         click.secho(f"Provided urn {urn} does not seem valid", fg="red")
datahub/cli/specific/structuredproperties_cli.py
CHANGED

@@ -31,7 +31,8 @@ def properties() -> None:
 def upsert(file: Path) -> None:
     """Upsert structured properties in DataHub."""
 
-
+    with get_default_graph() as graph:
+        StructuredProperties.create(str(file), graph)
 
 
 @properties.command(
datahub/configuration/common.py
CHANGED
@@ -258,7 +258,7 @@ class AllowDenyPattern(ConfigModel):
         return AllowDenyPattern()
 
     def allowed(self, string: str) -> bool:
-        if self.
+        if self.denied(string):
             return False
 
         return any(
@@ -266,7 +266,7 @@ class AllowDenyPattern(ConfigModel):
             for allow_pattern in self.allow
         )
 
-    def
+    def denied(self, string: str) -> bool:
         for deny_pattern in self.deny:
             if re.match(deny_pattern, string, self.regex_flags):
                 return True
@@ -290,7 +290,7 @@ class AllowDenyPattern(ConfigModel):
            raise ValueError(
                "allow list must be fully specified to get list of allowed strings"
            )
-        return [a for a in self.allow if not self.
+        return [a for a in self.allow if not self.denied(a)]
 
     def __eq__(self, other):  # type: ignore
         return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
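For context, AllowDenyPattern is the allow/deny filter used across ingestion configs, and the change above makes the deny check a public denied() method that allowed() and the allow-list helper now call. A minimal usage sketch; the pattern strings are made up for illustration:

    from datahub.configuration.common import AllowDenyPattern

    # Hypothetical filter: keep everything in the analytics schema except tmp_ tables.
    pattern = AllowDenyPattern(allow=["analytics\\..*"], deny=[".*\\.tmp_.*"])

    assert pattern.allowed("analytics.orders")           # matches an allow pattern, not denied
    assert pattern.denied("analytics.tmp_scratch")       # deny patterns are checked first
    assert not pattern.allowed("analytics.tmp_scratch")  # denied entries are never allowed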
datahub/configuration/git.py
CHANGED
@@ -24,7 +24,11 @@ class GitReference(ConfigModel):
         "main",
         description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
     )
-
+    url_subdir: Optional[str] = Field(
+        default=None,
+        description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. "
+        "Only affects URL generation, not git operations.",
+    )
     url_template: Optional[str] = Field(
         None,
         description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required."
@@ -68,6 +72,8 @@ class GitReference(ConfigModel):
 
     def get_url_for_file_path(self, file_path: str) -> str:
         assert self.url_template
+        if self.url_subdir:
+            file_path = f"{self.url_subdir}/{file_path}"
         return self.url_template.format(
             repo_url=self.repo, branch=self.branch, file_path=file_path
         )
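A short sketch of what the new url_subdir option does to URL generation. The repo and file paths below are illustrative, and the example assumes the GitHub url_template inference mentioned in the field description:

    from datahub.configuration.git import GitReference

    # Hypothetical repo where the tracked files live under a "models/" subdirectory.
    ref = GitReference(
        repo="https://github.com/example-org/analytics",
        branch="main",
        url_subdir="models",
    )

    # URL generation prepends the subdirectory; clone/checkout behavior is unchanged.
    print(ref.get_url_for_file_path("staging/orders.sql"))
    # e.g. https://github.com/example-org/analytics/blob/main/models/staging/orders.sql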
datahub/configuration/kafka_consumer_config.py
CHANGED

@@ -1,3 +1,4 @@
+import inspect
 import logging
 from typing import Any, Dict, Optional
 
@@ -34,5 +35,34 @@ class CallableConsumerConfig:
                 "oauth_cb must be a string representing python function reference "
                 "in the format <python-module>:<function-name>."
             )
+
+        call_back_fn = import_path(call_back)
+        self._validate_call_back_fn_signature(call_back_fn)
+
         # Set the callback
-        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] =
+        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = call_back_fn
+
+    def _validate_call_back_fn_signature(self, call_back_fn: Any) -> None:
+        sig = inspect.signature(call_back_fn)
+
+        num_positional_args = len(
+            [
+                param
+                for param in sig.parameters.values()
+                if param.kind
+                in (
+                    inspect.Parameter.POSITIONAL_ONLY,
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                )
+                and param.default == inspect.Parameter.empty
+            ]
+        )
+
+        has_variadic_args = any(
+            param.kind == inspect.Parameter.VAR_POSITIONAL
+            for param in sig.parameters.values()
+        )
+
+        assert num_positional_args == 1 or (
+            has_variadic_args and num_positional_args <= 1
+        ), "oauth_cb function must accept single positional argument."
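For reference, the new signature check expects the configured oauth_cb callable to take exactly one required positional argument, which matches the confluent-kafka OAuth callback convention. A sketch of a function that would pass validation; the module path and token logic are hypothetical:

    # my_company/kafka_auth.py, referenced in the consumer config as
    # oauth_cb: "my_company.kafka_auth:get_oauth_token"
    import time


    def get_oauth_token(oauth_config):
        # confluent-kafka passes a single positional argument (the value of
        # sasl.oauthbearer.config) and expects (token, expiry_epoch_seconds) back.
        token = "placeholder-token"  # fetch a real token from your identity provider
        return token, time.time() + 3600.0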
datahub/emitter/mcp_patch_builder.py
CHANGED

@@ -1,4 +1,5 @@
 import json
+import time
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
@@ -6,12 +7,15 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
 from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
     ChangeTypeClass,
+    EdgeClass,
     GenericAspectClass,
     KafkaAuditHeaderClass,
     MetadataChangeProposalClass,
     SystemMetadataClass,
 )
+from datahub.metadata.urns import Urn
 from datahub.utilities.urns.urn import guess_entity_type
 
 
@@ -89,3 +93,42 @@ class MetadataPatchProposal:
             )
             for aspect_name, patches in self.patches.items()
         ]
+
+    @classmethod
+    def _mint_auditstamp(cls, message: Optional[str] = None) -> AuditStampClass:
+        """
+        Creates an AuditStampClass instance with the current timestamp and other default values.
+
+        Args:
+            message: The message associated with the audit stamp (optional).
+
+        Returns:
+            An instance of AuditStampClass.
+        """
+        return AuditStampClass(
+            time=int(time.time() * 1000.0),
+            actor="urn:li:corpuser:datahub",
+            message=message,
+        )
+
+    @classmethod
+    def _ensure_urn_type(
+        cls, entity_type: str, edges: List[EdgeClass], context: str
+    ) -> None:
+        """
+        Ensures that the destination URNs in the given edges have the specified entity type.
+
+        Args:
+            entity_type: The entity type to check against.
+            edges: A list of Edge objects.
+            context: The context or description of the operation.
+
+        Raises:
+            ValueError: If any of the destination URNs is not of the specified entity type.
+        """
+        for e in edges:
+            urn = Urn.from_string(e.destinationUrn)
+            if not urn.entity_type == entity_type:
+                raise ValueError(
+                    f"{context}: {e.destinationUrn} is not of type {entity_type}"
+                )
datahub/emitter/rest_emitter.py
CHANGED
@@ -46,8 +46,18 @@ _DEFAULT_RETRY_MAX_TIMES = int(
     os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
 )
 
-# The limit is 16mb. We will use a max of 15mb to have some space
-
+# The limit is 16mb. We will use a max of 15mb to have some space
+# for overhead like request headers.
+# This applies to pretty much all calls to GMS.
+INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
+
+# This limit is somewhat arbitrary. All GMS endpoints will timeout
+# and return a 500 if processing takes too long. To avoid sending
+# too much to the backend and hitting a timeout, we try to limit
+# the number of MCPs we send in a batch.
+BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
+    os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
+)
 
 
 class DataHubRestEmitter(Closeable, Emitter):
@@ -290,11 +300,14 @@ class DataHubRestEmitter(Closeable, Emitter):
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
         # If we will exceed the limit, we need to break it up into chunks.
         mcp_obj_chunks: List[List[str]] = []
-        current_chunk_size =
+        current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
 
-            if
+            if (
+                mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
+                or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+            ):
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
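The net effect of the two new constants is that batched REST ingestion is now chunked by both payload size and MCP count, with the count cap overridable via an environment variable. A small sketch; it assumes the variable is set before the module is first imported, since the diff reads it at import time:

    import os

    # Optional override; 200 is the default per the diff above.
    os.environ["DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH"] = "100"

    from datahub.emitter.rest_emitter import (
        BATCH_INGEST_MAX_PAYLOAD_LENGTH,
        INGEST_MAX_PAYLOAD_BYTES,
    )

    print(INGEST_MAX_PAYLOAD_BYTES)         # 15728640 (15 MiB)
    print(BATCH_INGEST_MAX_PAYLOAD_LENGTH)  # 100 with the override above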
datahub/ingestion/api/incremental_properties_helper.py
ADDED

@@ -0,0 +1,69 @@
+import logging
+from typing import Iterable, Optional
+
+from pydantic.fields import Field
+
+from datahub.configuration.common import ConfigModel
+from datahub.emitter.mce_builder import set_aspect
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.source_helpers import create_dataset_props_patch_builder
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetPropertiesClass,
+    MetadataChangeEventClass,
+    SystemMetadataClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def convert_dataset_properties_to_patch(
+    urn: str,
+    aspect: DatasetPropertiesClass,
+    system_metadata: Optional[SystemMetadataClass],
+) -> MetadataWorkUnit:
+    patch_builder = create_dataset_props_patch_builder(urn, aspect, system_metadata)
+    mcp = next(iter(patch_builder.build()))
+    return MetadataWorkUnit(id=MetadataWorkUnit.generate_workunit_id(mcp), mcp_raw=mcp)
+
+
+def auto_incremental_properties(
+    incremental_properties: bool,
+    stream: Iterable[MetadataWorkUnit],
+) -> Iterable[MetadataWorkUnit]:
+    if not incremental_properties:
+        yield from stream
+        return  # early exit
+
+    for wu in stream:
+        urn = wu.get_urn()
+
+        if isinstance(wu.metadata, MetadataChangeEventClass):
+            properties_aspect = wu.get_aspect_of_type(DatasetPropertiesClass)
+            set_aspect(wu.metadata, None, DatasetPropertiesClass)
+            if len(wu.metadata.proposedSnapshot.aspects) > 0:
+                yield wu
+
+            if properties_aspect:
+                yield convert_dataset_properties_to_patch(
+                    urn, properties_aspect, wu.metadata.systemMetadata
+                )
+        elif isinstance(wu.metadata, MetadataChangeProposalWrapper) and isinstance(
+            wu.metadata.aspect, DatasetPropertiesClass
+        ):
+            properties_aspect = wu.metadata.aspect
+            if properties_aspect:
+                yield convert_dataset_properties_to_patch(
+                    urn, properties_aspect, wu.metadata.systemMetadata
+                )
+        else:
+            yield wu
+
+
+# TODO: Use this in SQLCommonConfig. Currently only used in snowflake
+class IncrementalPropertiesConfigMixin(ConfigModel):
+    incremental_properties: bool = Field(
+        default=False,
+        description="When enabled, emits dataset properties as incremental to existing dataset properties "
+        "in DataHub. When disabled, re-states dataset properties on each run.",
+    )
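To show how the helper is intended to be wired in, here is a sketch only; per the TODO above, the mixin is currently used only by the Snowflake source, and the processor-list pattern below follows the existing workunit-processor convention rather than any specific source's code:

    from functools import partial

    from datahub.ingestion.api.incremental_properties_helper import (
        auto_incremental_properties,
    )

    # Inside a hypothetical source whose config uses IncrementalPropertiesConfigMixin:
    # wrap the workunit stream so DatasetProperties aspects become patch proposals
    # whenever incremental_properties is enabled.
    def get_workunit_processors(self):
        return [
            partial(auto_incremental_properties, self.config.incremental_properties),
        ]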
datahub/ingestion/api/source.py
CHANGED
@@ -184,6 +184,7 @@ class StructuredLogs(Report):
 
 @dataclass
 class SourceReport(Report):
+    event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0
 
@@ -492,11 +493,15 @@ class Source(Closeable, metaclass=ABCMeta):
 
     def _infer_platform(self) -> Optional[str]:
         config = self.get_config()
-
+        platform = (
             getattr(config, "platform_name", None)
             or getattr(self, "platform", None)
             or getattr(config, "platform", None)
         )
+        if platform is None and hasattr(self, "get_platform_id"):
+            platform = type(self).get_platform_id()
+
+        return platform
 
     def _get_browse_path_processor(self, dry_run: bool) -> MetadataWorkUnitProcessor:
         config = self.get_config()
datahub/ingestion/api/source_helpers.py
CHANGED

@@ -32,6 +32,7 @@ from datahub.metadata.schema_classes import (
     SchemaFieldClass,
     SchemaMetadataClass,
     StatusClass,
+    SystemMetadataClass,
     TimeWindowSizeClass,
 )
 from datahub.metadata.urns import DatasetUrn, GlossaryTermUrn, TagUrn, Urn
@@ -65,9 +66,10 @@ def auto_workunit(
 def create_dataset_props_patch_builder(
     dataset_urn: str,
     dataset_properties: DatasetPropertiesClass,
+    system_metadata: Optional[SystemMetadataClass] = None,
 ) -> DatasetPatchBuilder:
     """Creates a patch builder with a table's or view's attributes and dataset properties"""
-    patch_builder = DatasetPatchBuilder(dataset_urn)
+    patch_builder = DatasetPatchBuilder(dataset_urn, system_metadata)
     patch_builder.set_display_name(dataset_properties.name)
     patch_builder.set_description(dataset_properties.description)
     patch_builder.set_created(dataset_properties.created)
@@ -148,7 +150,7 @@ def auto_workunit_reporter(report: "SourceReport", stream: Iterable[T]) -> Itera
         report.report_workunit(wu)
         yield wu
 
-    if report.events_produced == 0:
+    if report.event_not_produced_warn and report.events_produced == 0:
         report.warning(
             title="No metadata was produced by the source",
             message="Please check the source configuration, filters, and permissions.",
datahub/ingestion/graph/client.py
CHANGED

@@ -67,6 +67,7 @@ from datahub.metadata.schema_classes import (
     SystemMetadataClass,
     TelemetryClientIdClass,
 )
+from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.str_enum import StrEnum
 from datahub.utilities.urns.urn import Urn, guess_entity_type
@@ -1819,4 +1820,5 @@ def get_default_graph() -> DataHubGraph:
     graph_config = config_utils.load_client_config()
     graph = DataHubGraph(graph_config)
     graph.test_connection()
+    telemetry_instance.set_context(server=graph)
     return graph
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py
CHANGED

@@ -148,10 +148,10 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
 
     def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
         assert ctx.pipeline_config
-        if not self.report_recipe or not ctx.pipeline_config.
+        if not self.report_recipe or not ctx.pipeline_config.get_raw_dict():
             return ""
         else:
-            return json.dumps(redact_raw_config(ctx.pipeline_config.
+            return json.dumps(redact_raw_config(ctx.pipeline_config.get_raw_dict()))
 
     def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
         self.sink.write_record_async(
datahub/ingestion/run/pipeline.py
CHANGED

@@ -44,7 +44,8 @@ from datahub.ingestion.transformer.system_metadata_transformer import (
 )
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.metadata.schema_classes import MetadataChangeProposalClass
-from datahub.telemetry import stats
+from datahub.telemetry import stats
+from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,
@@ -220,7 +221,7 @@ class Pipeline:
         dry_run: bool = False,
         preview_mode: bool = False,
         preview_workunits: int = 10,
-        report_to: Optional[str] =
+        report_to: Optional[str] = "datahub",
         no_progress: bool = False,
     ):
         self.config = config
@@ -273,8 +274,9 @@ class Pipeline:
         if self.graph is None and isinstance(self.sink, DatahubRestSink):
             with _add_init_error_context("setup default datahub client"):
                 self.graph = self.sink.emitter.to_graph()
+                self.graph.test_connection()
         self.ctx.graph = self.graph
-
+        telemetry_instance.set_context(server=self.graph)
 
         with set_graph_context(self.graph):
             with _add_init_error_context("configure reporters"):
@@ -615,7 +617,7 @@ class Pipeline:
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
 
-
+        telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
@@ -637,7 +639,6 @@ class Pipeline:
                 ),
                 "has_pipeline_name": bool(self.config.pipeline_name),
             },
-            self.ctx.graph,
         )
 
     def _approx_all_vals(self, d: LossyList[Any]) -> int:
datahub/ingestion/run/pipeline_config.py
CHANGED

@@ -117,3 +117,9 @@ class PipelineConfig(ConfigModel):
         config = cls.parse_obj(resolved_dict)
         config._raw_dict = raw_dict
         return config
+
+    def get_raw_dict(self) -> Dict:
+        result = self._raw_dict
+        if result is None:
+            result = self.dict()
+        return result
datahub/ingestion/sink/datahub_rest.py
CHANGED

@@ -18,7 +18,10 @@ from datahub.configuration.common import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import mcps_from_mce
-from datahub.emitter.rest_emitter import
+from datahub.emitter.rest_emitter import (
+    BATCH_INGEST_MAX_PAYLOAD_LENGTH,
+    DataHubRestEmitter,
+)
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
     NoopWriteCallback,
@@ -65,11 +68,19 @@ class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
 
     # These only apply in async modes.
-    max_threads:
-    max_pending_requests:
+    max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
+    max_pending_requests: pydantic.PositiveInt = 2000
 
     # Only applies in async batch mode.
-    max_per_batch:
+    max_per_batch: pydantic.PositiveInt = 100
+
+    @pydantic.validator("max_per_batch", always=True)
+    def validate_max_per_batch(cls, v):
+        if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH:
+            raise ValueError(
+                f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}"
+            )
+        return v
 
 
 @dataclasses.dataclass
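A quick illustration of the new max_per_batch bound; the server URL and values are placeholders, and the cap is BATCH_INGEST_MAX_PAYLOAD_LENGTH (200 by default per the rest_emitter diff above):

    from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

    # Accepted: within the batch-length cap.
    ok = DatahubRestSinkConfig(server="http://localhost:8080", max_per_batch=150)

    # Rejected: pydantic validation fails because 500 exceeds the cap.
    try:
        DatahubRestSinkConfig(server="http://localhost:8080", max_per_batch=500)
    except ValueError as e:
        print(e)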
datahub/ingestion/source/abs/source.py
CHANGED

@@ -201,6 +201,10 @@ class ABSSource(StatefulIngestionSourceBase):
             ).infer_schema(file)
         elif extension == ".json":
             fields = json.JsonInferrer().infer_schema(file)
+        elif extension == ".jsonl":
+            fields = json.JsonInferrer(
+                max_rows=self.source_config.max_rows, format="jsonl"
+            ).infer_schema(file)
         elif extension == ".avro":
             fields = avro.AvroInferrer().infer_schema(file)
         else:
datahub/ingestion/source/aws/aws_common.py
CHANGED

@@ -1,5 +1,5 @@
 from datetime import datetime, timedelta, timezone
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
 
 import boto3
 from boto3.session import Session
@@ -107,6 +107,14 @@ class AwsConnectionConfig(ConfigModel):
         default=None,
         description="A set of proxy configs to use with AWS. See the [botocore.config](https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html) docs for details.",
     )
+    aws_retry_num: int = Field(
+        default=5,
+        description="Number of times to retry failed AWS requests. See the [botocore.retry](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html) docs for details.",
+    )
+    aws_retry_mode: Literal["legacy", "standard", "adaptive"] = Field(
+        default="standard",
+        description="Retry mode to use for failed AWS requests. See the [botocore.retry](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html) docs for details.",
+    )
 
     read_timeout: float = Field(
         default=DEFAULT_TIMEOUT,
@@ -199,6 +207,10 @@ class AwsConnectionConfig(ConfigModel):
         return Config(
             proxies=self.aws_proxy,
             read_timeout=self.read_timeout,
+            retries={
+                "max_attempts": self.aws_retry_num,
+                "mode": self.aws_retry_mode,
+            },
             **self.aws_advanced_config,
         )
 
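For reference, the retries mapping assembled above is standard botocore retry configuration; an equivalent standalone client setup looks roughly like this (the service and call are illustrative):

    import boto3
    from botocore.config import Config

    # Same behavior as aws_retry_num=5, aws_retry_mode="standard" in the config above.
    retry_config = Config(retries={"max_attempts": 5, "mode": "standard"})

    s3 = boto3.client("s3", config=retry_config)
    s3.list_buckets()  # retried up to the configured attempts on retryable errors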
datahub/ingestion/source/aws/sagemaker.py
CHANGED

@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from typing import TYPE_CHECKING, DefaultDict, Dict, Iterable, List, Optional
 
@@ -36,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 if TYPE_CHECKING:
     from mypy_boto3_sagemaker import SageMakerClient
 
+logger = logging.getLogger(__name__)
+
 
 @platform_name("SageMaker")
 @config_class(SagemakerSourceConfig)
@@ -75,6 +78,7 @@ class SagemakerSource(StatefulIngestionSourceBase):
     ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        logger.info("Starting SageMaker ingestion...")
         # get common lineage graph
         lineage_processor = LineageProcessor(
             sagemaker_client=self.sagemaker_client, env=self.env, report=self.report
@@ -83,6 +87,7 @@ class SagemakerSource(StatefulIngestionSourceBase):
 
         # extract feature groups if specified
         if self.source_config.extract_feature_groups:
+            logger.info("Extracting feature groups...")
             feature_group_processor = FeatureGroupProcessor(
                 sagemaker_client=self.sagemaker_client, env=self.env, report=self.report
             )
@@ -95,6 +100,7 @@ class SagemakerSource(StatefulIngestionSourceBase):
 
         # extract jobs if specified
         if self.source_config.extract_jobs is not False:
+            logger.info("Extracting jobs...")
             job_processor = JobProcessor(
                 sagemaker_client=self.client_factory.get_client,
                 env=self.env,
@@ -109,6 +115,8 @@ class SagemakerSource(StatefulIngestionSourceBase):
 
         # extract models if specified
         if self.source_config.extract_models:
+            logger.info("Extracting models...")
+
             model_processor = ModelProcessor(
                 sagemaker_client=self.sagemaker_client,
                 env=self.env,
datahub/ingestion/source/aws/sagemaker_processors/common.py
CHANGED

@@ -40,8 +40,11 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
     groups_scanned = 0
     models_scanned = 0
     jobs_scanned = 0
+    jobs_processed = 0
     datasets_scanned = 0
     filtered: List[str] = field(default_factory=list)
+    model_endpoint_lineage = 0
+    model_group_lineage = 0
 
     def report_feature_group_scanned(self) -> None:
         self.feature_groups_scanned += 1
@@ -58,6 +61,9 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
     def report_model_scanned(self) -> None:
         self.models_scanned += 1
 
+    def report_job_processed(self) -> None:
+        self.jobs_processed += 1
+
     def report_job_scanned(self) -> None:
         self.jobs_scanned += 1
 
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
CHANGED

@@ -1,3 +1,5 @@
+import logging
+import textwrap
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Iterable, List
 
@@ -28,6 +30,8 @@ if TYPE_CHECKING:
         FeatureGroupSummaryTypeDef,
     )
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class FeatureGroupProcessor:
@@ -197,11 +201,12 @@ class FeatureGroupProcessor:
 
         full_table_name = f"{glue_database}.{glue_table}"
 
-
-
-
+        logging.info(
+            textwrap.dedent(
+                f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
             To view full table metadata, run Glue ingestion
-            (see https://datahubproject.io/docs/
+            (see https://datahubproject.io/docs/generated/ingestion/sources/glue)"""
+            )
         )
 
         feature_sources.append(
datahub/ingestion/source/aws/sagemaker_processors/jobs.py
CHANGED

@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
 from enum import Enum
@@ -49,6 +50,8 @@ from datahub.metadata.schema_classes import (
 if TYPE_CHECKING:
     from mypy_boto3_sagemaker import SageMakerClient
 
+logger = logging.getLogger(__name__)
+
 JobInfo = TypeVar(
     "JobInfo",
     AutoMlJobInfo,
@@ -274,15 +277,18 @@ class JobProcessor:
     )
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        logger.info("Getting all SageMaker jobs")
         jobs = self.get_all_jobs()
 
         processed_jobs: Dict[str, SageMakerJob] = {}
 
+        logger.info("Processing SageMaker jobs")
         # first pass: process jobs and collect datasets used
+        logger.info("first pass: process jobs and collect datasets used")
        for job in jobs:
            job_type = job_type_to_info[job["type"]]
            job_name = job[job_type.list_name_key]
-
+            logger.debug(f"Processing job {job_name} with type {job_type}")
            job_details = self.get_job_details(job_name, job["type"])

            processed_job = getattr(self, job_type.processor)(job_details)
@@ -293,6 +299,9 @@ class JobProcessor:
         # second pass:
         # - move output jobs to inputs
         # - aggregate i/o datasets
+        logger.info(
+            "second pass: move output jobs to inputs and aggregate i/o datasets"
+        )
         for job_urn in sorted(processed_jobs):
             processed_job = processed_jobs[job_urn]
 
@@ -301,6 +310,7 @@ class JobProcessor:
 
             all_datasets.update(processed_job.input_datasets)
             all_datasets.update(processed_job.output_datasets)
+            self.report.report_job_processed()
 
         # yield datasets
         for dataset_urn, dataset in all_datasets.items():
@@ -322,6 +332,7 @@ class JobProcessor:
             self.report.report_dataset_scanned()
 
         # third pass: construct and yield MCEs
+        logger.info("third pass: construct and yield MCEs")
         for job_urn in sorted(processed_jobs):
             processed_job = processed_jobs[job_urn]
             job_snapshot = processed_job.job_snapshot