acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +141 -93
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +8 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +20 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
- datahub/ingestion/source/datahub/datahub_source.py +13 -3
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -78
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/configuration/time_window_config.py
CHANGED

@@ -47,7 +47,10 @@ class BaseTimeWindowConfig(ConfigModel):
         default_factory=lambda: datetime.now(tz=timezone.utc),
         description="Latest date of lineage/usage to consider. Default: Current time in UTC",
     )
-    start_time: datetime = Field(
+    start_time: datetime = Field(
+        default=None,
+        description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.",
+    )  # type: ignore

     @pydantic.validator("start_time", pre=True, always=True)
     def default_start_time(
@@ -63,12 +66,14 @@ class BaseTimeWindowConfig(ConfigModel):
             # This is where start_time str is resolved to datetime
             try:
                 delta = parse_relative_timespan(v)
-                assert delta < timedelta(
-
-                )
+                assert delta < timedelta(0), (
+                    "Relative start time should start with minus sign (-) e.g. '-2 days'."
+                )
                 assert abs(delta) >= get_bucket_duration_delta(
                     values["bucket_duration"]
-                ),
+                ), (
+                    "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
+                )

                 # The end_time's default value is not yet populated, in which case
                 # we can just manually generate it here.
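For readers tracking the behavior change: `start_time` can now be given as a negative offset relative to `end_time`, and the assertions above enforce the leading minus sign and the bucket granularity. A minimal sketch of how a config might exercise this (the relative-string form shown here is the assumption being illustrated; concrete subclasses of `BaseTimeWindowConfig` add further fields):

    from datahub.configuration.time_window_config import BaseTimeWindowConfig

    # "-2 days" is resolved by the validator shown above: it must be negative and
    # span at least one bucket_duration (a day by default).
    window = BaseTimeWindowConfig.parse_obj({"start_time": "-2 days"})
    assert window.start_time < window.end_time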
datahub/emitter/mce_builder.py
CHANGED

@@ -88,13 +88,11 @@ def get_sys_time() -> int:


 @overload
-def make_ts_millis(ts: None) -> None:
-    ...
+def make_ts_millis(ts: None) -> None: ...


 @overload
-def make_ts_millis(ts: datetime) -> int:
-    ...
+def make_ts_millis(ts: datetime) -> int: ...


 def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
@@ -105,13 +103,11 @@ def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:


 @overload
-def parse_ts_millis(ts: float) -> datetime:
-    ...
+def parse_ts_millis(ts: float) -> datetime: ...


 @overload
-def parse_ts_millis(ts: None) -> None:
-    ...
+def parse_ts_millis(ts: None) -> None: ...


 def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
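The `make_ts_millis` / `parse_ts_millis` change above is purely cosmetic (the `...` overload bodies move onto one line), so behavior is unchanged. As a quick illustration of what the overloads promise, assuming the usual epoch-milliseconds implementation:

    from datetime import datetime, timezone

    from datahub.emitter.mce_builder import make_ts_millis, parse_ts_millis

    ts = datetime(2024, 1, 1, tzinfo=timezone.utc)
    millis = make_ts_millis(ts)           # typed as int for a datetime input
    assert parse_ts_millis(millis) == ts  # round-trips through epoch milliseconds
    assert make_ts_millis(None) is None   # the None overload keeps Optional flowing through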
datahub/emitter/mcp_builder.py
CHANGED

@@ -31,9 +31,12 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipTypeClass,
     StatusClass,
+    StructuredPropertiesClass,
+    StructuredPropertyValueAssignmentClass,
     SubTypesClass,
     TagAssociationClass,
 )
+from datahub.metadata.urns import StructuredPropertyUrn

 # In https://github.com/datahub-project/datahub/pull/11214, we added a
 # new env field to container properties. However, populating this field
@@ -187,12 +190,31 @@ def add_tags_to_entity_wu(
     ).as_workunit()


+def add_structured_properties_to_entity_wu(
+    entity_urn: str, structured_properties: Dict[StructuredPropertyUrn, str]
+) -> Iterable[MetadataWorkUnit]:
+    aspect = StructuredPropertiesClass(
+        properties=[
+            StructuredPropertyValueAssignmentClass(
+                propertyUrn=urn.urn(),
+                values=[value],
+            )
+            for urn, value in structured_properties.items()
+        ]
+    )
+    yield MetadataChangeProposalWrapper(
+        entityUrn=entity_urn,
+        aspect=aspect,
+    ).as_workunit()
+
+
 def gen_containers(
     container_key: KeyType,
     name: str,
     sub_types: List[str],
     parent_container_key: Optional[ContainerKey] = None,
     extra_properties: Optional[Dict[str, str]] = None,
+    structured_properties: Optional[Dict[StructuredPropertyUrn, str]] = None,
     domain_urn: Optional[str] = None,
     description: Optional[str] = None,
     owner_urn: Optional[str] = None,
@@ -282,6 +304,11 @@ def gen_containers(
         tags=sorted(tags),
     )

+    if structured_properties:
+        yield from add_structured_properties_to_entity_wu(
+            entity_urn=container_urn, structured_properties=structured_properties
+        )
+

 def add_dataset_to_container(
     container_key: KeyType, dataset_urn: str
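The new `structured_properties` parameter lets `gen_containers` stamp a structuredProperties aspect onto the container it emits. A hedged sketch of the call shape; the `DatabaseKey` values and the property URN below are illustrative, not taken from this release:

    from datahub.emitter.mcp_builder import DatabaseKey, gen_containers
    from datahub.metadata.urns import StructuredPropertyUrn

    db_key = DatabaseKey(platform="postgres", database="billing")

    workunits = list(
        gen_containers(
            container_key=db_key,
            name="billing",
            sub_types=["Database"],
            structured_properties={
                # hypothetical structured property and value
                StructuredPropertyUrn("io.example.dataSteward"): "urn:li:corpuser:jdoe",
            },
        )
    )
    # One of the emitted workunits now carries the structuredProperties aspect.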
datahub/emitter/rest_emitter.py
CHANGED

@@ -1,9 +1,21 @@
+from __future__ import annotations
+
 import functools
 import json
 import logging
 import os
 from json.decoder import JSONDecodeError
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)

 import requests
 from deprecated import deprecated
@@ -12,8 +24,13 @@ from requests.exceptions import HTTPError, RequestException

 from datahub import nice_version_name
 from datahub.cli import config_utils
-from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
-from datahub.
+from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
+from datahub.cli.env_utils import get_boolean_env_variable
+from datahub.configuration.common import (
+    ConfigModel,
+    ConfigurationError,
+    OperationalError,
+)
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import make_curl_command
@@ -30,10 +47,8 @@ if TYPE_CHECKING:

 logger = logging.getLogger(__name__)

-
-
-    30  # Any ingest call taking longer than 30 seconds should be abandoned
-)
+_DEFAULT_TIMEOUT_SEC = 30  # 30 seconds should be plenty to connect
+_TIMEOUT_LOWER_BOUND_SEC = 1  # if below this, we log a warning
 _DEFAULT_RETRY_STATUS_CODES = [  # Additional status codes to retry on
     429,
     500,
@@ -46,6 +61,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(
     os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
 )

+_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -60,15 +77,76 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
 )


+class RequestsSessionConfig(ConfigModel):
+    timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
+
+    retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
+    retry_methods: List[str] = _DEFAULT_RETRY_METHODS
+    retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
+
+    extra_headers: Dict[str, str] = {}
+
+    ca_certificate_path: Optional[str] = None
+    client_certificate_path: Optional[str] = None
+    disable_ssl_verification: bool = False
+
+    def build_session(self) -> requests.Session:
+        session = requests.Session()
+
+        if self.extra_headers:
+            session.headers.update(self.extra_headers)
+
+        if self.client_certificate_path:
+            session.cert = self.client_certificate_path
+
+        if self.ca_certificate_path:
+            session.verify = self.ca_certificate_path
+
+        if self.disable_ssl_verification:
+            session.verify = False
+
+        try:
+            # Set raise_on_status to False to propagate errors:
+            # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
+            # Must call `raise_for_status` after making a request, which we do
+            retry_strategy = Retry(
+                total=self.retry_max_times,
+                status_forcelist=self.retry_status_codes,
+                backoff_factor=2,
+                allowed_methods=self.retry_methods,
+                raise_on_status=False,
+            )
+        except TypeError:
+            # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
+            retry_strategy = Retry(
+                total=self.retry_max_times,
+                status_forcelist=self.retry_status_codes,
+                backoff_factor=2,
+                method_whitelist=self.retry_methods,
+                raise_on_status=False,
+            )
+
+        adapter = HTTPAdapter(
+            pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
+        )
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        if self.timeout is not None:
+            # Shim session.request to apply default timeout values.
+            # Via https://stackoverflow.com/a/59317604.
+            session.request = functools.partial(  # type: ignore
+                session.request,
+                timeout=self.timeout,
+            )
+
+        return session
+
+
 class DataHubRestEmitter(Closeable, Emitter):
     _gms_server: str
     _token: Optional[str]
     _session: requests.Session
-    _connect_timeout_sec: float = _DEFAULT_CONNECT_TIMEOUT_SEC
-    _read_timeout_sec: float = _DEFAULT_READ_TIMEOUT_SEC
-    _retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
-    _retry_methods: List[str] = _DEFAULT_RETRY_METHODS
-    _retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES

     def __init__(
         self,
@@ -99,15 +177,13 @@ class DataHubRestEmitter(Closeable, Emitter):

         self._session = requests.Session()

-
-
-
-
-
-        }
-        )
+        headers = {
+            "X-RestLi-Protocol-Version": "2.0.0",
+            "X-DataHub-Py-Cli-Version": nice_version_name(),
+            "Content-Type": "application/json",
+        }
         if token:
-
+            headers["Authorization"] = f"Bearer {token}"
         else:
             # HACK: When no token is provided but system auth env variables are set, we use them.
             # Ideally this should simply get passed in as config, instead of being sneakily injected
@@ -116,75 +192,43 @@ class DataHubRestEmitter(Closeable, Emitter):
         # rest emitter, and the rest sink uses the rest emitter under the hood.
         system_auth = config_utils.get_system_auth()
         if system_auth is not None:
-
-
-        if extra_headers:
-            self._session.headers.update(extra_headers)
-
-        if client_certificate_path:
-            self._session.cert = client_certificate_path
+            headers["Authorization"] = system_auth

-
-
-
-
-
-
-        self._connect_timeout_sec = (
-            connect_timeout_sec or timeout_sec or _DEFAULT_CONNECT_TIMEOUT_SEC
-        )
-        self._read_timeout_sec = (
-            read_timeout_sec or timeout_sec or _DEFAULT_READ_TIMEOUT_SEC
-        )
-
-        if self._connect_timeout_sec < 1 or self._read_timeout_sec < 1:
-            logger.warning(
-                f"Setting timeout values lower than 1 second is not recommended. Your configuration is connect_timeout:{self._connect_timeout_sec}s, read_timeout:{self._read_timeout_sec}s"
-            )
-
-        if retry_status_codes is not None:  # Only if missing. Empty list is allowed
-            self._retry_status_codes = retry_status_codes
-
-        if retry_methods is not None:
-            self._retry_methods = retry_methods
-
-        if retry_max_times:
-            self._retry_max_times = retry_max_times
-
-        try:
-            # Set raise_on_status to False to propagate errors:
-            # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
-            # Must call `raise_for_status` after making a request, which we do
-            retry_strategy = Retry(
-                total=self._retry_max_times,
-                status_forcelist=self._retry_status_codes,
-                backoff_factor=2,
-                allowed_methods=self._retry_methods,
-                raise_on_status=False,
-            )
-        except TypeError:
-            # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
-            retry_strategy = Retry(
-                total=self._retry_max_times,
-                status_forcelist=self._retry_status_codes,
-                backoff_factor=2,
-                method_whitelist=self._retry_methods,
-                raise_on_status=False,
+        timeout: float | tuple[float, float]
+        if connect_timeout_sec is not None or read_timeout_sec is not None:
+            timeout = (
+                connect_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
+                read_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
             )
+            if (
+                timeout[0] < _TIMEOUT_LOWER_BOUND_SEC
+                or timeout[1] < _TIMEOUT_LOWER_BOUND_SEC
+            ):
+                logger.warning(
+                    f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is (connect_timeout, read_timeout) = {timeout} seconds"
+                )
+        else:
+            timeout = get_or_else(timeout_sec, _DEFAULT_TIMEOUT_SEC)
+            if timeout < _TIMEOUT_LOWER_BOUND_SEC:
+                logger.warning(
+                    f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is timeout = {timeout} seconds"
+                )

-
-
-
-
-
-
-
-
-
-
+        self._session_config = RequestsSessionConfig(
+            timeout=timeout,
+            retry_status_codes=get_or_else(
+                retry_status_codes, _DEFAULT_RETRY_STATUS_CODES
+            ),
+            retry_methods=get_or_else(retry_methods, _DEFAULT_RETRY_METHODS),
+            retry_max_times=get_or_else(retry_max_times, _DEFAULT_RETRY_MAX_TIMES),
+            extra_headers={**headers, **(extra_headers or {})},
+            ca_certificate_path=ca_certificate_path,
+            client_certificate_path=client_certificate_path,
+            disable_ssl_verification=disable_ssl_verification,
         )

+        self._session = self._session_config.build_session()
+
     def test_connection(self) -> None:
         url = f"{self._gms_server}/config"
         response = self._session.get(url)
@@ -291,7 +335,8 @@ class DataHubRestEmitter(Closeable, Emitter):
         mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
     ) -> int:
-
+        if _DATAHUB_EMITTER_TRACE:
+            logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
@@ -304,29 +349,32 @@ class DataHubRestEmitter(Closeable, Emitter):
         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
-
-
-
+            if _DATAHUB_EMITTER_TRACE:
+                logger.debug(
+                    f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+                )

             if (
                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
             ):
-
+                if _DATAHUB_EMITTER_TRACE:
+                    logger.debug("Decided to create new chunk")
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
-
-
-
+        if len(mcp_obj_chunks) > 0:
+            logger.debug(
+                f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
+            )

         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
             # the size when chunking, and again for the actual request.
             payload_dict: dict = {"proposals": mcp_obj_chunk}
             if async_flag is not None:
-                payload_dict["async"] =
+                payload_dict["async"] = "true" if async_flag else "false"

             payload = json.dumps(payload_dict)
             self._emit_generic(url, payload)
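The practical effect of the refactor above is that retry, TLS, and timeout handling now live on a reusable `RequestsSessionConfig` instead of being wired ad hoc inside `__init__`. A sketch of using it directly; the endpoint and header values are illustrative:

    from datahub.emitter.rest_emitter import RequestsSessionConfig

    session_config = RequestsSessionConfig(
        timeout=(5, 30),  # (connect, read) seconds, applied to every request via the shim
        extra_headers={"Authorization": "Bearer <token>"},
        retry_max_times=6,
    )
    session = session_config.build_session()  # mounts retrying adapters on http:// and https://
    response = session.get("https://your-datahub-gms.example.com/config")
    response.raise_for_status()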
datahub/entrypoints.py
CHANGED

@@ -45,6 +45,12 @@ _logging_configured: Optional[ContextManager] = None

 MAX_CONTENT_WIDTH = 120

+if sys.version_info >= (3, 12):
+    click.secho(
+        "Python versions above 3.11 are not tested with. Please use Python 3.11.",
+        fg="red",
+    )
+

 @click.group(
     context_settings=dict(
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
CHANGED

@@ -1,10 +1,9 @@
 import json
 import logging
-from typing import Iterable, List
+from typing import TYPE_CHECKING, Iterable, List

 from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
 from datahub.emitter.serialization_helper import pre_json_transform
-from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     DatasetProfileClass,
@@ -12,12 +11,15 @@ from datahub.metadata.schema_classes import (
     SchemaMetadataClass,
 )

+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+
 logger = logging.getLogger(__name__)


 class EnsureAspectSizeProcessor:
     def __init__(
-        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+        self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
     ):
         self.report = report
         self.payload_constraint = payload_constraint
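The `SourceReport` import moves behind `TYPE_CHECKING` because `source.py` now imports this module (see below), which would otherwise form an import cycle. The pattern in general terms, as a small sketch:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Seen only by type checkers; never imported at runtime, so no cycle.
        from datahub.ingestion.api.source import SourceReport

    def summarize(report: "SourceReport") -> str:
        # String annotation, so calling this function needs no runtime import.
        return f"{report.events_produced} events produced"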
datahub/ingestion/api/incremental_lineage_helper.py
CHANGED

@@ -55,15 +55,9 @@ def convert_chart_info_to_patch(
         aspect.externalUrl
     ).set_type(aspect.type).set_title(aspect.title).set_access(
         aspect.access
-    ).set_last_modified(
-        aspect.lastModified
-    ).set_last_refreshed(
+    ).set_last_modified(aspect.lastModified).set_last_refreshed(
         aspect.lastRefreshed
-    ).set_description(
-        aspect.description
-    ).add_inputs(
-        aspect.inputs
-    )
+    ).set_description(aspect.description).add_inputs(aspect.inputs)

     values = patch_builder.build()
     if values:
datahub/ingestion/api/report.py
CHANGED
datahub/ingestion/api/source.py
CHANGED

@@ -23,7 +23,7 @@ from typing import (
 )

 from pydantic import BaseModel
-from typing_extensions import LiteralString
+from typing_extensions import LiteralString, Self

 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
@@ -31,6 +31,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
@@ -331,6 +334,8 @@ class SourceReport(Report):
     }

     def compute_stats(self) -> None:
+        super().compute_stats()
+
         duration = datetime.datetime.now() - self.start_time
         workunits_produced = self.events_produced
         if duration.total_seconds() > 0:
@@ -395,7 +400,7 @@ class Source(Closeable, metaclass=ABCMeta):
     ctx: PipelineContext

     @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) ->
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> Self:
         # Technically, this method should be abstract. However, the @config_class
         # decorator automatically generates a create method at runtime if one is
         # not defined. Python still treats the class as abstract because it thinks
@@ -450,6 +455,7 @@ class Source(Closeable, metaclass=ABCMeta):
             browse_path_processor,
             partial(auto_workunit_reporter, self.get_report()),
             auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]

     @staticmethod
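Two behavioral notes on the source.py hunks: `create` now returns `Self`, so a concrete source's factory is typed as that source rather than the base class, and `EnsureAspectSizeProcessor` joins the default workunit-processor chain so every source gets aspect-size enforcement without per-source wiring. A generic sketch of what the `Self` return type buys (the classes here are illustrative, not DataHub's):

    from typing_extensions import Self

    class BaseSource:
        @classmethod
        def create(cls, config_dict: dict) -> Self:
            return cls()

    class KafkaLikeSource(BaseSource):
        def topics(self) -> list:
            return []

    source = KafkaLikeSource.create({})
    source.topics()  # type-checks: create() is known to return KafkaLikeSource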
datahub/ingestion/api/source_helpers.py
CHANGED

@@ -48,7 +48,7 @@ logger = logging.getLogger(__name__)


 def auto_workunit(
-    stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]]
+    stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]],
 ) -> Iterable[MetadataWorkUnit]:
     """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""
datahub/ingestion/extractor/json_schema_util.py
CHANGED

@@ -131,9 +131,9 @@ class FieldPath:
         for i, schema_type in enumerate(p.schema_types):
             if schema_type == schema_str:
                 # return the corresponding type for the schema that's a match
-                assert (
-                    len(p.type)
-                )
+                assert len(p.type) > i, (
+                    f"p.type({len(p.type)})) and p.schema_types({len(p.schema_types)}) should have the same length"
+                )
                 return p.type[i]
         return None
datahub/ingestion/extractor/schema_util.py
CHANGED

@@ -263,15 +263,13 @@ class AvroToMceSchemaConverter:
     @overload
     def _get_underlying_type_if_option_as_union(
         schema: SchemaOrField, default: SchemaOrField
-    ) -> SchemaOrField:
-        ...
+    ) -> SchemaOrField: ...

     @staticmethod
     @overload
     def _get_underlying_type_if_option_as_union(
         schema: SchemaOrField, default: Optional[SchemaOrField] = None
-    ) -> Optional[SchemaOrField]:
-        ...
+    ) -> Optional[SchemaOrField]: ...

     @staticmethod
     def _get_underlying_type_if_option_as_union(
@@ -386,7 +384,7 @@ class AvroToMceSchemaConverter:

         if "deprecated" in merged_props:
             description = (
-                f
+                f'<span style="color:red">DEPRECATED: {merged_props["deprecated"]}</span>\n'
                 + description
                 if description
                 else ""
datahub/ingestion/fs/s3_fs.py
CHANGED

@@ -17,9 +17,9 @@ def parse_s3_path(path: str) -> "S3Path":


 def assert_ok_status(s3_response):
     is_ok = s3_response["ResponseMetadata"]["HTTPStatusCode"] == 200
-    assert (
-
-    )
+    assert is_ok, (
+        f"Failed to fetch S3 object, error message: {s3_response['Error']['Message']}"
+    )


 @dataclass
datahub/ingestion/glossary/classifier.py
CHANGED

@@ -1,4 +1,3 @@
-import os
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
     )

     max_workers: int = Field(
-        default=
-        description="Number of worker processes to use for classification. Set to 1 to disable.",
+        default=1,
+        description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
     )

     table_pattern: AllowDenyPattern = Field(