acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2550 -2543
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +263 -261
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +2 -2
- datahub/api/entities/corpgroup/corpgroup.py +11 -6
- datahub/api/entities/corpuser/corpuser.py +11 -11
- datahub/api/entities/dataproduct/dataproduct.py +47 -27
- datahub/api/entities/dataset/dataset.py +32 -21
- datahub/api/entities/external/lake_formation_external_entites.py +5 -6
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
- datahub/api/entities/forms/forms.py +16 -14
- datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
- datahub/cli/check_cli.py +2 -2
- datahub/cli/config_utils.py +3 -3
- datahub/cli/lite_cli.py +9 -7
- datahub/cli/migrate.py +4 -4
- datahub/cli/quickstart_versioning.py +3 -3
- datahub/cli/specific/group_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +1 -1
- datahub/cli/specific/user_cli.py +1 -1
- datahub/configuration/common.py +14 -2
- datahub/configuration/connection_resolver.py +2 -2
- datahub/configuration/git.py +47 -30
- datahub/configuration/import_resolver.py +2 -2
- datahub/configuration/kafka.py +4 -3
- datahub/configuration/time_window_config.py +26 -26
- datahub/configuration/validate_field_deprecation.py +2 -2
- datahub/configuration/validate_field_removal.py +2 -2
- datahub/configuration/validate_field_rename.py +2 -2
- datahub/configuration/validate_multiline_string.py +2 -1
- datahub/emitter/kafka_emitter.py +3 -1
- datahub/emitter/rest_emitter.py +2 -4
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/report.py +1 -1
- datahub/ingestion/api/sink.py +1 -1
- datahub/ingestion/api/source.py +1 -1
- datahub/ingestion/glossary/datahub_classifier.py +11 -8
- datahub/ingestion/graph/client.py +5 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/reporting/file_reporter.py +5 -4
- datahub/ingestion/run/pipeline.py +7 -6
- datahub/ingestion/run/pipeline_config.py +12 -14
- datahub/ingestion/run/sink_callback.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +6 -4
- datahub/ingestion/source/abs/config.py +19 -19
- datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/abs/source.py +2 -2
- datahub/ingestion/source/aws/aws_common.py +1 -1
- datahub/ingestion/source/aws/glue.py +6 -4
- datahub/ingestion/source/aws/sagemaker.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +8 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
- datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
- datahub/ingestion/source/datahub/config.py +8 -8
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
- datahub/ingestion/source/dbt/dbt_common.py +39 -37
- datahub/ingestion/source/dbt/dbt_core.py +10 -12
- datahub/ingestion/source/debug/datahub_debug.py +1 -1
- datahub/ingestion/source/delta_lake/config.py +6 -4
- datahub/ingestion/source/dremio/dremio_api.py +212 -78
- datahub/ingestion/source/dremio/dremio_config.py +10 -6
- datahub/ingestion/source/dremio/dremio_entities.py +55 -39
- datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
- datahub/ingestion/source/dremio/dremio_source.py +24 -26
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/elastic_search.py +110 -32
- datahub/ingestion/source/excel/source.py +1 -1
- datahub/ingestion/source/feast.py +1 -1
- datahub/ingestion/source/file.py +5 -4
- datahub/ingestion/source/fivetran/config.py +17 -16
- datahub/ingestion/source/fivetran/fivetran.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +8 -10
- datahub/ingestion/source/ge_profiling_config.py +8 -5
- datahub/ingestion/source/grafana/grafana_api.py +2 -2
- datahub/ingestion/source/grafana/grafana_config.py +4 -3
- datahub/ingestion/source/grafana/grafana_source.py +1 -1
- datahub/ingestion/source/grafana/models.py +23 -5
- datahub/ingestion/source/hex/api.py +7 -5
- datahub/ingestion/source/hex/hex.py +4 -3
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +10 -10
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +7 -5
- datahub/ingestion/source/looker/looker_config.py +21 -20
- datahub/ingestion/source/looker/lookml_config.py +47 -47
- datahub/ingestion/source/metabase.py +8 -8
- datahub/ingestion/source/metadata/business_glossary.py +2 -2
- datahub/ingestion/source/metadata/lineage.py +13 -8
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/mode.py +6 -4
- datahub/ingestion/source/mongodb.py +4 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +17 -23
- datahub/ingestion/source/openapi.py +6 -8
- datahub/ingestion/source/powerbi/config.py +33 -32
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +1 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
- datahub/ingestion/source/preset.py +8 -8
- datahub/ingestion/source/pulsar.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
- datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +18 -20
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/usage.py +23 -3
- datahub/ingestion/source/s3/config.py +83 -62
- datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/s3/source.py +8 -5
- datahub/ingestion/source/sac/sac.py +5 -4
- datahub/ingestion/source/salesforce.py +3 -2
- datahub/ingestion/source/schema/json_schema.py +2 -2
- datahub/ingestion/source/sigma/data_classes.py +3 -2
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/sigma/sigma_api.py +7 -7
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
- datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_queries.py +28 -4
- datahub/ingestion/source/sql/athena.py +1 -1
- datahub/ingestion/source/sql/clickhouse.py +4 -2
- datahub/ingestion/source/sql/cockroachdb.py +1 -1
- datahub/ingestion/source/sql/druid.py +1 -1
- datahub/ingestion/source/sql/hana.py +1 -1
- datahub/ingestion/source/sql/hive.py +7 -5
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +13 -6
- datahub/ingestion/source/sql/mysql.py +1 -1
- datahub/ingestion/source/sql/oracle.py +17 -10
- datahub/ingestion/source/sql/postgres.py +2 -2
- datahub/ingestion/source/sql/presto.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +8 -9
- datahub/ingestion/source/sql/sql_generic.py +1 -1
- datahub/ingestion/source/sql/teradata.py +1 -1
- datahub/ingestion/source/sql/trino.py +1 -1
- datahub/ingestion/source/sql/vertica.py +5 -4
- datahub/ingestion/source/sql_queries.py +174 -22
- datahub/ingestion/source/state/checkpoint.py +2 -2
- datahub/ingestion/source/state/entity_removal_state.py +2 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +9 -9
- datahub/ingestion/source/tableau/tableau.py +14 -16
- datahub/ingestion/source/unity/azure_auth_config.py +15 -0
- datahub/ingestion/source/unity/config.py +51 -34
- datahub/ingestion/source/unity/connection.py +7 -1
- datahub/ingestion/source/unity/connection_test.py +1 -1
- datahub/ingestion/source/unity/proxy.py +216 -7
- datahub/ingestion/source/unity/proxy_types.py +91 -0
- datahub/ingestion/source/unity/source.py +29 -3
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
- datahub/ingestion/source/usage/usage_common.py +5 -3
- datahub/ingestion/source_config/csv_enricher.py +7 -6
- datahub/ingestion/source_config/operation_config.py +7 -4
- datahub/ingestion/source_config/pulsar.py +11 -15
- datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
- datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
- datahub/ingestion/transformer/add_dataset_properties.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
- datahub/ingestion/transformer/add_dataset_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
- datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
- datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
- datahub/ingestion/transformer/mark_dataset_status.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
- datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/replace_external_url.py +2 -2
- datahub/ingestion/transformer/set_browse_path.py +1 -1
- datahub/ingestion/transformer/tags_to_terms.py +1 -1
- datahub/lite/duckdb_lite.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_internal_schema_classes.py +62 -2
- datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- datahub/metadata/schema.avsc +271 -91
- datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
- datahub/metadata/schemas/AssertionInfo.avsc +48 -5
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
- datahub/metadata/schemas/ChartInfo.avsc +12 -5
- datahub/metadata/schemas/ContainerProperties.avsc +12 -5
- datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
- datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
- datahub/metadata/schemas/DashboardInfo.avsc +16 -4
- datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
- datahub/metadata/schemas/DataJobInfo.avsc +9 -4
- datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
- datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- datahub/metadata/schemas/DataProductProperties.avsc +5 -2
- datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/DatasetProperties.avsc +12 -5
- datahub/metadata/schemas/DomainProperties.avsc +7 -3
- datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
- datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalTags.avsc +3 -2
- datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- datahub/metadata/schemas/InputFields.avsc +3 -2
- datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLModelProperties.avsc +4 -2
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
- datahub/metadata/schemas/NotebookInfo.avsc +5 -2
- datahub/metadata/schemas/Ownership.avsc +3 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -1
- datahub/metadata/schemas/RoleProperties.avsc +3 -1
- datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- datahub/metadata/schemas/TagProperties.avsc +3 -1
- datahub/metadata/schemas/TestInfo.avsc +2 -1
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +2 -0
- datahub/sdk/search_filters.py +68 -40
- datahub/sdk/tag.py +112 -0
- datahub/secret/datahub_secret_store.py +7 -4
- datahub/secret/file_secret_store.py +1 -1
- datahub/sql_parsing/schema_resolver.py +29 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
- datahub/sql_parsing/sqlglot_lineage.py +5 -2
- datahub/testing/check_sql_parser_result.py +2 -2
- datahub/utilities/ingest_utils.py +1 -1
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
datahub/cli/migrate.py
CHANGED
@@ -318,13 +318,13 @@ def migrate_containers(
         try:
             newKey: Union[SchemaKey, DatabaseKey, ProjectIdKey, BigQueryDatasetKey]
             if subType == "Schema":
-                newKey = SchemaKey.
+                newKey = SchemaKey.model_validate(customProperties)
             elif subType == "Database":
-                newKey = DatabaseKey.
+                newKey = DatabaseKey.model_validate(customProperties)
             elif subType == "Project":
-                newKey = ProjectIdKey.
+                newKey = ProjectIdKey.model_validate(customProperties)
             elif subType == "Dataset":
-                newKey = BigQueryDatasetKey.
+                newKey = BigQueryDatasetKey.model_validate(customProperties)
             else:
                 log.warning(f"Invalid subtype {subType}. Skipping")
                 continue
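Most of the CLI-side changes in this release follow the same pattern: Pydantic v1-style parsing calls (the removed lines are truncated by the diff viewer) are replaced with the Pydantic v2 `model_validate` classmethod. A minimal sketch of the v2 call, using a hypothetical container-key model rather than DataHub's actual SchemaKey class:

    from pydantic import BaseModel

    class ExampleSchemaKey(BaseModel):
        # Hypothetical stand-in for a container key model such as SchemaKey.
        platform: str
        instance: str
        database: str
        schema_name: str

    custom_properties = {
        "platform": "urn:li:dataPlatform:postgres",
        "instance": "PROD",
        "database": "analytics",
        "schema_name": "public",
    }

    # Pydantic v2: model_validate(dict) builds and validates the model,
    # replacing the v1 parse_obj classmethod.
    key = ExampleSchemaKey.model_validate(custom_properties)
    print(key.schema_name)  # -> "public"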
datahub/cli/quickstart_versioning.py
CHANGED

@@ -80,7 +80,7 @@ class QuickstartVersionMappingConfig(BaseModel):
         path = os.path.expanduser(LOCAL_QUICKSTART_MAPPING_FILE)
         with open(path) as f:
             config_raw = yaml.safe_load(f)
-        return cls.
+        return cls.model_validate(config_raw)

         config_raw = None
         try:
@@ -110,7 +110,7 @@ class QuickstartVersionMappingConfig(BaseModel):
             }
         )

-        config = cls.
+        config = cls.model_validate(config_raw)

         # If stable is not defined in the config, we need to fetch the latest version from github.
         if config.quickstart_version_map.get("stable") is None:
@@ -177,7 +177,7 @@ def save_quickstart_config(
     path = os.path.expanduser(path)
     os.makedirs(os.path.dirname(path), exist_ok=True)
     with open(path, "w") as f:
-        yaml.dump(config.
+        yaml.dump(config.model_dump(), f)
     logger.info(f"Saved quickstart config to {path}.")
datahub/cli/specific/group_cli.py
CHANGED

@@ -42,7 +42,7 @@ def upsert(file: Path, override_editable: bool) -> None:
     with get_default_graph(ClientMode.CLI) as emitter:
         for group_config in group_configs:
             try:
-                datahub_group = CorpGroup.
+                datahub_group = CorpGroup.model_validate(group_config)
                 for mcp in datahub_group.generate_mcp(
                     generation_config=CorpGroupGenerationConfig(
                         override_editable=override_editable, datahub_graph=emitter
datahub/cli/specific/structuredproperties_cli.py
CHANGED

@@ -85,7 +85,7 @@ def list(details: bool, to_file: str) -> None:
         with open(file, "r") as fp:
             existing_objects = yaml.load(fp)  # this is a list of dicts
             existing_objects = [
-                StructuredProperties.
+                StructuredProperties.model_validate(obj) for obj in existing_objects
             ]
             objects = [obj for obj in objects]
             # do a positional update of the existing objects
datahub/cli/specific/user_cli.py
CHANGED
@@ -42,7 +42,7 @@ def upsert(file: Path, override_editable: bool) -> None:
     with get_default_graph(ClientMode.CLI) as emitter:
         for user_config in user_configs:
             try:
-                datahub_user: CorpUser = CorpUser.
+                datahub_user: CorpUser = CorpUser.model_validate(user_config)

                 emitter.emit_all(
                     datahub_user.generate_mcp(
datahub/configuration/common.py
CHANGED
@@ -140,6 +140,18 @@ class ConfigModel(BaseModel):

     @classmethod
     def parse_obj_allow_extras(cls, obj: Any) -> Self:
+        """Parse an object while allowing extra fields.
+
+        'parse_obj' in Pydantic v1 is equivalent to 'model_validate' in Pydantic v2.
+        However, 'parse_obj_allow_extras' in v1 is not directly available in v2.
+
+        `model_validate(..., strict=False)` does not work because it still raises errors on extra fields;
+        strict=False only affects type coercion and validation strictness, not extra field handling.
+
+        This method temporarily modifies the model's configuration to allow extra fields
+
+        TODO: Do we really need to support this behaviour? Consider removing this method in future.
+        """
         if PYDANTIC_VERSION_2:
             try:
                 with unittest.mock.patch.dict(
@@ -148,12 +160,12 @@ class ConfigModel(BaseModel):
                     clear=False,
                 ):
                     cls.model_rebuild(force=True)  # type: ignore
-                    return cls.
+                    return cls.model_validate(obj)
             finally:
                 cls.model_rebuild(force=True)  # type: ignore
         else:
             with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow):
-                return cls.
+                return cls.model_validate(obj)


 class PermissiveConfigModel(ConfigModel):
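The new docstring above explains why extra-field tolerance needs special handling in Pydantic v2: strict=False does not permit unknown keys, only model configuration does. A short sketch of that mechanism, using hypothetical models rather than DataHub's ConfigModel (which patches and rebuilds the model configuration instead):

    from pydantic import BaseModel, ConfigDict, ValidationError

    class StrictExample(BaseModel):
        # Unknown keys are rejected, similar to ConfigModel's default behaviour.
        model_config = ConfigDict(extra="forbid")
        name: str

    class LenientExample(StrictExample):
        # Same fields, but extra keys are accepted and stored on the instance.
        model_config = ConfigDict(extra="allow")

    payload = {"name": "demo", "unexpected": 123}

    try:
        StrictExample.model_validate(payload)
    except ValidationError as err:
        print("rejected:", err.error_count(), "error(s)")  # -> rejected: 1 error(s)

    lenient = LenientExample.model_validate(payload)
    print(lenient.model_extra)  # -> {'unexpected': 123}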
datahub/configuration/connection_resolver.py
CHANGED

@@ -1,6 +1,6 @@
 from typing import TYPE_CHECKING, Type

-import
+from pydantic import model_validator

 from datahub.ingestion.api.global_context import get_graph_context

@@ -40,4 +40,4 @@ def auto_connection_resolver(
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple validators do not overwrite each other.
     _resolve_connection.__name__ = f"{_resolve_connection.__name__}_{connection_field}"
-    return
+    return model_validator(mode="before")(_resolve_connection)
datahub/configuration/git.py
CHANGED
@@ -1,7 +1,14 @@
 import pathlib
+from copy import deepcopy
 from typing import Any, Dict, Optional, Union

-from pydantic import
+from pydantic import (
+    Field,
+    FilePath,
+    SecretStr,
+    field_validator,
+    model_validator,
+)

 from datahub.configuration.common import ConfigModel
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -41,7 +48,8 @@ class GitReference(ConfigModel):
         transform=lambda url: _GITHUB_URL_TEMPLATE,
     )

-    @
+    @field_validator("repo", mode="before")
+    @classmethod
     def simplify_repo_url(cls, repo: str) -> str:
         if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
             repo = f"https://{repo}"
@@ -53,21 +61,22 @@ class GitReference(ConfigModel):

         return repo

-    @
-    def infer_url_template(
-        if url_template is not None:
-            return
+    @model_validator(mode="after")
+    def infer_url_template(self) -> "GitReference":
+        if self.url_template is not None:
+            return self

-        repo:
-
-
-
-            return _GITLAB_URL_TEMPLATE
+        if self.repo.startswith(_GITHUB_PREFIX):
+            self.url_template = _GITHUB_URL_TEMPLATE
+        elif self.repo.startswith(_GITLAB_PREFIX):
+            self.url_template = _GITLAB_URL_TEMPLATE
         else:
             raise ValueError(
                 "Unable to infer URL template from repo. Please set url_template manually."
             )

+        return self
+
     def get_url_for_file_path(self, file_path: str) -> str:
         assert self.url_template
         if self.url_subdir:
@@ -98,35 +107,43 @@ class GitInfo(GitReference):

     _fix_deploy_key_newlines = pydantic_multiline_string("deploy_key")

-    @
+    @model_validator(mode="before")
+    @classmethod
     def deploy_key_filled_from_deploy_key_file(
-        cls,
-    ) ->
-
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        # In-place update of the input dict would cause state contamination.
+        # So a deepcopy is performed first.
+        values = deepcopy(values)
+
+        if values.get("deploy_key") is None:
             deploy_key_file = values.get("deploy_key_file")
             if deploy_key_file is not None:
                 with open(deploy_key_file) as fp:
                     deploy_key = SecretStr(fp.read())
-
-        return
-
-    @
-    def infer_repo_ssh_locator(
-
-
-
-
-
-
-
-
-
-
+                    values["deploy_key"] = deploy_key
+        return values
+
+    @model_validator(mode="after")
+    def infer_repo_ssh_locator(self) -> "GitInfo":
+        if self.repo_ssh_locator is not None:
+            return self
+
+        if self.repo.startswith(_GITHUB_PREFIX):
+            self.repo_ssh_locator = (
+                f"git@github.com:{self.repo[len(_GITHUB_PREFIX) :]}.git"
+            )
+        elif self.repo.startswith(_GITLAB_PREFIX):
+            self.repo_ssh_locator = (
+                f"git@gitlab.com:{self.repo[len(_GITLAB_PREFIX) :]}.git"
+            )
         else:
             raise ValueError(
                 "Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually."
             )

+        return self
+
     @property
     def branch_for_clone(self) -> Optional[str]:
         # If branch was manually set, we should use it. Otherwise return None.
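The two "after" validators above show the Pydantic v2 replacement for v1 root validators: the method receives the already-validated instance as self, may fill in derived fields, and must return the instance. A small sketch of that shape, using a hypothetical model rather than the real GitReference:

    from typing import Optional

    from pydantic import BaseModel, model_validator

    _GITHUB_PREFIX = "https://github.com/"

    class RepoRef(BaseModel):
        # Hypothetical model; the real GitReference has more fields and validators.
        repo: str
        url_template: Optional[str] = None

        @model_validator(mode="after")
        def infer_url_template(self) -> "RepoRef":
            # mode="after" validators run on the constructed instance and must return it.
            if self.url_template is None and self.repo.startswith(_GITHUB_PREFIX):
                self.url_template = self.repo.rstrip("/") + "/blob/{branch}/{file_path}"
            return self

    ref = RepoRef(repo="https://github.com/acryldata/datahub")
    print(ref.url_template)
    # -> https://github.com/acryldata/datahub/blob/{branch}/{file_path}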
datahub/configuration/import_resolver.py
CHANGED

@@ -1,6 +1,6 @@
 from typing import TYPE_CHECKING, Type, TypeVar, Union

-import
+from pydantic import field_validator

 from datahub.ingestion.api.registry import import_path

@@ -15,4 +15,4 @@ def _pydantic_resolver(cls: Type, v: Union[str, _T]) -> _T:


 def pydantic_resolve_key(field: str) -> "V1Validator":
-    return
+    return field_validator(field, mode="before")(_pydantic_resolver)
datahub/configuration/kafka.py
CHANGED
@@ -1,4 +1,4 @@
-from pydantic import Field,
+from pydantic import Field, field_validator

 from datahub.configuration.common import ConfigModel, ConfigurationError
 from datahub.configuration.env_vars import (
@@ -42,7 +42,8 @@ class _KafkaConnectionConfig(ConfigModel):
         description="The request timeout used when interacting with the Kafka APIs.",
     )

-    @
+    @field_validator("bootstrap", mode="after")
+    @classmethod
     def bootstrap_host_colon_port_comma(cls, val: str) -> str:
         for entry in val.split(","):
             validate_host_port(entry)
@@ -57,7 +58,7 @@ class KafkaConsumerConnectionConfig(_KafkaConnectionConfig):
         description="Extra consumer config serialized as JSON. These options will be passed into Kafka's DeserializingConsumer. See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md .",
     )

-    @
+    @field_validator("consumer_config", mode="after")
     @classmethod
     def resolve_callback(cls, value: dict) -> dict:
         if CallableConsumerConfig.is_callable_config(value):
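This is the most common validator change in the release: the v1 field decorator becomes @field_validator plus an explicit @classmethod. A runnable sketch of the v2 form, with a simplified host:port check standing in for DataHub's validate_host_port helper and a hypothetical config class:

    from pydantic import BaseModel, field_validator

    class ExampleConnectionConfig(BaseModel):
        # Hypothetical config class, not datahub.configuration.kafka._KafkaConnectionConfig.
        bootstrap: str = "localhost:9092"

        @field_validator("bootstrap", mode="after")
        @classmethod
        def bootstrap_host_colon_port_comma(cls, val: str) -> str:
            # Runs after standard validation; every comma-separated entry must be host:port.
            for entry in val.split(","):
                host, _, port = entry.partition(":")
                if not host or not port.isdigit():
                    raise ValueError(f"invalid host:port entry: {entry!r}")
            return val

    print(ExampleConnectionConfig(bootstrap="broker1:9092,broker2:9092").bootstrap)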
datahub/configuration/time_window_config.py
CHANGED

@@ -1,10 +1,9 @@
 import enum
 from datetime import datetime, timedelta, timezone
-from typing import Any,
+from typing import Any, List

 import humanfriendly
-import
-from pydantic.fields import Field
+from pydantic import Field, ValidationInfo, field_validator, model_validator

 from datahub.configuration.common import ConfigModel
 from datahub.configuration.datetimes import parse_absolute_time, parse_relative_timespan
@@ -52,45 +51,46 @@ class BaseTimeWindowConfig(ConfigModel):
         description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.",
     )  # type: ignore

-    @
-
-
-
-        if v is None:
-            return get_time_bucket(
-                values["end_time"]
-                - get_bucket_duration_delta(values["bucket_duration"]),
-                values["bucket_duration"],
-            )
-        elif isinstance(v, str):
+    @field_validator("start_time", mode="before")
+    @classmethod
+    def parse_start_time(cls, v: Any, info: ValidationInfo) -> Any:
+        if isinstance(v, str):
             # This is where start_time str is resolved to datetime
             try:
                 delta = parse_relative_timespan(v)
                 assert delta < timedelta(0), (
                     "Relative start time should start with minus sign (-) e.g. '-2 days'."
                 )
-
-
-                ), (
+                bucket_duration = info.data.get("bucket_duration", BucketDuration.DAY)
+                assert abs(delta) >= get_bucket_duration_delta(bucket_duration), (
                     "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
                 )

-                #
-                #
-
-
+                # We need end_time, but it might not be set yet
+                # In that case, we'll use the default
+                end_time = info.data.get("end_time")
+                if end_time is None:
+                    end_time = datetime.now(tz=timezone.utc)

-                return get_time_bucket(
-                    values["end_time"] + delta, values["bucket_duration"]
-                )
+                return get_time_bucket(end_time + delta, bucket_duration)
             except humanfriendly.InvalidTimespan:
                 # We do not floor start_time to the bucket start time if absolute start time is specified.
                 # If user has specified absolute start time in recipe, it's most likely that he means it.
                 return parse_absolute_time(v)
-
         return v

-    @
+    @model_validator(mode="after")
+    def default_start_time(self) -> "BaseTimeWindowConfig":
+        # Only calculate start_time if it was None (not provided by user)
+        if self.start_time is None:
+            self.start_time = get_time_bucket(
+                self.end_time - get_bucket_duration_delta(self.bucket_duration),
+                self.bucket_duration,
+            )
+        return self
+
+    @field_validator("start_time", "end_time", mode="after")
+    @classmethod
     def ensure_timestamps_in_utc(cls, v: datetime) -> datetime:
         if v.tzinfo is None:
             raise ValueError(
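Note the v2 idiom used above for cross-field logic inside a field validator: previously validated fields are read from ValidationInfo.data instead of the v1 values dict. A simplified, hypothetical sketch of that pattern (this is not BaseTimeWindowConfig, and the relative-time parsing is reduced to whole days):

    from datetime import datetime, timedelta, timezone
    from typing import Optional

    from pydantic import BaseModel, Field, ValidationInfo, field_validator

    class ExampleTimeWindow(BaseModel):
        # Field order matters: end_time is validated before start_time,
        # so it is available in info.data when start_time is processed.
        end_time: datetime = Field(default_factory=lambda: datetime.now(tz=timezone.utc))
        start_time: Optional[datetime] = None

        @field_validator("start_time", mode="before")
        @classmethod
        def parse_start_time(cls, v, info: ValidationInfo):
            if isinstance(v, str) and v.startswith("-"):
                days = int(v.lstrip("-").split()[0])
                end_time = info.data.get("end_time") or datetime.now(tz=timezone.utc)
                return end_time - timedelta(days=days)
            return v

    window = ExampleTimeWindow(
        end_time=datetime(2024, 1, 8, tzinfo=timezone.utc), start_time="-7 days"
    )
    print(window.start_time)  # -> 2024-01-01 00:00:00+00:00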
datahub/configuration/validate_field_deprecation.py
CHANGED

@@ -1,7 +1,7 @@
 import warnings
 from typing import TYPE_CHECKING, Any, Optional, Type

-import
+from pydantic import model_validator

 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning
@@ -34,4 +34,4 @@ def pydantic_field_deprecated(
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple field deprecated do not overwrite each other.
     _validate_deprecated.__name__ = f"{_validate_deprecated.__name__}_{field}"
-    return
+    return model_validator(mode="before")(_validate_deprecated)
datahub/configuration/validate_field_removal.py
CHANGED

@@ -1,7 +1,7 @@
 import warnings
 from typing import TYPE_CHECKING, Type

-import
+from pydantic import model_validator

 from datahub.configuration.common import ConfigurationWarning

@@ -31,4 +31,4 @@ def pydantic_removed_field(
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple field removals do not overwrite each other.
     _validate_field_removal.__name__ = f"{_validate_field_removal.__name__}_{field}"
-    return
+    return model_validator(mode="before")(_validate_field_removal)
datahub/configuration/validate_field_rename.py
CHANGED

@@ -1,7 +1,7 @@
 import warnings
 from typing import TYPE_CHECKING, Callable, Type, TypeVar

-import
+from pydantic import model_validator

 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning
@@ -52,4 +52,4 @@ def pydantic_renamed_field(
     # validator with pre=True gets all the values that were passed in.
     # Given that a renamed field doesn't show up in the fields list, we can't use
     # the field-level validator, even with a different field name.
-    return
+    return model_validator(mode="before")(_validate_field_rename)
datahub/configuration/validate_multiline_string.py
CHANGED

@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING, Optional, Type, Union

 import pydantic
+from pydantic import field_validator

 if TYPE_CHECKING:
     from pydantic.deprecated.class_validators import V1Validator
@@ -31,4 +32,4 @@ def pydantic_multiline_string(field: str) -> "V1Validator":
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple field deprecated do not overwrite each other.
     _validate_field.__name__ = f"{_validate_field.__name__}_{field}"
-    return
+    return field_validator(field, mode="before")(_validate_field)
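connection_resolver.py, import_resolver.py, and the validate_* helpers above all share one factory pattern: build a plain validation function, give it a unique __name__ so several generated validators can coexist on one model, then wrap it with model_validator(mode="before") or field_validator(field, mode="before") instead of the removed v1 decorators. A hedged sketch of that pattern with a hypothetical rename helper (not DataHub's pydantic_renamed_field implementation):

    from typing import Any, Dict

    from pydantic import BaseModel, model_validator

    def example_renamed_field(old: str, new: str):
        # Hypothetical factory mirroring the style of DataHub's validator helpers.
        def _rename(cls, values: Dict[str, Any]) -> Dict[str, Any]:
            if isinstance(values, dict) and old in values:
                values = dict(values)  # avoid mutating the caller's dict
                values[new] = values.pop(old)
            return values

        # Unique name so multiple generated validators do not overwrite each other.
        _rename.__name__ = f"_rename_{old}_to_{new}"
        return model_validator(mode="before")(_rename)

    class ExampleJobConfig(BaseModel):
        table_pattern: str = ".*"

        # Accept the legacy key "table_allow" and map it onto "table_pattern".
        _rename_table_allow = example_renamed_field("table_allow", "table_pattern")

    print(ExampleJobConfig.model_validate({"table_allow": "prod\\..*"}).table_pattern)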
datahub/emitter/kafka_emitter.py
CHANGED
@@ -6,6 +6,7 @@ from confluent_kafka import SerializingProducer
 from confluent_kafka.schema_registry import SchemaRegistryClient
 from confluent_kafka.schema_registry.avro import AvroSerializer
 from confluent_kafka.serialization import SerializationContext, StringSerializer
+from pydantic import field_validator

 from datahub.configuration.common import ConfigModel
 from datahub.configuration.kafka import KafkaProducerConnectionConfig
@@ -49,7 +50,8 @@ class KafkaEmitterConfig(ConfigModel):
         },
     )

-    @
+    @field_validator("topic_routes", mode="after")
+    @classmethod
     def validate_topic_routes(cls, v: Dict[str, str]) -> Dict[str, str]:
         assert MCE_KEY in v, f"topic_routes must contain a route for {MCE_KEY}"
         assert MCP_KEY in v, f"topic_routes must contain a route for {MCP_KEY}"
datahub/emitter/rest_emitter.py
CHANGED
@@ -145,8 +145,7 @@ class EmitMode(ConfigEnum):
     ASYNC_WAIT = auto()


-_DEFAULT_EMIT_MODE = pydantic.
-    EmitMode,
+_DEFAULT_EMIT_MODE = pydantic.TypeAdapter(EmitMode).validate_python(
     get_emit_mode() or EmitMode.SYNC_PRIMARY,
 )

@@ -156,8 +155,7 @@ class RestSinkEndpoint(ConfigEnum):
     OPENAPI = auto()


-DEFAULT_REST_EMITTER_ENDPOINT = pydantic.
-    RestSinkEndpoint,
+DEFAULT_REST_EMITTER_ENDPOINT = pydantic.TypeAdapter(RestSinkEndpoint).validate_python(
     get_rest_emitter_default_endpoint() or RestSinkEndpoint.RESTLI,
 )
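The emitter defaults above now go through pydantic.TypeAdapter, the Pydantic v2 entry point for validating a plain Python value against an arbitrary type. A small sketch with a stand-in enum rather than the real EmitMode/RestSinkEndpoint ConfigEnums, and a hypothetical environment variable:

    import enum
    import os

    import pydantic

    class ExampleEmitMode(enum.Enum):
        # Stand-in enum; values equal names so either form validates.
        SYNC_PRIMARY = "SYNC_PRIMARY"
        ASYNC_WAIT = "ASYNC_WAIT"

    # TypeAdapter(T).validate_python(value) coerces/validates a value into T.
    default_mode = pydantic.TypeAdapter(ExampleEmitMode).validate_python(
        os.environ.get("EXAMPLE_EMIT_MODE") or ExampleEmitMode.SYNC_PRIMARY
    )
    print(default_mode)  # -> ExampleEmitMode.SYNC_PRIMARY when the env var is unset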
datahub/ingestion/api/decorators.py
CHANGED

@@ -17,7 +17,7 @@ def config_class(config_cls: Type) -> Callable[[Type], Type]:
     """Adds a get_config_class method to the decorated class"""

     def default_create(cls: Type, config_dict: Dict, ctx: PipelineContext) -> Type:
-        config = config_cls.
+        config = config_cls.model_validate(config_dict)
         return cls(config=config, ctx=ctx)

     def wrapper(cls: Type) -> Type:
datahub/ingestion/api/report.py
CHANGED
@@ -65,7 +65,7 @@ class Report(SupportsAsObj):
         if isinstance(some_val, SupportsAsObj):
             return some_val.as_obj()
         elif isinstance(some_val, pydantic.BaseModel):
-            return Report.to_pure_python_obj(some_val.
+            return Report.to_pure_python_obj(some_val.model_dump())
         elif dataclasses.is_dataclass(some_val) and not isinstance(some_val, type):
             # The `is_dataclass` function returns `True` for both instances and classes.
             # We need an extra check to ensure an instance was passed in.
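Report serialization now calls model_dump(), the Pydantic v2 replacement for the v1 dict() method; it converts a model and any nested models into plain dictionaries. A minimal sketch with hypothetical models (not DataHub's Report class):

    from typing import List

    from pydantic import BaseModel

    class FieldStat(BaseModel):
        name: str
        null_count: int = 0

    class ExampleDatasetReport(BaseModel):
        dataset: str
        columns: List[FieldStat] = []

    report = ExampleDatasetReport(dataset="db.schema.table", columns=[FieldStat(name="id")])

    # model_dump() recursively converts nested models into plain dicts.
    print(report.model_dump())
    # -> {'dataset': 'db.schema.table', 'columns': [{'name': 'id', 'null_count': 0}]}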
datahub/ingestion/api/sink.py
CHANGED
@@ -123,7 +123,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):

     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "Self":
-        return cls(ctx, cls.get_config_class().
+        return cls(ctx, cls.get_config_class().model_validate(config_dict))

     def handle_work_unit_start(self, workunit: WorkUnit) -> None:
         """Called at the start of each new workunit.
datahub/ingestion/api/source.py
CHANGED
@@ -480,7 +480,7 @@ class Extractor(Generic[WorkUnitType, ExtractorConfig], Closeable, metaclass=ABC
         config_class = self.get_config_class()

         self.ctx = ctx
-        self.config = config_class.
+        self.config = config_class.model_validate(config_dict)

     @abstractmethod
     def get_records(self, workunit: WorkUnitType) -> Iterable[RecordEnvelope]:
datahub/ingestion/glossary/datahub_classifier.py
CHANGED

@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional
 from datahub_classify.helper_classes import ColumnInfo
 from datahub_classify.infotype_predictor import predict_infotypes
 from datahub_classify.reference_input import input1 as default_config
-from pydantic import
+from pydantic import field_validator
 from pydantic.fields import Field

 from datahub.configuration.common import ConfigModel
@@ -90,7 +90,7 @@ class InfoTypeConfig(ConfigModel):


 DEFAULT_CLASSIFIER_CONFIG = {
-    k: InfoTypeConfig.
+    k: InfoTypeConfig.model_validate(v) for k, v in default_config.items()
 }


@@ -114,8 +114,11 @@ class DataHubClassifierConfig(ConfigModel):
         description="Minimum number of non-null column values required to process `values` prediction factor.",
     )

-    @
-
+    @field_validator("info_types_config", mode="after")
+    @classmethod
+    def input_config_selectively_overrides_default_config(
+        cls, info_types_config: Dict[str, Any]
+    ) -> Dict[str, Any]:
         for infotype, infotype_config in DEFAULT_CLASSIFIER_CONFIG.items():
             if infotype not in info_types_config:
                 # if config for some info type is not provided by user, use default config for that info type.
@@ -125,7 +128,7 @@ class DataHubClassifierConfig(ConfigModel):
                 # use default config for that prediction factor.
                 for factor, weight in (
                     info_types_config[infotype]
-                    .Prediction_Factors_and_Weights.
+                    .Prediction_Factors_and_Weights.model_dump()
                     .items()
                 ):
                     if (
@@ -146,7 +149,7 @@ class DataHubClassifierConfig(ConfigModel):
             for (
                 factor,
                 weight,
-            ) in custom_infotype_config.Prediction_Factors_and_Weights.
+            ) in custom_infotype_config.Prediction_Factors_and_Weights.model_dump().items():
                 if weight > 0:
                     assert getattr(custom_infotype_config, factor) is not None, (
                         f"Missing Configuration for Prediction Factor {factor} for Custom Info Type {custom_infotype}"
@@ -173,7 +176,7 @@ class DataHubClassifier(Classifier):
     def create(cls, config_dict: Optional[Dict[str, Any]]) -> "DataHubClassifier":
         # This could be replaced by parsing to particular class, if required
         if config_dict is not None:
-            config = DataHubClassifierConfig.
+            config = DataHubClassifierConfig.model_validate(config_dict)
         else:
             config = DataHubClassifierConfig()
         return cls(config)
@@ -183,7 +186,7 @@ class DataHubClassifier(Classifier):
             column_infos=columns,
             confidence_level_threshold=self.config.confidence_level_threshold,
             global_config={
-                k: v.
+                k: v.model_dump() for k, v in self.config.info_types_config.items()
             },
             infotypes=self.config.info_types,
             minimum_values_threshold=self.config.minimum_values_threshold,
datahub/ingestion/graph/client.py
CHANGED

@@ -102,6 +102,7 @@ if TYPE_CHECKING:
     from datahub.sql_parsing.schema_resolver import (
         GraphQLSchemaMetadata,
         SchemaResolver,
+        SchemaResolverReport,
     )
     from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult

@@ -1543,6 +1544,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         platform_instance: Optional[str],
         env: str,
         include_graph: bool = True,
+        report: Optional["SchemaResolverReport"] = None,
     ) -> "SchemaResolver":
         from datahub.sql_parsing.schema_resolver import SchemaResolver

@@ -1551,6 +1553,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             platform_instance=platform_instance,
             env=env,
             graph=self if include_graph else None,
+            report=report,
         )

     def initialize_schema_resolver_from_datahub(
@@ -1559,10 +1562,11 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         platform_instance: Optional[str],
         env: str,
         batch_size: int = 100,
+        report: Optional["SchemaResolverReport"] = None,
     ) -> "SchemaResolver":
         logger.info("Initializing schema resolver")
         schema_resolver = self._make_schema_resolver(
-            platform, platform_instance, env, include_graph=False
+            platform, platform_instance, env, include_graph=False, report=report
         )

         logger.info(f"Fetching schemas for platform {platform}, env {env}")
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py
CHANGED

@@ -82,7 +82,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
         ctx: PipelineContext,
         sink: Sink,
     ) -> PipelineRunListener:
-        reporter_config = DatahubIngestionRunSummaryProviderConfig.
+        reporter_config = DatahubIngestionRunSummaryProviderConfig.model_validate(
             config_dict or {}
         )
         if reporter_config.sink: