acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +4 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/datahub_gc.py +1 -0
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +0 -1
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +245 -101
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
datahub/api/entities/assertion/assertion_operator.py
CHANGED
@@ -20,15 +20,13 @@ class Operator(Protocol):
 
     operator: str
 
-    def id(self) -> str:
-        ...
+    def id(self) -> str: ...
 
-    def generate_parameters(self) -> AssertionStdParametersClass:
-        ...
+    def generate_parameters(self) -> AssertionStdParametersClass: ...
 
 
 def _generate_assertion_std_parameter(
-    value: Union[str, int, float, list]
+    value: Union[str, int, float, list],
 ) -> AssertionStdParameterClass:
     if isinstance(value, str):
         return AssertionStdParameterClass(
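Both Protocol hunks in this file (and the datacontract copy below) are purely stylistic: the formatter now collapses stub bodies onto one line, and the `...` body is what marks the methods as structural-typing stubs. A minimal self-contained sketch of how such a protocol is satisfied; the `EqualToOperator` class here is illustrative, not the one from this module:

from typing import Protocol, Union


class Operator(Protocol):
    operator: str

    def id(self) -> str: ...


class EqualToOperator:
    operator: str = "equal_to"

    def __init__(self, value: Union[str, int, float]) -> None:
        self.value = value

    def id(self) -> str:
        return f"{self.operator}-{self.value}"


# Structural typing: EqualToOperator never inherits from Operator, yet it
# satisfies the protocol, so this assignment type-checks.
op: Operator = EqualToOperator(5)
print(op.id())  # equal_to-5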
datahub/api/entities/datacontract/assertion_operator.py
CHANGED
@@ -19,15 +19,13 @@ class Operator(Protocol):
 
     operator: str
 
-    def id(self) -> str:
-        ...
+    def id(self) -> str: ...
 
-    def generate_parameters(self) -> AssertionStdParametersClass:
-        ...
+    def generate_parameters(self) -> AssertionStdParametersClass: ...
 
 
 def _generate_assertion_std_parameter(
-    value: Union[str, int, float]
+    value: Union[str, int, float],
 ) -> AssertionStdParameterClass:
     if isinstance(value, str):
         return AssertionStdParameterClass(
datahub/api/entities/dataproduct/dataproduct.py
CHANGED
@@ -321,9 +321,9 @@ class DataProduct(ConfigModel):
 
     @classmethod
     def from_datahub(cls, graph: DataHubGraph, id: str) -> DataProduct:
-        data_product_properties: Optional[
-            DataProductPropertiesClass
-        ] = graph.get_aspect(id, DataProductPropertiesClass)
+        data_product_properties: Optional[DataProductPropertiesClass] = (
+            graph.get_aspect(id, DataProductPropertiesClass)
+        )
         domains: Optional[DomainsClass] = graph.get_aspect(id, DomainsClass)
         assert domains, "Data Product must have an associated domain. Found none."
         owners: Optional[OwnershipClass] = graph.get_aspect(id, OwnershipClass)
@@ -438,7 +438,7 @@ class DataProduct(ConfigModel):
         for replace_index, replace_value in patches_replace.items():
             list_to_manipulate[replace_index] = replace_value
 
-        for
+        for drop_value in patches_drop.values():
             list_to_manipulate.remove(drop_value)
 
         for add_value in patches_add:
datahub/api/entities/dataset/dataset.py
CHANGED
@@ -266,7 +266,8 @@ class Dataset(BaseModel):
         if self.schema_metadata.fields:
             for field in self.schema_metadata.fields:
                 field_urn = field.urn or make_schema_field_urn(
-                    self.urn,
+                    self.urn,  # type: ignore[arg-type]
+                    field.id,  # type: ignore[arg-type]
                 )
                 assert field_urn.startswith("urn:li:schemaField:")
 
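For context on the hunk above: `make_schema_field_urn` combines a parent dataset urn with a field path to produce the schemaField urn that the following `assert` checks. A short sketch using the public helpers in `datahub.emitter.mce_builder`; the platform and field names are illustrative:

from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn

dataset_urn = make_dataset_urn(platform="hive", name="fct_users", env="PROD")
field_urn = make_schema_field_urn(dataset_urn, "user_id")

print(field_urn)
# urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users,PROD),user_id)
assert field_urn.startswith("urn:li:schemaField:")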
datahub/api/entities/structuredproperties/structuredproperties.py
CHANGED
@@ -118,9 +118,9 @@ class StructuredProperties(ConfigModel):
         id = StructuredPropertyUrn.from_string(self.urn).id
         if self.qualified_name is not None:
             # ensure that qualified name and ID match
-            assert (
-                self.qualified_name == id
-            ), "ID in the urn and the qualified_name must match"
+            assert self.qualified_name == id, (
+                "ID in the urn and the qualified_name must match"
+            )
         return id
 
     @validator("urn", pre=True, always=True)
@@ -184,9 +184,9 @@ class StructuredProperties(ConfigModel):
 
     @classmethod
     def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
-        structured_property: Optional[
-            StructuredPropertyDefinitionClass
-        ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+        structured_property: Optional[StructuredPropertyDefinitionClass] = (
+            graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+        )
         if structured_property is None:
             raise Exception(
                 "StructuredPropertyDefinition aspect is None. Unable to create structured property."
datahub/cli/cli_utils.py
CHANGED
@@ -412,7 +412,7 @@ def generate_access_token(
 def ensure_has_system_metadata(
     event: Union[
         MetadataChangeProposal, MetadataChangeProposalWrapper, MetadataChangeEvent
-    ]
+    ],
 ) -> None:
     if event.systemMetadata is None:
         event.systemMetadata = SystemMetadataClass()
datahub/cli/docker_cli.py
CHANGED
@@ -296,9 +296,9 @@ def _restore(
     restore_indices: Optional[bool],
     primary_restore_file: Optional[str],
 ) -> int:
-    assert (
-        restore_primary or restore_indices
-    ), "Either restore_primary or restore_indices must be set"
+    assert restore_primary or restore_indices, (
+        "Either restore_primary or restore_indices must be set"
+    )
     msg = "datahub> "
     if restore_primary:
         msg += f"Will restore primary database from {primary_restore_file}. "
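This assert rewrite is the most common change in the release: the old formatter parenthesized the condition and let the message trail the closing paren, while the new style keeps the condition on the `assert` line and parenthesizes only the message. Both forms are equivalent at runtime, and neither turns the pair into a tuple (a parenthesized `(condition, message)` tuple would always be truthy, so the assert could never fire). Side by side, as a runnable snippet:

restore_primary, restore_indices = True, False

# Old formatting: condition wrapped, message after the closing paren.
assert (
    restore_primary or restore_indices
), "Either restore_primary or restore_indices must be set"

# New formatting: condition inline, message wrapped in parentheses.
assert restore_primary or restore_indices, (
    "Either restore_primary or restore_indices must be set"
)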
@@ -314,9 +314,9 @@ def _restore(
         assert primary_restore_file
         resolved_restore_file = os.path.expanduser(primary_restore_file)
         logger.info(f"Restoring primary db from backup at {resolved_restore_file}")
-        assert os.path.exists(
-            resolved_restore_file
-        ), f"File {resolved_restore_file} does not exist"
+        assert os.path.exists(resolved_restore_file), (
+            f"File {resolved_restore_file} does not exist"
+        )
         with open(resolved_restore_file) as fp:
             result = subprocess.run(
                 [
datahub/cli/ingest_cli.py
CHANGED
@@ -507,15 +507,11 @@ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) ->
         click.echo("No response received from the server.")
         return
 
-    #
-
-
-
-    ):
-        click.echo("No matching ingestion sources found. Please check your filters.")
-        return
+    # a lot of responses can be null if there's errors in the run
+    ingestion_sources = (
+        data.get("data", {}).get("listIngestionSources", {}).get("ingestionSources", [])
+    )
 
-    ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
     if not ingestion_sources:
         click.echo("No ingestion sources or executions found.")
         return
@@ -526,18 +522,32 @@ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) ->
         name = ingestion_source.get("name", "N/A")
 
         executions = ingestion_source.get("executions", {}).get("executionRequests", [])
+
         for execution in executions:
+            if execution is None:
+                continue
+
             execution_id = execution.get("id", "N/A")
-
-
-
-
-
-
-
+            result = execution.get("result") or {}
+            status = result.get("status", "N/A")
+
+            try:
+                start_time = (
+                    datetime.fromtimestamp(
+                        result.get("startTimeMs", 0) / 1000
+                    ).strftime("%Y-%m-%d %H:%M:%S")
+                    if status != "DUPLICATE" and result.get("startTimeMs") is not None
+                    else "N/A"
+                )
+            except (TypeError, ValueError):
+                start_time = "N/A"
 
             rows.append([execution_id, name, start_time, status, urn])
 
+    if not rows:
+        click.echo("No execution data found.")
+        return
+
     click.echo(
         tabulate(
             rows,
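The two hunks above replace direct indexing into the GraphQL payload with chained `.get()` lookups and per-row null checks, so a partially failed run renders "N/A" cells instead of raising. The pattern in isolation, with a fabricated payload:

data = {"data": {"listIngestionSources": {}}}  # server omitted the field

# A missing level now degrades to [] instead of the KeyError that
# data["data"]["listIngestionSources"]["ingestionSources"] would raise.
ingestion_sources = (
    data.get("data", {}).get("listIngestionSources", {}).get("ingestionSources", [])
)
print(ingestion_sources)  # []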
datahub/cli/lite_cli.py
CHANGED
@@ -176,7 +176,7 @@ def get(
             )
         )
         end_time = time.time()
-        logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis")
+        logger.debug(f"Time taken: {int((end_time - start_time) * 1000.0)} millis")
 
 
 @lite.command()
@@ -228,7 +228,7 @@ def ls(path: Optional[str]) -> None:
     try:
         browseables = lite.ls(path)
         end_time = time.time()
-        logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis")
+        logger.debug(f"Time taken: {int((end_time - start_time) * 1000.0)} millis")
         auto_complete: List[AutoComplete] = [
             b.auto_complete for b in browseables if b.auto_complete is not None
         ]
datahub/cli/migrate.py
CHANGED
@@ -426,9 +426,9 @@ def batch_get_ids(
                 entities_yielded += 1
                 log.debug(f"yielding {x}")
                 yield x
-            assert (
-                entities_yielded == num_entities
-            ), "Did not delete all entities, try running this command again!"
+            assert entities_yielded == num_entities, (
+                "Did not delete all entities, try running this command again!"
+            )
         else:
             log.error(f"Failed to execute batch get with {str(response.content)}")
             response.raise_for_status()
datahub/cli/specific/assertions_cli.py
CHANGED
@@ -136,9 +136,9 @@ def extras_list_to_dict(extras: List[str]) -> Dict[str, str]:
     extra_properties: Dict[str, str] = dict()
     for x in extras:
         parts = x.split("=")
-        assert (
-            len(parts) == 2
-        ), f"Invalid value for extras {x}, should be in format key=value"
+        assert len(parts) == 2, (
+            f"Invalid value for extras {x}, should be in format key=value"
+        )
         extra_properties[parts[0]] = parts[1]
     return extra_properties
 
datahub/cli/timeline_cli.py
CHANGED
@@ -50,7 +50,7 @@ def pretty_id(id: Optional[str]) -> str:
     if id.startswith("urn:li:dataset"):
         dataset_key = dataset_urn_to_key(id)
         if dataset_key:
-            return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:'):], fg='white')}:{click.style(dataset_key.name, fg='white')}"
+            return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:') :], fg='white')}:{click.style(dataset_key.name, fg='white')}"
     # failed to prettify, return original
     return id
 
datahub/configuration/common.py
CHANGED
@@ -200,8 +200,7 @@ class IgnorableError(MetaError):
 
 @runtime_checkable
 class ExceptionWithProps(Protocol):
-    def get_telemetry_props(self) -> Dict[str, Any]:
-        ...
+    def get_telemetry_props(self) -> Dict[str, Any]: ...
 
 
 def should_show_stack_trace(exc: Exception) -> bool:
datahub/configuration/config_loader.py
CHANGED
@@ -19,64 +19,87 @@ from datahub.configuration.yaml import YamlConfigurationMechanism
 Environ = Mapping[str, str]
 
 
-def _resolve_element(element: str, environ: Environ) -> str:
-    if re.search(r"(\$\{).+(\})", element):
-        return expand(element, nounset=True, environ=environ)
-    elif element.startswith("$"):
-        try:
-            return expand(element, nounset=True, environ=environ)
-        except UnboundVariable:
-            return element
-    else:
-        return element
-
-
-def _resolve_list(ele_list: list, environ: Environ) -> list:
-    new_v: list = []
-    for ele in ele_list:
-        if isinstance(ele, str):
-            new_v.append(_resolve_element(ele, environ=environ))
-        elif isinstance(ele, list):
-            new_v.append(_resolve_list(ele, environ=environ))
-        elif isinstance(ele, dict):
-            new_v.append(resolve_env_variables(ele, environ=environ))
-        else:
-            new_v.append(ele)
-    return new_v
-
-
 def resolve_env_variables(config: dict, environ: Environ) -> dict:
-    new_dict: Dict[Any, Any] = {}
-    for k, v in config.items():
-        if isinstance(v, dict):
-            new_dict[k] = resolve_env_variables(v, environ=environ)
-        elif isinstance(v, list):
-            new_dict[k] = _resolve_list(v, environ=environ)
-        elif isinstance(v, str):
-            new_dict[k] = _resolve_element(v, environ=environ)
-        else:
-            new_dict[k] = v
-    return new_dict
+    # TODO: This is kept around for backwards compatibility.
+    return EnvResolver(environ).resolve(config)
 
 
 def list_referenced_env_variables(config: dict) -> Set[str]:
-    # This is a bit of a hack, but expandvars does a bunch of escaping
-    # and other logic that we don't want to duplicate here.
+    # TODO: This is kept around for backwards compatibility.
+    return EnvResolver(environ=os.environ).list_referenced_variables(config)
+
+
+class EnvResolver:
+    def __init__(self, environ: Environ, strict_env_syntax: bool = False):
+        self.environ = environ
+        self.strict_env_syntax = strict_env_syntax
 
-    vars = set()
+    def resolve(self, config: dict) -> dict:
+        return self._resolve_dict(config)
 
-    def mock_get_env(key: str, default: Optional[str] = None) -> str:
-        vars.add(key)
-        if default is not None:
-            return default
-        return "mocked_value"
+    @classmethod
+    def list_referenced_variables(
+        cls,
+        config: dict,
+        strict_env_syntax: bool = False,
+    ) -> Set[str]:
+        # This is a bit of a hack, but expandvars does a bunch of escaping
+        # and other logic that we don't want to duplicate here.
 
-    mock = unittest.mock.MagicMock()
-    mock.get.side_effect = mock_get_env
+        vars = set()
 
-    resolve_env_variables(config, environ=mock)
+        def mock_get_env(key: str, default: Optional[str] = None) -> str:
+            vars.add(key)
+            if default is not None:
+                return default
+            return "mocked_value"
+
+        mock = unittest.mock.MagicMock()
+        mock.get.side_effect = mock_get_env
+
+        resolver = EnvResolver(environ=mock, strict_env_syntax=strict_env_syntax)
+        resolver._resolve_dict(config)
+
+        return vars
+
+    def _resolve_element(self, element: str) -> str:
+        if re.search(r"(\$\{).+(\})", element):
+            return expand(element, nounset=True, environ=self.environ)
+        elif not self.strict_env_syntax and element.startswith("$"):
+            try:
+                return expand(element, nounset=True, environ=self.environ)
+            except UnboundVariable:
+                # TODO: This fallback is kept around for backwards compatibility, but
+                # doesn't make a ton of sense from first principles.
+                return element
+        else:
+            return element
 
-    return vars
+    def _resolve_list(self, ele_list: list) -> list:
+        new_v: list = []
+        for ele in ele_list:
+            if isinstance(ele, str):
+                new_v.append(self._resolve_element(ele))
+            elif isinstance(ele, list):
+                new_v.append(self._resolve_list(ele))
+            elif isinstance(ele, dict):
+                new_v.append(self._resolve_dict(ele))
+            else:
+                new_v.append(ele)
+        return new_v
+
+    def _resolve_dict(self, config: dict) -> dict:
+        new_dict: Dict[Any, Any] = {}
+        for k, v in config.items():
+            if isinstance(v, dict):
+                new_dict[k] = self._resolve_dict(v)
+            elif isinstance(v, list):
+                new_dict[k] = self._resolve_list(v)
+            elif isinstance(v, str):
+                new_dict[k] = self._resolve_element(v)
+            else:
+                new_dict[k] = v
+        return new_dict
 
 
 WRITE_TO_FILE_DIRECTIVE_PREFIX = "__DATAHUB_TO_FILE_"
@@ -159,7 +182,7 @@ def load_config_file(
 
     config = raw_config.copy()
     if resolve_env_vars:
-        config = resolve_env_variables(config, environ=os.environ)
+        config = EnvResolver(environ=os.environ).resolve(config)
     if process_directives:
         config = _process_directives(config)
 
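Net effect of this refactor: the module-level helpers become methods on `EnvResolver`, `strict_env_syntax=True` optionally disables the legacy bare-`$` fallback, and the old functions remain as thin wrappers. A usage sketch based only on the API shown above:

import os

from datahub.configuration.config_loader import EnvResolver

config = {
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "${DATAHUB_GMS_URL}", "token": "${DATAHUB_TOKEN}"},
    }
}

# Enumerate referenced variables without touching the real environment
# (internally this resolves against a MagicMock that records lookups).
print(EnvResolver.list_referenced_variables(config))
# {'DATAHUB_GMS_URL', 'DATAHUB_TOKEN'}

# Resolve for real; expand() runs with nounset=True, so an unset ${VAR}
# raises UnboundVariable rather than passing through silently.
resolved = EnvResolver(environ=os.environ).resolve(config)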
datahub/configuration/git.py
CHANGED
@@ -121,9 +121,9 @@ class GitInfo(GitReference):
 
         repo: str = values["repo"]
         if repo.startswith(_GITHUB_PREFIX):
-            return f"git@github.com:{repo[len(_GITHUB_PREFIX):]}.git"
+            return f"git@github.com:{repo[len(_GITHUB_PREFIX) :]}.git"
         elif repo.startswith(_GITLAB_PREFIX):
-            return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX):]}.git"
+            return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX) :]}.git"
         else:
             raise ValueError(
                 "Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually."
datahub/configuration/time_window_config.py
CHANGED
@@ -47,7 +47,10 @@ class BaseTimeWindowConfig(ConfigModel):
         default_factory=lambda: datetime.now(tz=timezone.utc),
         description="Latest date of lineage/usage to consider. Default: Current time in UTC",
     )
-    start_time: datetime = Field(
+    start_time: datetime = Field(
+        default=None,
+        description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.",
+    )  # type: ignore
 
     @pydantic.validator("start_time", pre=True, always=True)
     def default_start_time(
@@ -63,12 +66,14 @@ class BaseTimeWindowConfig(ConfigModel):
         # This is where start_time str is resolved to datetime
         try:
             delta = parse_relative_timespan(v)
-            assert delta < timedelta(
-                0
-            ), "Relative start time should start with minus sign (-) e.g. '-2 days'."
+            assert delta < timedelta(0), (
+                "Relative start time should start with minus sign (-) e.g. '-2 days'."
+            )
             assert abs(delta) >= get_bucket_duration_delta(
                 values["bucket_duration"]
-            ), "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
+            ), (
+                "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
+            )
 
             # The end_time's default value is not yet populated, in which case
             # we can just manually generate it here.
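The validator above lets `start_time` be a relative string and enforces two invariants: the parsed delta must be negative, and its magnitude must be at least one bucket. A sketch of what passes and fails, assuming pydantic v1's `parse_obj` and the default daily `bucket_duration`:

from datahub.configuration.time_window_config import BaseTimeWindowConfig

# OK: negative, and at least one DAY bucket wide.
cfg = BaseTimeWindowConfig.parse_obj({"start_time": "-2 days"})
print(cfg.start_time, cfg.end_time)

# Fails the first assert (no leading minus sign):
#   BaseTimeWindowConfig.parse_obj({"start_time": "2 days"})
# Fails the second assert (smaller than one DAY bucket):
#   BaseTimeWindowConfig.parse_obj({"start_time": "-2 hours"})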
datahub/emitter/mce_builder.py
CHANGED
@@ -88,13 +88,11 @@ def get_sys_time() -> int:
 
 
 @overload
-def make_ts_millis(ts: None) -> None:
-    ...
+def make_ts_millis(ts: None) -> None: ...
 
 
 @overload
-def make_ts_millis(ts: datetime) -> int:
-    ...
+def make_ts_millis(ts: datetime) -> int: ...
 
 
 def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
@@ -105,13 +103,11 @@ def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
 
 
 @overload
-def parse_ts_millis(ts: float) -> datetime:
-    ...
+def parse_ts_millis(ts: float) -> datetime: ...
 
 
 @overload
-def parse_ts_millis(ts: None) -> None:
-    ...
+def parse_ts_millis(ts: None) -> None: ...
 
 
 def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
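These `@overload` stubs exist only for the type checker: the single runtime implementation handles both branches, while the overloads record that `None` in means `None` out and a number in means a concrete `datetime` out. The same shape in a self-contained sketch (the body is a plausible reconstruction, not copied from the module):

from datetime import datetime, timezone
from typing import Optional, overload


@overload
def parse_ts_millis(ts: float) -> datetime: ...
@overload
def parse_ts_millis(ts: None) -> None: ...


def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
    if ts is None:
        return None
    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)


# The checker narrows the result from the argument type:
when: datetime = parse_ts_millis(1700000000000)  # datetime, not Optional
nothing = parse_ts_millis(None)  # typed as None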
datahub/entrypoints.py
CHANGED
@@ -45,6 +45,12 @@ _logging_configured: Optional[ContextManager] = None
 
 MAX_CONTENT_WIDTH = 120
 
+if sys.version_info >= (3, 12):
+    click.secho(
+        "Python versions above 3.11 are not tested with. Please use Python 3.11.",
+        fg="red",
+    )
+
 
 @click.group(
     context_settings=dict(
datahub/ingestion/api/incremental_lineage_helper.py
CHANGED
@@ -55,15 +55,9 @@ def convert_chart_info_to_patch(
         aspect.externalUrl
     ).set_type(aspect.type).set_title(aspect.title).set_access(
         aspect.access
-    ).set_last_modified(
-        aspect.lastModified
-    ).set_last_refreshed(
+    ).set_last_modified(aspect.lastModified).set_last_refreshed(
         aspect.lastRefreshed
-    ).set_description(
-        aspect.description
-    ).add_inputs(
-        aspect.inputs
-    )
+    ).set_description(aspect.description).add_inputs(aspect.inputs)
 
     values = patch_builder.build()
     if values:
datahub/ingestion/api/source_helpers.py
CHANGED
@@ -48,7 +48,7 @@ logger = logging.getLogger(__name__)
 
 
 def auto_workunit(
-    stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]]
+    stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]],
 ) -> Iterable[MetadataWorkUnit]:
     """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""
 
datahub/ingestion/extractor/json_schema_util.py
CHANGED
@@ -131,9 +131,9 @@ class FieldPath:
         for i, schema_type in enumerate(p.schema_types):
             if schema_type == schema_str:
                 # return the corresponding type for the schema that's a match
-                assert (
-                    len(p.type) > i
-                ), f"p.type({len(p.type)})) and p.schema_types({len(p.schema_types)}) should have the same length"
+                assert len(p.type) > i, (
+                    f"p.type({len(p.type)})) and p.schema_types({len(p.schema_types)}) should have the same length"
+                )
                 return p.type[i]
         return None
 
datahub/ingestion/extractor/schema_util.py
CHANGED
@@ -263,15 +263,13 @@ class AvroToMceSchemaConverter:
     @overload
     def _get_underlying_type_if_option_as_union(
         schema: SchemaOrField, default: SchemaOrField
-    ) -> SchemaOrField:
-        ...
+    ) -> SchemaOrField: ...
 
     @staticmethod
     @overload
     def _get_underlying_type_if_option_as_union(
         schema: SchemaOrField, default: Optional[SchemaOrField] = None
-    ) -> Optional[SchemaOrField]:
-        ...
+    ) -> Optional[SchemaOrField]: ...
 
     @staticmethod
     def _get_underlying_type_if_option_as_union(
@@ -386,7 +384,7 @@ class AvroToMceSchemaConverter:
 
         if "deprecated" in merged_props:
             description = (
-                f
+                f'<span style="color:red">DEPRECATED: {merged_props["deprecated"]}</span>\n'
                 + description
                 if description
                 else ""
datahub/ingestion/fs/s3_fs.py
CHANGED
@@ -17,9 +17,9 @@ def parse_s3_path(path: str) -> "S3Path":
 
 def assert_ok_status(s3_response):
     is_ok = s3_response["ResponseMetadata"]["HTTPStatusCode"] == 200
-    assert (
-        is_ok
-    ), f"Failed to fetch S3 object, error message: {s3_response['Error']['Message']}"
+    assert is_ok, (
+        f"Failed to fetch S3 object, error message: {s3_response['Error']['Message']}"
+    )
 
 
 @dataclass
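Same assert-message reformatting as elsewhere in the release. For clarity, how `assert_ok_status` behaves with fabricated boto3-style response dicts:

from datahub.ingestion.fs.s3_fs import assert_ok_status

ok = {"ResponseMetadata": {"HTTPStatusCode": 200}}
assert_ok_status(ok)  # is_ok is True, passes silently

denied = {
    "ResponseMetadata": {"HTTPStatusCode": 403},
    "Error": {"Message": "Access Denied"},
}
# assert_ok_status(denied) raises:
#   AssertionError: Failed to fetch S3 object, error message: Access Denied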