acryl-datahub 1.2.0.9rc1__py3-none-any.whl → 1.2.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2568 -2626
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +120 -113
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +6 -3
- datahub/api/entities/dataset/dataset.py +9 -18
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/docker_check.py +2 -2
- datahub/configuration/common.py +29 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/pydantic_migration_helpers.py +0 -9
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +5 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/ingestion/autogenerated/capability_summary.json +45 -1
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/source/azure/azure_common.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -0
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
- datahub/ingestion/source/datahub/config.py +8 -9
- datahub/ingestion/source/dbt/dbt_common.py +65 -5
- datahub/ingestion/source/delta_lake/config.py +1 -1
- datahub/ingestion/source/dremio/dremio_config.py +3 -4
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +19 -2
- datahub/ingestion/source/ge_data_profiler.py +15 -2
- datahub/ingestion/source/ge_profiling_config.py +26 -22
- datahub/ingestion/source/grafana/grafana_config.py +2 -2
- datahub/ingestion/source/grafana/models.py +12 -14
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/looker/looker_common.py +76 -75
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_source.py +493 -547
- datahub/ingestion/source/looker/lookml_config.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +46 -88
- datahub/ingestion/source/metabase.py +9 -2
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +1 -1
- datahub/ingestion/source/mode.py +13 -5
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +14 -21
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +6 -3
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/s3/source.py +26 -24
- datahub/ingestion/source/salesforce.py +13 -9
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
- datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +12 -7
- datahub/ingestion/source/sql/cockroachdb.py +5 -3
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +7 -9
- datahub/ingestion/source/sql/mssql/source.py +2 -2
- datahub/ingestion/source/sql/mysql.py +2 -2
- datahub/ingestion/source/sql/oracle.py +3 -3
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -4
- datahub/ingestion/source/sql/trino.py +2 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +1 -1
- datahub/ingestion/source/sql_queries.py +6 -6
- datahub/ingestion/source/state/checkpoint.py +5 -1
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
- datahub/ingestion/source/superset.py +122 -15
- datahub/ingestion/source/tableau/tableau.py +68 -14
- datahub/ingestion/source/tableau/tableau_common.py +5 -0
- datahub/ingestion/source/tableau/tableau_constant.py +1 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +7 -3
- datahub/ingestion/source/usage/usage_common.py +3 -3
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/metadata/_internal_schema_classes.py +728 -528
- datahub/metadata/_urns/urn_defs.py +1702 -1702
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/schema.avsc +17434 -17732
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
- datahub/metadata/schemas/LogicalParent.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
- datahub/sdk/_shared.py +126 -0
- datahub/sdk/chart.py +87 -30
- datahub/sdk/dashboard.py +79 -34
- datahub/sdk/entity_client.py +11 -4
- datahub/sdk/lineage_client.py +3 -3
- datahub/sdk/search_filters.py +1 -7
- datahub/sql_parsing/split_statements.py +13 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -1,12 +1,13 @@
 import logging
 import os
 import re
+from copy import deepcopy
 from datetime import timedelta
 from typing import Dict, List, Optional, Union

 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     LowerCaseDatasetUrnConfigMixin,

@@ -73,8 +74,10 @@ class BigQueryBaseConfig(ConfigModel):
            ) from e
        return v

-    @root_validator(pre=True
+    @root_validator(pre=True)
    def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
        project_id = values.pop("project_id", None)
        project_ids = values.get("project_ids")

@@ -182,13 +185,14 @@ class BigQueryFilterConfig(SQLFilterConfig):
    )

    # NOTE: `schema_pattern` is added here only to hide it from docs.
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
        default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
    )

    @root_validator(pre=False, skip_on_failure=True)
    def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
        dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
        schema_pattern = values.get("schema_pattern")
        if (

@@ -320,8 +324,7 @@ class BigQueryV2Config(
        description="Include full payload into events. It is only for debugging and internal use.",
    )

-    number_of_datasets_process_in_batch: int = Field(
-        hidden_from_docs=True,
+    number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(
        default=10000,
        description="Number of table queried in batch when getting metadata. This is a low level config property "
        "which should be touched with care.",
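Several of these hunks replace the old `hidden_from_docs=True` keyword on `Field(...)` with a `HiddenFromDocs[...]` annotation imported from `datahub.configuration.common`. The diff does not show how `HiddenFromDocs` is defined; a minimal sketch of how such a marker could be built on `typing.Annotated` (names and behavior assumed, not taken from the package) looks like this:

```python
# Hypothetical sketch only: the real HiddenFromDocs lives in
# datahub.configuration.common and its implementation is not part of this diff.
from typing import Annotated

from pydantic import BaseModel, Field


class _HiddenFromDocsMarker:
    """Sentinel stored as Annotated metadata; doc generators can look for it."""


class HiddenFromDocs:
    # HiddenFromDocs[int] -> Annotated[int, _HiddenFromDocsMarker]
    def __class_getitem__(cls, item):
        return Annotated[item, _HiddenFromDocsMarker]


class ExampleConfig(BaseModel):
    # Mirrors the new style used in bigquery_config.py: the field keeps its type
    # and default, but is tagged as hidden via its annotation instead of the old
    # Field(hidden_from_docs=True) keyword.
    number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(default=10000)
```

Doc generation can then look at the field's annotation metadata rather than a `Field()` keyword, which is presumably why the keyword was dropped across these configs.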
@@ -436,17 +439,15 @@ class BigQueryV2Config(

    upstream_lineage_in_report: bool = Field(
        default=False,
-        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.",
+        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally. Only works with legacy approach (`use_queries_v2: False`).",
    )

-    run_optimized_column_query: bool = Field(
-        hidden_from_docs=True,
+    run_optimized_column_query: HiddenFromDocs[bool] = Field(
        default=False,
        description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.",
    )

-    file_backed_cache_size: int = Field(
-        hidden_from_docs=True,
+    file_backed_cache_size: HiddenFromDocs[int] = Field(
        default=2000,
        description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
    )

@@ -456,10 +457,9 @@ class BigQueryV2Config(
        description="Option to exclude empty projects from being ingested.",
    )

-    schema_resolution_batch_size: int = Field(
+    schema_resolution_batch_size: HiddenFromDocs[int] = Field(
        default=100,
        description="The number of tables to process in a batch when resolving schema from DataHub.",
-        hidden_from_schema=True,
    )

    max_threads_dataset_parallelism: int = Field(

@@ -480,6 +480,8 @@ class BigQueryV2Config(

    @root_validator(pre=True)
    def set_include_schema_metadata(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
        # Historically this is used to disable schema ingestion
        if (
            "include_tables" in values

@@ -498,6 +500,8 @@ class BigQueryV2Config(

    @root_validator(skip_on_failure=True)
    def profile_default_settings(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
        # Extra default SQLAlchemy option for better connection pooling and threading.
        # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
        values["options"].setdefault("max_overflow", -1)
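The `values = deepcopy(values)` lines added to these root validators keep the validators from mutating nested structures in the caller's input dict (the stated goal is avoiding state contamination in tests). A minimal, self-contained illustration of the problem, using a hypothetical config class rather than the real BigQuery one:

```python
from copy import deepcopy

from pydantic import BaseModel, root_validator  # pydantic v1-style API, as used in the diff


class SketchConfig(BaseModel):
    options: dict = {}

    @root_validator(pre=True)
    def apply_defaults(cls, values: dict) -> dict:
        values = deepcopy(values)  # without this, the setdefault below leaks into the caller's dict
        values.setdefault("options", {})
        values["options"].setdefault("max_overflow", -1)
        return values


raw = {"options": {}}
SketchConfig(**raw)
assert raw == {"options": {}}  # the nested dict passed in is left untouched
```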
@@ -515,9 +519,19 @@ class BigQueryV2Config(

        return v

+    @validator("upstream_lineage_in_report")
+    def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
+        if v and values.get("use_queries_v2", True):
+            logging.warning(
+                "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
+                "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
+            )
+
+        return v
+
    def get_table_pattern(self, pattern: List[str]) -> str:
        return "|".join(pattern) if pattern else ""

-
+    _platform_instance_not_supported_for_bigquery = pydantic_removed_field(
        "platform_instance"
    )
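The new validator only warns; `upstream_lineage_in_report` still takes effect when the legacy lineage path is used. A hedged example of the relevant part of a source config that would exercise it, written as a Python dict (project id is a placeholder, all other recipe keys omitted):

```python
# Sketch of a bigquery ingestion recipe fragment; only the two options discussed
# in the diff are meaningful here, everything else is illustrative.
bigquery_source_config = {
    "type": "bigquery",
    "config": {
        "project_ids": ["my-gcp-project"],   # placeholder
        "use_queries_v2": False,             # legacy lineage approach
        "upstream_lineage_in_report": True,  # raw lineage lands in the ingestion report
    },
}
```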
datahub/ingestion/source/bigquery_v2/bigquery_schema.py

@@ -283,23 +283,30 @@ class BigQuerySchemaApi:
        with self.report.list_datasets_timer:
            self.report.num_list_datasets_api_requests += 1
            datasets = self.bq_client.list_datasets(project_id, max_results=maxResults)
-
-
-
-
-
-
-
-
-                )
-
-
-
-
-
+            result = []
+            for d in datasets:
+                # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
+                dataset = self.bq_client.get_dataset(d.reference)
+
+                location = (
+                    d._properties.get("location")
+                    if hasattr(d, "_properties") and isinstance(d._properties, dict)
+                    else None
+                )
+
+                result.append(
+                    BigqueryDataset(
+                        name=d.dataset_id,
+                        labels=d.labels,
+                        location=location,
+                        comment=dataset.description,
+                        created=dataset.created,
+                        last_altered=dataset.modified,
+                    )
                )
-
-            ]
+            return result

    # This is not used anywhere
    def get_datasets_for_project_id_with_information_schema(
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

@@ -12,6 +12,7 @@ from datahub.emitter.mce_builder import (
    make_dataset_urn_with_platform_instance,
    make_schema_field_urn,
    make_tag_urn,
+    make_ts_millis,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey

@@ -300,6 +301,8 @@ class BigQuerySchemaGenerator:
        description: Optional[str] = None,
        tags: Optional[Dict[str, str]] = None,
        extra_properties: Optional[Dict[str, str]] = None,
+        created: Optional[int] = None,
+        last_modified: Optional[int] = None,
    ) -> Iterable[MetadataWorkUnit]:
        schema_container_key = self.gen_dataset_key(project_id, dataset)

@@ -349,6 +352,8 @@ class BigQuerySchemaGenerator:
            ),
            tags=tags_joined,
            extra_properties=extra_properties,
+            created=created,
+            last_modified=last_modified,
        )

    def _process_project(

@@ -484,6 +489,12 @@ class BigQuerySchemaGenerator:
                else None
            ),
            description=bigquery_dataset.comment,
+            created=make_ts_millis(bigquery_dataset.created)
+            if bigquery_dataset.created
+            else None,
+            last_modified=make_ts_millis(bigquery_dataset.last_altered)
+            if bigquery_dataset.last_altered
+            else None,
        )

        columns = None
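`make_ts_millis` (imported above from `datahub.emitter.mce_builder`) is used here to turn the dataset's `created`/`modified` datetimes into the epoch-millisecond timestamps that DataHub aspects expect. Its exact signature is not shown in this diff; the conversion it performs is essentially the following (hypothetical stand-in, not the library function):

```python
from datetime import datetime, timezone
from typing import Optional


def ts_millis(dt: Optional[datetime]) -> Optional[int]:
    """Convert a (possibly tz-aware) datetime to milliseconds since the Unix epoch."""
    if dt is None:
        return None
    return int(dt.timestamp() * 1000)


created = datetime(2024, 1, 1, tzinfo=timezone.utc)
assert ts_millis(created) == 1704067200000
```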
datahub/ingestion/source/bigquery_v2/queries_extractor.py

@@ -8,7 +8,7 @@ from typing import Collection, Dict, Iterable, List, Optional, TypedDict
 from google.cloud.bigquery import Client
 from pydantic import Field, PositiveInt

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
    BaseTimeWindowConfig,
    get_time_bucket,

@@ -86,12 +86,11 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
    # TODO: Support stateful ingestion for the time windows.
    window: BaseTimeWindowConfig = BaseTimeWindowConfig()

-    local_temp_path: Optional[pathlib.Path] = Field(
-        default=None,
-        description="Local path to store the audit log.",
+    local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = Field(
        # TODO: For now, this is simply an advanced config to make local testing easier.
        # Eventually, we will want to store date-specific files in the directory and use it as a cache.
-
+        default=None,
+        description="Local path to store the audit log.",
    )

    user_email_pattern: AllowDenyPattern = Field(
datahub/ingestion/source/common/gcp_credentials_config.py

@@ -9,7 +9,9 @@ from datahub.configuration.validate_multiline_string import pydantic_multiline_s


 class GCPCredential(ConfigModel):
-    project_id: Optional[str] = Field(
+    project_id: Optional[str] = Field(
+        None, description="Project id to set the credentials"
+    )
    private_key_id: str = Field(description="Private key id")
    private_key: str = Field(
        description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -11,7 +11,7 @@ from cached_property import cached_property
 from pydantic.fields import Field
 from wcmatch import pathlib

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.azure.abs_utils import is_abs_uri
 from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri

@@ -89,63 +89,62 @@ class PathSpec(ConfigModel):
        description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
    )
    exclude: Optional[List[str]] = Field(
-
+        [],
        description="list of paths in glob pattern which will be excluded while scanning for the datasets",
    )
    file_types: List[str] = Field(
-
+        SUPPORTED_FILE_TYPES,
        description="Files with extenstions specified here (subset of default value) only will be scanned to create dataset. Other files will be omitted.",
    )

    default_extension: Optional[str] = Field(
-
+        None,
        description="For files without extension it will assume the specified file type. If it is not set the files without extensions will be skipped.",
    )

    table_name: Optional[str] = Field(
-
+        None,
        description="Display name of the dataset.Combination of named variables from include path and strings",
    )

    # This is not used yet, but will be used in the future to sort the partitions
-    sort_key: Optional[SortKey] = Field(
-
-        default=None,
+    sort_key: HiddenFromDocs[Optional[SortKey]] = Field(
+        None,
        description="Sort key to use when sorting the partitions. This is useful when the partitions are not sorted in the order of the data. The key can be a compound key based on the path_spec variables.",
    )

    enable_compression: bool = Field(
-
+        True,
        description="Enable or disable processing compressed files. Currently .gz and .bz files are supported.",
    )

    sample_files: bool = Field(
-
+        True,
        description="Not listing all the files but only taking a handful amount of sample file to infer the schema. File count and file size calculation will be disabled. This can affect performance significantly if enabled",
    )

    allow_double_stars: bool = Field(
-
+        False,
        description="Allow double stars in the include path. This can affect performance significantly if enabled",
    )

    autodetect_partitions: bool = Field(
-
+        True,
        description="Autodetect partition(s) from the path. If set to true, it will autodetect partition key/value if the folder format is {partition_key}={partition_value} for example `year=2024`",
    )

    traversal_method: FolderTraversalMethod = Field(
-
+        FolderTraversalMethod.MAX,
        description="Method to traverse the folder. ALL: Traverse all the folders, MIN_MAX: Traverse the folders by finding min and max value, MAX: Traverse the folder with max value",
    )

    include_hidden_folders: bool = Field(
-
+        False,
        description="Include hidden folders in the traversal (folders starting with . or _",
    )

    tables_filter_pattern: AllowDenyPattern = Field(
-
+        AllowDenyPattern.allow_all(),
        description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
    )

@@ -479,7 +478,8 @@ class PathSpec(ConfigModel):
        return glob_include

    @pydantic.root_validator(skip_on_failure=True)
-
+    @staticmethod
+    def validate_path_spec(values: Dict) -> Dict[str, Any]:
        # validate that main fields are populated
        required_fields = ["include", "file_types", "default_extension"]
        for f in required_fields:
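For context on the defaults that moved into positional arguments above, a minimal (assumed) construction of a `PathSpec` for an S3 layout might look like this; only fields shown in the diff are used, and the bucket, prefix, and values are placeholders:

```python
# Illustrative only: field names come from the PathSpec definition shown above,
# but the bucket, prefix, and values are made up.
from datahub.ingestion.source.data_lake_common.path_spec import PathSpec

path_spec = PathSpec(
    include="s3://example-bucket/data/{table}/*.parquet",
    exclude=["s3://example-bucket/data/tmp/*"],
    file_types=["parquet"],
    sample_files=True,
    autodetect_partitions=True,
)
```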
datahub/ingestion/source/datahub/config.py

@@ -4,7 +4,7 @@ from typing import Optional, Set
 import pydantic
 from pydantic import Field, root_validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (

@@ -98,16 +98,14 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
        ),
    )

-    pull_from_datahub_api: bool = Field(
+    pull_from_datahub_api: HiddenFromDocs[bool] = Field(
        default=False,
        description="Use the DataHub API to fetch versioned aspects.",
-        hidden_from_docs=True,
    )

-    max_workers: int = Field(
+    max_workers: HiddenFromDocs[int] = Field(
        default=5 * (os.cpu_count() or 4),
        description="Number of worker threads to use for datahub api ingestion.",
-        hidden_from_docs=True,
    )

    urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())

@@ -118,10 +116,11 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
    )

-    structured_properties_template_cache_invalidation_interval: int =
-
-
-
+    structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+        Field(
+            default=60,
+            description="Interval in seconds to invalidate the structured properties template cache.",
+        )
    )

    query_timeout: Optional[int] = Field(
datahub/ingestion/source/dbt/dbt_common.py

@@ -246,6 +246,23 @@ class DBTEntitiesEnabled(ConfigModel):
        return self.model_performance == EmitDirective.YES


+class MaterializedNodePatternConfig(ConfigModel):
+    """Configuration for filtering materialized nodes based on their physical location"""
+
+    database_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for database names to filter materialized nodes.",
+    )
+    schema_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for schema names in format '{database}.{schema}' to filter materialized nodes.",
+    )
+    table_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for table/view names in format '{database}.{schema}.{table}' to filter materialized nodes.",
+    )
+
+
 class DBTCommonConfig(
    StatefulIngestionConfigBase,
    PlatformInstanceConfigMixin,

@@ -294,6 +311,11 @@ class DBTCommonConfig(
        default=AllowDenyPattern.allow_all(),
        description="regex patterns for dbt model names to filter in ingestion.",
    )
+    materialized_node_pattern: MaterializedNodePatternConfig = Field(
+        default=MaterializedNodePatternConfig(),
+        description="Advanced filtering for materialized nodes based on their physical database location. "
+        "Provides fine-grained control over database.schema.table patterns for catalog consistency.",
+    )
    meta_mapping: Dict = Field(
        default={},
        description="mapping rules that will be executed against dbt meta properties. Refer to the section below on dbt meta automated mappings.",
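A hedged example of how the new `materialized_node_pattern` block could be set in a dbt source config, shown as a Python dict (the paths, database, and schema names are placeholders):

```python
# Illustrative dbt source config fragment using the new materialized_node_pattern option.
# Keys mirror MaterializedNodePatternConfig above; the values are made-up examples.
dbt_source_config = {
    "type": "dbt",
    "config": {
        "manifest_path": "target/manifest.json",  # placeholder paths
        "catalog_path": "target/catalog.json",
        "materialized_node_pattern": {
            "database_pattern": {"deny": ["^scratch_db$"]},
            "schema_pattern": {"allow": ["^analytics\\..*"]},  # matched as "{database}.{schema}"
            "table_pattern": {"deny": [".*\\.tmp_.*"]},
        },
    },
}
```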
@@ -1018,15 +1040,53 @@ class DBTSourceBase(StatefulIngestionSourceBase):
            all_nodes_map,
        )

-    def _is_allowed_node(self,
-
+    def _is_allowed_node(self, node: DBTNode) -> bool:
+        """
+        Check whether a node should be processed, using multi-layer rules. Checks for materialized nodes might need to be restricted in the future to some cases
+        """
+        if not self.config.node_name_pattern.allowed(node.dbt_name):
+            return False
+
+        if not self._is_allowed_materialized_node(node):
+            return False
+
+        return True
+
+    def _is_allowed_materialized_node(self, node: DBTNode) -> bool:
+        """Filter nodes based on their materialized database location for catalog consistency"""
+
+        # Database level filtering
+        if not node.database:
+            return True
+        if not self.config.materialized_node_pattern.database_pattern.allowed(
+            node.database
+        ):
+            return False
+
+        # Schema level filtering: {database}.{schema}
+        if not node.schema:
+            return True
+        if not self.config.materialized_node_pattern.schema_pattern.allowed(
+            node._join_parts([node.database, node.schema])
+        ):
+            return False
+
+        # Table level filtering: {database}.{schema}.{table}
+        if not node.name:
+            return True
+        if not self.config.materialized_node_pattern.table_pattern.allowed(
+            node.get_db_fqn()
+        ):
+            return False
+
+        return True

    def _filter_nodes(self, all_nodes: List[DBTNode]) -> List[DBTNode]:
        nodes: List[DBTNode] = []
        for node in all_nodes:
            key = node.dbt_name

-            if not self._is_allowed_node(
+            if not self._is_allowed_node(node):
                self.report.nodes_filtered.append(key)
                continue

@@ -1118,8 +1178,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
            cll_nodes.add(dbt_name)
            schema_nodes.add(dbt_name)

-        for dbt_name in all_nodes_map:
-            if self._is_allowed_node(
+        for dbt_name, dbt_node in all_nodes_map.items():
+            if self._is_allowed_node(dbt_node):
                add_node_to_cll_list(dbt_name)

        return schema_nodes, cll_nodes
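The filtering above leans entirely on `AllowDenyPattern.allowed()`, where deny regexes take precedence and anything not denied must still match an allow regex. A small standalone illustration of that behavior (the patterns are chosen for the example, not taken from the source):

```python
from datahub.configuration.common import AllowDenyPattern

# Deny rules win over allow rules; anything not denied and matching an allow regex passes.
pattern = AllowDenyPattern(allow=["^analytics\\..*"], deny=[".*\\.tmp_.*"])

assert pattern.allowed("analytics.orders")
assert not pattern.allowed("analytics.tmp_orders")  # denied
assert not pattern.allowed("scratch.orders")        # not in the allow list
```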
datahub/ingestion/source/dremio/dremio_config.py

@@ -4,7 +4,7 @@ from typing import List, Literal, Optional
 import certifi
 from pydantic import Field, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
    EnvConfigMixin,
    PlatformInstanceConfigMixin,

@@ -100,10 +100,9 @@ class ProfileConfig(GEProfilingBaseConfig):
    query_timeout: int = Field(
        default=300, description="Time before cancelling Dremio profiling query"
    )
-    include_field_median_value: bool = Field(
+    include_field_median_value: HiddenFromDocs[bool] = Field(
+        # Hidden because median causes a number of issues in Dremio.
        default=False,
-        hidden_from_docs=True,
-        description="Median causes a number of issues in Dremio.",
    )
datahub/ingestion/source/feast.py

@@ -1,5 +1,6 @@
+import pathlib
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Union

 import feast.types
 from feast import (

@@ -97,7 +98,7 @@ class FeastRepositorySourceConfig(
    StatefulIngestionConfigBase,
 ):
    path: str = Field(description="Path to Feast repository")
-    fs_yaml_file: Optional[
+    fs_yaml_file: Optional[pathlib.Path] = Field(
        default=None,
        description="Path to the `feature_store.yaml` file used to configure the feature store",
    )

@@ -142,17 +143,14 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
    - Column types associated with each entity and feature
    """

-    platform = "feast"
-    source_config: FeastRepositorySourceConfig
-    report: StaleEntityRemovalSourceReport
-    feature_store: FeatureStore
+    platform: ClassVar[str] = "feast"

    def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
        super().__init__(config, ctx)
-        self.source_config = config
-        self.ctx = ctx
-        self.report = StaleEntityRemovalSourceReport()
-        self.feature_store = FeatureStore(
+        self.source_config: FeastRepositorySourceConfig = config
+        self.ctx: PipelineContext = ctx
+        self.report: StaleEntityRemovalSourceReport = StaleEntityRemovalSourceReport()
+        self.feature_store: FeatureStore = FeatureStore(
            repo_path=self.source_config.path,
            fs_yaml_file=self.source_config.fs_yaml_file,
        )
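The switch from bare class-body annotations to `ClassVar[str]` plus annotated instance assignments matters because a bare annotation like `source_config: FeastRepositorySourceConfig` only declares a name, while `ClassVar` explicitly marks `platform` as shared class state for type checkers. A tiny, generic illustration of the distinction (names are illustrative, not from the source):

```python
from typing import ClassVar


class SketchSource:
    # Shared, class-level constant; type checkers flag attempts to set it per instance.
    platform: ClassVar[str] = "feast"

    def __init__(self, path: str) -> None:
        # Annotated instance attributes, set per object in __init__.
        self.path: str = path


assert SketchSource.platform == "feast"
assert SketchSource("/tmp/repo").path == "/tmp/repo"
```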
datahub/ingestion/source/fivetran/config.py

@@ -102,7 +102,7 @@ class FivetranLogConfig(ConfigModel):
        "destination_config", "snowflake_destination_config"
    )

-    @root_validator(
+    @root_validator(skip_on_failure=True)
    def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict:
        destination_platform = values["destination_platform"]
        if destination_platform == "snowflake":
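`skip_on_failure=True` makes a post-validation `root_validator` skip its body when an earlier field validator has already failed, so code like `values["destination_platform"]` does not raise a confusing `KeyError` on top of the real validation error. A minimal, generic illustration of the difference (pydantic v1-style API, hypothetical model):

```python
from pydantic import BaseModel, ValidationError, root_validator


class DestinationSketch(BaseModel):
    destination_platform: str

    @root_validator(skip_on_failure=True)
    def check_platform(cls, values: dict) -> dict:
        # Without skip_on_failure, this would run even when destination_platform
        # failed validation and is missing from `values`, raising a KeyError.
        platform = values["destination_platform"]
        if platform not in ("snowflake", "bigquery"):
            raise ValueError(f"unsupported destination platform: {platform}")
        return values


try:
    DestinationSketch()  # missing required field; the root validator is skipped
except ValidationError as e:
    print(e)
```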
datahub/ingestion/source/gcs/gcs_source.py

@@ -37,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (

 logger: logging.Logger = logging.getLogger(__name__)

+GCS_ENDPOINT_URL = "https://storage.googleapis.com"
+

 class HMACKey(ConfigModel):
    hmac_access_id: str = Field(description="Access ID")

@@ -112,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
        s3_config = DataLakeSourceConfig(
            path_specs=s3_path_specs,
            aws_config=AwsConnectionConfig(
-                aws_endpoint_url=
+                aws_endpoint_url=GCS_ENDPOINT_URL,
                aws_access_key_id=self.config.credential.hmac_access_id,
                aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
                aws_region="auto",

@@ -121,15 +123,25 @@ class GCSSource(StatefulIngestionSourceBase):
            max_rows=self.config.max_rows,
            number_of_files_to_sample=self.config.number_of_files_to_sample,
            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
+            platform_instance=self.config.platform_instance,
        )
        return s3_config

    def create_equivalent_s3_path_specs(self):
        s3_path_specs = []
        for path_spec in self.config.path_specs:
+            # PathSpec modifies the passed-in include to add /** to the end if
+            # autodetecting partitions. Remove that, otherwise creating a new
+            # PathSpec will complain.
+            # TODO: this should be handled inside PathSpec, which probably shouldn't
+            # modify its input.
+            include = path_spec.include
+            if include.endswith("{table}/**") and not path_spec.allow_double_stars:
+                include = include.removesuffix("**")
+
            s3_path_specs.append(
                PathSpec(
-                    include=
+                    include=include.replace("gs://", "s3://"),
                    exclude=(
                        [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
                        if path_spec.exclude

@@ -140,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
                    table_name=path_spec.table_name,
                    enable_compression=path_spec.enable_compression,
                    sample_files=path_spec.sample_files,
+                    allow_double_stars=path_spec.allow_double_stars,
+                    autodetect_partitions=path_spec.autodetect_partitions,
+                    include_hidden_folders=path_spec.include_hidden_folders,
+                    tables_filter_pattern=path_spec.tables_filter_pattern,
+                    traversal_method=path_spec.traversal_method,
                )
            )

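Since the GCS source now forwards `platform_instance` into the equivalent S3 config, a GCS recipe that sets it has the configured instance carried through to the underlying S3-style ingestion. A hedged example config fragment (Python dict; bucket, credentials, and instance name are placeholders):

```python
# Illustrative GCS source config; keys follow the options visible in this diff
# (path_specs, credential, platform_instance), values are placeholders.
gcs_source_config = {
    "type": "gcs",
    "config": {
        "path_specs": [
            {"include": "gs://example-bucket/data/{table}/*.parquet"},
        ],
        "credential": {
            "hmac_access_id": "GOOG1EXAMPLE",
            "hmac_access_secret": "${GCS_HMAC_SECRET}",
        },
        "platform_instance": "analytics-gcs",
    },
}
```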