acryl-datahub 1.2.0.9rc1__py3-none-any.whl → 1.2.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (120)
  1. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2568 -2626
  2. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +120 -113
  3. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  8. datahub/api/entities/dataset/dataset.py +9 -18
  9. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  10. datahub/api/graphql/operation.py +10 -6
  11. datahub/cli/docker_check.py +2 -2
  12. datahub/configuration/common.py +29 -1
  13. datahub/configuration/connection_resolver.py +5 -2
  14. datahub/configuration/import_resolver.py +7 -4
  15. datahub/configuration/pydantic_migration_helpers.py +0 -9
  16. datahub/configuration/source_common.py +3 -2
  17. datahub/configuration/validate_field_deprecation.py +5 -2
  18. datahub/configuration/validate_field_removal.py +5 -2
  19. datahub/configuration/validate_field_rename.py +6 -5
  20. datahub/configuration/validate_multiline_string.py +5 -2
  21. datahub/ingestion/autogenerated/capability_summary.json +45 -1
  22. datahub/ingestion/run/pipeline_config.py +2 -2
  23. datahub/ingestion/source/azure/azure_common.py +1 -1
  24. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  25. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  26. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -0
  27. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  28. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  29. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  30. datahub/ingestion/source/datahub/config.py +8 -9
  31. datahub/ingestion/source/dbt/dbt_common.py +65 -5
  32. datahub/ingestion/source/delta_lake/config.py +1 -1
  33. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  34. datahub/ingestion/source/feast.py +8 -10
  35. datahub/ingestion/source/fivetran/config.py +1 -1
  36. datahub/ingestion/source/gcs/gcs_source.py +19 -2
  37. datahub/ingestion/source/ge_data_profiler.py +15 -2
  38. datahub/ingestion/source/ge_profiling_config.py +26 -22
  39. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  40. datahub/ingestion/source/grafana/models.py +12 -14
  41. datahub/ingestion/source/hex/hex.py +6 -1
  42. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  43. datahub/ingestion/source/kafka_connect/common.py +2 -2
  44. datahub/ingestion/source/looker/looker_common.py +76 -75
  45. datahub/ingestion/source/looker/looker_config.py +15 -4
  46. datahub/ingestion/source/looker/looker_source.py +493 -547
  47. datahub/ingestion/source/looker/lookml_config.py +1 -1
  48. datahub/ingestion/source/looker/lookml_source.py +46 -88
  49. datahub/ingestion/source/metabase.py +9 -2
  50. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  51. datahub/ingestion/source/metadata/lineage.py +1 -1
  52. datahub/ingestion/source/mode.py +13 -5
  53. datahub/ingestion/source/nifi.py +1 -1
  54. datahub/ingestion/source/powerbi/config.py +14 -21
  55. datahub/ingestion/source/preset.py +1 -1
  56. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  57. datahub/ingestion/source/redash.py +1 -1
  58. datahub/ingestion/source/redshift/config.py +6 -3
  59. datahub/ingestion/source/redshift/query.py +23 -19
  60. datahub/ingestion/source/s3/source.py +26 -24
  61. datahub/ingestion/source/salesforce.py +13 -9
  62. datahub/ingestion/source/schema/json_schema.py +14 -14
  63. datahub/ingestion/source/sigma/data_classes.py +3 -0
  64. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  65. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  66. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  67. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  68. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  69. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  70. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  71. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  72. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  73. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  74. datahub/ingestion/source/sql/athena.py +2 -1
  75. datahub/ingestion/source/sql/clickhouse.py +12 -7
  76. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  77. datahub/ingestion/source/sql/druid.py +2 -2
  78. datahub/ingestion/source/sql/hive.py +4 -3
  79. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  80. datahub/ingestion/source/sql/mssql/source.py +2 -2
  81. datahub/ingestion/source/sql/mysql.py +2 -2
  82. datahub/ingestion/source/sql/oracle.py +3 -3
  83. datahub/ingestion/source/sql/presto.py +2 -1
  84. datahub/ingestion/source/sql/teradata.py +4 -4
  85. datahub/ingestion/source/sql/trino.py +2 -1
  86. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  87. datahub/ingestion/source/sql/vertica.py +1 -1
  88. datahub/ingestion/source/sql_queries.py +6 -6
  89. datahub/ingestion/source/state/checkpoint.py +5 -1
  90. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  91. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  92. datahub/ingestion/source/superset.py +122 -15
  93. datahub/ingestion/source/tableau/tableau.py +68 -14
  94. datahub/ingestion/source/tableau/tableau_common.py +5 -0
  95. datahub/ingestion/source/tableau/tableau_constant.py +1 -0
  96. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  97. datahub/ingestion/source/unity/config.py +7 -3
  98. datahub/ingestion/source/usage/usage_common.py +3 -3
  99. datahub/ingestion/source_config/pulsar.py +3 -1
  100. datahub/ingestion/transformer/set_browse_path.py +112 -0
  101. datahub/metadata/_internal_schema_classes.py +728 -528
  102. datahub/metadata/_urns/urn_defs.py +1702 -1702
  103. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  104. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  105. datahub/metadata/schema.avsc +17434 -17732
  106. datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
  107. datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
  108. datahub/metadata/schemas/LogicalParent.avsc +2 -1
  109. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  110. datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
  111. datahub/sdk/_shared.py +126 -0
  112. datahub/sdk/chart.py +87 -30
  113. datahub/sdk/dashboard.py +79 -34
  114. datahub/sdk/entity_client.py +11 -4
  115. datahub/sdk/lineage_client.py +3 -3
  116. datahub/sdk/search_filters.py +1 -7
  117. datahub/sql_parsing/split_statements.py +13 -0
  118. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
  119. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
  120. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -1,12 +1,13 @@
 import logging
 import os
 import re
+from copy import deepcopy
 from datetime import timedelta
 from typing import Dict, List, Optional, Union

 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -73,8 +74,10 @@ class BigQueryBaseConfig(ConfigModel):
             ) from e
         return v

-    @root_validator(pre=True, skip_on_failure=True)
+    @root_validator(pre=True)
     def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         project_id = values.pop("project_id", None)
         project_ids = values.get("project_ids")

@@ -182,13 +185,14 @@ class BigQueryFilterConfig(SQLFilterConfig):
     )

     # NOTE: `schema_pattern` is added here only to hide it from docs.
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
     )

     @root_validator(pre=False, skip_on_failure=True)
     def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
         schema_pattern = values.get("schema_pattern")
         if (
@@ -320,8 +324,7 @@ class BigQueryV2Config(
         description="Include full payload into events. It is only for debugging and internal use.",
     )

-    number_of_datasets_process_in_batch: int = Field(
-        hidden_from_docs=True,
+    number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(
         default=10000,
         description="Number of table queried in batch when getting metadata. This is a low level config property "
         "which should be touched with care.",
@@ -436,17 +439,15 @@ class BigQueryV2Config(

     upstream_lineage_in_report: bool = Field(
         default=False,
-        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.",
+        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally. Only works with legacy approach (`use_queries_v2: False`).",
     )

-    run_optimized_column_query: bool = Field(
-        hidden_from_docs=True,
+    run_optimized_column_query: HiddenFromDocs[bool] = Field(
         default=False,
         description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.",
     )

-    file_backed_cache_size: int = Field(
-        hidden_from_docs=True,
+    file_backed_cache_size: HiddenFromDocs[int] = Field(
         default=2000,
         description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
     )
@@ -456,10 +457,9 @@ class BigQueryV2Config(
         description="Option to exclude empty projects from being ingested.",
     )

-    schema_resolution_batch_size: int = Field(
+    schema_resolution_batch_size: HiddenFromDocs[int] = Field(
         default=100,
         description="The number of tables to process in a batch when resolving schema from DataHub.",
-        hidden_from_schema=True,
     )

     max_threads_dataset_parallelism: int = Field(
@@ -480,6 +480,8 @@ class BigQueryV2Config(

     @root_validator(pre=True)
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Historically this is used to disable schema ingestion
         if (
             "include_tables" in values
@@ -498,6 +500,8 @@ class BigQueryV2Config(

     @root_validator(skip_on_failure=True)
     def profile_default_settings(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Extra default SQLAlchemy option for better connection pooling and threading.
         # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
         values["options"].setdefault("max_overflow", -1)
@@ -515,9 +519,19 @@ class BigQueryV2Config(

         return v

+    @validator("upstream_lineage_in_report")
+    def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
+        if v and values.get("use_queries_v2", True):
+            logging.warning(
+                "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
+                "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
+            )
+
+        return v
+
     def get_table_pattern(self, pattern: List[str]) -> str:
         return "|".join(pattern) if pattern else ""

-    platform_instance_not_supported_for_bigquery = pydantic_removed_field(
+    _platform_instance_not_supported_for_bigquery = pydantic_removed_field(
         "platform_instance"
     )
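
The recurring `values = deepcopy(values)` lines above exist because a pydantic root validator receives the caller's dict, so popping or rewriting keys in place leaks back into the recipe dict the user passed in (and across tests that reuse it). A minimal sketch of the failure mode and the fix, using a hypothetical pydantic v1-style model rather than the actual DataHub config classes:

from copy import deepcopy
from typing import List

from pydantic import BaseModel, root_validator


class LegacyAliasConfig(BaseModel):  # hypothetical stand-in, not a DataHub class
    project_ids: List[str] = []

    @root_validator(pre=True)
    def migrate_project_id(cls, values: dict) -> dict:
        values = deepcopy(values)  # without this, the caller's dict loses "project_id"
        project_id = values.pop("project_id", None)
        if project_id and not values.get("project_ids"):
            values["project_ids"] = [project_id]
        return values


raw = {"project_id": "my-project"}
LegacyAliasConfig.parse_obj(raw)
assert "project_id" in raw  # the input dict is left untouched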

datahub/ingestion/source/bigquery_v2/bigquery_schema.py

@@ -283,23 +283,30 @@ class BigQuerySchemaApi:
         with self.report.list_datasets_timer:
             self.report.num_list_datasets_api_requests += 1
             datasets = self.bq_client.list_datasets(project_id, max_results=maxResults)
-            return [
-                BigqueryDataset(
-                    name=d.dataset_id,
-                    labels=d.labels,
-                    location=(
-                        d._properties.get("location")
-                        if hasattr(d, "_properties") and isinstance(d._properties, dict)
-                        else None
-                    ),
-                    # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
-                    # TODO: Given we are calling get_dataset for each dataset, we may consume and publish other fields too, such as created, modified, etc...
-                    # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
-                    # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
-                    comment=self.bq_client.get_dataset(d.reference).description,
+            result = []
+            for d in datasets:
+                # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
+                dataset = self.bq_client.get_dataset(d.reference)
+
+                location = (
+                    d._properties.get("location")
+                    if hasattr(d, "_properties") and isinstance(d._properties, dict)
+                    else None
+                )
+
+                result.append(
+                    BigqueryDataset(
+                        name=d.dataset_id,
+                        labels=d.labels,
+                        location=location,
+                        comment=dataset.description,
+                        created=dataset.created,
+                        last_altered=dataset.modified,
+                    )
                 )
-                for d in datasets
-            ]
+            return result

     # This is not used anywhere
     def get_datasets_for_project_id_with_information_schema(

datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

@@ -12,6 +12,7 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
     make_tag_urn,
+    make_ts_millis,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey
@@ -300,6 +301,8 @@ class BigQuerySchemaGenerator:
         description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None,
         extra_properties: Optional[Dict[str, str]] = None,
+        created: Optional[int] = None,
+        last_modified: Optional[int] = None,
     ) -> Iterable[MetadataWorkUnit]:
         schema_container_key = self.gen_dataset_key(project_id, dataset)

@@ -349,6 +352,8 @@ class BigQuerySchemaGenerator:
             ),
             tags=tags_joined,
             extra_properties=extra_properties,
+            created=created,
+            last_modified=last_modified,
         )

     def _process_project(
@@ -484,6 +489,12 @@ class BigQuerySchemaGenerator:
                 else None
             ),
             description=bigquery_dataset.comment,
+            created=make_ts_millis(bigquery_dataset.created)
+            if bigquery_dataset.created
+            else None,
+            last_modified=make_ts_millis(bigquery_dataset.last_altered)
+            if bigquery_dataset.last_altered
+            else None,
         )

         columns = None
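
The created and last_modified values threaded through the hunks above are epoch milliseconds; make_ts_millis (imported from datahub.emitter.mce_builder in this diff) is what converts the BigQuery datetimes. Conceptually the conversion is just seconds to milliseconds, roughly as in this sketch (the helper name here is made up, not the DataHub function):

from datetime import datetime, timezone


def ts_millis(dt: datetime) -> int:
    # assumes a timezone-aware datetime, as the BigQuery client returns
    return int(dt.timestamp() * 1000)


assert ts_millis(datetime(2024, 1, 1, tzinfo=timezone.utc)) == 1704067200000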

datahub/ingestion/source/bigquery_v2/queries_extractor.py

@@ -8,7 +8,7 @@ from typing import Collection, Dict, Iterable, List, Optional, TypedDict
 from google.cloud.bigquery import Client
 from pydantic import Field, PositiveInt

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     get_time_bucket,
@@ -86,12 +86,11 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
     # TODO: Support stateful ingestion for the time windows.
     window: BaseTimeWindowConfig = BaseTimeWindowConfig()

-    local_temp_path: Optional[pathlib.Path] = Field(
-        default=None,
-        description="Local path to store the audit log.",
+    local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = Field(
         # TODO: For now, this is simply an advanced config to make local testing easier.
         # Eventually, we will want to store date-specific files in the directory and use it as a cache.
-        hidden_from_docs=True,
+        default=None,
+        description="Local path to store the audit log.",
     )

     user_email_pattern: AllowDenyPattern = Field(
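
A pattern that repeats across these config files is replacing the hidden_from_docs=True Field kwarg with a HiddenFromDocs[...] type annotation. The real marker is defined in datahub/configuration/common.py (also changed in this release, +29 -1, but not shown in this diff); purely as an illustration of how such an Annotated-based marker can work, under the assumption that doc generation inspects field metadata:

from typing import Annotated, Any


class _HiddenFromDocsMarker:
    """Sentinel stored in a field's metadata so docs generation can skip the field."""


class _HiddenFromDocs:
    # Hypothetical: HiddenFromDocs[int] -> Annotated[int, _HiddenFromDocsMarker]
    def __getitem__(self, item: Any) -> Any:
        return Annotated[item, _HiddenFromDocsMarker]


HiddenFromDocs = _HiddenFromDocs()

# Usage then mirrors the diff, e.g.:
#   file_backed_cache_size: HiddenFromDocs[int] = Field(default=2000, ...)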

datahub/ingestion/source/common/gcp_credentials_config.py

@@ -9,7 +9,9 @@ from datahub.configuration.validate_multiline_string import pydantic_multiline_s


 class GCPCredential(ConfigModel):
-    project_id: Optional[str] = Field(description="Project id to set the credentials")
+    project_id: Optional[str] = Field(
+        None, description="Project id to set the credentials"
+    )
     private_key_id: str = Field(description="Private key id")
     private_key: str = Field(
         description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"

datahub/ingestion/source/data_lake_common/path_spec.py

@@ -11,7 +11,7 @@ from cached_property import cached_property
 from pydantic.fields import Field
 from wcmatch import pathlib

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.azure.abs_utils import is_abs_uri
 from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
@@ -89,63 +89,62 @@ class PathSpec(ConfigModel):
         description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
     )
     exclude: Optional[List[str]] = Field(
-        default=[],
+        [],
         description="list of paths in glob pattern which will be excluded while scanning for the datasets",
     )
     file_types: List[str] = Field(
-        default=SUPPORTED_FILE_TYPES,
+        SUPPORTED_FILE_TYPES,
         description="Files with extenstions specified here (subset of default value) only will be scanned to create dataset. Other files will be omitted.",
     )

     default_extension: Optional[str] = Field(
-        default=None,
+        None,
         description="For files without extension it will assume the specified file type. If it is not set the files without extensions will be skipped.",
     )

     table_name: Optional[str] = Field(
-        default=None,
+        None,
         description="Display name of the dataset.Combination of named variables from include path and strings",
     )

     # This is not used yet, but will be used in the future to sort the partitions
-    sort_key: Optional[SortKey] = Field(
-        hidden_from_docs=True,
-        default=None,
+    sort_key: HiddenFromDocs[Optional[SortKey]] = Field(
+        None,
         description="Sort key to use when sorting the partitions. This is useful when the partitions are not sorted in the order of the data. The key can be a compound key based on the path_spec variables.",
     )

     enable_compression: bool = Field(
-        default=True,
+        True,
         description="Enable or disable processing compressed files. Currently .gz and .bz files are supported.",
     )

     sample_files: bool = Field(
-        default=True,
+        True,
         description="Not listing all the files but only taking a handful amount of sample file to infer the schema. File count and file size calculation will be disabled. This can affect performance significantly if enabled",
     )

     allow_double_stars: bool = Field(
-        default=False,
+        False,
         description="Allow double stars in the include path. This can affect performance significantly if enabled",
     )

     autodetect_partitions: bool = Field(
-        default=True,
+        True,
         description="Autodetect partition(s) from the path. If set to true, it will autodetect partition key/value if the folder format is {partition_key}={partition_value} for example `year=2024`",
     )

     traversal_method: FolderTraversalMethod = Field(
-        default=FolderTraversalMethod.MAX,
+        FolderTraversalMethod.MAX,
         description="Method to traverse the folder. ALL: Traverse all the folders, MIN_MAX: Traverse the folders by finding min and max value, MAX: Traverse the folder with max value",
     )

     include_hidden_folders: bool = Field(
-        default=False,
+        False,
         description="Include hidden folders in the traversal (folders starting with . or _",
     )

     tables_filter_pattern: AllowDenyPattern = Field(
-        default=AllowDenyPattern.allow_all(),
+        AllowDenyPattern.allow_all(),
         description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
     )

@@ -479,7 +478,8 @@ class PathSpec(ConfigModel):
         return glob_include

     @pydantic.root_validator(skip_on_failure=True)
-    def validate_path_spec(cls, values: Dict) -> Dict[str, Any]:
+    @staticmethod
+    def validate_path_spec(values: Dict) -> Dict[str, Any]:
         # validate that main fields are populated
         required_fields = ["include", "file_types", "default_extension"]
         for f in required_fields:

datahub/ingestion/source/datahub/config.py

@@ -4,7 +4,7 @@ from typing import Optional, Set
 import pydantic
 from pydantic import Field, root_validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -98,16 +98,14 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         ),
     )

-    pull_from_datahub_api: bool = Field(
+    pull_from_datahub_api: HiddenFromDocs[bool] = Field(
         default=False,
         description="Use the DataHub API to fetch versioned aspects.",
-        hidden_from_docs=True,
     )

-    max_workers: int = Field(
+    max_workers: HiddenFromDocs[int] = Field(
         default=5 * (os.cpu_count() or 4),
         description="Number of worker threads to use for datahub api ingestion.",
-        hidden_from_docs=True,
     )

     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
@@ -118,10 +116,11 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
     )

-    structured_properties_template_cache_invalidation_interval: int = Field(
-        hidden_from_docs=True,
-        default=60,
-        description="Interval in seconds to invalidate the structured properties template cache.",
+    structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+        Field(
+            default=60,
+            description="Interval in seconds to invalidate the structured properties template cache.",
+        )
     )

     query_timeout: Optional[int] = Field(

datahub/ingestion/source/dbt/dbt_common.py

@@ -246,6 +246,23 @@ class DBTEntitiesEnabled(ConfigModel):
         return self.model_performance == EmitDirective.YES


+class MaterializedNodePatternConfig(ConfigModel):
+    """Configuration for filtering materialized nodes based on their physical location"""
+
+    database_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for database names to filter materialized nodes.",
+    )
+    schema_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for schema names in format '{database}.{schema}' to filter materialized nodes.",
+    )
+    table_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for table/view names in format '{database}.{schema}.{table}' to filter materialized nodes.",
+    )
+
+
 class DBTCommonConfig(
     StatefulIngestionConfigBase,
     PlatformInstanceConfigMixin,
@@ -294,6 +311,11 @@ class DBTCommonConfig(
         default=AllowDenyPattern.allow_all(),
         description="regex patterns for dbt model names to filter in ingestion.",
     )
+    materialized_node_pattern: MaterializedNodePatternConfig = Field(
+        default=MaterializedNodePatternConfig(),
+        description="Advanced filtering for materialized nodes based on their physical database location. "
+        "Provides fine-grained control over database.schema.table patterns for catalog consistency.",
+    )
     meta_mapping: Dict = Field(
         default={},
         description="mapping rules that will be executed against dbt meta properties. Refer to the section below on dbt meta automated mappings.",
@@ -1018,15 +1040,53 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             all_nodes_map,
         )

-    def _is_allowed_node(self, key: str) -> bool:
-        return self.config.node_name_pattern.allowed(key)
+    def _is_allowed_node(self, node: DBTNode) -> bool:
+        """
+        Check whether a node should be processed, using multi-layer rules. Checks for materialized nodes might need to be restricted in the future to some cases
+        """
+        if not self.config.node_name_pattern.allowed(node.dbt_name):
+            return False
+
+        if not self._is_allowed_materialized_node(node):
+            return False
+
+        return True
+
+    def _is_allowed_materialized_node(self, node: DBTNode) -> bool:
+        """Filter nodes based on their materialized database location for catalog consistency"""
+
+        # Database level filtering
+        if not node.database:
+            return True
+        if not self.config.materialized_node_pattern.database_pattern.allowed(
+            node.database
+        ):
+            return False
+
+        # Schema level filtering: {database}.{schema}
+        if not node.schema:
+            return True
+        if not self.config.materialized_node_pattern.schema_pattern.allowed(
+            node._join_parts([node.database, node.schema])
+        ):
+            return False
+
+        # Table level filtering: {database}.{schema}.{table}
+        if not node.name:
+            return True
+        if not self.config.materialized_node_pattern.table_pattern.allowed(
+            node.get_db_fqn()
+        ):
+            return False
+
+        return True

     def _filter_nodes(self, all_nodes: List[DBTNode]) -> List[DBTNode]:
         nodes: List[DBTNode] = []
         for node in all_nodes:
             key = node.dbt_name

-            if not self._is_allowed_node(key):
+            if not self._is_allowed_node(node):
                 self.report.nodes_filtered.append(key)
                 continue

@@ -1118,8 +1178,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             cll_nodes.add(dbt_name)
             schema_nodes.add(dbt_name)

-        for dbt_name in all_nodes_map:
-            if self._is_allowed_node(dbt_name):
+        for dbt_name, dbt_node in all_nodes_map.items():
+            if self._is_allowed_node(dbt_node):
                 add_node_to_cll_list(dbt_name)

         return schema_nodes, cll_nodes
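
The new materialized_node_pattern block above filters dbt nodes by where they are materialized, checking the database, then "{database}.{schema}", then "{database}.{schema}.{table}" against the three AllowDenyPattern fields. A rough sketch of how those patterns behave, using made-up names (the pattern values here are examples, not defaults):

from datahub.configuration.common import AllowDenyPattern

# Deny anything materialized into a staging schema; everything else stays allowed.
schema_pattern = AllowDenyPattern(deny=[r"analytics\.staging_.*"])

# _is_allowed_materialized_node checks "{database}.{schema}" against this pattern:
assert schema_pattern.allowed("analytics.marts")
assert not schema_pattern.allowed("analytics.staging_tmp")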

datahub/ingestion/source/delta_lake/config.py

@@ -78,7 +78,7 @@ class DeltaLakeSourceConfig(
         "When set to `False`, number_of_files in delta table can not be reported.",
     )

-    s3: Optional[S3] = Field()
+    s3: Optional[S3] = Field(None)

     @cached_property
     def is_s3(self):

datahub/ingestion/source/dremio/dremio_config.py

@@ -4,7 +4,7 @@ from typing import List, Literal, Optional
 import certifi
 from pydantic import Field, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -100,10 +100,9 @@ class ProfileConfig(GEProfilingBaseConfig):
     query_timeout: int = Field(
         default=300, description="Time before cancelling Dremio profiling query"
     )
-    include_field_median_value: bool = Field(
+    include_field_median_value: HiddenFromDocs[bool] = Field(
+        # Hidden because median causes a number of issues in Dremio.
         default=False,
-        hidden_from_docs=True,
-        description="Median causes a number of issues in Dremio.",
     )


datahub/ingestion/source/feast.py

@@ -1,5 +1,6 @@
+import pathlib
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Union

 import feast.types
 from feast import (
@@ -97,7 +98,7 @@ class FeastRepositorySourceConfig(
     StatefulIngestionConfigBase,
 ):
     path: str = Field(description="Path to Feast repository")
-    fs_yaml_file: Optional[str] = Field(
+    fs_yaml_file: Optional[pathlib.Path] = Field(
         default=None,
         description="Path to the `feature_store.yaml` file used to configure the feature store",
     )
@@ -142,17 +143,14 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     - Column types associated with each entity and feature
     """

-    platform = "feast"
-    source_config: FeastRepositorySourceConfig
-    report: StaleEntityRemovalSourceReport
-    feature_store: FeatureStore
+    platform: ClassVar[str] = "feast"

     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
-        self.source_config = config
-        self.ctx = ctx
-        self.report = StaleEntityRemovalSourceReport()
-        self.feature_store = FeatureStore(
+        self.source_config: FeastRepositorySourceConfig = config
+        self.ctx: PipelineContext = ctx
+        self.report: StaleEntityRemovalSourceReport = StaleEntityRemovalSourceReport()
+        self.feature_store: FeatureStore = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
         )

datahub/ingestion/source/fivetran/config.py

@@ -102,7 +102,7 @@ class FivetranLogConfig(ConfigModel):
         "destination_config", "snowflake_destination_config"
     )

-    @root_validator(pre=True)
+    @root_validator(skip_on_failure=True)
     def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict:
         destination_platform = values["destination_platform"]
         if destination_platform == "snowflake":

datahub/ingestion/source/gcs/gcs_source.py

@@ -37,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (

 logger: logging.Logger = logging.getLogger(__name__)

+GCS_ENDPOINT_URL = "https://storage.googleapis.com"
+

 class HMACKey(ConfigModel):
     hmac_access_id: str = Field(description="Access ID")
@@ -112,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
         s3_config = DataLakeSourceConfig(
             path_specs=s3_path_specs,
             aws_config=AwsConnectionConfig(
-                aws_endpoint_url="https://storage.googleapis.com",
+                aws_endpoint_url=GCS_ENDPOINT_URL,
                 aws_access_key_id=self.config.credential.hmac_access_id,
                 aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
                 aws_region="auto",
@@ -121,15 +123,25 @@ class GCSSource(StatefulIngestionSourceBase):
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
             platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
+            platform_instance=self.config.platform_instance,
         )
         return s3_config

     def create_equivalent_s3_path_specs(self):
         s3_path_specs = []
         for path_spec in self.config.path_specs:
+            # PathSpec modifies the passed-in include to add /** to the end if
+            # autodetecting partitions. Remove that, otherwise creating a new
+            # PathSpec will complain.
+            # TODO: this should be handled inside PathSpec, which probably shouldn't
+            # modify its input.
+            include = path_spec.include
+            if include.endswith("{table}/**") and not path_spec.allow_double_stars:
+                include = include.removesuffix("**")
+
             s3_path_specs.append(
                 PathSpec(
-                    include=path_spec.include.replace("gs://", "s3://"),
+                    include=include.replace("gs://", "s3://"),
                     exclude=(
                         [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
                         if path_spec.exclude
@@ -140,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
                     table_name=path_spec.table_name,
                     enable_compression=path_spec.enable_compression,
                     sample_files=path_spec.sample_files,
+                    allow_double_stars=path_spec.allow_double_stars,
+                    autodetect_partitions=path_spec.autodetect_partitions,
+                    include_hidden_folders=path_spec.include_hidden_folders,
+                    tables_filter_pattern=path_spec.tables_filter_pattern,
+                    traversal_method=path_spec.traversal_method,
                 )
             )
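
The include-trimming added in create_equivalent_s3_path_specs undoes the "/**" suffix that PathSpec appends when autodetecting partitions, so the path spec can be rebuilt for S3 without tripping validation. The string handling amounts to this (bucket name made up; removesuffix requires Python 3.9+):

include = "gs://my-bucket/data/{table}/**"
if include.endswith("{table}/**"):
    include = include.removesuffix("**")
assert include == "gs://my-bucket/data/{table}/"
assert include.replace("gs://", "s3://") == "s3://my-bucket/data/{table}/"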