acryl-datahub 1.0.0rc7__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (88)
  1. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/METADATA +2487 -2487
  2. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/RECORD +88 -84
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +731 -42
  5. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  6. datahub/cli/specific/dataset_cli.py +128 -14
  7. datahub/configuration/git.py +1 -3
  8. datahub/ingestion/glossary/classification_mixin.py +1 -1
  9. datahub/ingestion/graph/client.py +16 -12
  10. datahub/ingestion/graph/filters.py +64 -37
  11. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  12. datahub/ingestion/source/abs/config.py +2 -4
  13. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  14. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
  15. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  16. datahub/ingestion/source/csv_enricher.py +1 -1
  17. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  18. datahub/ingestion/source/file.py +5 -2
  19. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  20. datahub/ingestion/source/ge_data_profiler.py +11 -14
  21. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  22. datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
  23. datahub/ingestion/source/identity/okta.py +1 -3
  24. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
  25. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  26. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  27. datahub/ingestion/source/looker/lookml_source.py +2 -1
  28. datahub/ingestion/source/metadata/lineage.py +2 -2
  29. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  30. datahub/ingestion/source/nifi.py +6 -3
  31. datahub/ingestion/source/openapi_parser.py +2 -2
  32. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  33. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  34. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  35. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  36. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  37. datahub/ingestion/source/preset.py +7 -4
  38. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  39. datahub/ingestion/source/redash.py +2 -1
  40. datahub/ingestion/source/s3/config.py +2 -4
  41. datahub/ingestion/source/s3/source.py +20 -41
  42. datahub/ingestion/source/salesforce.py +1 -1
  43. datahub/ingestion/source/schema_inference/object.py +1 -1
  44. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  45. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  46. datahub/ingestion/source/sql/athena.py +2 -2
  47. datahub/ingestion/source/sql/sql_common.py +2 -2
  48. datahub/ingestion/source/sql/sql_types.py +2 -2
  49. datahub/ingestion/source/sql/teradata.py +4 -2
  50. datahub/ingestion/source/sql/trino.py +2 -2
  51. datahub/ingestion/source/superset.py +218 -56
  52. datahub/ingestion/source/tableau/tableau.py +1 -5
  53. datahub/lite/duckdb_lite.py +3 -9
  54. datahub/metadata/_schema_classes.py +157 -14
  55. datahub/metadata/_urns/urn_defs.py +58 -58
  56. datahub/metadata/schema.avsc +23 -10
  57. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  58. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  59. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  60. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  61. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  62. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  63. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  64. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  65. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  66. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  67. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  68. datahub/metadata/schemas/PostKey.avsc +2 -1
  69. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  70. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  71. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  72. datahub/pydantic/__init__.py +0 -0
  73. datahub/pydantic/compat.py +58 -0
  74. datahub/sdk/__init__.py +1 -0
  75. datahub/sdk/_all_entities.py +1 -1
  76. datahub/sdk/_shared.py +88 -3
  77. datahub/sdk/container.py +7 -1
  78. datahub/sdk/dataset.py +10 -4
  79. datahub/sdk/{_entity.py → entity.py} +4 -0
  80. datahub/sdk/entity_client.py +1 -1
  81. datahub/sdk/main_client.py +7 -1
  82. datahub/sdk/resolver_client.py +17 -29
  83. datahub/sdk/search_client.py +50 -0
  84. datahub/sdk/search_filters.py +374 -0
  85. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/LICENSE +0 -0
  86. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/WHEEL +0 -0
  87. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/entry_points.txt +0 -0
  88. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Iterable, List, Optional, Union
 
 import yaml
 from pydantic import validator
@@ -38,7 +38,7 @@ class AllowedTypes(Enum):
 
 
 class AllowedValue(ConfigModel):
-    value: str
+    value: Union[int, float, str]
     description: Optional[str] = None
 
 
@@ -1,12 +1,15 @@
+import filecmp
 import json
 import logging
+import os
+import shutil
 from pathlib import Path
-from typing import Set, Tuple
+from typing import List, Set, Tuple
 
 import click
 from click_default_group import DefaultGroup
 
-from datahub.api.entities.dataset.dataset import Dataset
+from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
@@ -30,18 +33,9 @@ def dataset() -> None:
 @telemetry.with_telemetry()
 def upsert(file: Path) -> None:
     """Upsert attributes to a Dataset in DataHub."""
-
-    with get_default_graph() as graph:
-        for dataset in Dataset.from_yaml(str(file)):
-            try:
-                for mcp in dataset.generate_mcp():
-                    graph.emit(mcp)
-                click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
-            except Exception as e:
-                click.secho(
-                    f"Update failed for id {id}. due to {e}",
-                    fg="red",
-                )
+    # Call the sync command with to_datahub=True to perform the upsert operation
+    ctx = click.get_current_context()
+    ctx.invoke(sync, file=str(file), to_datahub=True)
 
 
 @dataset.command(
@@ -111,3 +105,123 @@ def _get_existing_siblings(graph: DataHubGraph, urn: str) -> Set[str]:
         return set(existing.siblings)
     else:
         return set()
+
+
+@dataset.command(
+    name="file",
+)
+@click.option("--lintCheck", required=False, is_flag=True)
+@click.option("--lintFix", required=False, is_flag=True)
+@click.argument("file", type=click.Path(exists=True))
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def file(lintcheck: bool, lintfix: bool, file: str) -> None:
+    """Operate on a Dataset file"""
+
+    if lintcheck or lintfix:
+        import tempfile
+        from pathlib import Path
+
+        # Create a temporary file in a secure way
+        # The file will be automatically deleted when the context manager exits
+        with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as temp:
+            temp_path = Path(temp.name)
+            try:
+                # Copy content to the temporary file
+                shutil.copyfile(file, temp_path)
+
+                # Run the linting
+                datasets = Dataset.from_yaml(temp.name)
+                for dataset in datasets:
+                    dataset.to_yaml(temp_path)
+
+                # Compare the files
+                files_match = filecmp.cmp(file, temp_path)
+
+                if files_match:
+                    click.secho("No differences found", fg="green")
+                else:
+                    # Show diff for visibility
+                    os.system(f"diff {file} {temp_path}")
+
+                    if lintfix:
+                        shutil.copyfile(temp_path, file)
+                        click.secho(f"Fixed linting issues in {file}", fg="green")
+                    else:
+                        click.secho(
+                            f"To fix these differences, run 'datahub dataset file --lintFix {file}'",
+                            fg="yellow",
+                        )
+            finally:
+                # Ensure the temporary file is removed
+                if temp_path.exists():
+                    temp_path.unlink()
+    else:
+        click.secho(
+            "No operation specified. Choose from --lintCheck or --lintFix", fg="yellow"
+        )
+
+
+@dataset.command(
+    name="sync",
+)
+@click.option("-f", "--file", required=True, type=click.Path(exists=True))
+@click.option("--to-datahub/--from-datahub", required=True, is_flag=True)
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def sync(file: str, to_datahub: bool) -> None:
+    """Sync a Dataset file to/from DataHub"""
+
+    failures: List[str] = []
+    with get_default_graph() as graph:
+        datasets = Dataset.from_yaml(file)
+        for dataset in datasets:
+            assert (
+                dataset.urn is not None
+            )  # Validator should have ensured this is filled. Tell mypy it's not None
+            if to_datahub:
+                missing_entity_references = [
+                    entity_reference
+                    for entity_reference in dataset.entity_references()
+                    if not graph.exists(entity_reference)
+                ]
+                if missing_entity_references:
+                    click.secho(
+                        "\n\t- ".join(
+                            [
+                                f"Skipping Dataset {dataset.urn} due to missing entity references: "
+                            ]
+                            + missing_entity_references
+                        ),
+                        fg="red",
+                    )
+                    failures.append(dataset.urn)
+                    continue
+                try:
+                    for mcp in dataset.generate_mcp():
+                        graph.emit(mcp)
+                    click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
+                except Exception as e:
+                    click.secho(
+                        f"Update failed for id {id}. due to {e}",
+                        fg="red",
+                    )
+            else:
+                # Sync from DataHub
+                if graph.exists(dataset.urn):
+                    dataset_get_config = DatasetRetrievalConfig()
+                    if dataset.downstreams:
+                        dataset_get_config.include_downstreams = True
+                    existing_dataset: Dataset = Dataset.from_datahub(
+                        graph=graph, urn=dataset.urn, config=dataset_get_config
+                    )
+                    existing_dataset.to_yaml(Path(file))
+                else:
+                    click.secho(f"Dataset {dataset.urn} does not exist")
+                    failures.append(dataset.urn)
+    if failures:
+        click.secho(
+            f"\nFailed to sync the following Datasets: {', '.join(failures)}",
+            fg="red",
+        )
+        raise click.Abort()
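Net effect of this hunk: `upsert` becomes a thin wrapper that re-invokes `sync` with `to_datahub=True`, the new `file` subcommand round-trips a dataset YAML for lint checks, and `sync` pushes to or pulls from the server after verifying entity references. Illustrative invocations, assuming a dataset YAML named datasets.yaml (hypothetical file name) and the option wiring shown above:

    datahub dataset file --lintCheck datasets.yaml
    datahub dataset sync -f datasets.yaml --to-datahub
    datahub dataset sync -f datasets.yaml --from-datahub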
@@ -43,9 +43,7 @@ class GitReference(ConfigModel):
 
     @validator("repo", pre=True)
     def simplify_repo_url(cls, repo: str) -> str:
-        if repo.startswith("github.com/"):
-            repo = f"https://{repo}"
-        elif repo.startswith("gitlab.com"):
+        if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
             repo = f"https://{repo}"
         elif repo.count("/") == 1:
             repo = f"https://github.com/{repo}"
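Behavior is unchanged by the merged branch; roughly, per the validator above (values illustrative):

    "github.com/acme/models"  ->  "https://github.com/acme/models"
    "gitlab.com/acme/models"  ->  "https://gitlab.com/acme/models"
    "acme/models"             ->  "https://github.com/acme/models"  # bare org/repo still assumes GitHub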
@@ -281,7 +281,7 @@ class ClassificationHandler:
                 ),
                 values=(
                     sample_data[schema_field.fieldPath]
-                    if schema_field.fieldPath in sample_data.keys()
+                    if schema_field.fieldPath in sample_data
                     else []
                 ),
             )
@@ -16,6 +16,7 @@ from typing import (
     List,
     Literal,
     Optional,
+    Sequence,
     Tuple,
     Type,
     Union,
@@ -42,8 +43,8 @@ from datahub.ingestion.graph.connections import (
 )
 from datahub.ingestion.graph.entity_versioning import EntityVersioningAPI
 from datahub.ingestion.graph.filters import (
+    RawSearchFilterRule,
     RemovedStatusFilter,
-    SearchFilterRule,
     generate_filter,
 )
 from datahub.ingestion.source.state.checkpoint import Checkpoint
@@ -105,7 +106,7 @@ class RelatedEntity:
     via: Optional[str] = None
 
 
-def _graphql_entity_type(entity_type: str) -> str:
+def entity_type_to_graphql(entity_type: str) -> str:
     """Convert the entity types into GraphQL "EntityType" enum values."""
 
     # Hard-coded special cases.
@@ -330,7 +331,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         aspect_type_name: Optional[str] = None,
         version: int = 0,
     ) -> Optional[Aspect]:
-        assert aspect_type.ASPECT_NAME == aspect
+        assert aspect == aspect_type.ASPECT_NAME
         return self.get_aspect(
             entity_urn=entity_urn,
             aspect_type=aspect_type,
@@ -797,13 +798,13 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 100,
-        extraFilters: Optional[List[SearchFilterRule]] = None,
+        extraFilters: Optional[List[RawSearchFilterRule]] = None,
     ) -> Iterable[Tuple[str, "GraphQLSchemaMetadata"]]:
         """Fetch schema info for datasets that match all of the given filters.
 
         :return: An iterable of (urn, schema info) tuple that match the filters.
         """
-        types = [_graphql_entity_type("dataset")]
+        types = [entity_type_to_graphql("dataset")]
 
         # Add the query default of * if no query is specified.
         query = query or "*"
@@ -865,7 +866,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_urns_by_filter(
         self,
         *,
-        entity_types: Optional[List[str]] = None,
+        entity_types: Optional[Sequence[str]] = None,
         platform: Optional[str] = None,
         platform_instance: Optional[str] = None,
         env: Optional[str] = None,
@@ -873,8 +874,8 @@
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
-        extraFilters: Optional[List[SearchFilterRule]] = None,
-        extra_or_filters: Optional[List[Dict[str, List[SearchFilterRule]]]] = None,
+        extraFilters: Optional[List[RawSearchFilterRule]] = None,
+        extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.
 
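For callers, the practical change is that `entity_types` now accepts any Sequence and the extra filter parameters are typed as raw rule dicts. A rough usage sketch against a reachable DataHub instance (the platform, field, and tag values are hypothetical):

    from datahub.ingestion.graph.client import get_default_graph

    with get_default_graph() as graph:
        urns = graph.get_urns_by_filter(
            entity_types=("dataset",),  # any Sequence[str] works now, not just List
            platform="snowflake",
            extraFilters=[
                {"field": "tags", "condition": "EQUAL", "values": ["urn:li:tag:pii"]}
            ],
        )
        for urn in urns:
            print(urn)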
@@ -965,8 +966,8 @@
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
-        extra_and_filters: Optional[List[SearchFilterRule]] = None,
-        extra_or_filters: Optional[List[Dict[str, List[SearchFilterRule]]]] = None,
+        extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
+        extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
         extra_source_fields: Optional[List[str]] = None,
         skip_cache: bool = False,
     ) -> Iterable[dict]:
@@ -1109,7 +1110,8 @@
                 f"Scrolling to next scrollAcrossEntities page: {scroll_id}"
             )
 
-    def _get_types(self, entity_types: Optional[List[str]]) -> Optional[List[str]]:
+    @classmethod
+    def _get_types(cls, entity_types: Optional[Sequence[str]]) -> Optional[List[str]]:
         types: Optional[List[str]] = None
         if entity_types is not None:
             if not entity_types:
@@ -1117,7 +1119,9 @@
                     "entity_types cannot be an empty list; use None for all entities"
                 )
 
-            types = [_graphql_entity_type(entity_type) for entity_type in entity_types]
+            types = [
+                entity_type_to_graphql(entity_type) for entity_type in entity_types
+            ]
         return types
 
     def get_latest_pipeline_checkpoint(
@@ -1,3 +1,4 @@
+import dataclasses
 import enum
 from typing import Any, Dict, List, Optional
 
@@ -7,7 +8,31 @@ from datahub.emitter.mce_builder import (
 )
 from datahub.utilities.urns.urn import guess_entity_type
 
-SearchFilterRule = Dict[str, Any]
+RawSearchFilterRule = Dict[str, Any]
+
+
+@dataclasses.dataclass
+class SearchFilterRule:
+    field: str
+    condition: str  # TODO: convert to an enum
+    values: List[str]
+    negated: bool = False
+
+    def to_raw(self) -> RawSearchFilterRule:
+        return {
+            "field": self.field,
+            "condition": self.condition,
+            "values": self.values,
+            "negated": self.negated,
+        }
+
+    def negate(self) -> "SearchFilterRule":
+        return SearchFilterRule(
+            field=self.field,
+            condition=self.condition,
+            values=self.values,
+            negated=not self.negated,
+        )
 
 
 class RemovedStatusFilter(enum.Enum):
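The new dataclass lets the helpers below build filters as structured objects and only lower them to the legacy dict shape (RawSearchFilterRule) at the call boundary via to_raw(). A minimal sketch, directly mirroring the class above:

    from datahub.ingestion.graph.filters import SearchFilterRule

    soft_deleted = SearchFilterRule(field="removed", condition="EQUAL", values=["true"])
    not_deleted = soft_deleted.negate()  # same rule with negated flipped to True
    payload = not_deleted.to_raw()
    # {'field': 'removed', 'condition': 'EQUAL', 'values': ['true'], 'negated': True}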
@@ -29,9 +54,9 @@ def generate_filter(
     env: Optional[str],
     container: Optional[str],
     status: RemovedStatusFilter,
-    extra_filters: Optional[List[SearchFilterRule]],
-    extra_or_filters: Optional[List[SearchFilterRule]] = None,
-) -> List[Dict[str, List[SearchFilterRule]]]:
+    extra_filters: Optional[List[RawSearchFilterRule]],
+    extra_or_filters: Optional[List[RawSearchFilterRule]] = None,
+) -> List[Dict[str, List[RawSearchFilterRule]]]:
     """
     Generate a search filter based on the provided parameters.
     :param platform: The platform to filter by.
@@ -43,30 +68,32 @@ def generate_filter(
     :param extra_or_filters: Extra OR filters to apply. These are combined with
         the AND filters using an OR at the top level.
     """
-    and_filters: List[SearchFilterRule] = []
+    and_filters: List[RawSearchFilterRule] = []
 
     # Platform filter.
     if platform:
-        and_filters.append(_get_platform_filter(platform))
+        and_filters.append(_get_platform_filter(platform).to_raw())
 
     # Platform instance filter.
     if platform_instance:
-        and_filters.append(_get_platform_instance_filter(platform, platform_instance))
+        and_filters.append(
+            _get_platform_instance_filter(platform, platform_instance).to_raw()
+        )
 
     # Browse path v2 filter.
     if container:
-        and_filters.append(_get_container_filter(container))
+        and_filters.append(_get_container_filter(container).to_raw())
 
     # Status filter.
     status_filter = _get_status_filter(status)
     if status_filter:
-        and_filters.append(status_filter)
+        and_filters.append(status_filter.to_raw())
 
     # Extra filters.
     if extra_filters:
        and_filters += extra_filters
 
-    or_filters: List[Dict[str, List[SearchFilterRule]]] = [{"and": and_filters}]
+    or_filters: List[Dict[str, List[RawSearchFilterRule]]] = [{"and": and_filters}]
 
     # Env filter
     if env:
@@ -89,7 +116,7 @@ def generate_filter(
     return or_filters
 
 
-def _get_env_filters(env: str) -> List[SearchFilterRule]:
+def _get_env_filters(env: str) -> List[RawSearchFilterRule]:
     # The env filter is a bit more tricky since it's not always stored
     # in the same place in ElasticSearch.
     return [
@@ -125,19 +152,19 @@ def _get_status_filter(status: RemovedStatusFilter) -> Optional[SearchFilterRule
         # removed field is simply not present in the ElasticSearch document. Ideally this
         # would be a "removed" : "false" filter, but that doesn't work. Instead, we need to
         # use a negated filter.
-        return {
-            "field": "removed",
-            "values": ["true"],
-            "condition": "EQUAL",
-            "negated": True,
-        }
+        return SearchFilterRule(
+            field="removed",
+            values=["true"],
+            condition="EQUAL",
+            negated=True,
+        )
 
     elif status == RemovedStatusFilter.ONLY_SOFT_DELETED:
-        return {
-            "field": "removed",
-            "values": ["true"],
-            "condition": "EQUAL",
-        }
+        return SearchFilterRule(
+            field="removed",
+            values=["true"],
+            condition="EQUAL",
+        )
 
     elif status == RemovedStatusFilter.ALL:
         # We don't need to add a filter for this case.
@@ -152,11 +179,11 @@ def _get_container_filter(container: str) -> SearchFilterRule:
     if guess_entity_type(container) != "container":
         raise ValueError(f"Invalid container urn: {container}")
 
-    return {
-        "field": "browsePathV2",
-        "values": [container],
-        "condition": "CONTAIN",
-    }
+    return SearchFilterRule(
+        field="browsePathV2",
+        values=[container],
+        condition="CONTAIN",
+    )
 
 
 def _get_platform_instance_filter(
@@ -171,16 +198,16 @@ def _get_platform_instance_filter(
     if guess_entity_type(platform_instance) != "dataPlatformInstance":
         raise ValueError(f"Invalid data platform instance urn: {platform_instance}")
 
-    return {
-        "field": "platformInstance",
-        "values": [platform_instance],
-        "condition": "EQUAL",
-    }
+    return SearchFilterRule(
+        field="platformInstance",
+        condition="EQUAL",
+        values=[platform_instance],
+    )
 
 
 def _get_platform_filter(platform: str) -> SearchFilterRule:
-    return {
-        "field": "platform.keyword",
-        "values": [make_data_platform_urn(platform)],
-        "condition": "EQUAL",
-    }
+    return SearchFilterRule(
+        field="platform.keyword",
+        condition="EQUAL",
+        values=[make_data_platform_urn(platform)],
+    )
@@ -163,12 +163,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
                 key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
                 for key, value in obj.items()
             }
-        elif isinstance(obj, list):
-            return [
-                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
-                for element in obj
-            ]
-        elif isinstance(obj, set):
+        elif isinstance(obj, list) or isinstance(obj, set):
             return [
                 DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
                 for element in obj
@@ -144,10 +144,8 @@ class DataLakeSourceConfig(
         return path_specs
 
     @pydantic.validator("platform", always=True)
-    def platform_not_empty(cls, platform: str, values: dict) -> str:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_not_empty(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
@@ -165,7 +165,7 @@ class BigQueryTableRef:
     @classmethod
     def from_spec_obj(cls, spec: dict) -> "BigQueryTableRef":
         for key in ["projectId", "datasetId", "tableId"]:
-            if key not in spec.keys():
+            if key not in spec:
                 raise ValueError(f"invalid BigQuery table reference dict: {spec}")
 
         return cls(
@@ -344,7 +344,7 @@ class BigQuerySchemaApi:
         with_partitions: bool = False,
     ) -> Iterator[BigqueryTable]:
         with PerfTimer() as current_timer:
-            filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys())
+            filter_clause: str = ", ".join(f"'{table}'" for table in tables)
 
             if with_partitions:
                 query_template = BigqueryQuery.tables_for_dataset
@@ -59,9 +59,9 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
-from datahub.sdk._entity import Entity
 from datahub.sdk.container import Container
 from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 
 logger = logging.getLogger(__name__)
 
@@ -314,7 +314,7 @@ class CSVEnricherSource(Source):
             "datajob": EditableDataJobPropertiesClass,
             "dataflow": EditableDataFlowPropertiesClass,
             "notebook": EditableNotebookPropertiesClass,
-        }.get(entityType, None)
+        }.get(entityType)
 
         if not entityClass:
             raise ValueError(
@@ -1033,7 +1033,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                 cll_nodes.add(dbt_name)
                 schema_nodes.add(dbt_name)
 
-        for dbt_name in all_nodes_map.keys():
+        for dbt_name in all_nodes_map:
             if self._is_allowed_node(dbt_name):
                 add_node_to_cll_list(dbt_name)
 
@@ -410,10 +410,13 @@ def _from_obj_for_file(
         item = MetadataChangeEvent.from_obj(obj)
     elif "aspect" in obj:
         item = MetadataChangeProposalWrapper.from_obj(obj)
-    else:
+    elif "bucket" in obj:
         item = UsageAggregationClass.from_obj(obj)
+    else:
+        raise ValueError(f"Unknown object type: {obj}")
+
     if not item.validate():
-        raise ValueError(f"failed to parse: {obj}")
+        raise ValueError(f"Failed to parse: {obj}")
 
     if isinstance(item, UsageAggregationClass):
         logger.warning(f"Dropping deprecated UsageAggregationClass: {item}")
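With the stricter dispatch, an unrecognized payload now fails fast instead of being parsed as a usage aggregation. Roughly, per the branches shown (object shapes illustrative):

    {"aspect": ...}   -> MetadataChangeProposalWrapper.from_obj(obj)
    {"bucket": ...}   -> UsageAggregationClass.from_obj(obj)  # still parsed, then dropped with a deprecation warning
    anything else     -> ValueError(f"Unknown object type: {obj}")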
@@ -498,7 +498,7 @@ class DataProcessCleanup:
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
-            for key in dataFlows.keys():
+            for key in dataFlows:
                 if not dataJobs.get(key) or len(dataJobs[key]) == 0:
                     logger.info(
                         f"Deleting dataflow {key} because there are not datajobs"
@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
             ).select_from(self._table)
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == BIGQUERY:
-        element_values = self.engine.execute(
-            sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
-                self._table
-            )
-        )
-        return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == SNOWFLAKE:
+    elif (
+        self.engine.dialect.name.lower() == BIGQUERY
+        or self.engine.dialect.name.lower() == SNOWFLAKE
+    ):
         element_values = self.engine.execute(
             sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
                 self._table
@@ -381,13 +377,14 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             col = col_dict["name"]
             self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
-            if not self.config._allow_deny_patterns.allowed(
-                f"{self.dataset_name}.{col}"
+            if (
+                not self.config._allow_deny_patterns.allowed(
+                    f"{self.dataset_name}.{col}"
+                )
+                or not self.config.profile_nested_fields
+                and "." in col
             ):
                 ignored_columns_by_pattern.append(col)
-            # We try to ignore nested columns as well
-            elif not self.config.profile_nested_fields and "." in col:
-                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:
@@ -1408,7 +1405,7 @@ class DatahubGEProfiler:
             },
         )
 
-        if platform == BIGQUERY or platform == DATABRICKS:
+        if platform in (BIGQUERY, DATABRICKS):
             # This is done as GE makes the name as DATASET.TABLE
             # but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
             name_parts = pretty_name.split(".")