acryl-datahub 1.0.0rc7__py3-none-any.whl → 1.0.0rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.
Files changed (52)
  1. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/METADATA +2405 -2405
  2. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/RECORD +52 -52
  3. datahub/_version.py +1 -1
  4. datahub/configuration/git.py +1 -3
  5. datahub/ingestion/glossary/classification_mixin.py +1 -1
  6. datahub/ingestion/graph/client.py +1 -1
  7. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  8. datahub/ingestion/source/abs/config.py +2 -4
  9. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  10. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
  11. datahub/ingestion/source/csv_enricher.py +1 -1
  12. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  13. datahub/ingestion/source/file.py +5 -2
  14. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  15. datahub/ingestion/source/ge_data_profiler.py +11 -14
  16. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  17. datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
  18. datahub/ingestion/source/identity/okta.py +1 -3
  19. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
  20. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  21. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  22. datahub/ingestion/source/looker/lookml_source.py +2 -1
  23. datahub/ingestion/source/metadata/lineage.py +2 -2
  24. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  25. datahub/ingestion/source/nifi.py +6 -3
  26. datahub/ingestion/source/openapi_parser.py +2 -2
  27. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  28. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  29. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  30. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  31. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  32. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  33. datahub/ingestion/source/redash.py +2 -1
  34. datahub/ingestion/source/s3/config.py +2 -4
  35. datahub/ingestion/source/s3/source.py +20 -41
  36. datahub/ingestion/source/salesforce.py +1 -1
  37. datahub/ingestion/source/schema_inference/object.py +1 -1
  38. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  39. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  40. datahub/ingestion/source/sql/athena.py +2 -2
  41. datahub/ingestion/source/sql/sql_common.py +2 -2
  42. datahub/ingestion/source/sql/sql_types.py +2 -2
  43. datahub/ingestion/source/sql/teradata.py +4 -2
  44. datahub/ingestion/source/sql/trino.py +2 -2
  45. datahub/ingestion/source/superset.py +65 -37
  46. datahub/ingestion/source/tableau/tableau.py +1 -5
  47. datahub/lite/duckdb_lite.py +3 -9
  48. datahub/sdk/dataset.py +3 -3
  49. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/LICENSE +0 -0
  50. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/WHEEL +0 -0
  51. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/entry_points.txt +0 -0
  52. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/iceberg/iceberg.py

@@ -2,8 +2,9 @@ import json
 import logging
 import threading
 import uuid
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
+from dateutil import parser as dateutil_parser
 from pyiceberg.catalog import Catalog
 from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
@@ -81,6 +82,7 @@ from datahub.metadata.schema_classes import (
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
+    TimeStampClass,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -183,16 +185,9 @@ class IcebergSource(StatefulIngestionSourceBase):
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()
 
-        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
-            LOGGER.debug(f"Processing dataset for path {dataset_path}")
-            dataset_name = ".".join(dataset_path)
-            if not self.config.table_pattern.allowed(dataset_name):
-                # Dataset name is rejected by pattern, report as dropped.
-                self.report.report_dropped(dataset_name)
-                LOGGER.debug(
-                    f"Skipping table {dataset_name} due to not being allowed by the config pattern"
-                )
-                return
+        def _try_processing_dataset(
+            dataset_path: Tuple[str, ...], dataset_name: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
                     LOGGER.debug(
@@ -248,10 +243,31 @@ class IcebergSource(StatefulIngestionSourceBase):
                 LOGGER.warning(
                     f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
                 )
+            except ValueError as e:
+                if "Could not initialize FileIO" not in str(e):
+                    raise
+                self.report.warning(
+                    "Could not initialize FileIO",
+                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                )
+
+        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+            try:
+                LOGGER.debug(f"Processing dataset for path {dataset_path}")
+                dataset_name = ".".join(dataset_path)
+                if not self.config.table_pattern.allowed(dataset_name):
+                    # Dataset name is rejected by pattern, report as dropped.
+                    self.report.report_dropped(dataset_name)
+                    LOGGER.debug(
+                        f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+                    )
+                    return
+
+                yield from _try_processing_dataset(dataset_path, dataset_name)
             except Exception as e:
                 self.report.report_failure(
                     "general",
-                    f"Failed to create workunit for dataset {dataset_name}: {e}",
+                    f"Failed to create workunit for dataset {dataset_path}: {e}",
                 )
                 LOGGER.exception(
                     f"Exception while processing table {dataset_path}, skipping it.",
@@ -288,6 +304,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         )
 
         # Dataset properties aspect.
+        additional_properties = {}
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
@@ -299,10 +316,27 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["manifest-list"] = (
                 table.current_snapshot().manifest_list
             )
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
         )
         dataset_snapshot.aspects.append(dataset_properties)
         # Dataset ownership aspect.
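For reference, the two timestamps added above come from different shapes in Iceberg metadata: the current snapshot's timestamp_ms is already epoch milliseconds, while the optional created-at table property is an ISO-8601 string that has to be parsed. A standalone sketch of the created-at conversion, outside the DataHub classes:

# Sketch only: convert Iceberg's optional "created-at" property (ISO-8601
# string) into epoch milliseconds, ignoring values that do not parse.
from typing import Dict, Optional

from dateutil import parser as dateutil_parser


def created_at_millis(properties: Dict[str, str]) -> Optional[int]:
    raw = properties.get("created-at")
    if raw is None:
        return None
    try:
        return int(dateutil_parser.isoparse(raw).timestamp() * 1000)
    except (ValueError, OverflowError):
        return None  # mirrors the diff's "parse failed, ignore" behaviour


print(created_at_millis({"created-at": "2024-05-01T12:00:00+00:00"}))  # 1714564800000
print(created_at_millis({}))                                           # None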
datahub/ingestion/source/iceberg/iceberg_common.py

@@ -1,4 +1,5 @@
 import logging
+import threading
 from dataclasses import dataclass, field
 from typing import Any, Dict, Optional
 
@@ -156,18 +157,21 @@ class TopTableTimings:
     def __init__(self, size: int = 10):
         self._size = size
         self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+        self._lock = threading.Lock()
 
     def add(self, entity: Dict[str, Any]) -> None:
         if self._VALUE_FIELD not in entity:
             return
-        self.top_entites.add(entity)
-        if len(self.top_entites) > self._size:
-            self.top_entites.pop()
+        with self._lock:
+            self.top_entites.add(entity)
+            if len(self.top_entites) > self._size:
+                self.top_entites.pop()
 
     def __str__(self) -> str:
-        if len(self.top_entites) == 0:
-            return "no timings reported"
-        return str(list(self.top_entites))
+        with self._lock:
+            if len(self.top_entites) == 0:
+                return "no timings reported"
+            return str(list(self.top_entites))
 
 
 class TimingClass:
@@ -175,24 +179,31 @@ class TimingClass:
 
     def __init__(self):
         self.times = SortedList()
+        self._lock = threading.Lock()
 
     def add_timing(self, t: float) -> None:
-        self.times.add(t)
+        with self._lock:
+            self.times.add(t)
 
     def __str__(self) -> str:
-        if len(self.times) == 0:
-            return "no timings reported"
-        total = sum(self.times)
-        avg = total / len(self.times)
-        return str(
-            {
-                "average_time": format_timespan(avg, detailed=True, max_units=3),
-                "min_time": format_timespan(self.times[0], detailed=True, max_units=3),
-                "max_time": format_timespan(self.times[-1], detailed=True, max_units=3),
-                # total_time does not provide correct information in case we run in more than 1 thread
-                "total_time": format_timespan(total, detailed=True, max_units=3),
-            }
-        )
+        with self._lock:
+            if len(self.times) == 0:
+                return "no timings reported"
+            total = sum(self.times)
+            avg = total / len(self.times)
+            return str(
+                {
+                    "average_time": format_timespan(avg, detailed=True, max_units=3),
+                    "min_time": format_timespan(
+                        self.times[0], detailed=True, max_units=3
+                    ),
+                    "max_time": format_timespan(
+                        self.times[-1], detailed=True, max_units=3
+                    ),
+                    # total_time does not provide correct information in case we run in more than 1 thread
+                    "total_time": format_timespan(total, detailed=True, max_units=3),
                }
            )
 
 
 @dataclass
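These iceberg_common.py changes guard every read and write of the shared SortedList with a lock, since the timing collectors are now fed from multiple worker threads and sortedcontainers' SortedList is not safe for concurrent mutation. A reduced sketch of the same lock-per-collector pattern (class and method names are illustrative):

# Sketch only: every mutation and every read snapshot of the shared
# SortedList happens under one lock, so concurrent adds cannot race
# with each other or with summary formatting.
import threading

from sortedcontainers import SortedList


class ThreadSafeTimings:
    def __init__(self, size: int = 10) -> None:
        self._size = size
        self._times = SortedList()
        self._lock = threading.Lock()

    def add(self, value: float) -> None:
        with self._lock:
            self._times.add(value)
            if len(self._times) > self._size:
                self._times.pop()  # keep only the smallest `size` values

    def summary(self) -> str:
        with self._lock:
            if not self._times:
                return "no timings reported"
            return f"n={len(self._times)} min={self._times[0]} max={self._times[-1]}"


timings = ThreadSafeTimings()
workers = [threading.Thread(target=timings.add, args=(i / 10,)) for i in range(25)]
for w in workers:
    w.start()
for w in workers:
    w.join()
print(timings.summary())  # n=10 min=0.0 max=0.9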
datahub/ingestion/source/identity/okta.py

@@ -568,9 +568,7 @@ class OktaSource(StatefulIngestionSourceBase):
         if (
             self.config.include_deprovisioned_users is False
             and okta_user.status == UserStatus.DEPROVISIONED
-        ):
-            return False
-        elif (
+        ) or (
             self.config.include_suspended_users is False
             and okta_user.status == UserStatus.SUSPENDED
         ):
datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -447,13 +447,10 @@ class DebeziumSourceConnector(BaseConnector):
     ) -> DebeziumParser:
         connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")
 
-        if connector_class == "io.debezium.connector.mysql.MySqlConnector":
-            parser = self.DebeziumParser(
-                source_platform="mysql",
-                server_name=self.get_server_name(connector_manifest),
-                database_name=None,
-            )
-        elif connector_class == "MySqlConnector":
+        if (
+            connector_class == "io.debezium.connector.mysql.MySqlConnector"
+            or connector_class == "MySqlConnector"
+        ):
             parser = self.DebeziumParser(
                 source_platform="mysql",
                 server_name=self.get_server_name(connector_manifest),
datahub/ingestion/source/looker/looker_lib_wrapper.py

@@ -205,8 +205,9 @@ class LookerAPI:
     def folder_ancestors(
         self,
         folder_id: str,
-        fields: Union[str, List[str]] = ["id", "name", "parent_id"],
+        fields: Optional[Union[str, List[str]]] = None,
     ) -> Sequence[Folder]:
+        fields = fields or ["id", "name", "parent_id"]
         self.client_stats.folder_calls += 1
         try:
             return self.client.folder_ancestors(
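This hunk and several that follow (LookML template handling, NiFi, the Power BI M-Query parser, Qlik Sense, Redash) make the same change: mutable defaults such as = [] or = {} become None, with an x = x or [...] guard in the body. Python evaluates default values once, at function definition time, so a mutable default is silently shared across calls; a minimal demonstration of the difference:

# Sketch only: the shared-state pitfall that motivates the Optional/None pattern.
from typing import List, Optional


def shared_default(item: str, seen: List[str] = []) -> List[str]:
    seen.append(item)  # mutates the single list created at definition time
    return seen


def safe_default(item: str, seen: Optional[List[str]] = None) -> List[str]:
    seen = seen or []  # same guard style used throughout this release
    seen.append(item)
    return seen


print(shared_default("a"))  # ['a']
print(shared_default("b"))  # ['a', 'b']  <- state leaked from the previous call
print(safe_default("a"))    # ['a']
print(safe_default("b"))    # ['b']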
datahub/ingestion/source/looker/looker_template_language.py

@@ -464,9 +464,10 @@ def process_lookml_template_language(
     source_config: LookMLSourceConfig,
     view_lkml_file_dict: dict,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] = {},
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> None:
+    manifest_constants = manifest_constants or {}
     if "views" not in view_lkml_file_dict:
         return
 
@@ -507,9 +508,10 @@ def load_and_preprocess_file(
     path: Union[str, pathlib.Path],
     source_config: LookMLSourceConfig,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] = {},
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> dict:
+    manifest_constants = manifest_constants or {}
     parsed = load_lkml(path)
 
     process_lookml_template_language(
datahub/ingestion/source/looker/lookml_source.py

@@ -1006,8 +1006,9 @@ class LookMLSource(StatefulIngestionSourceBase):
     def report_skipped_unreachable_views(
         self,
         viewfile_loader: LookerViewFileLoader,
-        processed_view_map: Dict[str, Set[str]] = {},
+        processed_view_map: Optional[Dict[str, Set[str]]] = None,
     ) -> None:
+        processed_view_map = processed_view_map or {}
         view_files: Dict[str, List[pathlib.Path]] = {}
         for project, folder_path in self.base_projects_folder.items():
             folder = pathlib.Path(folder_path)
datahub/ingestion/source/metadata/lineage.py

@@ -104,8 +104,8 @@ class FineGrainedLineageConfig(ConfigModel):
 
 class EntityNodeConfig(ConfigModel):
     entity: EntityConfig
-    upstream: Optional[List["EntityNodeConfig"]]
-    fineGrainedLineages: Optional[List[FineGrainedLineageConfig]]
+    upstream: Optional[List["EntityNodeConfig"]] = None
+    fineGrainedLineages: Optional[List[FineGrainedLineageConfig]] = None
 
 
 # https://pydantic-docs.helpmanual.io/usage/postponed_annotations/ required for when you reference a model within itself
datahub/ingestion/source/neo4j/neo4j_source.py

@@ -292,7 +292,7 @@ class Neo4jSource(StatefulIngestionSourceBase):
         return record["properties"]
 
     def get_relationships(self, record: dict) -> dict:
-        return record.get("relationships", None)
+        return record.get("relationships", {})
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
datahub/ingestion/source/nifi.py

@@ -1234,11 +1234,14 @@ class NifiSource(StatefulIngestionSourceBase):
         job_type: str,
         description: Optional[str],
         job_properties: Optional[Dict[str, str]] = None,
-        inlets: List[str] = [],
-        outlets: List[str] = [],
-        inputJobs: List[str] = [],
+        inlets: Optional[List[str]] = None,
+        outlets: Optional[List[str]] = None,
+        inputJobs: Optional[List[str]] = None,
         status: Optional[str] = None,
     ) -> Iterable[MetadataWorkUnit]:
+        inlets = inlets or []
+        outlets = outlets or []
+        inputJobs = inputJobs or []
         logger.debug(f"Begining construction of job workunit for {job_urn}")
         if job_properties:
             job_properties = {k: v for k, v in job_properties.items() if v is not None}
datahub/ingestion/source/openapi_parser.py

@@ -167,7 +167,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     Try to determine if example data is defined for the endpoint, and return it
     """
     data = {}
-    if "content" in base_res.keys():
+    if "content" in base_res:
         res_cont = base_res["content"]
         if "application/json" in res_cont.keys():
             ex_field = None
@@ -188,7 +188,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
         )
     elif "text/csv" in res_cont.keys():
         data = res_cont["text/csv"]["schema"]
-    elif "examples" in base_res.keys():
+    elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
 
     return data
datahub/ingestion/source/powerbi/m_query/parser.py

@@ -2,7 +2,7 @@ import functools
 import importlib.resources as pkg_resource
 import logging
 import os
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import lark
 from lark import Lark, Tree
@@ -65,8 +65,9 @@ def get_upstream_tables(
     platform_instance_resolver: AbstractDataPlatformInstanceResolver,
     ctx: PipelineContext,
     config: PowerBiDashboardSourceConfig,
-    parameters: Dict[str, str] = {},
+    parameters: Optional[Dict[str, str]] = None,
 ) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
+    parameters = parameters or {}
     if table.expression is None:
         logger.debug(f"There is no M-Query expression in table {table.full_name}")
         return []
datahub/ingestion/source/powerbi/m_query/tree_function.py

@@ -70,13 +70,14 @@ def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
     return expression_tree
 
 
-def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
+def token_values(tree: Tree, parameters: Optional[Dict[str, str]] = None) -> List[str]:
     """
     :param tree: Tree to traverse
    :param parameters: If parameters is not an empty dict, it will try to resolve identifier variable references
                       using the values in 'parameters'.
     :return: List of leaf token data
     """
+    parameters = parameters or {}
     values: List[str] = []
 
     def internal(node: Union[Tree, Token]) -> None:
datahub/ingestion/source/powerbi/powerbi.py

@@ -890,9 +890,7 @@ class Mapper:
                     set(user_rights) & set(self.__config.ownership.owner_criteria)
                 )
                 > 0
-            ):
-                user_mcps.extend(self.to_datahub_user(user))
-            elif self.__config.ownership.owner_criteria is None:
+            ) or self.__config.ownership.owner_criteria is None:
                 user_mcps.extend(self.to_datahub_user(user))
             else:
                 continue
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py

@@ -380,8 +380,9 @@ class DataResolverBase(ABC):
     def itr_pages(
         self,
         endpoint: str,
-        parameter_override: Dict = {},
+        parameter_override: Optional[Dict] = None,
     ) -> Iterator[List[Dict]]:
+        parameter_override = parameter_override or {}
         params: dict = {
             "$skip": 0,
             "$top": self.TOP,
datahub/ingestion/source/powerbi_report_server/report_server.py

@@ -196,7 +196,7 @@ class PowerBiReportServerAPI:
         }
 
         reports: List[Any] = []
-        for report_type in report_types_mapping.keys():
+        for report_type in report_types_mapping:
             report_get_endpoint: str = API_ENDPOINTS[report_type]
             # Replace place holders
             report_get_endpoint_http = report_get_endpoint.format(
datahub/ingestion/source/qlik_sense/websocket_connection.py

@@ -17,8 +17,9 @@ class WebsocketConnection:
         self.handle = [-1]
 
     def _build_websocket_request_dict(
-        self, method: str, params: Union[Dict, List] = {}
+        self, method: str, params: Optional[Union[Dict, List]] = None
     ) -> Dict:
+        params = params or {}
         return {
             "jsonrpc": "2.0",
             "id": self.request_id,
@@ -37,11 +38,12 @@ class WebsocketConnection:
         return {}
 
     def websocket_send_request(
-        self, method: str, params: Union[Dict, List] = {}
+        self, method: str, params: Optional[Union[Dict, List]] = None
     ) -> Dict:
         """
         Method to send request to websocket
         """
+        params = params or {}
         self.request_id += 1
         request = self._build_websocket_request_dict(method, params)
         response = self._send_request(request=request)
datahub/ingestion/source/redash.py

@@ -421,8 +421,9 @@ class RedashSource(StatefulIngestionSourceBase):
         return database_name
 
     def _get_datasource_urns(
-        self, data_source: Dict, sql_query_data: Dict = {}
+        self, data_source: Dict, sql_query_data: Optional[Dict] = None
     ) -> Optional[List[str]]:
+        sql_query_data = sql_query_data or {}
         platform = self._get_platform_based_on_datasource(data_source)
         database_name = self._get_database_name_based_on_datasource(data_source)
         data_source_syntax = data_source.get("syntax")
datahub/ingestion/source/s3/config.py

@@ -154,10 +154,8 @@ class DataLakeSourceConfig(
         return path_specs
 
     @pydantic.validator("platform", always=True)
-    def platform_valid(cls, platform: str, values: dict) -> str:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_valid(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
datahub/ingestion/source/s3/source.py

@@ -834,7 +834,7 @@ class S3Source(StatefulIngestionSourceBase):
                 min=min,
             )
            folders.extend(folders_list)
-            if not path_spec.traversal_method == FolderTraversalMethod.ALL:
+            if path_spec.traversal_method != FolderTraversalMethod.ALL:
                return folders
        if folders:
            return folders
@@ -847,7 +847,7 @@ class S3Source(StatefulIngestionSourceBase):
         path_spec: PathSpec,
         bucket: "Bucket",
         prefix: str,
-    ) -> List[Folder]:
+    ) -> Iterable[Folder]:
         """
         Retrieves all the folders in a path by listing all the files in the prefix.
         If the prefix is a full path then only that folder will be extracted.
@@ -877,51 +877,30 @@ class S3Source(StatefulIngestionSourceBase):
         s3_objects = (
             obj
             for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-            if _is_allowed_path(path_spec, f"s3://{obj.bucket_name}/{obj.key}")
+            if _is_allowed_path(
+                path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+            )
         )
-
-        partitions: List[Folder] = []
         grouped_s3_objects_by_dirname = groupby_unsorted(
             s3_objects,
             key=lambda obj: obj.key.rsplit("/", 1)[0],
         )
-        for key, group in grouped_s3_objects_by_dirname:
-            file_size = 0
-            creation_time = None
-            modification_time = None
-
-            for item in group:
-                file_size += item.size
-                if creation_time is None or item.last_modified < creation_time:
-                    creation_time = item.last_modified
-                if modification_time is None or item.last_modified > modification_time:
-                    modification_time = item.last_modified
-                    max_file = item
-
-            if modification_time is None:
-                logger.warning(
-                    f"Unable to find any files in the folder {key}. Skipping..."
-                )
-                continue
-
-            id = path_spec.get_partition_from_path(
-                self.create_s3_path(max_file.bucket_name, max_file.key)
+        for _, group in grouped_s3_objects_by_dirname:
+            max_file = max(group, key=lambda x: x.last_modified)
+            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)
+
+            # If partition_id is None, it means the folder is not a partition
+            partition_id = path_spec.get_partition_from_path(max_file_s3_path)
+
+            yield Folder(
+                partition_id=partition_id,
+                is_partition=bool(partition_id),
+                creation_time=min(obj.last_modified for obj in group),
+                modification_time=max_file.last_modified,
+                sample_file=max_file_s3_path,
+                size=sum(obj.size for obj in group),
             )
 
-            # If id is None, it means the folder is not a partition
-            partitions.append(
-                Folder(
-                    partition_id=id,
-                    is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
-                    modification_time=modification_time,
-                    sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
-                    size=file_size,
-                )
-            )
-
-        return partitions
-
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
@@ -1000,7 +979,7 @@ class S3Source(StatefulIngestionSourceBase):
                 min=True,
             )
             dirs_to_process.append(dirs_to_process_min[0])
-        folders = []
+        folders: List[Folder] = []
         for dir in dirs_to_process:
             logger.info(f"Getting files from folder: {dir}")
             prefix_to_process = urlparse(dir).path.lstrip("/")
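The rewritten folder-listing loop above replaces the hand-rolled accumulation of sizes and timestamps with direct min/max/sum over each directory group and yields folders lazily instead of building a list. A standalone sketch of the same per-directory summarization, using a stand-in object type instead of boto3's ObjectSummary and itertools.groupby instead of DataHub's groupby_unsorted (so the input is sorted first and each group is materialized before being read more than once):

# Sketch only: fold each directory's objects into one summary record using
# min/max/sum, yielding results lazily instead of building a list.
from dataclasses import dataclass
from datetime import datetime, timezone
from itertools import groupby
from typing import Iterable, Iterator


@dataclass
class S3Object:  # stand-in for boto3's ObjectSummary
    key: str
    size: int
    last_modified: datetime


@dataclass
class FolderSummary:
    path: str
    creation_time: datetime
    modification_time: datetime
    size: int
    sample_file: str


def dirname(obj: S3Object) -> str:
    return obj.key.rsplit("/", 1)[0]


def summarize_folders(objects: Iterable[S3Object]) -> Iterator[FolderSummary]:
    for path, group_iter in groupby(sorted(objects, key=dirname), key=dirname):
        group = list(group_iter)  # groupby groups are single-pass iterators
        newest = max(group, key=lambda o: o.last_modified)
        yield FolderSummary(
            path=path,
            creation_time=min(o.last_modified for o in group),
            modification_time=newest.last_modified,
            size=sum(o.size for o in group),
            sample_file=newest.key,
        )


ts = lambda day: datetime(2024, 1, day, tzinfo=timezone.utc)
objs = [
    S3Object("data/p=1/a.parquet", 10, ts(1)),
    S3Object("data/p=1/b.parquet", 20, ts(3)),
    S3Object("data/p=2/c.parquet", 5, ts(2)),
]
for folder in summarize_folders(objs):
    print(folder.path, folder.size, folder.sample_file)
# data/p=1 30 data/p=1/b.parquet
# data/p=2 5 data/p=2/c.parquet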
datahub/ingestion/source/salesforce.py

@@ -615,7 +615,7 @@ class SalesforceSource(StatefulIngestionSourceBase):
             prefix = "\\" if text.startswith("#") else ""
             desc += f"\n\n{prefix}{text}"
 
-        text = field.get("InlineHelpText", None)
+        text = field.get("InlineHelpText")
         if text:
             prefix = "\\" if text.startswith("#") else ""
             desc += f"\n\n{prefix}{text}"
datahub/ingestion/source/schema_inference/object.py

@@ -149,7 +149,7 @@ def construct_schema(
 
     extended_schema: Dict[Tuple[str, ...], SchemaDescription] = {}
 
-    for field_path in schema.keys():
+    for field_path in schema:
         field_types = schema[field_path]["types"]
         field_type: Union[str, type] = "mixed"
 
datahub/ingestion/source/snowflake/snowflake_connection.py

@@ -125,7 +125,7 @@ class SnowflakeConnectionConfig(ConfigModel):
 
     @pydantic.validator("authentication_type", always=True)
     def authenticator_type_is_valid(cls, v, values):
-        if v not in _VALID_AUTH_TYPES.keys():
+        if v not in _VALID_AUTH_TYPES:
             raise ValueError(
                 f"unsupported authenticator type '{v}' was provided,"
                 f" use one of {list(_VALID_AUTH_TYPES.keys())}"
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -439,7 +439,7 @@ class SnowflakeV2Source(
                     failure_reason=failure_message,
                 )
 
-            if c in _report.keys():
+            if c in _report:
                 continue
 
             # If some capabilities are missing, then mark them as not capable
datahub/ingestion/source/sql/athena.py

@@ -396,7 +396,7 @@ class AthenaSource(SQLAlchemySource):
             metadata.table_type if metadata.table_type else ""
         )
 
-        location: Optional[str] = custom_properties.get("location", None)
+        location: Optional[str] = custom_properties.get("location")
         if location is not None:
             if location.startswith("s3://"):
                 location = make_s3_urn(location, self.config.env)
@@ -538,7 +538,7 @@ class AthenaSource(SQLAlchemySource):
                 column_name=column["name"],
                 column_type=column["type"],
                 inspector=inspector,
-                description=column.get("comment", None),
+                description=column.get("comment"),
                 nullable=column.get("nullable", True),
                 is_part_of_key=(
                     True
datahub/ingestion/source/sql/sql_common.py

@@ -204,7 +204,7 @@ def get_column_type(
     """
 
     TypeClass: Optional[Type] = None
-    for sql_type in _field_type_mapping.keys():
+    for sql_type in _field_type_mapping:
         if isinstance(column_type, sql_type):
             TypeClass = _field_type_mapping[sql_type]
             break
@@ -973,7 +973,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                     inspector=inspector,
                 )
             ),
-            description=column.get("comment", None),
+            description=column.get("comment"),
             nullable=column["nullable"],
             recursive=False,
             globalTags=gtc,
datahub/ingestion/source/sql/sql_types.py

@@ -317,10 +317,10 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
     match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
     if match:
         modified_type_base = match.group(1)  # Extract the base type
-        return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base)
 
     # Fallback for types without precision/scale
-    return SNOWFLAKE_TYPES_MAP.get(type_string, None)
+    return SNOWFLAKE_TYPES_MAP.get(type_string)
 
 
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32