acryl-datahub 1.0.0rc6__py3-none-any.whl → 1.0.0rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

Files changed (74)
  1. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/METADATA +2490 -2490
  2. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/RECORD +74 -74
  3. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/docker_cli.py +1 -1
  6. datahub/cli/iceberg_cli.py +1 -1
  7. datahub/cli/lite_cli.py +4 -2
  8. datahub/cli/specific/dataproduct_cli.py +1 -1
  9. datahub/configuration/git.py +1 -3
  10. datahub/configuration/kafka.py +1 -1
  11. datahub/ingestion/fs/s3_fs.py +2 -2
  12. datahub/ingestion/glossary/classification_mixin.py +1 -1
  13. datahub/ingestion/graph/client.py +16 -7
  14. datahub/ingestion/graph/entity_versioning.py +3 -3
  15. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  16. datahub/ingestion/source/abs/config.py +2 -4
  17. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  18. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
  19. datahub/ingestion/source/cassandra/cassandra_api.py +2 -1
  20. datahub/ingestion/source/csv_enricher.py +3 -3
  21. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  22. datahub/ingestion/source/dremio/dremio_api.py +3 -3
  23. datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
  24. datahub/ingestion/source/file.py +5 -2
  25. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  26. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  27. datahub/ingestion/source/ge_data_profiler.py +11 -14
  28. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  29. datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
  30. datahub/ingestion/source/identity/okta.py +1 -3
  31. datahub/ingestion/source/kafka/kafka.py +1 -1
  32. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
  33. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  34. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  35. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  36. datahub/ingestion/source/looker/lookml_source.py +3 -2
  37. datahub/ingestion/source/metabase.py +54 -32
  38. datahub/ingestion/source/metadata/lineage.py +2 -2
  39. datahub/ingestion/source/mode.py +1 -1
  40. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  41. datahub/ingestion/source/nifi.py +6 -3
  42. datahub/ingestion/source/openapi_parser.py +2 -2
  43. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  44. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  45. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  46. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  47. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  48. datahub/ingestion/source/pulsar.py +2 -2
  49. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  50. datahub/ingestion/source/redash.py +2 -1
  51. datahub/ingestion/source/s3/config.py +2 -4
  52. datahub/ingestion/source/s3/source.py +20 -41
  53. datahub/ingestion/source/salesforce.py +1 -1
  54. datahub/ingestion/source/schema_inference/object.py +1 -1
  55. datahub/ingestion/source/sigma/sigma.py +1 -1
  56. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  57. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  58. datahub/ingestion/source/sql/athena.py +2 -2
  59. datahub/ingestion/source/sql/druid.py +1 -5
  60. datahub/ingestion/source/sql/sql_common.py +2 -2
  61. datahub/ingestion/source/sql/sql_types.py +2 -2
  62. datahub/ingestion/source/sql/teradata.py +4 -2
  63. datahub/ingestion/source/sql/trino.py +2 -2
  64. datahub/ingestion/source/superset.py +65 -37
  65. datahub/ingestion/source/tableau/tableau.py +3 -6
  66. datahub/ingestion/source/tableau/tableau_common.py +2 -1
  67. datahub/lite/duckdb_lite.py +5 -10
  68. datahub/lite/lite_local.py +1 -1
  69. datahub/lite/lite_util.py +4 -3
  70. datahub/sdk/dataset.py +3 -3
  71. datahub/utilities/memory_footprint.py +3 -2
  72. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/LICENSE +0 -0
  73. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/entry_points.txt +0 -0
  74. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/top_level.txt +0 -0
@@ -330,7 +330,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         aspect_type_name: Optional[str] = None,
         version: int = 0,
     ) -> Optional[Aspect]:
-        assert aspect_type.ASPECT_NAME == aspect
+        assert aspect == aspect_type.ASPECT_NAME
         return self.get_aspect(
             entity_urn=entity_urn,
             aspect_type=aspect_type,
@@ -1547,7 +1547,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         return fragment
 
     def _run_assertion_build_params(
-        self, params: Optional[Dict[str, str]] = {}
+        self, params: Optional[Dict[str, str]] = None
     ) -> List[Any]:
         if params is None:
             return []
@@ -1566,9 +1566,11 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         self,
         urn: str,
         save_result: bool = True,
-        parameters: Optional[Dict[str, str]] = {},
+        parameters: Optional[Dict[str, str]] = None,
         async_flag: bool = False,
     ) -> Dict:
+        if parameters is None:
+            parameters = {}
         params = self._run_assertion_build_params(parameters)
         graph_query: str = """
             %s
@@ -1597,9 +1599,11 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         self,
         urns: List[str],
         save_result: bool = True,
-        parameters: Optional[Dict[str, str]] = {},
+        parameters: Optional[Dict[str, str]] = None,
         async_flag: bool = False,
     ) -> Dict:
+        if parameters is None:
+            parameters = {}
         params = self._run_assertion_build_params(parameters)
         graph_query: str = """
             %s
@@ -1636,10 +1640,14 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def run_assertions_for_asset(
         self,
         urn: str,
-        tag_urns: Optional[List[str]] = [],
-        parameters: Optional[Dict[str, str]] = {},
+        tag_urns: Optional[List[str]] = None,
+        parameters: Optional[Dict[str, str]] = None,
         async_flag: bool = False,
     ) -> Dict:
+        if tag_urns is None:
+            tag_urns = []
+        if parameters is None:
+            parameters = {}
         params = self._run_assertion_build_params(parameters)
         graph_query: str = """
             %s
@@ -1677,9 +1685,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         self,
         entity_name: str,
         urns: List[str],
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         with_system_metadata: bool = False,
     ) -> Dict[str, Any]:
+        aspects = aspects or []
         payload = {
             "urns": urns,
             "aspectNames": aspects,
@@ -93,7 +93,7 @@ class EntityVersioningAPI(DataHubGraphProtocol):
         try:
             return response["linkAssetVersion"]["urn"]
         except KeyError:
-            raise ValueError(f"Unexpected response: {response}")
+            raise ValueError(f"Unexpected response: {response}") from None
 
     def link_asset_to_versioned_asset(
         self,
@@ -165,7 +165,7 @@ class EntityVersioningAPI(DataHubGraphProtocol):
         try:
             return response["unlinkAssetVersion"]["urn"]
         except KeyError:
-            raise ValueError(f"Unexpected response: {response}")
+            raise ValueError(f"Unexpected response: {response}") from None
 
     def unlink_latest_asset_from_version_set(
         self, version_set_urn: str
@@ -198,4 +198,4 @@ class EntityVersioningAPI(DataHubGraphProtocol):
         try:
             return response["unlinkAssetVersion"]["urn"]
         except KeyError:
-            raise ValueError(f"Unexpected response: {response}")
+            raise ValueError(f"Unexpected response: {response}") from None
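
The "from None" suffixes added here, and the "from e" suffixes in later hunks (csv_enricher, dremio_api, kafka), control exception chaining (PEP 3134). A small illustrative sketch, not taken from the package:

    import urllib.request

    def parse_response(response: dict) -> str:
        try:
            return response["linkAssetVersion"]["urn"]
        except KeyError:
            # "from None" suppresses the implicit "During handling of the above
            # exception, another exception occurred" chain; the KeyError adds
            # nothing beyond the message below.
            raise ValueError(f"Unexpected response: {response}") from None

    def read_remote(url: str) -> bytes:
        try:
            return urllib.request.urlopen(url).read()
        except Exception as e:
            # "from e" keeps the original error attached as __cause__, so the
            # root cause still shows up in the traceback.
            raise RuntimeError(f"Cannot read remote file {url}") from e
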
@@ -163,12 +163,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
                 key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
                 for key, value in obj.items()
             }
-        elif isinstance(obj, list):
-            return [
-                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
-                for element in obj
-            ]
-        elif isinstance(obj, set):
+        elif isinstance(obj, list) or isinstance(obj, set):
             return [
                 DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
                 for element in obj
@@ -144,10 +144,8 @@ class DataLakeSourceConfig(
         return path_specs
 
     @pydantic.validator("platform", always=True)
-    def platform_not_empty(cls, platform: str, values: dict) -> str:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_not_empty(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
@@ -165,7 +165,7 @@ class BigQueryTableRef:
     @classmethod
     def from_spec_obj(cls, spec: dict) -> "BigQueryTableRef":
         for key in ["projectId", "datasetId", "tableId"]:
-            if key not in spec.keys():
+            if key not in spec:
                 raise ValueError(f"invalid BigQuery table reference dict: {spec}")
 
         return cls(
@@ -344,7 +344,7 @@ class BigQuerySchemaApi:
         with_partitions: bool = False,
     ) -> Iterator[BigqueryTable]:
         with PerfTimer() as current_timer:
-            filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys())
+            filter_clause: str = ", ".join(f"'{table}'" for table in tables)
 
             if with_partitions:
                 query_template = BigqueryQuery.tables_for_dataset
@@ -159,7 +159,8 @@ class CassandraAPI:
             self.report.failure(message="Failed to authenticate to Cassandra", exc=e)
             return False
 
-    def get(self, query: str, parameters: Optional[List] = []) -> List:
+    def get(self, query: str, parameters: Optional[List] = None) -> List:
+        parameters = parameters or []
         if not self._cassandra_session:
             return []
 
@@ -314,7 +314,7 @@ class CSVEnricherSource(Source):
             "datajob": EditableDataJobPropertiesClass,
             "dataflow": EditableDataFlowPropertiesClass,
             "notebook": EditableNotebookPropertiesClass,
-        }.get(entityType, None)
+        }.get(entityType)
 
         if not entityClass:
             raise ValueError(
@@ -640,8 +640,8 @@ class CSVEnricherSource(Source):
                 )
             except Exception as e:
                 raise ConfigurationError(
-                    f"Cannot read remote file {self.config.filename}, error:{e}"
-                )
+                    f"Cannot read remote file {self.config.filename}: {e}"
+                ) from e
         else:
             with open(pathlib.Path(self.config.filename), encoding="utf-8-sig") as f:
                 rows = list(csv.DictReader(f, delimiter=self.config.delimiter))
@@ -1033,7 +1033,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             cll_nodes.add(dbt_name)
             schema_nodes.add(dbt_name)
 
-        for dbt_name in all_nodes_map.keys():
+        for dbt_name in all_nodes_map:
             if self._is_allowed_node(dbt_name):
                 add_node_to_cll_list(dbt_name)
 
@@ -271,12 +271,12 @@ class DremioAPIOperations:
                 self.cancel_query(job_id)
                 raise DremioAPIException(
                     f"Query execution timed out after {timeout} seconds"
-                )
+                ) from None
             except RuntimeError as e:
-                raise DremioAPIException(f"{str(e)}")
+                raise DremioAPIException() from e
 
         except requests.RequestException as e:
-            raise DremioAPIException(f"Error executing query: {str(e)}")
+            raise DremioAPIException("Error executing query") from e
 
     def fetch_results(self, job_id: str) -> List[Dict]:
         """Fetch job results with status checking"""
@@ -168,8 +168,9 @@ class DremioAspects:
         )
 
     def get_container_urn(
-        self, name: Optional[str] = None, path: Optional[List[str]] = []
+        self, name: Optional[str] = None, path: Optional[List[str]] = None
     ) -> str:
+        path = path or []
         container_key = self.get_container_key(name, path)
         return container_key.as_urn()
 
@@ -410,10 +410,13 @@ def _from_obj_for_file(
         item = MetadataChangeEvent.from_obj(obj)
     elif "aspect" in obj:
         item = MetadataChangeProposalWrapper.from_obj(obj)
-    else:
+    elif "bucket" in obj:
         item = UsageAggregationClass.from_obj(obj)
+    else:
+        raise ValueError(f"Unknown object type: {obj}")
+
     if not item.validate():
-        raise ValueError(f"failed to parse: {obj}")
+        raise ValueError(f"Failed to parse: {obj}")
 
     if isinstance(item, UsageAggregationClass):
         logger.warning(f"Dropping deprecated UsageAggregationClass: {item}")
@@ -498,7 +498,7 @@ class DataProcessCleanup:
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
-            for key in dataFlows.keys():
+            for key in dataFlows:
                 if not dataJobs.get(key) or len(dataJobs[key]) == 0:
                     logger.info(
                         f"Deleting dataflow {key} because there are not datajobs"
@@ -130,8 +130,9 @@ class DatahubExecutionRequestCleanup:
         )
 
     def _scroll_execution_requests(
-        self, overrides: Dict[str, Any] = {}
+        self, overrides: Optional[Dict[str, Any]] = None
     ) -> Iterator[CleanupRecord]:
+        overrides = overrides or {}
         headers: Dict[str, Any] = {
             "Accept": "application/json",
             "Content-Type": "application/json",
@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
             ).select_from(self._table)
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == BIGQUERY:
-        element_values = self.engine.execute(
-            sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
-                self._table
-            )
-        )
-        return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == SNOWFLAKE:
+    elif (
+        self.engine.dialect.name.lower() == BIGQUERY
+        or self.engine.dialect.name.lower() == SNOWFLAKE
+    ):
         element_values = self.engine.execute(
             sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
                 self._table
@@ -381,13 +377,14 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             col = col_dict["name"]
             self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
-            if not self.config._allow_deny_patterns.allowed(
-                f"{self.dataset_name}.{col}"
+            if (
+                not self.config._allow_deny_patterns.allowed(
+                    f"{self.dataset_name}.{col}"
+                )
+                or not self.config.profile_nested_fields
+                and "." in col
             ):
                 ignored_columns_by_pattern.append(col)
-            # We try to ignore nested columns as well
-            elif not self.config.profile_nested_fields and "." in col:
-                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:
@@ -1408,7 +1405,7 @@ class DatahubGEProfiler:
             },
         )
 
-        if platform == BIGQUERY or platform == DATABRICKS:
+        if platform in (BIGQUERY, DATABRICKS):
             # This is done as GE makes the name as DATASET.TABLE
             # but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
             name_parts = pretty_name.split(".")
@@ -2,8 +2,9 @@ import json
 import logging
 import threading
 import uuid
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
+from dateutil import parser as dateutil_parser
 from pyiceberg.catalog import Catalog
 from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
@@ -81,6 +82,7 @@ from datahub.metadata.schema_classes import (
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
+    TimeStampClass,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -183,16 +185,9 @@ class IcebergSource(StatefulIngestionSourceBase):
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()
 
-        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
-            LOGGER.debug(f"Processing dataset for path {dataset_path}")
-            dataset_name = ".".join(dataset_path)
-            if not self.config.table_pattern.allowed(dataset_name):
-                # Dataset name is rejected by pattern, report as dropped.
-                self.report.report_dropped(dataset_name)
-                LOGGER.debug(
-                    f"Skipping table {dataset_name} due to not being allowed by the config pattern"
-                )
-                return
+        def _try_processing_dataset(
+            dataset_path: Tuple[str, ...], dataset_name: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
                     LOGGER.debug(
@@ -248,10 +243,31 @@ class IcebergSource(StatefulIngestionSourceBase):
                 LOGGER.warning(
                     f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
                 )
+            except ValueError as e:
+                if "Could not initialize FileIO" not in str(e):
+                    raise
+                self.report.warning(
+                    "Could not initialize FileIO",
+                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                )
+
+        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+            try:
+                LOGGER.debug(f"Processing dataset for path {dataset_path}")
+                dataset_name = ".".join(dataset_path)
+                if not self.config.table_pattern.allowed(dataset_name):
+                    # Dataset name is rejected by pattern, report as dropped.
+                    self.report.report_dropped(dataset_name)
+                    LOGGER.debug(
+                        f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+                    )
+                    return
+
+                yield from _try_processing_dataset(dataset_path, dataset_name)
             except Exception as e:
                 self.report.report_failure(
                     "general",
-                    f"Failed to create workunit for dataset {dataset_name}: {e}",
+                    f"Failed to create workunit for dataset {dataset_path}: {e}",
                 )
                 LOGGER.exception(
                     f"Exception while processing table {dataset_path}, skipping it.",
@@ -288,6 +304,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         )
 
         # Dataset properties aspect.
+        additional_properties = {}
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
@@ -299,10 +316,27 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["manifest-list"] = (
                 table.current_snapshot().manifest_list
             )
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
         )
         dataset_snapshot.aspects.append(dataset_properties)
         # Dataset ownership aspect.
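
The new created/lastModified handling converts the Iceberg "created-at" table property (an ISO-8601 string) and the current snapshot's timestamp_ms into epoch-millisecond values for TimeStampClass. A standalone sketch of that conversion with dateutil; the property value below is an example, not taken from a real table:

    from dateutil import parser as dateutil_parser

    # Example value of the Iceberg "created-at" table property.
    created_at = "2024-05-17T09:30:00.123456+00:00"

    # isoparse returns a timezone-aware datetime; timestamp() gives seconds
    # since the epoch, so multiply by 1000 and truncate for milliseconds.
    created_ms = int(dateutil_parser.isoparse(created_at).timestamp() * 1000)
    print(created_ms)  # 1715938200123
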
@@ -1,4 +1,5 @@
 import logging
+import threading
 from dataclasses import dataclass, field
 from typing import Any, Dict, Optional
 
@@ -156,18 +157,21 @@ class TopTableTimings:
     def __init__(self, size: int = 10):
         self._size = size
         self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+        self._lock = threading.Lock()
 
     def add(self, entity: Dict[str, Any]) -> None:
         if self._VALUE_FIELD not in entity:
             return
-        self.top_entites.add(entity)
-        if len(self.top_entites) > self._size:
-            self.top_entites.pop()
+        with self._lock:
+            self.top_entites.add(entity)
+            if len(self.top_entites) > self._size:
+                self.top_entites.pop()
 
     def __str__(self) -> str:
-        if len(self.top_entites) == 0:
-            return "no timings reported"
-        return str(list(self.top_entites))
+        with self._lock:
+            if len(self.top_entites) == 0:
+                return "no timings reported"
+            return str(list(self.top_entites))
 
 
 class TimingClass:
@@ -175,24 +179,31 @@ class TimingClass:
 
     def __init__(self):
         self.times = SortedList()
+        self._lock = threading.Lock()
 
     def add_timing(self, t: float) -> None:
-        self.times.add(t)
+        with self._lock:
+            self.times.add(t)
 
     def __str__(self) -> str:
-        if len(self.times) == 0:
-            return "no timings reported"
-        total = sum(self.times)
-        avg = total / len(self.times)
-        return str(
-            {
-                "average_time": format_timespan(avg, detailed=True, max_units=3),
-                "min_time": format_timespan(self.times[0], detailed=True, max_units=3),
-                "max_time": format_timespan(self.times[-1], detailed=True, max_units=3),
-                # total_time does not provide correct information in case we run in more than 1 thread
-                "total_time": format_timespan(total, detailed=True, max_units=3),
-            }
-        )
+        with self._lock:
+            if len(self.times) == 0:
+                return "no timings reported"
+            total = sum(self.times)
+            avg = total / len(self.times)
+            return str(
+                {
+                    "average_time": format_timespan(avg, detailed=True, max_units=3),
+                    "min_time": format_timespan(
                        self.times[0], detailed=True, max_units=3
+                    ),
+                    "max_time": format_timespan(
                        self.times[-1], detailed=True, max_units=3
+                    ),
+                    # total_time does not provide correct information in case we run in more than 1 thread
+                    "total_time": format_timespan(total, detailed=True, max_units=3),
+                }
+            )
 
 
 @dataclass
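
Both timing helpers now guard their shared SortedList with a threading.Lock, since the Iceberg source adds timings from multiple worker threads and a reader rendering the list can otherwise race with a writer. A reduced sketch of the same pattern, with the class trimmed down to its essentials:

    import threading

    from sortedcontainers import SortedList

    class Timings:
        def __init__(self) -> None:
            self.times = SortedList()
            self._lock = threading.Lock()  # serializes writers and readers

        def add_timing(self, t: float) -> None:
            with self._lock:
                self.times.add(t)

        def __str__(self) -> str:
            with self._lock:
                if not self.times:
                    return "no timings reported"
                # Copy under the lock so a concurrent add_timing cannot
                # mutate the list while it is being rendered.
                return str(list(self.times))
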
@@ -568,9 +568,7 @@ class OktaSource(StatefulIngestionSourceBase):
         if (
             self.config.include_deprovisioned_users is False
             and okta_user.status == UserStatus.DEPROVISIONED
-        ):
-            return False
-        elif (
+        ) or (
             self.config.include_suspended_users is False
             and okta_user.status == UserStatus.SUSPENDED
         ):
@@ -272,7 +272,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             return schema_registry_class.create(config, report)
         except Exception as e:
             logger.debug(e, exc_info=e)
-            raise ImportError(config.schema_registry_class)
+            raise ImportError(config.schema_registry_class) from e
 
     def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
@@ -447,13 +447,10 @@ class DebeziumSourceConnector(BaseConnector):
     ) -> DebeziumParser:
         connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")
 
-        if connector_class == "io.debezium.connector.mysql.MySqlConnector":
-            parser = self.DebeziumParser(
-                source_platform="mysql",
-                server_name=self.get_server_name(connector_manifest),
-                database_name=None,
-            )
-        elif connector_class == "MySqlConnector":
+        if (
+            connector_class == "io.debezium.connector.mysql.MySqlConnector"
+            or connector_class == "MySqlConnector"
+        ):
             parser = self.DebeziumParser(
                 source_platform="mysql",
                 server_name=self.get_server_name(connector_manifest),
@@ -33,14 +33,14 @@ class LookerViewFileLoader:
         base_projects_folder: Dict[str, pathlib.Path],
         reporter: LookMLSourceReport,
         source_config: LookMLSourceConfig,
-        manifest_constants: Dict[str, LookerConstant] = {},
+        manifest_constants: Optional[Dict[str, LookerConstant]] = None,
     ) -> None:
         self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
         self._root_project_name = root_project_name
         self._base_projects_folder = base_projects_folder
         self.reporter = reporter
         self.source_config = source_config
-        self.manifest_constants = manifest_constants
+        self.manifest_constants = manifest_constants or {}
 
     def _load_viewfile(
         self, project_name: str, path: str, reporter: LookMLSourceReport
@@ -205,8 +205,9 @@ class LookerAPI:
     def folder_ancestors(
         self,
         folder_id: str,
-        fields: Union[str, List[str]] = ["id", "name", "parent_id"],
+        fields: Optional[Union[str, List[str]]] = None,
     ) -> Sequence[Folder]:
+        fields = fields or ["id", "name", "parent_id"]
         self.client_stats.folder_calls += 1
         try:
             return self.client.folder_ancestors(
@@ -464,9 +464,10 @@ def process_lookml_template_language(
     source_config: LookMLSourceConfig,
     view_lkml_file_dict: dict,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] = {},
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> None:
+    manifest_constants = manifest_constants or {}
     if "views" not in view_lkml_file_dict:
         return
 
@@ -507,9 +508,10 @@ def load_and_preprocess_file(
     path: Union[str, pathlib.Path],
     source_config: LookMLSourceConfig,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] = {},
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> dict:
+    manifest_constants = manifest_constants or {}
     parsed = load_lkml(path)
 
     process_lookml_template_language(
@@ -501,7 +501,7 @@ class LookMLSource(StatefulIngestionSourceBase):
             raise ValueError(
                 f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                 f"in your config file"
-            )
+            ) from None
 
     def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]:
         manifest_file = folder / "manifest.lkml"
@@ -1006,8 +1006,9 @@ class LookMLSource(StatefulIngestionSourceBase):
     def report_skipped_unreachable_views(
         self,
         viewfile_loader: LookerViewFileLoader,
-        processed_view_map: Dict[str, Set[str]] = {},
+        processed_view_map: Optional[Dict[str, Set[str]]] = None,
     ) -> None:
+        processed_view_map = processed_view_map or {}
         view_files: Dict[str, List[pathlib.Path]] = {}
         for project, folder_path in self.base_projects_folder.items():
             folder = pathlib.Path(folder_path)