acryl-datahub 1.0.0.2rc1__py3-none-any.whl → 1.0.0.2rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.

@@ -0,0 +1,297 @@
+import logging
+import re
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
+    HEX_PLATFORM_URN,
+)
+from datahub.metadata.schema_classes import QueryPropertiesClass, QuerySubjectsClass
+from datahub.metadata.urns import DatasetUrn, QueryUrn, SchemaFieldUrn
+from datahub.sdk.main_client import DataHubClient
+from datahub.sdk.search_filters import FilterDsl as F
+from datahub.utilities.time import datetime_to_ts_millis
+
+logger = logging.getLogger(__name__)
+
+# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+
+
+@dataclass
+class QueryResponse:
+    """This is the public response model for the HexQueryFetcher."""
+
+    urn: QueryUrn
+    hex_project_id: str
+    dataset_subjects: List[DatasetUrn] = field(default_factory=list)
+    schema_field_subjects: List[SchemaFieldUrn] = field(default_factory=list)
+
+
+@dataclass
+class HexQueryFetcherReport(SourceReport):
+    start_datetime: Optional[datetime] = None
+    end_datetime: Optional[datetime] = None
+    fetched_query_urns: int = 0
+    fetched_query_objects: int = 0
+    filtered_out_queries_missing_metadata: int = 0
+    filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_subjects: int = 0
+    total_queries: int = 0
+    total_dataset_subjects: int = 0
+    total_schema_field_subjects: int = 0
+    num_calls_fetch_query_entities: int = 0
+
+
+class HexQueryFetcher:
+    def __init__(
+        self,
+        datahub_client: DataHubClient,
+        workspace_name: str,
+        start_datetime: datetime,
+        end_datetime: datetime,
+        report: HexQueryFetcherReport,
+        page_size: int = DATAHUB_API_PAGE_SIZE_DEFAULT,
+    ):
+        self.datahub_client = datahub_client
+        self.workspace_name = workspace_name
+        self.start_datetime = start_datetime
+        self.end_datetime = end_datetime
+        self.report = report
+        self.page_size = page_size
+
+        self.report.start_datetime = start_datetime
+        self.report.end_datetime = end_datetime
+
+    def fetch(self) -> Iterable[QueryResponse]:
+        try:
+            query_urns = self._fetch_query_urns_filter_hex_and_last_modified()
+            assert all(isinstance(urn, QueryUrn) for urn in query_urns)
+            self.report.fetched_query_urns = len(query_urns)
+
+            entities_by_urn = self._fetch_query_entities(query_urns)
+            self.report.fetched_query_objects = len(entities_by_urn)
+        except Exception as e:
+            self.report.failure(
+                title="Error fetching Queries for lineage",
+                message="Error fetching Queries will result in missing lineage",
+                context=str(
+                    dict(
+                        workspace_name=self.workspace_name,
+                        start_datetime=self.start_datetime,
+                        end_datetime=self.end_datetime,
+                    )
+                ),
+                exc=e,
+            )
+        else:
+            if not query_urns or not entities_by_urn:
+                self.report.warning(
+                    title="No Queries found with Hex as origin",
+                    message="No lineage because no Queries with Hex as origin were found in the given time range; you may consider extending the time range to fetch more queries.",
+                    context=str(
+                        dict(
+                            workspace_name=self.workspace_name,
+                            start_datetime=self.start_datetime,
+                            end_datetime=self.end_datetime,
+                        )
+                    ),
+                )
+                return
+
+            for query_urn, (
+                query_properties,
+                query_subjects,
+            ) in entities_by_urn.items():
+                maybe_query_response = self._build_query_response(
+                    query_urn=query_urn,
+                    query_properties=query_properties,
+                    query_subjects=query_subjects,
+                )
+                if maybe_query_response:
+                    yield maybe_query_response
+
+    def _fetch_query_entities(
+        self, query_urns: List[QueryUrn]
+    ) -> Dict[
+        QueryUrn, Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]]
+    ]:
+        entities_by_urn: Dict[
+            QueryUrn,
+            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
+        ] = {}
+        for i in range(0, len(query_urns), self.page_size):
+            batch = query_urns[i : i + self.page_size]
+
+            logger.debug(f"Fetching query entities for {len(batch)} queries: {batch}")
+            entities = self.datahub_client._graph.get_entities(
+                entity_name=QueryUrn.ENTITY_TYPE,
+                urns=[urn.urn() for urn in batch],
+                aspects=[
+                    QueryPropertiesClass.ASPECT_NAME,
+                    QuerySubjectsClass.ASPECT_NAME,
+                ],
+                with_system_metadata=False,
+            )
+            self.report.num_calls_fetch_query_entities += 1
+            logger.debug(f"Get entities response: {entities}")
+
+            for urn, entity in entities.items():
+                query_urn = QueryUrn.from_string(urn)
+
+                properties_tuple = entity.get(
+                    QueryPropertiesClass.ASPECT_NAME, (None, None)
+                )
+                query_properties: Optional[QueryPropertiesClass] = None
+                if properties_tuple and properties_tuple[0]:
+                    assert isinstance(properties_tuple[0], QueryPropertiesClass)
+                    query_properties = properties_tuple[0]
+
+                subjects_tuple = entity.get(
+                    QuerySubjectsClass.ASPECT_NAME, (None, None)
+                )
+                query_subjects: Optional[QuerySubjectsClass] = None
+                if subjects_tuple and subjects_tuple[0]:
+                    assert isinstance(subjects_tuple[0], QuerySubjectsClass)
+                    query_subjects = subjects_tuple[0]
+
+                entities_by_urn[query_urn] = (query_properties, query_subjects)
+
+        return entities_by_urn
+
+    def _fetch_query_urns_filter_hex_and_last_modified(self) -> List[QueryUrn]:
+        last_modified_start_at_millis = datetime_to_ts_millis(self.start_datetime)
+        last_modified_end_at_millis = datetime_to_ts_millis(self.end_datetime)
+
+        urns = self.datahub_client.search.get_urns(
+            filter=F.and_(
+                F.entity_type(QueryUrn.ENTITY_TYPE),
+                F.custom_filter("origin", "EQUAL", [HEX_PLATFORM_URN.urn()]),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "GREATER_THAN_OR_EQUAL_TO",
+                    [str(last_modified_start_at_millis)],
+                ),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "LESS_THAN_OR_EQUAL_TO",
+                    [str(last_modified_end_at_millis)],
+                ),
+            ),
+        )
+        logger.debug(f"Get URNS by filter: {urns}")
+        return [QueryUrn.from_string(urn.urn()) for urn in urns]
+
+    def _extract_hex_metadata(self, sql_statement: str) -> Optional[Tuple[str, str]]:
+        """
+        Extract project ID and workspace name from a SQL statement.
+
+        Looks for Hex metadata in SQL comments in the format:
+        -- Hex query metadata: {"project_id": "...", "project_url": "https://app.hex.tech/{workspace_name}/hex/..."}
+
+        Example:
+        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}
+
+        # TODO: Consider supporting multiline metadata format in the future:
+        # -- Hex query metadata: {
+        # -- "categories": ["Scratchpad"],
+        # -- "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
+        # -- ...
+        # -- }
+
+        Returns:
+            A tuple of (project_id, workspace_name) if both are successfully extracted
+            None if extraction fails for any reason
+        """
+        # Extract both project_id and workspace name in a single regex operation
+        match = re.search(HEX_METADATA_PATTERN, sql_statement)
+
+        if not match:
+            return None
+
+        try:
+            project_id = match.group(1)
+            workspace_name = match.group(2)
+            return project_id, workspace_name
+        except (IndexError, AttributeError) as e:
+            self.report.warning(
+                title="Failed to extract information from Hex query metadata",
+                message="Failed to extract information from Hex query metadata; this will result in missing lineage",
+                context=sql_statement,
+                exc=e,
+            )
+
+        return None
+
+    def _build_query_response(
+        self,
+        query_urn: QueryUrn,
+        query_properties: Optional[QueryPropertiesClass],
+        query_subjects: Optional[QuerySubjectsClass],
+    ) -> Optional[QueryResponse]:
+        # Skip if missing required aspects
+        if (
+            not query_properties
+            or not query_properties.statement
+            or not query_properties.statement.value
+            or not query_subjects
+            or query_subjects.subjects is None  # empty list is allowed
+        ):
+            logger.debug(
+                f"Skipping query {query_urn} - missing required fields: {(query_properties, query_subjects)}"
+            )
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        # Extract hex metadata (project_id and workspace_name)
+        metadata_result = self._extract_hex_metadata(query_properties.statement.value)
+        if not metadata_result:
+            logger.debug(f"Skipping query {query_urn} - failed to extract Hex metadata")
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        hex_project_id, workspace_from_url = metadata_result
+
+        # Validate workspace
+        if workspace_from_url != self.workspace_name:
+            logger.debug(
+                f"Skipping query {query_urn} - workspace '{workspace_from_url}' doesn't match '{self.workspace_name}'"
+            )
+            self.report.filtered_out_queries_different_workspace += 1
+            return None
+
+        # Extract subjects
+        dataset_subjects: List[DatasetUrn] = []
+        schema_field_subjects: List[SchemaFieldUrn] = []
+        for subject in query_subjects.subjects:
+            if subject.entity and subject.entity.startswith("urn:li:dataset:"):
+                dataset_subjects.append(DatasetUrn.from_string(subject.entity))
+            elif subject.entity and subject.entity.startswith("urn:li:schemaField:"):
+                schema_field_subjects.append(SchemaFieldUrn.from_string(subject.entity))
+
+        if not dataset_subjects and not schema_field_subjects:
+            self.report.filtered_out_queries_no_subjects += 1
+            return None
+
+        # Create response
+        response = QueryResponse(
+            urn=query_urn,
+            hex_project_id=hex_project_id,
+            dataset_subjects=dataset_subjects,
+            schema_field_subjects=schema_field_subjects,
+        )
+        logger.debug(
+            f"Successfully extracted {len(dataset_subjects)} dataset subjects and {len(schema_field_subjects)} schema field subjects for query {query_urn}: {dataset_subjects} {schema_field_subjects}"
+        )
+        self.report.total_queries += 1
+        self.report.total_dataset_subjects += len(dataset_subjects)
+        self.report.total_schema_field_subjects += len(schema_field_subjects)
+
+        logger.debug(
+            f"Processed query {query_urn} with Hex project ID {hex_project_id}"
+        )
+
+        return response
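
The new Hex query fetcher above keys its lineage extraction on HEX_METADATA_PATTERN. As a quick sanity check (not code from the package; the sample values are lifted from the _extract_hex_metadata docstring), the pattern yields the project id and the workspace segment of the project URL:

    import re

    HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

    # Simplified comment in the documented format (project_id before project_url).
    sql = (
        '-- Hex query metadata: {"project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
        '"project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic"}'
    )

    match = re.search(HEX_METADATA_PATTERN, sql)
    assert match is not None
    print(match.group(1))  # project_id: d73da67d-c87b-4dd8-9e7f-b79cb7f822cf
    print(match.group(2))  # workspace name from the URL: acryl-partnership

Note that the pattern assumes project_id appears before project_url within the single-line comment, which matches the format documented in the docstring.
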
@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
@@ -100,6 +101,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
 logger = logging.getLogger(__name__)
 
@@ -210,6 +212,11 @@ class SupersetConfig(
         default=10, description="Timeout of single API call to superset."
     )
 
+    max_threads: int = Field(
+        default_factory=lambda: os.cpu_count() or 40,
+        description="Max parallelism for API calls. Defaults to cpuCount or 40",
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
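
The new max_threads option feeds the threaded emitters further down in this diff. A small illustration of how its default resolves (an assumption here is that os.cpu_count() can return None in some restricted environments, hence the fallback):

    import os

    # Mirrors the default_factory above: use the detected CPU count,
    # or fall back to 40 when os.cpu_count() returns None.
    default_max_threads = os.cpu_count() or 40

    # The value can also be pinned explicitly in the source config,
    # e.g. max_threads: 8 in a recipe (hypothetical value).
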
@@ -339,6 +346,7 @@ class SupersetSource(StatefulIngestionSourceBase):
 
             if response.status_code != 200:
                 logger.warning(f"Failed to get {entity_type} data: {response.text}")
+                continue
 
             payload = response.json()
             # Update total_items with the actual count from the response
@@ -501,33 +509,41 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         return dashboard_snapshot
 
-    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
-            try:
-                dashboard_id = str(dashboard_data.get("id"))
-                dashboard_title = dashboard_data.get("dashboard_title", "")
-
-                if not self.config.dashboard_pattern.allowed(dashboard_title):
-                    self.report.report_dropped(
-                        f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
-                    )
-                    continue
-
-                dashboard_snapshot = self.construct_dashboard_from_api_data(
-                    dashboard_data
-                )
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+    def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
+        dashboard_title = ""
+        try:
+            dashboard_id = str(dashboard_data.get("id"))
+            dashboard_title = dashboard_data.get("dashboard_title", "")
+            if not self.config.dashboard_pattern.allowed(dashboard_title):
+                self.report.report_dropped(
+                    f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
                 )
-                continue
-            # Emit the dashboard
-            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
-            yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=dashboard_title,
-                entity_urn=dashboard_snapshot.urn,
+                return
+            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+            )
+            return
+        mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+        yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dashboard_title, entity_urn=dashboard_snapshot.urn
+        )
+
+    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
+        dashboard_data_list = [
+            (dashboard_data,)
+            for dashboard_data in self.paginate_entity_api_results(
+                "dashboard/", PAGE_SIZE
             )
+        ]
+
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dashboard,
+            args_list=dashboard_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def build_input_fields(
         self,
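
The dashboard emitter above (and the chart and dataset emitters below) now share one fan-out shape: each API item is wrapped in a single-element argument tuple and handed to ThreadedIteratorExecutor.process, which runs the worker across max_threads threads and yields work units as workers produce them. A minimal standalone sketch of that pattern, with illustrative worker and data (not code from the package):

    from typing import Dict, Iterable, List, Tuple

    from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor


    def process_item(item: Dict[str, int]) -> Iterable[str]:
        # Worker: may yield zero or more results per input item.
        yield f"workunit-for-{item['id']}"


    items = [{"id": 1}, {"id": 2}, {"id": 3}]
    # One argument tuple per worker invocation, mirroring (dashboard_data,) above.
    args_list: List[Tuple[Dict[str, int]]] = [(item,) for item in items]

    # Results are yielded as workers finish; ordering across items is not guaranteed.
    for result in ThreadedIteratorExecutor.process(
        worker_func=process_item, args_list=args_list, max_workers=4
    ):
        print(result)

Because each _process_* worker catches exceptions and returns, a single bad dashboard, chart, or dataset no longer aborts the whole emission loop.
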
@@ -762,40 +778,46 @@ class SupersetSource(StatefulIngestionSourceBase):
             entity_urn=chart_urn,
         )
 
-    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-        for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
-            try:
-                chart_id = str(chart_data.get("id"))
-                chart_name = chart_data.get("slice_name", "")
-
-                if not self.config.chart_pattern.allowed(chart_name):
-                    self.report.report_dropped(
-                        f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+    def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
+        chart_name = ""
+        try:
+            chart_id = str(chart_data.get("id"))
+            chart_name = chart_data.get("slice_name", "")
+            if not self.config.chart_pattern.allowed(chart_name):
+                self.report.report_dropped(
+                    f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                )
+                return
+            if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                datasource_id = chart_data.get("datasource_id")
+                if datasource_id:
+                    dataset_response = self.get_dataset_info(datasource_id)
+                    dataset_name = dataset_response.get("result", {}).get(
+                        "table_name", ""
                     )
-                    continue
-
-                # Emit a warning if charts use data from a dataset that will be filtered out
-                if self.config.dataset_pattern != AllowDenyPattern.allow_all():
-                    datasource_id = chart_data.get("datasource_id")
-                    if datasource_id:
-                        dataset_response = self.get_dataset_info(datasource_id)
-                        dataset_name = dataset_response.get("result", {}).get(
-                            "table_name", ""
+                    if dataset_name and not self.config.dataset_pattern.allowed(
+                        dataset_name
+                    ):
+                        self.report.warning(
+                            f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
                         )
+            yield from self.construct_chart_from_chart_data(chart_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
+            )
+            return
 
-                        if dataset_name and not self.config.dataset_pattern.allowed(
-                            dataset_name
-                        ):
-                            self.report.warning(
-                                f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
-                            )
-
-                yield from self.construct_chart_from_chart_data(chart_data)
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
-                )
-                continue
+    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
+        chart_data_list = [
+            (chart_data,)
+            for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_chart,
+            args_list=chart_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
         schema_fields: List[SchemaField] = []
@@ -1023,33 +1045,38 @@
 
         return dataset_snapshot
 
-    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
-            try:
-                dataset_name = dataset_data.get("table_name", "")
-
-                # Check if dataset should be filtered by dataset name
-                if not self.config.dataset_pattern.allowed(dataset_name):
-                    self.report.report_dropped(
-                        f"Dataset '{dataset_name}' filtered by dataset_pattern"
-                    )
-                    continue
-
-                dataset_snapshot = self.construct_dataset_from_dataset_data(
-                    dataset_data
-                )
-                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+    def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
+        dataset_name = ""
+        try:
+            dataset_name = dataset_data.get("table_name", "")
+            if not self.config.dataset_pattern.allowed(dataset_name):
+                self.report.report_dropped(
+                    f"Dataset '{dataset_name}' filtered by dataset_pattern"
                 )
-                continue
-            # Emit the dataset
-            yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=dataset_data.get("table_name", ""),
-                entity_urn=dataset_snapshot.urn,
+                return
+            dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
+            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
             )
+            return
+        yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dataset_data.get("table_name", ""),
+            entity_urn=dataset_snapshot.urn,
+        )
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        dataset_data_list = [
+            (dataset_data,)
+            for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dataset,
+            args_list=dataset_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         if self.config.ingest_dashboards: