acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
@@ -100,6 +101,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
 logger = logging.getLogger(__name__)
 
@@ -210,6 +212,11 @@ class SupersetConfig(
         default=10, description="Timeout of single API call to superset."
     )
 
+    max_threads: int = Field(
+        default_factory=lambda: os.cpu_count() or 40,
+        description="Max parallelism for API calls. Defaults to cpuCount or 40",
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
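Because the new `max_threads` option uses `default_factory`, its fallback is computed when the config object is created, not when the module is imported. A minimal, self-contained sketch of the same pattern, assuming only pydantic (the surrounding SupersetConfig fields are omitted and the class name here is hypothetical):

```python
import os

from pydantic import BaseModel, Field


class ParallelismConfig(BaseModel):
    # Hypothetical stand-in for the new SupersetConfig field.
    max_threads: int = Field(
        default_factory=lambda: os.cpu_count() or 40,
        description="Max parallelism for API calls. Defaults to cpuCount or 40",
    )


# The default resolves to the host's CPU count, falling back to 40 when
# os.cpu_count() returns None; an explicit value always takes precedence.
print(ParallelismConfig().max_threads)               # e.g. 8 on an 8-core machine
print(ParallelismConfig(max_threads=4).max_threads)  # 4
```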
@@ -339,6 +346,7 @@ class SupersetSource(StatefulIngestionSourceBase):
 
             if response.status_code != 200:
                 logger.warning(f"Failed to get {entity_type} data: {response.text}")
+                continue
 
             payload = response.json()
             # Update total_items with the actual count from the response
@@ -501,33 +509,41 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         return dashboard_snapshot
 
-    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
-            try:
-                dashboard_id = str(dashboard_data.get("id"))
-                dashboard_title = dashboard_data.get("dashboard_title", "")
-
-                if not self.config.dashboard_pattern.allowed(dashboard_title):
-                    self.report.report_dropped(
-                        f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
-                    )
-                    continue
-
-                dashboard_snapshot = self.construct_dashboard_from_api_data(
-                    dashboard_data
-                )
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+    def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
+        dashboard_title = ""
+        try:
+            dashboard_id = str(dashboard_data.get("id"))
+            dashboard_title = dashboard_data.get("dashboard_title", "")
+            if not self.config.dashboard_pattern.allowed(dashboard_title):
+                self.report.report_dropped(
+                    f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
                 )
-                continue
-            # Emit the dashboard
-            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
-            yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=dashboard_title,
-                entity_urn=dashboard_snapshot.urn,
+                return
+            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+            )
+            return
+        mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+        yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dashboard_title, entity_urn=dashboard_snapshot.urn
+        )
+
+    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
+        dashboard_data_list = [
+            (dashboard_data,)
+            for dashboard_data in self.paginate_entity_api_results(
+                "dashboard/", PAGE_SIZE
             )
+        ]
+
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dashboard,
+            args_list=dashboard_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def build_input_fields(
         self,
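The refactor above splits the old loop body into a `_process_dashboard` worker and hands the paginated results to `ThreadedIteratorExecutor.process`, which fans the per-item work out over `max_threads` threads while the caller still consumes a single iterator of work units. A rough, standard-library-only sketch of that pattern (the real utility lives in `datahub.utilities.threaded_iterator_executor` and may differ in detail, e.g. by streaming results through a queue instead of buffering each worker's output):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Callable, Iterable, Iterator, Tuple, TypeVar

T = TypeVar("T")


def process_threaded(
    worker_func: Callable[..., Iterable[T]],
    args_list: Iterable[Tuple[Any, ...]],
    max_workers: int,
) -> Iterator[T]:
    """Run worker_func(*args) for each args tuple on a thread pool and yield
    everything the workers produce, in whichever order the workers finish."""

    def run(args: Tuple[Any, ...]) -> list:
        # Drain the worker's generator on its own thread so that the
        # I/O-bound Superset API calls overlap across items.
        return list(worker_func(*args))

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(run, args) for args in args_list]
        for future in as_completed(futures):
            # .result() re-raises any exception the worker raised.
            yield from future.result()


# Shaped like the call in the hunk above: each args tuple carries one
# dashboard/chart/dataset payload (dummy data here).
for unit in process_threaded(lambda d: [f"wu:{d['id']}"], [({"id": 1},), ({"id": 2},)], 2):
    print(unit)
```

Because workers finish in arbitrary order, work units are no longer guaranteed to be emitted in pagination order.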
@@ -762,40 +778,46 @@ class SupersetSource(StatefulIngestionSourceBase):
             entity_urn=chart_urn,
         )
 
-    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-        for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
-            try:
-                chart_id = str(chart_data.get("id"))
-                chart_name = chart_data.get("slice_name", "")
-
-                if not self.config.chart_pattern.allowed(chart_name):
-                    self.report.report_dropped(
-                        f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+    def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
+        chart_name = ""
+        try:
+            chart_id = str(chart_data.get("id"))
+            chart_name = chart_data.get("slice_name", "")
+            if not self.config.chart_pattern.allowed(chart_name):
+                self.report.report_dropped(
+                    f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                )
+                return
+            if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                datasource_id = chart_data.get("datasource_id")
+                if datasource_id:
+                    dataset_response = self.get_dataset_info(datasource_id)
+                    dataset_name = dataset_response.get("result", {}).get(
+                        "table_name", ""
                     )
-                    continue
-
-                # Emit a warning if charts use data from a dataset that will be filtered out
-                if self.config.dataset_pattern != AllowDenyPattern.allow_all():
-                    datasource_id = chart_data.get("datasource_id")
-                    if datasource_id:
-                        dataset_response = self.get_dataset_info(datasource_id)
-                        dataset_name = dataset_response.get("result", {}).get(
-                            "table_name", ""
+                    if dataset_name and not self.config.dataset_pattern.allowed(
+                        dataset_name
+                    ):
+                        self.report.warning(
+                            f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
                         )
+            yield from self.construct_chart_from_chart_data(chart_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
+            )
+            return
 
-                        if dataset_name and not self.config.dataset_pattern.allowed(
-                            dataset_name
-                        ):
-                            self.report.warning(
-                                f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
-                            )
-
-                yield from self.construct_chart_from_chart_data(chart_data)
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
-                )
-                continue
+    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
+        chart_data_list = [
+            (chart_data,)
+            for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_chart,
+            args_list=chart_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
         schema_fields: List[SchemaField] = []
@@ -1023,33 +1045,38 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         return dataset_snapshot
 
-    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
-            try:
-                dataset_name = dataset_data.get("table_name", "")
-
-                # Check if dataset should be filtered by dataset name
-                if not self.config.dataset_pattern.allowed(dataset_name):
-                    self.report.report_dropped(
-                        f"Dataset '{dataset_name}' filtered by dataset_pattern"
-                    )
-                    continue
-
-                dataset_snapshot = self.construct_dataset_from_dataset_data(
-                    dataset_data
-                )
-                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+    def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
+        dataset_name = ""
+        try:
+            dataset_name = dataset_data.get("table_name", "")
+            if not self.config.dataset_pattern.allowed(dataset_name):
+                self.report.report_dropped(
+                    f"Dataset '{dataset_name}' filtered by dataset_pattern"
                 )
-                continue
-            # Emit the dataset
-            yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=dataset_data.get("table_name", ""),
-                entity_urn=dataset_snapshot.urn,
+                return
+            dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
+            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
            )
+            return
+        yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dataset_data.get("table_name", ""),
+            entity_urn=dataset_snapshot.urn,
+        )
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        dataset_data_list = [
+            (dataset_data,)
+            for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dataset,
+            args_list=dataset_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         if self.config.ingest_dashboards:
@@ -22,7 +22,11 @@ from google.oauth2 import service_account
 
 import datahub.emitter.mce_builder as builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey, ProjectIdKey, gen_containers
+from datahub.emitter.mcp_builder import (
+    ExperimentKey,
+    ProjectIdKey,
+    gen_containers,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -96,10 +100,6 @@ class ModelMetadata:
     endpoints: Optional[List[Endpoint]] = None
 
 
-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 @platform_name("Vertex AI", id="vertexai")
 @config_class(VertexAIConfig)
 @support_status(SupportStatus.TESTING)
@@ -173,7 +173,7 @@ class VertexAISource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         yield from gen_containers(
            parent_container_key=self._get_project_container(),
-            container_key=ContainerKeyWithId(
+            container_key=ExperimentKey(
                platform=self.platform,
                id=self._make_vertexai_experiment_name(experiment.name),
            ),
@@ -309,7 +309,7 @@ class VertexAISource(Source):
     def _gen_experiment_run_mcps(
         self, experiment: Experiment, run: ExperimentRun
     ) -> Iterable[MetadataChangeProposalWrapper]:
-        experiment_key = ContainerKeyWithId(
+        experiment_key = ExperimentKey(
            platform=self.platform,
            id=self._make_vertexai_experiment_name(experiment.name),
        )
@@ -32,10 +32,10 @@ def deploy_source_vars(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> dict:
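The only change in this last hunk is that `executor_id` and `time_zone` become `Optional[str]`, so callers may pass `None` instead of being forced to supply values. A trimmed, hypothetical mirror of the signature showing one way optional fields can be handled (the function body and payload keys below are illustrative, not the package's actual implementation):

```python
from typing import Optional


def deploy_source_vars_sketch(
    name: Optional[str],
    config: str,
    urn: Optional[str],
    executor_id: Optional[str],  # was `str` before this release
    cli_version: Optional[str],
    schedule: Optional[str],
    time_zone: Optional[str],    # was `str` before this release
    extra_pip: Optional[str],
    debug: bool = False,
) -> dict:
    # Illustrative only: include optional values in the variables dict only
    # when they are set, so None does not clobber server-side defaults.
    variables: dict = {"name": name, "config": config, "debug": debug}
    optional_fields = {
        "urn": urn,
        "executor_id": executor_id,
        "cli_version": cli_version,
        "schedule": schedule,
        "time_zone": time_zone,
        "extra_pip": extra_pip,
    }
    variables.update({k: v for k, v in optional_fields.items() if v is not None})
    return variables


print(deploy_source_vars_sketch("demo-source", "{}", None, None, None, None, None, None))
```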