batch-analytics 0.3.18__tar.gz → 0.3.21__tar.gz

This diff compares the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (33)
  1. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/PKG-INFO +13 -10
  2. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/README.md +6 -5
  3. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/pyproject.toml +9 -9
  4. batch_analytics-0.3.21/src/batch_analytics/__init__.py +63 -0
  5. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/__main__.py +1 -0
  6. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/analytics/gluon_autogluon_train.py +23 -3
  7. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/PKG-INFO +13 -10
  8. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/requires.txt +7 -4
  9. batch_analytics-0.3.18/src/batch_analytics/__init__.py +0 -52
  10. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/setup.cfg +0 -0
  11. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/analytics/__init__.py +0 -0
  12. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/analytics/correlation.py +0 -0
  13. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
  14. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/analytics/linear_regression.py +0 -0
  15. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  16. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/analytics/t_test.py +0 -0
  17. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/config.py +0 -0
  18. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/extract.py +0 -0
  19. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/job_runner.py +0 -0
  20. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/log.py +0 -0
  21. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/modules.py +0 -0
  22. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/output/__init__.py +0 -0
  23. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/output/base.py +0 -0
  24. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/output/clickhouse.py +0 -0
  25. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/output/local.py +0 -0
  26. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/output/s3.py +0 -0
  27. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/transform.py +0 -0
  28. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/utils/__init__.py +0 -0
  29. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  30. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  31. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  32. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  33. {batch_analytics-0.3.18 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,16 +1,17 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.18
3
+ Version: 0.3.21
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
7
7
  Requires-Python: >=3.8
8
8
  Description-Content-Type: text/markdown
9
- Requires-Dist: pyspark<3.6,>=3.4
10
9
  Requires-Dist: numpy>=1.19.0
11
- Requires-Dist: scipy>=1.5.0
12
10
  Provides-Extra: dev
13
11
  Requires-Dist: pytest>=7.0; extra == "dev"
12
+ Provides-Extra: spark
13
+ Requires-Dist: pyspark<3.6,>=3.4; extra == "spark"
14
+ Requires-Dist: scipy>=1.5.0; extra == "spark"
14
15
  Provides-Extra: ttest
15
16
  Requires-Dist: scipy>=1.5.0; extra == "ttest"
16
17
  Provides-Extra: s3
@@ -23,18 +24,19 @@ Requires-Dist: boto3>=1.28; extra == "output"
23
24
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
24
25
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
25
26
  Provides-Extra: autogluon
26
- Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
27
+ Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "autogluon"
27
28
  Requires-Dist: pandas>=1.3.0; extra == "autogluon"
28
29
  Requires-Dist: boto3>=1.28; extra == "autogluon"
29
30
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
30
31
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "autogluon"
31
32
  Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "autogluon"
32
33
  Provides-Extra: full
34
+ Requires-Dist: pyspark<3.6,>=3.4; extra == "full"
33
35
  Requires-Dist: scipy>=1.5.0; extra == "full"
34
36
  Requires-Dist: boto3>=1.28; extra == "full"
35
37
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
36
38
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
37
- Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
39
+ Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "full"
38
40
  Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
39
41
 
40
42
  # Batch Analytics
@@ -71,12 +73,13 @@ analytics/
71
73
 
72
74
  ## Install
73
75
 
76
+ - `pip install batch-analytics` — core (numpy only)
77
+ - `pip install batch-analytics[spark]` — PySpark ETL + job runner + SciPy (t-test / ANOVA)
78
+ - `pip install batch-analytics[autogluon]` — AutoGluon + I/O (Gluon image; no PySpark)
79
+ - `pip install batch-analytics[full]` — PySpark + AutoGluon + I/O
80
+
74
81
  ```bash
75
- pip install -e .
76
- # or install every runtime dependency used anywhere in the package, then editable:
77
- pip install -r requirements.txt && pip install -e .
78
- # PyPI install includes numpy and scipy (t-test); extras: s3, clickhouse, output, full
79
- pip install "batch-analytics[full]"
82
+ pip install -e ".[spark]" # dev
80
83
  ```
81
84
 
82
85
  ## Run
@@ -32,12 +32,13 @@ analytics/
32
32
 
33
33
  ## Install
34
34
 
35
+ - `pip install batch-analytics` — core (numpy only)
36
+ - `pip install batch-analytics[spark]` — PySpark ETL + job runner + SciPy (t-test / ANOVA)
37
+ - `pip install batch-analytics[autogluon]` — AutoGluon + I/O (Gluon image; no PySpark)
38
+ - `pip install batch-analytics[full]` — PySpark + AutoGluon + I/O
39
+
35
40
  ```bash
36
- pip install -e .
37
- # or install every runtime dependency used anywhere in the package, then editable:
38
- pip install -r requirements.txt && pip install -e .
39
- # PyPI install includes numpy and scipy (t-test); extras: s3, clickhouse, output, full
40
- pip install "batch-analytics[full]"
41
+ pip install -e ".[spark]" # dev
41
42
  ```
42
43
 
43
44
  ## Run
@@ -4,22 +4,22 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.18"
7
+ version = "0.3.21"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
11
11
  dependencies = [
12
- "pyspark>=3.4,<3.6",
13
12
  "numpy>=1.19.0",
14
- # Welch t-test (t_test.py); keep on core deps so `pip install batch-analytics` works in minimal driver images
15
- "scipy>=1.5.0",
16
13
  ]
17
14
  authors = [{ name = "Litewave Analytics Team" }]
18
15
  license = { text = "MIT" }
19
16
 
20
17
  [project.optional-dependencies]
21
18
  dev = ["pytest>=7.0"]
22
- # Legacy: scipy is a core dependency; kept so `pip install "batch-analytics[ttest]"` still resolves.
19
+ spark = [
20
+ "pyspark>=3.4,<3.6",
21
+ "scipy>=1.5.0",
22
+ ]
23
23
  ttest = ["scipy>=1.5.0"]
24
24
  s3 = ["boto3>=1.28"]
25
25
  clickhouse = [
@@ -31,22 +31,22 @@ output = [
31
31
  "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
32
32
  "clickhouse-connect>=0.7; python_version >= '3.9'",
33
33
  ]
34
- # AutoGluon Tabular train/infer (POST /submit/train, /submit/inf on analytics_runner). Requires Python 3.9+.
34
+ # autogluon-tabular[lightgbm]: tabular AutoML with LightGBM only (no PyTorch from full autogluon metapackage)
35
35
  autogluon = [
36
- "autogluon>=1.0,<2.0",
36
+ "autogluon-tabular[lightgbm]>=1.0,<2.0",
37
37
  "pandas>=1.3.0",
38
38
  "boto3>=1.28",
39
39
  "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
40
40
  "clickhouse-connect>=0.7; python_version >= '3.9'",
41
41
  "pyarrow>=10.0.0; python_version >= '3.8'",
42
42
  ]
43
- # Install all optional runtime deps used anywhere in the package
44
43
  full = [
44
+ "pyspark>=3.4,<3.6",
45
45
  "scipy>=1.5.0",
46
46
  "boto3>=1.28",
47
47
  "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
48
48
  "clickhouse-connect>=0.7; python_version >= '3.9'",
49
- "autogluon>=1.0,<2.0",
49
+ "autogluon-tabular[lightgbm]>=1.0,<2.0",
50
50
  "pyarrow>=10.0.0; python_version >= '3.8'",
51
51
  ]
52
52
 
@@ -0,0 +1,63 @@
1
+ """
2
+ Batch analytics pipeline: Extract, Transform, Log stages + analytics modules.
3
+
4
+ PySpark is optional: install ``batch-analytics[spark]``. Gluon image: ``[autogluon]`` only.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import importlib
10
+ from typing import Any
11
+
12
+ from .config import BatchAnalyticsConfig, SparkK8sConfig
13
+
14
+ __all__ = [
15
+ "BatchAnalyticsConfig",
16
+ "SparkK8sConfig",
17
+ "expand_kv_blob_column",
18
+ "extract_anchor_id",
19
+ "extract_all",
20
+ "extract_table",
21
+ "extract_unified",
22
+ "parse_extract_filter_values",
23
+ "remove_duplicates",
24
+ "stage_to_clickhouse",
25
+ "transform",
26
+ "transform_and_stage",
27
+ "load_staged",
28
+ "log_run",
29
+ "log_analytics_artifacts",
30
+ "run_pipeline",
31
+ "create_spark_session",
32
+ ]
33
+
34
+ # Lazy imports so ``import batch_analytics`` works without PySpark (Gluon image).
35
+ _LAZY = {
36
+ "expand_kv_blob_column": ("transform", "expand_kv_blob_column"),
37
+ "extract_anchor_id": ("transform", "extract_anchor_id"),
38
+ "extract_all": ("extract", "extract_all"),
39
+ "extract_table": ("extract", "extract_table"),
40
+ "extract_unified": ("extract", "extract_unified"),
41
+ "parse_extract_filter_values": ("extract", "parse_extract_filter_values"),
42
+ "remove_duplicates": ("transform", "remove_duplicates"),
43
+ "stage_to_clickhouse": ("transform", "stage_to_clickhouse"),
44
+ "transform": ("transform", "transform"),
45
+ "transform_and_stage": ("transform", "transform_and_stage"),
46
+ "load_staged": ("transform", "load_staged"),
47
+ "log_run": ("log", "log_run"),
48
+ "log_analytics_artifacts": ("log", "log_analytics_artifacts"),
49
+ "run_pipeline": ("job_runner", "run_pipeline"),
50
+ "create_spark_session": ("job_runner", "create_spark_session"),
51
+ }
52
+
53
+
54
+ def __getattr__(name: str) -> Any:
55
+ if name in _LAZY:
56
+ mod_name, attr = _LAZY[name]
57
+ mod = importlib.import_module(f".{mod_name}", __name__)
58
+ return getattr(mod, attr)
59
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
60
+
61
+
62
+ def __dir__() -> list[str]:
63
+ return sorted({*__all__, *globals().keys()})
@@ -2,4 +2,5 @@
2
2
 
3
3
  from .job_runner import main
4
4
  import sys
5
+
5
6
  sys.exit(main())
@@ -4,11 +4,13 @@ Train AutoGluon TabularPredictor from ClickHouse staging data; upload artifacts
4
4
  Env (injected by analytics_runner Gluon job):
5
5
  CLICKHOUSE_*, BATCH_STAGING_TABLE, MODEL_S3_PREFIX, TASK_ID,
6
6
  AUTOGLUON_LABEL, AUTOGLUON_FEATURES, AUTOGLUON_PROBLEM_TYPE, AUTOGLUON_TIME_LIMIT,
7
- optional AUTOGLUON_MAX_ROWS
7
+ optional AUTOGLUON_MAX_ROWS,
8
+ optional AUTOGLUON_HYPERPARAMETERS — JSON object of AutoGluon model hyperparameters; default {"GBM": {}} (LightGBM only, no PyTorch)
8
9
  """
9
10
 
10
11
  from __future__ import annotations
11
12
 
13
+ import json
12
14
  import logging
13
15
  import os
14
16
  import shutil
@@ -44,6 +46,22 @@ def _require(name: str) -> str:
44
46
  return v
45
47
 
46
48
 
49
+ def _hyperparameters_from_env() -> dict:
50
+ """Default: GBM (LightGBM) only — matches autogluon-tabular[lightgbm] and avoids NN/torch models."""
51
+ raw = os.environ.get("AUTOGLUON_HYPERPARAMETERS", "").strip()
52
+ if not raw:
53
+ return {"GBM": {}}
54
+ try:
55
+ parsed = json.loads(raw)
56
+ except json.JSONDecodeError as e:
57
+ logger.error("AUTOGLUON_HYPERPARAMETERS must be valid JSON: %s", e)
58
+ sys.exit(2)
59
+ if not isinstance(parsed, dict):
60
+ logger.error("AUTOGLUON_HYPERPARAMETERS must be a JSON object, e.g. {\"GBM\": {}}")
61
+ sys.exit(2)
62
+ return parsed
63
+
64
+
47
65
  def main() -> None:
48
66
  model_prefix = _require("MODEL_S3_PREFIX")
49
67
  label = _require("AUTOGLUON_LABEL")
@@ -92,10 +110,12 @@ def main() -> None:
92
110
  os.makedirs(local_dir, exist_ok=True)
93
111
 
94
112
  train_df = df[feature_list + [label]]
113
+ hyper = _hyperparameters_from_env()
95
114
  logger.info(
96
- "Fitting TabularPredictor problem_type=%s time_limit=%ss rows=%s",
115
+ "Fitting TabularPredictor problem_type=%s time_limit=%ss hyperparameters=%s rows=%s",
97
116
  problem_type,
98
117
  time_limit,
118
+ hyper,
99
119
  len(train_df),
100
120
  )
101
121
  predictor = TabularPredictor(
@@ -103,7 +123,7 @@ def main() -> None:
103
123
  problem_type=problem_type,
104
124
  path=local_dir,
105
125
  )
106
- predictor.fit(train_df, time_limit=time_limit)
126
+ predictor.fit(train_df, time_limit=time_limit, hyperparameters=hyper)
107
127
 
108
128
  logger.info("Uploading model artifacts to %s", model_prefix)
109
129
  upload_directory_to_s3(local_dir, model_prefix)
@@ -1,16 +1,17 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.18
3
+ Version: 0.3.21
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
7
7
  Requires-Python: >=3.8
8
8
  Description-Content-Type: text/markdown
9
- Requires-Dist: pyspark<3.6,>=3.4
10
9
  Requires-Dist: numpy>=1.19.0
11
- Requires-Dist: scipy>=1.5.0
12
10
  Provides-Extra: dev
13
11
  Requires-Dist: pytest>=7.0; extra == "dev"
12
+ Provides-Extra: spark
13
+ Requires-Dist: pyspark<3.6,>=3.4; extra == "spark"
14
+ Requires-Dist: scipy>=1.5.0; extra == "spark"
14
15
  Provides-Extra: ttest
15
16
  Requires-Dist: scipy>=1.5.0; extra == "ttest"
16
17
  Provides-Extra: s3
@@ -23,18 +24,19 @@ Requires-Dist: boto3>=1.28; extra == "output"
23
24
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
24
25
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
25
26
  Provides-Extra: autogluon
26
- Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
27
+ Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "autogluon"
27
28
  Requires-Dist: pandas>=1.3.0; extra == "autogluon"
28
29
  Requires-Dist: boto3>=1.28; extra == "autogluon"
29
30
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
30
31
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "autogluon"
31
32
  Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "autogluon"
32
33
  Provides-Extra: full
34
+ Requires-Dist: pyspark<3.6,>=3.4; extra == "full"
33
35
  Requires-Dist: scipy>=1.5.0; extra == "full"
34
36
  Requires-Dist: boto3>=1.28; extra == "full"
35
37
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
36
38
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
37
- Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
39
+ Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "full"
38
40
  Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
39
41
 
40
42
  # Batch Analytics
@@ -71,12 +73,13 @@ analytics/
71
73
 
72
74
  ## Install
73
75
 
76
+ - `pip install batch-analytics` — core (numpy only)
77
+ - `pip install batch-analytics[spark]` — PySpark ETL + job runner + SciPy (t-test / ANOVA)
78
+ - `pip install batch-analytics[autogluon]` — AutoGluon + I/O (Gluon image; no PySpark)
79
+ - `pip install batch-analytics[full]` — PySpark + AutoGluon + I/O
80
+
74
81
  ```bash
75
- pip install -e .
76
- # or install every runtime dependency used anywhere in the package, then editable:
77
- pip install -r requirements.txt && pip install -e .
78
- # PyPI install includes numpy and scipy (t-test); extras: s3, clickhouse, output, full
79
- pip install "batch-analytics[full]"
82
+ pip install -e ".[spark]" # dev
80
83
  ```
81
84
 
82
85
  ## Run
@@ -1,9 +1,7 @@
1
- pyspark<3.6,>=3.4
2
1
  numpy>=1.19.0
3
- scipy>=1.5.0
4
2
 
5
3
  [autogluon]
6
- autogluon<2.0,>=1.0
4
+ autogluon-tabular[lightgbm]<2.0,>=1.0
7
5
  pandas>=1.3.0
8
6
  boto3>=1.28
9
7
 
@@ -28,9 +26,10 @@ clickhouse-connect>=0.7
28
26
  pytest>=7.0
29
27
 
30
28
  [full]
29
+ pyspark<3.6,>=3.4
31
30
  scipy>=1.5.0
32
31
  boto3>=1.28
33
- autogluon<2.0,>=1.0
32
+ autogluon-tabular[lightgbm]<2.0,>=1.0
34
33
 
35
34
  [full:python_version < "3.9"]
36
35
  clickhouse-connect<0.9,>=0.7
@@ -53,5 +52,9 @@ clickhouse-connect>=0.7
53
52
  [s3]
54
53
  boto3>=1.28
55
54
 
55
+ [spark]
56
+ pyspark<3.6,>=3.4
57
+ scipy>=1.5.0
58
+
56
59
  [ttest]
57
60
  scipy>=1.5.0
@@ -1,52 +0,0 @@
1
- """
2
- Batch analytics pipeline: Extract, Transform, Log stages + analytics modules.
3
-
4
- Stages:
5
- - Extract: Load data from ClickHouse via Spark ClickHouse connector or JDBC
6
- - Transform: Deduplicate and stage data (parquet/delta/clickhouse)
7
- - Log: Persist run metadata and analytics results
8
-
9
- Analytics modules:
10
- - Module 1: Linear regression (XY) with slope comparison across groups
11
- - Module 2: Multi-feature correlation
12
- - Module 3: PCA + KMeans clustering
13
- """
14
-
15
- from .config import BatchAnalyticsConfig, SparkK8sConfig
16
- from .extract import (
17
- extract_all,
18
- extract_table,
19
- extract_unified,
20
- parse_extract_filter_values,
21
- )
22
- from .transform import (
23
- expand_kv_blob_column,
24
- extract_anchor_id,
25
- load_staged,
26
- remove_duplicates,
27
- stage_to_clickhouse,
28
- transform,
29
- transform_and_stage,
30
- )
31
- from .log import log_analytics_artifacts, log_run
32
- from .job_runner import run_pipeline, create_spark_session
33
-
34
- __all__ = [
35
- "BatchAnalyticsConfig",
36
- "SparkK8sConfig",
37
- "expand_kv_blob_column",
38
- "extract_anchor_id",
39
- "extract_all",
40
- "extract_table",
41
- "extract_unified",
42
- "parse_extract_filter_values",
43
- "remove_duplicates",
44
- "stage_to_clickhouse",
45
- "transform",
46
- "transform_and_stage",
47
- "load_staged",
48
- "log_run",
49
- "log_analytics_artifacts",
50
- "run_pipeline",
51
- "create_spark_session",
52
- ]