taldbt-0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. taldbt-0.2.0/LICENSE +21 -0
  2. taldbt-0.2.0/MANIFEST.in +5 -0
  3. taldbt-0.2.0/PKG-INFO +126 -0
  4. taldbt-0.2.0/README.md +79 -0
  5. taldbt-0.2.0/pyproject.toml +74 -0
  6. taldbt-0.2.0/setup.cfg +4 -0
  7. taldbt-0.2.0/taldbt/__init__.py +3 -0
  8. taldbt-0.2.0/taldbt/__main__.py +5 -0
  9. taldbt-0.2.0/taldbt/cli.py +144 -0
  10. taldbt-0.2.0/taldbt/codegen/__init__.py +0 -0
  11. taldbt-0.2.0/taldbt/codegen/dbt_scaffolder.py +200 -0
  12. taldbt-0.2.0/taldbt/codegen/model_assembler.py +168 -0
  13. taldbt-0.2.0/taldbt/codegen/sql_generator.py +487 -0
  14. taldbt-0.2.0/taldbt/engine/__init__.py +0 -0
  15. taldbt-0.2.0/taldbt/engine/duckdb_engine.py +174 -0
  16. taldbt-0.2.0/taldbt/engine/self_healing.py +3 -0
  17. taldbt-0.2.0/taldbt/engine/test_data_generator.py +563 -0
  18. taldbt-0.2.0/taldbt/engine/validation.py +364 -0
  19. taldbt-0.2.0/taldbt/expert/__init__.py +1 -0
  20. taldbt-0.2.0/taldbt/expert/component_kb.py +764 -0
  21. taldbt-0.2.0/taldbt/expert/job_analyzer.py +368 -0
  22. taldbt-0.2.0/taldbt/expert/migration_engine.py +350 -0
  23. taldbt-0.2.0/taldbt/graphing/__init__.py +0 -0
  24. taldbt-0.2.0/taldbt/graphing/dag_builder.py +198 -0
  25. taldbt-0.2.0/taldbt/graphing/data_lineage.py +229 -0
  26. taldbt-0.2.0/taldbt/llm/__init__.py +0 -0
  27. taldbt-0.2.0/taldbt/llm/knowledge_base.py +656 -0
  28. taldbt-0.2.0/taldbt/llm/llm_provider.py +433 -0
  29. taldbt-0.2.0/taldbt/llm/ollama_client.py +337 -0
  30. taldbt-0.2.0/taldbt/models/__init__.py +0 -0
  31. taldbt-0.2.0/taldbt/models/ast_models.py +371 -0
  32. taldbt-0.2.0/taldbt/orchestration/__init__.py +0 -0
  33. taldbt-0.2.0/taldbt/orchestration/autopilot.py +386 -0
  34. taldbt-0.2.0/taldbt/orchestration/workflow_generator.py +413 -0
  35. taldbt-0.2.0/taldbt/parsers/__init__.py +0 -0
  36. taldbt-0.2.0/taldbt/parsers/classifier.py +197 -0
  37. taldbt-0.2.0/taldbt/parsers/components/__init__.py +6 -0
  38. taldbt-0.2.0/taldbt/parsers/components/aggregate_parser.py +96 -0
  39. taldbt-0.2.0/taldbt/parsers/components/dedup_parser.py +78 -0
  40. taldbt-0.2.0/taldbt/parsers/components/filter_parser.py +104 -0
  41. taldbt-0.2.0/taldbt/parsers/components/input_parser.py +137 -0
  42. taldbt-0.2.0/taldbt/parsers/components/sort_parser.py +61 -0
  43. taldbt-0.2.0/taldbt/parsers/components/tmap_parser.py +182 -0
  44. taldbt-0.2.0/taldbt/parsers/project_scanner.py +47 -0
  45. taldbt-0.2.0/taldbt/parsers/xml_parser.py +354 -0
  46. taldbt-0.2.0/taldbt/tests/__init__.py +0 -0
  47. taldbt-0.2.0/taldbt/tests/e2e_pipeline_test.py +202 -0
  48. taldbt-0.2.0/taldbt/tests/test_llm_provider.py +101 -0
  49. taldbt-0.2.0/taldbt/ui/app.py +1056 -0
  50. taldbt-0.2.0/taldbt/ui/favicon.svg +5 -0
  51. taldbt-0.2.0/taldbt/ui/logo.svg +14 -0
  52. taldbt-0.2.0/taldbt/ui/particles.html +201 -0
  53. taldbt-0.2.0/taldbt.egg-info/PKG-INFO +126 -0
  54. taldbt-0.2.0/taldbt.egg-info/SOURCES.txt +56 -0
  55. taldbt-0.2.0/taldbt.egg-info/dependency_links.txt +1 -0
  56. taldbt-0.2.0/taldbt.egg-info/entry_points.txt +2 -0
  57. taldbt-0.2.0/taldbt.egg-info/requires.txt +22 -0
  58. taldbt-0.2.0/taldbt.egg-info/top_level.txt +1 -0
taldbt-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025-2026 Sourav Roy
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
taldbt-0.2.0/MANIFEST.in ADDED
@@ -0,0 +1,5 @@
+ include README.md
+ include LICENSE
+ recursive-include taldbt/templates *.sql *.yml *.yaml
+ recursive-include taldbt/expert *.json
+ recursive-include taldbt/ui *.css *.svg *.html *.py
taldbt-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,126 @@
+ Metadata-Version: 2.4
+ Name: taldbt
+ Version: 0.2.0
+ Summary: AI-powered Talend to dbt migration. $0 cost. No infrastructure. One click.
+ Author-email: Sourav Roy <souravroy.etl@gmail.com>
+ License: MIT
+ Project-URL: Homepage, https://taldbt.netlify.app
+ Project-URL: Documentation, https://taldbt.netlify.app
+ Project-URL: Demo, https://taldbt.streamlit.app
+ Project-URL: Docker, https://hub.docker.com/r/souravetl/taldbt
+ Project-URL: Issues, https://github.com/SouravRoy-ETL/taldbt/issues
+ Keywords: talend,dbt,migration,etl,data-engineering,duckdb,temporal,ai,llm,sql
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Topic :: Database
+ Classifier: Topic :: Software Development :: Code Generators
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: lxml>=5.0
+ Requires-Dist: networkx>=3.0
+ Requires-Dist: pydantic>=2.0
+ Requires-Dist: duckdb>=1.0
+ Requires-Dist: jinja2>=3.0
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: rich>=13.0
+ Requires-Dist: requests>=2.31
+ Requires-Dist: dbt-core>=1.7
+ Requires-Dist: dbt-duckdb>=1.7
+ Requires-Dist: sqlglot>=25.0
+ Requires-Dist: faker>=28.0
+ Provides-Extra: ui
+ Requires-Dist: streamlit>=1.30; extra == "ui"
+ Provides-Extra: temporal
+ Requires-Dist: temporalio>=1.7; extra == "temporal"
+ Provides-Extra: all
+ Requires-Dist: streamlit>=1.30; extra == "all"
+ Requires-Dist: temporalio>=1.7; extra == "all"
+ Dynamic: license-file
+
+ # taldbt — AI Powered Talend to dbt Migration
+
+ Convert legacy Talend ETL to modern dbt SQL using semantic AI transpilation.
+
+ **Product Page:** https://taldbt.netlify.app
+ **Live Demo:** https://taldbt.streamlit.app
+ **Docker Image:** `docker pull souravetl/taldbt:latest`
+
+ ## Quick Start
+
+ ### Docker (recommended)
+ ```bash
+ docker pull souravetl/taldbt:latest
+ docker pull ollama/ollama:latest
+ docker compose up -d
+ docker exec taldbt-ollama ollama pull qwen3-coder:30b
+ # Open http://localhost:8501
+ ```
+
+ ### Cloud (no install)
+ Upload your Talend ZIP at https://taldbt.streamlit.app
+
+ ### Local Development
+ ```bash
+ pip install -r requirements.txt
+ streamlit run taldbt/ui/app.py
+ ```
+
+ ## Tech Stack
+
+ | Component | Purpose |
+ |-----------|---------|
+ | DuckDB + Flock | In-process analytics + LLM-in-SQL validation |
+ | dbt-core | SQL transformation framework |
+ | Temporal.io | DAG-aware workflow orchestration |
+ | Ollama / Cerebras / Groq | AI translation (local or cloud) |
+ | sqlglot | Multi-dialect SQL transpilation |
+ | Faker | Synthetic test data with FK integrity |
+ | networkx | Dependency graph + topological sort |
+ | lxml + Pydantic | XML parsing + type-safe AST |
+
+ ## Project Structure
+
+ ```
+ taldbt/
+ ├── Dockerfile              # Tier 1: Docker image
+ ├── docker-compose.yml      # Tier 1: full stack
+ ├── docker-compose.cpu.yml  # Tier 1: no-GPU override
+ ├── docker/entrypoint.sh    # Docker startup script
+ ├── requirements.txt        # Python dependencies
+ ├── packages.txt            # Tier 3: apt deps (Streamlit Cloud)
+ ├── .streamlit/             # Streamlit config + secrets
+ ├── dist/                   # Air-gapped distribution package
+ ├── docs/                   # Architecture + knowledge transfer
+ ├── main.py                 # CLI entry point
+ └── taldbt/                 # Core application
+     ├── ui/                 # Streamlit web app
+     ├── parsers/            # XML parsing + component parsers
+     ├── codegen/            # SQL generation + dbt scaffolding
+     ├── engine/             # DuckDB + validation + test data
+     ├── expert/             # Component knowledge base
+     ├── graphing/           # DAG builder + data lineage
+     ├── llm/                # LLM provider chain
+     ├── models/             # Pydantic AST models
+     ├── orchestration/      # Temporal + AutoPilot
+     └── tests/              # Test suite
+ ```
+
+ ## Deployment Tiers
+
+ - **Tier 1 (Docker):** `docker compose up -d` — Ollama + Temporal + UI
+ - **Tier 3 (Cloud):** Streamlit Cloud + Cerebras/Groq AI — no local install
+
+ ## License
+
+ Proprietary. Contact souravroy.etl@gmail.com for licensing.
+
+ ---
+ Made with care by Sourav Roy
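The Tech Stack table above credits sqlglot with multi-dialect SQL transpilation. A minimal sketch of that idea, using an invented DuckDB query rather than anything taken from the package's generated models:

```python
# Illustrative only: the query and the Snowflake target dialect are assumptions,
# not taken from taldbt's own output.
import sqlglot

duckdb_sql = "SELECT order_id, strftime(order_ts, '%Y-%m-%d') AS order_day FROM orders"

# Rewrite the DuckDB-flavoured statement into Snowflake SQL.
snowflake_sql = sqlglot.transpile(duckdb_sql, read="duckdb", write="snowflake")[0]
print(snowflake_sql)
```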
taldbt-0.2.0/README.md ADDED
@@ -0,0 +1,79 @@
+ # taldbt — AI Powered Talend to dbt Migration
+
+ Convert legacy Talend ETL to modern dbt SQL using semantic AI transpilation.
+
+ **Product Page:** https://taldbt.netlify.app
+ **Live Demo:** https://taldbt.streamlit.app
+ **Docker Image:** `docker pull souravetl/taldbt:latest`
+
+ ## Quick Start
+
+ ### Docker (recommended)
+ ```bash
+ docker pull souravetl/taldbt:latest
+ docker pull ollama/ollama:latest
+ docker compose up -d
+ docker exec taldbt-ollama ollama pull qwen3-coder:30b
+ # Open http://localhost:8501
+ ```
+
+ ### Cloud (no install)
+ Upload your Talend ZIP at https://taldbt.streamlit.app
+
+ ### Local Development
+ ```bash
+ pip install -r requirements.txt
+ streamlit run taldbt/ui/app.py
+ ```
+
+ ## Tech Stack
+
+ | Component | Purpose |
+ |-----------|---------|
+ | DuckDB + Flock | In-process analytics + LLM-in-SQL validation |
+ | dbt-core | SQL transformation framework |
+ | Temporal.io | DAG-aware workflow orchestration |
+ | Ollama / Cerebras / Groq | AI translation (local or cloud) |
+ | sqlglot | Multi-dialect SQL transpilation |
+ | Faker | Synthetic test data with FK integrity |
+ | networkx | Dependency graph + topological sort |
+ | lxml + Pydantic | XML parsing + type-safe AST |
+
+ ## Project Structure
+
+ ```
+ taldbt/
+ ├── Dockerfile              # Tier 1: Docker image
+ ├── docker-compose.yml      # Tier 1: full stack
+ ├── docker-compose.cpu.yml  # Tier 1: no-GPU override
+ ├── docker/entrypoint.sh    # Docker startup script
+ ├── requirements.txt        # Python dependencies
+ ├── packages.txt            # Tier 3: apt deps (Streamlit Cloud)
+ ├── .streamlit/             # Streamlit config + secrets
+ ├── dist/                   # Air-gapped distribution package
+ ├── docs/                   # Architecture + knowledge transfer
+ ├── main.py                 # CLI entry point
+ └── taldbt/                 # Core application
+     ├── ui/                 # Streamlit web app
+     ├── parsers/            # XML parsing + component parsers
+     ├── codegen/            # SQL generation + dbt scaffolding
+     ├── engine/             # DuckDB + validation + test data
+     ├── expert/             # Component knowledge base
+     ├── graphing/           # DAG builder + data lineage
+     ├── llm/                # LLM provider chain
+     ├── models/             # Pydantic AST models
+     ├── orchestration/      # Temporal + AutoPilot
+     └── tests/              # Test suite
+ ```
+
+ ## Deployment Tiers
+
+ - **Tier 1 (Docker):** `docker compose up -d` — Ollama + Temporal + UI
+ - **Tier 3 (Cloud):** Streamlit Cloud + Cerebras/Groq AI — no local install
+
+ ## License
+
+ Proprietary. Contact souravroy.etl@gmail.com for licensing.
+
+ ---
+ Made with care by Sourav Roy
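The same table lists networkx for the dependency graph and topological sort. A hedged sketch of how job ordering works in principle, with invented model names and edges rather than the actual dag_builder logic:

```python
# Hypothetical job dependency graph; edges point from upstream to downstream.
import networkx as nx

dag = nx.DiGraph()
dag.add_edge("stg_customers", "int_customer_orders")
dag.add_edge("stg_orders", "int_customer_orders")
dag.add_edge("int_customer_orders", "mart_revenue")

# A topological order is a valid build sequence for the generated dbt models.
print(list(nx.topological_sort(dag)))
```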
taldbt-0.2.0/pyproject.toml ADDED
@@ -0,0 +1,74 @@
+ [build-system]
+ requires = ["setuptools>=68.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "taldbt"
+ version = "0.2.0"
+ description = "AI-powered Talend to dbt migration. $0 cost. No infrastructure. One click."
+ readme = "README.md"
+ license = {text = "MIT"}
+ requires-python = ">=3.10"
+ authors = [
+     {name = "Sourav Roy", email = "souravroy.etl@gmail.com"}
+ ]
+ keywords = [
+     "talend", "dbt", "migration", "etl", "data-engineering",
+     "duckdb", "temporal", "ai", "llm", "sql"
+ ]
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "Intended Audience :: Science/Research",
+     "Topic :: Database",
+     "Topic :: Software Development :: Code Generators",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
+ ]
+ dependencies = [
+     "lxml>=5.0",
+     "networkx>=3.0",
+     "pydantic>=2.0",
+     "duckdb>=1.0",
+     "jinja2>=3.0",
+     "pyyaml>=6.0",
+     "rich>=13.0",
+     "requests>=2.31",
+     "dbt-core>=1.7",
+     "dbt-duckdb>=1.7",
+     "sqlglot>=25.0",
+     "faker>=28.0",
+ ]
+
+ [project.optional-dependencies]
+ ui = ["streamlit>=1.30"]
+ temporal = ["temporalio>=1.7"]
+ all = ["streamlit>=1.30", "temporalio>=1.7"]
+
+ [project.urls]
+ Homepage = "https://taldbt.netlify.app"
+ Documentation = "https://taldbt.netlify.app"
+ Demo = "https://taldbt.streamlit.app"
+ Docker = "https://hub.docker.com/r/souravetl/taldbt"
+ Issues = "https://github.com/SouravRoy-ETL/taldbt/issues"
+
+ [project.scripts]
+ taldbt = "taldbt.cli:main"
+
+ [tool.setuptools.packages.find]
+ include = ["taldbt*"]
+
+ [tool.setuptools.package-data]
+ taldbt = [
+     "templates/**/*.sql",
+     "templates/**/*.yml",
+     "templates/**/*.yaml",
+     "expert/**/*.json",
+     "ui/**/*.css",
+     "ui/**/*.svg",
+     "ui/**/*.html",
+ ]
taldbt-0.2.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
taldbt-0.2.0/taldbt/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """taldbt - AI Powered Talend to dbt Migration"""
+ __version__ = "0.2.0"
+ __author__ = "Sourav Roy"
taldbt-0.2.0/taldbt/__main__.py ADDED
@@ -0,0 +1,5 @@
+ """Allow running taldbt as: python -m taldbt"""
+ from taldbt.cli import main
+
+ if __name__ == "__main__":
+     main()
taldbt-0.2.0/taldbt/cli.py ADDED
@@ -0,0 +1,144 @@
+ """
+ taldbt CLI — entry point for `pip install taldbt`
+
+ Usage:
+     taldbt ui                          # Launch Streamlit web UI
+     taldbt discover <talend_dir>       # Scan and analyze a Talend project
+     taldbt migrate <talend_dir> <out>  # Full migration to dbt
+     taldbt version                     # Print version
+ """
+ import sys
+ import os
+
+
+ def main():
+     args = sys.argv[1:]
+
+     if not args or args[0] in ("-h", "--help", "help"):
+         _print_help()
+         return
+
+     if args[0] in ("version", "--version", "-v"):
+         from taldbt import __version__
+         print(f"taldbt v{__version__}")
+         return
+
+     if args[0] == "ui":
+         _launch_ui()
+         return
+
+     if args[0] == "discover" and len(args) >= 2:
+         _discover(args[1], args[2] if len(args) > 2 else "migration_plan.json")
+         return
+
+     if args[0] == "migrate" and len(args) >= 3:
+         _migrate(args[1], args[2])
+         return
+
+     print(f"Unknown command: {args[0]}")
+     _print_help()
+     sys.exit(1)
+
+
+ def _print_help():
+     from taldbt import __version__
+     print(f"""
+ taldbt v{__version__} — AI Powered Talend to dbt Migration
+
+ Usage:
+   taldbt ui                              Launch the web UI
+   taldbt discover <talend_dir> [output]  Scan & analyze a Talend project
+   taldbt migrate <talend_dir> <out_dir>  Full migration to dbt
+   taldbt version                         Print version
+
+ Examples:
+   taldbt ui
+   taldbt discover ./my_talend_project
+   taldbt migrate ./my_talend_project ./dbt_output
+
+ More info: https://taldbt.netlify.app
+ """.strip())
+
+
+ def _launch_ui():
+     try:
+         import streamlit
+     except ImportError:
+         print("Streamlit not installed. Run: pip install taldbt[ui]")
+         sys.exit(1)
+
+     ui_path = os.path.join(os.path.dirname(__file__), "ui", "app.py")
+     os.system(f"streamlit run {ui_path}")
+
+
+ def _discover(input_path, out_path):
+     from taldbt.parsers.project_scanner import scan_project
+     from taldbt.parsers.xml_parser import parse_job
+     from taldbt.models.ast_models import ProjectAST, JobType
+     from taldbt.graphing.dag_builder import apply_dag_to_project
+     import json
+
+     print(f"[taldbt] Scanning {input_path}...")
+
+     scan = scan_project(input_path)
+     project = ProjectAST(
+         project_name=os.path.basename(input_path),
+         input_path=input_path,
+     )
+
+     for entry in scan["process_jobs"]:
+         job = parse_job(entry["path"], entry["name"])
+         project.jobs[entry["name"]] = job
+
+     for entry in scan.get("joblets", []):
+         job = parse_job(entry["path"], entry["name"], JobType.JOBLET)
+         project.joblets[entry["name"]] = job
+
+     apply_dag_to_project(project)
+
+     with open(out_path, "w", encoding="utf-8") as f:
+         json.dump(project.model_dump(), f, indent=2, default=str)
+
+     n_jobs = len(project.jobs)
+     n_src = sum(len(j.sources) for j in project.jobs.values())
+     n_comp = sum(len(j.components) for j in project.jobs.values())
+     print(f"[taldbt] {n_jobs} jobs, {n_src} sources, {n_comp} components")
+     print(f"[taldbt] Discovery complete -> {out_path}")
+
+
+ def _migrate(input_path, output_path):
+     from taldbt.parsers.project_scanner import scan_project
+     from taldbt.parsers.xml_parser import parse_job
+     from taldbt.models.ast_models import ProjectAST
+     from taldbt.graphing.dag_builder import apply_dag_to_project
+     from taldbt.codegen.model_assembler import assemble_model
+     from taldbt.codegen.dbt_scaffolder import scaffold_dbt_project, write_model_file
+
+     print(f"[taldbt] Migrating {input_path} -> {output_path}")
+
+     scan = scan_project(input_path)
+     project = ProjectAST(
+         project_name=os.path.basename(input_path),
+         input_path=input_path,
+     )
+
+     for entry in scan["process_jobs"]:
+         job = parse_job(entry["path"], entry["name"])
+         project.jobs[entry["name"]] = job
+
+     apply_dag_to_project(project)
+     scaffold_dbt_project(project, output_path)
+
+     generated = 0
+     for name, job in project.jobs.items():
+         sql = assemble_model(job)
+         if sql:
+             write_model_file(sql, name, output_path)
+             generated += 1
+             print(f" [ok] {name}")
+
+     print(f"[taldbt] Done. {generated} models generated in {output_path}")
+
+
+ if __name__ == "__main__":
+     main()
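One note on `_launch_ui`: `os.system(f"streamlit run {ui_path}")` relies on `streamlit` being on PATH and breaks if the install path contains spaces. A possible alternative, shown here only as a sketch and not as what the package ships, is to reuse the current interpreter via `subprocess`:

```python
# Sketch: launch the bundled Streamlit app through the current interpreter.
# Passing arguments as a list avoids shell quoting issues with spaces in paths.
import os
import subprocess
import sys

ui_path = os.path.join(os.path.dirname(__file__), "ui", "app.py")
subprocess.run([sys.executable, "-m", "streamlit", "run", ui_path], check=True)
```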
taldbt-0.2.0/taldbt/codegen/__init__.py ADDED
File without changes
taldbt-0.2.0/taldbt/codegen/dbt_scaffolder.py ADDED
@@ -0,0 +1,200 @@
+ """
+ dbt Project Scaffolder: generates dbt_project.yml, profiles.yml, sources.yml,
+ schema.yml, and the folder structure for the output dbt project.
+ """
+ from __future__ import annotations
+ import os
+ import re
+ from pathlib import Path
+ import yaml
+ from taldbt.models.ast_models import ProjectAST, JobAST, SourceInfo, ColumnSchema
+
+
+ def scaffold_dbt_project(project: ProjectAST, output_dir: str):
+     """Generate a complete dbt project structure from the parsed Talend project."""
+     os.makedirs(output_dir, exist_ok=True)
+     os.makedirs(os.path.join(output_dir, "models", "staging"), exist_ok=True)
+     os.makedirs(os.path.join(output_dir, "models", "intermediate"), exist_ok=True)
+     os.makedirs(os.path.join(output_dir, "models", "marts"), exist_ok=True)
+     os.makedirs(os.path.join(output_dir, "macros"), exist_ok=True)
+     os.makedirs(os.path.join(output_dir, "tests"), exist_ok=True)
+     os.makedirs(os.path.join(output_dir, "seeds"), exist_ok=True)
+
+     _write_dbt_project_yml(project, output_dir)
+     _write_profiles_yml(project, output_dir)
+     _write_sources_yml(project, output_dir)
+     _write_schema_yml(project, output_dir)
+
+
+ def _write_dbt_project_yml(project: ProjectAST, output_dir: str):
+     name = project.project_name.replace("-", "_").replace(" ", "_").lower() or "taldbt_project"
+     config = {
+         "name": name,
+         "version": "1.0.0",
+         "config-version": 2,
+         "profile": name,
+         "model-paths": ["models"],
+         "test-paths": ["tests"],
+         "macro-paths": ["macros"],
+         "seed-paths": ["seeds"],
+         "models": {
+             name: {
+                 "staging": {"materialized": "view"},
+                 "intermediate": {"materialized": "view"},
+                 "marts": {"materialized": "table"},
+             }
+         },
+     }
+
+     # Add context variables as dbt vars
+     all_vars = {}
+     for ctx_name, ctx_vars in project.contexts.items():
+         all_vars.update(ctx_vars)
+     if all_vars:
+         config["vars"] = all_vars
+
+     path = os.path.join(output_dir, "dbt_project.yml")
+     with open(path, "w", encoding="utf-8") as f:
+         f.write("# Generated by taldbt - Talend to dbt migration tool\n\n")
+         yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+
+
+ def _write_profiles_yml(project: ProjectAST, output_dir: str):
+     name = project.project_name.replace("-", "_").replace(" ", "_").lower() or "taldbt_project"
+     # Use absolute paths so dbt always finds the right DuckDB file
+     dev_db = os.path.join(output_dir, "dev.duckdb").replace("\\", "/")
+     prod_db = os.path.join(output_dir, "prod.duckdb").replace("\\", "/")
+
+     profile = {
+         name: {
+             "target": "dev",
+             "outputs": {
+                 "dev": {
+                     "type": "duckdb",
+                     "path": dev_db,
+                     "threads": 4,
+                 },
+                 "prod": {
+                     "type": "duckdb",
+                     "path": prod_db,
+                     "threads": 8,
+                 },
+             },
+         },
+     }
+
+     path = os.path.join(output_dir, "profiles.yml")
+     with open(path, "w", encoding="utf-8") as f:
+         f.write("# Generated by taldbt\n")
+         f.write("# Change 'type: duckdb' to snowflake/bigquery/etc for production\n\n")
+         yaml.dump(profile, f, default_flow_style=False, sort_keys=False)
+
+
+ def _write_sources_yml(project: ProjectAST, output_dir: str):
+     """Generate sources.yml from the discovered source catalog."""
+     if not project.source_catalog:
+         return
+
+     # Group sources by database
+     by_db: dict[str, list[SourceInfo]] = {}
+     for sid, src in project.source_catalog.items():
+         db = "raw"
+         if src.connection and src.connection.database:
+             db = src.connection.database.replace('"', '').replace("'", "")
+         by_db.setdefault(db, []).append(src)
+
+     sources = []
+     for db_name, src_list in by_db.items():
+         tables = []
+         for src in src_list:
+             tbl_name = src.connection.table.replace('"', '').replace("'", "") if src.connection else src.source_id
+             table_entry = {"name": tbl_name}
+
+             # Add column definitions if available
+             if src.columns:
+                 table_entry["columns"] = [
+                     {"name": c.name, "description": c.comment or ""}
+                     for c in src.columns
+                 ]
+             tables.append(table_entry)
+
+         sources.append({
+             "name": db_name.replace("-", "_").replace(".", "_"),
+             "tables": tables,
+         })
+
+     config = {"version": 2, "sources": sources}
+     path = os.path.join(output_dir, "models", "sources.yml")
+     with open(path, "w", encoding="utf-8") as f:
+         f.write("# Generated by taldbt from Talend source metadata\n\n")
+         yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+
+
+ def _write_schema_yml(project: ProjectAST, output_dir: str):
+     """Generate schema.yml with model definitions and basic tests."""
+     models = []
+     for name, job in project.jobs.items():
+         if job.job_type.value == "ORCHESTRATION":
+             continue
+
+         model_entry = {
+             "name": _model_name(name),
+             "description": f"Migrated from Talend job: {name}",
+         }
+
+         # Add column tests from output schemas (deduplicated)
+         columns = []
+         seen_col_names = set()
+         for comp in job.components.values():
+             if comp.behavior.value == "DATA_SINK" and comp.schemas:
+                 # Use FLOW connector only, skip REJECT
+                 flow_cols = comp.schemas.get("FLOW", [])
+                 if not flow_cols:
+                     # Fallback: use first non-REJECT connector
+                     for conn_name, cols in comp.schemas.items():
+                         if conn_name != "REJECT":
+                             flow_cols = cols
+                             break
+                 for col in flow_cols:
+                     if col.name.lower() in seen_col_names or not col.name:
+                         continue
+                     seen_col_names.add(col.name.lower())
+                     col_entry = {"name": col.name}
+                     tests = []
+                     if col.is_key:
+                         tests.extend(["not_null", "unique"])
+                     elif not col.nullable:
+                         tests.append("not_null")
+                     if tests:
+                         col_entry["tests"] = tests
+                     columns.append(col_entry)
+                 break  # only use first DATA_SINK component
+
+         if columns:
+             model_entry["columns"] = columns
+         models.append(model_entry)
+
+     if models:
+         config = {"version": 2, "models": models}
+         path = os.path.join(output_dir, "models", "schema.yml")
+         with open(path, "w", encoding="utf-8") as f:
+             f.write("# Generated by taldbt\n\n")
+             yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+
+
+ def _model_name(job_name: str) -> str:
+     """Convert Talend job name to a clean dbt model name."""
+     name = job_name.replace(" ", "_").lower()
+     # Remove version suffix: _0.1, _1.0
+     import re
+     name = re.sub(r'_\d+\.\d+$', '', name)
+     return name
+
+
+ def write_model_file(sql: str, job_name: str, output_dir: str, subfolder: str = "marts") -> str:
+     """Write a generated SQL model to the dbt project."""
+     model_name = _model_name(job_name)
+     out = Path(output_dir) / "models" / subfolder / f"{model_name}.sql"
+     out.parent.mkdir(parents=True, exist_ok=True)
+     out.write_text(sql, encoding="utf-8")
+     return str(out)
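To see what the scaffolder emits on its own, a minimal driver along these lines should work, assuming `ProjectAST` defaults its `jobs`, `contexts`, and `source_catalog` collections to empty (the CLI constructs it with only `project_name` and `input_path`):

```python
# Hedged usage sketch; the demo paths and job name are invented.
from taldbt.codegen.dbt_scaffolder import scaffold_dbt_project, write_model_file
from taldbt.models.ast_models import ProjectAST

project = ProjectAST(project_name="demo_project", input_path="./demo_talend")
scaffold_dbt_project(project, "./dbt_output")   # dbt_project.yml, profiles.yml, folders

# write_model_file drops a generated SQL model into models/marts/,
# stripping the Talend version suffix from the job name.
write_model_file("select 1 as id", "Demo Job_0.1", "./dbt_output")
# -> ./dbt_output/models/marts/demo_job.sql
```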