taldbt 0.2.0 (taldbt-0.2.0.tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- taldbt-0.2.0/LICENSE +21 -0
- taldbt-0.2.0/MANIFEST.in +5 -0
- taldbt-0.2.0/PKG-INFO +126 -0
- taldbt-0.2.0/README.md +79 -0
- taldbt-0.2.0/pyproject.toml +74 -0
- taldbt-0.2.0/setup.cfg +4 -0
- taldbt-0.2.0/taldbt/__init__.py +3 -0
- taldbt-0.2.0/taldbt/__main__.py +5 -0
- taldbt-0.2.0/taldbt/cli.py +144 -0
- taldbt-0.2.0/taldbt/codegen/__init__.py +0 -0
- taldbt-0.2.0/taldbt/codegen/dbt_scaffolder.py +200 -0
- taldbt-0.2.0/taldbt/codegen/model_assembler.py +168 -0
- taldbt-0.2.0/taldbt/codegen/sql_generator.py +487 -0
- taldbt-0.2.0/taldbt/engine/__init__.py +0 -0
- taldbt-0.2.0/taldbt/engine/duckdb_engine.py +174 -0
- taldbt-0.2.0/taldbt/engine/self_healing.py +3 -0
- taldbt-0.2.0/taldbt/engine/test_data_generator.py +563 -0
- taldbt-0.2.0/taldbt/engine/validation.py +364 -0
- taldbt-0.2.0/taldbt/expert/__init__.py +1 -0
- taldbt-0.2.0/taldbt/expert/component_kb.py +764 -0
- taldbt-0.2.0/taldbt/expert/job_analyzer.py +368 -0
- taldbt-0.2.0/taldbt/expert/migration_engine.py +350 -0
- taldbt-0.2.0/taldbt/graphing/__init__.py +0 -0
- taldbt-0.2.0/taldbt/graphing/dag_builder.py +198 -0
- taldbt-0.2.0/taldbt/graphing/data_lineage.py +229 -0
- taldbt-0.2.0/taldbt/llm/__init__.py +0 -0
- taldbt-0.2.0/taldbt/llm/knowledge_base.py +656 -0
- taldbt-0.2.0/taldbt/llm/llm_provider.py +433 -0
- taldbt-0.2.0/taldbt/llm/ollama_client.py +337 -0
- taldbt-0.2.0/taldbt/models/__init__.py +0 -0
- taldbt-0.2.0/taldbt/models/ast_models.py +371 -0
- taldbt-0.2.0/taldbt/orchestration/__init__.py +0 -0
- taldbt-0.2.0/taldbt/orchestration/autopilot.py +386 -0
- taldbt-0.2.0/taldbt/orchestration/workflow_generator.py +413 -0
- taldbt-0.2.0/taldbt/parsers/__init__.py +0 -0
- taldbt-0.2.0/taldbt/parsers/classifier.py +197 -0
- taldbt-0.2.0/taldbt/parsers/components/__init__.py +6 -0
- taldbt-0.2.0/taldbt/parsers/components/aggregate_parser.py +96 -0
- taldbt-0.2.0/taldbt/parsers/components/dedup_parser.py +78 -0
- taldbt-0.2.0/taldbt/parsers/components/filter_parser.py +104 -0
- taldbt-0.2.0/taldbt/parsers/components/input_parser.py +137 -0
- taldbt-0.2.0/taldbt/parsers/components/sort_parser.py +61 -0
- taldbt-0.2.0/taldbt/parsers/components/tmap_parser.py +182 -0
- taldbt-0.2.0/taldbt/parsers/project_scanner.py +47 -0
- taldbt-0.2.0/taldbt/parsers/xml_parser.py +354 -0
- taldbt-0.2.0/taldbt/tests/__init__.py +0 -0
- taldbt-0.2.0/taldbt/tests/e2e_pipeline_test.py +202 -0
- taldbt-0.2.0/taldbt/tests/test_llm_provider.py +101 -0
- taldbt-0.2.0/taldbt/ui/app.py +1056 -0
- taldbt-0.2.0/taldbt/ui/favicon.svg +5 -0
- taldbt-0.2.0/taldbt/ui/logo.svg +14 -0
- taldbt-0.2.0/taldbt/ui/particles.html +201 -0
- taldbt-0.2.0/taldbt.egg-info/PKG-INFO +126 -0
- taldbt-0.2.0/taldbt.egg-info/SOURCES.txt +56 -0
- taldbt-0.2.0/taldbt.egg-info/dependency_links.txt +1 -0
- taldbt-0.2.0/taldbt.egg-info/entry_points.txt +2 -0
- taldbt-0.2.0/taldbt.egg-info/requires.txt +22 -0
- taldbt-0.2.0/taldbt.egg-info/top_level.txt +1 -0
taldbt-0.2.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025-2026 Sourav Roy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
taldbt-0.2.0/MANIFEST.in
ADDED
taldbt-0.2.0/PKG-INFO
ADDED
@@ -0,0 +1,126 @@
Metadata-Version: 2.4
Name: taldbt
Version: 0.2.0
Summary: AI-powered Talend to dbt migration. $0 cost. No infrastructure. One click.
Author-email: Sourav Roy <souravroy.etl@gmail.com>
License: MIT
Project-URL: Homepage, https://taldbt.netlify.app
Project-URL: Documentation, https://taldbt.netlify.app
Project-URL: Demo, https://taldbt.streamlit.app
Project-URL: Docker, https://hub.docker.com/r/souravetl/taldbt
Project-URL: Issues, https://github.com/SouravRoy-ETL/taldbt/issues
Keywords: talend,dbt,migration,etl,data-engineering,duckdb,temporal,ai,llm,sql
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: Topic :: Database
Classifier: Topic :: Software Development :: Code Generators
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: lxml>=5.0
Requires-Dist: networkx>=3.0
Requires-Dist: pydantic>=2.0
Requires-Dist: duckdb>=1.0
Requires-Dist: jinja2>=3.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: rich>=13.0
Requires-Dist: requests>=2.31
Requires-Dist: dbt-core>=1.7
Requires-Dist: dbt-duckdb>=1.7
Requires-Dist: sqlglot>=25.0
Requires-Dist: faker>=28.0
Provides-Extra: ui
Requires-Dist: streamlit>=1.30; extra == "ui"
Provides-Extra: temporal
Requires-Dist: temporalio>=1.7; extra == "temporal"
Provides-Extra: all
Requires-Dist: streamlit>=1.30; extra == "all"
Requires-Dist: temporalio>=1.7; extra == "all"
Dynamic: license-file

# taldbt — AI Powered Talend to dbt Migration

Convert legacy Talend ETL to modern dbt SQL using semantic AI transpilation.

**Product Page:** https://taldbt.netlify.app
**Live Demo:** https://taldbt.streamlit.app
**Docker Image:** `docker pull souravetl/taldbt:latest`

## Quick Start

### Docker (recommended)
```bash
docker pull souravetl/taldbt:latest
docker pull ollama/ollama:latest
docker compose up -d
docker exec taldbt-ollama ollama pull qwen3-coder:30b
# Open http://localhost:8501
```

### Cloud (no install)
Upload your Talend ZIP at https://taldbt.streamlit.app

### Local Development
```bash
pip install -r requirements.txt
streamlit run taldbt/ui/app.py
```

## Tech Stack

| Component | Purpose |
|-----------|---------|
| DuckDB + Flock | In-process analytics + LLM-in-SQL validation |
| dbt-core | SQL transformation framework |
| Temporal.io | DAG-aware workflow orchestration |
| Ollama / Cerebras / Groq | AI translation (local or cloud) |
| sqlglot | Multi-dialect SQL transpilation |
| Faker | Synthetic test data with FK integrity |
| networkx | Dependency graph + topological sort |
| lxml + Pydantic | XML parsing + type-safe AST |

## Project Structure

```
taldbt/
├── Dockerfile              # Tier 1: Docker image
├── docker-compose.yml      # Tier 1: full stack
├── docker-compose.cpu.yml  # Tier 1: no-GPU override
├── docker/entrypoint.sh    # Docker startup script
├── requirements.txt        # Python dependencies
├── packages.txt            # Tier 3: apt deps (Streamlit Cloud)
├── .streamlit/             # Streamlit config + secrets
├── dist/                   # Air-gapped distribution package
├── docs/                   # Architecture + knowledge transfer
├── main.py                 # CLI entry point
└── taldbt/                 # Core application
    ├── ui/                 # Streamlit web app
    ├── parsers/            # XML parsing + component parsers
    ├── codegen/            # SQL generation + dbt scaffolding
    ├── engine/             # DuckDB + validation + test data
    ├── expert/             # Component knowledge base
    ├── graphing/           # DAG builder + data lineage
    ├── llm/                # LLM provider chain
    ├── models/             # Pydantic AST models
    ├── orchestration/      # Temporal + AutoPilot
    └── tests/              # Test suite
```

## Deployment Tiers

- **Tier 1 (Docker):** `docker compose up -d` — Ollama + Temporal + UI
- **Tier 3 (Cloud):** Streamlit Cloud + Cerebras/Groq AI — no local install

## License

Proprietary. Contact souravroy.etl@gmail.com for licensing.

---
Made with care by Sourav Roy
taldbt-0.2.0/README.md
ADDED
@@ -0,0 +1,79 @@
# taldbt — AI Powered Talend to dbt Migration

Convert legacy Talend ETL to modern dbt SQL using semantic AI transpilation.

**Product Page:** https://taldbt.netlify.app
**Live Demo:** https://taldbt.streamlit.app
**Docker Image:** `docker pull souravetl/taldbt:latest`

## Quick Start

### Docker (recommended)
```bash
docker pull souravetl/taldbt:latest
docker pull ollama/ollama:latest
docker compose up -d
docker exec taldbt-ollama ollama pull qwen3-coder:30b
# Open http://localhost:8501
```

### Cloud (no install)
Upload your Talend ZIP at https://taldbt.streamlit.app

### Local Development
```bash
pip install -r requirements.txt
streamlit run taldbt/ui/app.py
```

## Tech Stack

| Component | Purpose |
|-----------|---------|
| DuckDB + Flock | In-process analytics + LLM-in-SQL validation |
| dbt-core | SQL transformation framework |
| Temporal.io | DAG-aware workflow orchestration |
| Ollama / Cerebras / Groq | AI translation (local or cloud) |
| sqlglot | Multi-dialect SQL transpilation |
| Faker | Synthetic test data with FK integrity |
| networkx | Dependency graph + topological sort |
| lxml + Pydantic | XML parsing + type-safe AST |

## Project Structure

```
taldbt/
├── Dockerfile              # Tier 1: Docker image
├── docker-compose.yml      # Tier 1: full stack
├── docker-compose.cpu.yml  # Tier 1: no-GPU override
├── docker/entrypoint.sh    # Docker startup script
├── requirements.txt        # Python dependencies
├── packages.txt            # Tier 3: apt deps (Streamlit Cloud)
├── .streamlit/             # Streamlit config + secrets
├── dist/                   # Air-gapped distribution package
├── docs/                   # Architecture + knowledge transfer
├── main.py                 # CLI entry point
└── taldbt/                 # Core application
    ├── ui/                 # Streamlit web app
    ├── parsers/            # XML parsing + component parsers
    ├── codegen/            # SQL generation + dbt scaffolding
    ├── engine/             # DuckDB + validation + test data
    ├── expert/             # Component knowledge base
    ├── graphing/           # DAG builder + data lineage
    ├── llm/                # LLM provider chain
    ├── models/             # Pydantic AST models
    ├── orchestration/      # Temporal + AutoPilot
    └── tests/              # Test suite
```

## Deployment Tiers

- **Tier 1 (Docker):** `docker compose up -d` — Ollama + Temporal + UI
- **Tier 3 (Cloud):** Streamlit Cloud + Cerebras/Groq AI — no local install

## License

Proprietary. Contact souravroy.etl@gmail.com for licensing.

---
Made with care by Sourav Roy
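The Tech Stack table above credits networkx with dependency graphing and topological sorting. As a hedged illustration of that technique only (not taldbt's actual code; the model names are invented):

```python
import networkx as nx

# Build a toy model-dependency DAG and emit a valid build order —
# the role the Tech Stack table attributes to networkx.
g = nx.DiGraph()
g.add_edges_from([
    ("stg_orders", "int_orders"),
    ("stg_customers", "int_orders"),
    ("int_orders", "fct_orders"),
])
print(list(nx.topological_sort(g)))
# e.g. ['stg_orders', 'stg_customers', 'int_orders', 'fct_orders']
```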
taldbt-0.2.0/pyproject.toml
ADDED
@@ -0,0 +1,74 @@
[build-system]
requires = ["setuptools>=68.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "taldbt"
version = "0.2.0"
description = "AI-powered Talend to dbt migration. $0 cost. No infrastructure. One click."
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.10"
authors = [
    {name = "Sourav Roy", email = "souravroy.etl@gmail.com"}
]
keywords = [
    "talend", "dbt", "migration", "etl", "data-engineering",
    "duckdb", "temporal", "ai", "llm", "sql"
]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Database",
    "Topic :: Software Development :: Code Generators",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
dependencies = [
    "lxml>=5.0",
    "networkx>=3.0",
    "pydantic>=2.0",
    "duckdb>=1.0",
    "jinja2>=3.0",
    "pyyaml>=6.0",
    "rich>=13.0",
    "requests>=2.31",
    "dbt-core>=1.7",
    "dbt-duckdb>=1.7",
    "sqlglot>=25.0",
    "faker>=28.0",
]

[project.optional-dependencies]
ui = ["streamlit>=1.30"]
temporal = ["temporalio>=1.7"]
all = ["streamlit>=1.30", "temporalio>=1.7"]

[project.urls]
Homepage = "https://taldbt.netlify.app"
Documentation = "https://taldbt.netlify.app"
Demo = "https://taldbt.streamlit.app"
Docker = "https://hub.docker.com/r/souravetl/taldbt"
Issues = "https://github.com/SouravRoy-ETL/taldbt/issues"

[project.scripts]
taldbt = "taldbt.cli:main"

[tool.setuptools.packages.find]
include = ["taldbt*"]

[tool.setuptools.package-data]
taldbt = [
    "templates/**/*.sql",
    "templates/**/*.yml",
    "templates/**/*.yaml",
    "expert/**/*.json",
    "ui/**/*.css",
    "ui/**/*.svg",
    "ui/**/*.html",
]
taldbt-0.2.0/setup.cfg
ADDED

taldbt-0.2.0/taldbt/cli.py
ADDED
@@ -0,0 +1,144 @@
"""
taldbt CLI — entry point for `pip install taldbt`

Usage:
    taldbt ui                          # Launch Streamlit web UI
    taldbt discover <talend_dir>       # Scan and analyze a Talend project
    taldbt migrate <talend_dir> <out>  # Full migration to dbt
    taldbt version                     # Print version
"""
import sys
import os


def main():
    args = sys.argv[1:]

    if not args or args[0] in ("-h", "--help", "help"):
        _print_help()
        return

    if args[0] in ("version", "--version", "-v"):
        from taldbt import __version__
        print(f"taldbt v{__version__}")
        return

    if args[0] == "ui":
        _launch_ui()
        return

    if args[0] == "discover" and len(args) >= 2:
        _discover(args[1], args[2] if len(args) > 2 else "migration_plan.json")
        return

    if args[0] == "migrate" and len(args) >= 3:
        _migrate(args[1], args[2])
        return

    print(f"Unknown command: {args[0]}")
    _print_help()
    sys.exit(1)


def _print_help():
    from taldbt import __version__
    print(f"""
taldbt v{__version__} — AI Powered Talend to dbt Migration

Usage:
  taldbt ui                              Launch the web UI
  taldbt discover <talend_dir> [output]  Scan & analyze a Talend project
  taldbt migrate <talend_dir> <out_dir>  Full migration to dbt
  taldbt version                         Print version

Examples:
  taldbt ui
  taldbt discover ./my_talend_project
  taldbt migrate ./my_talend_project ./dbt_output

More info: https://taldbt.netlify.app
""".strip())


def _launch_ui():
    try:
        import streamlit
    except ImportError:
        print("Streamlit not installed. Run: pip install taldbt[ui]")
        sys.exit(1)

    ui_path = os.path.join(os.path.dirname(__file__), "ui", "app.py")
    os.system(f"streamlit run {ui_path}")


def _discover(input_path, out_path):
    from taldbt.parsers.project_scanner import scan_project
    from taldbt.parsers.xml_parser import parse_job
    from taldbt.models.ast_models import ProjectAST, JobType
    from taldbt.graphing.dag_builder import apply_dag_to_project
    import json

    print(f"[taldbt] Scanning {input_path}...")

    scan = scan_project(input_path)
    project = ProjectAST(
        project_name=os.path.basename(input_path),
        input_path=input_path,
    )

    for entry in scan["process_jobs"]:
        job = parse_job(entry["path"], entry["name"])
        project.jobs[entry["name"]] = job

    for entry in scan.get("joblets", []):
        job = parse_job(entry["path"], entry["name"], JobType.JOBLET)
        project.joblets[entry["name"]] = job

    apply_dag_to_project(project)

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(project.model_dump(), f, indent=2, default=str)

    n_jobs = len(project.jobs)
    n_src = sum(len(j.sources) for j in project.jobs.values())
    n_comp = sum(len(j.components) for j in project.jobs.values())
    print(f"[taldbt] {n_jobs} jobs, {n_src} sources, {n_comp} components")
    print(f"[taldbt] Discovery complete -> {out_path}")


def _migrate(input_path, output_path):
    from taldbt.parsers.project_scanner import scan_project
    from taldbt.parsers.xml_parser import parse_job
    from taldbt.models.ast_models import ProjectAST
    from taldbt.graphing.dag_builder import apply_dag_to_project
    from taldbt.codegen.model_assembler import assemble_model
    from taldbt.codegen.dbt_scaffolder import scaffold_dbt_project, write_model_file

    print(f"[taldbt] Migrating {input_path} -> {output_path}")

    scan = scan_project(input_path)
    project = ProjectAST(
        project_name=os.path.basename(input_path),
        input_path=input_path,
    )

    for entry in scan["process_jobs"]:
        job = parse_job(entry["path"], entry["name"])
        project.jobs[entry["name"]] = job

    apply_dag_to_project(project)
    scaffold_dbt_project(project, output_path)

    generated = 0
    for name, job in project.jobs.items():
        sql = assemble_model(job)
        if sql:
            write_model_file(sql, name, output_path)
            generated += 1
            print(f"  [ok] {name}")

    print(f"[taldbt] Done. {generated} models generated in {output_path}")


if __name__ == "__main__":
    main()
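Since `_discover` and `_migrate` are plain functions, the same flows can be driven from Python as well as from the shell. A minimal sketch, assuming a Talend export exists at the given path (paths and the output filename are illustrative, and the underscore prefix marks these helpers as internal):

```python
from taldbt.cli import _discover, _migrate

# The same two steps the CLI dispatches to, called directly.
_discover("./my_talend_project", "migration_plan.json")  # writes the project AST as JSON
_migrate("./my_talend_project", "./dbt_output")          # scaffolds dbt project + models
```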
taldbt-0.2.0/taldbt/codegen/__init__.py
File without changes

taldbt-0.2.0/taldbt/codegen/dbt_scaffolder.py
ADDED
@@ -0,0 +1,200 @@
"""
dbt Project Scaffolder: generates dbt_project.yml, profiles.yml, sources.yml,
schema.yml, and the folder structure for the output dbt project.
"""
from __future__ import annotations
import os
import re
from pathlib import Path
import yaml
from taldbt.models.ast_models import ProjectAST, JobAST, SourceInfo, ColumnSchema


def scaffold_dbt_project(project: ProjectAST, output_dir: str):
    """Generate a complete dbt project structure from the parsed Talend project."""
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, "models", "staging"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "models", "intermediate"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "models", "marts"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "macros"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "tests"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "seeds"), exist_ok=True)

    _write_dbt_project_yml(project, output_dir)
    _write_profiles_yml(project, output_dir)
    _write_sources_yml(project, output_dir)
    _write_schema_yml(project, output_dir)


def _write_dbt_project_yml(project: ProjectAST, output_dir: str):
    name = project.project_name.replace("-", "_").replace(" ", "_").lower() or "taldbt_project"
    config = {
        "name": name,
        "version": "1.0.0",
        "config-version": 2,
        "profile": name,
        "model-paths": ["models"],
        "test-paths": ["tests"],
        "macro-paths": ["macros"],
        "seed-paths": ["seeds"],
        "models": {
            name: {
                "staging": {"materialized": "view"},
                "intermediate": {"materialized": "view"},
                "marts": {"materialized": "table"},
            }
        },
    }

    # Add context variables as dbt vars
    all_vars = {}
    for ctx_name, ctx_vars in project.contexts.items():
        all_vars.update(ctx_vars)
    if all_vars:
        config["vars"] = all_vars

    path = os.path.join(output_dir, "dbt_project.yml")
    with open(path, "w", encoding="utf-8") as f:
        f.write("# Generated by taldbt - Talend to dbt migration tool\n\n")
        yaml.dump(config, f, default_flow_style=False, sort_keys=False)


def _write_profiles_yml(project: ProjectAST, output_dir: str):
    name = project.project_name.replace("-", "_").replace(" ", "_").lower() or "taldbt_project"
    # Use absolute paths so dbt always finds the right DuckDB file
    dev_db = os.path.join(output_dir, "dev.duckdb").replace("\\", "/")
    prod_db = os.path.join(output_dir, "prod.duckdb").replace("\\", "/")

    profile = {
        name: {
            "target": "dev",
            "outputs": {
                "dev": {
                    "type": "duckdb",
                    "path": dev_db,
                    "threads": 4,
                },
                "prod": {
                    "type": "duckdb",
                    "path": prod_db,
                    "threads": 8,
                },
            },
        },
    }

    path = os.path.join(output_dir, "profiles.yml")
    with open(path, "w", encoding="utf-8") as f:
        f.write("# Generated by taldbt\n")
        f.write("# Change 'type: duckdb' to snowflake/bigquery/etc for production\n\n")
        yaml.dump(profile, f, default_flow_style=False, sort_keys=False)


def _write_sources_yml(project: ProjectAST, output_dir: str):
    """Generate sources.yml from the discovered source catalog."""
    if not project.source_catalog:
        return

    # Group sources by database
    by_db: dict[str, list[SourceInfo]] = {}
    for sid, src in project.source_catalog.items():
        db = "raw"
        if src.connection and src.connection.database:
            db = src.connection.database.replace('"', '').replace("'", "")
        by_db.setdefault(db, []).append(src)

    sources = []
    for db_name, src_list in by_db.items():
        tables = []
        for src in src_list:
            tbl_name = src.connection.table.replace('"', '').replace("'", "") if src.connection else src.source_id
            table_entry = {"name": tbl_name}

            # Add column definitions if available
            if src.columns:
                table_entry["columns"] = [
                    {"name": c.name, "description": c.comment or ""}
                    for c in src.columns
                ]
            tables.append(table_entry)

        sources.append({
            "name": db_name.replace("-", "_").replace(".", "_"),
            "tables": tables,
        })

    config = {"version": 2, "sources": sources}
    path = os.path.join(output_dir, "models", "sources.yml")
    with open(path, "w", encoding="utf-8") as f:
        f.write("# Generated by taldbt from Talend source metadata\n\n")
        yaml.dump(config, f, default_flow_style=False, sort_keys=False)


def _write_schema_yml(project: ProjectAST, output_dir: str):
    """Generate schema.yml with model definitions and basic tests."""
    models = []
    for name, job in project.jobs.items():
        if job.job_type.value == "ORCHESTRATION":
            continue

        model_entry = {
            "name": _model_name(name),
            "description": f"Migrated from Talend job: {name}",
        }

        # Add column tests from output schemas (deduplicated)
        columns = []
        seen_col_names = set()
        for comp in job.components.values():
            if comp.behavior.value == "DATA_SINK" and comp.schemas:
                # Use FLOW connector only, skip REJECT
                flow_cols = comp.schemas.get("FLOW", [])
                if not flow_cols:
                    # Fallback: use first non-REJECT connector
                    for conn_name, cols in comp.schemas.items():
                        if conn_name != "REJECT":
                            flow_cols = cols
                            break
                for col in flow_cols:
                    if col.name.lower() in seen_col_names or not col.name:
                        continue
                    seen_col_names.add(col.name.lower())
                    col_entry = {"name": col.name}
                    tests = []
                    if col.is_key:
                        tests.extend(["not_null", "unique"])
                    elif not col.nullable:
                        tests.append("not_null")
                    if tests:
                        col_entry["tests"] = tests
                    columns.append(col_entry)
                break  # only use first DATA_SINK component

        if columns:
            model_entry["columns"] = columns
        models.append(model_entry)

    if models:
        config = {"version": 2, "models": models}
        path = os.path.join(output_dir, "models", "schema.yml")
        with open(path, "w", encoding="utf-8") as f:
            f.write("# Generated by taldbt\n\n")
            yaml.dump(config, f, default_flow_style=False, sort_keys=False)


def _model_name(job_name: str) -> str:
    """Convert Talend job name to a clean dbt model name."""
    name = job_name.replace(" ", "_").lower()
    # Remove version suffix: _0.1, _1.0
    import re
    name = re.sub(r'_\d+\.\d+$', '', name)
    return name


def write_model_file(sql: str, job_name: str, output_dir: str, subfolder: str = "marts") -> str:
    """Write a generated SQL model to the dbt project."""
    model_name = _model_name(job_name)
    out = Path(output_dir) / "models" / subfolder / f"{model_name}.sql"
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(sql, encoding="utf-8")
    return str(out)
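Taken on its own, the scaffolder's public surface shown above is `scaffold_dbt_project` plus `write_model_file`. A minimal sketch, under the assumption (consistent with how `cli.py` constructs `ProjectAST`) that its collection fields such as `jobs`, `contexts`, and `source_catalog` default to empty; the project name and paths are illustrative:

```python
from taldbt.models.ast_models import ProjectAST
from taldbt.codegen.dbt_scaffolder import scaffold_dbt_project, write_model_file

project = ProjectAST(project_name="demo-project", input_path="./talend_src")

# Creates models/{staging,intermediate,marts}, macros/, tests/, seeds/,
# plus dbt_project.yml and profiles.yml (duckdb dev/prod targets).
scaffold_dbt_project(project, "./dbt_out")

# _model_name strips the Talend version suffix: "DemoJob_0.1" -> "demojob"
path = write_model_file("select 1 as id", "DemoJob_0.1", "./dbt_out")
print(path)  # e.g. dbt_out/models/marts/demojob.sql
```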