python-infrakit-dev 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infrakit/__init__.py +0 -0
- infrakit/cli/__init__.py +1 -0
- infrakit/cli/commands/__init__.py +1 -0
- infrakit/cli/commands/deps.py +530 -0
- infrakit/cli/commands/init.py +129 -0
- infrakit/cli/commands/llm.py +295 -0
- infrakit/cli/commands/logger.py +160 -0
- infrakit/cli/commands/module.py +342 -0
- infrakit/cli/commands/time.py +81 -0
- infrakit/cli/main.py +65 -0
- infrakit/core/__init__.py +0 -0
- infrakit/core/config/__init__.py +0 -0
- infrakit/core/config/converter.py +480 -0
- infrakit/core/config/exporter.py +304 -0
- infrakit/core/config/loader.py +713 -0
- infrakit/core/config/validator.py +389 -0
- infrakit/core/logger/__init__.py +21 -0
- infrakit/core/logger/formatters.py +143 -0
- infrakit/core/logger/handlers.py +322 -0
- infrakit/core/logger/retention.py +176 -0
- infrakit/core/logger/setup.py +314 -0
- infrakit/deps/__init__.py +239 -0
- infrakit/deps/clean.py +141 -0
- infrakit/deps/depfile.py +405 -0
- infrakit/deps/health.py +357 -0
- infrakit/deps/optimizer.py +642 -0
- infrakit/deps/scanner.py +550 -0
- infrakit/llm/__init__.py +35 -0
- infrakit/llm/batch.py +165 -0
- infrakit/llm/client.py +575 -0
- infrakit/llm/key_manager.py +728 -0
- infrakit/llm/llm_readme.md +306 -0
- infrakit/llm/models.py +148 -0
- infrakit/llm/providers/__init__.py +5 -0
- infrakit/llm/providers/base.py +112 -0
- infrakit/llm/providers/gemini.py +164 -0
- infrakit/llm/providers/openai.py +168 -0
- infrakit/llm/rate_limiter.py +54 -0
- infrakit/scaffolder/__init__.py +31 -0
- infrakit/scaffolder/ai.py +508 -0
- infrakit/scaffolder/backend.py +555 -0
- infrakit/scaffolder/cli_tool.py +386 -0
- infrakit/scaffolder/generator.py +338 -0
- infrakit/scaffolder/pipeline.py +562 -0
- infrakit/scaffolder/registry.py +121 -0
- infrakit/time/__init__.py +60 -0
- infrakit/time/profiler.py +511 -0
- python_infrakit_dev-0.1.0.dist-info/METADATA +124 -0
- python_infrakit_dev-0.1.0.dist-info/RECORD +51 -0
- python_infrakit_dev-0.1.0.dist-info/WHEEL +4 -0
- python_infrakit_dev-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,562 @@
|
|
|
1
|
+
"""
|
|
2
|
+
infrakit.scaffolder.templates.pipeline
|
|
3
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
4
|
+
Scaffold a data pipeline / ETL project.
|
|
5
|
+
|
|
6
|
+
Designed for batch jobs that extract, transform, and load data —
|
|
7
|
+
with or without an LLM enrichment step.
|
|
8
|
+
|
|
9
|
+
Layout
|
|
10
|
+
------
|
|
11
|
+
<project>/
|
|
12
|
+
├── src/
|
|
13
|
+
│ └── __init__.py
|
|
14
|
+
├── pipeline/
|
|
15
|
+
│ ├── __init__.py
|
|
16
|
+
│ ├── extract.py # pull from source(s)
|
|
17
|
+
│ ├── transform.py # clean / reshape
|
|
18
|
+
│ ├── enrich.py # optional LLM enrichment step
|
|
19
|
+
│ ├── load.py # write to destination
|
|
20
|
+
│ └── runner.py # orchestrator — runs the full DAG
|
|
21
|
+
├── schemas/
|
|
22
|
+
│ ├── __init__.py
|
|
23
|
+
│ └── records.py # Pydantic models for input/output records
|
|
24
|
+
├── data/
|
|
25
|
+
│ ├── input/ # raw source files (not committed)
|
|
26
|
+
│ ├── staging/ # intermediate work (not committed)
|
|
27
|
+
│ └── output/ # final output (not committed)
|
|
28
|
+
├── utils/
|
|
29
|
+
│ ├── __init__.py
|
|
30
|
+
│ ├── logger.py
|
|
31
|
+
│ └── llm.py # optional
|
|
32
|
+
├── tests/
|
|
33
|
+
│ ├── __init__.py
|
|
34
|
+
│ └── test_pipeline.py
|
|
35
|
+
├── logs/
|
|
36
|
+
├── pyproject.toml / requirements.txt
|
|
37
|
+
├── config.{env|yaml|json}
|
|
38
|
+
├── README.md
|
|
39
|
+
└── .gitignore
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
from pathlib import Path
|
|
45
|
+
|
|
46
|
+
from infrakit.scaffolder.generator import (
|
|
47
|
+
ScaffoldResult,
|
|
48
|
+
_mkdir,
|
|
49
|
+
_write,
|
|
50
|
+
_config_content,
|
|
51
|
+
_gitignore,
|
|
52
|
+
_logger_util,
|
|
53
|
+
_src_init,
|
|
54
|
+
_tests_init,
|
|
55
|
+
_pyproject_toml,
|
|
56
|
+
_requirements_txt,
|
|
57
|
+
)
|
|
58
|
+
from infrakit.scaffolder.ai import _llm_util
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ── template content ──────────────────────────────────────────────────────────
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _pipeline_pkg_init() -> str:
    """Return the ``pipeline/__init__.py`` template (documents the stage contract)."""
    return '''\
"""
pipeline
~~~~~~~~
Stages are independent modules; runner.py wires them in order.

Stage contract
--------------
Each stage exposes a ``run(**kwargs)`` function that:
- Reads from a well-known location (data/input, data/staging, etc.)
- Writes its output to the next location
- Returns a summary dict {records_in, records_out, errors}
- Is idempotent where possible
"""
'''
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _extract() -> str:
    """Return the ``pipeline/extract.py`` template (source → data/input/)."""
    return '''\
"""
pipeline.extract
~~~~~~~~~~~~~~~~
Pull records from source(s) into data/input/.
"""

from pathlib import Path

from utils.logger import get_logger

log = get_logger(__name__)
IN_DIR = Path("data/input")


def run(source: str = "") -> dict:
    IN_DIR.mkdir(parents=True, exist_ok=True)
    log.info("extract: starting (source=%r)", source)

    records = []
    # TODO: fetch from API / database / files and append to `records`

    log.info("extract: %d records fetched", len(records))
    return {"records_in": 0, "records_out": len(records), "errors": 0}
'''
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _transform() -> str:
    """Return the ``pipeline/transform.py`` template (data/input/ → data/staging/)."""
    return '''\
"""
pipeline.transform
~~~~~~~~~~~~~~~~~~
Clean and reshape records from data/input/ → data/staging/.
"""

from pathlib import Path

from utils.logger import get_logger

log = get_logger(__name__)
IN_DIR = Path("data/input")
STAGING_DIR = Path("data/staging")


def run() -> dict:
    STAGING_DIR.mkdir(parents=True, exist_ok=True)
    log.info("transform: starting")

    errors = 0
    records_out = 0

    # TODO: read files from IN_DIR, clean/reshape, write to STAGING_DIR

    log.info("transform: %d records, %d errors", records_out, errors)
    return {"records_in": 0, "records_out": records_out, "errors": errors}
'''
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _enrich(include_llm: bool) -> str:
    """Return the ``pipeline/enrich.py`` template.

    Parameters
    ----------
    include_llm:
        When True, the generated stage batch-enriches records via
        ``utils.llm``; otherwise it is a plain placeholder stage with the
        same ``run() -> dict`` contract.
    """
    if include_llm:
        return '''\
"""
pipeline.enrich
~~~~~~~~~~~~~~~
Optional LLM enrichment step — adds AI-generated fields to records.
Reads from data/staging/, writes enriched records back to data/staging/.
"""

from pathlib import Path

from utils.llm import llm, Prompt
from utils.logger import get_logger

log = get_logger(__name__)
STAGING_DIR = Path("data/staging")


def run(provider: str = "openai", batch_size: int = 50) -> dict:
    log.info("enrich: starting (provider=%s)", provider)

    # TODO: load records from STAGING_DIR
    raw_texts: list[str] = []

    if not raw_texts:
        log.info("enrich: nothing to enrich")
        return {"records_in": 0, "records_out": 0, "errors": 0}

    prompts = [Prompt(user=text) for text in raw_texts]
    batch = llm.batch_generate(prompts, provider=provider)

    errors = batch.failure_count
    log.info(
        "enrich: %d ok, %d errors, %d tokens",
        batch.success_count, errors, batch.total_tokens,
    )

    # TODO: merge batch.results back into records and write to STAGING_DIR

    return {
        "records_in": len(raw_texts),
        "records_out": batch.success_count,
        "errors": errors,
    }


if __name__ == "__main__":
    run()
'''
    else:
        return '''\
"""
pipeline.enrich
~~~~~~~~~~~~~~~
Placeholder enrichment step — add derived / computed fields to records.
Reads from data/staging/, writes back to data/staging/.
"""

from pathlib import Path

from utils.logger import get_logger

log = get_logger(__name__)
STAGING_DIR = Path("data/staging")


def run() -> dict:
    log.info("enrich: starting")
    # TODO: load records, compute derived fields, write back
    return {"records_in": 0, "records_out": 0, "errors": 0}
'''
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _load() -> str:
    """Return the ``pipeline/load.py`` template (data/staging/ → destination)."""
    return '''\
"""
pipeline.load
~~~~~~~~~~~~~
Write staged records to the final destination (data/output/, DB, API …).
"""

from pathlib import Path

from utils.logger import get_logger

log = get_logger(__name__)
STAGE_DIR = Path("data/staging")
OUTPUT_DIR = Path("data/output")


def run(destination: str = "file") -> dict:
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    log.info("load: starting (destination=%r)", destination)

    records_written = 0
    errors = 0

    if destination == "file":
        # TODO: read from STAGE_DIR, write to OUTPUT_DIR
        pass
    else:
        # TODO: write to database / external API
        pass

    log.info("load: %d written, %d errors", records_written, errors)
    return {"records_in": 0, "records_out": records_written, "errors": errors}
'''
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _runner(include_llm: bool) -> str:
|
|
252
|
+
enrich_import = "from pipeline import enrich\n" if True else ""
|
|
253
|
+
enrich_call = (
|
|
254
|
+
" summary['enrich'] = enrich.run(provider=provider)\n"
|
|
255
|
+
if include_llm else
|
|
256
|
+
" summary['enrich'] = enrich.run()\n"
|
|
257
|
+
)
|
|
258
|
+
provider_param = (
|
|
259
|
+
" provider: str = \"openai\","
|
|
260
|
+
if include_llm else ""
|
|
261
|
+
)
|
|
262
|
+
return f'''\
|
|
263
|
+
"""
|
|
264
|
+
pipeline.runner
|
|
265
|
+
~~~~~~~~~~~~~~~
|
|
266
|
+
Orchestrates the full extract → transform → enrich → load sequence.
|
|
267
|
+
|
|
268
|
+
Run the full pipeline:
|
|
269
|
+
python -m pipeline.runner
|
|
270
|
+
|
|
271
|
+
Run individual stages:
|
|
272
|
+
python -m pipeline.extract
|
|
273
|
+
python -m pipeline.transform
|
|
274
|
+
python -m pipeline.enrich
|
|
275
|
+
python -m pipeline.load
|
|
276
|
+
"""
|
|
277
|
+
|
|
278
|
+
from pipeline import extract, transform, enrich, load
|
|
279
|
+
from utils.logger import get_logger
|
|
280
|
+
|
|
281
|
+
log = get_logger(__name__)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def run(
|
|
285
|
+
source: str = "",
|
|
286
|
+
destination: str = "file",
|
|
287
|
+
{(" provider: str = 'openai'," if include_llm else "")}
|
|
288
|
+
) -> dict:
|
|
289
|
+
log.info("pipeline: start")
|
|
290
|
+
summary = {{}}
|
|
291
|
+
|
|
292
|
+
summary["extract"] = extract.run(source=source)
|
|
293
|
+
summary["transform"] = transform.run()
|
|
294
|
+
{enrich_call} summary["load"] = load.run(destination=destination)
|
|
295
|
+
|
|
296
|
+
total_errors = sum(s.get("errors", 0) for s in summary.values())
|
|
297
|
+
log.info("pipeline: done — %d total errors", total_errors)
|
|
298
|
+
return summary
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
if __name__ == "__main__":
|
|
302
|
+
import json, sys
|
|
303
|
+
result = run()
|
|
304
|
+
print(json.dumps(result, indent=2))
|
|
305
|
+
sys.exit(0 if all(s.get("errors", 0) == 0 for s in result.values()) else 1)
|
|
306
|
+
'''
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _schemas_records() -> str:
    """Return the ``schemas/records.py`` template (Pydantic data contracts)."""
    return '''\
"""
schemas.records
~~~~~~~~~~~~~~~
Pydantic models for input and output records.

Define your data contracts here so every pipeline stage can import and
validate against them.
"""

from typing import Optional
from pydantic import BaseModel


class InputRecord(BaseModel):
    """Raw record as received from the source."""
    id: str
    raw_text: str


class OutputRecord(BaseModel):
    """Enriched / transformed record written to the destination."""
    id: str
    processed_text: str
    enriched_field: Optional[str] = None
'''
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _test_pipeline() -> str:
    """Return the ``tests/test_pipeline.py`` template (per-stage smoke tests)."""
    return '''\
"""tests.test_pipeline — smoke tests for each stage."""

import pytest
from unittest.mock import patch


def test_extract_returns_summary():
    from pipeline import extract
    # patch out any I/O so the test stays offline
    result = extract.run(source="")
    assert "records_out" in result
    assert "errors" in result


def test_transform_returns_summary():
    from pipeline import transform
    result = transform.run()
    assert "records_out" in result


def test_load_returns_summary(tmp_path, monkeypatch):
    import pipeline.load as load_mod
    monkeypatch.setattr(load_mod, "OUTPUT_DIR", tmp_path / "output")
    result = load_mod.run(destination="file")
    assert "records_out" in result


def test_runner_returns_all_stages():
    from pipeline.runner import run
    result = run()
    assert set(result.keys()) >= {"extract", "transform", "enrich", "load"}
'''
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _pipeline_pyproject(
    project_name: str, version: str, description: str, author: str, include_llm: bool
) -> str:
    """Return the ``pyproject.toml`` template for a pipeline project.

    Parameters
    ----------
    project_name:
        Used as the ``[project] name``.
    version, description, author:
        Interpolated into the corresponding TOML fields; an empty *author*
        leaves a commented-out placeholder entry.
    include_llm:
        When True, adds the LLM SDK dependencies (openai, google-generativeai,
        tqdm) to ``dependencies``.
    """
    author_line = f'    "{author}",' if author else '    # "Your Name <you@example.com>",'
    llm_deps = """\
    "openai",
    "google-generativeai",
    "tqdm",
""" if include_llm else ""
    return f"""\
[project]
name = "{project_name}"
version = "{version}"
description = "{description}"
readme = "README.md"
requires-python = ">=3.10"
authors = [
{author_line}
]

dependencies = [
    "infrakit",
    "pydantic>=2.0",
{llm_deps}]

[project.optional-dependencies]
dev = [
    "pytest",
    "pytest-cov",
]
"""
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _pipeline_readme(project_name: str, description: str, include_llm: bool) -> str:
    """Return the project ``README.md`` template.

    The title is derived from *project_name* (dashes/underscores → spaces,
    title-cased); *description* and the LLM note are included only when
    non-empty / enabled.
    """
    title = project_name.replace("-", " ").replace("_", " ").title()
    desc_line = f"\n{description}\n" if description else ""
    llm_note = (
        "\nIncludes an LLM enrichment step via `infrakit.llm`. "
        "Set `OPENAI_API_KEY` or `GEMINI_API_KEY` to enable it.\n"
    ) if include_llm else ""
    return f"""\
# {title}
{desc_line}{llm_note}
## Setup

```bash
pip install -e .
```

## Run

```bash
# full pipeline
python -m pipeline.runner

# individual stages
python -m pipeline.extract
python -m pipeline.transform
python -m pipeline.enrich
python -m pipeline.load
```

## Structure

| Path | Purpose |
|---|---|
| `pipeline/extract.py` | Pull records from source |
| `pipeline/transform.py` | Clean and reshape |
| `pipeline/enrich.py` | {"LLM enrichment" if include_llm else "Computed / derived fields"} |
| `pipeline/load.py` | Write to destination |
| `pipeline/runner.py` | Orchestrate all stages |
| `schemas/records.py` | Pydantic data contracts |
| `data/input/` | Raw source data (not committed) |
| `data/staging/` | Intermediate (not committed) |
| `data/output/` | Final output (not committed) |

## Development

```bash
pip install -e ".[dev]"
pytest
```
"""
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def _pipeline_gitignore() -> str:
    """Return ``.gitignore`` content: the base ignores plus pipeline data dirs and key files."""
    return _gitignore() + """\
# Pipeline data (never commit raw / staging / output data)
data/input/
data/staging/
data/output/

# Keys
.env
keys.json
"""
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
# ── public API ────────────────────────────────────────────────────────────────
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def scaffold_pipeline(
    project_dir: Path,
    *,
    version: str = "0.1.0",
    description: str = "",
    author: str = "",
    config_fmt: str = "env",
    deps: str = "toml",
    include_llm: bool = False,
) -> ScaffoldResult:
    """
    Scaffold a data pipeline / ETL project under ``project_dir``.

    Parameters
    ----------
    project_dir:
        Root directory for the project.
    version:
        Starting version string.
    description:
        Short project description.
    author:
        Author string.
    config_fmt:
        Config file format — ``"env"``, ``"yaml"``, or ``"json"``.
    deps:
        ``"toml"`` or ``"requirements"``.
    include_llm:
        Whether to wire up an LLM enrichment step in the pipeline.
    """
    result = ScaffoldResult(project_dir=project_dir)
    project_name = project_dir.name

    # Directory skeleton: project root first, then each sub-directory.
    _mkdir(result, project_dir)
    for subdir in (
        "src",
        "pipeline",
        "schemas",
        "data/input",
        "data/staging",
        "data/output",
        "utils",
        "tests",
        "logs",
    ):
        _mkdir(result, project_dir / subdir)

    # Generated files as (relative path, content) pairs, written in order.
    files: list[tuple[str, str]] = [
        ("src/__init__.py", _src_init(version)),
        ("pipeline/__init__.py", _pipeline_pkg_init()),
        ("pipeline/extract.py", _extract()),
        ("pipeline/transform.py", _transform()),
        ("pipeline/enrich.py", _enrich(include_llm)),
        ("pipeline/load.py", _load()),
        ("pipeline/runner.py", _runner(include_llm)),
        ("schemas/__init__.py", ""),
        ("schemas/records.py", _schemas_records()),
        ("utils/__init__.py", '"""Shared utilities."""\n'),
        ("utils/logger.py", _logger_util()),
    ]
    if include_llm:
        # The LLM helper is only emitted when the enrich stage uses it.
        files.append(("utils/llm.py", _llm_util(project_name)))
    files.append(("tests/__init__.py", _tests_init()))
    files.append(("tests/test_pipeline.py", _test_pipeline()))

    # Config file — name and content both depend on the chosen format.
    cfg_name, cfg_content = _config_content(config_fmt)
    files.append((cfg_name, cfg_content))

    # Dependency manifest: requirements.txt or pyproject.toml (default).
    if deps == "requirements":
        files.append(("requirements.txt", _requirements_txt(project_name)))
    else:
        files.append((
            "pyproject.toml",
            _pipeline_pyproject(project_name, version, description, author, include_llm),
        ))

    # Repo files.
    files.append(("README.md", _pipeline_readme(project_name, description, include_llm)))
    files.append((".gitignore", _pipeline_gitignore()))

    for rel_path, content in files:
        _write(result, project_dir / rel_path, content)

    return result
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""
|
|
2
|
+
infrakit.scaffolder.templates.registry
|
|
3
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
4
|
+
Central registry mapping template names to their scaffold functions.
|
|
5
|
+
|
|
6
|
+
Usage (programmatic)
|
|
7
|
+
---------------------
|
|
8
|
+
from infrakit.scaffolder.templates.registry import get_template, list_templates
|
|
9
|
+
|
|
10
|
+
fn = get_template("ai")
|
|
11
|
+
fn(Path("my_project"), version="0.2.0", include_notebooks=True)
|
|
12
|
+
|
|
13
|
+
Usage (CLI)
|
|
14
|
+
-----------
|
|
15
|
+
ik init my_project --template backend
|
|
16
|
+
ik init my_project --template pipeline --include-llm
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Callable
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
class TemplateInfo:
    """Metadata for one scaffold template: registry key, blurb, entry point, flags."""
    # Registry key, as passed to ``ik init --template <name>``.
    name: str
    # One-line human-readable summary shown in template listings.
    description: str
    # Scaffold function (e.g. ``scaffold_pipeline``), called with the project dir.
    fn: Callable
    extra_flags: list[str]  # optional flags supported by this template
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _load_registry() -> dict[str, TemplateInfo]:
    """Build and return the name → TemplateInfo mapping for all templates."""
    # Imports are deferred so that only the SDK(s) required by the chosen
    # template need to be installed (e.g. openai is not needed for cli_tool).
    from infrakit.scaffolder.generator import scaffold_basic
    from infrakit.scaffolder.templates.ai import scaffold_ai
    from infrakit.scaffolder.templates.backend import scaffold_backend
    from infrakit.scaffolder.templates.cli_tool import scaffold_cli_tool
    from infrakit.scaffolder.templates.pipeline import scaffold_pipeline

    entries = [
        TemplateInfo(
            name="basic",
            description="Minimal project — src/, utils/, tests/, logger.",
            fn=scaffold_basic,
            extra_flags=[],
        ),
        TemplateInfo(
            name="ai",
            description=(
                "AI / ML project — pipelines, data dirs, notebooks, "
                "utils/llm.py, utils/logger.py, prompts/."
            ),
            fn=scaffold_ai,
            extra_flags=["--include-notebooks"],
        ),
        TemplateInfo(
            name="backend",
            description=(
                "FastAPI service — app/, routes/, services/, middleware/, "
                "utils/llm.py, Dockerfile, docker-compose."
            ),
            fn=scaffold_backend,
            extra_flags=["--include-llm / --no-include-llm"],
        ),
        TemplateInfo(
            name="cli-tool",
            description=(
                "Distributable Typer CLI — src/<pkg>/cli/, commands/, "
                "entry point wired via pyproject.toml."
            ),
            fn=scaffold_cli_tool,
            extra_flags=["--include-llm / --no-include-llm"],
        ),
        TemplateInfo(
            name="pipeline",
            description=(
                "Data pipeline / ETL — extract, transform, enrich, load stages, "
                "schemas/, data dirs."
            ),
            fn=scaffold_pipeline,
            extra_flags=["--include-llm / --no-include-llm"],
        ),
    ]
    return {e.name: e for e in entries}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# module-level singleton
_REGISTRY: dict[str, TemplateInfo] | None = None


def _registry() -> dict[str, TemplateInfo]:
    """Return the template registry, building and caching it on first use."""
    global _REGISTRY
    registry = _REGISTRY
    if registry is None:
        registry = _load_registry()
        _REGISTRY = registry
    return registry
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def list_templates() -> list[TemplateInfo]:
    """Every registered template, in the order it was defined."""
    registry = _registry()
    return [*registry.values()]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_template(name: str) -> Callable:
    """
    Look up the scaffold function registered under *name*.

    Raises
    ------
    ValueError
        If *name* is not a known template.
    """
    registry = _registry()
    try:
        entry = registry[name]
    except KeyError:
        available = ", ".join(f"'{k}'" for k in registry)
        raise ValueError(
            f"Unknown template '{name}'. Available: {available}"
        ) from None
    return entry.fn
|