@booklib/core 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cursor/rules/booklib-standards.mdc +40 -0
- package/.gemini/context.md +372 -0
- package/AGENTS.md +166 -0
- package/CHANGELOG.md +226 -0
- package/CLAUDE.md +81 -0
- package/CODE_OF_CONDUCT.md +31 -0
- package/CONTRIBUTING.md +304 -0
- package/LICENSE +21 -0
- package/PLAN.md +28 -0
- package/README.ja.md +198 -0
- package/README.ko.md +198 -0
- package/README.md +503 -0
- package/README.pt-BR.md +198 -0
- package/README.uk.md +241 -0
- package/README.zh-CN.md +198 -0
- package/SECURITY.md +9 -0
- package/agents/architecture-reviewer.md +136 -0
- package/agents/booklib-reviewer.md +90 -0
- package/agents/data-reviewer.md +107 -0
- package/agents/jvm-reviewer.md +146 -0
- package/agents/python-reviewer.md +128 -0
- package/agents/rust-reviewer.md +115 -0
- package/agents/ts-reviewer.md +110 -0
- package/agents/ui-reviewer.md +117 -0
- package/assets/logo.svg +36 -0
- package/bin/booklib-mcp.js +304 -0
- package/bin/booklib.js +1705 -0
- package/bin/skills.cjs +1292 -0
- package/booklib-router.mdc +36 -0
- package/booklib.config.json +19 -0
- package/commands/animation-at-work.md +10 -0
- package/commands/clean-code-reviewer.md +10 -0
- package/commands/data-intensive-patterns.md +10 -0
- package/commands/data-pipelines.md +10 -0
- package/commands/design-patterns.md +10 -0
- package/commands/domain-driven-design.md +10 -0
- package/commands/effective-java.md +10 -0
- package/commands/effective-kotlin.md +10 -0
- package/commands/effective-python.md +10 -0
- package/commands/effective-typescript.md +10 -0
- package/commands/kotlin-in-action.md +10 -0
- package/commands/lean-startup.md +10 -0
- package/commands/microservices-patterns.md +10 -0
- package/commands/programming-with-rust.md +10 -0
- package/commands/refactoring-ui.md +10 -0
- package/commands/rust-in-action.md +10 -0
- package/commands/skill-router.md +10 -0
- package/commands/spring-boot-in-action.md +10 -0
- package/commands/storytelling-with-data.md +10 -0
- package/commands/system-design-interview.md +10 -0
- package/commands/using-asyncio-python.md +10 -0
- package/commands/web-scraping-python.md +10 -0
- package/community/registry.json +1616 -0
- package/hooks/hooks.json +23 -0
- package/hooks/posttooluse-capture.mjs +67 -0
- package/hooks/suggest.js +153 -0
- package/lib/agent-behaviors.js +40 -0
- package/lib/agent-detector.js +96 -0
- package/lib/config-loader.js +39 -0
- package/lib/conflict-resolver.js +148 -0
- package/lib/context-builder.js +574 -0
- package/lib/discovery-engine.js +298 -0
- package/lib/doctor/hook-installer.js +83 -0
- package/lib/doctor/usage-tracker.js +87 -0
- package/lib/engine/ai-features.js +253 -0
- package/lib/engine/auditor.js +103 -0
- package/lib/engine/bm25-index.js +178 -0
- package/lib/engine/capture.js +120 -0
- package/lib/engine/corrections.js +198 -0
- package/lib/engine/doctor.js +195 -0
- package/lib/engine/graph-injector.js +137 -0
- package/lib/engine/graph.js +161 -0
- package/lib/engine/handoff.js +405 -0
- package/lib/engine/indexer.js +242 -0
- package/lib/engine/parser.js +53 -0
- package/lib/engine/query-expander.js +42 -0
- package/lib/engine/reranker.js +40 -0
- package/lib/engine/rrf.js +59 -0
- package/lib/engine/scanner.js +151 -0
- package/lib/engine/searcher.js +139 -0
- package/lib/engine/session-coordinator.js +306 -0
- package/lib/engine/session-manager.js +429 -0
- package/lib/engine/synthesizer.js +70 -0
- package/lib/installer.js +70 -0
- package/lib/instinct-block.js +33 -0
- package/lib/mcp-config-writer.js +88 -0
- package/lib/paths.js +57 -0
- package/lib/profiles/design.md +19 -0
- package/lib/profiles/general.md +16 -0
- package/lib/profiles/research-analysis.md +22 -0
- package/lib/profiles/software-development.md +23 -0
- package/lib/profiles/writing-content.md +19 -0
- package/lib/project-initializer.js +916 -0
- package/lib/registry/skills.js +102 -0
- package/lib/registry-searcher.js +99 -0
- package/lib/rules/rules-manager.js +169 -0
- package/lib/skill-fetcher.js +333 -0
- package/lib/well-known-builder.js +70 -0
- package/lib/wizard/index.js +404 -0
- package/lib/wizard/integration-detector.js +41 -0
- package/lib/wizard/project-detector.js +100 -0
- package/lib/wizard/prompt.js +156 -0
- package/lib/wizard/registry-embeddings.js +107 -0
- package/lib/wizard/skill-recommender.js +69 -0
- package/llms-full.txt +254 -0
- package/llms.txt +70 -0
- package/package.json +45 -0
- package/research-reports/2026-04-01-current-architecture.md +160 -0
- package/research-reports/IDEAS.md +93 -0
- package/rules/common/clean-code.md +42 -0
- package/rules/java/effective-java.md +42 -0
- package/rules/kotlin/effective-kotlin.md +37 -0
- package/rules/python/effective-python.md +38 -0
- package/rules/rust/rust.md +37 -0
- package/rules/typescript/effective-typescript.md +42 -0
- package/scripts/gen-llms-full.mjs +36 -0
- package/scripts/gen-og.mjs +142 -0
- package/scripts/validate-frontmatter.js +25 -0
- package/skills/animation-at-work/SKILL.md +270 -0
- package/skills/animation-at-work/assets/example_asset.txt +1 -0
- package/skills/animation-at-work/evals/evals.json +44 -0
- package/skills/animation-at-work/evals/results.json +13 -0
- package/skills/animation-at-work/examples/after.md +64 -0
- package/skills/animation-at-work/examples/before.md +35 -0
- package/skills/animation-at-work/references/api_reference.md +369 -0
- package/skills/animation-at-work/references/review-checklist.md +79 -0
- package/skills/animation-at-work/scripts/audit_animations.py +295 -0
- package/skills/animation-at-work/scripts/example.py +1 -0
- package/skills/clean-code-reviewer/SKILL.md +444 -0
- package/skills/clean-code-reviewer/audit.json +35 -0
- package/skills/clean-code-reviewer/evals/evals.json +185 -0
- package/skills/clean-code-reviewer/evals/results.json +13 -0
- package/skills/clean-code-reviewer/examples/after.md +48 -0
- package/skills/clean-code-reviewer/examples/before.md +33 -0
- package/skills/clean-code-reviewer/references/api_reference.md +158 -0
- package/skills/clean-code-reviewer/references/practices-catalog.md +282 -0
- package/skills/clean-code-reviewer/references/review-checklist.md +254 -0
- package/skills/clean-code-reviewer/scripts/pre-review.py +206 -0
- package/skills/data-intensive-patterns/SKILL.md +267 -0
- package/skills/data-intensive-patterns/assets/example_asset.txt +1 -0
- package/skills/data-intensive-patterns/evals/evals.json +54 -0
- package/skills/data-intensive-patterns/evals/results.json +13 -0
- package/skills/data-intensive-patterns/examples/after.md +61 -0
- package/skills/data-intensive-patterns/examples/before.md +38 -0
- package/skills/data-intensive-patterns/references/api_reference.md +34 -0
- package/skills/data-intensive-patterns/references/patterns-catalog.md +551 -0
- package/skills/data-intensive-patterns/references/review-checklist.md +193 -0
- package/skills/data-intensive-patterns/scripts/adr.py +213 -0
- package/skills/data-intensive-patterns/scripts/example.py +1 -0
- package/skills/data-pipelines/SKILL.md +259 -0
- package/skills/data-pipelines/assets/example_asset.txt +1 -0
- package/skills/data-pipelines/evals/evals.json +45 -0
- package/skills/data-pipelines/evals/results.json +13 -0
- package/skills/data-pipelines/examples/after.md +97 -0
- package/skills/data-pipelines/examples/before.md +37 -0
- package/skills/data-pipelines/references/api_reference.md +301 -0
- package/skills/data-pipelines/references/review-checklist.md +181 -0
- package/skills/data-pipelines/scripts/example.py +1 -0
- package/skills/data-pipelines/scripts/new_pipeline.py +444 -0
- package/skills/design-patterns/SKILL.md +271 -0
- package/skills/design-patterns/assets/example_asset.txt +1 -0
- package/skills/design-patterns/evals/evals.json +46 -0
- package/skills/design-patterns/evals/results.json +13 -0
- package/skills/design-patterns/examples/after.md +52 -0
- package/skills/design-patterns/examples/before.md +29 -0
- package/skills/design-patterns/references/api_reference.md +1 -0
- package/skills/design-patterns/references/patterns-catalog.md +726 -0
- package/skills/design-patterns/references/review-checklist.md +173 -0
- package/skills/design-patterns/scripts/example.py +1 -0
- package/skills/design-patterns/scripts/scaffold.py +807 -0
- package/skills/domain-driven-design/SKILL.md +142 -0
- package/skills/domain-driven-design/assets/example_asset.txt +1 -0
- package/skills/domain-driven-design/evals/evals.json +48 -0
- package/skills/domain-driven-design/evals/results.json +13 -0
- package/skills/domain-driven-design/examples/after.md +80 -0
- package/skills/domain-driven-design/examples/before.md +43 -0
- package/skills/domain-driven-design/references/api_reference.md +1 -0
- package/skills/domain-driven-design/references/patterns-catalog.md +545 -0
- package/skills/domain-driven-design/references/review-checklist.md +158 -0
- package/skills/domain-driven-design/scripts/example.py +1 -0
- package/skills/domain-driven-design/scripts/scaffold.py +421 -0
- package/skills/effective-java/SKILL.md +227 -0
- package/skills/effective-java/assets/example_asset.txt +1 -0
- package/skills/effective-java/evals/evals.json +46 -0
- package/skills/effective-java/evals/results.json +13 -0
- package/skills/effective-java/examples/after.md +83 -0
- package/skills/effective-java/examples/before.md +37 -0
- package/skills/effective-java/references/api_reference.md +1 -0
- package/skills/effective-java/references/items-catalog.md +955 -0
- package/skills/effective-java/references/review-checklist.md +216 -0
- package/skills/effective-java/scripts/checkstyle_setup.py +211 -0
- package/skills/effective-java/scripts/example.py +1 -0
- package/skills/effective-kotlin/SKILL.md +271 -0
- package/skills/effective-kotlin/assets/example_asset.txt +1 -0
- package/skills/effective-kotlin/audit.json +29 -0
- package/skills/effective-kotlin/evals/evals.json +45 -0
- package/skills/effective-kotlin/evals/results.json +13 -0
- package/skills/effective-kotlin/examples/after.md +36 -0
- package/skills/effective-kotlin/examples/before.md +38 -0
- package/skills/effective-kotlin/references/api_reference.md +1 -0
- package/skills/effective-kotlin/references/practices-catalog.md +1228 -0
- package/skills/effective-kotlin/references/review-checklist.md +126 -0
- package/skills/effective-kotlin/scripts/example.py +1 -0
- package/skills/effective-python/SKILL.md +441 -0
- package/skills/effective-python/evals/evals.json +44 -0
- package/skills/effective-python/evals/results.json +13 -0
- package/skills/effective-python/examples/after.md +56 -0
- package/skills/effective-python/examples/before.md +40 -0
- package/skills/effective-python/ref-01-pythonic-thinking.md +202 -0
- package/skills/effective-python/ref-02-lists-and-dicts.md +146 -0
- package/skills/effective-python/ref-03-functions.md +186 -0
- package/skills/effective-python/ref-04-comprehensions-generators.md +211 -0
- package/skills/effective-python/ref-05-classes-interfaces.md +188 -0
- package/skills/effective-python/ref-06-metaclasses-attributes.md +209 -0
- package/skills/effective-python/ref-07-concurrency.md +213 -0
- package/skills/effective-python/ref-08-robustness-performance.md +248 -0
- package/skills/effective-python/ref-09-testing-debugging.md +253 -0
- package/skills/effective-python/ref-10-collaboration.md +175 -0
- package/skills/effective-python/references/api_reference.md +218 -0
- package/skills/effective-python/references/practices-catalog.md +483 -0
- package/skills/effective-python/references/review-checklist.md +190 -0
- package/skills/effective-python/scripts/lint.py +173 -0
- package/skills/effective-typescript/SKILL.md +262 -0
- package/skills/effective-typescript/audit.json +29 -0
- package/skills/effective-typescript/evals/evals.json +37 -0
- package/skills/effective-typescript/evals/results.json +13 -0
- package/skills/effective-typescript/examples/after.md +70 -0
- package/skills/effective-typescript/examples/before.md +47 -0
- package/skills/effective-typescript/references/api_reference.md +118 -0
- package/skills/effective-typescript/references/practices-catalog.md +371 -0
- package/skills/effective-typescript/scripts/review.py +169 -0
- package/skills/kotlin-in-action/SKILL.md +261 -0
- package/skills/kotlin-in-action/assets/example_asset.txt +1 -0
- package/skills/kotlin-in-action/evals/evals.json +43 -0
- package/skills/kotlin-in-action/evals/results.json +13 -0
- package/skills/kotlin-in-action/examples/after.md +53 -0
- package/skills/kotlin-in-action/examples/before.md +39 -0
- package/skills/kotlin-in-action/references/api_reference.md +1 -0
- package/skills/kotlin-in-action/references/practices-catalog.md +436 -0
- package/skills/kotlin-in-action/references/review-checklist.md +204 -0
- package/skills/kotlin-in-action/scripts/example.py +1 -0
- package/skills/kotlin-in-action/scripts/setup_detekt.py +224 -0
- package/skills/lean-startup/SKILL.md +160 -0
- package/skills/lean-startup/assets/example_asset.txt +1 -0
- package/skills/lean-startup/evals/evals.json +43 -0
- package/skills/lean-startup/evals/results.json +13 -0
- package/skills/lean-startup/examples/after.md +80 -0
- package/skills/lean-startup/examples/before.md +34 -0
- package/skills/lean-startup/references/api_reference.md +319 -0
- package/skills/lean-startup/references/review-checklist.md +137 -0
- package/skills/lean-startup/scripts/example.py +1 -0
- package/skills/lean-startup/scripts/new_experiment.py +286 -0
- package/skills/microservices-patterns/SKILL.md +384 -0
- package/skills/microservices-patterns/evals/evals.json +45 -0
- package/skills/microservices-patterns/evals/results.json +13 -0
- package/skills/microservices-patterns/examples/after.md +69 -0
- package/skills/microservices-patterns/examples/before.md +40 -0
- package/skills/microservices-patterns/references/patterns-catalog.md +391 -0
- package/skills/microservices-patterns/references/review-checklist.md +169 -0
- package/skills/microservices-patterns/scripts/new_service.py +583 -0
- package/skills/programming-with-rust/SKILL.md +209 -0
- package/skills/programming-with-rust/evals/evals.json +37 -0
- package/skills/programming-with-rust/evals/results.json +13 -0
- package/skills/programming-with-rust/examples/after.md +107 -0
- package/skills/programming-with-rust/examples/before.md +59 -0
- package/skills/programming-with-rust/references/api_reference.md +152 -0
- package/skills/programming-with-rust/references/practices-catalog.md +335 -0
- package/skills/programming-with-rust/scripts/review.py +142 -0
- package/skills/refactoring-ui/SKILL.md +362 -0
- package/skills/refactoring-ui/assets/example_asset.txt +1 -0
- package/skills/refactoring-ui/evals/evals.json +45 -0
- package/skills/refactoring-ui/evals/results.json +13 -0
- package/skills/refactoring-ui/examples/after.md +85 -0
- package/skills/refactoring-ui/examples/before.md +58 -0
- package/skills/refactoring-ui/references/api_reference.md +355 -0
- package/skills/refactoring-ui/references/review-checklist.md +114 -0
- package/skills/refactoring-ui/scripts/audit_css.py +250 -0
- package/skills/refactoring-ui/scripts/example.py +1 -0
- package/skills/rust-in-action/SKILL.md +350 -0
- package/skills/rust-in-action/evals/evals.json +38 -0
- package/skills/rust-in-action/evals/results.json +13 -0
- package/skills/rust-in-action/examples/after.md +156 -0
- package/skills/rust-in-action/examples/before.md +56 -0
- package/skills/rust-in-action/references/practices-catalog.md +346 -0
- package/skills/rust-in-action/scripts/review.py +147 -0
- package/skills/skill-router/SKILL.md +186 -0
- package/skills/skill-router/evals/evals.json +38 -0
- package/skills/skill-router/evals/results.json +13 -0
- package/skills/skill-router/examples/after.md +63 -0
- package/skills/skill-router/examples/before.md +39 -0
- package/skills/skill-router/references/api_reference.md +24 -0
- package/skills/skill-router/references/routing-heuristics.md +89 -0
- package/skills/skill-router/references/skill-catalog.md +174 -0
- package/skills/skill-router/scripts/route.py +266 -0
- package/skills/spring-boot-in-action/SKILL.md +340 -0
- package/skills/spring-boot-in-action/evals/evals.json +39 -0
- package/skills/spring-boot-in-action/evals/results.json +13 -0
- package/skills/spring-boot-in-action/examples/after.md +185 -0
- package/skills/spring-boot-in-action/examples/before.md +84 -0
- package/skills/spring-boot-in-action/references/practices-catalog.md +403 -0
- package/skills/spring-boot-in-action/scripts/review.py +184 -0
- package/skills/storytelling-with-data/SKILL.md +241 -0
- package/skills/storytelling-with-data/assets/example_asset.txt +1 -0
- package/skills/storytelling-with-data/evals/evals.json +47 -0
- package/skills/storytelling-with-data/evals/results.json +13 -0
- package/skills/storytelling-with-data/examples/after.md +50 -0
- package/skills/storytelling-with-data/examples/before.md +33 -0
- package/skills/storytelling-with-data/references/api_reference.md +379 -0
- package/skills/storytelling-with-data/references/review-checklist.md +111 -0
- package/skills/storytelling-with-data/scripts/chart_review.py +301 -0
- package/skills/storytelling-with-data/scripts/example.py +1 -0
- package/skills/system-design-interview/SKILL.md +233 -0
- package/skills/system-design-interview/assets/example_asset.txt +1 -0
- package/skills/system-design-interview/evals/evals.json +46 -0
- package/skills/system-design-interview/evals/results.json +13 -0
- package/skills/system-design-interview/examples/after.md +94 -0
- package/skills/system-design-interview/examples/before.md +27 -0
- package/skills/system-design-interview/references/api_reference.md +582 -0
- package/skills/system-design-interview/references/review-checklist.md +201 -0
- package/skills/system-design-interview/scripts/example.py +1 -0
- package/skills/system-design-interview/scripts/new_design.py +421 -0
- package/skills/using-asyncio-python/SKILL.md +290 -0
- package/skills/using-asyncio-python/assets/example_asset.txt +1 -0
- package/skills/using-asyncio-python/evals/evals.json +43 -0
- package/skills/using-asyncio-python/evals/results.json +13 -0
- package/skills/using-asyncio-python/examples/after.md +68 -0
- package/skills/using-asyncio-python/examples/before.md +39 -0
- package/skills/using-asyncio-python/references/api_reference.md +267 -0
- package/skills/using-asyncio-python/references/review-checklist.md +149 -0
- package/skills/using-asyncio-python/scripts/check_blocking.py +270 -0
- package/skills/using-asyncio-python/scripts/example.py +1 -0
- package/skills/web-scraping-python/SKILL.md +280 -0
- package/skills/web-scraping-python/assets/example_asset.txt +1 -0
- package/skills/web-scraping-python/evals/evals.json +46 -0
- package/skills/web-scraping-python/evals/results.json +13 -0
- package/skills/web-scraping-python/examples/after.md +109 -0
- package/skills/web-scraping-python/examples/before.md +40 -0
- package/skills/web-scraping-python/references/api_reference.md +393 -0
- package/skills/web-scraping-python/references/review-checklist.md +163 -0
- package/skills/web-scraping-python/scripts/example.py +1 -0
- package/skills/web-scraping-python/scripts/new_scraper.py +231 -0
- package/skills/writing-plans/audit.json +34 -0
- package/tests/agent-detector.test.js +83 -0
- package/tests/corrections.test.js +245 -0
- package/tests/doctor/hook-installer.test.js +72 -0
- package/tests/doctor/usage-tracker.test.js +140 -0
- package/tests/engine/benchmark-eval.test.js +31 -0
- package/tests/engine/bm25-index.test.js +85 -0
- package/tests/engine/capture-command.test.js +35 -0
- package/tests/engine/capture.test.js +17 -0
- package/tests/engine/graph-augmented-search.test.js +107 -0
- package/tests/engine/graph-injector.test.js +44 -0
- package/tests/engine/graph.test.js +216 -0
- package/tests/engine/hybrid-searcher.test.js +74 -0
- package/tests/engine/indexer-bm25.test.js +37 -0
- package/tests/engine/mcp-tools.test.js +73 -0
- package/tests/engine/project-initializer-mcp.test.js +99 -0
- package/tests/engine/query-expander.test.js +36 -0
- package/tests/engine/reranker.test.js +51 -0
- package/tests/engine/rrf.test.js +49 -0
- package/tests/engine/srag-prefix.test.js +47 -0
- package/tests/instinct-block.test.js +23 -0
- package/tests/mcp-config-writer.test.js +60 -0
- package/tests/project-initializer-new-agents.test.js +48 -0
- package/tests/rules/rules-manager.test.js +230 -0
- package/tests/well-known-builder.test.js +40 -0
- package/tests/wizard/integration-detector.test.js +31 -0
- package/tests/wizard/project-detector.test.js +51 -0
- package/tests/wizard/prompt-session.test.js +61 -0
- package/tests/wizard/prompt.test.js +16 -0
- package/tests/wizard/registry-embeddings.test.js +35 -0
- package/tests/wizard/skill-recommender.test.js +34 -0
- package/tests/wizard/slot-count.test.js +25 -0
- package/vercel.json +21 -0
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# Data Pipelines Pocket Reference — Pipeline Review Checklist
|
|
2
|
+
|
|
3
|
+
Systematic checklist for reviewing data pipelines against the 13 chapters
|
|
4
|
+
from *Data Pipelines Pocket Reference* by James Densmore.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## 1. Architecture & Patterns (Chapters 1–3)
|
|
9
|
+
|
|
10
|
+
### Infrastructure
|
|
11
|
+
- [ ] **Ch 1-2 — Appropriate infrastructure** — Is the right storage chosen for the use case (warehouse for analytics, lake for raw/unstructured)?
|
|
12
|
+
- [ ] **Ch 2 — Cloud-native services** — Are managed services used where appropriate to reduce operational burden?
|
|
13
|
+
- [ ] **Ch 2 — Separation of storage and compute** — Is compute scaled independently from storage?
|
|
14
|
+
|
|
15
|
+
### Pipeline Patterns
|
|
16
|
+
- [ ] **Ch 3 — ETL vs ELT** — Is the right pattern chosen? Is ELT used for analytics workloads with modern warehouses?
|
|
17
|
+
- [ ] **Ch 3 — Full vs incremental** — Is incremental extraction used for growing datasets? Is full extraction justified for small tables?
|
|
18
|
+
- [ ] **Ch 3 — CDC where appropriate** — Is CDC used for real-time sync needs instead of polling?
|
|
19
|
+
- [ ] **Ch 3 — Loading strategy** — Is the right load pattern used (append for events, upsert for dimensions, full refresh for small lookups)?
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 2. Data Ingestion (Chapters 4–7)
|
|
24
|
+
|
|
25
|
+
### Database Ingestion (Ch 4)
|
|
26
|
+
- [ ] **Ch 4 — Read replica usage** — Are extractions running against read replicas, not production databases?
|
|
27
|
+
- [ ] **Ch 4 — Incremental column** — Is there a reliable timestamp or ID column for incremental extraction?
|
|
28
|
+
- [ ] **Ch 4 — Connection management** — Are connections pooled and properly closed? Are timeouts configured?
|
|
29
|
+
- [ ] **Ch 4 — Query efficiency** — Are only needed columns selected? Are WHERE clauses using indexed columns?
|
|
30
|
+
- [ ] **Ch 4 — Large table handling** — Are large extractions chunked or using streaming cursors?
|
|
31
|
+
|
|
32
|
+
### File Ingestion (Ch 5)
|
|
33
|
+
- [ ] **Ch 5 — Schema validation** — Are file schemas validated before processing? Are malformed rows handled?
|
|
34
|
+
- [ ] **Ch 5 — Encoding handling** — Is character encoding handled consistently (UTF-8)?
|
|
35
|
+
- [ ] **Ch 5 — Cloud storage patterns** — Are files organized with partitioned prefixes? Are processed files archived?
|
|
36
|
+
- [ ] **Ch 5 — File tracking** — Is there a mechanism to track which files have been processed to avoid reprocessing?
|
|
37
|
+
- [ ] **Ch 5 — Compression** — Are files compressed for storage and transfer efficiency?
|
|
38
|
+
|
|
39
|
+
### API Ingestion (Ch 6)
|
|
40
|
+
- [ ] **Ch 6 — Pagination** — Is pagination implemented correctly? Are all pages fetched? Is cursor-based preferred?
|
|
41
|
+
- [ ] **Ch 6 — Rate limiting** — Are rate limit headers respected? Is backoff implemented for 429 responses?
|
|
42
|
+
- [ ] **Ch 6 — Retry logic** — Are transient errors (5xx, timeouts) retried with exponential backoff? Are 4xx errors not retried?
|
|
43
|
+
- [ ] **Ch 6 — Authentication** — Are credentials stored securely? Are token refreshes handled?
|
|
44
|
+
- [ ] **Ch 6 — Incremental fetching** — Are date/cursor parameters used to fetch only new data?
|
|
45
|
+
|
|
46
|
+
### Streaming Ingestion (Ch 7)
|
|
47
|
+
- [ ] **Ch 7 — Consumer group design** — Are consumer groups configured for parallel processing? Are rebalances handled?
|
|
48
|
+
- [ ] **Ch 7 — Offset management** — Are offsets committed after successful processing, not before?
|
|
49
|
+
- [ ] **Ch 7 — Serialization** — Are events serialized with a schema (Avro, Protobuf) for evolution support?
|
|
50
|
+
- [ ] **Ch 7 — Dead letter queue** — Are failed events routed to a DLQ for inspection and reprocessing?
|
|
51
|
+
- [ ] **Ch 7 — Exactly-once semantics** — Is deduplication implemented downstream or are idempotent producers used?
|
|
52
|
+
- [ ] **Ch 7 — Backpressure** — Is backpressure handled when consumer can't keep up with producer?
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## 3. Data Storage & Loading (Chapter 8)
|
|
57
|
+
|
|
58
|
+
### Loading Patterns
|
|
59
|
+
- [ ] **Ch 8 — Bulk loading** — Are bulk load commands used (COPY, load jobs) instead of row-by-row INSERT?
|
|
60
|
+
- [ ] **Ch 8 — Staging tables** — Is data loaded to staging first, validated, then merged to production?
|
|
61
|
+
- [ ] **Ch 8 — Atomic operations** — Are loads atomic? Is the destination never left in a partial state?
|
|
62
|
+
- [ ] **Ch 8 — Data type mapping** — Are source types mapped correctly to destination types? No implicit conversions?
|
|
63
|
+
|
|
64
|
+
### Table Design
|
|
65
|
+
- [ ] **Ch 8 — Partitioning** — Are large tables partitioned by date or key? Are queries leveraging partition pruning?
|
|
66
|
+
- [ ] **Ch 8 — Clustering** — Are frequently filtered columns used as cluster keys within partitions?
|
|
67
|
+
- [ ] **Ch 8 — Sort/distribution keys** — For Redshift: are SORTKEY and DISTKEY chosen based on query patterns?
|
|
68
|
+
|
|
69
|
+
### Warehouse-Specific
|
|
70
|
+
- [ ] **Ch 8 — Redshift COPY** — Is S3-based COPY used instead of INSERT for bulk loads?
|
|
71
|
+
- [ ] **Ch 8 — BigQuery load jobs** — Are load jobs preferred over streaming inserts for batch pipelines?
|
|
72
|
+
- [ ] **Ch 8 — Snowflake stages** — Are stages used for file-based loading? Is SNOWPIPE configured for continuous loads?
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## 4. Transformations (Chapter 9)
|
|
77
|
+
|
|
78
|
+
### SQL Transforms
|
|
79
|
+
- [ ] **Ch 9 — CTEs for readability** — Are Common Table Expressions used instead of deeply nested subqueries?
|
|
80
|
+
- [ ] **Ch 9 — Window functions** — Are window functions used for ranking, running totals instead of self-joins?
|
|
81
|
+
- [ ] **Ch 9 — Grain awareness** — Do joins maintain the correct grain? No accidental fan-out producing duplicate rows?
|
|
82
|
+
- [ ] **Ch 9 — Deterministic logic** — Are transforms deterministic (same input → same output)? No reliance on row ordering?
|
|
83
|
+
|
|
84
|
+
### dbt Patterns
|
|
85
|
+
- [ ] **Ch 9 — Layer structure** — Are models organized into staging → intermediate → mart layers?
|
|
86
|
+
- [ ] **Ch 9 — Staging models** — Are staging models 1:1 with sources? Do they rename, cast, and filter only?
|
|
87
|
+
- [ ] **Ch 9 — Incremental models** — Are large models configured as incremental with proper `unique_key` and merge strategy?
|
|
88
|
+
- [ ] **Ch 9 — Source definitions** — Are sources defined in YAML with `source()` macro? Are freshness checks configured?
|
|
89
|
+
- [ ] **Ch 9 — Model materialization** — Are materializations appropriate (view for light transforms, table for heavy, incremental for large)?
|
|
90
|
+
|
|
91
|
+
### Python Transforms
|
|
92
|
+
- [ ] **Ch 9 — Appropriate tool** — Is Python used only when SQL is insufficient (ML, complex parsing, API calls)?
|
|
93
|
+
- [ ] **Ch 9 — Vectorized operations** — Are pandas/numpy vectorized operations used instead of row-by-row iteration?
|
|
94
|
+
- [ ] **Ch 9 — Memory management** — Are large datasets processed in chunks or with distributed frameworks (PySpark)?
|
|
95
|
+
- [ ] **Ch 9 — Pure functions** — Are transformation functions pure (no side effects) and independently testable?
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## 5. Data Validation & Testing (Chapter 10)
|
|
100
|
+
|
|
101
|
+
### Validation Coverage
|
|
102
|
+
- [ ] **Ch 10 — Schema validation** — Are column names, types, and count validated at ingestion?
|
|
103
|
+
- [ ] **Ch 10 — Row count reconciliation** — Are source and destination row counts compared? Is threshold alerting configured?
|
|
104
|
+
- [ ] **Ch 10 — Null checks** — Are NOT NULL constraints enforced on key columns? Are null percentages tracked?
|
|
105
|
+
- [ ] **Ch 10 — Uniqueness checks** — Is primary key uniqueness verified after loading?
|
|
106
|
+
- [ ] **Ch 10 — Referential integrity** — Are foreign key relationships validated between related tables?
|
|
107
|
+
- [ ] **Ch 10 — Range validation** — Are values checked against expected ranges (dates in past, amounts positive, percentages 0-100)?
|
|
108
|
+
- [ ] **Ch 10 — Freshness checks** — Is data freshness monitored? Are alerts configured for stale data?
|
|
109
|
+
|
|
110
|
+
### Testing
|
|
111
|
+
- [ ] **Ch 10 — Unit tests** — Are individual transformation functions tested with known inputs/outputs?
|
|
112
|
+
- [ ] **Ch 10 — Integration tests** — Is the end-to-end pipeline tested with sample data?
|
|
113
|
+
- [ ] **Ch 10 — dbt tests** — Are dbt schema tests (not_null, unique, relationships) defined? Are custom data tests used?
|
|
114
|
+
- [ ] **Ch 10 — Regression tests** — Are current results compared against known-good baselines for critical tables?
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## 6. Orchestration (Chapter 11)
|
|
119
|
+
|
|
120
|
+
### DAG Design
|
|
121
|
+
- [ ] **Ch 11 — One pipeline per DAG** — Are DAGs focused on a single pipeline? No mega-DAGs?
|
|
122
|
+
- [ ] **Ch 11 — Task granularity** — Are tasks atomic and independently retryable? Not too fine or too coarse?
|
|
123
|
+
- [ ] **Ch 11 — Shallow and wide** — Are DAGs shallow (few sequential steps) and wide (parallel where possible)?
|
|
124
|
+
- [ ] **Ch 11 — No hardcoded dates** — Are dates parameterized using execution_date or equivalent? No hardcoded date strings?
|
|
125
|
+
|
|
126
|
+
### Airflow Specifics
|
|
127
|
+
- [ ] **Ch 11 — Idempotent tasks** — Can every task be safely re-run without data duplication or side effects?
|
|
128
|
+
- [ ] **Ch 11 — Appropriate operators** — Are provider operators used where available instead of generic PythonOperator?
|
|
129
|
+
- [ ] **Ch 11 — XCom usage** — Are XComs used only for small metadata (file paths, row counts), not large data?
|
|
130
|
+
- [ ] **Ch 11 — Sensor timeouts** — Do sensors have timeouts to avoid indefinite waiting?
|
|
131
|
+
- [ ] **Ch 11 — Error callbacks** — Are `on_failure_callback` configured for alerting on task failures?
|
|
132
|
+
- [ ] **Ch 11 — Retries** — Are retries configured with appropriate delay for transient failures?
|
|
133
|
+
- [ ] **Ch 11 — Pool limits** — Are pools used to limit concurrency for resource-constrained tasks?
|
|
134
|
+
- [ ] **Ch 11 — Catchup configuration** — Is `catchup` set appropriately (True for backfill-supporting DAGs, False otherwise)?
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## 7. Monitoring & Operations (Chapters 12–13)
|
|
139
|
+
|
|
140
|
+
### Monitoring
|
|
141
|
+
- [ ] **Ch 12 — Duration tracking** — Is pipeline execution time tracked and alerted on anomalies?
|
|
142
|
+
- [ ] **Ch 12 — Row count monitoring** — Are rows processed per run tracked? Alerts on zero or unusual counts?
|
|
143
|
+
- [ ] **Ch 12 — Error rate tracking** — Are failed records, retries, and exceptions monitored?
|
|
144
|
+
- [ ] **Ch 12 — Data freshness SLA** — Are freshness SLAs defined and monitored? Alerts on breaches?
|
|
145
|
+
- [ ] **Ch 12 — Resource monitoring** — Are CPU, memory, disk, and network usage tracked for pipeline infrastructure?
|
|
146
|
+
|
|
147
|
+
### Alerting
|
|
148
|
+
- [ ] **Ch 12 — Actionable alerts** — Do alerts include context (error message, affected table, run ID, link to logs)?
|
|
149
|
+
- [ ] **Ch 12 — Severity levels** — Are alerts classified by severity with appropriate routing?
|
|
150
|
+
- [ ] **Ch 12 — Alert fatigue prevention** — Are thresholds tuned to avoid noisy alerts? Are alerts deduplicated?
|
|
151
|
+
|
|
152
|
+
### Operational Excellence
|
|
153
|
+
- [ ] **Ch 13 — Idempotency** — Are all pipelines idempotent? Can they be re-run without data corruption?
|
|
154
|
+
- [ ] **Ch 13 — Backfill support** — Are pipelines parameterized for date-range backfilling?
|
|
155
|
+
- [ ] **Ch 13 — Error handling** — Are transient errors retried? Are permanent errors failed fast? Are bad records quarantined?
|
|
156
|
+
- [ ] **Ch 13 — Credential security** — Are credentials in secrets managers, not in code or config files?
|
|
157
|
+
- [ ] **Ch 13 — Data lineage** — Is source-to-destination mapping documented? Is transformation logic recorded?
|
|
158
|
+
- [ ] **Ch 13 — Documentation** — Is there a README, data dictionary, and runbook for each pipeline?
|
|
159
|
+
- [ ] **Ch 13 — Version control** — Is all pipeline code in git? Are changes reviewed via PR?
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## Quick Review Workflow
|
|
164
|
+
|
|
165
|
+
1. **Architecture pass** — Verify ETL/ELT choice, incremental vs full, infrastructure fit
|
|
166
|
+
2. **Ingestion pass** — Check source-specific best practices, error handling, incremental logic
|
|
167
|
+
3. **Loading pass** — Verify bulk loading, staging tables, partitioning, atomic operations
|
|
168
|
+
4. **Transform pass** — Check SQL quality, dbt patterns, layer structure, determinism
|
|
169
|
+
5. **Validation pass** — Verify data quality checks at each boundary, test coverage
|
|
170
|
+
6. **Orchestration pass** — Check DAG design, idempotency, task granularity, error handling
|
|
171
|
+
7. **Operations pass** — Verify monitoring, alerting, backfill support, documentation
|
|
172
|
+
8. **Prioritize findings** — Rank by severity: data loss risk > data quality > performance > best practices > style
|
|
173
|
+
|
|
174
|
+
## Severity Levels
|
|
175
|
+
|
|
176
|
+
| Severity | Description | Example |
|
|
177
|
+
|----------|-------------|---------|
|
|
178
|
+
| **Critical** | Data loss, corruption, or security risk | Non-idempotent pipeline causing duplicates, hardcoded credentials, no staging tables with partial load risk, missing dead letter queue losing events |
|
|
179
|
+
| **High** | Data quality or reliability issues | Missing validation, no error handling, full extraction on large tables, no monitoring or alerting, blocking on rate limits |
|
|
180
|
+
| **Medium** | Performance, maintainability, or operational gaps | Missing partitioning, monolithic DAGs, no backfill support, missing documentation, no incremental models for large tables |
|
|
181
|
+
| **Low** | Best practice improvements, optimization opportunities | Missing compression, suboptimal clustering, verbose logging, minor naming inconsistencies |
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
new_pipeline.py — Scaffold a new data pipeline with extract/transform/load structure.
|
|
4
|
+
Usage: python new_pipeline.py <pipeline-name> [--source csv|api|db] [--target db|file|api]
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
import os
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from string import Template
|
|
12
|
+
|
|
13
|
+
# ---------------------------------------------------------------------------
|
|
14
|
+
# File templates
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
EXTRACT_CSV = '''\
|
|
18
|
+
"""extract.py — Extract data from a CSV source."""
|
|
19
|
+
|
|
20
|
+
import csv
|
|
21
|
+
import logging
|
|
22
|
+
import time
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from functools import wraps
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def retry(max_attempts=3, delay=2.0, exceptions=(Exception,)):
|
|
30
|
+
"""Retry decorator with exponential backoff."""
|
|
31
|
+
def decorator(fn):
|
|
32
|
+
@wraps(fn)
|
|
33
|
+
def wrapper(*args, **kwargs):
|
|
34
|
+
for attempt in range(1, max_attempts + 1):
|
|
35
|
+
try:
|
|
36
|
+
return fn(*args, **kwargs)
|
|
37
|
+
except exceptions as exc:
|
|
38
|
+
if attempt == max_attempts:
|
|
39
|
+
raise
|
|
40
|
+
wait = delay * (2 ** (attempt - 1))
|
|
41
|
+
logger.warning("Attempt %d failed: %s. Retrying in %.1fs...", attempt, exc, wait)
|
|
42
|
+
time.sleep(wait)
|
|
43
|
+
return wrapper
|
|
44
|
+
return decorator
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@retry(max_attempts=3, exceptions=(OSError,))
|
|
48
|
+
def extract(source_path: str) -> list[dict]:
|
|
49
|
+
"""Read rows from a CSV file. Returns a list of dicts."""
|
|
50
|
+
path = Path(source_path)
|
|
51
|
+
if not path.exists():
|
|
52
|
+
raise FileNotFoundError(f"Source file not found: {path}")
|
|
53
|
+
logger.info("Extracting from %s", path)
|
|
54
|
+
with path.open(newline="", encoding="utf-8") as fh:
|
|
55
|
+
reader = csv.DictReader(fh)
|
|
56
|
+
rows = list(reader)
|
|
57
|
+
logger.info("Extracted %d rows", len(rows))
|
|
58
|
+
return rows
|
|
59
|
+
'''
|
|
60
|
+
|
|
61
|
+
EXTRACT_API = '''\
|
|
62
|
+
"""extract.py — Extract data from an HTTP API source."""
|
|
63
|
+
|
|
64
|
+
import json
|
|
65
|
+
import logging
|
|
66
|
+
import time
|
|
67
|
+
import urllib.error
|
|
68
|
+
import urllib.request
|
|
69
|
+
from functools import wraps
|
|
70
|
+
|
|
71
|
+
logger = logging.getLogger(__name__)
|
|
72
|
+
|
|
73
|
+
BASE_URL = "https://api.example.com/data"
|
|
74
|
+
API_KEY = "" # Set via environment variable in production
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def retry(max_attempts=3, delay=2.0, exceptions=(Exception,)):
|
|
78
|
+
"""Retry decorator with exponential backoff."""
|
|
79
|
+
def decorator(fn):
|
|
80
|
+
@wraps(fn)
|
|
81
|
+
def wrapper(*args, **kwargs):
|
|
82
|
+
for attempt in range(1, max_attempts + 1):
|
|
83
|
+
try:
|
|
84
|
+
return fn(*args, **kwargs)
|
|
85
|
+
except exceptions as exc:
|
|
86
|
+
if attempt == max_attempts:
|
|
87
|
+
raise
|
|
88
|
+
wait = delay * (2 ** (attempt - 1))
|
|
89
|
+
logger.warning("Attempt %d failed: %s. Retrying in %.1fs...", attempt, exc, wait)
|
|
90
|
+
time.sleep(wait)
|
|
91
|
+
return wrapper
|
|
92
|
+
return decorator
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@retry(max_attempts=3, exceptions=(urllib.error.URLError, OSError))
|
|
96
|
+
def extract(endpoint: str = BASE_URL) -> list[dict]:
|
|
97
|
+
"""Fetch JSON records from an API endpoint. Returns a list of dicts."""
|
|
98
|
+
logger.info("Extracting from %s", endpoint)
|
|
99
|
+
req = urllib.request.Request(endpoint, headers={"Accept": "application/json"})
|
|
100
|
+
with urllib.request.urlopen(req, timeout=30) as response:
|
|
101
|
+
data = json.loads(response.read())
|
|
102
|
+
records = data if isinstance(data, list) else data.get("results", data.get("items", []))
|
|
103
|
+
logger.info("Extracted %d records", len(records))
|
|
104
|
+
return records
|
|
105
|
+
'''
|
|
106
|
+
|
|
107
|
+
EXTRACT_DB = '''\
|
|
108
|
+
"""extract.py — Extract data from a database source."""
|
|
109
|
+
|
|
110
|
+
import logging
|
|
111
|
+
import sqlite3
|
|
112
|
+
import time
|
|
113
|
+
from functools import wraps
|
|
114
|
+
|
|
115
|
+
logger = logging.getLogger(__name__)
|
|
116
|
+
|
|
117
|
+
DB_PATH = "source.db"
|
|
118
|
+
QUERY = "SELECT * FROM source_table"
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def retry(max_attempts=3, delay=2.0, exceptions=(Exception,)):
|
|
122
|
+
"""Retry decorator with exponential backoff."""
|
|
123
|
+
def decorator(fn):
|
|
124
|
+
@wraps(fn)
|
|
125
|
+
def wrapper(*args, **kwargs):
|
|
126
|
+
for attempt in range(1, max_attempts + 1):
|
|
127
|
+
try:
|
|
128
|
+
return fn(*args, **kwargs)
|
|
129
|
+
except exceptions as exc:
|
|
130
|
+
if attempt == max_attempts:
|
|
131
|
+
raise
|
|
132
|
+
wait = delay * (2 ** (attempt - 1))
|
|
133
|
+
logger.warning("Attempt %d failed: %s. Retrying in %.1fs...", attempt, exc, wait)
|
|
134
|
+
time.sleep(wait)
|
|
135
|
+
return wrapper
|
|
136
|
+
return decorator
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@retry(max_attempts=3, exceptions=(sqlite3.OperationalError,))
|
|
140
|
+
def extract(db_path: str = DB_PATH, query: str = QUERY) -> list[dict]:
|
|
141
|
+
"""Query records from a SQLite database. Returns a list of dicts."""
|
|
142
|
+
logger.info("Connecting to %s", db_path)
|
|
143
|
+
conn = sqlite3.connect(db_path)
|
|
144
|
+
conn.row_factory = sqlite3.Row
|
|
145
|
+
try:
|
|
146
|
+
cursor = conn.execute(query)
|
|
147
|
+
rows = [dict(row) for row in cursor.fetchall()]
|
|
148
|
+
finally:
|
|
149
|
+
conn.close()
|
|
150
|
+
logger.info("Extracted %d rows", len(rows))
|
|
151
|
+
return rows
|
|
152
|
+
'''
|
|
153
|
+
|
|
154
|
+
TRANSFORM_TEMPLATE = '''\
|
|
155
|
+
"""transform.py — Transform extracted records."""
|
|
156
|
+
|
|
157
|
+
import logging
|
|
158
|
+
from typing import Any
|
|
159
|
+
|
|
160
|
+
logger = logging.getLogger(__name__)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _clean_record(record: dict[str, Any]) -> dict[str, Any]:
|
|
164
|
+
"""Strip whitespace from string values and drop empty fields."""
|
|
165
|
+
cleaned = {}
|
|
166
|
+
for key, value in record.items():
|
|
167
|
+
if isinstance(value, str):
|
|
168
|
+
value = value.strip()
|
|
169
|
+
if value not in (None, "", []):
|
|
170
|
+
cleaned[key] = value
|
|
171
|
+
return cleaned
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _validate_record(record: dict[str, Any]) -> bool:
|
|
175
|
+
"""Return True if the record is valid. Customize required fields here."""
|
|
176
|
+
# TODO: add field-specific validation
|
|
177
|
+
return bool(record)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def transform(records: list[dict]) -> list[dict]:
|
|
181
|
+
"""Clean, validate, and reshape records for loading."""
|
|
182
|
+
logger.info("Transforming %d records", len(records))
|
|
183
|
+
output = []
|
|
184
|
+
skipped = 0
|
|
185
|
+
for record in records:
|
|
186
|
+
cleaned = _clean_record(record)
|
|
187
|
+
if not _validate_record(cleaned):
|
|
188
|
+
skipped += 1
|
|
189
|
+
continue
|
|
190
|
+
# TODO: add field mappings / enrichment here
|
|
191
|
+
output.append(cleaned)
|
|
192
|
+
if skipped:
|
|
193
|
+
logger.warning("Skipped %d invalid records", skipped)
|
|
194
|
+
logger.info("Transformed %d records", len(output))
|
|
195
|
+
return output
|
|
196
|
+
'''
|
|
197
|
+
|
|
198
|
+
LOAD_DB = '''\
|
|
199
|
+
"""load.py — Idempotent load into a SQLite database using upsert."""
|
|
200
|
+
|
|
201
|
+
import logging
|
|
202
|
+
import sqlite3
|
|
203
|
+
from typing import Any
|
|
204
|
+
|
|
205
|
+
logger = logging.getLogger(__name__)
|
|
206
|
+
|
|
207
|
+
DB_PATH = "output.db"
|
|
208
|
+
TABLE = "$pipeline_name"
|
|
209
|
+
# Define a unique key column used for upsert conflict detection
|
|
210
|
+
UNIQUE_KEY = "id"
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _ensure_table(conn: sqlite3.Connection, sample: dict[str, Any]) -> None:
|
|
214
|
+
columns = ", ".join(
|
|
215
|
+
f"{col} TEXT" if col != UNIQUE_KEY else f"{col} TEXT PRIMARY KEY"
|
|
216
|
+
for col in sample
|
|
217
|
+
)
|
|
218
|
+
conn.execute(f"CREATE TABLE IF NOT EXISTS {TABLE} ({columns})")
|
|
219
|
+
conn.commit()
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def load(records: list[dict]) -> int:
|
|
223
|
+
"""Upsert records into SQLite. Returns number of rows written."""
|
|
224
|
+
if not records:
|
|
225
|
+
logger.info("No records to load.")
|
|
226
|
+
return 0
|
|
227
|
+
logger.info("Loading %d records into %s:%s", len(records), DB_PATH, TABLE)
|
|
228
|
+
conn = sqlite3.connect(DB_PATH)
|
|
229
|
+
try:
|
|
230
|
+
_ensure_table(conn, records[0])
|
|
231
|
+
cols = ", ".join(records[0].keys())
|
|
232
|
+
placeholders = ", ".join("?" for _ in records[0])
|
|
233
|
+
sql = (
|
|
234
|
+
f"INSERT OR REPLACE INTO {TABLE} ({cols}) VALUES ({placeholders})"
|
|
235
|
+
)
|
|
236
|
+
conn.executemany(sql, [list(r.values()) for r in records])
|
|
237
|
+
conn.commit()
|
|
238
|
+
finally:
|
|
239
|
+
conn.close()
|
|
240
|
+
logger.info("Loaded %d records", len(records))
|
|
241
|
+
return len(records)
|
|
242
|
+
'''
|
|
243
|
+
|
|
244
|
+
# Template for the generated load.py when --target=file.
# BUG FIX: the placeholder was written as "$pipeline_name_output.csv".
# string.Template parses "$pipeline_name_output" as a single identifier
# (identifier chars include underscores), which is not in the substitution
# mapping, so safe_substitute() left the literal "$pipeline_name_output.csv"
# in every generated file. The braced form "${pipeline_name}_output.csv"
# delimits the placeholder so the pipeline name is substituted correctly.
LOAD_FILE = '''\
"""load.py — Write records to a CSV or JSON file (idempotent by overwrite)."""

import csv
import json
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

OUTPUT_PATH = "${pipeline_name}_output.csv"


def load(records: list[dict], output_path: str = OUTPUT_PATH) -> int:
    """Write records to a file. Overwrites to ensure idempotency."""
    if not records:
        logger.info("No records to load.")
        return 0
    path = Path(output_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.suffix == ".json":
        path.write_text(json.dumps(records, indent=2, default=str), encoding="utf-8")
    else:
        with path.open("w", newline="", encoding="utf-8") as fh:
            writer = csv.DictWriter(fh, fieldnames=records[0].keys())
            writer.writeheader()
            writer.writerows(records)
    logger.info("Wrote %d records to %s", len(records), path)
    return len(records)
'''
|
|
274
|
+
|
|
275
|
+
LOAD_API = '''\
|
|
276
|
+
"""load.py — POST records to an API endpoint (idempotent with dedup key)."""
|
|
277
|
+
|
|
278
|
+
import json
|
|
279
|
+
import logging
|
|
280
|
+
import urllib.error
|
|
281
|
+
import urllib.request
|
|
282
|
+
|
|
283
|
+
logger = logging.getLogger(__name__)
|
|
284
|
+
|
|
285
|
+
TARGET_URL = "https://api.example.com/ingest"
|
|
286
|
+
BATCH_SIZE = 100
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _post_batch(batch: list[dict]) -> None:
|
|
290
|
+
payload = json.dumps(batch).encode("utf-8")
|
|
291
|
+
req = urllib.request.Request(
|
|
292
|
+
TARGET_URL,
|
|
293
|
+
data=payload,
|
|
294
|
+
method="POST",
|
|
295
|
+
headers={"Content-Type": "application/json"},
|
|
296
|
+
)
|
|
297
|
+
try:
|
|
298
|
+
with urllib.request.urlopen(req, timeout=30) as resp:
|
|
299
|
+
status = resp.status
|
|
300
|
+
logger.info("Batch of %d posted — HTTP %d", len(batch), status)
|
|
301
|
+
except urllib.error.HTTPError as exc:
|
|
302
|
+
logger.error("HTTP error %d posting batch: %s", exc.code, exc.reason)
|
|
303
|
+
raise
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def load(records: list[dict]) -> int:
|
|
307
|
+
"""POST records in batches. Returns total records sent."""
|
|
308
|
+
if not records:
|
|
309
|
+
logger.info("No records to load.")
|
|
310
|
+
return 0
|
|
311
|
+
total = 0
|
|
312
|
+
for i in range(0, len(records), BATCH_SIZE):
|
|
313
|
+
batch = records[i:i + BATCH_SIZE]
|
|
314
|
+
_post_batch(batch)
|
|
315
|
+
total += len(batch)
|
|
316
|
+
logger.info("Loaded %d records via API", total)
|
|
317
|
+
return total
|
|
318
|
+
'''
|
|
319
|
+
|
|
320
|
+
PIPELINE_TEMPLATE = '''\
|
|
321
|
+
"""pipeline.py — Orchestrator: extract → transform → load."""
|
|
322
|
+
|
|
323
|
+
import logging
|
|
324
|
+
import sys
|
|
325
|
+
import time
|
|
326
|
+
|
|
327
|
+
from extract import extract
|
|
328
|
+
from transform import transform
|
|
329
|
+
from load import load
|
|
330
|
+
|
|
331
|
+
logging.basicConfig(
|
|
332
|
+
level=logging.INFO,
|
|
333
|
+
format="%(asctime)s %(levelname)-8s %(name)s %(message)s",
|
|
334
|
+
datefmt="%Y-%m-%dT%H:%M:%S",
|
|
335
|
+
)
|
|
336
|
+
logger = logging.getLogger("$pipeline_name")
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def run() -> int:
|
|
340
|
+
"""Run the full pipeline. Returns exit code (0=success, 1=failure)."""
|
|
341
|
+
start = time.monotonic()
|
|
342
|
+
logger.info("Pipeline '$pipeline_name' starting")
|
|
343
|
+
try:
|
|
344
|
+
raw = extract()
|
|
345
|
+
records = transform(raw)
|
|
346
|
+
count = load(records)
|
|
347
|
+
elapsed = time.monotonic() - start
|
|
348
|
+
logger.info(
|
|
349
|
+
"Pipeline complete — %d records loaded in %.2fs", count, elapsed
|
|
350
|
+
)
|
|
351
|
+
return 0
|
|
352
|
+
except Exception as exc:
|
|
353
|
+
logger.exception("Pipeline failed: %s", exc)
|
|
354
|
+
return 1
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
if __name__ == "__main__":
|
|
358
|
+
sys.exit(run())
|
|
359
|
+
'''
|
|
360
|
+
|
|
361
|
+
REQUIREMENTS_TEMPLATE = '''\
|
|
362
|
+
# Runtime dependencies for $pipeline_name pipeline
|
|
363
|
+
# Add your project-specific packages below.
|
|
364
|
+
|
|
365
|
+
# Uncomment as needed:
|
|
366
|
+
# requests>=2.31 # for API sources/targets
|
|
367
|
+
# psycopg2-binary>=2.9 # for PostgreSQL
|
|
368
|
+
# pymysql>=1.1 # for MySQL
|
|
369
|
+
# pandas>=2.0 # for complex transformations
|
|
370
|
+
# pydantic>=2.0 # for record validation
|
|
371
|
+
'''
|
|
372
|
+
|
|
373
|
+
# ---------------------------------------------------------------------------
|
|
374
|
+
# Source/target template selection
|
|
375
|
+
# ---------------------------------------------------------------------------
|
|
376
|
+
|
|
377
|
+
# Lookup tables mapping the CLI --source / --target choices (see main()) to
# the file template emitted as extract.py / load.py respectively.
EXTRACT_TEMPLATES = {"csv": EXTRACT_CSV, "api": EXTRACT_API, "db": EXTRACT_DB}
LOAD_TEMPLATES = {"db": LOAD_DB, "file": LOAD_FILE, "api": LOAD_API}
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def render(template_str: str, pipeline_name: str) -> str:
    """Fill every $pipeline_name placeholder in *template_str*.

    Hyphens in the pipeline name become underscores so the substituted value
    is a valid Python identifier / table name. Unknown placeholders are left
    untouched (safe_substitute), so literal "$" text in templates survives.
    """
    identifier_safe = pipeline_name.replace("-", "_")
    template = Template(template_str)
    return template.safe_substitute(pipeline_name=identifier_safe)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def create_pipeline(name: str, source: str, target: str) -> None:
    """Scaffold a pipeline directory named *name* with rendered template files.

    Exits with status 1 if the directory already exists. *source* and
    *target* must be keys of EXTRACT_TEMPLATES / LOAD_TEMPLATES (enforced
    upstream by argparse choices).
    """
    pipeline_dir = Path(name)
    if pipeline_dir.exists():
        print(f"Error: directory '{pipeline_dir}' already exists. Choose a different name.")
        sys.exit(1)
    pipeline_dir.mkdir(parents=True)

    # Filename -> raw template; rendering happens as each file is written.
    templates = {
        "extract.py": EXTRACT_TEMPLATES[source],
        "transform.py": TRANSFORM_TEMPLATE,
        "load.py": LOAD_TEMPLATES[target],
        "pipeline.py": PIPELINE_TEMPLATE,
        "requirements.txt": REQUIREMENTS_TEMPLATE,
    }

    written = []
    for filename, template in templates.items():
        destination = pipeline_dir / filename
        destination.write_text(render(template, name), encoding="utf-8")
        written.append(str(destination))

    print(f"\nPipeline '{name}' created successfully!\n")
    print(f" Source : {source}")
    print(f" Target : {target}")
    print("\nFiles created:")
    for entry in written:
        print(f" {entry}")
    print("\nNext steps:")
    print(f" 1. cd {name}")
    print(" 2. Review extract.py and update source configuration")
    print(" 3. Customize transform.py with your business logic")
    print(" 4. Review load.py and configure target destination")
    print(" 5. pip install -r requirements.txt # add packages as needed")
    print(" 6. python pipeline.py")
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def main():
    """CLI entry point: parse arguments and scaffold the pipeline."""
    parser = argparse.ArgumentParser(
        description="Scaffold a new data pipeline (extract → transform → load)"
    )
    parser.add_argument("name", help="Pipeline name (used as directory name)")
    parser.add_argument(
        "--source",
        choices=["csv", "api", "db"],
        default="csv",
        help="Data source type (default: csv)",
    )
    parser.add_argument(
        "--target",
        choices=["db", "file", "api"],
        default="db",
        help="Data target type (default: db)",
    )
    options = parser.parse_args()
    create_pipeline(options.name, options.source, options.target)


if __name__ == "__main__":
    main()
|