forgecraft-mcp 1.2.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. package/README.md +525 -525
  2. package/dist/cli/help.js +44 -44
  3. package/dist/registry/renderer-skeletons.js +92 -92
  4. package/dist/shared/gs-score-logger.js +6 -6
  5. package/dist/tools/add-module.js +123 -123
  6. package/dist/tools/advice-registry.js +18 -18
  7. package/dist/tools/check-cascade-report.js +64 -64
  8. package/dist/tools/configure-mcp.d.ts +3 -0
  9. package/dist/tools/configure-mcp.d.ts.map +1 -1
  10. package/dist/tools/configure-mcp.js +10 -0
  11. package/dist/tools/configure-mcp.js.map +1 -1
  12. package/dist/tools/forgecraft-dispatch.d.ts.map +1 -1
  13. package/dist/tools/forgecraft-dispatch.js +3 -0
  14. package/dist/tools/forgecraft-dispatch.js.map +1 -1
  15. package/dist/tools/forgecraft-schema-params.d.ts +9 -0
  16. package/dist/tools/forgecraft-schema-params.d.ts.map +1 -1
  17. package/dist/tools/forgecraft-schema-params.js +21 -0
  18. package/dist/tools/forgecraft-schema-params.js.map +1 -1
  19. package/dist/tools/forgecraft-schema.d.ts +9 -0
  20. package/dist/tools/forgecraft-schema.d.ts.map +1 -1
  21. package/dist/tools/refresh-output.js +14 -14
  22. package/dist/tools/scaffold-spec-stubs.js +115 -115
  23. package/dist/tools/scaffold-templates.js +62 -62
  24. package/dist/tools/setup-artifact-writers.d.ts +30 -0
  25. package/dist/tools/setup-artifact-writers.d.ts.map +1 -1
  26. package/dist/tools/setup-artifact-writers.js +120 -8
  27. package/dist/tools/setup-artifact-writers.js.map +1 -1
  28. package/dist/tools/setup-phase1.d.ts +3 -0
  29. package/dist/tools/setup-phase1.d.ts.map +1 -1
  30. package/dist/tools/setup-phase1.js +79 -35
  31. package/dist/tools/setup-phase1.js.map +1 -1
  32. package/dist/tools/setup-phase2.d.ts +2 -0
  33. package/dist/tools/setup-phase2.d.ts.map +1 -1
  34. package/dist/tools/setup-phase2.js +10 -1
  35. package/dist/tools/setup-phase2.js.map +1 -1
  36. package/dist/tools/setup-project.d.ts +18 -0
  37. package/dist/tools/setup-project.d.ts.map +1 -1
  38. package/dist/tools/setup-project.js +77 -1
  39. package/dist/tools/setup-project.js.map +1 -1
  40. package/dist/tools/spec-parser-tags.d.ts +9 -0
  41. package/dist/tools/spec-parser-tags.d.ts.map +1 -1
  42. package/dist/tools/spec-parser-tags.js +92 -0
  43. package/dist/tools/spec-parser-tags.js.map +1 -1
  44. package/package.json +89 -86
  45. package/templates/analytics/instructions.yaml +37 -37
  46. package/templates/analytics/mcp-servers.yaml +11 -11
  47. package/templates/analytics/structure.yaml +25 -25
  48. package/templates/api/instructions.yaml +231 -231
  49. package/templates/api/mcp-servers.yaml +22 -13
  50. package/templates/api/nfr.yaml +23 -23
  51. package/templates/api/review.yaml +103 -103
  52. package/templates/api/structure.yaml +34 -34
  53. package/templates/api/verification.yaml +132 -132
  54. package/templates/cli/instructions.yaml +31 -31
  55. package/templates/cli/mcp-servers.yaml +11 -11
  56. package/templates/cli/review.yaml +53 -53
  57. package/templates/cli/structure.yaml +16 -16
  58. package/templates/data-lineage/instructions.yaml +28 -28
  59. package/templates/data-lineage/mcp-servers.yaml +22 -22
  60. package/templates/data-pipeline/instructions.yaml +84 -84
  61. package/templates/data-pipeline/mcp-servers.yaml +13 -13
  62. package/templates/data-pipeline/nfr.yaml +39 -39
  63. package/templates/data-pipeline/structure.yaml +23 -23
  64. package/templates/fintech/hooks.yaml +55 -55
  65. package/templates/fintech/instructions.yaml +112 -112
  66. package/templates/fintech/mcp-servers.yaml +13 -13
  67. package/templates/fintech/nfr.yaml +46 -46
  68. package/templates/fintech/playbook.yaml +210 -210
  69. package/templates/fintech/verification.yaml +239 -239
  70. package/templates/game/instructions.yaml +289 -289
  71. package/templates/game/mcp-servers.yaml +38 -38
  72. package/templates/game/nfr.yaml +64 -64
  73. package/templates/game/playbook.yaml +214 -214
  74. package/templates/game/review.yaml +97 -97
  75. package/templates/game/structure.yaml +67 -67
  76. package/templates/game/verification.yaml +174 -174
  77. package/templates/healthcare/instructions.yaml +42 -42
  78. package/templates/healthcare/mcp-servers.yaml +13 -13
  79. package/templates/healthcare/nfr.yaml +47 -47
  80. package/templates/hipaa/instructions.yaml +41 -41
  81. package/templates/hipaa/mcp-servers.yaml +13 -13
  82. package/templates/infra/instructions.yaml +104 -104
  83. package/templates/infra/mcp-servers.yaml +20 -20
  84. package/templates/infra/nfr.yaml +46 -46
  85. package/templates/infra/review.yaml +65 -65
  86. package/templates/infra/structure.yaml +25 -25
  87. package/templates/library/instructions.yaml +36 -36
  88. package/templates/library/mcp-servers.yaml +20 -20
  89. package/templates/library/review.yaml +56 -56
  90. package/templates/library/structure.yaml +19 -19
  91. package/templates/medallion-architecture/instructions.yaml +41 -41
  92. package/templates/medallion-architecture/mcp-servers.yaml +22 -22
  93. package/templates/ml/instructions.yaml +85 -85
  94. package/templates/ml/mcp-servers.yaml +11 -11
  95. package/templates/ml/nfr.yaml +39 -39
  96. package/templates/ml/structure.yaml +25 -25
  97. package/templates/ml/verification.yaml +156 -156
  98. package/templates/mobile/instructions.yaml +44 -44
  99. package/templates/mobile/mcp-servers.yaml +11 -11
  100. package/templates/mobile/nfr.yaml +49 -49
  101. package/templates/mobile/structure.yaml +27 -27
  102. package/templates/mobile/verification.yaml +121 -121
  103. package/templates/observability-xray/instructions.yaml +40 -40
  104. package/templates/observability-xray/mcp-servers.yaml +15 -15
  105. package/templates/realtime/instructions.yaml +42 -42
  106. package/templates/realtime/mcp-servers.yaml +13 -13
  107. package/templates/soc2/instructions.yaml +41 -41
  108. package/templates/soc2/mcp-servers.yaml +24 -24
  109. package/templates/social/instructions.yaml +43 -43
  110. package/templates/social/mcp-servers.yaml +24 -24
  111. package/templates/state-machine/instructions.yaml +42 -42
  112. package/templates/state-machine/mcp-servers.yaml +11 -11
  113. package/templates/tools-registry.yaml +164 -164
  114. package/templates/universal/hooks.yaml +531 -531
  115. package/templates/universal/instructions.yaml +1692 -1692
  116. package/templates/universal/mcp-servers.yaml +50 -50
  117. package/templates/universal/nfr.yaml +197 -197
  118. package/templates/universal/reference.yaml +326 -326
  119. package/templates/universal/review.yaml +204 -204
  120. package/templates/universal/skills.yaml +262 -262
  121. package/templates/universal/structure.yaml +67 -67
  122. package/templates/universal/verification.yaml +416 -416
  123. package/templates/web-react/hooks.yaml +44 -44
  124. package/templates/web-react/instructions.yaml +207 -207
  125. package/templates/web-react/mcp-servers.yaml +20 -20
  126. package/templates/web-react/nfr.yaml +27 -27
  127. package/templates/web-react/review.yaml +94 -94
  128. package/templates/web-react/structure.yaml +46 -46
  129. package/templates/web-react/verification.yaml +126 -126
  130. package/templates/web-static/instructions.yaml +115 -115
  131. package/templates/web-static/mcp-servers.yaml +20 -20
  132. package/templates/web3/instructions.yaml +44 -44
  133. package/templates/web3/mcp-servers.yaml +11 -11
  134. package/templates/web3/verification.yaml +159 -159
  135. package/templates/zero-trust/instructions.yaml +41 -41
  136. package/templates/zero-trust/mcp-servers.yaml +15 -15
package/templates/cli/review.yaml
@@ -1,53 +1,53 @@
- tag: CLI
- section: review
- blocks:
- - id: cli-architecture-review
- tier: recommended
- dimension: architecture
- title: "CLI Architecture Review"
- description: |
- Evaluate CLI-specific patterns: argument parsing, output formatting, and exit codes.
- checklist:
- - id: cli-argument-parsing
- description: "Arguments parsed with a proper library (yargs, commander, clap). No manual argv slicing."
- severity: important
- - id: cli-exit-codes
- description: "Exit codes follow convention: 0 = success, 1 = general error, 2 = usage error. Documented."
- severity: important
- - id: cli-help-output
- description: "--help flag yields clear usage with examples. Every subcommand has a description."
- severity: important
- - id: cli-stdin-stdout
- description: "Supports stdin/stdout piping for composability. Machine-readable output (JSON) via --json flag."
- severity: nice-to-have
-
- - id: cli-code-quality-review
- tier: recommended
- dimension: code-quality
- title: "CLI Code Quality Review"
- description: |
- Evaluate CLI UX and error handling.
- checklist:
- - id: cli-error-messages
- description: "Errors show what went wrong, why, and how to fix it. Include the failing input value."
- severity: critical
- - id: cli-progress-feedback
- description: "Long operations show progress (spinner, progress bar). Silent mode available via --quiet."
- severity: important
- - id: cli-destructive-confirm
- description: "Destructive operations require confirmation unless --force or --yes flag is passed."
- severity: important
-
- - id: cli-test-review
- tier: recommended
- dimension: tests
- title: "CLI Test Review"
- description: |
- Evaluate CLI-specific testing patterns.
- checklist:
- - id: cli-integration-tests
- description: "End-to-end tests invoke the CLI binary and assert on stdout, stderr, and exit code."
- severity: critical
- - id: cli-snapshot-output
- description: "Help text and formatted output tested via snapshots to catch unintended changes."
- severity: nice-to-have
+ tag: CLI
+ section: review
+ blocks:
+ - id: cli-architecture-review
+ tier: recommended
+ dimension: architecture
+ title: "CLI Architecture Review"
+ description: |
+ Evaluate CLI-specific patterns: argument parsing, output formatting, and exit codes.
+ checklist:
+ - id: cli-argument-parsing
+ description: "Arguments parsed with a proper library (yargs, commander, clap). No manual argv slicing."
+ severity: important
+ - id: cli-exit-codes
+ description: "Exit codes follow convention: 0 = success, 1 = general error, 2 = usage error. Documented."
+ severity: important
+ - id: cli-help-output
+ description: "--help flag yields clear usage with examples. Every subcommand has a description."
+ severity: important
+ - id: cli-stdin-stdout
+ description: "Supports stdin/stdout piping for composability. Machine-readable output (JSON) via --json flag."
+ severity: nice-to-have
+
+ - id: cli-code-quality-review
+ tier: recommended
+ dimension: code-quality
+ title: "CLI Code Quality Review"
+ description: |
+ Evaluate CLI UX and error handling.
+ checklist:
+ - id: cli-error-messages
+ description: "Errors show what went wrong, why, and how to fix it. Include the failing input value."
+ severity: critical
+ - id: cli-progress-feedback
+ description: "Long operations show progress (spinner, progress bar). Silent mode available via --quiet."
+ severity: important
+ - id: cli-destructive-confirm
+ description: "Destructive operations require confirmation unless --force or --yes flag is passed."
+ severity: important
+
+ - id: cli-test-review
+ tier: recommended
+ dimension: tests
+ title: "CLI Test Review"
+ description: |
+ Evaluate CLI-specific testing patterns.
+ checklist:
+ - id: cli-integration-tests
+ description: "End-to-end tests invoke the CLI binary and assert on stdout, stderr, and exit code."
+ severity: critical
+ - id: cli-snapshot-output
+ description: "Help text and formatted output tested via snapshots to catch unintended changes."
+ severity: nice-to-have
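The exit-code and error-message checks in this review template need only a small amount of structure at a CLI entry point. The sketch below is illustrative, not code from forgecraft-mcp; `UsageError` and `fail` are hypothetical names standing in for whatever a project actually uses.

```ts
// Minimal sketch of the checklist's exit-code convention:
// 0 = success, 1 = general error, 2 = usage error.
class UsageError extends Error {}

// Report what went wrong, why, and how to fix it, then exit with the right code.
function fail(err: unknown): never {
  if (err instanceof UsageError) {
    console.error(`Usage error: ${err.message}`);
    console.error("Run with --help for usage and examples.");
    process.exit(2);
  }
  console.error(`Error: ${err instanceof Error ? err.message : String(err)}`);
  process.exit(1);
}

async function main(): Promise<void> {
  // ... parse arguments, dispatch to the selected subcommand ...
}

main().then(() => process.exit(0)).catch(fail);
```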
package/templates/cli/structure.yaml
@@ -1,16 +1,16 @@
- tag: CLI
- section: structure
- language: typescript
- entries:
- - path: src/index.ts
- type: file
- description: "Entry point with shebang (#!/usr/bin/env node)"
- - path: src/cli
- type: directory
- description: "CLI command definitions and argument parsing"
- - path: src/shared/config
- type: directory
- description: "CLI configuration and env handling"
- - path: src/shared/errors
- type: directory
- description: "Error hierarchy with user-friendly messages"
+ tag: CLI
+ section: structure
+ language: typescript
+ entries:
+ - path: src/index.ts
+ type: file
+ description: "Entry point with shebang (#!/usr/bin/env node)"
+ - path: src/cli
+ type: directory
+ description: "CLI command definitions and argument parsing"
+ - path: src/shared/config
+ type: directory
+ description: "CLI configuration and env handling"
+ - path: src/shared/errors
+ type: directory
+ description: "Error hierarchy with user-friendly messages"
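For the `src/index.ts` entry described in this structure template, a hypothetical skeleton using commander (one of the parsers the review checklist names) could look like the following. The CLI name and the `build` subcommand are placeholders, not part of the package.

```ts
#!/usr/bin/env node
// Hypothetical entry point matching the layout above: shebang, a named CLI,
// and a subcommand with its own description and a --json flag.
import { Command } from "commander";

const program = new Command();

program
  .name("mycli")
  .description("Example CLI with per-subcommand descriptions")
  .version("0.1.0");

program
  .command("build")
  .description("Build the project")
  .option("--json", "emit machine-readable JSON output")
  .action((opts) => {
    const result = { ok: true };
    console.log(opts.json ? JSON.stringify(result) : "Build succeeded");
  });

program.parse();
```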
package/templates/data-lineage/instructions.yaml
@@ -1,28 +1,28 @@
- tag: DATA-LINEAGE
- section: instructions
- blocks:
- - id: field-coverage
- tier: recommended
- title: "100% Field Coverage Enforcement"
- content: |
- ## 100% Field Coverage Enforcement
-
- - Every data field must have a documented origin (source system, table, column) and destination (target system, table, column).
- - Maintain a field-level lineage registry: no field enters production without a lineage entry. Enforce this in CI.
- - Track field transformations explicitly: document every rename, type cast, aggregation, join, and filter applied to each field.
- - Implement automated coverage checks: compare the lineage registry against actual schema definitions. Flag any untracked fields as errors.
- - Version lineage metadata alongside code: lineage definitions live in the repository, not in external wikis or spreadsheets.
- - Generate lineage reports per pipeline run: which fields were read, transformed, and written, with row counts and data quality metrics.
-
- - id: lineage-tracking-decorators
- tier: recommended
- title: "Lineage Tracking Decorators & Annotations"
- content: |
- ## Lineage Tracking Decorators & Annotations
-
- - Use decorators or annotations on transformation functions to declare input fields, output fields, and transformation type.
- - Standardize lineage metadata format: source, transformation, destination, timestamp, pipeline_id, run_id.
- - Emit lineage events at runtime: every transformation step publishes a lineage event to a centralized lineage store.
- - Support both code-level lineage (decorators on functions) and config-level lineage (YAML/JSON transformation specs).
- - Integrate lineage with data quality: when a quality check fails, trace back through lineage to identify the source of bad data.
- - Visualize lineage graphs: generate dependency diagrams showing field-level flow from source to consumption.
+ tag: DATA-LINEAGE
+ section: instructions
+ blocks:
+ - id: field-coverage
+ tier: recommended
+ title: "100% Field Coverage Enforcement"
+ content: |
+ ## 100% Field Coverage Enforcement
+
+ - Every data field must have a documented origin (source system, table, column) and destination (target system, table, column).
+ - Maintain a field-level lineage registry: no field enters production without a lineage entry. Enforce this in CI.
+ - Track field transformations explicitly: document every rename, type cast, aggregation, join, and filter applied to each field.
+ - Implement automated coverage checks: compare the lineage registry against actual schema definitions. Flag any untracked fields as errors.
+ - Version lineage metadata alongside code: lineage definitions live in the repository, not in external wikis or spreadsheets.
+ - Generate lineage reports per pipeline run: which fields were read, transformed, and written, with row counts and data quality metrics.
+
+ - id: lineage-tracking-decorators
+ tier: recommended
+ title: "Lineage Tracking Decorators & Annotations"
+ content: |
+ ## Lineage Tracking Decorators & Annotations
+
+ - Use decorators or annotations on transformation functions to declare input fields, output fields, and transformation type.
+ - Standardize lineage metadata format: source, transformation, destination, timestamp, pipeline_id, run_id.
+ - Emit lineage events at runtime: every transformation step publishes a lineage event to a centralized lineage store.
+ - Support both code-level lineage (decorators on functions) and config-level lineage (YAML/JSON transformation specs).
+ - Integrate lineage with data quality: when a quality check fails, trace back through lineage to identify the source of bad data.
+ - Visualize lineage graphs: generate dependency diagrams showing field-level flow from source to consumption.
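One way to realize the decorator/annotation idea in this template is a plain higher-order wrapper rather than a TypeScript decorator, so that every transformation call declares its lineage and emits a runtime event. This is a sketch under that assumption; `withLineage`, `emitLineage`, and the example field paths are hypothetical, not part of the package.

```ts
// Sketch: code-level lineage via a wrapper around transformation functions.
interface LineageMeta {
  source: string;         // e.g. "raw.orders.amount"
  transformation: string; // e.g. "convert:usd"
  destination: string;    // e.g. "core.orders.amount_usd"
}

interface LineageEvent extends LineageMeta {
  timestamp: string;
  pipeline_id: string;
  run_id: string;
}

// Stand-in for publishing to a centralized lineage store.
async function emitLineage(event: LineageEvent): Promise<void> {
  console.log(JSON.stringify(event));
}

// Wrap a transformation so every invocation records source, transformation,
// destination, and the run context it executed under.
function withLineage<I, O>(meta: LineageMeta, fn: (input: I) => O) {
  return async (input: I, ctx: { pipeline_id: string; run_id: string }): Promise<O> => {
    const output = fn(input);
    await emitLineage({ ...meta, ...ctx, timestamp: new Date().toISOString() });
    return output;
  };
}

// Usage: the metadata travels with the function, so the lineage registry can
// be generated from code instead of maintained in a spreadsheet.
const toUsd = withLineage(
  { source: "raw.orders.amount", transformation: "convert:usd", destination: "core.orders.amount_usd" },
  (amount: number) => amount * 1.08,
);
```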
package/templates/data-lineage/mcp-servers.yaml
@@ -1,22 +1,22 @@
- tag: DATA-LINEAGE
- section: mcp-servers
- servers:
- - name: postgres
- description: "PostgreSQL database inspection — query lineage metadata and schema definitions"
- command: npx
- args: ["-y", "@modelcontextprotocol/server-postgres"]
- tags: [DATA-LINEAGE, DATA-PIPELINE]
- category: database
- tier: recommended
- env:
- POSTGRES_CONNECTION_STRING: ""
- url: "https://github.com/modelcontextprotocol/servers/tree/main/src/postgres"
-
- - name: filesystem
- description: "Filesystem access for lineage definition files and pipeline configurations"
- command: npx
- args: ["-y", "@modelcontextprotocol/server-filesystem"]
- tags: [DATA-LINEAGE, UNIVERSAL]
- category: filesystem
- tier: optional
- url: "https://github.com/modelcontextprotocol/servers/tree/main/src/filesystem"
+ tag: DATA-LINEAGE
+ section: mcp-servers
+ servers:
+ - name: postgres
+ description: "PostgreSQL database inspection — query lineage metadata and schema definitions"
+ command: npx
+ args: ["-y", "@modelcontextprotocol/server-postgres"]
+ tags: [DATA-LINEAGE, DATA-PIPELINE]
+ category: database
+ tier: recommended
+ env:
+ POSTGRES_CONNECTION_STRING: ""
+ url: "https://github.com/modelcontextprotocol/servers/tree/main/src/postgres"
+
+ - name: filesystem
+ description: "Filesystem access for lineage definition files and pipeline configurations"
+ command: npx
+ args: ["-y", "@modelcontextprotocol/server-filesystem"]
+ tags: [DATA-LINEAGE, UNIVERSAL]
+ category: filesystem
+ tier: optional
+ url: "https://github.com/modelcontextprotocol/servers/tree/main/src/filesystem"
package/templates/data-pipeline/instructions.yaml
@@ -1,84 +1,84 @@
- tag: DATA-PIPELINE
- section: instructions
- blocks:
- - id: etl-patterns
- tier: recommended
- title: "ETL & Pipeline Design"
- content: |
- ## ETL & Pipeline Orchestration
-
- - Design every pipeline stage to be idempotent. Re-running a stage with the same input must produce the same output without side effects or duplicates.
- - Use a DAG-based orchestrator (e.g., Airflow, Dagster, Prefect) to define task dependencies, schedule runs, and handle retries with exponential backoff.
- - Partition data by time (daily/hourly) or logical key so that re-processing a failed partition does not require reprocessing the entire dataset.
- - Implement the Extract-Load-Transform (ELT) pattern when the target warehouse supports compute; push transformations into SQL/dbt models for transparency and version control.
- - Tag every pipeline run with a unique run ID. Propagate this ID through all stages and into output metadata for end-to-end lineage tracking.
- - Keep extraction, transformation, and loading logic in separate, independently testable modules. Avoid monolithic scripts that mix concerns.
- - Define SLAs for each pipeline (e.g., "daily sales pipeline completes by 06:00 UTC"). Alert on SLA breach, not just on failure.
-
- - id: data-validation
- tier: recommended
- title: "Data Validation & Quality"
- content: |
- ## Data Validation & Quality Gates
-
- - Validate data at every boundary: after extraction (schema conformance), after transformation (business rules), and before loading (referential integrity).
- - Use schema contracts (e.g., JSON Schema, Avro, Protobuf, or Great Expectations suites) to enforce column types, nullability, and allowed value ranges.
- - Implement row-count and distribution checks between stages. A sudden drop or spike in row count (> 20% deviation from baseline) should trigger an alert and pause downstream processing.
- - Quarantine invalid records into a dead-letter table rather than dropping them silently. Include the original record, the validation error, and the run ID.
- - Maintain a data quality dashboard that tracks freshness, completeness, uniqueness, and accuracy metrics per dataset over time.
- - Write unit tests for transformation logic using fixed input fixtures. Write integration tests that run a mini-pipeline against a test database.
-
- - id: reliability-patterns
- tier: recommended
- title: "Reliability & Error Handling"
- content: |
- ## Reliability, Retry & Recovery
-
- - Configure retries with exponential backoff and jitter for transient failures (network timeouts, rate limits, temporary unavailability). Cap retries at 3-5 attempts.
- - Use checkpointing or write-ahead logs for long-running pipelines so that a restart resumes from the last successful checkpoint, not from the beginning.
- - Implement circuit breakers on external API calls to prevent cascading failures when a source system is degraded.
- - Log structured events (JSON) for every stage: start, success, failure, retry, and skip. Include record counts, duration, and error details.
- - Design for exactly-once semantics where possible using upserts (INSERT ... ON CONFLICT UPDATE) or deduplication keys in the target store.
- - Maintain a pipeline runbook that documents failure modes, recovery steps, and escalation contacts. Review and update it quarterly.
-
- - id: data-pipeline-testing
- tier: recommended
- title: "Data Pipeline Testing Requirements"
- content: |
- ## Data Pipeline Testing Requirements
-
- ### Data Quality Assertions at Every Stage
- Assert at each pipeline boundary — after extraction, after transformation, before loading:
- - **Row count** — within expected range or ± % of prior run baseline.
- - **Null rate** — per column, against stated threshold (e.g., `customer_id` must never be null).
- - **Schema compliance** — column names, types, nullable flags match the declared schema.
- - **Referential integrity** — foreign key values exist in the referenced table.
- - **Distribution checks** — value ranges and cardinality within expected bounds. Alert on sudden shifts (> 20% deviation from trailing 7-day average).
- Use Great Expectations, dbt tests, or equivalent. Assertions are first-class artifacts — committed, versioned, and enforced in CI.
-
- ### Idempotency Tests
- Re-running the pipeline with the same input must produce identical output with no side effects. Test explicitly:
- - Run pipeline N once; capture output row count and checksum.
- - Run pipeline N again with the same source data.
- - Assert: output row count and checksum are identical; no duplicates in target; no reprocessed records in dead-letter queue.
- Idempotency is a contract, not an assumption. If a stage is not idempotent by design, that decision requires an ADR.
-
- ### Backfill Correctness Tests
- Historical data processed against current transformation logic must produce correct results:
- - Define a fixed historical fixture (date range, known input, known output).
- - Run current pipeline logic against the fixture.
- - Assert output matches expected historical output.
- Any change to transformation logic that causes a backfill regression is a breaking change requiring an ADR.
-
- ### Dead-Letter Queue Draining Tests
- Malformed records must be handled — not silently dropped and not blocking the pipeline:
- - Inject a record that violates the schema; assert it routes to the dead-letter table/queue.
- - Inject a record that passes schema but fails a business rule; assert correct routing.
- - Assert the DLQ record contains: original payload, validation error, run ID, timestamp.
- - Test DLQ drain procedure: records can be corrected, resubmitted, and processed successfully.
-
- ### Volume / Scale Tests at Staging
- Unit-scale fixtures are insufficient for pipeline correctness. Run at representative data volume:
- - At staging: use a realistic data volume (minimum 10% of production record count, or a known large-batch scenario).
- - Assert: completion within SLA window; no memory exhaustion; partition output correct.
- - Document the scale test fixture size and the SLA threshold in the spec.
+ tag: DATA-PIPELINE
+ section: instructions
+ blocks:
+ - id: etl-patterns
+ tier: recommended
+ title: "ETL & Pipeline Design"
+ content: |
+ ## ETL & Pipeline Orchestration
+
+ - Design every pipeline stage to be idempotent. Re-running a stage with the same input must produce the same output without side effects or duplicates.
+ - Use a DAG-based orchestrator (e.g., Airflow, Dagster, Prefect) to define task dependencies, schedule runs, and handle retries with exponential backoff.
+ - Partition data by time (daily/hourly) or logical key so that re-processing a failed partition does not require reprocessing the entire dataset.
+ - Implement the Extract-Load-Transform (ELT) pattern when the target warehouse supports compute; push transformations into SQL/dbt models for transparency and version control.
+ - Tag every pipeline run with a unique run ID. Propagate this ID through all stages and into output metadata for end-to-end lineage tracking.
+ - Keep extraction, transformation, and loading logic in separate, independently testable modules. Avoid monolithic scripts that mix concerns.
+ - Define SLAs for each pipeline (e.g., "daily sales pipeline completes by 06:00 UTC"). Alert on SLA breach, not just on failure.
+
+ - id: data-validation
+ tier: recommended
+ title: "Data Validation & Quality"
+ content: |
+ ## Data Validation & Quality Gates
+
+ - Validate data at every boundary: after extraction (schema conformance), after transformation (business rules), and before loading (referential integrity).
+ - Use schema contracts (e.g., JSON Schema, Avro, Protobuf, or Great Expectations suites) to enforce column types, nullability, and allowed value ranges.
+ - Implement row-count and distribution checks between stages. A sudden drop or spike in row count (> 20% deviation from baseline) should trigger an alert and pause downstream processing.
+ - Quarantine invalid records into a dead-letter table rather than dropping them silently. Include the original record, the validation error, and the run ID.
+ - Maintain a data quality dashboard that tracks freshness, completeness, uniqueness, and accuracy metrics per dataset over time.
+ - Write unit tests for transformation logic using fixed input fixtures. Write integration tests that run a mini-pipeline against a test database.
+
+ - id: reliability-patterns
+ tier: recommended
+ title: "Reliability & Error Handling"
+ content: |
+ ## Reliability, Retry & Recovery
+
+ - Configure retries with exponential backoff and jitter for transient failures (network timeouts, rate limits, temporary unavailability). Cap retries at 3-5 attempts.
+ - Use checkpointing or write-ahead logs for long-running pipelines so that a restart resumes from the last successful checkpoint, not from the beginning.
+ - Implement circuit breakers on external API calls to prevent cascading failures when a source system is degraded.
+ - Log structured events (JSON) for every stage: start, success, failure, retry, and skip. Include record counts, duration, and error details.
+ - Design for exactly-once semantics where possible using upserts (INSERT ... ON CONFLICT UPDATE) or deduplication keys in the target store.
+ - Maintain a pipeline runbook that documents failure modes, recovery steps, and escalation contacts. Review and update it quarterly.
+
+ - id: data-pipeline-testing
+ tier: recommended
+ title: "Data Pipeline Testing Requirements"
+ content: |
+ ## Data Pipeline Testing Requirements
+
+ ### Data Quality Assertions at Every Stage
+ Assert at each pipeline boundary — after extraction, after transformation, before loading:
+ - **Row count** — within expected range or ± % of prior run baseline.
+ - **Null rate** — per column, against stated threshold (e.g., `customer_id` must never be null).
+ - **Schema compliance** — column names, types, nullable flags match the declared schema.
+ - **Referential integrity** — foreign key values exist in the referenced table.
+ - **Distribution checks** — value ranges and cardinality within expected bounds. Alert on sudden shifts (> 20% deviation from trailing 7-day average).
+ Use Great Expectations, dbt tests, or equivalent. Assertions are first-class artifacts — committed, versioned, and enforced in CI.
+
+ ### Idempotency Tests
+ Re-running the pipeline with the same input must produce identical output with no side effects. Test explicitly:
+ - Run pipeline N once; capture output row count and checksum.
+ - Run pipeline N again with the same source data.
+ - Assert: output row count and checksum are identical; no duplicates in target; no reprocessed records in dead-letter queue.
+ Idempotency is a contract, not an assumption. If a stage is not idempotent by design, that decision requires an ADR.
+
+ ### Backfill Correctness Tests
+ Historical data processed against current transformation logic must produce correct results:
+ - Define a fixed historical fixture (date range, known input, known output).
+ - Run current pipeline logic against the fixture.
+ - Assert output matches expected historical output.
+ Any change to transformation logic that causes a backfill regression is a breaking change requiring an ADR.
+
+ ### Dead-Letter Queue Draining Tests
+ Malformed records must be handled — not silently dropped and not blocking the pipeline:
+ - Inject a record that violates the schema; assert it routes to the dead-letter table/queue.
+ - Inject a record that passes schema but fails a business rule; assert correct routing.
+ - Assert the DLQ record contains: original payload, validation error, run ID, timestamp.
+ - Test DLQ drain procedure: records can be corrected, resubmitted, and processed successfully.
+
+ ### Volume / Scale Tests at Staging
+ Unit-scale fixtures are insufficient for pipeline correctness. Run at representative data volume:
+ - At staging: use a realistic data volume (minimum 10% of production record count, or a known large-batch scenario).
+ - Assert: completion within SLA window; no memory exhaustion; partition output correct.
+ - Document the scale test fixture size and the SLA threshold in the spec.
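As one concrete reading of the retry guidance in this template (exponential backoff with jitter, capped attempts, transient failures only), here is a small hedged sketch. `withRetry` and `isTransient` are illustrative names, and the transient-error classification is a placeholder; the retried task must itself stay idempotent for this to compose with the upsert-based exactly-once pattern mentioned above.

```ts
// Sketch: retry with exponential backoff and full jitter, capped at 5 attempts.
function isTransient(err: unknown): boolean {
  // Placeholder classification: network timeouts, rate limits, temporary unavailability.
  return err instanceof Error && /timeout|rate limit|unavailable/i.test(err.message);
}

async function withRetry<T>(task: () => Promise<T>, maxAttempts = 5, baseMs = 500): Promise<T> {
  for (let attempt = 1; ; attempt++) {
    try {
      return await task();
    } catch (err) {
      if (!isTransient(err) || attempt >= maxAttempts) throw err;
      // Full jitter: random delay in [0, baseMs * 2^attempt).
      const delayMs = Math.random() * baseMs * 2 ** attempt;
      // Structured retry event, as the logging guidance above asks for.
      console.log(JSON.stringify({ event: "retry", attempt, delay_ms: Math.round(delayMs) }));
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }
}
```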
package/templates/data-pipeline/mcp-servers.yaml
@@ -1,13 +1,13 @@
- tag: DATA-PIPELINE
- section: mcp-servers
- servers:
- - name: postgres
- description: "PostgreSQL database inspection, queries, and schema management"
- command: npx
- args: ["-y", "@modelcontextprotocol/server-postgres"]
- tags: [DATA-PIPELINE, API]
- category: database
- tier: recommended
- env:
- POSTGRES_CONNECTION_STRING: ""
- url: "https://github.com/modelcontextprotocol/servers/tree/main/src/postgres"
+ tag: DATA-PIPELINE
+ section: mcp-servers
+ servers:
+ - name: postgres
+ description: "PostgreSQL database inspection, queries, and schema management"
+ command: npx
+ args: ["-y", "@modelcontextprotocol/server-postgres"]
+ tags: [DATA-PIPELINE, API]
+ category: database
+ tier: recommended
+ env:
+ POSTGRES_CONNECTION_STRING: ""
+ url: "https://github.com/modelcontextprotocol/servers/tree/main/src/postgres"
package/templates/data-pipeline/nfr.yaml
@@ -1,39 +1,39 @@
- tag: DATA-PIPELINE
- section: nfr
- blocks:
- - id: pipeline-reliability
- tier: recommended
- title: "Pipeline Reliability"
- content: |
- ## NFR: Pipeline Reliability
-
- ### SLAs
- - Data freshness SLA: {{freshness_sla | default: 1 hour}} from source to destination.
- - Pipeline success rate: ≥ {{pipeline_success_rate | default: 99%}} per day.
- - Alert on SLA breach within 15 minutes.
-
- ### Idempotency
- - Every pipeline step is idempotent — safe to rerun without duplicating data.
- - Backfill supported: reprocess historical date ranges without manual intervention.
- - Partial failures resume from last checkpoint, not from scratch.
-
- ### Data Quality
- - Schema validation at ingestion. Reject or quarantine malformed records.
- - Row count, null rate, uniqueness, and distribution checks after each stage.
- - Data quality metrics tracked over time. Alert on anomalies.
-
- - id: pipeline-performance
- tier: recommended
- title: "Pipeline Performance"
- content: |
- ## NFR: Pipeline Performance
-
- ### Throughput
- - Target throughput: {{pipeline_throughput | default: 100K records/minute}}.
- - Batch vs. streaming decision documented with rationale.
- - Partitioning strategy aligned with query patterns.
-
- ### Resource Management
- - Compute resources auto-scale with data volume. No over-provisioning for peak.
- - Cost per GB processed tracked. Budget alerts at 80% threshold.
- - Temporary/staging data cleaned up automatically after pipeline completion.
+ tag: DATA-PIPELINE
+ section: nfr
+ blocks:
+ - id: pipeline-reliability
+ tier: recommended
+ title: "Pipeline Reliability"
+ content: |
+ ## NFR: Pipeline Reliability
+
+ ### SLAs
+ - Data freshness SLA: {{freshness_sla | default: 1 hour}} from source to destination.
+ - Pipeline success rate: ≥ {{pipeline_success_rate | default: 99%}} per day.
+ - Alert on SLA breach within 15 minutes.
+
+ ### Idempotency
+ - Every pipeline step is idempotent — safe to rerun without duplicating data.
+ - Backfill supported: reprocess historical date ranges without manual intervention.
+ - Partial failures resume from last checkpoint, not from scratch.
+
+ ### Data Quality
+ - Schema validation at ingestion. Reject or quarantine malformed records.
+ - Row count, null rate, uniqueness, and distribution checks after each stage.
+ - Data quality metrics tracked over time. Alert on anomalies.
+
+ - id: pipeline-performance
+ tier: recommended
+ title: "Pipeline Performance"
+ content: |
+ ## NFR: Pipeline Performance
+
+ ### Throughput
+ - Target throughput: {{pipeline_throughput | default: 100K records/minute}}.
+ - Batch vs. streaming decision documented with rationale.
+ - Partitioning strategy aligned with query patterns.
+
+ ### Resource Management
+ - Compute resources auto-scale with data volume. No over-provisioning for peak.
+ - Cost per GB processed tracked. Budget alerts at 80% threshold.
+ - Temporary/staging data cleaned up automatically after pipeline completion.
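A freshness SLA like the one in this NFR block is usually enforced by a small watermark check run on a short schedule, so a breach is noticed well within the 15-minute alert window. The sketch below assumes the template's defaults (1-hour freshness) and uses hypothetical names for the function and fields.

```ts
// Sketch: compare the destination's latest load watermark against the SLA.
const FRESHNESS_SLA_MS = 60 * 60 * 1000; // 1 hour, the template default

interface DatasetStatus {
  dataset: string;
  lastLoadedAt: Date; // latest successful load watermark at the destination
}

function checkFreshness(status: DatasetStatus, now = new Date()): { ok: boolean; lagMs: number } {
  const lagMs = now.getTime() - status.lastLoadedAt.getTime();
  return { ok: lagMs <= FRESHNESS_SLA_MS, lagMs };
}

// Run by a scheduler every few minutes; a breach emits a structured alert event.
const result = checkFreshness({
  dataset: "daily_sales",
  lastLoadedAt: new Date(Date.now() - 90 * 60 * 1000),
});
if (!result.ok) {
  console.error(JSON.stringify({ alert: "freshness_sla_breach", ...result }));
}
```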
package/templates/data-pipeline/structure.yaml
@@ -1,23 +1,23 @@
- tag: DATA-PIPELINE
- section: structure
- entries:
- - path: dags/
- description: "Pipeline orchestration definitions (Airflow DAGs, Prefect flows, Dagster jobs)"
- - path: src/extractors/
- description: "Data source connectors: API, database, file extractors"
- - path: src/transformers/
- description: "Data transformation logic: cleaning, enrichment, aggregation"
- - path: src/loaders/
- description: "Data sinks: warehouse loaders, file writers, API publishers"
- - path: src/validators/
- description: "Data quality checks: schema validation, business rules, anomaly detection"
- - path: src/config/
- description: "Pipeline configuration: sources, schedules, thresholds (YAML)"
- - path: tests/
- description: "Unit tests for transformers, validators, and business logic"
- - path: tests/fixtures/
- description: "Sample data files for deterministic testing"
- - path: sql/
- description: "SQL transformations and DDL for warehouse tables"
- - path: scripts/
- description: "Operational scripts: backfill, reprocess, data repair"
+ tag: DATA-PIPELINE
+ section: structure
+ entries:
+ - path: dags/
+ description: "Pipeline orchestration definitions (Airflow DAGs, Prefect flows, Dagster jobs)"
+ - path: src/extractors/
+ description: "Data source connectors: API, database, file extractors"
+ - path: src/transformers/
+ description: "Data transformation logic: cleaning, enrichment, aggregation"
+ - path: src/loaders/
+ description: "Data sinks: warehouse loaders, file writers, API publishers"
+ - path: src/validators/
+ description: "Data quality checks: schema validation, business rules, anomaly detection"
+ - path: src/config/
+ description: "Pipeline configuration: sources, schedules, thresholds (YAML)"
+ - path: tests/
+ description: "Unit tests for transformers, validators, and business logic"
+ - path: tests/fixtures/
+ description: "Sample data files for deterministic testing"
+ - path: sql/
+ description: "SQL transformations and DDL for warehouse tables"
+ - path: scripts/
+ description: "Operational scripts: backfill, reprocess, data repair"
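Tying the `src/validators/` entry back to the dead-letter requirements earlier in this diff, a validator in that directory would typically quarantine malformed records with their error and run ID rather than drop them. A minimal sketch, with all names hypothetical:

```ts
// Sketch: split a batch into valid records and dead-letter entries.
interface DeadLetterRecord {
  payload: unknown;       // original record, unmodified
  error: string;          // validation error message
  run_id: string;
  quarantined_at: string;
}

function validateAndSplit<T>(
  records: unknown[],
  isValid: (r: unknown) => r is T,
  describeError: (r: unknown) => string,
  run_id: string,
): { valid: T[]; deadLetter: DeadLetterRecord[] } {
  const valid: T[] = [];
  const deadLetter: DeadLetterRecord[] = [];
  for (const r of records) {
    if (isValid(r)) {
      valid.push(r);
    } else {
      deadLetter.push({
        payload: r,
        error: describeError(r),
        run_id,
        quarantined_at: new Date().toISOString(),
      });
    }
  }
  return { valid, deadLetter };
}
```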