@grainulation/silo 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
1
+ {
2
+ "name": "CI/CD and Deployment",
3
+ "description": "Pipeline design patterns, deployment strategies (blue-green, canary, rolling), feature flags, rollback procedures, and artifact management for continuous delivery.",
4
+ "version": "1.0.0",
5
+ "claims": [
6
+ {
7
+ "id": "cicd-001",
8
+ "type": "constraint",
9
+ "topic": "pipeline time budget",
10
+ "content": "CI pipelines should complete in under 10 minutes for the critical path (lint, test, build). Pipelines over 15 minutes measurably reduce PR throughput and encourage developers to batch changes, increasing merge conflict risk. Parallelize test suites and cache dependencies aggressively.",
11
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
12
+ "evidence": "production",
13
+ "status": "active",
14
+ "phase_added": "define",
15
+ "timestamp": "2025-01-01T00:00:00.000Z",
16
+ "conflicts_with": [],
17
+ "resolved_by": null,
18
+ "tags": ["ci-cd", "pipeline", "performance", "developer-experience"]
19
+ },
20
+ {
21
+ "id": "cicd-002",
22
+ "type": "factual",
23
+ "topic": "blue-green deployment mechanics",
24
+ "content": "Blue-green deployment maintains two identical production environments. Traffic routes to blue (current) while green receives the new version. After green passes health checks, traffic switches via load balancer. Rollback is instant (switch back to blue). Cost: 2x infrastructure during deployment window. Database migrations must be backwards-compatible since both versions share the database.",
25
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
26
+ "evidence": "documented",
27
+ "status": "active",
28
+ "phase_added": "define",
29
+ "timestamp": "2025-01-01T00:00:00.000Z",
30
+ "conflicts_with": [],
31
+ "resolved_by": null,
32
+ "tags": ["ci-cd", "deployment", "blue-green", "rollback"]
33
+ },
34
+ {
35
+ "id": "cicd-003",
36
+ "type": "recommendation",
37
+ "topic": "canary deployment percentage",
38
+ "content": "Canary deployments should start at 1-5% of traffic, bake for 10-15 minutes while monitoring error rates and latency, then expand to 25%, 50%, 100% with automated rollback if error rate exceeds 1% above baseline or p99 latency increases by more than 50%. Full rollout should take 30-60 minutes minimum.",
39
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
40
+ "evidence": "production",
41
+ "status": "active",
42
+ "phase_added": "define",
43
+ "timestamp": "2025-01-01T00:00:00.000Z",
44
+ "conflicts_with": [],
45
+ "resolved_by": null,
46
+ "tags": ["ci-cd", "deployment", "canary", "progressive-rollout"]
47
+ },
48
+ {
49
+ "id": "cicd-004",
50
+ "type": "risk",
51
+ "topic": "database migration coupling",
52
+ "content": "Deploying application code and database migrations simultaneously causes failures if the migration is slow or fails mid-deploy. Decouple migrations: deploy migration first (must be backwards-compatible with current code), verify, then deploy application code. Use expand-and-contract pattern for schema changes.",
53
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
54
+ "evidence": "production",
55
+ "status": "active",
56
+ "phase_added": "define",
57
+ "timestamp": "2025-01-01T00:00:00.000Z",
58
+ "conflicts_with": [],
59
+ "resolved_by": null,
60
+ "tags": ["ci-cd", "database", "migrations", "deployment"]
61
+ },
62
+ {
63
+ "id": "cicd-005",
64
+ "type": "recommendation",
65
+ "topic": "feature flag lifecycle",
66
+ "content": "Feature flags should have an owner, a creation date, and a planned removal date. Flags older than 90 days without removal should trigger a tech debt alert. Stale feature flags accumulate as dead code paths and make testing combinatorially complex. Use a flag management service (LaunchDarkly, Unleash, or custom) rather than config files.",
67
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
68
+ "evidence": "documented",
69
+ "status": "active",
70
+ "phase_added": "define",
71
+ "timestamp": "2025-01-01T00:00:00.000Z",
72
+ "conflicts_with": [],
73
+ "resolved_by": null,
74
+ "tags": ["ci-cd", "feature-flags", "tech-debt", "lifecycle"]
75
+ },
76
+ {
77
+ "id": "cicd-006",
78
+ "type": "constraint",
79
+ "topic": "immutable build artifacts",
80
+ "content": "Build artifacts must be immutable: the exact same artifact (Docker image, binary, package) that passes CI is the one deployed to staging and production. Never rebuild for different environments. Use environment variables or config injection for environment-specific settings. Tag artifacts with git SHA for traceability.",
81
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
82
+ "evidence": "documented",
83
+ "status": "active",
84
+ "phase_added": "define",
85
+ "timestamp": "2025-01-01T00:00:00.000Z",
86
+ "conflicts_with": [],
87
+ "resolved_by": null,
88
+ "tags": ["ci-cd", "artifacts", "immutability", "docker"]
89
+ },
90
+ {
91
+ "id": "cicd-007",
92
+ "type": "risk",
93
+ "topic": "rollback without data rollback",
94
+ "content": "Code rollbacks do not roll back data. If version N+1 wrote data in a new format, rolling back to version N may leave incompatible data in the database. Design writes to be forward-compatible: version N should tolerate data written by N+1. This constraint makes additive-only schema changes critical.",
95
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
96
+ "evidence": "production",
97
+ "status": "active",
98
+ "phase_added": "define",
99
+ "timestamp": "2025-01-01T00:00:00.000Z",
100
+ "conflicts_with": [],
101
+ "resolved_by": null,
102
+ "tags": ["ci-cd", "rollback", "database", "data-compatibility"]
103
+ },
104
+ {
105
+ "id": "cicd-008",
106
+ "type": "factual",
107
+ "topic": "DORA metrics benchmarks",
108
+ "content": "DORA elite team benchmarks (2023): deployment frequency (on-demand, multiple per day), lead time for changes (less than 1 hour), mean time to restore (less than 1 hour), change failure rate (0-15%). High performers deploy 973x more frequently than low performers. Measure these four metrics to assess CI/CD health.",
109
+ "source": { "origin": "industry", "artifact": null, "connector": null },
110
+ "evidence": "documented",
111
+ "status": "active",
112
+ "phase_added": "define",
113
+ "timestamp": "2025-01-01T00:00:00.000Z",
114
+ "conflicts_with": [],
115
+ "resolved_by": null,
116
+ "tags": ["ci-cd", "dora", "metrics", "engineering-effectiveness"]
117
+ },
118
+ {
119
+ "id": "cicd-009",
120
+ "type": "recommendation",
121
+ "topic": "trunk-based development",
122
+ "content": "Trunk-based development (short-lived branches merged to main within 1-2 days) correlates with higher DORA metrics than Gitflow. Long-lived feature branches (over 3 days) have exponentially higher merge conflict probability. Use feature flags instead of feature branches for incomplete work.",
123
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
124
+ "evidence": "documented",
125
+ "status": "active",
126
+ "phase_added": "define",
127
+ "timestamp": "2025-01-01T00:00:00.000Z",
128
+ "conflicts_with": [],
129
+ "resolved_by": null,
130
+ "tags": ["ci-cd", "branching", "trunk-based", "gitflow"]
131
+ },
132
+ {
133
+ "id": "cicd-010",
134
+ "type": "recommendation",
135
+ "topic": "test pyramid enforcement",
136
+ "content": "Enforce the test pyramid in CI: 70% unit tests (fast, isolated), 20% integration tests (database, APIs), 10% end-to-end tests (browser, full stack). Inverted pyramids (heavy E2E, few unit tests) result in slow, flaky pipelines. Quarantine flaky tests immediately rather than retrying them.",
137
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
138
+ "evidence": "documented",
139
+ "status": "active",
140
+ "phase_added": "define",
141
+ "timestamp": "2025-01-01T00:00:00.000Z",
142
+ "conflicts_with": [],
143
+ "resolved_by": null,
144
+ "tags": ["ci-cd", "testing", "test-pyramid", "flaky-tests"]
145
+ },
146
+ {
147
+ "id": "cicd-011",
148
+ "type": "risk",
149
+ "topic": "secret exposure in CI logs",
150
+ "content": "CI systems frequently leak secrets in build logs through debug output, error messages, or environment variable dumps. Use secret masking features in your CI platform. Never pass secrets as command-line arguments (visible in process listings). Audit CI logs quarterly for accidental secret exposure.",
151
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
152
+ "evidence": "production",
153
+ "status": "active",
154
+ "phase_added": "define",
155
+ "timestamp": "2025-01-01T00:00:00.000Z",
156
+ "conflicts_with": [],
157
+ "resolved_by": null,
158
+ "tags": ["ci-cd", "security", "secrets", "logs"]
159
+ },
160
+ {
161
+ "id": "cicd-012",
162
+ "type": "estimate",
163
+ "topic": "deploy frequency impact on MTTR",
164
+ "content": "Teams deploying daily have a mean time to restore (MTTR) of 1-4 hours. Teams deploying weekly average 1-3 days MTTR. Teams deploying monthly average 1-2 weeks MTTR. Smaller, more frequent deploys are easier to diagnose because the blast radius of each change is smaller.",
165
+ "source": { "origin": "industry", "artifact": null, "connector": null },
166
+ "evidence": "web",
167
+ "status": "active",
168
+ "phase_added": "define",
169
+ "timestamp": "2025-01-01T00:00:00.000Z",
170
+ "conflicts_with": [],
171
+ "resolved_by": null,
172
+ "tags": ["ci-cd", "deploy-frequency", "mttr", "reliability"]
173
+ }
174
+ ]
175
+ }
@@ -0,0 +1,203 @@
1
+ {
2
+ "name": "Compliance Constraints",
3
+ "description": "Regulatory constraint sets for HIPAA, SOC 2, and GDPR research sprints. Use as a starting point when evaluating compliance-sensitive architecture or vendor decisions.",
4
+ "version": "1.0.0",
5
+ "claims": [
6
+ {
7
+ "id": "hipaa-001",
8
+ "type": "constraint",
9
+ "topic": "HIPAA encryption at rest",
10
+ "content": "Protected Health Information (PHI) must be encrypted at rest using AES-256 or equivalent, per HIPAA Security Rule 45 CFR 164.312(a)(2)(iv).",
11
+ "source": { "origin": "regulation", "artifact": "45 CFR 164.312", "connector": null },
12
+ "evidence": "documented",
13
+ "status": "active",
14
+ "phase_added": "define",
15
+ "timestamp": "2025-01-01T00:00:00.000Z",
16
+ "conflicts_with": [],
17
+ "resolved_by": null,
18
+ "tags": ["hipaa", "encryption", "phi"]
19
+ },
20
+ {
21
+ "id": "hipaa-002",
22
+ "type": "constraint",
23
+ "topic": "HIPAA encryption in transit",
24
+ "content": "PHI in transit must be encrypted via TLS 1.2 or higher. Unencrypted transmission channels (HTTP, FTP) are prohibited for PHI.",
25
+ "source": { "origin": "regulation", "artifact": "45 CFR 164.312(e)(1)", "connector": null },
26
+ "evidence": "documented",
27
+ "status": "active",
28
+ "phase_added": "define",
29
+ "timestamp": "2025-01-01T00:00:00.000Z",
30
+ "conflicts_with": [],
31
+ "resolved_by": null,
32
+ "tags": ["hipaa", "encryption", "transit"]
33
+ },
34
+ {
35
+ "id": "hipaa-003",
36
+ "type": "constraint",
37
+ "topic": "HIPAA business associate agreements",
38
+ "content": "A Business Associate Agreement (BAA) must be signed with every third-party vendor that processes, stores, or transmits PHI, including cloud providers.",
39
+ "source": { "origin": "regulation", "artifact": "45 CFR 164.502(e)", "connector": null },
40
+ "evidence": "documented",
41
+ "status": "active",
42
+ "phase_added": "define",
43
+ "timestamp": "2025-01-01T00:00:00.000Z",
44
+ "conflicts_with": [],
45
+ "resolved_by": null,
46
+ "tags": ["hipaa", "baa", "vendor"]
47
+ },
48
+ {
49
+ "id": "hipaa-004",
50
+ "type": "constraint",
51
+ "topic": "HIPAA minimum necessary access",
52
+ "content": "Access to PHI must follow minimum necessary standard — users see only the data required for their role. Role-based access control (RBAC) is the typical implementation.",
53
+ "source": { "origin": "regulation", "artifact": "45 CFR 164.502(b)", "connector": null },
54
+ "evidence": "documented",
55
+ "status": "active",
56
+ "phase_added": "define",
57
+ "timestamp": "2025-01-01T00:00:00.000Z",
58
+ "conflicts_with": [],
59
+ "resolved_by": null,
60
+ "tags": ["hipaa", "access-control", "rbac"]
61
+ },
62
+ {
63
+ "id": "hipaa-005",
64
+ "type": "constraint",
65
+ "topic": "HIPAA audit log retention",
66
+ "content": "Audit logs of all PHI access events must be retained for a minimum of 6 years. Logs must include user identity, timestamp, and action performed.",
67
+ "source": { "origin": "regulation", "artifact": "45 CFR 164.312(b); retention per 45 CFR 164.316(b)(2)", "connector": null },
68
+ "evidence": "documented",
69
+ "status": "active",
70
+ "phase_added": "define",
71
+ "timestamp": "2025-01-01T00:00:00.000Z",
72
+ "conflicts_with": [],
73
+ "resolved_by": null,
74
+ "tags": ["hipaa", "audit", "logging"]
75
+ },
76
+ {
77
+ "id": "soc2-001",
78
+ "type": "constraint",
79
+ "topic": "SOC 2 Type II observation period",
80
+ "content": "SOC 2 Type II requires demonstrating that security controls operated effectively over an observation period; the AICPA sets no formal minimum, but 3 months is the commonly accepted practical floor and 6-12 months is typical.",
81
+ "source": { "origin": "standard", "artifact": "AICPA Trust Services Criteria", "connector": null },
82
+ "evidence": "documented",
83
+ "status": "active",
84
+ "phase_added": "define",
85
+ "timestamp": "2025-01-01T00:00:00.000Z",
86
+ "conflicts_with": [],
87
+ "resolved_by": null,
88
+ "tags": ["soc2", "audit", "controls"]
89
+ },
90
+ {
91
+ "id": "soc2-002",
92
+ "type": "constraint",
93
+ "topic": "SOC 2 change management",
94
+ "content": "Change management controls must track all production changes with approval records, rollback plans, and post-deployment verification.",
95
+ "source": { "origin": "standard", "artifact": "AICPA CC8.1", "connector": null },
96
+ "evidence": "documented",
97
+ "status": "active",
98
+ "phase_added": "define",
99
+ "timestamp": "2025-01-01T00:00:00.000Z",
100
+ "conflicts_with": [],
101
+ "resolved_by": null,
102
+ "tags": ["soc2", "change-management", "controls"]
103
+ },
104
+ {
105
+ "id": "soc2-003",
106
+ "type": "constraint",
107
+ "topic": "SOC 2 incident response",
108
+ "content": "Incident response plan must define severity levels, escalation paths, communication templates, and post-incident review process. Annual tabletop exercises required.",
109
+ "source": { "origin": "standard", "artifact": "AICPA CC7.3-CC7.5", "connector": null },
110
+ "evidence": "documented",
111
+ "status": "active",
112
+ "phase_added": "define",
113
+ "timestamp": "2025-01-01T00:00:00.000Z",
114
+ "conflicts_with": [],
115
+ "resolved_by": null,
116
+ "tags": ["soc2", "incident-response", "controls"]
117
+ },
118
+ {
119
+ "id": "soc2-004",
120
+ "type": "constraint",
121
+ "topic": "SOC 2 access reviews",
122
+ "content": "Logical access reviews must be performed quarterly. Terminated employee access must be revoked within 24 hours of separation.",
123
+ "source": { "origin": "standard", "artifact": "AICPA CC6.1-CC6.3", "connector": null },
124
+ "evidence": "documented",
125
+ "status": "active",
126
+ "phase_added": "define",
127
+ "timestamp": "2025-01-01T00:00:00.000Z",
128
+ "conflicts_with": [],
129
+ "resolved_by": null,
130
+ "tags": ["soc2", "access-control", "offboarding"]
131
+ },
132
+ {
133
+ "id": "gdpr-001",
134
+ "type": "constraint",
135
+ "topic": "GDPR cross-border data transfer",
136
+ "content": "Personal data of EU residents cannot be transferred outside the EEA without an adequacy decision, Standard Contractual Clauses (SCCs), or Binding Corporate Rules.",
137
+ "source": { "origin": "regulation", "artifact": "GDPR Articles 44-49", "connector": null },
138
+ "evidence": "documented",
139
+ "status": "active",
140
+ "phase_added": "define",
141
+ "timestamp": "2025-01-01T00:00:00.000Z",
142
+ "conflicts_with": [],
143
+ "resolved_by": null,
144
+ "tags": ["gdpr", "data-transfer", "international"]
145
+ },
146
+ {
147
+ "id": "gdpr-002",
148
+ "type": "constraint",
149
+ "topic": "GDPR right to erasure",
150
+ "content": "Data subjects have the right to erasure ('right to be forgotten'). Systems must support complete deletion of a user's personal data within one month of receipt of the request (per Article 12(3); extendable by two further months for complex or numerous requests).",
151
+ "source": { "origin": "regulation", "artifact": "GDPR Article 17", "connector": null },
152
+ "evidence": "documented",
153
+ "status": "active",
154
+ "phase_added": "define",
155
+ "timestamp": "2025-01-01T00:00:00.000Z",
156
+ "conflicts_with": [],
157
+ "resolved_by": null,
158
+ "tags": ["gdpr", "erasure", "data-subject-rights"]
159
+ },
160
+ {
161
+ "id": "gdpr-003",
162
+ "type": "constraint",
163
+ "topic": "GDPR breach notification",
164
+ "content": "Data breaches affecting personal data must be reported to the supervisory authority within 72 hours of discovery. Affected individuals must be notified without undue delay if risk is high.",
165
+ "source": { "origin": "regulation", "artifact": "GDPR Articles 33-34", "connector": null },
166
+ "evidence": "documented",
167
+ "status": "active",
168
+ "phase_added": "define",
169
+ "timestamp": "2025-01-01T00:00:00.000Z",
170
+ "conflicts_with": [],
171
+ "resolved_by": null,
172
+ "tags": ["gdpr", "breach-notification", "incident-response"]
173
+ },
174
+ {
175
+ "id": "gdpr-004",
176
+ "type": "constraint",
177
+ "topic": "GDPR lawful basis for processing",
178
+ "content": "Processing personal data requires a lawful basis (consent, contract, legal obligation, vital interest, public task, or legitimate interest). Consent must be freely given, specific, and withdrawable.",
179
+ "source": { "origin": "regulation", "artifact": "GDPR Article 6", "connector": null },
180
+ "evidence": "documented",
181
+ "status": "active",
182
+ "phase_added": "define",
183
+ "timestamp": "2025-01-01T00:00:00.000Z",
184
+ "conflicts_with": [],
185
+ "resolved_by": null,
186
+ "tags": ["gdpr", "lawful-basis", "consent"]
187
+ },
188
+ {
189
+ "id": "gdpr-005",
190
+ "type": "constraint",
191
+ "topic": "GDPR data protection impact assessment",
192
+ "content": "A Data Protection Impact Assessment (DPIA) is mandatory before processing that is likely to result in high risk to individuals, including large-scale profiling or systematic monitoring.",
193
+ "source": { "origin": "regulation", "artifact": "GDPR Article 35", "connector": null },
194
+ "evidence": "documented",
195
+ "status": "active",
196
+ "phase_added": "define",
197
+ "timestamp": "2025-01-01T00:00:00.000Z",
198
+ "conflicts_with": [],
199
+ "resolved_by": null,
200
+ "tags": ["gdpr", "dpia", "risk-assessment"]
201
+ }
202
+ ]
203
+ }
@@ -0,0 +1,175 @@
1
+ {
2
+ "name": "Data Engineering",
3
+ "description": "ETL vs ELT patterns, data quality enforcement, schema evolution strategies, batch vs streaming tradeoffs, data lake anti-patterns, and change data capture.",
4
+ "version": "1.0.0",
5
+ "claims": [
6
+ {
7
+ "id": "data-001",
8
+ "type": "factual",
9
+ "topic": "ETL vs ELT tradeoffs",
10
+ "content": "ETL (Extract-Transform-Load) transforms data before loading into the target, reducing storage costs but making the pipeline brittle to schema changes. ELT (Extract-Load-Transform) loads raw data first and transforms in the warehouse, leveraging cheap columnar storage and enabling reprocessing without re-extraction. ELT is the dominant pattern for cloud data warehouses (Snowflake, BigQuery, Redshift).",
11
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
12
+ "evidence": "documented",
13
+ "status": "active",
14
+ "phase_added": "define",
15
+ "timestamp": "2025-01-01T00:00:00.000Z",
16
+ "conflicts_with": [],
17
+ "resolved_by": null,
18
+ "tags": ["data-engineering", "etl", "elt", "warehouse"]
19
+ },
20
+ {
21
+ "id": "data-002",
22
+ "type": "constraint",
23
+ "topic": "schema evolution compatibility",
24
+ "content": "Schema changes must be backwards-compatible for consumers: adding optional fields (safe), removing fields (breaking), renaming fields (breaking), changing types (breaking). Use schema registries (Confluent, AWS Glue) to enforce compatibility rules. Avro supports full forward/backward/full compatibility modes. Breaking changes require a new topic or table version.",
25
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
26
+ "evidence": "documented",
27
+ "status": "active",
28
+ "phase_added": "define",
29
+ "timestamp": "2025-01-01T00:00:00.000Z",
30
+ "conflicts_with": [],
31
+ "resolved_by": null,
32
+ "tags": ["data-engineering", "schema", "evolution", "avro"]
33
+ },
34
+ {
35
+ "id": "data-003",
36
+ "type": "recommendation",
37
+ "topic": "data quality checks in pipelines",
38
+ "content": "Every data pipeline should include automated quality checks: row count expectations (within 10% of previous run), null rate thresholds per column, uniqueness constraints on key columns, freshness checks (data arrived within expected window), and referential integrity across tables. Use tools like Great Expectations, dbt tests, or Soda to define checks as code.",
39
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
40
+ "evidence": "production",
41
+ "status": "active",
42
+ "phase_added": "define",
43
+ "timestamp": "2025-01-01T00:00:00.000Z",
44
+ "conflicts_with": [],
45
+ "resolved_by": null,
46
+ "tags": ["data-engineering", "data-quality", "testing", "dbt"]
47
+ },
48
+ {
49
+ "id": "data-004",
50
+ "type": "risk",
51
+ "topic": "data lake swamp anti-pattern",
52
+ "content": "Data lakes degenerate into data swamps when: (a) data is dumped without schema documentation, (b) there is no catalog or discovery mechanism, (c) no access controls or data classification, (d) no retention policies, (e) no data quality monitoring. Within 12-18 months, undisciplined data lakes become untrusted and abandoned. Apply medallion architecture (bronze/silver/gold) to impose structure.",
53
+ "source": { "origin": "industry", "artifact": null, "connector": null },
54
+ "evidence": "web",
55
+ "status": "active",
56
+ "phase_added": "define",
57
+ "timestamp": "2025-01-01T00:00:00.000Z",
58
+ "conflicts_with": [],
59
+ "resolved_by": null,
60
+ "tags": ["data-engineering", "data-lake", "anti-pattern", "governance"]
61
+ },
62
+ {
63
+ "id": "data-005",
64
+ "type": "factual",
65
+ "topic": "batch vs streaming latency tradeoffs",
66
+ "content": "Batch processing (hourly/daily) is simpler, cheaper, and sufficient when business requirements tolerate T+1 or T+hour latency. Streaming (Kafka, Kinesis, Flink) delivers sub-second latency but costs 3-10x more in infrastructure and 2-3x more in engineering complexity (exactly-once semantics, out-of-order events, state management). Default to batch unless latency requirements demand streaming.",
67
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
68
+ "evidence": "production",
69
+ "status": "active",
70
+ "phase_added": "define",
71
+ "timestamp": "2025-01-01T00:00:00.000Z",
72
+ "conflicts_with": [],
73
+ "resolved_by": null,
74
+ "tags": ["data-engineering", "batch", "streaming", "architecture"]
75
+ },
76
+ {
77
+ "id": "data-006",
78
+ "type": "recommendation",
79
+ "topic": "change data capture pattern",
80
+ "content": "Change Data Capture (CDC) reads database transaction logs (binlog, WAL) to stream row-level changes to downstream systems. Use Debezium for open-source CDC from PostgreSQL, MySQL, MongoDB, and SQL Server. CDC avoids polling overhead, captures deletes (which polling misses), and preserves event ordering. Initial snapshot + streaming log is the standard bootstrap pattern.",
81
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
82
+ "evidence": "documented",
83
+ "status": "active",
84
+ "phase_added": "define",
85
+ "timestamp": "2025-01-01T00:00:00.000Z",
86
+ "conflicts_with": [],
87
+ "resolved_by": null,
88
+ "tags": ["data-engineering", "cdc", "debezium", "replication"]
89
+ },
90
+ {
91
+ "id": "data-007",
92
+ "type": "risk",
93
+ "topic": "late-arriving data in event streams",
94
+ "content": "Event streams contain late-arriving data due to network delays, mobile offline sync, or batch uploads. Windowed aggregations must handle late data with watermarks (maximum allowed lateness). Flink default watermark is 0 (no late data tolerance). Set watermarks based on observed p99 lateness in your data. Late events beyond the watermark are either dropped or routed to a dead-letter topic for manual reconciliation.",
95
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
96
+ "evidence": "production",
97
+ "status": "active",
98
+ "phase_added": "define",
99
+ "timestamp": "2025-01-01T00:00:00.000Z",
100
+ "conflicts_with": [],
101
+ "resolved_by": null,
102
+ "tags": ["data-engineering", "streaming", "late-data", "watermarks"]
103
+ },
104
+ {
105
+ "id": "data-008",
106
+ "type": "recommendation",
107
+ "topic": "idempotent pipeline design",
108
+ "content": "Data pipelines must be idempotent: running the same pipeline twice with the same input produces the same output without duplicates. Implement with: write to a staging table, then MERGE/upsert to the target (not INSERT). Use partition overwrite for append-only tables. Idempotency enables safe retries after partial failures, which occur in approximately 5-10% of batch runs.",
109
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
110
+ "evidence": "production",
111
+ "status": "active",
112
+ "phase_added": "define",
113
+ "timestamp": "2025-01-01T00:00:00.000Z",
114
+ "conflicts_with": [],
115
+ "resolved_by": null,
116
+ "tags": ["data-engineering", "idempotency", "pipeline", "reliability"]
117
+ },
118
+ {
119
+ "id": "data-009",
120
+ "type": "estimate",
121
+ "topic": "Parquet vs CSV storage savings",
122
+ "content": "Columnar formats (Parquet, ORC) reduce storage by 75-90% compared to CSV/JSON for analytical workloads, and query performance improves 10-100x due to column pruning and predicate pushdown. A 100 GB CSV dataset typically compresses to 5-15 GB in Parquet with Snappy compression. Always use Parquet or ORC for data lake storage, never raw CSV.",
123
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
124
+ "evidence": "tested",
125
+ "status": "active",
126
+ "phase_added": "define",
127
+ "timestamp": "2025-01-01T00:00:00.000Z",
128
+ "conflicts_with": [],
129
+ "resolved_by": null,
130
+ "tags": ["data-engineering", "parquet", "storage", "performance"]
131
+ },
132
+ {
133
+ "id": "data-010",
134
+ "type": "constraint",
135
+ "topic": "PII handling in data pipelines",
136
+ "content": "PII must be classified, tagged, and handled according to data governance policy at ingestion time, not after the fact. Apply column-level encryption or tokenization for sensitive fields (SSN, email, phone). Implement row-level access controls in the warehouse. Maintain a data catalog that tracks PII lineage from source to all downstream tables. GDPR right-to-erasure requires the ability to delete a user across all derived datasets.",
137
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
138
+ "evidence": "documented",
139
+ "status": "active",
140
+ "phase_added": "define",
141
+ "timestamp": "2025-01-01T00:00:00.000Z",
142
+ "conflicts_with": [],
143
+ "resolved_by": null,
144
+ "tags": ["data-engineering", "pii", "gdpr", "governance"]
145
+ },
146
+ {
147
+ "id": "data-011",
148
+ "type": "factual",
149
+ "topic": "exactly-once semantics cost",
150
+ "content": "Exactly-once processing in streaming systems (Kafka transactions, Flink checkpointing) adds 10-30% throughput overhead compared to at-least-once. At-least-once with idempotent consumers (using unique event IDs and upsert writes) achieves the same end result with lower complexity. True exactly-once is only required when side effects cannot be made idempotent (sending emails, charging payments).",
151
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
152
+ "evidence": "tested",
153
+ "status": "active",
154
+ "phase_added": "define",
155
+ "timestamp": "2025-01-01T00:00:00.000Z",
156
+ "conflicts_with": [],
157
+ "resolved_by": null,
158
+ "tags": ["data-engineering", "streaming", "exactly-once", "kafka"]
159
+ },
160
+ {
161
+ "id": "data-012",
162
+ "type": "recommendation",
163
+ "topic": "dbt for transformation layer",
164
+ "content": "Use dbt (data build tool) for the transformation layer in ELT pipelines. dbt provides: version-controlled SQL transformations, automatic DAG resolution, built-in testing (not_null, unique, accepted_values, relationships), documentation generation, and incremental materialization for large tables. It has become the industry standard for analytics engineering with adoption at over 30,000 companies.",
165
+ "source": { "origin": "industry", "artifact": null, "connector": null },
166
+ "evidence": "web",
167
+ "status": "active",
168
+ "phase_added": "define",
169
+ "timestamp": "2025-01-01T00:00:00.000Z",
170
+ "conflicts_with": [],
171
+ "resolved_by": null,
172
+ "tags": ["data-engineering", "dbt", "transformation", "analytics-engineering"]
173
+ }
174
+ ]
175
+ }