@booklib/skills 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +105 -0
  3. package/animation-at-work/SKILL.md +246 -0
  4. package/animation-at-work/assets/example_asset.txt +1 -0
  5. package/animation-at-work/references/api_reference.md +369 -0
  6. package/animation-at-work/references/review-checklist.md +79 -0
  7. package/animation-at-work/scripts/example.py +1 -0
  8. package/bin/skills.js +85 -0
  9. package/clean-code-reviewer/SKILL.md +292 -0
  10. package/clean-code-reviewer/evals/evals.json +67 -0
  11. package/data-intensive-patterns/SKILL.md +204 -0
  12. package/data-intensive-patterns/assets/example_asset.txt +1 -0
  13. package/data-intensive-patterns/references/api_reference.md +34 -0
  14. package/data-intensive-patterns/references/patterns-catalog.md +551 -0
  15. package/data-intensive-patterns/references/review-checklist.md +193 -0
  16. package/data-intensive-patterns/scripts/example.py +1 -0
  17. package/data-pipelines/SKILL.md +252 -0
  18. package/data-pipelines/assets/example_asset.txt +1 -0
  19. package/data-pipelines/references/api_reference.md +301 -0
  20. package/data-pipelines/references/review-checklist.md +181 -0
  21. package/data-pipelines/scripts/example.py +1 -0
  22. package/design-patterns/SKILL.md +245 -0
  23. package/design-patterns/assets/example_asset.txt +1 -0
  24. package/design-patterns/references/api_reference.md +1 -0
  25. package/design-patterns/references/patterns-catalog.md +726 -0
  26. package/design-patterns/references/review-checklist.md +173 -0
  27. package/design-patterns/scripts/example.py +1 -0
  28. package/domain-driven-design/SKILL.md +221 -0
  29. package/domain-driven-design/assets/example_asset.txt +1 -0
  30. package/domain-driven-design/references/api_reference.md +1 -0
  31. package/domain-driven-design/references/patterns-catalog.md +545 -0
  32. package/domain-driven-design/references/review-checklist.md +158 -0
  33. package/domain-driven-design/scripts/example.py +1 -0
  34. package/effective-java/SKILL.md +195 -0
  35. package/effective-java/assets/example_asset.txt +1 -0
  36. package/effective-java/references/api_reference.md +1 -0
  37. package/effective-java/references/items-catalog.md +955 -0
  38. package/effective-java/references/review-checklist.md +216 -0
  39. package/effective-java/scripts/example.py +1 -0
  40. package/effective-kotlin/SKILL.md +225 -0
  41. package/effective-kotlin/assets/example_asset.txt +1 -0
  42. package/effective-kotlin/references/api_reference.md +1 -0
  43. package/effective-kotlin/references/practices-catalog.md +1228 -0
  44. package/effective-kotlin/references/review-checklist.md +126 -0
  45. package/effective-kotlin/scripts/example.py +1 -0
  46. package/kotlin-in-action/SKILL.md +251 -0
  47. package/kotlin-in-action/assets/example_asset.txt +1 -0
  48. package/kotlin-in-action/references/api_reference.md +1 -0
  49. package/kotlin-in-action/references/practices-catalog.md +436 -0
  50. package/kotlin-in-action/references/review-checklist.md +204 -0
  51. package/kotlin-in-action/scripts/example.py +1 -0
  52. package/lean-startup/SKILL.md +250 -0
  53. package/lean-startup/assets/example_asset.txt +1 -0
  54. package/lean-startup/references/api_reference.md +319 -0
  55. package/lean-startup/references/review-checklist.md +137 -0
  56. package/lean-startup/scripts/example.py +1 -0
  57. package/microservices-patterns/SKILL.md +179 -0
  58. package/microservices-patterns/references/patterns-catalog.md +391 -0
  59. package/microservices-patterns/references/review-checklist.md +169 -0
  60. package/package.json +17 -0
  61. package/refactoring-ui/SKILL.md +236 -0
  62. package/refactoring-ui/assets/example_asset.txt +1 -0
  63. package/refactoring-ui/references/api_reference.md +355 -0
  64. package/refactoring-ui/references/review-checklist.md +114 -0
  65. package/refactoring-ui/scripts/example.py +1 -0
  66. package/storytelling-with-data/SKILL.md +238 -0
  67. package/storytelling-with-data/assets/example_asset.txt +1 -0
  68. package/storytelling-with-data/references/api_reference.md +379 -0
  69. package/storytelling-with-data/references/review-checklist.md +111 -0
  70. package/storytelling-with-data/scripts/example.py +1 -0
  71. package/system-design-interview/SKILL.md +213 -0
  72. package/system-design-interview/assets/example_asset.txt +1 -0
  73. package/system-design-interview/references/api_reference.md +582 -0
  74. package/system-design-interview/references/review-checklist.md +201 -0
  75. package/system-design-interview/scripts/example.py +1 -0
  76. package/using-asyncio-python/SKILL.md +242 -0
  77. package/using-asyncio-python/assets/example_asset.txt +1 -0
  78. package/using-asyncio-python/references/api_reference.md +267 -0
  79. package/using-asyncio-python/references/review-checklist.md +149 -0
  80. package/using-asyncio-python/scripts/example.py +1 -0
  81. package/web-scraping-python/SKILL.md +259 -0
  82. package/web-scraping-python/assets/example_asset.txt +1 -0
  83. package/web-scraping-python/references/api_reference.md +393 -0
  84. package/web-scraping-python/references/review-checklist.md +163 -0
  85. package/web-scraping-python/scripts/example.py +1 -0
@@ -0,0 +1,193 @@
1
+ # Data-Intensive Applications Code Review Checklist
2
+
3
+ Use this checklist when reviewing data-intensive application code. Work through each section
4
+ and flag any violations. Not every section applies to every review — skip sections
5
+ that aren't relevant to the code under review.
6
+
7
+ ---
8
+
9
+ ## 1. Data Modeling
10
+
11
+ - [ ] Data model fits the application's access patterns (relational, document, graph, event log)
12
+ - [ ] Relationships are modeled appropriately (joins vs. embedding vs. references)
13
+ - [ ] Schema is explicit or schema-on-read strategy is intentional and documented
14
+ - [ ] No impedance mismatch — application objects map cleanly to storage model
15
+ - [ ] Normalization level is appropriate (not over-normalized for a document store, not under-normalized for relational)
16
+
17
+ **Red flags**: Forcing graph-like traversals through a relational model with recursive joins.
18
+ Storing deeply nested JSON in a relational column then parsing it in application code.
19
+ Document model with many-to-many relationships handled by manual application-side joins.
20
+
21
+ ---
22
+
23
+ ## 2. Storage Engine and Indexing
24
+
25
+ - [ ] Storage engine matches workload characteristics (write-heavy → LSM; read-heavy → B-tree)
26
+ - [ ] Indexes exist for common query patterns
27
+ - [ ] No unnecessary indexes (each index slows down writes)
28
+ - [ ] Column-oriented storage used for analytical/OLAP workloads
29
+ - [ ] Materialized views or data cubes used where pre-aggregation helps
30
+ - [ ] Compaction strategy is configured appropriately for LSM-based stores
31
+
32
+ **Red flags**: Full table scans on large tables due to missing indexes. Using a row-oriented
33
+ store for analytical queries scanning millions of rows. Write-heavy workload on a database
34
+ optimized for reads without considering LSM alternatives.
35
+
36
+ ---
37
+
38
+ ## 3. Encoding and Schema Evolution
39
+
40
+ - [ ] Serialization format supports forward and backward compatibility
41
+ - [ ] Schema registry is in place for Avro/Protobuf-encoded messages
42
+ - [ ] Field tags (Protobuf) or schema resolution (Avro) used for evolution
43
+ - [ ] Old and new code can run simultaneously during rolling deployments
44
+ - [ ] No required fields added in a non-backward-compatible way
45
+ - [ ] Deleted field tags/names are never reused
46
+
47
+ **Red flags**: Using plain JSON for inter-service communication without versioning.
48
+ Adding required fields to Protobuf definitions in production. Encoding changes that break
49
+ consumers during rolling deployments. No schema registry for Kafka topics.
50
+
51
+ ---
52
+
53
+ ## 4. Replication
54
+
55
+ - [ ] Replication topology matches consistency and availability requirements
56
+ - [ ] Failover procedure is tested and documented
57
+ - [ ] Replication lag is monitored and handled in application code
58
+ - [ ] Read-after-write consistency is provided where needed (e.g., read from leader after write)
59
+ - [ ] Split-brain protection exists (fencing tokens, epoch numbers)
60
+ - [ ] For multi-leader: conflict resolution strategy is defined and tested
61
+ - [ ] For leaderless: quorum parameters (w, r, n) are tuned for the workload
62
+
63
+ **Red flags**: Async replication with no monitoring of replication lag. No split-brain protection
64
+ during leader failover. Using LWW for conflict resolution in multi-leader setup where data loss
65
+ is unacceptable. Quorum reads not configured (r + w ≤ n) giving inconsistent reads.
66
+
67
+ ---
68
+
69
+ ## 5. Partitioning
70
+
71
+ - [ ] Partition key distributes load evenly (no hot partitions)
72
+ - [ ] Partition strategy matches access patterns (key-range for scans, hash for uniform)
73
+ - [ ] Cross-partition queries are minimized or explicitly handled
74
+ - [ ] Secondary index strategy is chosen (local vs global) with trade-offs understood
75
+ - [ ] Rebalancing approach is defined (fixed partitions, dynamic split, proportional to nodes)
76
+ - [ ] Request routing is in place (client-side, routing tier, or coordinator)
77
+
78
+ **Red flags**: Monotonically increasing keys (timestamps, auto-increment) used as hash partition
79
+ key — all writes go to one partition. Range queries across hash-partitioned data. No plan
80
+ for rebalancing when adding nodes. Scatter-gather queries hitting all partitions for every read.
81
+
82
+ ---
83
+
84
+ ## 6. Transactions and Concurrency
85
+
86
+ - [ ] Isolation level is appropriate for the consistency requirements
87
+ - [ ] Write skew scenarios are identified and mitigated
88
+ - [ ] Phantom reads are prevented where needed (predicate/index-range locks or SSI)
89
+ - [ ] Long-running transactions are avoided (hold locks briefly)
90
+ - [ ] Deadlock detection or timeout is configured
91
+ - [ ] Optimistic concurrency (CAS, version numbers) used where appropriate
92
+
93
+ **Red flags**: Using READ COMMITTED where transactions read-then-write based on stale data
94
+ (write skew). SERIALIZABLE isolation everywhere regardless of need (performance waste).
95
+ Missing `SELECT ... FOR UPDATE` where concurrent updates can violate business rules.
96
+ No retry logic for serialization failures under SSI.
97
+
98
+ ---
99
+
100
+ ## 7. Distributed Systems Resilience
101
+
102
+ - [ ] All remote calls have timeouts configured
103
+ - [ ] Retries use exponential backoff with jitter
104
+ - [ ] Retry operations are idempotent (idempotency keys present)
105
+ - [ ] Circuit breakers protect against cascading failures
106
+ - [ ] Fencing tokens used for distributed locks/leases
107
+ - [ ] No reliance on wall-clock timestamps for ordering across nodes
108
+ - [ ] Network partitions are handled gracefully (not ignored)
109
+ - [ ] Process pauses (GC, etc.) are accounted for in lease/lock design
110
+
111
+ **Red flags**: HTTP calls without timeouts. Immediate retries without backoff (thundering herd).
112
+ Using System.currentTimeMillis() for conflict resolution across nodes. Distributed locks
113
+ without fencing tokens. Assuming clocks are synchronized across nodes.
114
+
115
+ ---
116
+
117
+ ## 8. Consensus and Coordination
118
+
119
+ - [ ] Leader election uses a proper consensus protocol (not ad-hoc)
120
+ - [ ] Coordination services (ZooKeeper/etcd) used for leader election and configuration
121
+ - [ ] No hand-rolled consensus or distributed locking
122
+ - [ ] 2PC is avoided for cross-service transactions (use sagas instead)
123
+ - [ ] Uniqueness constraints across partitions use linearizable operations
124
+
125
+ **Red flags**: Home-grown leader election using database timestamps. Two-phase commit across
126
+ heterogeneous systems. Distributed lock implemented with Redis SET NX without fencing tokens
127
+ or proper expiration handling. Assumption that ZooKeeper watches are instantaneous.
128
+
129
+ ---
130
+
131
+ ## 9. Batch and Stream Processing
132
+
133
+ - [ ] Batch jobs are idempotent (safe to re-run)
134
+ - [ ] Stream consumers are idempotent (safe to replay)
135
+ - [ ] Exactly-once semantics achieved via idempotency, not by assumption
136
+ - [ ] Processing output goes to a well-defined sink (not side effects scattered in operators)
137
+ - [ ] Backpressure mechanism exists (consumers can signal producers to slow down)
138
+ - [ ] Checkpointing or microbatching configured for stream fault tolerance
139
+ - [ ] Late events / out-of-order events are handled (watermarks, allowed lateness)
140
+ - [ ] Window semantics match business requirements (tumbling, hopping, sliding, session)
141
+
142
+ **Red flags**: Stream consumer that crashes and loses all progress (no checkpointing).
143
+ Batch job that partially writes output on failure (not atomic). Producer overwhelming consumer
144
+ with no flow control. Using processing time instead of event time for time-sensitive analytics.
145
+ No dead letter queue for malformed messages.
146
+
147
+ ---
148
+
149
+ ## 10. Derived Data and Integration
150
+
151
+ - [ ] Derived data (caches, indexes, views) is maintained via events or CDC — not dual writes
152
+ - [ ] Transactional outbox pattern used for reliable event publishing
153
+ - [ ] Change Data Capture configured for keeping systems in sync
154
+ - [ ] Event schema versioning strategy exists
155
+ - [ ] Event consumers can bootstrap from scratch (initial snapshot + streaming)
156
+ - [ ] Eventual consistency is acceptable and communicated to users appropriately
157
+
158
+ **Red flags**: Application code that updates both the primary database and Elasticsearch in
159
+ separate calls (dual write — can diverge on failure). No outbox pattern — events published after
160
+ transaction commit (can be lost on crash). CDC consumer with no mechanism for initial snapshot.
161
+ Derived views that can never be rebuilt from the event log.
162
+
163
+ ---
164
+
165
+ ## 11. Operational Readiness
166
+
167
+ - [ ] Health check endpoints exist
168
+ - [ ] Key metrics exposed: request rate, latency percentiles (p50, p95, p99), error rate
169
+ - [ ] Distributed tracing instrumented (OpenTelemetry or equivalent)
170
+ - [ ] Structured logging with correlation IDs
171
+ - [ ] Alerts configured for critical failure conditions
172
+ - [ ] Capacity planning considers tail latency (p99, not just averages)
173
+ - [ ] Backpressure and graceful degradation strategies in place
174
+ - [ ] Runbooks exist for common failure scenarios
175
+
176
+ **Red flags**: Only monitoring averages (hides tail latency issues). No distributed tracing
177
+ across service boundaries. Using `console.log` as the only observability mechanism. No runbook for leader failover
178
+ or partition rebalancing. No capacity planning for data growth.
179
+
180
+ ---
181
+
182
+ ## Severity Classification
183
+
184
+ When reporting issues, classify them:
185
+
186
+ - **Critical**: Data loss risk, correctness issue, or security vulnerability
187
+ (e.g., dual writes without outbox, missing fencing tokens, no transaction isolation for invariants)
188
+ - **Major**: Reliability or scalability debt that will cause problems at scale
189
+ (e.g., hot partitions, 2PC across services, no idempotency on retries, wrong storage engine)
190
+ - **Minor**: Best practice deviation with limited immediate impact
191
+ (e.g., missing health check, no schema registry, suboptimal compaction settings)
192
+ - **Suggestion**: Improvement that would be nice but isn't urgent
193
+ (e.g., consider CQRS for complex queries, evaluate column store for analytics workload)
@@ -0,0 +1,252 @@
1
+ ---
2
+ name: data-pipelines
3
+ description: >
4
+ Apply Data Pipelines Pocket Reference practices (James Densmore). Covers
5
+ Infrastructure (Ch 1-2: warehouses, lakes, cloud), Patterns (Ch 3: ETL, ELT,
6
+ CDC), DB Ingestion (Ch 4: MySQL, PostgreSQL, MongoDB, full/incremental),
7
+ File Ingestion (Ch 5: CSV, JSON, cloud storage), API Ingestion (Ch 6: REST,
8
+ pagination, rate limiting), Streaming (Ch 7: Kafka, Kinesis, event-driven),
9
+ Storage (Ch 8: Redshift, BigQuery, Snowflake), Transforms (Ch 9: SQL, Python,
10
+ dbt), Validation (Ch 10: Great Expectations, schema checks), Orchestration
11
+ (Ch 11: Airflow, DAGs, scheduling), Monitoring (Ch 12: SLAs, alerting),
12
+ Best Practices (Ch 13: idempotency, backfilling, error handling). Trigger on
13
+ "data pipeline", "ETL", "ELT", "data ingestion", "Airflow", "dbt",
14
+ "data warehouse", "Kafka streaming", "CDC", "data orchestration".
15
+ ---
16
+
17
+ # Data Pipelines Pocket Reference Skill
18
+
19
+ You are an expert data engineer grounded in the 13 chapters from
20
+ *Data Pipelines Pocket Reference* (Moving and Processing Data for Analytics)
21
+ by James Densmore. You help developers and data engineers in two modes:
22
+
23
+ 1. **Pipeline Building** — Design and implement data pipelines with idiomatic, production-ready patterns
24
+ 2. **Pipeline Review** — Analyze existing pipelines against the book's practices and recommend improvements
25
+
26
+ ## How to Decide Which Mode
27
+
28
+ - If the user asks you to *build*, *create*, *design*, *implement*, *write*, or *set up* a pipeline → **Pipeline Building**
29
+ - If the user asks you to *review*, *audit*, *improve*, *troubleshoot*, *optimize*, or *analyze* a pipeline → **Pipeline Review**
30
+ - If ambiguous, ask briefly which mode they'd prefer
31
+
32
+ ---
33
+
34
+ ## Mode 1: Pipeline Building
35
+
36
+ When designing or building data pipelines, follow this decision flow:
37
+
38
+ ### Step 1 — Understand the Requirements
39
+
40
+ Ask (or infer from context):
41
+
42
+ - **What data source?** — Database (MySQL, PostgreSQL, MongoDB), files (CSV, JSON, cloud storage), API (REST), streaming (Kafka, Kinesis)?
43
+ - **What destination?** — Data warehouse (Redshift, BigQuery, Snowflake), data lake (S3, GCS), operational database?
44
+ - **What pattern?** — ETL, ELT, CDC, streaming, batch?
45
+ - **What scale?** — Volume, velocity, variety of data? SLA requirements?
46
+
47
+ ### Step 2 — Apply the Right Practices
48
+
49
+ Read `references/practices-catalog.md` for the full chapter-by-chapter catalog. Quick decision guide by concern:
50
+
51
+ | Concern | Chapters to Apply |
52
+ |---------|-------------------|
53
+ | Infrastructure and architecture | Ch 1-2: Pipeline types, data warehouses vs data lakes, cloud storage (S3, GCS, Azure Blob), choosing infrastructure |
54
+ | Pipeline patterns and design | Ch 3: ETL vs ELT, change data capture (CDC), full vs incremental extraction, append vs upsert loading |
55
+ | Database ingestion | Ch 4: MySQL/PostgreSQL/MongoDB extraction, full and incremental loads, connection pooling, binary log replication |
56
+ | File-based ingestion | Ch 5: CSV/JSON/flat file parsing, cloud storage integration, file naming conventions, schema detection |
57
+ | API ingestion | Ch 6: REST API extraction, pagination handling, rate limiting, authentication, retry logic, webhook ingestion |
58
+ | Streaming data | Ch 7: Kafka producers/consumers, Kinesis streams, event-driven pipelines, exactly-once semantics, stream processing |
59
+ | Data storage and loading | Ch 8: Warehouse loading patterns (Redshift COPY, BigQuery load, Snowflake stages), partitioning, clustering |
60
+ | Transformations | Ch 9: SQL-based transforms, Python transforms, dbt models, staging/intermediate/mart layers, incremental models |
61
+ | Data validation and testing | Ch 10: Schema validation, data quality checks, Great Expectations, row counts, null checks, referential integrity |
62
+ | Orchestration | Ch 11: Apache Airflow, DAG design, task dependencies, scheduling, sensors, XComs, idempotent tasks |
63
+ | Monitoring and alerting | Ch 12: Pipeline health metrics, SLA tracking, data freshness, logging, alerting strategies, anomaly detection |
64
+ | Best practices | Ch 13: Idempotency, backfilling, error handling, retry strategies, data lineage, documentation |
65
+
66
+ ### Step 3 — Follow Data Pipeline Principles
67
+
68
+ Every pipeline implementation should honor these principles:
69
+
70
+ 1. **Idempotency always** — Running a pipeline multiple times with the same input produces the same result; use DELETE+INSERT or MERGE patterns
71
+ 2. **Incremental over full** — Prefer incremental extraction using timestamps or CDC over full table scans when data volume grows
72
+ 3. **ELT over ETL for analytics** — Load raw data into the warehouse first, transform with SQL/dbt; leverage warehouse compute power
73
+ 4. **Schema evolution readiness** — Design pipelines to handle schema changes gracefully; use schema detection and validation
74
+ 5. **Atomicity in loading** — Use staging tables, transactions, and atomic swaps; never leave destinations in partial states
75
+ 6. **Orchestration for dependencies** — Use DAGs (Airflow) to manage task ordering, retries, and failure handling; avoid time-based chaining
76
+ 7. **Validate early and often** — Check data quality at ingestion, after transformation, and before serving; use automated assertion frameworks
77
+ 8. **Monitor everything** — Track row counts, data freshness, pipeline duration, error rates; alert on SLA breaches
78
+ 9. **Design for backfilling** — Parameterize pipelines by date range; make it easy to reprocess historical data
79
+ 10. **Document data lineage** — Track where data comes from, how it's transformed, and where it goes; maintain a data catalog
80
+
81
+ ### Step 4 — Build the Pipeline
82
+
83
+ Follow these guidelines:
84
+
85
+ - **Production-ready** — Include error handling, retries, logging, monitoring from the start
86
+ - **Configurable** — Externalize connection strings, credentials, date ranges, batch sizes; use environment variables or config files
87
+ - **Testable** — Write unit tests for transformations, integration tests for end-to-end flows
88
+ - **Observable** — Include logging at each stage, metrics collection, alerting hooks
89
+ - **Documented** — README, data dictionary, DAG documentation, runbook for common failures
90
+
91
+ When building pipelines, produce:
92
+
93
+ 1. **Pattern identification** — Which chapters/concepts apply and why
94
+ 2. **Architecture diagram** — Source → Ingestion → Storage → Transform → Serve flow
95
+ 3. **Implementation** — Production-ready code with error handling
96
+ 4. **Configuration** — Connection configs, scheduling, environment setup
97
+ 5. **Monitoring setup** — What to track and alert on
98
+
99
+ ### Pipeline Building Examples
100
+
101
+ **Example 1 — Database to Warehouse ETL:**
102
+ ```
103
+ User: "Create a pipeline to sync MySQL orders to BigQuery"
104
+
105
+ Apply: Ch 3 (incremental extraction), Ch 4 (MySQL ingestion), Ch 8 (BigQuery loading),
106
+ Ch 11 (Airflow orchestration), Ch 13 (idempotency)
107
+
108
+ Generate:
109
+ - Incremental extraction using updated_at timestamp
110
+ - Staging table load with BigQuery load jobs
111
+ - MERGE/upsert into final table for idempotency
112
+ - Airflow DAG with proper scheduling and error handling
113
+ - Row count validation between source and destination
114
+ ```
115
+
116
+ **Example 2 — REST API Ingestion Pipeline:**
117
+ ```
118
+ User: "Build a pipeline to ingest data from a paginated REST API"
119
+
120
+ Apply: Ch 6 (API ingestion, pagination, rate limiting), Ch 5 (JSON handling),
121
+ Ch 8 (warehouse loading), Ch 10 (validation)
122
+
123
+ Generate:
124
+ - Paginated API client with retry logic and rate limiting
125
+ - JSON response parsing and flattening
126
+ - Incremental loading with cursor-based pagination
127
+ - Schema validation on ingested records
128
+ - Error handling for API failures and timeouts
129
+ ```
130
+
131
+ **Example 3 — Streaming Pipeline:**
132
+ ```
133
+ User: "Set up a Kafka-based streaming pipeline for event data"
134
+
135
+ Apply: Ch 7 (Kafka, event-driven, exactly-once semantics), Ch 8 (warehouse loading),
136
+ Ch 12 (monitoring), Ch 13 (idempotency, error handling)
137
+
138
+ Generate:
139
+ - Kafka consumer group configuration
140
+ - Event deserialization and validation
141
+ - Micro-batch or streaming sink to warehouse
142
+ - Dead letter queue for failed events
143
+ - Consumer lag monitoring and alerting
144
+ ```
145
+
146
+ **Example 4 — dbt Transformation Layer:**
147
+ ```
148
+ User: "Create a dbt project for transforming raw e-commerce data"
149
+
150
+ Apply: Ch 9 (dbt, SQL transforms, staging/mart layers, incremental models),
150
+ Ch 10 (data testing), Ch 13 (idempotency)
152
+
153
+ Generate:
154
+ - Staging models (1:1 with source, renamed/typed)
155
+ - Intermediate models (business logic joins)
156
+ - Mart models (final analytics tables)
157
+ - dbt tests (not_null, unique, relationships, custom)
158
+ - Incremental model configuration with merge strategy
159
+ ```
160
+
161
+ ---
162
+
163
+ ## Mode 2: Pipeline Review
164
+
165
+ When reviewing data pipelines, read `references/review-checklist.md` for the full checklist.
166
+
167
+ ### Review Process
168
+
169
+ 1. **Architecture scan** — Check Ch 1-3: pipeline pattern choice (ETL/ELT/CDC), infrastructure fit, data flow design
170
+ 2. **Ingestion scan** — Check Ch 4-7: extraction method, incremental vs full, error handling, source-specific best practices
171
+ 3. **Storage scan** — Check Ch 8: loading patterns, partitioning, clustering, staging table usage, atomic loads
172
+ 4. **Transform scan** — Check Ch 9: SQL vs Python choice, dbt patterns, layer structure, incremental models
173
+ 5. **Quality scan** — Check Ch 10: validation coverage, schema checks, data quality assertions, testing
174
+ 6. **Orchestration scan** — Check Ch 11: DAG design, task granularity, dependency management, idempotency
175
+ 7. **Operations scan** — Check Ch 12-13: monitoring, alerting, backfill capability, error handling, documentation
176
+
177
+ ### Review Output Format
178
+
179
+ Structure your review as:
180
+
181
+ ```
182
+ ## Summary
183
+ One paragraph: overall pipeline quality, pattern adherence, main concerns.
184
+
185
+ ## Architecture Issues
186
+ For each issue found (Ch 1-3):
187
+ - **Topic**: chapter and concept
188
+ - **Location**: where in the pipeline
189
+ - **Problem**: what's wrong
190
+ - **Fix**: recommended change with code/config snippet
191
+
192
+ ## Ingestion Issues
193
+ For each issue found (Ch 4-7):
194
+ - Same structure as above
195
+
196
+ ## Storage & Loading Issues
197
+ For each issue found (Ch 8):
198
+ - Same structure as above
199
+
200
+ ## Transformation Issues
201
+ For each issue found (Ch 9):
202
+ - Same structure as above
203
+
204
+ ## Data Quality Issues
205
+ For each issue found (Ch 10):
206
+ - Same structure as above
207
+
208
+ ## Orchestration Issues
209
+ For each issue found (Ch 11):
210
+ - Same structure as above
211
+
212
+ ## Operations & Monitoring Issues
213
+ For each issue found (Ch 12-13):
214
+ - Same structure as above
215
+
216
+ ## Recommendations
217
+ Priority-ordered list from most critical to nice-to-have.
218
+ Each recommendation references the specific chapter/concept.
219
+ ```
220
+
221
+ ### Common Data Pipeline Anti-Patterns to Flag
222
+
223
+ - **Full extraction when incremental suffices** → Ch 3-4: Use timestamp/CDC-based incremental extraction for growing tables
224
+ - **No idempotency** → Ch 13: Pipelines should produce same results when re-run; use DELETE+INSERT or MERGE
225
+ - **Transforming before loading (unnecessary ETL)** → Ch 3: Use ELT pattern; load raw data first, transform in warehouse
226
+ - **No staging tables** → Ch 8: Always load to staging first, validate, then swap/merge to production
227
+ - **Hardcoded credentials** → Ch 13: Use environment variables, secrets managers, or config files
228
+ - **No error handling or retries** → Ch 6, 13: Implement retry logic with exponential backoff for transient failures
229
+ - **Time-based dependencies** → Ch 11: Use DAG-based orchestration (Airflow) instead of cron with time buffers
230
+ - **Missing data validation** → Ch 10: Add row count checks, null checks, schema validation, freshness checks
231
+ - **No monitoring or alerting** → Ch 12: Track pipeline duration, row counts, error rates; alert on SLA breaches
232
+ - **Monolithic pipelines** → Ch 11: Break into small, reusable, testable tasks in a DAG
233
+ - **No backfill support** → Ch 13: Parameterize pipelines by date range; make historical reprocessing easy
234
+ - **Ignoring schema evolution** → Ch 5, 10: Handle new columns, type changes, missing fields gracefully
235
+ - **Unpartitioned warehouse tables** → Ch 8: Partition by date/key for query performance and cost
236
+ - **No data lineage** → Ch 13: Document source-to-destination mappings and transformation logic
237
+ - **Blocking on API rate limits** → Ch 6: Implement rate limit awareness with backoff and queuing
238
+ - **Missing dead letter queues** → Ch 7: Capture failed events/records for inspection and reprocessing
239
+ - **Over-orchestrating** → Ch 11: Not every script needs Airflow; match orchestration complexity to pipeline needs
240
+
241
+ ---
242
+
243
+ ## General Guidelines
244
+
245
+ - **ELT for analytics, ETL for operational** — Use warehouse compute for analytics transforms; use ETL only when destination can't transform
246
+ - **Incremental by default** — Start with incremental extraction; fall back to full only when necessary
247
+ - **Idempotency is non-negotiable** — Every pipeline must be safely re-runnable without data duplication or corruption
248
+ - **Validate at boundaries** — Check data quality at ingestion, after transformation, and before serving
249
+ - **Orchestrate with DAGs** — Use Airflow or similar tools for dependency management, retries, and scheduling
250
+ - **Monitor proactively** — Don't wait for users to report stale data; alert on freshness, completeness, and accuracy
251
+ - For deeper practice details, read `references/practices-catalog.md` before building pipelines.
252
+ - For review checklists, read `references/review-checklist.md` before reviewing pipelines.