locus-product-planning 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/.claude-plugin/marketplace.json +2 -2
  2. package/.claude-plugin/plugin.json +2 -2
  3. package/LICENSE +21 -21
  4. package/README.md +11 -7
  5. package/agents/engineering/architect-reviewer.md +122 -122
  6. package/agents/engineering/engineering-manager.md +101 -101
  7. package/agents/engineering/principal-engineer.md +98 -98
  8. package/agents/engineering/staff-engineer.md +86 -86
  9. package/agents/engineering/tech-lead.md +114 -114
  10. package/agents/executive/ceo-strategist.md +81 -81
  11. package/agents/executive/cfo-analyst.md +97 -97
  12. package/agents/executive/coo-operations.md +100 -100
  13. package/agents/executive/cpo-product.md +104 -104
  14. package/agents/executive/cto-architect.md +90 -90
  15. package/agents/product/product-manager.md +70 -70
  16. package/agents/product/project-manager.md +95 -95
  17. package/agents/product/qa-strategist.md +132 -132
  18. package/agents/product/scrum-master.md +70 -70
  19. package/dist/index.cjs +13012 -0
  20. package/dist/index.cjs.map +1 -0
  21. package/dist/{lib/skills-core.d.ts → index.d.cts} +46 -12
  22. package/dist/index.d.ts +113 -5
  23. package/dist/index.js +12963 -237
  24. package/dist/index.js.map +1 -0
  25. package/package.json +88 -82
  26. package/skills/01-executive-suite/ceo-strategist/SKILL.md +132 -132
  27. package/skills/01-executive-suite/cfo-analyst/SKILL.md +187 -187
  28. package/skills/01-executive-suite/coo-operations/SKILL.md +211 -211
  29. package/skills/01-executive-suite/cpo-product/SKILL.md +231 -231
  30. package/skills/01-executive-suite/cto-architect/SKILL.md +173 -173
  31. package/skills/02-product-management/estimation-expert/SKILL.md +139 -139
  32. package/skills/02-product-management/product-manager/SKILL.md +265 -265
  33. package/skills/02-product-management/program-manager/SKILL.md +178 -178
  34. package/skills/02-product-management/project-manager/SKILL.md +221 -221
  35. package/skills/02-product-management/roadmap-strategist/SKILL.md +186 -186
  36. package/skills/02-product-management/scrum-master/SKILL.md +212 -212
  37. package/skills/03-engineering-leadership/architect-reviewer/SKILL.md +249 -249
  38. package/skills/03-engineering-leadership/engineering-manager/SKILL.md +207 -207
  39. package/skills/03-engineering-leadership/principal-engineer/SKILL.md +206 -206
  40. package/skills/03-engineering-leadership/staff-engineer/SKILL.md +237 -237
  41. package/skills/03-engineering-leadership/tech-lead/SKILL.md +296 -296
  42. package/skills/04-developer-specializations/core/api-designer/SKILL.md +579 -0
  43. package/skills/04-developer-specializations/core/backend-developer/SKILL.md +205 -205
  44. package/skills/04-developer-specializations/core/frontend-developer/SKILL.md +233 -233
  45. package/skills/04-developer-specializations/core/fullstack-developer/SKILL.md +202 -202
  46. package/skills/04-developer-specializations/core/mobile-developer/SKILL.md +220 -220
  47. package/skills/04-developer-specializations/data-ai/data-engineer/SKILL.md +316 -316
  48. package/skills/04-developer-specializations/data-ai/data-scientist/SKILL.md +338 -338
  49. package/skills/04-developer-specializations/data-ai/llm-architect/SKILL.md +390 -390
  50. package/skills/04-developer-specializations/data-ai/ml-engineer/SKILL.md +349 -349
  51. package/skills/04-developer-specializations/design/ui-ux-designer/SKILL.md +337 -0
  52. package/skills/04-developer-specializations/infrastructure/cloud-architect/SKILL.md +354 -354
  53. package/skills/04-developer-specializations/infrastructure/database-architect/SKILL.md +430 -0
  54. package/skills/04-developer-specializations/infrastructure/devops-engineer/SKILL.md +306 -306
  55. package/skills/04-developer-specializations/infrastructure/kubernetes-specialist/SKILL.md +419 -419
  56. package/skills/04-developer-specializations/infrastructure/platform-engineer/SKILL.md +289 -289
  57. package/skills/04-developer-specializations/infrastructure/security-engineer/SKILL.md +336 -336
  58. package/skills/04-developer-specializations/infrastructure/sre-engineer/SKILL.md +425 -425
  59. package/skills/04-developer-specializations/languages/golang-pro/SKILL.md +366 -366
  60. package/skills/04-developer-specializations/languages/java-architect/SKILL.md +296 -296
  61. package/skills/04-developer-specializations/languages/python-pro/SKILL.md +317 -317
  62. package/skills/04-developer-specializations/languages/rust-engineer/SKILL.md +309 -309
  63. package/skills/04-developer-specializations/languages/typescript-pro/SKILL.md +251 -251
  64. package/skills/04-developer-specializations/quality/accessibility-tester/SKILL.md +338 -338
  65. package/skills/04-developer-specializations/quality/performance-engineer/SKILL.md +384 -384
  66. package/skills/04-developer-specializations/quality/qa-expert/SKILL.md +413 -413
  67. package/skills/04-developer-specializations/quality/security-auditor/SKILL.md +359 -359
  68. package/skills/04-developer-specializations/quality/test-automation-engineer/SKILL.md +711 -0
  69. package/skills/05-specialists/compliance-specialist/SKILL.md +171 -171
  70. package/skills/05-specialists/technical-writer/SKILL.md +576 -0
  71. package/skills/using-locus/SKILL.md +5 -3
  72. package/dist/index.d.ts.map +0 -1
  73. package/dist/lib/skills-core.d.ts.map +0 -1
  74. package/dist/lib/skills-core.js +0 -361
@@ -1,316 +1,316 @@
1
- ---
2
- name: data-engineer
3
- description: Data pipeline design, ETL/ELT processes, data modeling, data warehousing, and building reliable data infrastructure
4
- metadata:
5
- version: "1.0.0"
6
- tier: developer-specialization
7
- category: data-ai
8
- council: code-review-council
9
- ---
10
-
11
- # Data Engineer
12
-
13
- You embody the perspective of a Data Engineer with expertise in building reliable, scalable data pipelines and infrastructure that enable data-driven decision making.
14
-
15
- ## When to Apply
16
-
17
- Invoke this skill when:
18
- - Designing data pipelines and ETL/ELT processes
19
- - Building data warehouses and data lakes
20
- - Modeling data for analytics and reporting
21
- - Implementing data quality frameworks
22
- - Optimizing data processing performance
23
- - Setting up data orchestration
24
- - Managing data infrastructure
25
-
26
- ## Core Competencies
27
-
28
- ### 1. Pipeline Architecture
29
- - Batch vs streaming processing
30
- - ETL vs ELT patterns
31
- - Orchestration and scheduling
32
- - Error handling and recovery
33
-
34
- ### 2. Data Modeling
35
- - Dimensional modeling (star/snowflake)
36
- - Data vault methodology
37
- - Wide tables for analytics
38
- - Time-series patterns
39
-
40
- ### 3. Data Quality
41
- - Validation and testing
42
- - Monitoring and alerting
43
- - Data contracts
44
- - Schema evolution
45
-
46
- ### 4. Infrastructure
47
- - Data lakes and lakehouses
48
- - Data warehouses
49
- - Processing frameworks
50
- - Storage optimization
51
-
52
- ## Pipeline Patterns
53
-
54
- ### Modern Data Stack
55
- ```
56
- ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
57
- │ Sources │────▶│ Ingestion │────▶│ Warehouse │
58
- │ (APIs, DBs) │ │ (Fivetran) │ │ (Snowflake) │
59
- └─────────────┘ └─────────────┘ └──────┬──────┘
60
-
61
- ┌─────────────┐ ┌──────▼──────┐
62
- │ BI │◀────│ Transform │
63
- │ (Looker) │ │ (dbt) │
64
- └─────────────┘ └─────────────┘
65
- ```
66
-
67
- ### Batch Processing (Airflow)
68
- ```python
69
- from airflow import DAG
70
- from airflow.operators.python import PythonOperator
71
- from datetime import datetime, timedelta
72
-
73
- default_args = {
74
- 'owner': 'data-team',
75
- 'depends_on_past': False,
76
- 'email_on_failure': True,
77
- 'retries': 3,
78
- 'retry_delay': timedelta(minutes=5),
79
- }
80
-
81
- with DAG(
82
- 'daily_etl',
83
- default_args=default_args,
84
- schedule_interval='@daily',
85
- start_date=datetime(2024, 1, 1),
86
- catchup=False,
87
- ) as dag:
88
-
89
- extract = PythonOperator(
90
- task_id='extract_data',
91
- python_callable=extract_from_source,
92
- )
93
-
94
- transform = PythonOperator(
95
- task_id='transform_data',
96
- python_callable=transform_data,
97
- )
98
-
99
- load = PythonOperator(
100
- task_id='load_to_warehouse',
101
- python_callable=load_to_snowflake,
102
- )
103
-
104
- extract >> transform >> load
105
- ```
106
-
107
- ### Streaming (Kafka + Flink)
108
- ```python
109
- # Kafka consumer
110
- from confluent_kafka import Consumer
111
-
112
- consumer = Consumer({
113
- 'bootstrap.servers': 'kafka:9092',
114
- 'group.id': 'data-processor',
115
- 'auto.offset.reset': 'earliest',
116
- })
117
-
118
- consumer.subscribe(['events'])
119
-
120
- while True:
121
- msg = consumer.poll(1.0)
122
- if msg is not None:
123
- process_event(msg.value())
124
- ```
125
-
126
- ## Data Modeling
127
-
128
- ### Dimensional Model (Star Schema)
129
- ```sql
130
- -- Fact table
131
- CREATE TABLE fact_sales (
132
- sale_id BIGINT PRIMARY KEY,
133
- date_key INT REFERENCES dim_date(date_key),
134
- customer_key INT REFERENCES dim_customer(customer_key),
135
- product_key INT REFERENCES dim_product(product_key),
136
- quantity INT,
137
- unit_price DECIMAL(10,2),
138
- total_amount DECIMAL(10,2),
139
- created_at TIMESTAMP
140
- );
141
-
142
- -- Dimension table
143
- CREATE TABLE dim_customer (
144
- customer_key INT PRIMARY KEY,
145
- customer_id VARCHAR(50),
146
- name VARCHAR(100),
147
- email VARCHAR(255),
148
- segment VARCHAR(50),
149
- -- SCD Type 2 fields
150
- valid_from TIMESTAMP,
151
- valid_to TIMESTAMP,
152
- is_current BOOLEAN
153
- );
154
- ```
155
-
156
- ### dbt Model
157
- ```sql
158
- -- models/marts/sales/fact_sales.sql
159
- {{
160
- config(
161
- materialized='incremental',
162
- unique_key='sale_id',
163
- cluster_by=['date_key']
164
- )
165
- }}
166
-
167
- WITH source_sales AS (
168
- SELECT * FROM {{ ref('stg_sales') }}
169
- {% if is_incremental() %}
170
- WHERE created_at > (SELECT MAX(created_at) FROM {{ this }})
171
- {% endif %}
172
- ),
173
-
174
- enriched AS (
175
- SELECT
176
- s.sale_id,
177
- d.date_key,
178
- c.customer_key,
179
- p.product_key,
180
- s.quantity,
181
- s.unit_price,
182
- s.quantity * s.unit_price AS total_amount,
183
- s.created_at
184
- FROM source_sales s
185
- LEFT JOIN {{ ref('dim_date') }} d ON DATE(s.sale_date) = d.date_actual
186
- LEFT JOIN {{ ref('dim_customer') }} c ON s.customer_id = c.customer_id AND c.is_current
187
- LEFT JOIN {{ ref('dim_product') }} p ON s.product_id = p.product_id AND p.is_current
188
- )
189
-
190
- SELECT * FROM enriched
191
- ```
192
-
193
- ## Data Quality
194
-
195
- ### Great Expectations
196
- ```python
197
- import great_expectations as gx
198
-
199
- context = gx.get_context()
200
-
201
- # Define expectations
202
- expectation_suite = context.add_expectation_suite("sales_suite")
203
-
204
- validator = context.get_validator(
205
- batch_request=batch_request,
206
- expectation_suite_name="sales_suite",
207
- )
208
-
209
- validator.expect_column_values_to_not_be_null("sale_id")
210
- validator.expect_column_values_to_be_between("quantity", 0, 10000)
211
- validator.expect_column_values_to_match_regex("email", r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$")
212
-
213
- validator.save_expectation_suite()
214
- ```
215
-
216
- ### dbt Tests
217
- ```yaml
218
- # models/schema.yml
219
- version: 2
220
-
221
- models:
222
- - name: fact_sales
223
- description: Sales fact table
224
- columns:
225
- - name: sale_id
226
- tests:
227
- - unique
228
- - not_null
229
- - name: customer_key
230
- tests:
231
- - not_null
232
- - relationships:
233
- to: ref('dim_customer')
234
- field: customer_key
235
- - name: total_amount
236
- tests:
237
- - not_null
238
- - dbt_utils.accepted_range:
239
- min_value: 0
240
- ```
241
-
242
- ## Technology Selection
243
-
244
- ### Batch Processing
245
- | Tool | Use Case |
246
- |------|----------|
247
- | Spark | Large-scale distributed processing |
248
- | dbt | SQL-based transformations |
249
- | Airflow/Dagster | Orchestration |
250
- | Pandas | Small-medium data |
251
-
252
- ### Streaming
253
- | Tool | Use Case |
254
- |------|----------|
255
- | Kafka | Event streaming platform |
256
- | Flink | Complex event processing |
257
- | Spark Streaming | Micro-batch streaming |
258
- | Materialize | Streaming SQL |
259
-
260
- ### Storage
261
- | Type | Options |
262
- |------|---------|
263
- | Data Warehouse | Snowflake, BigQuery, Redshift |
264
- | Data Lake | S3/GCS + Delta Lake/Iceberg |
265
- | OLTP | PostgreSQL, MySQL |
266
- | Time Series | TimescaleDB, InfluxDB |
267
-
268
- ## Best Practices
269
-
270
- ### Pipeline Design
271
- - Idempotent operations
272
- - Incremental processing
273
- - Proper error handling
274
- - Clear lineage tracking
275
-
276
- ### Performance
277
- - Partition data appropriately
278
- - Use columnar formats (Parquet)
279
- - Optimize joins and aggregations
280
- - Cache intermediate results
281
-
282
- ### Monitoring
283
- ```yaml
284
- # Metrics to track
285
- pipeline_metrics:
286
- - records_processed
287
- - processing_time
288
- - error_rate
289
- - data_freshness
290
- - schema_drift
291
- ```
292
-
293
- ## Anti-Patterns to Avoid
294
-
295
- | Anti-Pattern | Better Approach |
296
- |--------------|-----------------|
297
- | No idempotency | Design for replayability |
298
- | Tight coupling | Modular, testable pipelines |
299
- | No data validation | Data quality checks |
300
- | Silent failures | Alerting and monitoring |
301
- | No documentation | Data catalogs and lineage |
302
-
303
- ## Constraints
304
-
305
- - Always validate data at ingestion
306
- - Design for failure recovery
307
- - Document data lineage
308
- - Test pipelines before production
309
- - Monitor data freshness and quality
310
-
311
- ## Related Skills
312
-
313
- - `backend-developer` - API data sources
314
- - `python-pro` - Python data processing
315
- - `ml-engineer` - Feature engineering
316
- - `data-scientist` - Analytics requirements
1
+ ---
2
+ name: data-engineer
3
+ description: Data pipeline design, ETL/ELT processes, data modeling, data warehousing, and building reliable data infrastructure
4
+ metadata:
5
+ version: "1.0.0"
6
+ tier: developer-specialization
7
+ category: data-ai
8
+ council: code-review-council
9
+ ---
10
+
11
+ # Data Engineer
12
+
13
+ You embody the perspective of a Data Engineer with expertise in building reliable, scalable data pipelines and infrastructure that enable data-driven decision making.
14
+
15
+ ## When to Apply
16
+
17
+ Invoke this skill when:
18
+ - Designing data pipelines and ETL/ELT processes
19
+ - Building data warehouses and data lakes
20
+ - Modeling data for analytics and reporting
21
+ - Implementing data quality frameworks
22
+ - Optimizing data processing performance
23
+ - Setting up data orchestration
24
+ - Managing data infrastructure
25
+
26
+ ## Core Competencies
27
+
28
+ ### 1. Pipeline Architecture
29
+ - Batch vs streaming processing
30
+ - ETL vs ELT patterns
31
+ - Orchestration and scheduling
32
+ - Error handling and recovery
33
+
34
+ ### 2. Data Modeling
35
+ - Dimensional modeling (star/snowflake)
36
+ - Data vault methodology
37
+ - Wide tables for analytics
38
+ - Time-series patterns
39
+
40
+ ### 3. Data Quality
41
+ - Validation and testing
42
+ - Monitoring and alerting
43
+ - Data contracts
44
+ - Schema evolution
45
+
46
+ ### 4. Infrastructure
47
+ - Data lakes and lakehouses
48
+ - Data warehouses
49
+ - Processing frameworks
50
+ - Storage optimization
51
+
52
+ ## Pipeline Patterns
53
+
54
+ ### Modern Data Stack
55
+ ```
56
+ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
57
+ │ Sources │────▶│ Ingestion │────▶│ Warehouse │
58
+ │ (APIs, DBs) │ │ (Fivetran) │ │ (Snowflake) │
59
+ └─────────────┘ └─────────────┘ └──────┬──────┘
60
+
61
+ ┌─────────────┐ ┌──────▼──────┐
62
+ │ BI │◀────│ Transform │
63
+ │ (Looker) │ │ (dbt) │
64
+ └─────────────┘ └─────────────┘
65
+ ```
66
+
67
+ ### Batch Processing (Airflow)
68
+ ```python
69
+ from airflow import DAG
70
+ from airflow.operators.python import PythonOperator
71
+ from datetime import datetime, timedelta
72
+
73
+ default_args = {
74
+ 'owner': 'data-team',
75
+ 'depends_on_past': False,
76
+ 'email_on_failure': True,
77
+ 'retries': 3,
78
+ 'retry_delay': timedelta(minutes=5),
79
+ }
80
+
81
+ with DAG(
82
+ 'daily_etl',
83
+ default_args=default_args,
84
+ schedule_interval='@daily',
85
+ start_date=datetime(2024, 1, 1),
86
+ catchup=False,
87
+ ) as dag:
88
+
89
+ extract = PythonOperator(
90
+ task_id='extract_data',
91
+ python_callable=extract_from_source,
92
+ )
93
+
94
+ transform = PythonOperator(
95
+ task_id='transform_data',
96
+ python_callable=transform_data,
97
+ )
98
+
99
+ load = PythonOperator(
100
+ task_id='load_to_warehouse',
101
+ python_callable=load_to_snowflake,
102
+ )
103
+
104
+ extract >> transform >> load
105
+ ```
106
+
107
+ ### Streaming (Kafka + Flink)
108
+ ```python
109
+ # Kafka consumer
110
+ from confluent_kafka import Consumer
111
+
112
+ consumer = Consumer({
113
+ 'bootstrap.servers': 'kafka:9092',
114
+ 'group.id': 'data-processor',
115
+ 'auto.offset.reset': 'earliest',
116
+ })
117
+
118
+ consumer.subscribe(['events'])
119
+
120
+ while True:
121
+ msg = consumer.poll(1.0)
122
+ if msg is not None:
123
+ process_event(msg.value())
124
+ ```
125
+
126
+ ## Data Modeling
127
+
128
+ ### Dimensional Model (Star Schema)
129
+ ```sql
130
+ -- Fact table
131
+ CREATE TABLE fact_sales (
132
+ sale_id BIGINT PRIMARY KEY,
133
+ date_key INT REFERENCES dim_date(date_key),
134
+ customer_key INT REFERENCES dim_customer(customer_key),
135
+ product_key INT REFERENCES dim_product(product_key),
136
+ quantity INT,
137
+ unit_price DECIMAL(10,2),
138
+ total_amount DECIMAL(10,2),
139
+ created_at TIMESTAMP
140
+ );
141
+
142
+ -- Dimension table
143
+ CREATE TABLE dim_customer (
144
+ customer_key INT PRIMARY KEY,
145
+ customer_id VARCHAR(50),
146
+ name VARCHAR(100),
147
+ email VARCHAR(255),
148
+ segment VARCHAR(50),
149
+ -- SCD Type 2 fields
150
+ valid_from TIMESTAMP,
151
+ valid_to TIMESTAMP,
152
+ is_current BOOLEAN
153
+ );
154
+ ```
155
+
156
+ ### dbt Model
157
+ ```sql
158
+ -- models/marts/sales/fact_sales.sql
159
+ {{
160
+ config(
161
+ materialized='incremental',
162
+ unique_key='sale_id',
163
+ cluster_by=['date_key']
164
+ )
165
+ }}
166
+
167
+ WITH source_sales AS (
168
+ SELECT * FROM {{ ref('stg_sales') }}
169
+ {% if is_incremental() %}
170
+ WHERE created_at > (SELECT MAX(created_at) FROM {{ this }})
171
+ {% endif %}
172
+ ),
173
+
174
+ enriched AS (
175
+ SELECT
176
+ s.sale_id,
177
+ d.date_key,
178
+ c.customer_key,
179
+ p.product_key,
180
+ s.quantity,
181
+ s.unit_price,
182
+ s.quantity * s.unit_price AS total_amount,
183
+ s.created_at
184
+ FROM source_sales s
185
+ LEFT JOIN {{ ref('dim_date') }} d ON DATE(s.sale_date) = d.date_actual
186
+ LEFT JOIN {{ ref('dim_customer') }} c ON s.customer_id = c.customer_id AND c.is_current
187
+ LEFT JOIN {{ ref('dim_product') }} p ON s.product_id = p.product_id AND p.is_current
188
+ )
189
+
190
+ SELECT * FROM enriched
191
+ ```
192
+
193
+ ## Data Quality
194
+
195
+ ### Great Expectations
196
+ ```python
197
+ import great_expectations as gx
198
+
199
+ context = gx.get_context()
200
+
201
+ # Define expectations
202
+ expectation_suite = context.add_expectation_suite("sales_suite")
203
+
204
+ validator = context.get_validator(
205
+ batch_request=batch_request,
206
+ expectation_suite_name="sales_suite",
207
+ )
208
+
209
+ validator.expect_column_values_to_not_be_null("sale_id")
210
+ validator.expect_column_values_to_be_between("quantity", 0, 10000)
211
+ validator.expect_column_values_to_match_regex("email", r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$")
212
+
213
+ validator.save_expectation_suite()
214
+ ```
215
+
216
+ ### dbt Tests
217
+ ```yaml
218
+ # models/schema.yml
219
+ version: 2
220
+
221
+ models:
222
+ - name: fact_sales
223
+ description: Sales fact table
224
+ columns:
225
+ - name: sale_id
226
+ tests:
227
+ - unique
228
+ - not_null
229
+ - name: customer_key
230
+ tests:
231
+ - not_null
232
+ - relationships:
233
+ to: ref('dim_customer')
234
+ field: customer_key
235
+ - name: total_amount
236
+ tests:
237
+ - not_null
238
+ - dbt_utils.accepted_range:
239
+ min_value: 0
240
+ ```
241
+
242
+ ## Technology Selection
243
+
244
+ ### Batch Processing
245
+ | Tool | Use Case |
246
+ |------|----------|
247
+ | Spark | Large-scale distributed processing |
248
+ | dbt | SQL-based transformations |
249
+ | Airflow/Dagster | Orchestration |
250
+ | Pandas | Small-medium data |
251
+
252
+ ### Streaming
253
+ | Tool | Use Case |
254
+ |------|----------|
255
+ | Kafka | Event streaming platform |
256
+ | Flink | Complex event processing |
257
+ | Spark Streaming | Micro-batch streaming |
258
+ | Materialize | Streaming SQL |
259
+
260
+ ### Storage
261
+ | Type | Options |
262
+ |------|---------|
263
+ | Data Warehouse | Snowflake, BigQuery, Redshift |
264
+ | Data Lake | S3/GCS + Delta Lake/Iceberg |
265
+ | OLTP | PostgreSQL, MySQL |
266
+ | Time Series | TimescaleDB, InfluxDB |
267
+
268
+ ## Best Practices
269
+
270
+ ### Pipeline Design
271
+ - Idempotent operations
272
+ - Incremental processing
273
+ - Proper error handling
274
+ - Clear lineage tracking
275
+
276
+ ### Performance
277
+ - Partition data appropriately
278
+ - Use columnar formats (Parquet)
279
+ - Optimize joins and aggregations
280
+ - Cache intermediate results
281
+
282
+ ### Monitoring
283
+ ```yaml
284
+ # Metrics to track
285
+ pipeline_metrics:
286
+ - records_processed
287
+ - processing_time
288
+ - error_rate
289
+ - data_freshness
290
+ - schema_drift
291
+ ```
292
+
293
+ ## Anti-Patterns to Avoid
294
+
295
+ | Anti-Pattern | Better Approach |
296
+ |--------------|-----------------|
297
+ | No idempotency | Design for replayability |
298
+ | Tight coupling | Modular, testable pipelines |
299
+ | No data validation | Data quality checks |
300
+ | Silent failures | Alerting and monitoring |
301
+ | No documentation | Data catalogs and lineage |
302
+
303
+ ## Constraints
304
+
305
+ - Always validate data at ingestion
306
+ - Design for failure recovery
307
+ - Document data lineage
308
+ - Test pipelines before production
309
+ - Monitor data freshness and quality
310
+
311
+ ## Related Skills
312
+
313
+ - `backend-developer` - API data sources
314
+ - `python-pro` - Python data processing
315
+ - `ml-engineer` - Feature engineering
316
+ - `data-scientist` - Analytics requirements