container-superposition 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +206 -1
- package/dist/scripts/init.js +235 -179
- package/dist/scripts/init.js.map +1 -1
- package/dist/tool/commands/doctor.d.ts +15 -0
- package/dist/tool/commands/doctor.d.ts.map +1 -0
- package/dist/tool/commands/doctor.js +862 -0
- package/dist/tool/commands/doctor.js.map +1 -0
- package/dist/tool/commands/explain.d.ts +13 -0
- package/dist/tool/commands/explain.d.ts.map +1 -0
- package/dist/tool/commands/explain.js +211 -0
- package/dist/tool/commands/explain.js.map +1 -0
- package/dist/tool/commands/list.d.ts +16 -0
- package/dist/tool/commands/list.d.ts.map +1 -0
- package/dist/tool/commands/list.js +121 -0
- package/dist/tool/commands/list.js.map +1 -0
- package/dist/tool/commands/plan.d.ts +16 -0
- package/dist/tool/commands/plan.d.ts.map +1 -0
- package/dist/tool/commands/plan.js +329 -0
- package/dist/tool/commands/plan.js.map +1 -0
- package/dist/tool/questionnaire/composer.d.ts +6 -1
- package/dist/tool/questionnaire/composer.d.ts.map +1 -1
- package/dist/tool/questionnaire/composer.js +300 -202
- package/dist/tool/questionnaire/composer.js.map +1 -1
- package/dist/tool/readme/markdown-parser.d.ts.map +1 -1
- package/dist/tool/readme/markdown-parser.js.map +1 -1
- package/dist/tool/readme/readme-generator.d.ts.map +1 -1
- package/dist/tool/readme/readme-generator.js +11 -6
- package/dist/tool/readme/readme-generator.js.map +1 -1
- package/dist/tool/schema/deployment-targets.d.ts +77 -0
- package/dist/tool/schema/deployment-targets.d.ts.map +1 -0
- package/dist/tool/schema/deployment-targets.js +91 -0
- package/dist/tool/schema/deployment-targets.js.map +1 -0
- package/dist/tool/schema/manifest-migrations.d.ts +51 -0
- package/dist/tool/schema/manifest-migrations.d.ts.map +1 -0
- package/dist/tool/schema/manifest-migrations.js +159 -0
- package/dist/tool/schema/manifest-migrations.js.map +1 -0
- package/dist/tool/schema/overlay-loader.d.ts +1 -1
- package/dist/tool/schema/overlay-loader.d.ts.map +1 -1
- package/dist/tool/schema/overlay-loader.js +42 -14
- package/dist/tool/schema/overlay-loader.js.map +1 -1
- package/dist/tool/schema/types.d.ts +44 -2
- package/dist/tool/schema/types.d.ts.map +1 -1
- package/dist/tool/utils/merge.d.ts +134 -0
- package/dist/tool/utils/merge.d.ts.map +1 -0
- package/dist/tool/utils/merge.js +277 -0
- package/dist/tool/utils/merge.js.map +1 -0
- package/dist/tool/utils/port-utils.d.ts +29 -0
- package/dist/tool/utils/port-utils.d.ts.map +1 -0
- package/dist/tool/utils/port-utils.js +128 -0
- package/dist/tool/utils/port-utils.js.map +1 -0
- package/dist/tool/utils/version.d.ts +9 -0
- package/dist/tool/utils/version.d.ts.map +1 -0
- package/dist/tool/utils/version.js +32 -0
- package/dist/tool/utils/version.js.map +1 -0
- package/docs/architecture.md +25 -21
- package/docs/deployment-targets.md +150 -0
- package/docs/discovery-commands.md +442 -0
- package/docs/merge-strategy.md +700 -0
- package/docs/minimal-and-editor.md +265 -0
- package/docs/overlay-imports.md +209 -0
- package/docs/overlay-manifest-refactoring.md +2 -2
- package/docs/overlay-metadata-archive.md +1 -1
- package/docs/overlays.md +91 -23
- package/docs/presets-architecture.md +3 -3
- package/docs/presets.md +1 -1
- package/docs/publishing.md +36 -35
- package/docs/team-workflow.md +540 -0
- package/overlays/.presets/data-engineering.yml +392 -0
- package/overlays/.presets/event-sourced-service.yml +262 -0
- package/overlays/.presets/frontend.yml +287 -0
- package/overlays/.presets/k8s-operator-dev.yml +462 -0
- package/overlays/.registry/README.md +1 -1
- package/overlays/.registry/deployment-targets.yml +54 -0
- package/overlays/.shared/README.md +43 -0
- package/overlays/.shared/compose/common-healthchecks.yml +38 -0
- package/overlays/.shared/otel/instrumentation.env +20 -0
- package/overlays/.shared/otel/otel-base-config.yaml +30 -0
- package/overlays/.shared/vscode/recommended-extensions.json +14 -0
- package/overlays/README.md +1 -1
- package/overlays/codex/overlay.yml +1 -0
- package/overlays/duckdb/README.md +274 -0
- package/overlays/duckdb/devcontainer.patch.json +10 -0
- package/overlays/duckdb/overlay.yml +17 -0
- package/overlays/duckdb/setup.sh +45 -0
- package/overlays/duckdb/verify.sh +32 -0
- package/overlays/git-helpers/overlay.yml +1 -0
- package/overlays/grafana/README.md +5 -5
- package/overlays/grafana/dashboard-provider.yml +1 -1
- package/overlays/grafana/docker-compose.yml +2 -2
- package/overlays/grafana/overlay.yml +6 -1
- package/overlays/jaeger/overlay.yml +16 -3
- package/overlays/jupyter/.env.example +6 -0
- package/overlays/jupyter/README.md +210 -0
- package/overlays/jupyter/devcontainer.patch.json +14 -0
- package/overlays/jupyter/docker-compose.yml +23 -0
- package/overlays/jupyter/overlay.yml +18 -0
- package/overlays/jupyter/verify.sh +35 -0
- package/overlays/kind/README.md +221 -0
- package/overlays/kind/devcontainer.patch.json +10 -0
- package/overlays/kind/overlay.yml +18 -0
- package/overlays/kind/setup.sh +43 -0
- package/overlays/kind/verify.sh +40 -0
- package/overlays/localstack/.env.example +6 -0
- package/overlays/localstack/README.md +188 -0
- package/overlays/localstack/devcontainer.patch.json +21 -0
- package/overlays/localstack/docker-compose.yml +25 -0
- package/overlays/localstack/overlay.yml +18 -0
- package/overlays/localstack/verify.sh +47 -0
- package/overlays/loki/overlay.yml +6 -1
- package/overlays/modern-cli-tools/overlay.yml +1 -0
- package/overlays/mongodb/overlay.yml +12 -2
- package/overlays/mysql/overlay.yml +12 -2
- package/overlays/nats/overlay.yml +12 -2
- package/overlays/openapi-tools/README.md +243 -0
- package/overlays/openapi-tools/devcontainer.patch.json +10 -0
- package/overlays/openapi-tools/overlay.yml +16 -0
- package/overlays/openapi-tools/setup.sh +45 -0
- package/overlays/openapi-tools/verify.sh +51 -0
- package/overlays/otel-collector/overlay.yml.example +26 -0
- package/overlays/postgres/overlay.yml +6 -1
- package/overlays/prometheus/overlay.yml +6 -1
- package/overlays/rabbitmq/overlay.yml +12 -2
- package/overlays/redis/overlay.yml +6 -1
- package/overlays/tilt/README.md +259 -0
- package/overlays/tilt/devcontainer.patch.json +17 -0
- package/overlays/tilt/overlay.yml +19 -0
- package/overlays/tilt/setup.sh +25 -0
- package/overlays/tilt/verify.sh +24 -0
- package/package.json +8 -6
- package/tool/README.md +12 -16
- package/tool/schema/overlay-manifest.schema.json +64 -4
- package/tool/schema/superposition-manifest.schema.json +104 -0
- /package/overlays/{presets → .presets}/docs-site.yml +0 -0
- /package/overlays/{presets → .presets}/fullstack.yml +0 -0
- /package/overlays/{presets → .presets}/microservice.yml +0 -0
- /package/overlays/{presets → .presets}/web-api.yml +0 -0
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
# Data Engineering Preset
|
|
2
|
+
# Python-based data engineering environment
|
|
3
|
+
|
|
4
|
+
id: data-engineering
|
|
5
|
+
name: Data Engineering
|
|
6
|
+
description: Python-based data engineering with database, object storage, and analytics
|
|
7
|
+
type: meta
|
|
8
|
+
category: preset
|
|
9
|
+
supports: [compose] # Requires Docker Compose for services
|
|
10
|
+
tags: [preset, data-engineering, python, etl, analytics, data-science]
|
|
11
|
+
|
|
12
|
+
# Overlays to select
|
|
13
|
+
selects:
|
|
14
|
+
# Always included
|
|
15
|
+
required:
|
|
16
|
+
- python
|
|
17
|
+
- minio # Data lake storage
|
|
18
|
+
- modern-cli-tools
|
|
19
|
+
|
|
20
|
+
# User makes choices
|
|
21
|
+
userChoice:
|
|
22
|
+
database:
|
|
23
|
+
id: database
|
|
24
|
+
prompt: Select database for analytics
|
|
25
|
+
options: [postgres, mongodb]
|
|
26
|
+
defaultOption: postgres
|
|
27
|
+
|
|
28
|
+
# Glue configuration - integration helpers
|
|
29
|
+
glueConfig:
|
|
30
|
+
# Pre-configured environment variables
|
|
31
|
+
environment:
|
|
32
|
+
# Python environment
|
|
33
|
+
PYTHONUNBUFFERED: '1'
|
|
34
|
+
PYTHONDONTWRITEBYTECODE: '1'
|
|
35
|
+
|
|
36
|
+
# Database connections
|
|
37
|
+
# PostgreSQL
|
|
38
|
+
DATABASE_URL: 'postgresql://postgres:postgres@postgres:5432/datawarehouse'
|
|
39
|
+
POSTGRES_HOST: 'postgres'
|
|
40
|
+
POSTGRES_PORT: '5432'
|
|
41
|
+
POSTGRES_DB: 'datawarehouse'
|
|
42
|
+
|
|
43
|
+
# MongoDB (alternative)
|
|
44
|
+
MONGODB_URL: 'mongodb://mongodb:27017/datawarehouse'
|
|
45
|
+
MONGODB_HOST: 'mongodb'
|
|
46
|
+
MONGODB_PORT: '27017'
|
|
47
|
+
MONGODB_DB: 'datawarehouse'
|
|
48
|
+
|
|
49
|
+
# MinIO for data lake
|
|
50
|
+
MINIO_ENDPOINT: 'minio:9000'
|
|
51
|
+
MINIO_ACCESS_KEY: 'minioadmin'
|
|
52
|
+
MINIO_SECRET_KEY: 'minioadmin'
|
|
53
|
+
MINIO_BUCKET_RAW: 'raw-data'
|
|
54
|
+
MINIO_BUCKET_PROCESSED: 'processed-data'
|
|
55
|
+
MINIO_USE_SSL: 'false'
|
|
56
|
+
|
|
57
|
+
# Data processing settings
|
|
58
|
+
SPARK_MASTER: 'local[*]'
|
|
59
|
+
DASK_SCHEDULER: 'threads'
|
|
60
|
+
|
|
61
|
+
# Jupyter settings (only used if the jupyter overlay is added; this preset does not select it)
|
|
62
|
+
JUPYTER_PORT: '8888'
|
|
63
|
+
JUPYTER_TOKEN: 'data-engineering'
|
|
64
|
+
|
|
65
|
+
# Suggested port mappings
|
|
66
|
+
portMappings:
|
|
67
|
+
jupyter: 8888 # Only used if the jupyter overlay is added (not selected by this preset)
|
|
68
|
+
minio: 9000
|
|
69
|
+
minio-console: 9001
|
|
70
|
+
|
|
71
|
+
# README snippet to add to generated devcontainer
|
|
72
|
+
readme: |
|
|
73
|
+
## Data Engineering Stack
|
|
74
|
+
|
|
75
|
+
This devcontainer is configured for data engineering and analytics:
|
|
76
|
+
|
|
77
|
+
### Architecture
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
Data Sources ──→ Raw Data (MinIO) ──→ ETL Pipeline (Python)
|
|
81
|
+
│
|
|
82
|
+
↓
|
|
83
|
+
Processed Data (MinIO) ──→ Analytics DB
|
|
84
|
+
│
|
|
85
|
+
↓
|
|
86
|
+
Analysis/Reporting
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Services
|
|
90
|
+
|
|
91
|
+
- **Python**: Data processing runtime with scientific libraries
|
|
92
|
+
- **Database**: Analytics database (PostgreSQL or MongoDB)
|
|
93
|
+
- **MinIO**: S3-compatible data lake (ports 9000, 9001)
|
|
94
|
+
- **Modern CLI Tools**: jq, yq, bat, fd, ripgrep for data wrangling
|
|
95
|
+
|
|
96
|
+
### Connection Strings
|
|
97
|
+
|
|
98
|
+
#### Database
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# PostgreSQL
|
|
102
|
+
DATABASE_URL="postgresql://postgres:postgres@postgres:5432/datawarehouse"
|
|
103
|
+
|
|
104
|
+
# MongoDB (alternative)
|
|
105
|
+
MONGODB_URL="mongodb://mongodb:27017/datawarehouse"
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
#### MinIO (Data Lake)
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
MINIO_ENDPOINT="minio:9000"
|
|
112
|
+
MINIO_ACCESS_KEY="minioadmin"
|
|
113
|
+
MINIO_SECRET_KEY="minioadmin"
|
|
114
|
+
MINIO_BUCKET_RAW="raw-data"
|
|
115
|
+
MINIO_BUCKET_PROCESSED="processed-data"
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Python Libraries
|
|
119
|
+
|
|
120
|
+
Pre-installed data science libraries are configured in `global-packages-python.txt`:
|
|
121
|
+
|
|
122
|
+
- **pandas**: Data manipulation and analysis
|
|
123
|
+
- **polars**: Fast DataFrame library (Rust-based)
|
|
124
|
+
- **numpy**: Numerical computing
|
|
125
|
+
- **sqlalchemy**: Database ORM
|
|
126
|
+
- **psycopg2**: PostgreSQL driver
|
|
127
|
+
- **pymongo**: MongoDB driver
|
|
128
|
+
- **boto3**: AWS SDK (for MinIO/S3)
|
|
129
|
+
- **requests**: HTTP library for API calls
|
|
130
|
+
|
|
131
|
+
Install additional libraries:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
pip install dask pyarrow fastparquet
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Quick Start
|
|
138
|
+
|
|
139
|
+
#### 1. Set Up MinIO Buckets
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
# Access MinIO Console at http://localhost:9001
|
|
143
|
+
# Login: minioadmin/minioadmin
|
|
144
|
+
|
|
145
|
+
# Or use CLI:
|
|
146
|
+
mc alias set local http://minio:9000 minioadmin minioadmin
|
|
147
|
+
mc mb local/raw-data
|
|
148
|
+
mc mb local/processed-data
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
#### 2. Upload Raw Data to MinIO
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
import boto3
|
|
155
|
+
from io import BytesIO
|
|
156
|
+
|
|
157
|
+
# Configure S3 client for MinIO
|
|
158
|
+
s3 = boto3.client(
|
|
159
|
+
's3',
|
|
160
|
+
endpoint_url='http://minio:9000',
|
|
161
|
+
aws_access_key_id='minioadmin',
|
|
162
|
+
aws_secret_access_key='minioadmin'
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
# Upload CSV file
|
|
166
|
+
s3.upload_file('data.csv', 'raw-data', 'sales/2024/data.csv')
|
|
167
|
+
|
|
168
|
+
# Or upload from memory
|
|
169
|
+
csv_buffer = BytesIO()
|
|
170
|
+
df.to_csv(csv_buffer, index=False)
|
|
171
|
+
s3.put_object(
|
|
172
|
+
Bucket='raw-data',
|
|
173
|
+
Key='sales/2024/data.csv',
|
|
174
|
+
Body=csv_buffer.getvalue()
|
|
175
|
+
)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
#### 3. ETL Pipeline Example
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
import pandas as pd
|
|
182
|
+
import boto3
|
|
183
|
+
from sqlalchemy import create_engine
|
|
184
|
+
|
|
185
|
+
# Initialize connections
|
|
186
|
+
s3 = boto3.client('s3', endpoint_url='http://minio:9000',
|
|
187
|
+
aws_access_key_id='minioadmin',
|
|
188
|
+
aws_secret_access_key='minioadmin')
|
|
189
|
+
engine = create_engine('postgresql://postgres:postgres@postgres:5432/datawarehouse')
|
|
190
|
+
|
|
191
|
+
# Extract: Read from MinIO
|
|
192
|
+
obj = s3.get_object(Bucket='raw-data', Key='sales/2024/data.csv')
|
|
193
|
+
df = pd.read_csv(obj['Body'])
|
|
194
|
+
|
|
195
|
+
# Transform: Clean and process
|
|
196
|
+
df = df.dropna()
|
|
197
|
+
df['date'] = pd.to_datetime(df['date'])
|
|
198
|
+
df['revenue'] = df['quantity'] * df['price']
|
|
199
|
+
|
|
200
|
+
# Load: Save to database
|
|
201
|
+
df.to_sql('sales', engine, if_exists='append', index=False)
|
|
202
|
+
|
|
203
|
+
# Archive processed data to MinIO
|
|
204
|
+
df.to_parquet('/tmp/processed.parquet')
|
|
205
|
+
s3.upload_file('/tmp/processed.parquet', 'processed-data',
|
|
206
|
+
'sales/2024/processed.parquet')
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Data Processing Patterns
|
|
210
|
+
|
|
211
|
+
#### Batch Processing with Pandas
|
|
212
|
+
|
|
213
|
+
```python
|
|
214
|
+
import pandas as pd
|
|
215
|
+
|
|
216
|
+
# Read data in chunks for large files
|
|
217
|
+
chunk_size = 10000
|
|
218
|
+
chunks = pd.read_csv('large_file.csv', chunksize=chunk_size)
|
|
219
|
+
|
|
220
|
+
for chunk in chunks:
|
|
221
|
+
# Process chunk
|
|
222
|
+
processed = chunk.groupby('category').agg({
|
|
223
|
+
'revenue': 'sum',
|
|
224
|
+
'quantity': 'count'
|
|
225
|
+
})
|
|
226
|
+
# Write to database
|
|
227
|
+
processed.to_sql('aggregates', engine, if_exists='append')
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
#### Using Polars (Faster)
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
import polars as pl
|
|
234
|
+
|
|
235
|
+
# Read and process with Polars
|
|
236
|
+
df = pl.read_csv('data.csv')
|
|
237
|
+
result = (
|
|
238
|
+
df
|
|
239
|
+
.filter(pl.col('revenue') > 1000)
|
|
240
|
+
.group_by('category')
|
|
241
|
+
.agg([
|
|
242
|
+
pl.col('revenue').sum().alias('total_revenue'),
|
|
243
|
+
pl.col('quantity').mean().alias('avg_quantity')
|
|
244
|
+
])
|
|
245
|
+
)
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
#### Data Lake Organization
|
|
249
|
+
|
|
250
|
+
Organize data in MinIO using partitions:
|
|
251
|
+
|
|
252
|
+
```
|
|
253
|
+
raw-data/
|
|
254
|
+
├── sales/
|
|
255
|
+
│ ├── 2024/
|
|
256
|
+
│ │ ├── 01/
|
|
257
|
+
│ │ │ └── data.csv
|
|
258
|
+
│ │ └── 02/
|
|
259
|
+
│ └── 2023/
|
|
260
|
+
└── inventory/
|
|
261
|
+
└── current/
|
|
262
|
+
└── stock.json
|
|
263
|
+
|
|
264
|
+
processed-data/
|
|
265
|
+
├── sales/
|
|
266
|
+
│ ├── daily_aggregates/
|
|
267
|
+
│ │ └── 2024-01-15.parquet
|
|
268
|
+
│ └── monthly_summary/
|
|
269
|
+
│ └── 2024-01.parquet
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### Database Analytics
|
|
273
|
+
|
|
274
|
+
#### PostgreSQL
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
import pandas as pd
|
|
278
|
+
from sqlalchemy import create_engine
|
|
279
|
+
|
|
280
|
+
engine = create_engine('postgresql://postgres:postgres@postgres:5432/datawarehouse')
|
|
281
|
+
|
|
282
|
+
# Read data
|
|
283
|
+
df = pd.read_sql('SELECT * FROM sales WHERE date >= CURRENT_DATE - 30', engine)
|
|
284
|
+
|
|
285
|
+
# Write data
|
|
286
|
+
df.to_sql('sales_summary', engine, if_exists='replace')
|
|
287
|
+
|
|
288
|
+
# Execute custom SQL
|
|
289
|
+
with engine.connect() as conn:
|
|
290
|
+
result = conn.execute("""
|
|
291
|
+
SELECT category, SUM(revenue) as total
|
|
292
|
+
FROM sales
|
|
293
|
+
GROUP BY category
|
|
294
|
+
ORDER BY total DESC
|
|
295
|
+
""")
|
|
296
|
+
for row in result:
|
|
297
|
+
print(row)
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
### Modern CLI Tools for Data
|
|
301
|
+
|
|
302
|
+
```bash
|
|
303
|
+
# Inspect JSON data
|
|
304
|
+
cat data.json | jq '.[] | select(.revenue > 1000)'
|
|
305
|
+
|
|
306
|
+
# Search in CSV files
|
|
307
|
+
rg -t csv 'product_name'
|
|
308
|
+
|
|
309
|
+
# Find data files
|
|
310
|
+
fd -e csv -e parquet -e json
|
|
311
|
+
|
|
312
|
+
# Preview files with syntax highlighting
|
|
313
|
+
bat sales.csv
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
### Workflow Automation
|
|
317
|
+
|
|
318
|
+
Create data pipelines with Python scripts:
|
|
319
|
+
|
|
320
|
+
```python
|
|
321
|
+
# pipeline.py
|
|
322
|
+
import schedule
|
|
323
|
+
import time
|
|
324
|
+
|
|
325
|
+
def etl_job():
|
|
326
|
+
print("Running ETL pipeline...")
|
|
327
|
+
# Extract, transform, load
|
|
328
|
+
pass
|
|
329
|
+
|
|
330
|
+
# Run every hour
|
|
331
|
+
schedule.every().hour.do(etl_job)
|
|
332
|
+
|
|
333
|
+
while True:
|
|
334
|
+
schedule.run_pending()
|
|
335
|
+
time.sleep(60)
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
Or use cron:
|
|
339
|
+
|
|
340
|
+
```bash
|
|
341
|
+
# Add to crontab (for cron, pipeline.py should run the ETL job once and exit, without the schedule loop above)
|
|
342
|
+
0 * * * * python /workspace/pipeline.py
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
### Data Quality
|
|
346
|
+
|
|
347
|
+
```python
|
|
348
|
+
# Data validation with pandas
|
|
349
|
+
def validate_data(df):
|
|
350
|
+
# Check for nulls
|
|
351
|
+
assert df['revenue'].notna().all(), "Revenue has null values"
|
|
352
|
+
|
|
353
|
+
# Check data types
|
|
354
|
+
assert df['date'].dtype == 'datetime64[ns]', "Date is not datetime"
|
|
355
|
+
|
|
356
|
+
# Check ranges
|
|
357
|
+
assert (df['quantity'] > 0).all(), "Quantity must be positive"
|
|
358
|
+
|
|
359
|
+
# Check uniqueness
|
|
360
|
+
assert df['transaction_id'].is_unique, "Duplicate transactions found"
|
|
361
|
+
|
|
362
|
+
validate_data(df)
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
### Future Enhancements
|
|
366
|
+
|
|
367
|
+
When the Jupyter and DuckDB overlays are added to this stack (this preset does not select them by default):
|
|
368
|
+
|
|
369
|
+
#### Jupyter Notebooks
|
|
370
|
+
|
|
371
|
+
- Interactive data exploration
|
|
372
|
+
- Visualization with matplotlib, seaborn
|
|
373
|
+
- Experiment tracking
|
|
374
|
+
- Collaborative analysis
|
|
375
|
+
|
|
376
|
+
#### DuckDB
|
|
377
|
+
|
|
378
|
+
- In-process SQL analytics
|
|
379
|
+
- Query MinIO Parquet files directly
|
|
380
|
+
- Faster than loading to pandas
|
|
381
|
+
- OLAP-style aggregations
|
|
382
|
+
|
|
383
|
+
### Next Steps
|
|
384
|
+
|
|
385
|
+
- Design your data lake structure in MinIO
|
|
386
|
+
- Set up database schemas for analytics
|
|
387
|
+
- Create ETL pipelines for data ingestion
|
|
388
|
+
- Implement data quality checks
|
|
389
|
+
- Set up scheduled jobs for recurring pipelines
|
|
390
|
+
- Add data monitoring and alerting
|
|
391
|
+
- Document data lineage
|
|
392
|
+
- Create data catalog
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
# Event-Sourced Service Preset
|
|
2
|
+
# Development environment for event-driven/CQRS applications
|
|
3
|
+
|
|
4
|
+
id: event-sourced-service
|
|
5
|
+
name: Event-Sourced Service
|
|
6
|
+
description: Event-driven architecture with event store, message broker, and observability
|
|
7
|
+
type: meta
|
|
8
|
+
category: preset
|
|
9
|
+
supports: [compose] # Requires Docker Compose for services
|
|
10
|
+
tags: [preset, event-sourcing, cqrs, messaging, event-driven]
|
|
11
|
+
|
|
12
|
+
# Overlays to select
|
|
13
|
+
selects:
|
|
14
|
+
# Always included
|
|
15
|
+
required:
|
|
16
|
+
- minio # Object storage for events
|
|
17
|
+
- otel-collector
|
|
18
|
+
- jaeger
|
|
19
|
+
- prometheus
|
|
20
|
+
- grafana
|
|
21
|
+
|
|
22
|
+
# User makes choices
|
|
23
|
+
userChoice:
|
|
24
|
+
language:
|
|
25
|
+
id: language
|
|
26
|
+
prompt: Select service language/framework
|
|
27
|
+
options: [nodejs, dotnet, python, go, java]
|
|
28
|
+
defaultOption: nodejs
|
|
29
|
+
|
|
30
|
+
eventStore:
|
|
31
|
+
id: eventStore
|
|
32
|
+
prompt: Select event store database
|
|
33
|
+
options: [postgres, mongodb]
|
|
34
|
+
defaultOption: postgres
|
|
35
|
+
|
|
36
|
+
messaging:
|
|
37
|
+
id: messaging
|
|
38
|
+
prompt: Select message broker
|
|
39
|
+
options: [rabbitmq, redpanda, nats]
|
|
40
|
+
defaultOption: rabbitmq
|
|
41
|
+
|
|
42
|
+
# Glue configuration - integration helpers
|
|
43
|
+
glueConfig:
|
|
44
|
+
# Pre-configured environment variables
|
|
45
|
+
environment:
|
|
46
|
+
# Event store configuration (varies by choice)
|
|
47
|
+
# PostgreSQL event store
|
|
48
|
+
EVENT_STORE_URL: 'postgresql://postgres:postgres@postgres:5432/eventstore'
|
|
49
|
+
POSTGRES_HOST: 'postgres'
|
|
50
|
+
POSTGRES_PORT: '5432'
|
|
51
|
+
POSTGRES_DB: 'eventstore'
|
|
52
|
+
|
|
53
|
+
# MongoDB event store (alternative)
|
|
54
|
+
MONGODB_URL: 'mongodb://mongodb:27017/eventstore'
|
|
55
|
+
MONGODB_HOST: 'mongodb'
|
|
56
|
+
MONGODB_PORT: '27017'
|
|
57
|
+
MONGODB_DB: 'eventstore'
|
|
58
|
+
|
|
59
|
+
# MinIO for event snapshots and archives
|
|
60
|
+
MINIO_ENDPOINT: 'minio:9000'
|
|
61
|
+
MINIO_ACCESS_KEY: 'minioadmin'
|
|
62
|
+
MINIO_SECRET_KEY: 'minioadmin'
|
|
63
|
+
MINIO_BUCKET: 'events'
|
|
64
|
+
MINIO_USE_SSL: 'false'
|
|
65
|
+
|
|
66
|
+
# Messaging URLs (varies by choice)
|
|
67
|
+
# RabbitMQ
|
|
68
|
+
RABBITMQ_URL: 'amqp://rabbitmq:5672'
|
|
69
|
+
|
|
70
|
+
# Redpanda (Kafka-compatible)
|
|
71
|
+
KAFKA_BROKERS: 'redpanda:9092'
|
|
72
|
+
|
|
73
|
+
# NATS
|
|
74
|
+
NATS_URL: 'nats://nats:4222'
|
|
75
|
+
|
|
76
|
+
# OpenTelemetry configuration
|
|
77
|
+
OTEL_EXPORTER_OTLP_ENDPOINT: 'http://otel-collector:4317'
|
|
78
|
+
OTEL_SERVICE_NAME: 'event-service'
|
|
79
|
+
OTEL_METRICS_EXPORTER: 'otlp'
|
|
80
|
+
OTEL_TRACES_EXPORTER: 'otlp'
|
|
81
|
+
OTEL_LOGS_EXPORTER: 'otlp'
|
|
82
|
+
|
|
83
|
+
# Event sourcing settings
|
|
84
|
+
EVENT_STORE_SNAPSHOT_THRESHOLD: '100'
|
|
85
|
+
EVENT_STORE_RETENTION_DAYS: '30'
|
|
86
|
+
|
|
87
|
+
# Suggested port mappings
|
|
88
|
+
portMappings:
|
|
89
|
+
service: 8080
|
|
90
|
+
grafana: 3000
|
|
91
|
+
prometheus: 9090
|
|
92
|
+
jaeger: 16686
|
|
93
|
+
minio: 9000
|
|
94
|
+
minio-console: 9001
|
|
95
|
+
|
|
96
|
+
# README snippet to add to generated devcontainer
|
|
97
|
+
readme: |
|
|
98
|
+
## Event-Sourced Service Stack
|
|
99
|
+
|
|
100
|
+
This devcontainer is configured for event-driven architecture development:
|
|
101
|
+
|
|
102
|
+
### Architecture Pattern
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
Commands ──→ Service ──→ Event Store (append-only)
|
|
106
|
+
│ │
|
|
107
|
+
↓ ↓
|
|
108
|
+
Message Broker ← Events Published
|
|
109
|
+
│
|
|
110
|
+
↓
|
|
111
|
+
Read Models / Projections
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Services
|
|
115
|
+
|
|
116
|
+
- **Event Store**: Your chosen database (PostgreSQL or MongoDB) for event persistence
|
|
117
|
+
- **Message Broker**: Async event distribution (RabbitMQ, Redpanda, or NATS)
|
|
118
|
+
- **MinIO**: Object storage for event snapshots and archives (ports 9000, 9001)
|
|
119
|
+
- **OpenTelemetry Collector**: Receives traces, metrics, and logs over OTLP (port 4317 for gRPC, port 4318 for HTTP)
|
|
120
|
+
- **Jaeger**: Trace visualization (port 16686)
|
|
121
|
+
- **Prometheus**: Metrics storage (port 9090)
|
|
122
|
+
- **Grafana**: Observability dashboard (port 3000)
|
|
123
|
+
|
|
124
|
+
### Connection Strings
|
|
125
|
+
|
|
126
|
+
#### Event Store
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
# PostgreSQL
|
|
130
|
+
EVENT_STORE_URL="postgresql://postgres:postgres@postgres:5432/eventstore"
|
|
131
|
+
|
|
132
|
+
# MongoDB (alternative)
|
|
133
|
+
MONGODB_URL="mongodb://mongodb:27017/eventstore"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
#### Message Broker
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
RABBITMQ_URL="amqp://rabbitmq:5672"
|
|
141
|
+
|
|
142
|
+
# Redpanda (Kafka-compatible)
|
|
143
|
+
KAFKA_BROKERS="redpanda:9092"
|
|
144
|
+
|
|
145
|
+
# NATS
|
|
146
|
+
NATS_URL="nats://nats:4222"
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
#### MinIO (Event Snapshots)
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
MINIO_ENDPOINT="minio:9000"
|
|
153
|
+
MINIO_ACCESS_KEY="minioadmin"
|
|
154
|
+
MINIO_SECRET_KEY="minioadmin"
|
|
155
|
+
MINIO_BUCKET="events"
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Quick Start
|
|
159
|
+
|
|
160
|
+
1. **Start your service** on port 8080
|
|
161
|
+
|
|
162
|
+
2. **Set up event store schema**:
|
|
163
|
+
```bash
|
|
164
|
+
# PostgreSQL example - create events table
|
|
165
|
+
psql -h postgres -U postgres -d eventstore <<EOF
|
|
166
|
+
CREATE TABLE events (
|
|
167
|
+
event_id UUID PRIMARY KEY,
|
|
168
|
+
aggregate_id UUID NOT NULL,
|
|
169
|
+
aggregate_type VARCHAR(255) NOT NULL,
|
|
170
|
+
event_type VARCHAR(255) NOT NULL,
|
|
171
|
+
event_data JSONB NOT NULL,
|
|
172
|
+
metadata JSONB,
|
|
173
|
+
version BIGINT NOT NULL,
|
|
174
|
+
timestamp TIMESTAMP DEFAULT NOW(),
|
|
175
|
+
UNIQUE(aggregate_id, version)
|
|
176
|
+
);
|
|
177
|
+
CREATE INDEX idx_aggregate ON events(aggregate_id);
|
|
178
|
+
CREATE INDEX idx_timestamp ON events(timestamp);
|
|
179
|
+
EOF
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
3. **Configure MinIO bucket**:
|
|
183
|
+
- Access MinIO Console at http://localhost:9001
|
|
184
|
+
- Login with minioadmin/minioadmin
|
|
185
|
+
- Create bucket named "events"
|
|
186
|
+
- Configure lifecycle policies for event retention
|
|
187
|
+
|
|
188
|
+
4. **Set up message broker**:
|
|
189
|
+
- RabbitMQ: http://localhost:15672 (guest/guest)
|
|
190
|
+
- Configure exchanges and queues for event distribution
|
|
191
|
+
|
|
192
|
+
### Event Sourcing Patterns
|
|
193
|
+
|
|
194
|
+
This stack supports common event sourcing patterns:
|
|
195
|
+
|
|
196
|
+
#### Command Handler → Event Store
|
|
197
|
+
|
|
198
|
+
```typescript
|
|
199
|
+
// Append events to store
|
|
200
|
+
async function handleCommand(command: Command) {
|
|
201
|
+
const events = aggregate.process(command);
|
|
202
|
+
await eventStore.append(aggregateId, events);
|
|
203
|
+
await messageBroker.publish(events);
|
|
204
|
+
}
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
#### Event Replay / Projection
|
|
208
|
+
|
|
209
|
+
```typescript
|
|
210
|
+
// Rebuild read models from events
|
|
211
|
+
async function rebuildProjection() {
|
|
212
|
+
const events = await eventStore.getEvents(fromTimestamp);
|
|
213
|
+
for (const event of events) {
|
|
214
|
+
await projection.handle(event);
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
#### Snapshotting
|
|
220
|
+
|
|
221
|
+
```typescript
|
|
222
|
+
// Store snapshots in MinIO every N events
|
|
223
|
+
if (aggregate.version % SNAPSHOT_THRESHOLD === 0) {
|
|
224
|
+
const snapshot = aggregate.toSnapshot();
|
|
225
|
+
await minioClient.putObject('events',
|
|
226
|
+
`snapshots/${aggregateId}/${version}.json`,
|
|
227
|
+
JSON.stringify(snapshot)
|
|
228
|
+
);
|
|
229
|
+
}
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### Observability
|
|
233
|
+
|
|
234
|
+
- **Traces**: View command flows in Jaeger (http://localhost:16686)
|
|
235
|
+
- **Metrics**: Monitor event throughput in Prometheus/Grafana
|
|
236
|
+
- **Logs**: Centralized logging via OTEL → Loki (requires adding the loki overlay, which this preset does not include by default)
|
|
237
|
+
|
|
238
|
+
### Event Store Best Practices
|
|
239
|
+
|
|
240
|
+
1. **Append-only**: Never modify or delete events
|
|
241
|
+
2. **Versioning**: Use optimistic concurrency with version numbers
|
|
242
|
+
3. **Idempotency**: Handle duplicate events gracefully
|
|
243
|
+
4. **Snapshots**: Store aggregate snapshots every N events
|
|
244
|
+
5. **Retention**: Archive old events to MinIO for compliance
|
|
245
|
+
|
|
246
|
+
### Message Broker Patterns
|
|
247
|
+
|
|
248
|
+
- **Publish/Subscribe**: Broadcast events to multiple consumers
|
|
249
|
+
- **Competing Consumers**: Scale event handlers horizontally
|
|
250
|
+
- **Dead Letter Queue**: Handle failed event processing
|
|
251
|
+
- **Event Ordering**: Ensure ordered processing per aggregate
|
|
252
|
+
|
|
253
|
+
### Next Steps
|
|
254
|
+
|
|
255
|
+
- Design your domain events and aggregates
|
|
256
|
+
- Implement command handlers with event sourcing
|
|
257
|
+
- Create read model projections
|
|
258
|
+
- Set up event subscribers on message broker
|
|
259
|
+
- Configure OTEL instrumentation for tracing
|
|
260
|
+
- Create Grafana dashboards for event metrics
|
|
261
|
+
- Implement snapshotting strategy
|
|
262
|
+
- Set up event archival to MinIO
|