@batchactions/distributed 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -279
- package/package.json +24 -13
package/README.md
CHANGED
|
@@ -1,329 +1,88 @@
|
|
|
1
1
|
# @batchactions/distributed
|
|
2
2
|
|
|
3
|
-
Distributed
|
|
3
|
+
Distributed orchestration for `@batchactions` imports.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Use this package when one process is not enough and you need multiple workers (Lambda, containers, queue workers) claiming and processing batches in parallel.
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
- You need to import **hundreds of thousands or millions of records** and a single process is too slow.
|
|
10
|
-
- You are running in **serverless** (AWS Lambda, Google Cloud Functions) and want to parallelize across multiple invocations.
|
|
11
|
-
- You need **crash resilience** — if a worker dies, another worker picks up the batch.
|
|
12
|
-
|
|
13
|
-
For simpler scenarios (< 100k records, single server), `@batchactions/core` alone is sufficient. Use `processChunk()` for serverless with time limits, or `maxConcurrentBatches` for in-process parallelism.
|
|
14
|
-
|
|
15
|
-
## Installation
|
|
7
|
+
## Install
|
|
16
8
|
|
|
17
9
|
```bash
|
|
18
|
-
npm install @batchactions/distributed
|
|
10
|
+
npm install @batchactions/distributed @batchactions/core @batchactions/import
|
|
19
11
|
```
|
|
20
12
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
- `@batchactions/core` >= 0.4.0
|
|
24
|
-
|
|
25
|
-
You also need a `DistributedStateStore` implementation. The official one is [`@batchactions/state-sequelize`](https://www.npmjs.com/package/@batchactions/state-sequelize):
|
|
13
|
+
You also need a `DistributedStateStore` implementation. Choose one:
|
|
26
14
|
|
|
27
15
|
```bash
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
## How It Works
|
|
32
|
-
|
|
33
|
-
A two-phase processing model:
|
|
16
|
+
# Option A: Sequelize
|
|
17
|
+
npm install @batchactions/state-sequelize sequelize
|
|
34
18
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
┌─────────────────────────────────┐
|
|
38
|
-
│ Stream source file │
|
|
39
|
-
│ Validate & materialize records │
|
|
40
|
-
│ Create batch boundaries │
|
|
41
|
-
│ Save everything to StateStore │
|
|
42
|
-
└──────────┬────────────────────────┘
|
|
43
|
-
│
|
|
44
|
-
{ jobId, totalBatches }
|
|
45
|
-
│
|
|
46
|
-
┌──────────────┼──────────────┐
|
|
47
|
-
▼ ▼ ▼
|
|
48
|
-
Phase 2: PROCESS (N parallel workers)
|
|
49
|
-
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
|
50
|
-
│ Worker 1 │ │ Worker 2 │ │ Worker N │
|
|
51
|
-
│ claimBatch │ │ claimBatch │ │ claimBatch │
|
|
52
|
-
│ process │ │ process │ │ process │
|
|
53
|
-
│ next batch │ │ next batch │ │ next batch │
|
|
54
|
-
│ ... │ │ ... │ │ ... │
|
|
55
|
-
└──────────────┘ └──────────────┘ └──────────────┘
|
|
56
|
-
│ │ │
|
|
57
|
-
└──────────────┼──────────────┘
|
|
58
|
-
│
|
|
59
|
-
tryFinalizeJob()
|
|
60
|
-
(exactly-once)
|
|
19
|
+
# Option B: Prisma (v6 or v7)
|
|
20
|
+
npm install @batchactions/state-prisma
|
|
61
21
|
```
|
|
62
22
|
|
|
63
|
-
|
|
23
|
+
## Processing Model
|
|
64
24
|
|
|
65
|
-
|
|
25
|
+
1. `prepare(source, parser)` runs once in an orchestrator process.
|
|
26
|
+
2. `processWorkerBatch(jobId, processor, workerId)` runs in N workers until no batches remain.
|
|
66
27
|
|
|
67
28
|
## Quick Start
|
|
68
29
|
|
|
69
|
-
### Orchestrator (Phase 1)
|
|
70
|
-
|
|
71
30
|
```typescript
|
|
72
31
|
import { DistributedImport } from '@batchactions/distributed';
|
|
73
32
|
import { CsvParser } from '@batchactions/import';
|
|
74
33
|
import { UrlSource } from '@batchactions/core';
|
|
75
34
|
import { SequelizeStateStore } from '@batchactions/state-sequelize';
|
|
76
|
-
import { Sequelize } from 'sequelize';
|
|
77
|
-
|
|
78
|
-
const sequelize = new Sequelize(process.env.DATABASE_URL!);
|
|
79
|
-
const stateStore = new SequelizeStateStore(sequelize);
|
|
80
|
-
await stateStore.initialize();
|
|
81
35
|
|
|
82
36
|
const di = new DistributedImport({
|
|
83
37
|
schema: {
|
|
84
38
|
fields: [
|
|
85
39
|
{ name: 'email', type: 'email', required: true },
|
|
86
40
|
{ name: 'name', type: 'string', required: true },
|
|
87
|
-
{ name: 'role', type: 'string', required: false, defaultValue: 'user' },
|
|
88
41
|
],
|
|
89
42
|
},
|
|
90
43
|
batchSize: 500,
|
|
91
|
-
stateStore,
|
|
44
|
+
stateStore: new SequelizeStateStore(sequelize),
|
|
92
45
|
continueOnError: true,
|
|
93
46
|
});
|
|
94
47
|
|
|
95
|
-
// Phase 1: Prepare
|
|
96
48
|
const source = new UrlSource('https://storage.example.com/users.csv');
|
|
97
|
-
const { jobId, totalRecords, totalBatches } = await di.prepare(source, new CsvParser());
|
|
98
|
-
|
|
99
|
-
console.log(`Job ${jobId}: ${totalRecords} records in ${totalBatches} batches`);
|
|
100
|
-
|
|
101
|
-
// Fan out: send { jobId } to N workers via SQS, SNS, EventBridge, etc.
|
|
102
|
-
await sqs.sendMessage({
|
|
103
|
-
QueueUrl: WORKER_QUEUE_URL,
|
|
104
|
-
MessageBody: JSON.stringify({ jobId }),
|
|
105
|
-
});
|
|
106
|
-
```
|
|
107
|
-
|
|
108
|
-
### Worker (Phase 2)
|
|
109
|
-
|
|
110
|
-
```typescript
|
|
111
|
-
import { DistributedImport } from '@batchactions/distributed';
|
|
112
|
-
import { SequelizeStateStore } from '@batchactions/state-sequelize';
|
|
113
|
-
import { Sequelize } from 'sequelize';
|
|
114
|
-
|
|
115
|
-
// Lambda handler
|
|
116
|
-
export async function handler(event: SQSEvent, context: Context) {
|
|
117
|
-
const { jobId } = JSON.parse(event.Records[0].body);
|
|
118
|
-
const workerId = context.awsRequestId;
|
|
119
|
-
|
|
120
|
-
const sequelize = new Sequelize(process.env.DATABASE_URL!);
|
|
121
|
-
const stateStore = new SequelizeStateStore(sequelize);
|
|
122
|
-
await stateStore.initialize();
|
|
123
|
-
|
|
124
|
-
const di = new DistributedImport({
|
|
125
|
-
schema: {
|
|
126
|
-
fields: [
|
|
127
|
-
{ name: 'email', type: 'email', required: true },
|
|
128
|
-
{ name: 'name', type: 'string', required: true },
|
|
129
|
-
{ name: 'role', type: 'string', required: false, defaultValue: 'user' },
|
|
130
|
-
],
|
|
131
|
-
},
|
|
132
|
-
batchSize: 500,
|
|
133
|
-
stateStore,
|
|
134
|
-
continueOnError: true,
|
|
135
|
-
});
|
|
136
|
-
|
|
137
|
-
// Process batches until none remain
|
|
138
|
-
while (true) {
|
|
139
|
-
const result = await di.processWorkerBatch(jobId, async (record) => {
|
|
140
|
-
await db.query(
|
|
141
|
-
'INSERT INTO users (email, name, role) VALUES ($1, $2, $3)',
|
|
142
|
-
[record.email, record.name, record.role],
|
|
143
|
-
);
|
|
144
|
-
}, workerId);
|
|
145
|
-
|
|
146
|
-
if (!result.claimed) {
|
|
147
|
-
console.log('No more batches to process');
|
|
148
|
-
break;
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
console.log(`Batch ${result.batchIndex}: ${result.processedCount} processed, ${result.failedCount} failed`);
|
|
152
|
-
|
|
153
|
-
if (result.jobComplete) {
|
|
154
|
-
console.log('Job finalized by this worker!');
|
|
155
|
-
break;
|
|
156
|
-
}
|
|
157
|
-
}
|
|
158
|
-
}
|
|
159
|
-
```
|
|
160
|
-
|
|
161
|
-
## Configuration
|
|
162
|
-
|
|
163
|
-
### `DistributedImportConfig`
|
|
164
|
-
|
|
165
|
-
| Property | Type | Default | Description |
|
|
166
|
-
|---|---|---|---|
|
|
167
|
-
| `schema` | `SchemaDefinition` | required | Field definitions and validation rules |
|
|
168
|
-
| `batchSize` | `number` | `100` | Records per batch |
|
|
169
|
-
| `continueOnError` | `boolean` | `true` | Continue when records fail validation or processing |
|
|
170
|
-
| `stateStore` | `StateStore` | required | Must implement `DistributedStateStore` (e.g. `SequelizeStateStore`) |
|
|
171
|
-
| `maxRetries` | `number` | `0` | Retry attempts for processor failures (exponential backoff) |
|
|
172
|
-
| `retryDelayMs` | `number` | `1000` | Base delay in ms between retry attempts |
|
|
173
|
-
| `hooks` | `JobHooks` | -- | Lifecycle hooks (`beforeValidate`, `afterValidate`, `beforeProcess`, `afterProcess`) |
|
|
174
|
-
| `duplicateChecker` | `DuplicateChecker` | -- | External duplicate detection |
|
|
175
|
-
| `staleBatchTimeoutMs` | `number` | `900000` | Timeout in ms before stale batches are reclaimed (15 min default) |
|
|
176
|
-
|
|
177
|
-
## API Reference
|
|
178
|
-
|
|
179
|
-
### `DistributedImport`
|
|
180
|
-
|
|
181
|
-
| Method | Description |
|
|
182
|
-
|---|---|
|
|
183
|
-
| `prepare(source, parser)` | Phase 1: Stream source, materialize records, create batches. Returns `PrepareResult`. |
|
|
184
|
-
| `processWorkerBatch(jobId, processor, workerId)` | Phase 2: Claim next batch, process records, finalize if last. Returns `DistributedBatchResult`. |
|
|
185
|
-
| `on(event, handler)` | Subscribe to a domain event. |
|
|
186
|
-
| `onAny(handler)` | Subscribe to all events. |
|
|
187
|
-
| `offAny(handler)` | Unsubscribe a wildcard handler. |
|
|
188
|
-
|
|
189
|
-
### `PrepareResult`
|
|
190
|
-
|
|
191
|
-
| Field | Type | Description |
|
|
192
|
-
|---|---|---|
|
|
193
|
-
| `jobId` | `string` | Unique job identifier. Pass this to workers. |
|
|
194
|
-
| `totalRecords` | `number` | Total records found in the source. |
|
|
195
|
-
| `totalBatches` | `number` | Number of batches created. |
|
|
49
|
+
const { jobId } = await di.prepare(source, new CsvParser());
|
|
196
50
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|---|---|---|
|
|
201
|
-
| `claimed` | `boolean` | Whether a batch was successfully claimed. `false` means no batches remain. |
|
|
202
|
-
| `batchId` | `string?` | ID of the batch that was processed. |
|
|
203
|
-
| `batchIndex` | `number?` | Index of the batch that was processed. |
|
|
204
|
-
| `processedCount` | `number` | Records successfully processed in this batch. |
|
|
205
|
-
| `failedCount` | `number` | Records that failed in this batch. |
|
|
206
|
-
| `jobComplete` | `boolean` | `true` if this worker finalized the entire job. |
|
|
207
|
-
| `jobId` | `string` | The job identifier. |
|
|
208
|
-
|
|
209
|
-
## Crash Recovery
|
|
210
|
-
|
|
211
|
-
If a worker crashes or times out, its claimed batch becomes "stale". The next `processWorkerBatch()` call automatically reclaims stale batches (based on `staleBatchTimeoutMs`) before claiming new ones.
|
|
212
|
-
|
|
213
|
-
**Requirements:**
|
|
214
|
-
|
|
215
|
-
- Your **processor callback must be idempotent**. If a batch is re-processed after a crash, records may be sent to the processor again.
|
|
216
|
-
- Use `ON CONFLICT DO NOTHING` / `INSERT ... IGNORE` or similar patterns in your database writes.
|
|
217
|
-
|
|
218
|
-
You can also manually reclaim stale batches:
|
|
219
|
-
|
|
220
|
-
```typescript
|
|
221
|
-
import { isDistributedStateStore } from '@batchactions/distributed';
|
|
222
|
-
|
|
223
|
-
if (isDistributedStateStore(stateStore)) {
|
|
224
|
-
const reclaimed = await stateStore.reclaimStaleBatches(jobId, 60_000); // 1 min timeout
|
|
225
|
-
console.log(`Reclaimed ${reclaimed} stale batches`);
|
|
51
|
+
while (true) {
|
|
52
|
+
const result = await di.processWorkerBatch(jobId, processRecord, workerId);
|
|
53
|
+
if (!result.claimed || result.jobComplete) break;
|
|
226
54
|
}
|
|
227
55
|
```
|
|
228
56
|
|
|
229
|
-
## Events
|
|
230
|
-
|
|
231
|
-
Each worker has its own local event bus. Subscribe to events for logging, metrics, or progress tracking:
|
|
57
|
+
## Main Exports
|
|
232
58
|
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
di.on('record:failed', (e) => {
|
|
239
|
-
console.error(`Record ${e.recordIndex} failed: ${e.error}`);
|
|
240
|
-
});
|
|
241
|
-
|
|
242
|
-
di.on('import:completed', (e) => {
|
|
243
|
-
// Only emitted by the worker that finalizes the job
|
|
244
|
-
console.log(`Job complete! ${e.summary.processed} processed, ${e.summary.failed} failed`);
|
|
245
|
-
});
|
|
246
|
-
|
|
247
|
-
// Forward all events (e.g. to CloudWatch, Datadog)
|
|
248
|
-
di.onAny((event) => {
|
|
249
|
-
metrics.emit(event.type, event);
|
|
250
|
-
});
|
|
251
|
-
```
|
|
59
|
+
- `DistributedImport`
|
|
60
|
+
- `PrepareResult`
|
|
61
|
+
- `DistributedBatchResult`, `DistributedBatchConfig`
|
|
62
|
+
- `DistributedStateStore` related types (re-exported)
|
|
63
|
+
- `isDistributedStateStore`
|
|
252
64
|
|
|
253
|
-
|
|
65
|
+
For full typed exports, see `packages/distributed/src/index.ts`.
|
|
254
66
|
|
|
255
|
-
## Architecture
|
|
67
|
+
## Compatibility
|
|
256
68
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
├── ProcessDistributedBatch.ts # Phase 2 use case
|
|
262
|
-
└── index.ts # Public API
|
|
263
|
-
|
|
264
|
-
Depends on:
|
|
265
|
-
└── @batchactions/core
|
|
266
|
-
├── DistributedStateStore # Port interface (extended StateStore)
|
|
267
|
-
├── BatchReservation # Domain types
|
|
268
|
-
├── SchemaValidator # Validation pipeline
|
|
269
|
-
└── EventBus # Event system
|
|
270
|
-
|
|
271
|
-
Implemented by:
|
|
272
|
-
└── @batchactions/state-sequelize
|
|
273
|
-
└── SequelizeStateStore # Concrete DistributedStateStore
|
|
274
|
-
├── bulkimport_jobs # Job state table
|
|
275
|
-
├── bulkimport_records # Record data table
|
|
276
|
-
└── bulkimport_batches # Batch metadata table (distributed)
|
|
277
|
-
```
|
|
278
|
-
|
|
279
|
-
## Implementing a Custom `DistributedStateStore`
|
|
280
|
-
|
|
281
|
-
If you don't use Sequelize, you can implement the `DistributedStateStore` interface:
|
|
282
|
-
|
|
283
|
-
```typescript
|
|
284
|
-
import type { DistributedStateStore, ClaimBatchResult, DistributedJobStatus, ProcessedRecord } from '@batchactions/distributed';
|
|
285
|
-
|
|
286
|
-
class MyDistributedStore implements DistributedStateStore {
|
|
287
|
-
// ... all StateStore methods plus:
|
|
288
|
-
|
|
289
|
-
async claimBatch(jobId: string, workerId: string): Promise<ClaimBatchResult> {
|
|
290
|
-
// Atomic: find first PENDING batch, set to PROCESSING with workerId
|
|
291
|
-
// Use SELECT FOR UPDATE SKIP LOCKED or similar
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
async releaseBatch(jobId: string, batchId: string, workerId: string): Promise<void> {
|
|
295
|
-
// Reset batch to PENDING (only if claimed by this worker)
|
|
296
|
-
}
|
|
297
|
-
|
|
298
|
-
async reclaimStaleBatches(jobId: string, timeoutMs: number): Promise<number> {
|
|
299
|
-
// Find PROCESSING batches with claimedAt older than timeout, reset to PENDING
|
|
300
|
-
}
|
|
301
|
-
|
|
302
|
-
async saveBatchRecords(jobId: string, batchId: string, records: readonly ProcessedRecord[]): Promise<void> {
|
|
303
|
-
// Bulk insert records for a batch
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
async getBatchRecords(jobId: string, batchId: string): Promise<readonly ProcessedRecord[]> {
|
|
307
|
-
// Load all records for a batch
|
|
308
|
-
}
|
|
69
|
+
- Node.js >= 20.0.0
|
|
70
|
+
- Peer dependencies:
|
|
71
|
+
- `@batchactions/core` >= 0.0.1
|
|
72
|
+
- `@batchactions/import` >= 0.0.1
|
|
309
73
|
|
|
310
|
-
|
|
311
|
-
// Aggregate: count batches by status
|
|
312
|
-
}
|
|
74
|
+
## Operational Notes
|
|
313
75
|
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
}
|
|
318
|
-
}
|
|
319
|
-
```
|
|
76
|
+
- Worker processors must be idempotent.
|
|
77
|
+
- Stale claimed batches are reclaimed automatically based on `staleBatchTimeoutMs`.
|
|
78
|
+
- Job finalization is exactly-once via `tryFinalizeJob()` in the store.
|
|
320
79
|
|
|
321
|
-
##
|
|
80
|
+
## Links
|
|
322
81
|
|
|
323
|
-
-
|
|
324
|
-
-
|
|
325
|
-
-
|
|
82
|
+
- Repository: https://github.com/vgpastor/batchactions/tree/main/packages/distributed
|
|
83
|
+
- Issues: https://github.com/vgpastor/batchactions/issues
|
|
84
|
+
- Contributing guide: https://github.com/vgpastor/batchactions/blob/main/CONTRIBUTING.md
|
|
326
85
|
|
|
327
86
|
## License
|
|
328
87
|
|
|
329
|
-
|
|
88
|
+
MIT
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@batchactions/distributed",
|
|
3
|
-
"version": "0.0.2",
|
|
4
|
-
"description": "Distributed
|
|
3
|
+
"version": "0.0.4",
|
|
4
|
+
"description": "Distributed worker orchestration for TypeScript batch imports with parallel processing, serverless fan-out, and atomic batch claiming",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"exports": {
|
|
7
7
|
".": {
|
|
@@ -22,19 +22,30 @@
|
|
|
22
22
|
"test": "vitest run",
|
|
23
23
|
"test:watch": "vitest",
|
|
24
24
|
"typecheck": "tsc --noEmit",
|
|
25
|
-
"lint": "eslint src/ tests/"
|
|
25
|
+
"lint": "eslint src/ tests/",
|
|
26
|
+
"lint:fix": "eslint src/ tests/ --fix",
|
|
27
|
+
"format": "prettier --write .",
|
|
28
|
+
"format:check": "prettier --check ."
|
|
26
29
|
},
|
|
27
30
|
"keywords": [
|
|
28
|
-
"
|
|
29
|
-
"distributed",
|
|
30
|
-
"
|
|
31
|
+
"distributed-processing",
|
|
32
|
+
"distributed-workers",
|
|
33
|
+
"worker-orchestration",
|
|
31
34
|
"batch-processing",
|
|
35
|
+
"parallel-processing",
|
|
32
36
|
"serverless",
|
|
33
37
|
"lambda",
|
|
34
|
-
"
|
|
38
|
+
"queue-workers",
|
|
39
|
+
"typescript",
|
|
40
|
+
"nodejs",
|
|
41
|
+
"etl"
|
|
35
42
|
],
|
|
36
43
|
"license": "MIT",
|
|
37
|
-
"author": "
|
|
44
|
+
"author": "Victor Garcia <vgpastor@ingenierosweb.co>",
|
|
45
|
+
"homepage": "https://github.com/vgpastor/batchactions/tree/main/packages/distributed#readme",
|
|
46
|
+
"bugs": {
|
|
47
|
+
"url": "https://github.com/vgpastor/batchactions/issues"
|
|
48
|
+
},
|
|
38
49
|
"repository": {
|
|
39
50
|
"type": "git",
|
|
40
51
|
"url": "https://github.com/vgpastor/batchactions",
|
|
@@ -51,12 +62,12 @@
|
|
|
51
62
|
"@batchactions/core": "*",
|
|
52
63
|
"@batchactions/import": "*",
|
|
53
64
|
"@types/node": "^22.13.4",
|
|
54
|
-
"@typescript-eslint/eslint-plugin": "^8.
|
|
55
|
-
"@typescript-eslint/parser": "^8.
|
|
56
|
-
"eslint": "^9.
|
|
65
|
+
"@typescript-eslint/eslint-plugin": "^8.56.0",
|
|
66
|
+
"@typescript-eslint/parser": "^8.56.0",
|
|
67
|
+
"eslint": "^9.21.0",
|
|
57
68
|
"eslint-config-prettier": "^10.0.1",
|
|
58
|
-
"tsup": "^8.
|
|
69
|
+
"tsup": "^8.5.1",
|
|
59
70
|
"typescript": "^5.7.3",
|
|
60
|
-
"vitest": "^
|
|
71
|
+
"vitest": "^4.0.0"
|
|
61
72
|
}
|
|
62
73
|
}
|