@batchactions/distributed 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -280
- package/package.json +16 -9
package/README.md
CHANGED
|
@@ -1,329 +1,84 @@
|
|
|
1
1
|
# @batchactions/distributed
|
|
2
2
|
|
|
3
|
-
Distributed
|
|
3
|
+
Distributed orchestration for `@batchactions` imports.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Use this package when one process is not enough and you need multiple workers (Lambda, containers, queue workers) claiming and processing batches in parallel.
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
- You need to import **hundreds of thousands or millions of records** and a single process is too slow.
|
|
10
|
-
- You are running in **serverless** (AWS Lambda, Google Cloud Functions) and want to parallelize across multiple invocations.
|
|
11
|
-
- You need **crash resilience** — if a worker dies, another worker picks up the batch.
|
|
12
|
-
|
|
13
|
-
For simpler scenarios (< 100k records, single server), `@batchactions/core` alone is sufficient. Use `processChunk()` for serverless with time limits, or `maxConcurrentBatches` for in-process parallelism.
|
|
14
|
-
|
|
15
|
-
## Installation
|
|
7
|
+
## Install
|
|
16
8
|
|
|
17
9
|
```bash
|
|
18
|
-
npm install @batchactions/distributed
|
|
10
|
+
npm install @batchactions/distributed @batchactions/core @batchactions/import
|
|
19
11
|
```
|
|
20
12
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
- `@batchactions/core` >= 0.4.0
|
|
24
|
-
|
|
25
|
-
You also need a `DistributedStateStore` implementation. The official one is [`@batchactions/state-sequelize`](https://www.npmjs.com/package/@batchactions/state-sequelize):
|
|
13
|
+
You also need a `DistributedStateStore` implementation, for example:
|
|
26
14
|
|
|
27
15
|
```bash
|
|
28
|
-
npm install @batchactions/state-sequelize sequelize
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
## How It Works
|
|
32
|
-
|
|
33
|
-
A two-phase processing model:
|
|
34
|
-
|
|
35
|
-
```
|
|
36
|
-
Phase 1: PREPARE (single orchestrator)
|
|
37
|
-
┌─────────────────────────────────┐
|
|
38
|
-
│ Stream source file │
|
|
39
|
-
│ Validate & materialize records │
|
|
40
|
-
│ Create batch boundaries │
|
|
41
|
-
│ Save everything to StateStore │
|
|
42
|
-
└──────────┬──────────────────────-─┘
|
|
43
|
-
│
|
|
44
|
-
{ jobId, totalBatches }
|
|
45
|
-
│
|
|
46
|
-
┌──────────────┼──────────────┐
|
|
47
|
-
▼ ▼ ▼
|
|
48
|
-
Phase 2: PROCESS (N parallel workers)
|
|
49
|
-
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
|
50
|
-
│ Worker 1 │ │ Worker 2 │ │ Worker N │
|
|
51
|
-
│ claimBatch │ │ claimBatch │ │ claimBatch │
|
|
52
|
-
│ process │ │ process │ │ process │
|
|
53
|
-
│ next batch │ │ next batch │ │ next batch │
|
|
54
|
-
│ ... │ │ ... │ │ ... │
|
|
55
|
-
└──────────────┘ └──────────────┘ └──────────────┘
|
|
56
|
-
│ │ │
|
|
57
|
-
└──────────────┼──────────────┘
|
|
58
|
-
│
|
|
59
|
-
tryFinalizeJob()
|
|
60
|
-
(exactly-once)
|
|
16
|
+
npm install @batchactions/state-sequelize sequelize
|
|
61
17
|
```
|
|
62
18
|
|
|
63
|
-
|
|
19
|
+
## Processing Model
|
|
64
20
|
|
|
65
|
-
|
|
21
|
+
1. `prepare(source, parser)` runs once in an orchestrator process.
|
|
22
|
+
2. `processWorkerBatch(jobId, processor, workerId)` runs in N workers until no batches remain.
|
|
66
23
|
|
|
67
24
|
## Quick Start
|
|
68
25
|
|
|
69
|
-
### Orchestrator (Phase 1)
|
|
70
|
-
|
|
71
26
|
```typescript
|
|
72
27
|
import { DistributedImport } from '@batchactions/distributed';
|
|
73
28
|
import { CsvParser } from '@batchactions/import';
|
|
74
29
|
import { UrlSource } from '@batchactions/core';
|
|
75
30
|
import { SequelizeStateStore } from '@batchactions/state-sequelize';
|
|
76
|
-
import { Sequelize } from 'sequelize';
|
|
77
|
-
|
|
78
|
-
const sequelize = new Sequelize(process.env.DATABASE_URL!);
|
|
79
|
-
const stateStore = new SequelizeStateStore(sequelize);
|
|
80
|
-
await stateStore.initialize();
|
|
81
31
|
|
|
82
32
|
const di = new DistributedImport({
|
|
83
33
|
schema: {
|
|
84
34
|
fields: [
|
|
85
35
|
{ name: 'email', type: 'email', required: true },
|
|
86
36
|
{ name: 'name', type: 'string', required: true },
|
|
87
|
-
{ name: 'role', type: 'string', required: false, defaultValue: 'user' },
|
|
88
37
|
],
|
|
89
38
|
},
|
|
90
39
|
batchSize: 500,
|
|
91
|
-
stateStore,
|
|
40
|
+
stateStore: new SequelizeStateStore(sequelize),
|
|
92
41
|
continueOnError: true,
|
|
93
42
|
});
|
|
94
43
|
|
|
95
|
-
// Phase 1: Prepare
|
|
96
44
|
const source = new UrlSource('https://storage.example.com/users.csv');
|
|
97
|
-
const { jobId, totalRecords, totalBatches } = await di.prepare(source, new CsvParser());
|
|
98
|
-
|
|
99
|
-
console.log(`Job ${jobId}: ${totalRecords} records in ${totalBatches} batches`);
|
|
100
|
-
|
|
101
|
-
// Fan out: send { jobId } to N workers via SQS, SNS, EventBridge, etc.
|
|
102
|
-
await sqs.sendMessage({
|
|
103
|
-
QueueUrl: WORKER_QUEUE_URL,
|
|
104
|
-
MessageBody: JSON.stringify({ jobId }),
|
|
105
|
-
});
|
|
106
|
-
```
|
|
107
|
-
|
|
108
|
-
### Worker (Phase 2)
|
|
109
|
-
|
|
110
|
-
```typescript
|
|
111
|
-
import { DistributedImport } from '@batchactions/distributed';
|
|
112
|
-
import { SequelizeStateStore } from '@batchactions/state-sequelize';
|
|
113
|
-
import { Sequelize } from 'sequelize';
|
|
114
|
-
|
|
115
|
-
// Lambda handler
|
|
116
|
-
export async function handler(event: SQSEvent, context: Context) {
|
|
117
|
-
const { jobId } = JSON.parse(event.Records[0].body);
|
|
118
|
-
const workerId = context.awsRequestId;
|
|
119
|
-
|
|
120
|
-
const sequelize = new Sequelize(process.env.DATABASE_URL!);
|
|
121
|
-
const stateStore = new SequelizeStateStore(sequelize);
|
|
122
|
-
await stateStore.initialize();
|
|
123
|
-
|
|
124
|
-
const di = new DistributedImport({
|
|
125
|
-
schema: {
|
|
126
|
-
fields: [
|
|
127
|
-
{ name: 'email', type: 'email', required: true },
|
|
128
|
-
{ name: 'name', type: 'string', required: true },
|
|
129
|
-
{ name: 'role', type: 'string', required: false, defaultValue: 'user' },
|
|
130
|
-
],
|
|
131
|
-
},
|
|
132
|
-
batchSize: 500,
|
|
133
|
-
stateStore,
|
|
134
|
-
continueOnError: true,
|
|
135
|
-
});
|
|
45
|
+
const { jobId } = await di.prepare(source, new CsvParser());
|
|
136
46
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
await db.query(
|
|
141
|
-
'INSERT INTO users (email, name, role) VALUES ($1, $2, $3)',
|
|
142
|
-
[record.email, record.name, record.role],
|
|
143
|
-
);
|
|
144
|
-
}, workerId);
|
|
145
|
-
|
|
146
|
-
if (!result.claimed) {
|
|
147
|
-
console.log('No more batches to process');
|
|
148
|
-
break;
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
console.log(`Batch ${result.batchIndex}: ${result.processedCount} processed, ${result.failedCount} failed`);
|
|
152
|
-
|
|
153
|
-
if (result.jobComplete) {
|
|
154
|
-
console.log('Job finalized by this worker!');
|
|
155
|
-
break;
|
|
156
|
-
}
|
|
157
|
-
}
|
|
47
|
+
while (true) {
|
|
48
|
+
const result = await di.processWorkerBatch(jobId, processRecord, workerId);
|
|
49
|
+
if (!result.claimed || result.jobComplete) break;
|
|
158
50
|
}
|
|
159
51
|
```
|
|
160
52
|
|
|
161
|
-
##
|
|
162
|
-
|
|
163
|
-
### `DistributedImportConfig`
|
|
164
|
-
|
|
165
|
-
| Property | Type | Default | Description |
|
|
166
|
-
|---|---|---|---|
|
|
167
|
-
| `schema` | `SchemaDefinition` | required | Field definitions and validation rules |
|
|
168
|
-
| `batchSize` | `number` | `100` | Records per batch |
|
|
169
|
-
| `continueOnError` | `boolean` | `true` | Continue when records fail validation or processing |
|
|
170
|
-
| `stateStore` | `StateStore` | required | Must implement `DistributedStateStore` (e.g. `SequelizeStateStore`) |
|
|
171
|
-
| `maxRetries` | `number` | `0` | Retry attempts for processor failures (exponential backoff) |
|
|
172
|
-
| `retryDelayMs` | `number` | `1000` | Base delay in ms between retry attempts |
|
|
173
|
-
| `hooks` | `JobHooks` | -- | Lifecycle hooks (`beforeValidate`, `afterValidate`, `beforeProcess`, `afterProcess`) |
|
|
174
|
-
| `duplicateChecker` | `DuplicateChecker` | -- | External duplicate detection |
|
|
175
|
-
| `staleBatchTimeoutMs` | `number` | `900000` | Timeout in ms before stale batches are reclaimed (15 min default) |
|
|
176
|
-
|
|
177
|
-
## API Reference
|
|
178
|
-
|
|
179
|
-
### `DistributedImport`
|
|
180
|
-
|
|
181
|
-
| Method | Description |
|
|
182
|
-
|---|---|
|
|
183
|
-
| `prepare(source, parser)` | Phase 1: Stream source, materialize records, create batches. Returns `PrepareResult`. |
|
|
184
|
-
| `processWorkerBatch(jobId, processor, workerId)` | Phase 2: Claim next batch, process records, finalize if last. Returns `DistributedBatchResult`. |
|
|
185
|
-
| `on(event, handler)` | Subscribe to a domain event. |
|
|
186
|
-
| `onAny(handler)` | Subscribe to all events. |
|
|
187
|
-
| `offAny(handler)` | Unsubscribe a wildcard handler. |
|
|
188
|
-
|
|
189
|
-
### `PrepareResult`
|
|
190
|
-
|
|
191
|
-
| Field | Type | Description |
|
|
192
|
-
|---|---|---|
|
|
193
|
-
| `jobId` | `string` | Unique job identifier. Pass this to workers. |
|
|
194
|
-
| `totalRecords` | `number` | Total records found in the source. |
|
|
195
|
-
| `totalBatches` | `number` | Number of batches created. |
|
|
196
|
-
|
|
197
|
-
### `DistributedBatchResult`
|
|
198
|
-
|
|
199
|
-
| Field | Type | Description |
|
|
200
|
-
|---|---|---|
|
|
201
|
-
| `claimed` | `boolean` | Whether a batch was successfully claimed. `false` means no batches remain. |
|
|
202
|
-
| `batchId` | `string?` | ID of the batch that was processed. |
|
|
203
|
-
| `batchIndex` | `number?` | Index of the batch that was processed. |
|
|
204
|
-
| `processedCount` | `number` | Records successfully processed in this batch. |
|
|
205
|
-
| `failedCount` | `number` | Records that failed in this batch. |
|
|
206
|
-
| `jobComplete` | `boolean` | `true` if this worker finalized the entire job. |
|
|
207
|
-
| `jobId` | `string` | The job identifier. |
|
|
208
|
-
|
|
209
|
-
## Crash Recovery
|
|
210
|
-
|
|
211
|
-
If a worker crashes or times out, its claimed batch becomes "stale". The next `processWorkerBatch()` call automatically reclaims stale batches (based on `staleBatchTimeoutMs`) before claiming new ones.
|
|
53
|
+
## Main Exports
|
|
212
54
|
|
|
213
|
-
|
|
55
|
+
- `DistributedImport`
|
|
56
|
+
- `PrepareResult`
|
|
57
|
+
- `DistributedBatchResult`, `DistributedBatchConfig`
|
|
58
|
+
- `DistributedStateStore` related types (re-exported)
|
|
59
|
+
- `isDistributedStateStore`
|
|
214
60
|
|
|
215
|
-
|
|
216
|
-
- Use `ON CONFLICT DO NOTHING` / `INSERT ... IGNORE` or similar patterns in your database writes.
|
|
61
|
+
For full typed exports, see `packages/distributed/src/index.ts`.
|
|
217
62
|
|
|
218
|
-
|
|
63
|
+
## Compatibility
|
|
219
64
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
const reclaimed = await stateStore.reclaimStaleBatches(jobId, 60_000); // 1 min timeout
|
|
225
|
-
console.log(`Reclaimed ${reclaimed} stale batches`);
|
|
226
|
-
}
|
|
227
|
-
```
|
|
228
|
-
|
|
229
|
-
## Events
|
|
230
|
-
|
|
231
|
-
Each worker has its own local event bus. Subscribe to events for logging, metrics, or progress tracking:
|
|
232
|
-
|
|
233
|
-
```typescript
|
|
234
|
-
di.on('batch:claimed', (e) => {
|
|
235
|
-
console.log(`Worker claimed batch ${e.batchIndex} of job ${e.jobId}`);
|
|
236
|
-
});
|
|
237
|
-
|
|
238
|
-
di.on('record:failed', (e) => {
|
|
239
|
-
console.error(`Record ${e.recordIndex} failed: ${e.error}`);
|
|
240
|
-
});
|
|
241
|
-
|
|
242
|
-
di.on('import:completed', (e) => {
|
|
243
|
-
// Only emitted by the worker that finalizes the job
|
|
244
|
-
console.log(`Job complete! ${e.summary.processed} processed, ${e.summary.failed} failed`);
|
|
245
|
-
});
|
|
246
|
-
|
|
247
|
-
// Forward all events (e.g. to CloudWatch, Datadog)
|
|
248
|
-
di.onAny((event) => {
|
|
249
|
-
metrics.emit(event.type, event);
|
|
250
|
-
});
|
|
251
|
-
```
|
|
252
|
-
|
|
253
|
-
**Note:** `import:completed` is emitted only by the worker that finalizes the job (exactly once).
|
|
254
|
-
|
|
255
|
-
## Architecture
|
|
256
|
-
|
|
257
|
-
```
|
|
258
|
-
@batchactions/distributed
|
|
259
|
-
├── DistributedImport.ts # Facade (composition root)
|
|
260
|
-
├── PrepareDistributedImport.ts # Phase 1 use case
|
|
261
|
-
├── ProcessDistributedBatch.ts # Phase 2 use case
|
|
262
|
-
└── index.ts # Public API
|
|
263
|
-
|
|
264
|
-
Depends on:
|
|
265
|
-
└── @batchactions/core
|
|
266
|
-
├── DistributedStateStore # Port interface (extended StateStore)
|
|
267
|
-
├── BatchReservation # Domain types
|
|
268
|
-
├── SchemaValidator # Validation pipeline
|
|
269
|
-
└── EventBus # Event system
|
|
270
|
-
|
|
271
|
-
Implemented by:
|
|
272
|
-
└── @batchactions/state-sequelize
|
|
273
|
-
└── SequelizeStateStore # Concrete DistributedStateStore
|
|
274
|
-
├── bulkimport_jobs # Job state table
|
|
275
|
-
├── bulkimport_records # Record data table
|
|
276
|
-
└── bulkimport_batches # Batch metadata table (distributed)
|
|
277
|
-
```
|
|
278
|
-
|
|
279
|
-
## Implementing a Custom `DistributedStateStore`
|
|
280
|
-
|
|
281
|
-
If you don't use Sequelize, you can implement the `DistributedStateStore` interface:
|
|
282
|
-
|
|
283
|
-
```typescript
|
|
284
|
-
import type { DistributedStateStore, ClaimBatchResult, DistributedJobStatus, ProcessedRecord } from '@batchactions/distributed';
|
|
285
|
-
|
|
286
|
-
class MyDistributedStore implements DistributedStateStore {
|
|
287
|
-
// ... all StateStore methods plus:
|
|
288
|
-
|
|
289
|
-
async claimBatch(jobId: string, workerId: string): Promise<ClaimBatchResult> {
|
|
290
|
-
// Atomic: find first PENDING batch, set to PROCESSING with workerId
|
|
291
|
-
// Use SELECT FOR UPDATE SKIP LOCKED or similar
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
async releaseBatch(jobId: string, batchId: string, workerId: string): Promise<void> {
|
|
295
|
-
// Reset batch to PENDING (only if claimed by this worker)
|
|
296
|
-
}
|
|
297
|
-
|
|
298
|
-
async reclaimStaleBatches(jobId: string, timeoutMs: number): Promise<number> {
|
|
299
|
-
// Find PROCESSING batches with claimedAt older than timeout, reset to PENDING
|
|
300
|
-
}
|
|
301
|
-
|
|
302
|
-
async saveBatchRecords(jobId: string, batchId: string, records: readonly ProcessedRecord[]): Promise<void> {
|
|
303
|
-
// Bulk insert records for a batch
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
async getBatchRecords(jobId: string, batchId: string): Promise<readonly ProcessedRecord[]> {
|
|
307
|
-
// Load all records for a batch
|
|
308
|
-
}
|
|
65
|
+
- Node.js >= 20.0.0
|
|
66
|
+
- Peer dependencies:
|
|
67
|
+
- `@batchactions/core` >= 0.0.1
|
|
68
|
+
- `@batchactions/import` >= 0.0.1
|
|
309
69
|
|
|
310
|
-
|
|
311
|
-
// Aggregate: count batches by status
|
|
312
|
-
}
|
|
70
|
+
## Operational Notes
|
|
313
71
|
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
}
|
|
318
|
-
}
|
|
319
|
-
```
|
|
72
|
+
- Worker processors must be idempotent.
|
|
73
|
+
- Stale claimed batches are reclaimed automatically based on `staleBatchTimeoutMs`.
|
|
74
|
+
- Job finalization is exactly-once via `tryFinalizeJob()` in the store.
|
|
320
75
|
|
|
321
|
-
##
|
|
76
|
+
## Links
|
|
322
77
|
|
|
323
|
-
-
|
|
324
|
-
-
|
|
325
|
-
-
|
|
78
|
+
- Repository: https://github.com/vgpastor/batchactions/tree/main/packages/distributed
|
|
79
|
+
- Issues: https://github.com/vgpastor/batchactions/issues
|
|
80
|
+
- Contributing guide: https://github.com/vgpastor/batchactions/blob/main/CONTRIBUTING.md
|
|
326
81
|
|
|
327
82
|
## License
|
|
328
83
|
|
|
329
|
-
|
|
84
|
+
MIT
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@batchactions/distributed",
|
|
3
|
-
"version": "0.0.2",
|
|
4
|
-
"description": "Distributed parallel batch processing for @batchactions/core",
|
|
3
|
+
"version": "0.0.3",
|
|
4
|
+
"description": "Distributed parallel batch processing for @batchactions/core with fan-out workers",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"exports": {
|
|
7
7
|
".": {
|
|
@@ -22,7 +22,10 @@
|
|
|
22
22
|
"test": "vitest run",
|
|
23
23
|
"test:watch": "vitest",
|
|
24
24
|
"typecheck": "tsc --noEmit",
|
|
25
|
-
"lint": "eslint src/ tests/"
|
|
25
|
+
"lint": "eslint src/ tests/",
|
|
26
|
+
"lint:fix": "eslint src/ tests/ --fix",
|
|
27
|
+
"format": "prettier --write .",
|
|
28
|
+
"format:check": "prettier --check ."
|
|
26
29
|
},
|
|
27
30
|
"keywords": [
|
|
28
31
|
"batchactions",
|
|
@@ -34,7 +37,11 @@
|
|
|
34
37
|
"worker"
|
|
35
38
|
],
|
|
36
39
|
"license": "MIT",
|
|
37
|
-
"author": "
|
|
40
|
+
"author": "Victor Garcia <vgpastor@ingenierosweb.co>",
|
|
41
|
+
"homepage": "https://github.com/vgpastor/batchactions/tree/main/packages/distributed#readme",
|
|
42
|
+
"bugs": {
|
|
43
|
+
"url": "https://github.com/vgpastor/batchactions/issues"
|
|
44
|
+
},
|
|
38
45
|
"repository": {
|
|
39
46
|
"type": "git",
|
|
40
47
|
"url": "https://github.com/vgpastor/batchactions",
|
|
@@ -51,12 +58,12 @@
|
|
|
51
58
|
"@batchactions/core": "*",
|
|
52
59
|
"@batchactions/import": "*",
|
|
53
60
|
"@types/node": "^22.13.4",
|
|
54
|
-
"@typescript-eslint/eslint-plugin": "^8.
|
|
55
|
-
"@typescript-eslint/parser": "^8.
|
|
56
|
-
"eslint": "^9.
|
|
61
|
+
"@typescript-eslint/eslint-plugin": "^8.56.0",
|
|
62
|
+
"@typescript-eslint/parser": "^8.56.0",
|
|
63
|
+
"eslint": "^9.21.0",
|
|
57
64
|
"eslint-config-prettier": "^10.0.1",
|
|
58
|
-
"tsup": "^8.
|
|
65
|
+
"tsup": "^8.5.1",
|
|
59
66
|
"typescript": "^5.7.3",
|
|
60
|
-
"vitest": "^
|
|
67
|
+
"vitest": "^4.0.0"
|
|
61
68
|
}
|
|
62
69
|
}
|