@batchactions/distributed 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,143 @@
1
+ import { StateStore, JobHooks, DataSource, RecordProcessorFn, EventType, EventPayload, DomainEvent } from '@batchactions/core';
2
+ export { BatchReservation, ClaimBatchFailureReason, ClaimBatchResult, DistributedJobStatus, DistributedStateStore, isDistributedStateStore } from '@batchactions/core';
3
+ import { SchemaDefinition, DuplicateChecker, SourceParser } from '@batchactions/import';
4
+
5
+ /** Result of the prepare phase for distributed processing. */
6
+ interface PrepareResult {
7
+ /** Unique job identifier. Use this to dispatch workers. */
8
+ readonly jobId: string;
9
+ /** Total number of records found in the source. */
10
+ readonly totalRecords: number;
11
+ /** Total number of batches created. */
12
+ readonly totalBatches: number;
13
+ }
14
+
15
+ /** Result of processing a single distributed batch. */
16
+ interface DistributedBatchResult {
17
+ /** Whether a batch was successfully claimed. */
18
+ readonly claimed: boolean;
19
+ /** Batch ID that was processed (only if claimed). */
20
+ readonly batchId?: string;
21
+ /** Batch index that was processed (only if claimed). */
22
+ readonly batchIndex?: number;
23
+ /** Records successfully processed in this batch. */
24
+ readonly processedCount: number;
25
+ /** Records that failed in this batch. */
26
+ readonly failedCount: number;
27
+ /** Whether this worker finalized the entire job. */
28
+ readonly jobComplete: boolean;
29
+ /** The job identifier. */
30
+ readonly jobId: string;
31
+ }
32
+ /** Configuration for the distributed batch processor. */
33
+ interface DistributedBatchConfig {
34
+ readonly schema: SchemaDefinition;
35
+ readonly stateStore: StateStore;
36
+ readonly continueOnError?: boolean;
37
+ readonly maxRetries?: number;
38
+ readonly retryDelayMs?: number;
39
+ readonly hooks?: JobHooks;
40
+ readonly duplicateChecker?: DuplicateChecker;
41
+ }
42
+
43
+ /** Configuration for distributed import processing. */
44
+ interface DistributedImportConfig {
45
+ /** Schema definition for validation. */
46
+ readonly schema: SchemaDefinition;
47
+ /** Number of records per batch. Default: 100. */
48
+ readonly batchSize?: number;
49
+ /** Whether to continue processing on record errors. Default: true. */
50
+ readonly continueOnError?: boolean;
51
+ /**
52
+ * State store that implements `DistributedStateStore`.
53
+  * Required — must support atomic batch claiming. Note: the property is declared
+  * as the broader `StateStore`; use the exported `isDistributedStateStore` guard
+  * to verify the implementation at runtime.
54
+ */
55
+ readonly stateStore: StateStore;
56
+ /** Maximum retry attempts for processor failures. Default: 0. */
57
+ readonly maxRetries?: number;
58
+ /** Base delay in ms for retry backoff. Default: 1000. */
59
+ readonly retryDelayMs?: number;
60
+ /** Optional lifecycle hooks. */
61
+ readonly hooks?: JobHooks;
62
+ /** Optional external duplicate detection. */
63
+ readonly duplicateChecker?: DuplicateChecker;
64
+ /**
65
+ * Timeout in ms for stale batch reclamation. Default: 900000 (15 min).
66
+ * Batches stuck in PROCESSING longer than this are reclaimed for other workers.
67
+ */
68
+ readonly staleBatchTimeoutMs?: number;
69
+ }
70
+ /**
71
+ * Facade for distributed parallel batch processing.
72
+ *
73
+ * Two-phase processing model:
74
+ * 1. **Prepare** (single orchestrator): streams the source file, materializes
75
+ * records in the StateStore, and registers batch boundaries.
76
+ * 2. **Process** (N parallel workers): each worker calls `processWorkerBatch()`
77
+ * in a loop to claim and process batches until none remain.
78
+ *
79
+ * @example
80
+ * ```typescript
81
+ * // === Orchestrator Lambda ===
82
+ * const di = new DistributedImport(config);
83
+ * const { jobId, totalBatches } = await di.prepare(source, parser);
84
+ * // Fan out: send { jobId } to N worker Lambdas via SQS
85
+ *
86
+ * // === Worker Lambda ===
87
+ * const di = new DistributedImport(config);
88
+ * const workerId = context.awsRequestId;
89
+ * while (true) {
90
+ * const result = await di.processWorkerBatch(jobId, processor, workerId);
91
+ * if (!result.claimed || result.jobComplete) break;
92
+ * }
93
+ * ```
94
+ */
95
+ declare class DistributedImport {
96
+ private readonly config;
97
+ private readonly eventBus;
98
+ constructor(config: DistributedImportConfig);
99
+ /**
100
+ * Phase 1: Prepare the job for distributed processing.
101
+ *
102
+ * Streams the entire source, materializes all records in the StateStore,
103
+ * and creates batch metadata. Call this from a single orchestrator.
104
+ *
105
+ * @param source - Data source to read from.
106
+ * @param parser - Parser for the source format.
107
+ * @returns Preparation result with jobId, totalRecords, totalBatches.
108
+ */
109
+ prepare(source: DataSource, parser: SourceParser): Promise<PrepareResult>;
110
+ /**
111
+ * Phase 2: Claim and process the next available batch.
112
+ *
113
+ * Atomically claims an unclaimed batch, loads its records from the
114
+ * StateStore, validates and processes them. Returns immediately if
115
+ * no batches are available.
116
+ *
117
+ * Before claiming, reclaims any stale batches that have been stuck
118
+ * in PROCESSING longer than `staleBatchTimeoutMs`.
119
+ *
120
+ * Call this from each worker in a loop until `claimed` is `false`
121
+ * or `jobComplete` is `true`.
122
+ *
123
+ * @param jobId - The job ID returned by `prepare()`.
124
+ * @param processor - Callback invoked for each valid record. Must be idempotent.
125
+ * @param workerId - Unique identifier for this worker (e.g. Lambda request ID).
126
+ * @returns Result with batch details, counts, and completion status.
127
+ */
128
+ processWorkerBatch(jobId: string, processor: RecordProcessorFn, workerId: string): Promise<DistributedBatchResult>;
129
+ /**
130
+ * Subscribe to a specific domain event type.
131
+ *
132
+ * Events are local to this `DistributedImport` instance. Each worker
133
+ * has its own event bus. The `job:completed` event is only emitted
134
+ * by the worker that finalizes the job (exactly-once).
135
+ */
136
+ on<T extends EventType>(type: T, handler: (event: EventPayload<T>) => void): this;
137
+ /** Subscribe to all domain events. */
138
+ onAny(handler: (event: DomainEvent) => void): this;
139
+ /** Unsubscribe a wildcard handler. */
140
+ offAny(handler: (event: DomainEvent) => void): this;
141
+ }
142
+
143
+ export { type DistributedBatchConfig, type DistributedBatchResult, DistributedImport, type DistributedImportConfig, type PrepareResult };