glost-processor 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/stream-processor.d.ts +145 -0
- package/dist/stream-processor.d.ts.map +1 -0
- package/dist/stream-processor.js +245 -0
- package/dist/stream-processor.js.map +1 -0
- package/package.json +2 -2
- package/src/__benchmarks__/stream-processor.bench.ts +250 -0
- package/src/__tests__/processor.test.ts +992 -0
- package/src/__tests__/stream-processor.test.ts +574 -0
- package/src/index.ts +6 -0
- package/src/stream-processor.ts +391 -0
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GLOST Stream Processor
|
|
3
|
+
*
|
|
4
|
+
* Streaming variant of GLOSTProcessor that yields processed sentence
|
|
5
|
+
* batches progressively using AsyncGenerator. Keeps the full document
|
|
6
|
+
* out of memory between chunks.
|
|
7
|
+
*
|
|
8
|
+
* Document-level transforms (extensions with streamingSupport !== 'chunk')
|
|
9
|
+
* run once on the full document before streaming begins. Chunk-compatible
|
|
10
|
+
* extensions (streamingSupport === 'chunk') then run on each batch.
|
|
11
|
+
*
|
|
12
|
+
* @packageDocumentation
|
|
13
|
+
*
|
|
14
|
+
* @example
|
|
15
|
+
* ```typescript
|
|
16
|
+
* import { GLOSTStreamProcessor } from "glost-processor";
|
|
17
|
+
*
|
|
18
|
+
* const processor = new GLOSTStreamProcessor()
|
|
19
|
+
* .use(transcription)
|
|
20
|
+
* .use(translation);
|
|
21
|
+
*
|
|
22
|
+
* for await (const chunk of processor.stream(document)) {
|
|
23
|
+
* console.log(chunk.sentences, chunk.isLast);
|
|
24
|
+
* }
|
|
25
|
+
* ```
|
|
26
|
+
*
|
|
27
|
+
* @since 0.7.0
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
import type { GLOSTRoot, GLOSTSentence } from "glost-core";
|
|
31
|
+
import type { GLOSTExtension } from "glost-extensions";
|
|
32
|
+
import {
|
|
33
|
+
processGLOSTWithExtensionsAsync,
|
|
34
|
+
processGLOSTChunkAsync,
|
|
35
|
+
extensionRegistry,
|
|
36
|
+
} from "glost-extensions";
|
|
37
|
+
import type { PluginSpec, Preset, ProcessorOptions } from "./types.js";
|
|
38
|
+
|
|
39
|
+
// ============================================================================
|
|
40
|
+
// Public types
|
|
41
|
+
// ============================================================================
|
|
42
|
+
|
|
43
|
+
/**
 * Options for stream() method
 *
 * Controls how `GLOSTStreamProcessor.stream()` batches sentences.
 *
 * @since 0.7.0
 */
export interface StreamOptions {
  /**
   * Number of sentences per chunk.
   *
   * A smaller value reduces latency to first yielded chunk; a larger
   * value amortises per-chunk overhead. Default: 50.
   */
  batchSize?: number;
}
|
|
57
|
+
|
|
58
|
+
/**
 * A single yielded chunk from the stream
 *
 * @since 0.7.0
 */
export interface ProcessedChunk {
  /** Processed sentences in this batch */
  sentences: GLOSTSentence[];

  /**
   * Index of the source paragraph this chunk came from.
   *
   * When the document has multiple paragraphs each paragraph is
   * chunked independently, so multiple consecutive chunks may share
   * the same paragraphIndex.
   *
   * NOTE: this indexes non-empty sentence groups, not raw paragraph
   * positions — paragraphs containing no sentences are skipped, and
   * sentences found directly under the document root are grouped
   * together as index 0 ahead of all paragraph groups.
   */
  paragraphIndex: number;

  /**
   * Index of this chunk within its paragraph (0-based).
   */
  chunkIndex: number;

  /** True for the final chunk across the whole document */
  isLast: boolean;
}
|
|
84
|
+
|
|
85
|
+
// ============================================================================
|
|
86
|
+
// GLOSTStreamProcessor
|
|
87
|
+
// ============================================================================
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Streaming processor for GLOST documents
|
|
91
|
+
*
|
|
92
|
+
* Mirrors the `GLOSTProcessor` API (`.use()`, `.freeze()`) but adds a
|
|
93
|
+
* `.stream()` method that returns an `AsyncGenerator<ProcessedChunk>`.
|
|
94
|
+
*
|
|
95
|
+
* @since 0.7.0
|
|
96
|
+
*/
|
|
97
|
+
export class GLOSTStreamProcessor {
|
|
98
|
+
private plugins: Array<{ spec: PluginSpec; options?: unknown }> = [];
|
|
99
|
+
private options: ProcessorOptions = {};
|
|
100
|
+
private frozen = false;
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Create a new stream processor instance
|
|
104
|
+
*
|
|
105
|
+
* @param options - Initial processor options
|
|
106
|
+
*/
|
|
107
|
+
constructor(options: ProcessorOptions = {}) {
|
|
108
|
+
this.options = { ...options };
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* Add a plugin, preset, or extension to the pipeline
|
|
113
|
+
*
|
|
114
|
+
* @param spec - Plugin function, extension object, preset, or ID
|
|
115
|
+
* @param options - Plugin options
|
|
116
|
+
* @returns This processor for chaining
|
|
117
|
+
*/
|
|
118
|
+
use(spec: PluginSpec | Preset, options?: unknown): this {
|
|
119
|
+
this.assertNotFrozen();
|
|
120
|
+
|
|
121
|
+
if (this.isPreset(spec)) {
|
|
122
|
+
return this.usePreset(spec);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
this.plugins.push({ spec, options });
|
|
126
|
+
return this;
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Freeze the processor
|
|
131
|
+
*
|
|
132
|
+
* Returns a frozen processor that cannot be modified. Useful for
|
|
133
|
+
* reusing the same pipeline configuration across multiple documents.
|
|
134
|
+
*
|
|
135
|
+
* @returns A frozen copy of this processor
|
|
136
|
+
*/
|
|
137
|
+
freeze(): FrozenStreamProcessor {
|
|
138
|
+
const frozen = new GLOSTStreamProcessor(this.options);
|
|
139
|
+
frozen.plugins = [...this.plugins];
|
|
140
|
+
(frozen as unknown as { frozen: boolean }).frozen = true;
|
|
141
|
+
return frozen as unknown as FrozenStreamProcessor;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/**
|
|
145
|
+
* Stream a document as progressive sentence batches
|
|
146
|
+
*
|
|
147
|
+
* Processing phases:
|
|
148
|
+
* 1. All extensions with `streamingSupport !== 'chunk'` (i.e. `'none'`
|
|
149
|
+
* or `'full'`, or unset) run their `transform`, `visit`, and
|
|
150
|
+
* `enhanceMetadata` hooks on the **full** document.
|
|
151
|
+
* 2. The resulting document is split into sentence batches.
|
|
152
|
+
* 3. For each batch, extensions with `streamingSupport === 'chunk'`
|
|
153
|
+
* run their `visit` and `enhanceMetadata` hooks.
|
|
154
|
+
* 4. A `ProcessedChunk` is yielded.
|
|
155
|
+
*
|
|
156
|
+
* Cancellation: break out of the `for await` loop at any time. The
|
|
157
|
+
* generator will stop without processing remaining chunks.
|
|
158
|
+
*
|
|
159
|
+
* @param document - GLOST document to stream
|
|
160
|
+
* @param streamOptions - Streaming options (batchSize etc.)
|
|
161
|
+
* @yields `ProcessedChunk` objects in document order
|
|
162
|
+
*
|
|
163
|
+
* @example
|
|
164
|
+
* ```typescript
|
|
165
|
+
* for await (const chunk of processor.stream(doc, { batchSize: 20 })) {
|
|
166
|
+
* console.log(`para ${chunk.paragraphIndex} chunk ${chunk.chunkIndex}`);
|
|
167
|
+
* if (chunk.isLast) console.log("done");
|
|
168
|
+
* }
|
|
169
|
+
* ```
|
|
170
|
+
*/
|
|
171
|
+
async *stream(
|
|
172
|
+
document: GLOSTRoot,
|
|
173
|
+
streamOptions?: StreamOptions,
|
|
174
|
+
): AsyncGenerator<ProcessedChunk> {
|
|
175
|
+
const batchSize = streamOptions?.batchSize ?? 50;
|
|
176
|
+
const extensions = await this.resolveExtensions();
|
|
177
|
+
|
|
178
|
+
// Split extensions into doc-level and chunk-level
|
|
179
|
+
const docExtensions = extensions.filter(
|
|
180
|
+
(e) => e.streamingSupport !== "chunk",
|
|
181
|
+
);
|
|
182
|
+
const chunkExtensions = extensions.filter(
|
|
183
|
+
(e) => e.streamingSupport === "chunk",
|
|
184
|
+
);
|
|
185
|
+
|
|
186
|
+
// Phase 1: run doc-level transforms on the full document
|
|
187
|
+
let processedDoc = document;
|
|
188
|
+
if (docExtensions.length > 0) {
|
|
189
|
+
type OptionsWithData = ProcessorOptions & { data?: unknown };
|
|
190
|
+
const { data: _data, ...extOptions } =
|
|
191
|
+
this.options as OptionsWithData;
|
|
192
|
+
const result = await processGLOSTWithExtensionsAsync(
|
|
193
|
+
processedDoc,
|
|
194
|
+
docExtensions,
|
|
195
|
+
extOptions,
|
|
196
|
+
);
|
|
197
|
+
processedDoc = result.document;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// Phase 2: collect all sentences grouped by paragraph index
|
|
201
|
+
const paragraphSentences = collectSentencesByParagraph(processedDoc);
|
|
202
|
+
|
|
203
|
+
if (paragraphSentences.length === 0) {
|
|
204
|
+
return;
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
// Build a flat list of (paragraphIndex, chunkIndex, sentences) so
|
|
208
|
+
// we know the total chunk count upfront and can set isLast correctly.
|
|
209
|
+
type ChunkDescriptor = {
|
|
210
|
+
paragraphIndex: number;
|
|
211
|
+
chunkIndex: number;
|
|
212
|
+
sentences: GLOSTSentence[];
|
|
213
|
+
};
|
|
214
|
+
|
|
215
|
+
const allChunks: ChunkDescriptor[] = [];
|
|
216
|
+
|
|
217
|
+
for (
|
|
218
|
+
let pIdx = 0;
|
|
219
|
+
pIdx < paragraphSentences.length;
|
|
220
|
+
pIdx++
|
|
221
|
+
) {
|
|
222
|
+
const sentences = paragraphSentences[pIdx]!;
|
|
223
|
+
let chunkIndex = 0;
|
|
224
|
+
|
|
225
|
+
for (
|
|
226
|
+
let offset = 0;
|
|
227
|
+
offset < sentences.length;
|
|
228
|
+
offset += batchSize
|
|
229
|
+
) {
|
|
230
|
+
allChunks.push({
|
|
231
|
+
paragraphIndex: pIdx,
|
|
232
|
+
chunkIndex,
|
|
233
|
+
sentences: sentences.slice(offset, offset + batchSize),
|
|
234
|
+
});
|
|
235
|
+
chunkIndex++;
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
const totalChunks = allChunks.length;
|
|
240
|
+
|
|
241
|
+
// Phase 3: yield each chunk, optionally running chunk-level extensions
|
|
242
|
+
for (let i = 0; i < totalChunks; i++) {
|
|
243
|
+
const descriptor = allChunks[i]!;
|
|
244
|
+
|
|
245
|
+
let processedSentences = descriptor.sentences;
|
|
246
|
+
|
|
247
|
+
if (chunkExtensions.length > 0) {
|
|
248
|
+
type OptionsWithData = ProcessorOptions & { data?: unknown };
|
|
249
|
+
const { data: _data, ...extOptions } =
|
|
250
|
+
this.options as OptionsWithData;
|
|
251
|
+
processedSentences = await processGLOSTChunkAsync(
|
|
252
|
+
processedSentences,
|
|
253
|
+
chunkExtensions,
|
|
254
|
+
extOptions,
|
|
255
|
+
);
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
yield {
|
|
259
|
+
sentences: processedSentences,
|
|
260
|
+
paragraphIndex: descriptor.paragraphIndex,
|
|
261
|
+
chunkIndex: descriptor.chunkIndex,
|
|
262
|
+
isLast: i === totalChunks - 1,
|
|
263
|
+
};
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
// =====================================================================
|
|
268
|
+
// Private helpers
|
|
269
|
+
// =====================================================================
|
|
270
|
+
|
|
271
|
+
private usePreset(preset: Preset): this {
|
|
272
|
+
for (const entry of preset.plugins) {
|
|
273
|
+
if (Array.isArray(entry)) {
|
|
274
|
+
const [plugin, opts] = entry;
|
|
275
|
+
this.use(plugin, opts);
|
|
276
|
+
} else {
|
|
277
|
+
this.use(entry);
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
return this;
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
private async resolveExtensions(): Promise<GLOSTExtension[]> {
|
|
284
|
+
const result: GLOSTExtension[] = [];
|
|
285
|
+
for (const { spec, options } of this.plugins) {
|
|
286
|
+
const ext = await this.resolvePlugin(spec, options);
|
|
287
|
+
if (ext) {
|
|
288
|
+
result.push(ext);
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
return result;
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
private async resolvePlugin(
|
|
295
|
+
spec: PluginSpec,
|
|
296
|
+
options?: unknown,
|
|
297
|
+
): Promise<GLOSTExtension | null> {
|
|
298
|
+
if (typeof spec === "string") {
|
|
299
|
+
const ext = extensionRegistry.get(spec);
|
|
300
|
+
if (!ext) {
|
|
301
|
+
throw new Error(`Plugin "${spec}" not found in registry`);
|
|
302
|
+
}
|
|
303
|
+
return ext;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
if (typeof spec === "function") {
|
|
307
|
+
const result = (spec as (opts?: unknown) => GLOSTExtension | void)(
|
|
308
|
+
options,
|
|
309
|
+
);
|
|
310
|
+
return result ?? null;
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
return spec as GLOSTExtension;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
private isPreset(spec: unknown): spec is Preset {
|
|
317
|
+
return (
|
|
318
|
+
spec !== null &&
|
|
319
|
+
typeof spec === "object" &&
|
|
320
|
+
"plugins" in (spec as object) &&
|
|
321
|
+
Array.isArray((spec as Preset).plugins)
|
|
322
|
+
);
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
private assertNotFrozen(): void {
|
|
326
|
+
if (this.frozen) {
|
|
327
|
+
throw new Error("Cannot modify frozen stream processor");
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
// ============================================================================
|
|
333
|
+
// Frozen type
|
|
334
|
+
// ============================================================================
|
|
335
|
+
|
|
336
|
+
/**
 * A frozen `GLOSTStreamProcessor` that cannot be modified.
 *
 * Produced by `GLOSTStreamProcessor.freeze()`: the mutating methods
 * (`use`, `freeze`) are removed from the type, and a readonly `frozen`
 * flag narrowed to the literal `true` is exposed.
 *
 * @since 0.7.0
 */
export type FrozenStreamProcessor = Omit<
  GLOSTStreamProcessor,
  "use" | "freeze"
> & { readonly frozen: true };
|
|
345
|
+
|
|
346
|
+
// ============================================================================
|
|
347
|
+
// Internal helpers
|
|
348
|
+
// ============================================================================
|
|
349
|
+
|
|
350
|
+
/**
|
|
351
|
+
* Collect all sentences from a GLOSTRoot, grouped by paragraph index.
|
|
352
|
+
*
|
|
353
|
+
* Only `SentenceNode` children of `ParagraphNode` children are
|
|
354
|
+
* collected. Sentences that appear directly under the root (without a
|
|
355
|
+
* wrapping paragraph) are collected as a single synthetic group at
|
|
356
|
+
* index 0.
|
|
357
|
+
*
|
|
358
|
+
* @internal
|
|
359
|
+
*/
|
|
360
|
+
function collectSentencesByParagraph(
|
|
361
|
+
document: GLOSTRoot,
|
|
362
|
+
): GLOSTSentence[][] {
|
|
363
|
+
const groups: GLOSTSentence[][] = [];
|
|
364
|
+
|
|
365
|
+
// Sentences that sit directly under the root (no paragraph wrapper)
|
|
366
|
+
const rootSentences: GLOSTSentence[] = [];
|
|
367
|
+
|
|
368
|
+
for (const child of document.children) {
|
|
369
|
+
if (child.type === "ParagraphNode" && "children" in child) {
|
|
370
|
+
const para = child as { type: string; children: unknown[] };
|
|
371
|
+
const sentences = para.children.filter(
|
|
372
|
+
(c): c is GLOSTSentence =>
|
|
373
|
+
typeof c === "object" &&
|
|
374
|
+
c !== null &&
|
|
375
|
+
(c as { type: string }).type === "SentenceNode",
|
|
376
|
+
);
|
|
377
|
+
if (sentences.length > 0) {
|
|
378
|
+
groups.push(sentences);
|
|
379
|
+
}
|
|
380
|
+
} else if (child.type === "SentenceNode") {
|
|
381
|
+
rootSentences.push(child as unknown as GLOSTSentence);
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
// Prepend root-level sentences as paragraph 0 (if any)
|
|
386
|
+
if (rootSentences.length > 0) {
|
|
387
|
+
groups.unshift(rootSentences);
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
return groups;
|
|
391
|
+
}
|