glost-processor 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,574 @@
1
+ /**
2
+ * GLOSTStreamProcessor Tests
3
+ *
4
+ * Comprehensive test suite for the streaming processor that yields
5
+ * processed sentence batches progressively.
6
+ */
7
+
8
+ import { describe, it, expect, vi } from "vitest";
9
+ import {
10
+ GLOSTStreamProcessor,
11
+ type ProcessedChunk,
12
+ type StreamOptions,
13
+ } from "../stream-processor.js";
14
+ import type { GLOSTExtension } from "glost-extensions";
15
+ import type {
16
+ GLOSTRoot,
17
+ GLOSTSentence,
18
+ GLOSTParagraph,
19
+ GLOSTWord,
20
+ } from "glost-core";
21
+
22
+ // ============================================================================
23
+ // Helpers
24
+ // ============================================================================
25
+
26
/**
 * Build a minimal word fixture whose surface form is `text`.
 *
 * The node carries empty `metadata`/`extras` and a single text child holding
 * the same string. The literal is cast because it is only a partial stand-in
 * for the real `GLOSTWord` shape.
 *
 * @param text - Surface form used for both the word and its text child.
 * @returns The fixture, typed as a `GLOSTWord`.
 */
function makeWord(text: string): GLOSTWord {
  const textLeaf = { type: "TextNode", value: text };
  const wordFixture = {
    type: "WordNode",
    text: text,
    metadata: {},
    extras: {},
    children: [textLeaf],
  };
  return wordFixture as unknown as GLOSTWord;
}
35
+
36
/**
 * Wrap the given words in a sentence fixture.
 *
 * The `words` array is used as the node's `children` as-is (not copied).
 *
 * @param words - Word nodes that make up the sentence.
 * @returns The fixture, typed as a `GLOSTSentence`.
 */
function makeSentence(words: GLOSTWord[]): GLOSTSentence {
  const sentenceFixture = {
    type: "SentenceNode",
    children: words,
    metadata: {},
    extras: {},
  };
  return sentenceFixture as unknown as GLOSTSentence;
}
44
+
45
/**
 * Wrap the given sentences in a paragraph fixture.
 *
 * The `sentences` array is used as the node's `children` as-is (not copied).
 *
 * @param sentences - Sentence nodes that make up the paragraph.
 * @returns The fixture, typed as a `GLOSTParagraph`.
 */
function makeParagraph(sentences: GLOSTSentence[]): GLOSTParagraph {
  const paragraphFixture = {
    type: "ParagraphNode",
    children: sentences,
    metadata: {},
    extras: {},
  };
  return paragraphFixture as unknown as GLOSTParagraph;
}
53
+
54
/**
 * Assemble a root document fixture from paragraphs.
 *
 * @param paragraphs - Paragraph nodes used as the root's `children` (not copied).
 * @param lang - Language tag stored on the root; defaults to `"en"`.
 * @returns A root node with empty `metadata` and `extras`.
 */
function makeDocument(paragraphs: GLOSTParagraph[], lang = "en"): GLOSTRoot {
  const root: GLOSTRoot = {
    type: "RootNode",
    lang,
    children: paragraphs,
    metadata: {},
    extras: {},
  };
  return root;
}
66
+
67
/**
 * Create a document with `sentenceCount` one-word sentences spread as evenly
 * as possible across `paragraphCount` paragraphs.
 *
 * Earlier paragraphs receive the remainder, so paragraph sizes differ by at
 * most one (e.g. 5 sentences over 4 paragraphs -> 2, 1, 1, 1). When there are
 * fewer sentences than requested paragraphs, only as many paragraphs as
 * sentences are created, so no paragraph is empty. A zero-sentence document
 * still contains a single empty paragraph.
 *
 * Each word's text is `word_p<paragraph>_s<sentence>`, where the sentence
 * index restarts at 0 in every paragraph.
 *
 * @param sentenceCount - Total number of sentences; a non-negative integer.
 * @param paragraphCount - Number of paragraphs to spread over; an integer >= 1.
 * @returns The assembled root document.
 * @throws RangeError if either count is not an integer in its valid range.
 */
function makeLargeDocument(
  sentenceCount: number,
  paragraphCount = 1,
): GLOSTRoot {
  if (!Number.isInteger(sentenceCount) || sentenceCount < 0) {
    throw new RangeError(
      `sentenceCount must be a non-negative integer, got ${sentenceCount}`,
    );
  }
  if (!Number.isInteger(paragraphCount) || paragraphCount < 1) {
    throw new RangeError(
      `paragraphCount must be a positive integer, got ${paragraphCount}`,
    );
  }

  // Cap at sentenceCount so no paragraph ends up empty, but keep one
  // (empty) paragraph for a zero-sentence document.
  const usedParagraphs = Math.max(1, Math.min(paragraphCount, sentenceCount));
  const baseSize = Math.floor(sentenceCount / usedParagraphs);
  const remainder = sentenceCount % usedParagraphs;

  const paragraphs: GLOSTParagraph[] = Array.from(
    { length: usedParagraphs },
    (_p, p) => {
      // The first `remainder` paragraphs take one extra sentence each.
      const size = baseSize + (p < remainder ? 1 : 0);
      const sentences: GLOSTSentence[] = Array.from(
        { length: size },
        (_s, s) => makeSentence([makeWord(`word_p${p}_s${s}`)]),
      );
      return makeParagraph(sentences);
    },
  );

  return makeDocument(paragraphs);
}
92
+
93
// Identity extension: hands the tree back untouched. `streamingSupport` is
// left unset, which the tests below treat as doc-level ('none').
const noopExtension: GLOSTExtension = {
  id: "noop",
  name: "No-op",
  transform(tree) {
    return tree;
  },
};
99
+
100
// Chunk-level extension: for every visited word, replaces `extras` with a
// copy that additionally carries `transcription: "[<word text>]"`.
const chunkTranscriptionExtension: GLOSTExtension = {
  id: "chunk-transcription",
  name: "Chunk Transcription",
  streamingSupport: "chunk",
  visit: {
    word: (node) => {
      // View the node through the two fields this fixture touches.
      const word = node as unknown as {
        text: string;
        extras: Record<string, unknown>;
      };
      const transcription = `[${word.text}]`;
      word.extras = { ...word.extras, transcription };
    },
  },
};
114
+
115
// Doc-level transform extension (`streamingSupport` left unset, which the
// tests below treat as 'none').
const docTransformExtension: GLOSTExtension = {
  id: "doc-transform",
  name: "Doc Transform",
  transform: (tree) => {
    // Return a shallow copy whose `extras` is flagged, so a caller can tell
    // the transform ran; the input tree itself is not modified.
    const flaggedExtras = { ...tree.extras, docTransformRan: true };
    return { ...tree, extras: flaggedExtras };
  },
};
127
+
128
+ // ============================================================================
129
+ // Tests
130
+ // ============================================================================
131
+
132
+ describe("GLOSTStreamProcessor", () => {
133
// Construction smoke tests: with and without an options object.
describe("constructor", () => {
  it("creates an instance", () => {
    expect(new GLOSTStreamProcessor()).toBeInstanceOf(GLOSTStreamProcessor);
  });

  it("accepts options", () => {
    const withOptions = new GLOSTStreamProcessor({ lenient: true });
    expect(withOptions).toBeInstanceOf(GLOSTStreamProcessor);
  });
});
144
+
145
// use(): chaining, the frozen-processor guard, and preset registration.
describe("use()", () => {
  it("returns this for chaining", () => {
    const processor = new GLOSTStreamProcessor();
    expect(processor.use(noopExtension)).toBe(processor);
  });

  it("throws when modifying a frozen processor", () => {
    // Cast back to the mutable class so use() can be attempted at all
    // (presumably the frozen type does not expose it).
    const thawedView =
      new GLOSTStreamProcessor().freeze() as unknown as GLOSTStreamProcessor;
    const attemptUse = () => thawedView.use(noopExtension);
    expect(attemptUse).toThrow("Cannot modify frozen stream processor");
  });

  it("accepts presets", () => {
    const processor = new GLOSTStreamProcessor();
    const afterPreset = processor.use({
      id: "preset",
      name: "Preset",
      plugins: [noopExtension, chunkTranscriptionExtension],
    });
    expect(afterPreset).toBeInstanceOf(GLOSTStreamProcessor);
  });
});
168
+
169
// freeze(): the result reports itself frozen and can still stream.
describe("freeze()", () => {
  it("returns a frozen processor", () => {
    const frozenProcessor = new GLOSTStreamProcessor().freeze();
    // Read the `frozen` flag through a cast (apparently it is not part of
    // the frozen processor's declared type).
    const { frozen: isFrozen } = frozenProcessor as unknown as {
      frozen: boolean;
    };
    expect(isFrozen).toBe(true);
  });

  it("frozen processor can still stream documents", async () => {
    const frozenProcessor = new GLOSTStreamProcessor()
      .use(noopExtension)
      .freeze();
    const received: ProcessedChunk[] = [];
    for await (const chunk of frozenProcessor.stream(makeLargeDocument(3))) {
      received.push(chunk);
    }
    expect(received.length).toBeGreaterThan(0);
  });
});
189
+
190
// Documents without any sentences must not produce chunks.
describe("stream() — empty document", () => {
  /** Stream `doc` through a fresh processor and gather every chunk. */
  const collect = async (doc: GLOSTRoot): Promise<ProcessedChunk[]> => {
    const gathered: ProcessedChunk[] = [];
    for await (const chunk of new GLOSTStreamProcessor().stream(doc)) {
      gathered.push(chunk);
    }
    return gathered;
  };

  it("yields nothing for an empty document", async () => {
    const received = await collect(makeDocument([]));
    expect(received).toHaveLength(0);
  });

  it("yields nothing for paragraphs with no sentences", async () => {
    const emptyParagraph = makeParagraph([]);
    const received = await collect(makeDocument([emptyParagraph]));
    expect(received).toHaveLength(0);
  });
});
218
+
219
// Single-paragraph documents that fit into one batch.
describe("stream() — single paragraph", () => {
  it("streams a single-sentence document as one chunk", async () => {
    const processor = new GLOSTStreamProcessor();
    const received: ProcessedChunk[] = [];
    for await (const chunk of processor.stream(makeLargeDocument(1))) {
      received.push(chunk);
    }

    expect(received).toHaveLength(1);
    const [only] = received;
    expect(only!.sentences).toHaveLength(1);
    expect(only!.paragraphIndex).toBe(0);
    expect(only!.chunkIndex).toBe(0);
    expect(only!.isLast).toBe(true);
  });

  it("streams multiple sentences as one chunk when under batch", async () => {
    const processor = new GLOSTStreamProcessor();
    const fiveSentences = makeLargeDocument(5);
    const received: ProcessedChunk[] = [];
    for await (const chunk of processor.stream(fiveSentences, {
      batchSize: 50,
    })) {
      received.push(chunk);
    }

    expect(received).toHaveLength(1);
    expect(received[0]!.sentences).toHaveLength(5);
  });

  it("marks the only chunk as isLast", async () => {
    const processor = new GLOSTStreamProcessor();
    const received: ProcessedChunk[] = [];
    for await (const chunk of processor.stream(makeLargeDocument(1))) {
      received.push(chunk);
    }

    const [only] = received;
    expect(only!.isLast).toBe(true);
  });
});
255
+
256
// Chunks never span paragraphs; each one reports its paragraph of origin.
describe("stream() — multi-paragraph document", () => {
  it("assigns correct paragraphIndex per chunk", async () => {
    // 2 paragraphs x 3 sentences; batchSize=50 is large enough that no
    // paragraph is split, so we expect exactly one chunk per paragraph.
    const twoParagraphs = makeLargeDocument(6, 2);
    const processor = new GLOSTStreamProcessor();
    const received: ProcessedChunk[] = [];
    for await (const chunk of processor.stream(twoParagraphs, {
      batchSize: 50,
    })) {
      received.push(chunk);
    }

    expect(received).toHaveLength(2);
    const [first, second] = received;
    expect(first!.paragraphIndex).toBe(0);
    expect(second!.paragraphIndex).toBe(1);
  });

  it("marks only the last chunk as isLast", async () => {
    const twoParagraphs = makeLargeDocument(6, 2);
    const processor = new GLOSTStreamProcessor();
    const received: ProcessedChunk[] = [];
    for await (const chunk of processor.stream(twoParagraphs, {
      batchSize: 50,
    })) {
      received.push(chunk);
    }

    const flags = received.map((chunk) => chunk.isLast);
    const earlierFlags = flags.slice(0, -1);
    const finalFlag = flags[flags.length - 1];
    expect(earlierFlags.every((flag) => flag === false)).toBe(true);
    expect(finalFlag).toBe(true);
  });
});
285
+
286
// Batch sizing: explicit size, the default of 50, and per-paragraph indices.
describe("stream() — batchSize", () => {
  it("respects batchSize option", async () => {
    // 10 sentences in one paragraph at 3 per batch => ceil(10/3) = 4 chunks,
    // the last one holding the single leftover sentence.
    const tenSentences = makeLargeDocument(10);
    const processor = new GLOSTStreamProcessor();
    const received: ProcessedChunk[] = [];
    for await (const chunk of processor.stream(tenSentences, {
      batchSize: 3,
    })) {
      received.push(chunk);
    }

    expect(received).toHaveLength(4);
    const [first, second, third, fourth] = received;
    expect(first!.sentences).toHaveLength(3);
    expect(second!.sentences).toHaveLength(3);
    expect(third!.sentences).toHaveLength(3);
    expect(fourth!.sentences).toHaveLength(1);
  });

  it("uses default batchSize of 50 when not specified", async () => {
    const hundredSentences = makeLargeDocument(100);
    const processor = new GLOSTStreamProcessor();
    const received: ProcessedChunk[] = [];
    for await (const chunk of processor.stream(hundredSentences)) {
      received.push(chunk);
    }

    expect(received).toHaveLength(2);
    const [firstHalf, secondHalf] = received;
    expect(firstHalf!.sentences).toHaveLength(50);
    expect(secondHalf!.sentences).toHaveLength(50);
  });

  it("assigns sequential chunkIndex within a paragraph", async () => {
    const nineSentences = makeLargeDocument(9);
    const processor = new GLOSTStreamProcessor();
    const chunkIndices: number[] = [];
    for await (const chunk of processor.stream(nineSentences, {
      batchSize: 3,
    })) {
      chunkIndices.push(chunk.chunkIndex);
    }

    expect(chunkIndices).toEqual([0, 1, 2]);
  });
});
326
+
327
// Doc-level extensions (no `streamingSupport`) are expected to run once on
// the whole document before any chunk is handed to the consumer.
describe("stream() — doc-level transform", () => {
  it("runs doc-level extensions before streaming", async () => {
    // Record when the transform runs relative to chunk delivery. Counting
    // chunks alone would pass even if the transform were never invoked.
    const timeline: string[] = [];
    const tracedDocExtension: GLOSTExtension = {
      ...docTransformExtension,
      transform: (tree) => {
        timeline.push("doc-transform");
        return {
          ...tree,
          extras: { ...tree.extras, docTransformRan: true },
        };
      },
    };

    const sp = new GLOSTStreamProcessor().use(tracedDocExtension);
    const doc = makeLargeDocument(2);
    const chunks: ProcessedChunk[] = [];
    for await (const chunk of sp.stream(doc)) {
      timeline.push("chunk");
      chunks.push(chunk);
    }

    // 2 sentences fit in a single default-sized batch.
    expect(chunks).toHaveLength(1);
    // The transform ran exactly once, and before the first chunk arrived.
    expect(timeline).toEqual(["doc-transform", "chunk"]);
  });

  it("does not run doc-level transform per chunk", async () => {
    let transformRuns = 0;
    const countingTransform: GLOSTExtension = {
      id: "counting-transform",
      name: "Counting Transform",
      // streamingSupport unset => 'none' (doc-level)
      transform: (tree) => {
        transformRuns += 1;
        return tree;
      },
    };

    const sp = new GLOSTStreamProcessor().use(countingTransform);
    const doc = makeLargeDocument(10);
    // batchSize=2 => 5 chunks
    for await (const _chunk of sp.stream(doc, { batchSize: 2 })) {
      // consume
    }

    // transform should run exactly once, not once per chunk
    expect(transformRuns).toBe(1);
  });
});
363
+
364
// Chunk-level extensions (`streamingSupport: "chunk"`) run on each batch.
describe("stream() — chunk-level extensions", () => {
  it("processes each batch with chunk-level extensions", async () => {
    const sp = new GLOSTStreamProcessor().use(chunkTranscriptionExtension);
    const doc = makeLargeDocument(4);
    const chunks: ProcessedChunk[] = [];
    for await (const chunk of sp.stream(doc, { batchSize: 2 })) {
      chunks.push(chunk);
    }
    expect(chunks).toHaveLength(2);

    // Each sentence word should have transcription added
    type SentWithChildren = { children: unknown[] };
    type WordLike = {
      type: string;
      text: string;
      extras: Record<string, unknown>;
    };
    let verifiedWords = 0;
    for (const chunk of chunks) {
      for (const sentence of chunk.sentences) {
        const sent = sentence as unknown as SentWithChildren;
        for (const child of sent.children) {
          const word = child as unknown as WordLike;
          if (word.type === "WordNode") {
            expect(word.extras.transcription).toBe(`[${word.text}]`);
            verifiedWords += 1;
          }
        }
      }
    }
    // 4 sentences, 1 word each. Without this the loop above would pass
    // vacuously if the yielded sentences contained no word nodes.
    expect(verifiedWords).toBe(4);
  });

  it("runs chunk extensions once per chunk, not once globally", async () => {
    const callsPerChunk: number[] = [];
    let currentChunkCalls = 0;

    const countingChunkExtension: GLOSTExtension = {
      id: "counting-chunk",
      name: "Counting Chunk",
      streamingSupport: "chunk",
      visit: {
        word: () => {
          currentChunkCalls++;
        },
      },
    };

    const sp = new GLOSTStreamProcessor().use(countingChunkExtension);
    const doc = makeLargeDocument(6); // 6 sentences, 1 word each
    for await (const _chunk of sp.stream(doc, { batchSize: 3 })) {
      // Snapshot the visits seen since the previous chunk, then reset.
      callsPerChunk.push(currentChunkCalls);
      currentChunkCalls = 0;
    }

    // 2 chunks, 3 words each
    expect(callsPerChunk).toEqual([3, 3]);
  });
});
423
+
424
// Mixing a doc-level and a chunk-level extension in one processor.
describe("stream() — combined doc + chunk extensions", () => {
  it("runs doc-level first then chunk-level per batch", async () => {
    // Shared log that both extensions append to, in call order.
    const trace: string[] = [];

    const docLevel: GLOSTExtension = {
      id: "doc-ext",
      name: "Doc Ext",
      transform: (tree) => {
        trace.push("doc-transform");
        return tree;
      },
    };
    const chunkLevel: GLOSTExtension = {
      id: "chunk-ext",
      name: "Chunk Ext",
      streamingSupport: "chunk",
      visit: {
        word: () => {
          trace.push("chunk-visit");
        },
      },
    };

    const processor = new GLOSTStreamProcessor().use(docLevel).use(chunkLevel);

    // 4 sentences at batchSize=2 => 2 chunks of 2 sentences each.
    const fourSentences = makeLargeDocument(4);
    for await (const _chunk of processor.stream(fourSentences, {
      batchSize: 2,
    })) {
      // consume
    }

    // The doc-level transform is the very first thing recorded.
    const [firstEvent] = trace;
    expect(firstEvent).toBe("doc-transform");

    // One word per sentence => 4 chunk-level visits overall.
    const chunkVisits = trace.filter((event) => event === "chunk-visit");
    expect(chunkVisits).toHaveLength(4);
  });
});
468
+
469
// Extensions written before `streamingSupport` existed omit the field.
describe("stream() — backward compat (no streamingSupport set)", () => {
  it("treats extensions without streamingSupport as doc-level", async () => {
    const visits: string[] = [];
    const legacyExtension: GLOSTExtension = {
      id: "legacy",
      name: "Legacy",
      // Deliberately no streamingSupport field.
      visit: {
        word: () => {
          visits.push("legacy-visit");
        },
      },
    };

    const processor = new GLOSTStreamProcessor().use(legacyExtension);
    const sixSentences = makeLargeDocument(6);
    for await (const _chunk of processor.stream(sixSentences, {
      batchSize: 2,
    })) {
      // consume
    }

    // One visit per word of the 6-word document: the extension ran over the
    // whole document once rather than again for every chunk.
    expect(visits).toHaveLength(6);
  });
});
494
+
495
// Breaking out of the for-await loop must stop further batch processing.
describe("stream() — cancellation", () => {
  it("stops processing when consumer breaks from for-await", async () => {
    // Count chunk-level word visits so we can observe whether batches beyond
    // the consumed ones were processed. Checking only the number of
    // collected chunks is trivially true once the loop breaks.
    let visitedWords = 0;
    const visitCounter: GLOSTExtension = {
      id: "visit-counter",
      name: "Visit Counter",
      streamingSupport: "chunk",
      visit: {
        word: () => {
          visitedWords++;
        },
      },
    };

    const processedChunks: ProcessedChunk[] = [];
    const sp = new GLOSTStreamProcessor().use(visitCounter);
    const doc = makeLargeDocument(100); // 100 sentences, 1 word each

    for await (const chunk of sp.stream(doc, { batchSize: 10 })) {
      processedChunks.push(chunk);
      if (processedChunks.length >= 2) {
        break; // Cancel after 2 chunks
      }
    }

    expect(processedChunks).toHaveLength(2);
    // Only the 2 consumed batches (10 words each) were processed; the
    // remaining 80 sentences were never visited.
    expect(visitedWords).toBe(20);
  });
});
511
+
512
// Full metadata check for a paragraph split into uneven batches.
describe("stream() — ProcessedChunk metadata", () => {
  it("includes correct metadata on each chunk", async () => {
    const processor = new GLOSTStreamProcessor();
    const sevenSentences = makeLargeDocument(7);
    const received: ProcessedChunk[] = [];
    for await (const chunk of processor.stream(sevenSentences, {
      batchSize: 3,
    })) {
      received.push(chunk);
    }

    // 7 sentences at 3 per batch => ceil(7/3) = 3 chunks sized 3, 3, 1.
    const expectedChunks = [
      { paragraphIndex: 0, chunkIndex: 0, isLast: false, size: 3 },
      { paragraphIndex: 0, chunkIndex: 1, isLast: false, size: 3 },
      { paragraphIndex: 0, chunkIndex: 2, isLast: true, size: 1 },
    ];
    expect(received).toHaveLength(3);

    expectedChunks.forEach(({ size, ...metadata }, position) => {
      expect(received[position]).toMatchObject(metadata);
      expect(received[position]!.sentences).toHaveLength(size);
    });
  });
});
546
+
547
// One frozen processor instance is streamed over two different documents.
describe("stream() — reuse across multiple documents", () => {
  it("frozen processor streams multiple documents correctly", async () => {
    const frozenProcessor = new GLOSTStreamProcessor()
      .use(chunkTranscriptionExtension)
      .freeze();

    /** Stream `doc` to completion and count the sentences across all chunks. */
    const countStreamedSentences = async (doc: GLOSTRoot): Promise<number> => {
      const received: ProcessedChunk[] = [];
      for await (const chunk of frozenProcessor.stream(doc)) {
        received.push(chunk);
      }
      return received.flatMap((chunk) => chunk.sentences).length;
    };

    const threeSentenceDoc = makeLargeDocument(3);
    const fiveSentenceDoc = makeLargeDocument(5);

    // Run the streams back to back before asserting on either result.
    const firstTotal = await countStreamedSentences(threeSentenceDoc);
    const secondTotal = await countStreamedSentences(fiveSentenceDoc);

    expect(firstTotal).toBe(3);
    expect(secondTotal).toBe(5);
  });
});
574
+ });
package/src/index.ts CHANGED
@@ -38,6 +38,12 @@ export type {
38
38
  ProgressHook,
39
39
  ProgressStats,
40
40
  } from "./types.js";
41
+ export { GLOSTStreamProcessor } from "./stream-processor.js";
42
+ export type {
43
+ FrozenStreamProcessor,
44
+ StreamOptions,
45
+ ProcessedChunk,
46
+ } from "./stream-processor.js";
41
47
 
42
48
  import { GLOSTProcessor } from "./processor.js";
43
49
  import type { ProcessorOptions } from "./types.js";