glost-processor 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,391 @@
1
+ /**
2
+ * GLOST Stream Processor
3
+ *
4
+ * Streaming variant of GLOSTProcessor that yields processed sentence
5
+ * batches progressively using AsyncGenerator. Batches are sliced from the
6
+ * processed document up front; chunk-level work runs once per pulled batch.
7
+ *
8
+ * Document-level transforms (extensions with streamingSupport !== 'chunk')
9
+ * run once on the full document before streaming begins. Chunk-compatible
10
+ * extensions (streamingSupport === 'chunk') then run on each batch.
11
+ *
12
+ * @packageDocumentation
13
+ *
14
+ * @example
15
+ * ```typescript
16
+ * import { GLOSTStreamProcessor } from "glost-processor";
17
+ *
18
+ * const processor = new GLOSTStreamProcessor()
19
+ * .use(transcription)
20
+ * .use(translation);
21
+ *
22
+ * for await (const chunk of processor.stream(document)) {
23
+ * console.log(chunk.sentences, chunk.isLast);
24
+ * }
25
+ * ```
26
+ *
27
+ * @since 0.7.0
28
+ */
29
+
30
+ import type { GLOSTRoot, GLOSTSentence } from "glost-core";
31
+ import type { GLOSTExtension } from "glost-extensions";
32
+ import {
33
+ processGLOSTWithExtensionsAsync,
34
+ processGLOSTChunkAsync,
35
+ extensionRegistry,
36
+ } from "glost-extensions";
37
+ import type { PluginSpec, Preset, ProcessorOptions } from "./types.js";
38
+
39
+ // ============================================================================
40
+ // Public types
41
+ // ============================================================================
42
+
43
+ /**
44
+ * Options for stream() method
45
+ *
46
+ * @since 0.7.0
47
+ */
48
+ export interface StreamOptions {
49
+ /**
50
+ * Maximum number of sentences per chunk (a paragraph's last chunk may be smaller).
51
+ *
52
+ * A smaller value reduces latency to first yielded chunk; a larger
53
+ * value amortises per-chunk overhead. Default: 50.
54
+ */
55
+ batchSize?: number;
56
+ }
57
+
58
+ /**
59
+ * A single yielded chunk from the stream
60
+ *
61
+ * @since 0.7.0
62
+ */
63
+ export interface ProcessedChunk {
64
+ /** Sentences of this batch, after any chunk-level extensions have run */
65
+ sentences: GLOSTSentence[];
66
+
67
+ /**
68
+ * Index of the sentence group this chunk was cut from.
69
+ *
70
+ * Groups are numbered after paragraphs without sentences are dropped, and
71
+ * root-level sentences (if any) form group 0. Each group is chunked on its
72
+ * own, so consecutive chunks may share the same paragraphIndex.
73
+ */
74
+ paragraphIndex: number;
75
+
76
+ /**
77
+ * Index of this chunk within its sentence group (0-based).
78
+ */
79
+ chunkIndex: number;
80
+
81
+ /** True only for the final chunk yielded for the whole document */
82
+ isLast: boolean;
83
+ }
84
+
85
+ // ============================================================================
86
+ // GLOSTStreamProcessor
87
+ // ============================================================================
88
+
89
/**
 * Streaming processor for GLOST documents.
 *
 * Mirrors the `GLOSTProcessor` API (`.use()`, `.freeze()`) but adds a
 * `.stream()` method that returns an `AsyncGenerator<ProcessedChunk>`.
 *
 * @since 0.7.0
 */
export class GLOSTStreamProcessor {
  /** Registered plugin specs, in registration order, with their options. */
  private plugins: Array<{ spec: PluginSpec; options?: unknown }> = [];

  /** Shallow copy of the options passed to the constructor. */
  private options: ProcessorOptions = {};

  /** When true, `use()` throws instead of changing the pipeline. */
  private frozen = false;

  /**
   * Create a new stream processor instance.
   *
   * @param options - Initial processor options (shallow-copied)
   */
  constructor(options: ProcessorOptions = {}) {
    this.options = { ...options };
  }

  /**
   * Add a plugin, preset, or extension to the pipeline.
   *
   * A preset (an object with a `plugins` array) is expanded entry by entry;
   * in that case the `options` argument is not used, because each preset
   * entry carries its own options.
   *
   * @param spec - Plugin function, extension object, preset, or registry ID
   * @param options - Options stored alongside a non-preset spec
   * @returns This processor, for chaining
   * @throws Error if the processor is frozen
   */
  use(spec: PluginSpec | Preset, options?: unknown): this {
    this.assertNotFrozen();

    if (this.isPreset(spec)) {
      return this.usePreset(spec);
    }

    this.plugins.push({ spec, options });
    return this;
  }

  /**
   * Create a frozen copy of this processor.
   *
   * The copy receives the same options and a copy of the plugin list, and
   * rejects further `use()` calls. The original processor stays mutable.
   *
   * @returns A frozen copy of this processor
   */
  freeze(): FrozenStreamProcessor {
    const copy = new GLOSTStreamProcessor(this.options);
    copy.plugins = [...this.plugins];
    // Private members are accessible on other instances of the same class,
    // so no cast is needed to flip the flag.
    copy.frozen = true;
    return copy as unknown as FrozenStreamProcessor;
  }

  /**
   * Stream a document as progressive sentence batches.
   *
   * Steps:
   * 1. Extensions whose `streamingSupport` is anything other than
   *    `'chunk'` are passed, together with the whole document, to
   *    `processGLOSTWithExtensionsAsync`; the returned `document` is used
   *    for the remaining steps. Skipped when there are no such extensions.
   * 2. Sentences are grouped per paragraph and each group is sliced into
   *    batches of at most `batchSize` sentences.
   * 3. For each batch, extensions with `streamingSupport === 'chunk'` are
   *    applied through `processGLOSTChunkAsync` (skipped when there are
   *    none), right before that batch is yielded.
   * 4. A `ProcessedChunk` is yielded; `isLast` is true on the final one.
   *
   * Nothing is yielded for a document without sentences.
   *
   * Cancellation: break out of the `for await` loop at any time; the
   * remaining batches are then never processed.
   *
   * @param document - GLOST document to stream
   * @param streamOptions - Streaming options (`batchSize`, default 50)
   * @yields `ProcessedChunk` objects, group by group
   * @throws RangeError if `batchSize` is not a positive integer or
   *   `Infinity`. Because this is an async generator, the error surfaces
   *   on the first pull, not when `stream()` is called.
   *
   * @example
   * ```typescript
   * for await (const chunk of processor.stream(doc, { batchSize: 20 })) {
   *   console.log(`para ${chunk.paragraphIndex} chunk ${chunk.chunkIndex}`);
   *   if (chunk.isLast) console.log("done");
   * }
   * ```
   */
  async *stream(
    document: GLOSTRoot,
    streamOptions?: StreamOptions,
  ): AsyncGenerator<ProcessedChunk> {
    const batchSize = streamOptions?.batchSize ?? 50;

    // A step of 0 or less would never advance the slicing loop below, and
    // NaN / fractional steps would produce empty or uneven batches.
    const isUsableBatchSize =
      batchSize === Number.POSITIVE_INFINITY ||
      (Number.isInteger(batchSize) && batchSize > 0);
    if (!isUsableBatchSize) {
      throw new RangeError(
        `batchSize must be a positive integer (or Infinity), received ${String(batchSize)}`,
      );
    }

    const extensions = await this.resolveExtensions();
    const docExtensions = extensions.filter(
      (e) => e.streamingSupport !== "chunk",
    );
    const chunkExtensions = extensions.filter(
      (e) => e.streamingSupport === "chunk",
    );

    // Computed once: the options do not change while streaming.
    const extOptions = this.extensionOptions();

    // Step 1: document-level extensions see the full document.
    let processedDoc = document;
    if (docExtensions.length > 0) {
      const result = await processGLOSTWithExtensionsAsync(
        processedDoc,
        docExtensions,
        extOptions,
      );
      processedDoc = result.document;
    }

    // Step 2: group sentences, then slice every group into batches. All
    // batches are listed before yielding so the last one can be flagged.
    const paragraphSentences = collectSentencesByParagraph(processedDoc);
    if (paragraphSentences.length === 0) {
      return;
    }

    const pending: Array<Omit<ProcessedChunk, "isLast">> = [];
    paragraphSentences.forEach((sentences, paragraphIndex) => {
      for (
        let offset = 0, chunkIndex = 0;
        offset < sentences.length;
        offset += batchSize, chunkIndex++
      ) {
        pending.push({
          paragraphIndex,
          chunkIndex,
          sentences: sentences.slice(offset, offset + batchSize),
        });
      }
    });

    // Steps 3 and 4: chunk-level extensions run lazily, one batch per pull.
    const lastPosition = pending.length - 1;
    for (const [position, chunk] of pending.entries()) {
      const sentences =
        chunkExtensions.length > 0
          ? await processGLOSTChunkAsync(
              chunk.sentences,
              chunkExtensions,
              extOptions,
            )
          : chunk.sentences;

      yield {
        sentences,
        paragraphIndex: chunk.paragraphIndex,
        chunkIndex: chunk.chunkIndex,
        isLast: position === lastPosition,
      };
    }
  }

  // =====================================================================
  // Private helpers
  // =====================================================================

  /**
   * Processor options with the `data` key removed; this is the object
   * handed to the extension runners.
   */
  private extensionOptions() {
    type OptionsWithData = ProcessorOptions & { data?: unknown };
    const { data: _data, ...rest } = this.options as OptionsWithData;
    return rest;
  }

  /**
   * Register every entry of a preset through `use()`. An entry is either a
   * bare plugin or a `[plugin, options]` tuple.
   */
  private usePreset(preset: Preset): this {
    for (const entry of preset.plugins) {
      if (Array.isArray(entry)) {
        const [plugin, opts] = entry;
        this.use(plugin, opts);
      } else {
        this.use(entry);
      }
    }
    return this;
  }

  /**
   * Resolve all registered specs to extensions, in registration order.
   * Specs that resolve to `null` are left out.
   */
  private async resolveExtensions(): Promise<GLOSTExtension[]> {
    const result: GLOSTExtension[] = [];
    for (const { spec, options } of this.plugins) {
      const ext = await this.resolvePlugin(spec, options);
      if (ext) {
        result.push(ext);
      }
    }
    return result;
  }

  /**
   * Resolve a single spec:
   * - string: looked up in `extensionRegistry`; a missing ID throws;
   * - function: called with `options`; a nullish result becomes `null`;
   * - anything else: treated as an extension object as-is.
   *
   * @throws Error if a string ID is not found in the registry
   */
  private async resolvePlugin(
    spec: PluginSpec,
    options?: unknown,
  ): Promise<GLOSTExtension | null> {
    if (typeof spec === "string") {
      const ext = extensionRegistry.get(spec);
      if (!ext) {
        throw new Error(`Plugin "${spec}" not found in registry`);
      }
      return ext;
    }

    if (typeof spec === "function") {
      const result = (spec as (opts?: unknown) => GLOSTExtension | void)(
        options,
      );
      return result ?? null;
    }

    return spec as GLOSTExtension;
  }

  /** True when `spec` is a non-null object with an array-valued `plugins`. */
  private isPreset(spec: unknown): spec is Preset {
    return (
      spec !== null &&
      typeof spec === "object" &&
      "plugins" in (spec as object) &&
      Array.isArray((spec as Preset).plugins)
    );
  }

  /** Guard used by mutating methods. */
  private assertNotFrozen(): void {
    if (this.frozen) {
      throw new Error("Cannot modify frozen stream processor");
    }
  }
}
331
+
332
+ // ============================================================================
333
+ // Frozen type
334
+ // ============================================================================
335
+
336
+ /**
337
+ * Return type of `GLOSTStreamProcessor.freeze()`: the processor's public
338
+ * members without `use` and `freeze`, plus a readonly `frozen: true` marker.
339
+ * @since 0.7.0
340
+ */
341
+ export type FrozenStreamProcessor = Omit<
342
+ GLOSTStreamProcessor,
343
+ "use" | "freeze"
344
+ > & { readonly frozen: true };
345
+
346
+ // ============================================================================
347
+ // Internal helpers
348
+ // ============================================================================
349
+
350
/**
 * Gather the sentences of a GLOSTRoot into per-paragraph groups.
 *
 * Every `ParagraphNode` child of the root that has a `children` property
 * and holds at least one `SentenceNode` contributes one group, in document
 * order; paragraphs without sentences contribute nothing. `SentenceNode`s
 * that are direct children of the root are pooled into a single extra group
 * that is placed first (index 0).
 *
 * @internal
 */
function collectSentencesByParagraph(
  document: GLOSTRoot,
): GLOSTSentence[][] {
  const isSentence = (node: unknown): node is GLOSTSentence =>
    typeof node === "object" &&
    node !== null &&
    (node as { type: string }).type === "SentenceNode";

  const looseSentences: GLOSTSentence[] = [];
  const paragraphGroups: GLOSTSentence[][] = [];

  for (const node of document.children) {
    const kind: string = node.type;

    if (kind === "ParagraphNode" && "children" in node) {
      const { children } = node as { type: string; children: unknown[] };
      const found = children.filter(isSentence);
      if (found.length > 0) {
        paragraphGroups.push(found);
      }
      continue;
    }

    if (kind === "SentenceNode") {
      looseSentences.push(node as unknown as GLOSTSentence);
    }
  }

  // Root-level sentences, when present, always lead the result.
  return looseSentences.length > 0
    ? [looseSentences, ...paragraphGroups]
    : paragraphGroups;
}