glost-processor 0.5.0 → 0.7.0

@@ -0,0 +1,250 @@
+ /**
+  * GLOSTStreamProcessor Performance Benchmarks
+  *
+  * Compares batch (eager) processing against streaming for documents
+  * of various sizes. Includes 10K and 100K word documents.
+  */
+
+ import { bench, describe } from "vitest";
+ import {
+   createGLOSTWordNode,
+   createSimpleDocument,
+   createSentenceFromWords,
+   createParagraphFromSentences,
+   createGLOSTRootNode,
+ } from "glost-core";
+ import type { GLOSTExtension } from "glost-extensions";
+ import { GLOSTProcessor } from "../processor.js";
+ import { GLOSTStreamProcessor } from "../stream-processor.js";
+
+ // ============================================================================
+ // Document factories
+ // ============================================================================
+
+ /** Create a document with `wordCount` words in a single sentence */
+ function makeDocumentByWords(wordCount: number) {
+   const words = Array.from({ length: wordCount }, (_, i) =>
+     createGLOSTWordNode({ value: `word${i}` }),
+   );
+   return createSimpleDocument(words, "en", "latin");
+ }
+
+ /**
+  * Create a document with `sentenceCount` sentences, each sentence
+  * containing `wordsPerSentence` words, spread across `paragraphCount`
+  * paragraphs.
+  */
+ function makeDocumentBySentences(
+   sentenceCount: number,
+   wordsPerSentence = 10,
+   paragraphCount = 1,
+ ) {
+   const sentencesPerParagraph = Math.ceil(
+     sentenceCount / paragraphCount,
+   );
+   const paragraphs = [];
+   let remaining = sentenceCount;
+
+   for (let p = 0; p < paragraphCount; p++) {
+     const count = Math.min(remaining, sentencesPerParagraph);
+     const sentences = Array.from({ length: count }, (_, s) => {
+       const words = Array.from({ length: wordsPerSentence }, (_, w) =>
+         createGLOSTWordNode({ value: `p${p}s${s}w${w}` }),
+       );
+       return createSentenceFromWords(words, "en", "latin");
+     });
+     paragraphs.push(createParagraphFromSentences(sentences));
+     remaining -= count;
+     if (remaining <= 0) break;
+   }
+
+   return createGLOSTRootNode({
+     lang: "en",
+     script: "latin",
+     children: paragraphs,
+   });
+ }
+
+ // ============================================================================
+ // Extensions
+ // ============================================================================
+
+ function createVisitExtension(id: string): GLOSTExtension {
+   return {
+     id,
+     name: `Visit Extension ${id}`,
+     visit: {
+       word: (node) => {
+         return {
+           ...node,
+           extras: { ...node.extras, [id]: true },
+         };
+       },
+     },
+   };
+ }
+
+ function createChunkVisitExtension(id: string): GLOSTExtension {
+   return {
+     id,
+     name: `Chunk Visit Extension ${id}`,
+     streamingSupport: "chunk",
+     visit: {
+       word: (node) => {
+         return {
+           ...node,
+           extras: { ...node.extras, [id]: true },
+         };
+       },
+     },
+   };
+ }
+
+ // ============================================================================
+ // Benchmark: medium documents (for baseline comparison)
+ // ============================================================================
+
+ describe("Streaming vs Batch: Medium Documents", () => {
+   const doc1k = makeDocumentBySentences(100, 10); // 1K words
+   const ext = createVisitExtension("test");
+   const chunkExt = createChunkVisitExtension("chunk-test");
+
+   bench("batch — 1K words (GLOSTProcessor)", async () => {
+     const proc = new GLOSTProcessor().use(ext);
+     await proc.process(doc1k);
+   });
+
+   bench("stream — 1K words, batchSize=50", async () => {
+     const proc = new GLOSTStreamProcessor().use(chunkExt);
+     for await (const _chunk of proc.stream(doc1k, { batchSize: 50 })) {
+       // consume
+     }
+   });
+
+   bench("stream — 1K words, batchSize=10", async () => {
+     const proc = new GLOSTStreamProcessor().use(chunkExt);
+     for await (const _chunk of proc.stream(doc1k, { batchSize: 10 })) {
+       // consume
+     }
+   });
+
+   bench("stream — 1K words, batchSize=100", async () => {
+     const proc = new GLOSTStreamProcessor().use(chunkExt);
+     for await (const _chunk of proc.stream(doc1k, { batchSize: 100 })) {
+       // consume
+     }
+   });
+ });
+
+ // ============================================================================
+ // Benchmark: 10K word documents
+ // ============================================================================
+
+ describe("Streaming: 10K Word Documents", () => {
+   // 1000 sentences * 10 words = 10K words
+   const doc10k = makeDocumentBySentences(1000, 10);
+   const ext = createVisitExtension("test");
+   const chunkExt = createChunkVisitExtension("chunk-test");
+
+   bench("batch — 10K words (GLOSTProcessor)", async () => {
+     const proc = new GLOSTProcessor().use(ext);
+     await proc.process(doc10k);
+   });
+
+   bench("stream — 10K words, batchSize=50", async () => {
+     const proc = new GLOSTStreamProcessor().use(chunkExt);
+     for await (const _chunk of proc.stream(doc10k, { batchSize: 50 })) {
+       // consume
+     }
+   });
+
+   bench("stream — 10K words, batchSize=100", async () => {
+     const proc = new GLOSTStreamProcessor().use(chunkExt);
+     for await (const _chunk of proc.stream(doc10k, {
+       batchSize: 100,
+     })) {
+       // consume
+     }
+   });
+
+   bench("stream — 10K words, batchSize=500", async () => {
+     const proc = new GLOSTStreamProcessor().use(chunkExt);
+     for await (const _chunk of proc.stream(doc10k, {
+       batchSize: 500,
+     })) {
+       // consume
+     }
+   });
+ });
+
+ // ============================================================================
+ // Benchmark: 100K word documents
+ // ============================================================================
+
+ describe("Streaming: 100K Word Documents", () => {
+   // 10000 sentences * 10 words = 100K words, spread across 100 paragraphs
+   const doc100k = makeDocumentBySentences(10_000, 10, 100);
+   const ext = createVisitExtension("test");
+   const chunkExt = createChunkVisitExtension("chunk-test");
+
+   bench("batch — 100K words (GLOSTProcessor)", async () => {
+     const proc = new GLOSTProcessor().use(ext);
+     await proc.process(doc100k);
+   });
+
+   bench("stream — 100K words, batchSize=50", async () => {
+     const proc = new GLOSTStreamProcessor().use(chunkExt);
+     for await (const _chunk of proc.stream(doc100k, {
+       batchSize: 50,
+     })) {
+       // consume
+     }
+   });
+
+   bench("stream — 100K words, batchSize=500", async () => {
+     const proc = new GLOSTStreamProcessor().use(chunkExt);
+     for await (const _chunk of proc.stream(doc100k, {
+       batchSize: 500,
+     })) {
+       // consume
+     }
+   });
+
+   bench("stream — 100K words, batchSize=1000", async () => {
+     const proc = new GLOSTStreamProcessor().use(chunkExt);
+     for await (const _chunk of proc.stream(doc100k, {
+       batchSize: 1000,
+     })) {
+       // consume
+     }
+   });
+ });
+
+ // ============================================================================
+ // Benchmark: early termination via break
+ // ============================================================================
+
+ describe("Streaming: Early Termination", () => {
+   const doc10k = makeDocumentBySentences(1000, 10);
+   const chunkExt = createChunkVisitExtension("chunk-test");
+
+   bench("stream — cancel after first chunk (lazy win)", async () => {
+     const proc = new GLOSTStreamProcessor().use(chunkExt);
+     for await (const _chunk of proc.stream(doc10k, { batchSize: 50 })) {
+       break; // Only consume the first chunk
+     }
+   });
+
+   bench(
+     "stream — cancel after 10 chunks vs full batch",
+     async () => {
+       const proc = new GLOSTStreamProcessor().use(chunkExt);
+       let count = 0;
+       for await (const _chunk of proc.stream(doc10k, {
+         batchSize: 50,
+       })) {
+         if (++count >= 10) break;
+       }
+     },
+   );
+ });
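
These benchmarks use Vitest's benchmark mode, so they run with the vitest bench command rather than as part of the unit test suite. For orientation, the sketch below shows how the streaming API exercised above might be consumed from application code. It is a minimal sketch under stated assumptions: it assumes GLOSTStreamProcessor is re-exported from the glost-processor package root (the benchmark itself imports it via a relative path), and the extension id "demo", the document size, and the chunk-count cutoff are illustrative values, not anything defined by the package.

// Minimal consumer sketch (assumptions noted above): build a small document
// with the glost-core factories, register a chunk-capable extension, and
// stop the stream early once enough chunks have been processed.
import { createGLOSTWordNode, createSimpleDocument } from "glost-core";
import type { GLOSTExtension } from "glost-extensions";
import { GLOSTStreamProcessor } from "glost-processor"; // assumed root export

const tagWords: GLOSTExtension = {
  id: "demo", // illustrative id
  name: "Demo chunk extension",
  streamingSupport: "chunk",
  visit: {
    word: (node) => ({ ...node, extras: { ...node.extras, demo: true } }),
  },
};

async function run() {
  const words = Array.from({ length: 1_000 }, (_, i) =>
    createGLOSTWordNode({ value: `word${i}` }),
  );
  const doc = createSimpleDocument(words, "en", "latin");

  const proc = new GLOSTStreamProcessor().use(tagWords);
  let chunks = 0;
  for await (const _chunk of proc.stream(doc, { batchSize: 100 })) {
    // Each iteration yields one processed chunk; breaking early skips the
    // remaining work, which is the "lazy win" measured in the last suite.
    if (++chunks >= 3) break;
  }
}

run().catch(console.error);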