glost-processor 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/stream-processor.d.ts +145 -0
- package/dist/stream-processor.d.ts.map +1 -0
- package/dist/stream-processor.js +245 -0
- package/dist/stream-processor.js.map +1 -0
- package/package.json +2 -2
- package/src/__benchmarks__/stream-processor.bench.ts +250 -0
- package/src/__tests__/processor.test.ts +992 -0
- package/src/__tests__/stream-processor.test.ts +574 -0
- package/src/index.ts +6 -0
- package/src/stream-processor.ts +391 -0
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GLOSTStreamProcessor Performance Benchmarks
|
|
3
|
+
*
|
|
4
|
+
* Compares batch (eager) processing against streaming for documents
|
|
5
|
+
* of various sizes. Includes 10K and 100K word documents.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { bench, describe } from "vitest";
|
|
9
|
+
import {
|
|
10
|
+
createGLOSTWordNode,
|
|
11
|
+
createSimpleDocument,
|
|
12
|
+
createSentenceFromWords,
|
|
13
|
+
createParagraphFromSentences,
|
|
14
|
+
createGLOSTRootNode,
|
|
15
|
+
} from "glost-core";
|
|
16
|
+
import type { GLOSTExtension } from "glost-extensions";
|
|
17
|
+
import { GLOSTProcessor } from "../processor.js";
|
|
18
|
+
import { GLOSTStreamProcessor } from "../stream-processor.js";
|
|
19
|
+
|
|
20
|
+
// ============================================================================
|
|
21
|
+
// Document factories
|
|
22
|
+
// ============================================================================
|
|
23
|
+
|
|
24
|
+
/**
 * Build a benchmark document from `wordCount` generated word nodes.
 *
 * Word values are `word0`, `word1`, … in index order. All nodes are handed
 * to `createSimpleDocument` in one call, tagged with language "en" and
 * script "latin" (intended to yield a single-sentence document).
 *
 * @param wordCount - Number of word nodes to generate.
 * @returns Whatever `createSimpleDocument` produces for those words.
 */
function makeDocumentByWords(wordCount: number) {
  // Only the index matters; Array.from passes `undefined` as the element.
  const toWordNode = (_: unknown, index: number) =>
    createGLOSTWordNode({ value: `word${index}` });

  const wordNodes = Array.from({ length: wordCount }, toWordNode);
  return createSimpleDocument(wordNodes, "en", "latin");
}
|
|
31
|
+
|
|
32
|
+
/**
 * Build a benchmark document of `sentenceCount` sentences, each made of
 * `wordsPerSentence` generated words, grouped into paragraphs.
 *
 * Every paragraph takes up to `ceil(sentenceCount / paragraphCount)`
 * sentences until none are left, so the last paragraph may be shorter and
 * fewer than `paragraphCount` paragraphs may be emitted. Word values encode
 * their position as `p{paragraph}s{sentence}w{word}`.
 *
 * @param sentenceCount - Total number of sentences to generate.
 * @param wordsPerSentence - Words in each sentence (default 10).
 * @param paragraphCount - Upper bound on the number of paragraphs (default 1).
 * @returns A root node (lang "en", script "latin") whose children are the
 *   generated paragraphs.
 */
function makeDocumentBySentences(
  sentenceCount: number,
  wordsPerSentence = 10,
  paragraphCount = 1,
) {
  const paragraphCapacity = Math.ceil(sentenceCount / paragraphCount);

  // One sentence of `wordsPerSentence` words labelled with its coordinates.
  const buildSentence = (paragraphIndex: number, sentenceIndex: number) => {
    const wordNodes = Array.from(
      { length: wordsPerSentence },
      (_, wordIndex) =>
        createGLOSTWordNode({
          value: `p${paragraphIndex}s${sentenceIndex}w${wordIndex}`,
        }),
    );
    return createSentenceFromWords(wordNodes, "en", "latin");
  };

  const paragraphs = [];
  let sentencesLeft = sentenceCount;

  for (
    let paragraphIndex = 0;
    paragraphIndex < paragraphCount;
    paragraphIndex++
  ) {
    const size = Math.min(sentencesLeft, paragraphCapacity);
    const sentences = Array.from({ length: size }, (_, sentenceIndex) =>
      buildSentence(paragraphIndex, sentenceIndex),
    );
    paragraphs.push(createParagraphFromSentences(sentences));

    sentencesLeft -= size;
    // Checked after the push, so the first paragraph is always emitted.
    if (sentencesLeft <= 0) {
      break;
    }
  }

  return createGLOSTRootNode({
    lang: "en",
    script: "latin",
    children: paragraphs,
  });
}
|
|
67
|
+
|
|
68
|
+
// ============================================================================
|
|
69
|
+
// Extensions
|
|
70
|
+
// ============================================================================
|
|
71
|
+
|
|
72
|
+
/**
 * Create an extension whose only behavior is a `word` visitor that tags
 * each visited word node.
 *
 * The visitor returns a shallow copy of the node whose `extras` is a copy of
 * the existing extras plus `[id]: true`; the input node is not mutated.
 *
 * @param id - Extension id; also used as the key written into `extras`.
 * @returns The extension object (`id`, `name`, `visit.word`).
 */
function createVisitExtension(id: string): GLOSTExtension {
  const extension: GLOSTExtension = {
    id,
    name: `Visit Extension ${id}`,
    visit: {
      word: (wordNode) => ({
        ...wordNode,
        extras: { ...wordNode.extras, [id]: true },
      }),
    },
  };
  return extension;
}
|
|
86
|
+
|
|
87
|
+
/**
 * Create a word-tagging extension that also declares
 * `streamingSupport: "chunk"`.
 *
 * The `word` visitor returns a shallow copy of the node with `[id]: true`
 * merged into a copy of its `extras`; the input node is not mutated.
 * NOTE(review): the "chunk" flag presumably tells the stream processor the
 * extension may run per chunk — confirm against glost-extensions.
 *
 * @param id - Extension id; also used as the key written into `extras`.
 * @returns The extension object (`id`, `name`, `streamingSupport`,
 *   `visit.word`).
 */
function createChunkVisitExtension(id: string): GLOSTExtension {
  const extension: GLOSTExtension = {
    id,
    name: `Chunk Visit Extension ${id}`,
    streamingSupport: "chunk",
    visit: {
      word: (wordNode) => ({
        ...wordNode,
        extras: { ...wordNode.extras, [id]: true },
      }),
    },
  };
  return extension;
}
|
|
102
|
+
|
|
103
|
+
// ============================================================================
|
|
104
|
+
// Benchmark: medium documents (for baseline comparison)
|
|
105
|
+
// ============================================================================
|
|
106
|
+
|
|
107
|
+
describe("Streaming vs Batch: Medium Documents", () => {
  // Fixture built once for the group: 100 sentences x 10 words = 1K words.
  const mediumDoc = makeDocumentBySentences(100, 10);
  const batchExtension = createVisitExtension("test");
  const streamExtension = createChunkVisitExtension("chunk-test");

  // Baseline: a single GLOSTProcessor.process() call over the whole document.
  bench("batch — 1K words (GLOSTProcessor)", async () => {
    const processor = new GLOSTProcessor().use(batchExtension);
    await processor.process(mediumDoc);
  });

  // Streaming runs, registered in the order listed.
  const streamCases = [
    ["stream — 1K words, batchSize=50", 50],
    ["stream — 1K words, batchSize=10", 10],
    ["stream — 1K words, batchSize=100", 100],
  ] as const;

  for (const [title, batchSize] of streamCases) {
    bench(title, async () => {
      const processor = new GLOSTStreamProcessor().use(streamExtension);
      for await (const _chunk of processor.stream(mediumDoc, { batchSize })) {
        // consume: chunks are drained, not inspected
      }
    });
  }
});
|
|
138
|
+
|
|
139
|
+
// ============================================================================
|
|
140
|
+
// Benchmark: 10K word documents
|
|
141
|
+
// ============================================================================
|
|
142
|
+
|
|
143
|
+
describe("Streaming: 10K Word Documents", () => {
  // Fixture built once for the group: 1000 sentences x 10 words = 10K words.
  const largeDoc = makeDocumentBySentences(1000, 10);
  const batchExtension = createVisitExtension("test");
  const streamExtension = createChunkVisitExtension("chunk-test");

  // Baseline: a single GLOSTProcessor.process() call over the whole document.
  bench("batch — 10K words (GLOSTProcessor)", async () => {
    const processor = new GLOSTProcessor().use(batchExtension);
    await processor.process(largeDoc);
  });

  // Streaming runs, registered in the order listed.
  const streamCases = [
    { title: "stream — 10K words, batchSize=50", batchSize: 50 },
    { title: "stream — 10K words, batchSize=100", batchSize: 100 },
    { title: "stream — 10K words, batchSize=500", batchSize: 500 },
  ];

  streamCases.forEach(({ title, batchSize }) => {
    bench(title, async () => {
      const processor = new GLOSTStreamProcessor().use(streamExtension);
      for await (const _chunk of processor.stream(largeDoc, { batchSize })) {
        // consume: chunks are drained, not inspected
      }
    });
  });
});
|
|
179
|
+
|
|
180
|
+
// ============================================================================
|
|
181
|
+
// Benchmark: 100K word documents
|
|
182
|
+
// ============================================================================
|
|
183
|
+
|
|
184
|
+
describe("Streaming: 100K Word Documents", () => {
  // Fixture built once for the group: 10000 sentences x 10 words = 100K
  // words, requested as 100 paragraphs (100 sentences per paragraph).
  const hugeDoc = makeDocumentBySentences(10_000, 10, 100);
  const batchExtension = createVisitExtension("test");
  const streamExtension = createChunkVisitExtension("chunk-test");

  // Baseline: a single GLOSTProcessor.process() call over the whole document.
  bench("batch — 100K words (GLOSTProcessor)", async () => {
    const processor = new GLOSTProcessor().use(batchExtension);
    await processor.process(hugeDoc);
  });

  // Streaming runs, registered in the order listed.
  const streamCases = [
    ["stream — 100K words, batchSize=50", 50],
    ["stream — 100K words, batchSize=500", 500],
    ["stream — 100K words, batchSize=1000", 1000],
  ] as const;

  for (const [title, batchSize] of streamCases) {
    bench(title, async () => {
      const processor = new GLOSTStreamProcessor().use(streamExtension);
      for await (const _chunk of processor.stream(hugeDoc, { batchSize })) {
        // consume: chunks are drained, not inspected
      }
    });
  }
});
|
|
222
|
+
|
|
223
|
+
// ============================================================================
|
|
224
|
+
// Benchmark: early termination via break
|
|
225
|
+
// ============================================================================
|
|
226
|
+
|
|
227
|
+
describe("Streaming: Early Termination", () => {
  // Fixture built once for the group: 1000 sentences x 10 words = 10K words.
  const doc10k = makeDocumentBySentences(1000, 10);
  const chunkExt = createChunkVisitExtension("chunk-test");

  // Measures the cost of producing just the first chunk. Leaving a
  // `for await` loop with `break` closes the async iterator, so no further
  // chunks are requested from the stream.
  bench("stream — cancel after first chunk (lazy win)", async () => {
    const proc = new GLOSTStreamProcessor().use(chunkExt);
    // `_chunk`: the binding is intentionally unused, matching the naming
    // used by every other consuming loop in this file.
    for await (const _chunk of proc.stream(doc10k, { batchSize: 50 })) {
      break; // Only consume first chunk
    }
  });

  // Stops after 10 chunks of batchSize 50. The "full batch" figure to
  // compare against is the "batch — 10K words (GLOSTProcessor)" bench in the
  // "Streaming: 10K Word Documents" group, which uses the same 1000 x 10
  // document shape.
  bench(
    "stream — cancel after 10 chunks vs full batch",
    async () => {
      const proc = new GLOSTStreamProcessor().use(chunkExt);
      let count = 0;
      for await (const _chunk of proc.stream(doc10k, {
        batchSize: 50,
      })) {
        if (++count >= 10) break;
      }
    },
  );
});
|