@eeacms/volto-eea-chatbot 1.0.11 → 1.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -2
- package/README.md +8 -8
- package/jest-addon.config.js +1 -1
- package/package.json +2 -1
- package/src/ChatBlock/chat/AIMessage.tsx +32 -25
- package/src/ChatBlock/chat/ChatWindow.tsx +1 -0
- package/src/ChatBlock/components/HalloumiFeedback.jsx +8 -4
- package/src/ChatBlock/components/markdown/ClaimModal.jsx +1 -1
- package/src/ChatBlock/components/markdown/ClaimSegments.jsx +2 -3
- package/src/ChatBlock/components/markdown/RenderClaimView.jsx +1 -1
- package/src/ChatBlock/components/markdown/index.js +41 -15
- package/src/ChatBlock/hooks/useChatController.ts +0 -15
- package/src/ChatBlock/hooks/useQualityMarkers.js +0 -11
- package/src/ChatBlock/style.less +47 -0
- package/src/ChatBlock/types/interfaces.ts +1 -0
- package/src/halloumi/filtering.js +149 -0
- package/src/halloumi/filtering.test.js +44 -0
- package/src/halloumi/generative.js +157 -53
- package/src/halloumi/generative.test.js +28 -8
- package/src/halloumi/markdown-splitter.js +172 -0
- package/src/halloumi/markdown-splitter.test.js +133 -0
- package/src/halloumi/middleware.js +6 -6
- package/src/halloumi/postprocessing.js +0 -26
- package/src/halloumi/preprocessing.js +78 -76
- package/src/halloumi/preprocessing.test.js +87 -148
- package/src/middleware.js +3 -0
- package/src/middleware.test.js +2 -0
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
import debug from 'debug';
|
|
2
|
-
import fetch from 'node-fetch';
|
|
3
2
|
import fs from 'fs';
|
|
4
3
|
import {
|
|
5
4
|
getClaimsFromResponse,
|
|
6
5
|
getTokenProbabilitiesFromLogits,
|
|
7
6
|
} from './postprocessing';
|
|
8
|
-
import {
|
|
9
|
-
|
|
10
|
-
|
|
7
|
+
import { createChunkedHalloumiPrompts, getOffsets } from './preprocessing';
|
|
8
|
+
import { splitMarkdown, splitProse } from './markdown-splitter';
|
|
9
|
+
import { callLLM, excludeClaimSentences } from './filtering';
|
|
11
10
|
|
|
12
11
|
const log = debug('halloumi');
|
|
13
12
|
|
|
13
|
+
const tokenChoices = new Set(['supported', 'unsupported']);
|
|
14
|
+
|
|
14
15
|
function sigmoid(x) {
|
|
15
16
|
return 1 / (1 + Math.exp(-x));
|
|
16
17
|
}
|
|
@@ -21,45 +22,143 @@ export function applyPlattScaling(platt, probability) {
|
|
|
21
22
|
return sigmoid(-1 * (platt.a * log_prob + platt.b));
|
|
22
23
|
}
|
|
23
24
|
|
|
25
|
+
/**
|
|
26
|
+
* Merges claims from multiple chunked HallOumi responses.
|
|
27
|
+
* For each response sentence (claimId), combines segment citations
|
|
28
|
+
* and takes the max supported score.
|
|
29
|
+
*/
|
|
30
|
+
function mergeChunkClaims(chunkResults) {
|
|
31
|
+
const claimMap = new Map();
|
|
32
|
+
|
|
33
|
+
for (const claims of chunkResults) {
|
|
34
|
+
for (const claim of claims) {
|
|
35
|
+
if (!claimMap.has(claim.claimId)) {
|
|
36
|
+
claimMap.set(claim.claimId, { ...claim });
|
|
37
|
+
} else {
|
|
38
|
+
const existing = claimMap.get(claim.claimId);
|
|
39
|
+
existing.segments.push(...claim.segments);
|
|
40
|
+
// Keep the result with the higher supported score
|
|
41
|
+
const existingScore = existing.probabilities.get('supported') || 0;
|
|
42
|
+
const newScore = claim.probabilities.get('supported') || 0;
|
|
43
|
+
if (newScore > existingScore) {
|
|
44
|
+
existing.probabilities = claim.probabilities;
|
|
45
|
+
existing.explanation = claim.explanation;
|
|
46
|
+
existing.supported = claim.supported;
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
return Array.from(claimMap.values());
|
|
53
|
+
}
|
|
54
|
+
|
|
24
55
|
export async function getVerifyClaimResponse(
|
|
25
56
|
model,
|
|
26
57
|
sources,
|
|
27
|
-
|
|
28
|
-
|
|
58
|
+
answer,
|
|
59
|
+
{ ip } = {},
|
|
29
60
|
) {
|
|
30
|
-
|
|
31
|
-
|
|
61
|
+
const emptyResponse = {
|
|
62
|
+
claims: [],
|
|
63
|
+
segments: {},
|
|
64
|
+
};
|
|
65
|
+
if (!sources?.length || !answer) {
|
|
66
|
+
return { ...emptyResponse, reason: 'Context is empty' };
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// Split sentences
|
|
70
|
+
const responseSentences = splitMarkdown(answer);
|
|
71
|
+
const responseOffsets = getOffsets(answer, responseSentences);
|
|
72
|
+
|
|
73
|
+
// Determine which response sentences to exclude from verification
|
|
74
|
+
const [excludeResponseIndices] = await Promise.all([
|
|
75
|
+
excludeClaimSentences(responseSentences, { ip }),
|
|
76
|
+
]);
|
|
32
77
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
78
|
+
const contextSentences = [];
|
|
79
|
+
const indexedContextSentences = sources.reduce((acc, text, sourceIdx) => {
|
|
80
|
+
const sentences = splitProse(text, 50).map((sentence, sentenceIdx) => {
|
|
81
|
+
const globalId = acc.length + sentenceIdx + 1;
|
|
82
|
+
contextSentences.push(sentence);
|
|
83
|
+
return {
|
|
84
|
+
sentence,
|
|
85
|
+
sourceId: sourceIdx + 1,
|
|
86
|
+
globalId,
|
|
87
|
+
};
|
|
88
|
+
});
|
|
89
|
+
acc.push(...sentences);
|
|
90
|
+
return acc;
|
|
91
|
+
}, []);
|
|
92
|
+
const joinedContext = sources.join('');
|
|
93
|
+
const contextOffsets = getOffsets(joinedContext, contextSentences);
|
|
94
|
+
|
|
95
|
+
if (excludeResponseIndices.size === responseSentences.length) {
|
|
96
|
+
log('All response sentences excluded');
|
|
97
|
+
return {
|
|
98
|
+
...emptyResponse,
|
|
99
|
+
empty: 'Claims in the document could not be verified',
|
|
37
100
|
};
|
|
38
|
-
return response;
|
|
39
101
|
}
|
|
40
|
-
|
|
41
|
-
const
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
request:
|
|
102
|
+
log('Excluded response indices', excludeResponseIndices);
|
|
103
|
+
const { prompts } = createChunkedHalloumiPrompts({
|
|
104
|
+
indexedContextSentences,
|
|
105
|
+
responseSentences,
|
|
106
|
+
responseOffsets,
|
|
107
|
+
request: null,
|
|
108
|
+
excludeResponseIndices,
|
|
46
109
|
});
|
|
47
110
|
|
|
48
|
-
log(
|
|
111
|
+
log(`Split into ${prompts.length} chunk(s)`);
|
|
112
|
+
|
|
113
|
+
// Run all chunks in parallel
|
|
114
|
+
const chunkResults = await Promise.all(
|
|
115
|
+
prompts.map((chunkPrompt, i) => {
|
|
116
|
+
log(`Chunk ${i + 1} request`);
|
|
117
|
+
return halloumiGenerativeAPI(model, chunkPrompt, { ip });
|
|
118
|
+
}),
|
|
119
|
+
);
|
|
120
|
+
|
|
121
|
+
// Merge raw claims across chunks
|
|
122
|
+
const rawClaims = mergeChunkClaims(chunkResults);
|
|
123
|
+
|
|
124
|
+
const mergedPrompt = {
|
|
125
|
+
contextOffsets,
|
|
126
|
+
responseOffsets,
|
|
127
|
+
joinedContext,
|
|
128
|
+
};
|
|
129
|
+
const converted = convertGenerativesClaimToVerifyClaimResponse(
|
|
130
|
+
rawClaims,
|
|
131
|
+
mergedPrompt,
|
|
132
|
+
);
|
|
133
|
+
|
|
134
|
+
if (excludeResponseIndices.size > 0) {
|
|
135
|
+
for (const idx of excludeResponseIndices) {
|
|
136
|
+
if (responseOffsets.has(idx)) {
|
|
137
|
+
const offsets = responseOffsets.get(idx);
|
|
138
|
+
converted.claims.push({
|
|
139
|
+
claimId: idx,
|
|
140
|
+
claimString: responseSentences[idx - 1],
|
|
141
|
+
startOffset: offsets.startOffset,
|
|
142
|
+
endOffset: offsets.endOffset,
|
|
143
|
+
skipped: true,
|
|
144
|
+
score: null,
|
|
145
|
+
});
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
converted.claims.sort((a, b) => a.startOffset - b.startOffset);
|
|
149
|
+
}
|
|
49
150
|
|
|
50
|
-
const rawClaims = await halloumiGenerativeAPI(model, prompt);
|
|
51
|
-
log('Raw claims', rawClaims);
|
|
52
151
|
const result = {
|
|
53
|
-
...
|
|
152
|
+
...converted,
|
|
54
153
|
rawClaims,
|
|
55
|
-
|
|
154
|
+
...(prompts.length === 1
|
|
155
|
+
? { halloumiPrompt: prompts[0] }
|
|
156
|
+
: { halloumiPrompts: prompts }),
|
|
56
157
|
};
|
|
57
158
|
|
|
58
159
|
return result;
|
|
59
160
|
}
|
|
60
161
|
|
|
61
|
-
const tokenChoices = new Set(['supported', 'unsupported']);
|
|
62
|
-
|
|
63
162
|
/**
|
|
64
163
|
* Fetches a response from the LLM.
|
|
65
164
|
*
|
|
@@ -72,7 +171,7 @@ const tokenChoices = new Set(['supported', 'unsupported']);
|
|
|
72
171
|
* - `DUMP_HALLOUMI_REQ_FILE_PATH`: If set, the LLM request (URL and parameters) is dumped to the specified file path.
|
|
73
172
|
* - `DUMP_HALLOUMI_FILE_PATH`: If set, the LLM response is dumped to the specified file path.
|
|
74
173
|
*/
|
|
75
|
-
async function getLLMResponse(model, prompt) {
|
|
174
|
+
async function getLLMResponse(model, prompt, { ip } = {}) {
|
|
76
175
|
let jsonData;
|
|
77
176
|
|
|
78
177
|
if (process.env.MOCK_HALLOUMI_FILE_PATH) {
|
|
@@ -90,34 +189,17 @@ async function getLLMResponse(model, prompt) {
|
|
|
90
189
|
logprobs: true,
|
|
91
190
|
top_logprobs: 3,
|
|
92
191
|
};
|
|
93
|
-
const headers = {
|
|
94
|
-
'Content-Type': 'application/json',
|
|
95
|
-
accept: 'application/json',
|
|
96
|
-
};
|
|
97
|
-
if (model.apiKey) {
|
|
98
|
-
headers['Authorization'] = `Bearer ${model.apiKey}`;
|
|
99
|
-
}
|
|
100
192
|
|
|
101
|
-
const params = {
|
|
102
|
-
method: 'POST',
|
|
103
|
-
headers: headers,
|
|
104
|
-
body: JSON.stringify(data),
|
|
105
|
-
};
|
|
106
193
|
if (process.env.DUMP_HALLOUMI_REQ_FILE_PATH) {
|
|
107
194
|
const filePath = process.env.DUMP_HALLOUMI_REQ_FILE_PATH;
|
|
108
195
|
fs.writeFileSync(
|
|
109
196
|
filePath,
|
|
110
|
-
JSON.stringify(
|
|
111
|
-
{ url: model.apiUrl, params: { ...params, body: data } },
|
|
112
|
-
null,
|
|
113
|
-
2,
|
|
114
|
-
),
|
|
197
|
+
JSON.stringify({ url: model.apiUrl, body: data }, null, 2),
|
|
115
198
|
);
|
|
116
|
-
log(`Dumped halloumi
|
|
199
|
+
log(`Dumped halloumi request: ${filePath}`);
|
|
117
200
|
}
|
|
118
201
|
|
|
119
|
-
|
|
120
|
-
jsonData = await response.json();
|
|
202
|
+
jsonData = await callLLM(model.apiUrl, model.apiKey, data, { ip });
|
|
121
203
|
|
|
122
204
|
if (process.env.DUMP_HALLOUMI_FILE_PATH) {
|
|
123
205
|
const filePath = process.env.DUMP_HALLOUMI_FILE_PATH;
|
|
@@ -133,11 +215,19 @@ async function getLLMResponse(model, prompt) {
|
|
|
133
215
|
* @param response A string containing all claims and their information.
|
|
134
216
|
* @returns A list of claim objects.
|
|
135
217
|
*/
|
|
136
|
-
export async function halloumiGenerativeAPI(model, prompt) {
|
|
137
|
-
const jsonData = await getLLMResponse(model, prompt);
|
|
218
|
+
export async function halloumiGenerativeAPI(model, prompt, { ip } = {}) {
|
|
219
|
+
const jsonData = await getLLMResponse(model, prompt, { ip });
|
|
220
|
+
|
|
221
|
+
// Todo: restore log
|
|
222
|
+
// log('Generative response', jsonData);
|
|
138
223
|
|
|
139
|
-
|
|
140
|
-
|
|
224
|
+
const finishReason = jsonData.choices?.[0]?.finish_reason;
|
|
225
|
+
if (finishReason === 'length') {
|
|
226
|
+
throw new Error('HallOumi response truncated (finish_reason: length)');
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// Todo: restore log
|
|
230
|
+
// log('Logprobs', jsonData.choices[0].logprobs.content);
|
|
141
231
|
|
|
142
232
|
const logits = jsonData.choices[0].logprobs.content;
|
|
143
233
|
const tokenProbabilities = getTokenProbabilitiesFromLogits(
|
|
@@ -149,11 +239,20 @@ export async function halloumiGenerativeAPI(model, prompt) {
|
|
|
149
239
|
);
|
|
150
240
|
|
|
151
241
|
if (parsedResponse.length !== tokenProbabilities.length) {
|
|
152
|
-
|
|
242
|
+
log(
|
|
243
|
+
'Warning: token probabilities (%d) and claims (%d) do not match — using available probabilities, defaulting remainder to 0.5',
|
|
244
|
+
tokenProbabilities.length,
|
|
245
|
+
parsedResponse.length,
|
|
246
|
+
);
|
|
153
247
|
}
|
|
154
248
|
|
|
249
|
+
const defaultScoreMap = new Map([
|
|
250
|
+
['supported', 0.5],
|
|
251
|
+
['unsupported', 0.5],
|
|
252
|
+
]);
|
|
253
|
+
|
|
155
254
|
for (let i = 0; i < parsedResponse.length; i++) {
|
|
156
|
-
const scoreMap = tokenProbabilities[i];
|
|
255
|
+
const scoreMap = tokenProbabilities[i] ?? new Map(defaultScoreMap);
|
|
157
256
|
if (model.plattScaling) {
|
|
158
257
|
const platt = model.plattScaling;
|
|
159
258
|
const unsupportedScore = applyPlattScaling(
|
|
@@ -183,12 +282,17 @@ export function convertGenerativesClaimToVerifyClaimResponse(
|
|
|
183
282
|
id,
|
|
184
283
|
startOffset: offset[1].startOffset,
|
|
185
284
|
endOffset: offset[1].endOffset,
|
|
285
|
+
text: prompt.joinedContext.slice(
|
|
286
|
+
offset[1].startOffset,
|
|
287
|
+
offset[1].endOffset,
|
|
288
|
+
),
|
|
186
289
|
};
|
|
187
290
|
}
|
|
188
291
|
|
|
189
292
|
for (const generativeClaim of generativeClaims) {
|
|
190
293
|
const segmentIds = [];
|
|
191
294
|
for (const seg of generativeClaim.segments) {
|
|
295
|
+
if (!seg) continue;
|
|
192
296
|
segmentIds.push(seg.toString());
|
|
193
297
|
}
|
|
194
298
|
|
|
@@ -77,17 +77,29 @@ describe('applyPlattScaling', () => {
|
|
|
77
77
|
describe('getVerifyClaimResponse', () => {
|
|
78
78
|
it('returns empty response when sources is empty', async () => {
|
|
79
79
|
const result = await getVerifyClaimResponse({}, [], 'claims');
|
|
80
|
-
expect(result).toEqual({
|
|
80
|
+
expect(result).toEqual({
|
|
81
|
+
claims: [],
|
|
82
|
+
segments: {},
|
|
83
|
+
reason: 'Context is empty',
|
|
84
|
+
});
|
|
81
85
|
});
|
|
82
86
|
|
|
83
87
|
it('returns empty response when sources is null', async () => {
|
|
84
88
|
const result = await getVerifyClaimResponse({}, null, 'claims');
|
|
85
|
-
expect(result).toEqual({
|
|
89
|
+
expect(result).toEqual({
|
|
90
|
+
claims: [],
|
|
91
|
+
segments: {},
|
|
92
|
+
reason: 'Context is empty',
|
|
93
|
+
});
|
|
86
94
|
});
|
|
87
95
|
|
|
88
96
|
it('returns empty response when claims is falsy', async () => {
|
|
89
97
|
const result = await getVerifyClaimResponse({}, ['source'], null);
|
|
90
|
-
expect(result).toEqual({
|
|
98
|
+
expect(result).toEqual({
|
|
99
|
+
claims: [],
|
|
100
|
+
segments: {},
|
|
101
|
+
reason: 'Context is empty',
|
|
102
|
+
});
|
|
91
103
|
});
|
|
92
104
|
});
|
|
93
105
|
|
|
@@ -257,7 +269,7 @@ describe('halloumiGenerativeAPI via real fetch', () => {
|
|
|
257
269
|
expect(callHeaders.Authorization).toBeUndefined();
|
|
258
270
|
});
|
|
259
271
|
|
|
260
|
-
it('
|
|
272
|
+
it('defaults to 0.5 when token probabilities and claims do not match', async () => {
|
|
261
273
|
jest.doMock('./postprocessing', () => ({
|
|
262
274
|
getClaimsFromResponse: jest.fn(() => [
|
|
263
275
|
{ claimId: 1, claimString: 'Claim 1' },
|
|
@@ -290,9 +302,11 @@ describe('halloumiGenerativeAPI via real fetch', () => {
|
|
|
290
302
|
responseOffsets: new Map(),
|
|
291
303
|
};
|
|
292
304
|
|
|
293
|
-
await
|
|
294
|
-
|
|
295
|
-
);
|
|
305
|
+
const result = await halloumiGenerativeAPI(model, prompt);
|
|
306
|
+
// First claim gets the available probability
|
|
307
|
+
expect(result[0].probabilities.get('supported')).toBe(0.9);
|
|
308
|
+
// Second claim defaults to 0.5
|
|
309
|
+
expect(result[1].probabilities.get('supported')).toBe(0.5);
|
|
296
310
|
});
|
|
297
311
|
});
|
|
298
312
|
|
|
@@ -316,6 +330,7 @@ describe('convertGenerativesClaimToVerifyClaimResponse', () => {
|
|
|
316
330
|
const prompt = {
|
|
317
331
|
contextOffsets: new Map([[1, { startOffset: 0, endOffset: 10 }]]),
|
|
318
332
|
responseOffsets: new Map([[1, { startOffset: 100, endOffset: 120 }]]),
|
|
333
|
+
joinedContext: 'Test conte',
|
|
319
334
|
};
|
|
320
335
|
|
|
321
336
|
const result = convertGenerativesClaimToVerifyClaimResponse(
|
|
@@ -336,7 +351,12 @@ describe('convertGenerativesClaimToVerifyClaimResponse', () => {
|
|
|
336
351
|
},
|
|
337
352
|
],
|
|
338
353
|
segments: {
|
|
339
|
-
1: {
|
|
354
|
+
1: {
|
|
355
|
+
id: '1',
|
|
356
|
+
startOffset: 0,
|
|
357
|
+
endOffset: 10,
|
|
358
|
+
text: 'Test conte',
|
|
359
|
+
},
|
|
340
360
|
},
|
|
341
361
|
});
|
|
342
362
|
});
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import nlp from 'compromise';
|
|
2
|
+
|
|
3
|
+
const MIN_SENTENCE_LENGTH = 15;
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Splits markdown text into segments meaningful for fact-checking.
|
|
7
|
+
*
|
|
8
|
+
* Strategy:
|
|
9
|
+
* 1. Split into markdown blocks (paragraphs, list items, table rows, headers)
|
|
10
|
+
* 2. For prose blocks, further split into sentences using the compromise NLP library
|
|
11
|
+
* 3. Merge short fragments (< MIN_SENTENCE_LENGTH) into the next sentence
|
|
12
|
+
*
|
|
13
|
+
* Structural blocks (table rows, headers, list items) are kept as-is since
|
|
14
|
+
* they are already atomic units.
|
|
15
|
+
*
|
|
16
|
+
* @param {string} text Markdown text to split.
|
|
17
|
+
* @returns {string[]} Array of segment strings.
|
|
18
|
+
*/
|
|
19
|
+
export function splitMarkdown(text) {
|
|
20
|
+
const blocks = splitIntoBlocks(text);
|
|
21
|
+
const segments = [];
|
|
22
|
+
|
|
23
|
+
for (const block of blocks) {
|
|
24
|
+
if (isStructuralBlock(block)) {
|
|
25
|
+
if (block.trim().length > 0) {
|
|
26
|
+
segments.push(block);
|
|
27
|
+
}
|
|
28
|
+
} else {
|
|
29
|
+
segments.push(...splitProse(block));
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
return segments;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* Splits markdown text into structural blocks.
|
|
38
|
+
* Separates: headers, table rows, list items, horizontal rules, and prose.
|
|
39
|
+
* Prose lines within the same paragraph are joined together.
|
|
40
|
+
*/
|
|
41
|
+
function splitIntoBlocks(text) {
|
|
42
|
+
const lines = text.split('\n');
|
|
43
|
+
const blocks = [];
|
|
44
|
+
let currentProse = '';
|
|
45
|
+
|
|
46
|
+
const flushProse = () => {
|
|
47
|
+
if (currentProse) {
|
|
48
|
+
blocks.push(currentProse);
|
|
49
|
+
currentProse = '';
|
|
50
|
+
}
|
|
51
|
+
};
|
|
52
|
+
|
|
53
|
+
for (const line of lines) {
|
|
54
|
+
const trimmed = line.trimStart();
|
|
55
|
+
|
|
56
|
+
if (isTableRow(trimmed)) {
|
|
57
|
+
flushProse();
|
|
58
|
+
blocks.push(line);
|
|
59
|
+
} else if (isTableSeparator(trimmed)) {
|
|
60
|
+
flushProse();
|
|
61
|
+
// Skip separator rows — not verifiable content
|
|
62
|
+
} else if (isHeader(trimmed)) {
|
|
63
|
+
flushProse();
|
|
64
|
+
blocks.push(line);
|
|
65
|
+
} else if (isHorizontalRule(trimmed)) {
|
|
66
|
+
flushProse();
|
|
67
|
+
// Skip horizontal rules
|
|
68
|
+
} else if (isListItem(trimmed)) {
|
|
69
|
+
flushProse();
|
|
70
|
+
blocks.push(line);
|
|
71
|
+
} else if (trimmed === '') {
|
|
72
|
+
flushProse();
|
|
73
|
+
} else {
|
|
74
|
+
// Prose continuation — join into a single paragraph
|
|
75
|
+
currentProse += (currentProse ? ' ' : '') + line.trim();
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
flushProse();
|
|
80
|
+
return blocks;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
function isTableRow(line) {
|
|
84
|
+
return /^\|.*\|/.test(line) && !isTableSeparator(line);
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
function isTableSeparator(line) {
|
|
88
|
+
return /^\|[\s\-:|]+\|$/.test(line);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
function isHeader(line) {
|
|
92
|
+
return /^#{1,6}\s/.test(line);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
function isHorizontalRule(line) {
|
|
96
|
+
return /^(\*{3,}|-{3,}|_{3,})\s*$/.test(line);
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
function isListItem(line) {
|
|
100
|
+
return /^(\d+\.\s+|[-*+]\s+)/.test(line);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Returns true if a block is structural (table row, header, list item)
|
|
105
|
+
* and should not be further split into sentences.
|
|
106
|
+
*/
|
|
107
|
+
function isStructuralBlock(block) {
|
|
108
|
+
const trimmed = block.trimStart();
|
|
109
|
+
return isTableRow(trimmed) || isHeader(trimmed) || isListItem(trimmed);
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* Splits a prose paragraph into sentences using the compromise NLP library,
|
|
114
|
+
* merging short fragments that aren't independently verifiable.
|
|
115
|
+
* Abbreviation handling (Dr., Mr., U.S., Ph.D.) is delegated to the sentence splitter.
|
|
116
|
+
*/
|
|
117
|
+
export function splitProse(text, maxSentences = 0) {
|
|
118
|
+
// const segments = segment('en', text);
|
|
119
|
+
const doc = nlp(text);
|
|
120
|
+
const initialSentences = doc.sentences().out('array');
|
|
121
|
+
|
|
122
|
+
// Find each sentence's position in the original text
|
|
123
|
+
const positions = getPositions(text, initialSentences);
|
|
124
|
+
|
|
125
|
+
// Merge short sentences (< MIN_SENTENCE_LENGTH) into the next sentence
|
|
126
|
+
const merged = [];
|
|
127
|
+
const mergedPositions = [];
|
|
128
|
+
let pendingStart = null;
|
|
129
|
+
for (let i = 0; i < initialSentences.length; i++) {
|
|
130
|
+
if (pendingStart === null) {
|
|
131
|
+
pendingStart = positions[i].start;
|
|
132
|
+
}
|
|
133
|
+
if (
|
|
134
|
+
initialSentences[i].replaceAll('\n', '').length < MIN_SENTENCE_LENGTH &&
|
|
135
|
+
i < initialSentences.length - 1
|
|
136
|
+
) {
|
|
137
|
+
// Too short — will be merged with the next sentence
|
|
138
|
+
continue;
|
|
139
|
+
}
|
|
140
|
+
const s = text.slice(pendingStart, positions[i].end);
|
|
141
|
+
merged.push(s);
|
|
142
|
+
mergedPositions.push({ start: pendingStart, end: positions[i].end });
|
|
143
|
+
pendingStart = null;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
if (maxSentences && merged.length > maxSentences) {
|
|
147
|
+
// Merge groups by slicing original text to preserve separators
|
|
148
|
+
const groupSize = Math.ceil(merged.length / maxSentences);
|
|
149
|
+
const groupedSentences = [];
|
|
150
|
+
for (let i = 0; i < merged.length; i += groupSize) {
|
|
151
|
+
const groupStart = mergedPositions[i].start;
|
|
152
|
+
const groupEnd =
|
|
153
|
+
mergedPositions[Math.min(i + groupSize, merged.length) - 1].end;
|
|
154
|
+
groupedSentences.push(text.slice(groupStart, groupEnd));
|
|
155
|
+
}
|
|
156
|
+
return groupedSentences;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
return merged;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
function getPositions(text, sentences) {
|
|
163
|
+
const positions = [];
|
|
164
|
+
let searchFrom = 0;
|
|
165
|
+
for (const sentence of sentences) {
|
|
166
|
+
const start = text.indexOf(sentence, searchFrom);
|
|
167
|
+
const end = start + sentence.length;
|
|
168
|
+
positions.push({ start, end });
|
|
169
|
+
searchFrom = end;
|
|
170
|
+
}
|
|
171
|
+
return positions;
|
|
172
|
+
}
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import { splitMarkdown } from './markdown-splitter';
|
|
2
|
+
|
|
3
|
+
const DUMMY_LLM_RESPONSE = `## European species most at risk
|
|
4
|
+
|
|
5
|
+
Below is a concise, evidence-based snapshot of the taxonomic groups and flagship species. The list combines information on trend magnitude and risk status.
|
|
6
|
+
|
|
7
|
+
| Taxonomic group | Species at risk | Main drivers | Signal |
|
|
8
|
+
|-----------------|-----------------|--------------|--------|
|
|
9
|
+
| **Birds** | Skylark (*Alauda arvensis*), European turtle-dove | Intensive agriculture, pesticide use | Farmland-bird index down 27% (1990-2019) |
|
|
10
|
+
| **Fish** | Houting (*Coregonus oxyrhynchus*), Danube salmon | River regulation, dam construction | Both listed as critically endangered |
|
|
11
|
+
| **Butterflies** | Large blue (*Phengaris arion*), Adonis blue | Loss of semi-natural grasslands | Grassland-butterfly index down 25% |
|
|
12
|
+
|
|
13
|
+
### Key take-aways
|
|
14
|
+
|
|
15
|
+
1. **Agricultural intensification** is the single biggest pressure across taxa.
|
|
16
|
+
2. **Habitat loss and degradation** underlie the perilous status of many specialists.
|
|
17
|
+
3. **Chemical stressors** affect both insects and the birds that depend on them.
|
|
18
|
+
4. **Climate change** amplifies existing pressures such as drought stress on amphibians.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
### Conservation priorities
|
|
23
|
+
|
|
24
|
+
- **Protect and restore semi-natural habitats** including high-diversity grasslands and river floodplains.
|
|
25
|
+
- **Implement agri-environment schemes** that limit pesticide use and maintain field margins.
|
|
26
|
+
- **Focus monitoring on flagship species** to gauge the effectiveness of policy actions.
|
|
27
|
+
|
|
28
|
+
If you need a deeper dive into a particular taxon, just let me know!`;
|
|
29
|
+
|
|
30
|
+
describe('splitMarkdown', () => {
|
|
31
|
+
it('should split the dummy LLM response into meaningful segments', () => {
|
|
32
|
+
const segments = splitMarkdown(DUMMY_LLM_RESPONSE);
|
|
33
|
+
|
|
34
|
+
// Headers are separate segments
|
|
35
|
+
expect(segments).toContain('## European species most at risk');
|
|
36
|
+
expect(segments).toContain('### Key take-aways');
|
|
37
|
+
expect(segments).toContain('### Conservation priorities');
|
|
38
|
+
|
|
39
|
+
// Table rows are individual segments (separator skipped)
|
|
40
|
+
const tableRows = segments.filter((s) => s.startsWith('|'));
|
|
41
|
+
expect(tableRows.length).toBe(4); // header row + 3 data rows
|
|
42
|
+
|
|
43
|
+
// Numbered list items are separate
|
|
44
|
+
const numberedItems = segments.filter((s) => /^\d+\./.test(s.trimStart()));
|
|
45
|
+
expect(numberedItems.length).toBe(4);
|
|
46
|
+
|
|
47
|
+
// Bullet list items are separate
|
|
48
|
+
const bulletItems = segments.filter((s) => /^-\s/.test(s.trimStart()));
|
|
49
|
+
expect(bulletItems.length).toBe(3);
|
|
50
|
+
|
|
51
|
+
// Prose paragraphs are split into sentences
|
|
52
|
+
expect(segments).toContainEqual(
|
|
53
|
+
expect.stringContaining('evidence-based snapshot'),
|
|
54
|
+
);
|
|
55
|
+
expect(segments).toContainEqual(expect.stringContaining('trend magnitude'));
|
|
56
|
+
|
|
57
|
+
// Horizontal rules are skipped
|
|
58
|
+
expect(segments).not.toContainEqual(expect.stringMatching(/^-{3,}$/));
|
|
59
|
+
|
|
60
|
+
// No empty segments
|
|
61
|
+
expect(segments.every((s) => s.trim().length > 0)).toBe(true);
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
it('should handle plain prose without markdown', () => {
|
|
65
|
+
const text =
|
|
66
|
+
'This is sentence one. This is sentence two. This is sentence three.';
|
|
67
|
+
const segments = splitMarkdown(text);
|
|
68
|
+
expect(segments).toEqual([
|
|
69
|
+
'This is sentence one.',
|
|
70
|
+
'This is sentence two.',
|
|
71
|
+
'This is sentence three.',
|
|
72
|
+
]);
|
|
73
|
+
});
|
|
74
|
+
|
|
75
|
+
it('should handle headers followed by prose', () => {
|
|
76
|
+
const text = '## My Header\n\nSome prose paragraph here.';
|
|
77
|
+
const segments = splitMarkdown(text);
|
|
78
|
+
expect(segments).toEqual(['## My Header', 'Some prose paragraph here.']);
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
it('should skip table separator rows', () => {
|
|
82
|
+
const text = '| A | B |\n|---|---|\n| 1 | 2 |';
|
|
83
|
+
const segments = splitMarkdown(text);
|
|
84
|
+
expect(segments).toEqual(['| A | B |', '| 1 | 2 |']);
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
it('should handle numbered lists', () => {
|
|
88
|
+
const text =
|
|
89
|
+
'1. First item with enough text.\n2. Second item also long enough.';
|
|
90
|
+
const segments = splitMarkdown(text);
|
|
91
|
+
expect(segments).toEqual([
|
|
92
|
+
'1. First item with enough text.',
|
|
93
|
+
'2. Second item also long enough.',
|
|
94
|
+
]);
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
it('should handle bullet lists', () => {
|
|
98
|
+
const text =
|
|
99
|
+
'- First bullet point.\n- Second bullet point.\n- Third bullet point.';
|
|
100
|
+
const segments = splitMarkdown(text);
|
|
101
|
+
expect(segments).toEqual([
|
|
102
|
+
'- First bullet point.',
|
|
103
|
+
'- Second bullet point.',
|
|
104
|
+
'- Third bullet point.',
|
|
105
|
+
]);
|
|
106
|
+
});
|
|
107
|
+
|
|
108
|
+
it('should skip horizontal rules', () => {
|
|
109
|
+
const text = 'Before the rule.\n\n---\n\nAfter the rule.';
|
|
110
|
+
const segments = splitMarkdown(text);
|
|
111
|
+
expect(segments).toEqual(['Before the rule.', 'After the rule.']);
|
|
112
|
+
});
|
|
113
|
+
|
|
114
|
+
it('should return an empty array for empty string', () => {
|
|
115
|
+
expect(splitMarkdown('')).toEqual([]);
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
it('should merge short prose fragments', () => {
|
|
119
|
+
const text = 'Hi. This is a longer sentence here.';
|
|
120
|
+
const segments = splitMarkdown(text);
|
|
121
|
+
expect(segments).toEqual(['Hi. This is a longer sentence here.']);
|
|
122
|
+
});
|
|
123
|
+
|
|
124
|
+
it('should join multi-line prose into a single paragraph before splitting', () => {
|
|
125
|
+
const text =
|
|
126
|
+
'This is the first line of a paragraph.\nThis is the second line of the same paragraph.';
|
|
127
|
+
const segments = splitMarkdown(text);
|
|
128
|
+
// Should be joined and then split by the sentence splitter
|
|
129
|
+
expect(segments.length).toBeGreaterThanOrEqual(1);
|
|
130
|
+
expect(segments.join('')).toContain('first line');
|
|
131
|
+
expect(segments.join('')).toContain('second line');
|
|
132
|
+
});
|
|
133
|
+
});
|