@eeacms/volto-eea-chatbot 1.0.11 → 1.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,17 @@
1
1
  import debug from 'debug';
2
- import fetch from 'node-fetch';
3
2
  import fs from 'fs';
4
3
  import {
5
4
  getClaimsFromResponse,
6
5
  getTokenProbabilitiesFromLogits,
7
6
  } from './postprocessing';
8
- import { createHalloumiPrompt } from './preprocessing';
9
-
10
- // const CONTEXT_SEPARTOR = '\n---\n';
7
+ import { createChunkedHalloumiPrompts, getOffsets } from './preprocessing';
8
+ import { splitMarkdown, splitProse } from './markdown-splitter';
9
+ import { callLLM, excludeClaimSentences } from './filtering';
11
10
 
12
11
  const log = debug('halloumi');
13
12
 
13
+ const tokenChoices = new Set(['supported', 'unsupported']);
14
+
14
15
  function sigmoid(x) {
15
16
  return 1 / (1 + Math.exp(-x));
16
17
  }
@@ -21,45 +22,143 @@ export function applyPlattScaling(platt, probability) {
21
22
  return sigmoid(-1 * (platt.a * log_prob + platt.b));
22
23
  }
23
24
 
25
/**
 * Merges claims from multiple chunked HallOumi responses.
 *
 * Claims are grouped by `claimId`. For each group the segment citations of
 * every chunk are concatenated (in chunk order), while `probabilities`,
 * `explanation` and `supported` are taken from the chunk whose `supported`
 * probability is strictly the highest (ties keep the earliest chunk).
 *
 * The input claim objects and their `segments` arrays are never mutated: the
 * merged claim owns a fresh `segments` array.
 *
 * @param {Array<Array<Object>>} chunkResults One array of claims per chunk.
 *   Each claim is read for `claimId`, `segments`, `probabilities` (a Map with
 *   a `supported` entry), `explanation` and `supported`.
 * @returns {Object[]} One merged claim per distinct `claimId`, in order of
 *   first appearance.
 */
function mergeChunkClaims(chunkResults) {
  const claimMap = new Map();

  for (const claims of chunkResults) {
    for (const claim of claims) {
      const incomingSegments = claim.segments ?? [];
      const existing = claimMap.get(claim.claimId);

      if (existing === undefined) {
        // Copy the segments array: a plain `{ ...claim }` would share it with
        // the caller's object, and later pushes would mutate the input.
        claimMap.set(claim.claimId, {
          ...claim,
          segments: [...incomingSegments],
        });
        continue;
      }

      // Element-wise append avoids the argument-count limit of push(...spread).
      for (const segment of incomingSegments) {
        existing.segments.push(segment);
      }

      // Keep the result with the higher supported score
      const existingScore = existing.probabilities.get('supported') || 0;
      const newScore = claim.probabilities.get('supported') || 0;
      if (newScore > existingScore) {
        existing.probabilities = claim.probabilities;
        existing.explanation = claim.explanation;
        existing.supported = claim.supported;
      }
    }
  }

  return Array.from(claimMap.values());
}
54
+
24
55
  export async function getVerifyClaimResponse(
25
56
  model,
26
57
  sources,
27
- claims,
28
- maxContextSegments = 0,
58
+ answer,
59
+ { ip } = {},
29
60
  ) {
30
- // const contextSeparator = CONTEXT_SEPARTOR;
31
- // const joinedContext = sources.join(contextSeparator);
61
+ const emptyResponse = {
62
+ claims: [],
63
+ segments: {},
64
+ };
65
+ if (!sources?.length || !answer) {
66
+ return { ...emptyResponse, reason: 'Context is empty' };
67
+ }
68
+
69
+ // Split sentences
70
+ const responseSentences = splitMarkdown(answer);
71
+ const responseOffsets = getOffsets(answer, responseSentences);
72
+
73
+ // Filter claims and context in parallel
74
+ const [excludeResponseIndices] = await Promise.all([
75
+ excludeClaimSentences(responseSentences, { ip }),
76
+ ]);
32
77
 
33
- if (!sources?.length || !claims) {
34
- const response = {
35
- claims: [],
36
- segments: {},
78
+ const contextSentences = [];
79
+ const indexedContextSentences = sources.reduce((acc, text, sourceIdx) => {
80
+ const sentences = splitProse(text, 50).map((sentence, sentenceIdx) => {
81
+ const globalId = acc.length + sentenceIdx + 1;
82
+ contextSentences.push(sentence);
83
+ return {
84
+ sentence,
85
+ sourceId: sourceIdx + 1,
86
+ globalId,
87
+ };
88
+ });
89
+ acc.push(...sentences);
90
+ return acc;
91
+ }, []);
92
+ const joinedContext = sources.join('');
93
+ const contextOffsets = getOffsets(joinedContext, contextSentences);
94
+
95
+ if (excludeResponseIndices.size === responseSentences.length) {
96
+ log('All response sentences excluded');
97
+ return {
98
+ ...emptyResponse,
99
+ empty: 'Claims in the document could not be verified',
37
100
  };
38
- return response;
39
101
  }
40
-
41
- const prompt = createHalloumiPrompt({
42
- sources,
43
- response: claims,
44
- maxContextSegments,
45
- request: undefined,
102
+ log('Excluded response indices', excludeResponseIndices);
103
+ const { prompts } = createChunkedHalloumiPrompts({
104
+ indexedContextSentences,
105
+ responseSentences,
106
+ responseOffsets,
107
+ request: null,
108
+ excludeResponseIndices,
46
109
  });
47
110
 
48
- log('Halloumi prompt', JSON.stringify(prompt, null, 2));
111
+ log(`Split into ${prompts.length} chunk(s)`);
112
+
113
+ // Run all chunks in parallel
114
+ const chunkResults = await Promise.all(
115
+ prompts.map((chunkPrompt, i) => {
116
+ log(`Chunk ${i + 1} request`);
117
+ return halloumiGenerativeAPI(model, chunkPrompt, { ip });
118
+ }),
119
+ );
120
+
121
+ // Merge raw claims across chunks
122
+ const rawClaims = mergeChunkClaims(chunkResults);
123
+
124
+ const mergedPrompt = {
125
+ contextOffsets,
126
+ responseOffsets,
127
+ joinedContext,
128
+ };
129
+ const converted = convertGenerativesClaimToVerifyClaimResponse(
130
+ rawClaims,
131
+ mergedPrompt,
132
+ );
133
+
134
+ if (excludeResponseIndices.size > 0) {
135
+ for (const idx of excludeResponseIndices) {
136
+ if (responseOffsets.has(idx)) {
137
+ const offsets = responseOffsets.get(idx);
138
+ converted.claims.push({
139
+ claimId: idx,
140
+ claimString: responseSentences[idx - 1],
141
+ startOffset: offsets.startOffset,
142
+ endOffset: offsets.endOffset,
143
+ skipped: true,
144
+ score: null,
145
+ });
146
+ }
147
+ }
148
+ converted.claims.sort((a, b) => a.startOffset - b.startOffset);
149
+ }
49
150
 
50
- const rawClaims = await halloumiGenerativeAPI(model, prompt);
51
- log('Raw claims', rawClaims);
52
151
  const result = {
53
- ...convertGenerativesClaimToVerifyClaimResponse(rawClaims, prompt),
152
+ ...converted,
54
153
  rawClaims,
55
- halloumiPrompt: prompt,
154
+ ...(prompts.length === 1
155
+ ? { halloumiPrompt: prompts[0] }
156
+ : { halloumiPrompts: prompts }),
56
157
  };
57
158
 
58
159
  return result;
59
160
  }
60
161
 
61
- const tokenChoices = new Set(['supported', 'unsupported']);
62
-
63
162
  /**
64
163
  * Fetches a response from the LLM.
65
164
  *
@@ -72,7 +171,7 @@ const tokenChoices = new Set(['supported', 'unsupported']);
72
171
  * - `DUMP_HALLOUMI_REQ_FILE_PATH`: If set, the LLM request (URL and parameters) is dumped to the specified file path.
73
172
  * - `DUMP_HALLOUMI_FILE_PATH`: If set, the LLM response is dumped to the specified file path.
74
173
  */
75
- async function getLLMResponse(model, prompt) {
174
+ async function getLLMResponse(model, prompt, { ip } = {}) {
76
175
  let jsonData;
77
176
 
78
177
  if (process.env.MOCK_HALLOUMI_FILE_PATH) {
@@ -90,34 +189,17 @@ async function getLLMResponse(model, prompt) {
90
189
  logprobs: true,
91
190
  top_logprobs: 3,
92
191
  };
93
- const headers = {
94
- 'Content-Type': 'application/json',
95
- accept: 'application/json',
96
- };
97
- if (model.apiKey) {
98
- headers['Authorization'] = `Bearer ${model.apiKey}`;
99
- }
100
192
 
101
- const params = {
102
- method: 'POST',
103
- headers: headers,
104
- body: JSON.stringify(data),
105
- };
106
193
  if (process.env.DUMP_HALLOUMI_REQ_FILE_PATH) {
107
194
  const filePath = process.env.DUMP_HALLOUMI_REQ_FILE_PATH;
108
195
  fs.writeFileSync(
109
196
  filePath,
110
- JSON.stringify(
111
- { url: model.apiUrl, params: { ...params, body: data } },
112
- null,
113
- 2,
114
- ),
197
+ JSON.stringify({ url: model.apiUrl, body: data }, null, 2),
115
198
  );
116
- log(`Dumped halloumi response: ${filePath}`);
199
+ log(`Dumped halloumi request: ${filePath}`);
117
200
  }
118
201
 
119
- const response = await fetch(model.apiUrl, params);
120
- jsonData = await response.json();
202
+ jsonData = await callLLM(model.apiUrl, model.apiKey, data, { ip });
121
203
 
122
204
  if (process.env.DUMP_HALLOUMI_FILE_PATH) {
123
205
  const filePath = process.env.DUMP_HALLOUMI_FILE_PATH;
@@ -133,11 +215,19 @@ async function getLLMResponse(model, prompt) {
133
215
  * @param response A string containing all claims and their information.
134
216
  * @returns A list of claim objects.
135
217
  */
136
- export async function halloumiGenerativeAPI(model, prompt) {
137
- const jsonData = await getLLMResponse(model, prompt);
218
+ export async function halloumiGenerativeAPI(model, prompt, { ip } = {}) {
219
+ const jsonData = await getLLMResponse(model, prompt, { ip });
220
+
221
+ // Todo: restore log
222
+ // log('Generative response', jsonData);
138
223
 
139
- log('Generative response', jsonData);
140
- log('Logprobs', jsonData.choices[0].logprobs.content);
224
+ const finishReason = jsonData.choices?.[0]?.finish_reason;
225
+ if (finishReason === 'length') {
226
+ throw new Error('HallOumi response truncated (finish_reason: length)');
227
+ }
228
+
229
+ // Todo: restore log
230
+ // log('Logprobs', jsonData.choices[0].logprobs.content);
141
231
 
142
232
  const logits = jsonData.choices[0].logprobs.content;
143
233
  const tokenProbabilities = getTokenProbabilitiesFromLogits(
@@ -149,11 +239,20 @@ export async function halloumiGenerativeAPI(model, prompt) {
149
239
  );
150
240
 
151
241
  if (parsedResponse.length !== tokenProbabilities.length) {
152
- throw new Error('Token probabilities and claims do not match.');
242
+ log(
243
+ 'Warning: token probabilities (%d) and claims (%d) do not match — using available probabilities, defaulting remainder to 0.5',
244
+ tokenProbabilities.length,
245
+ parsedResponse.length,
246
+ );
153
247
  }
154
248
 
249
+ const defaultScoreMap = new Map([
250
+ ['supported', 0.5],
251
+ ['unsupported', 0.5],
252
+ ]);
253
+
155
254
  for (let i = 0; i < parsedResponse.length; i++) {
156
- const scoreMap = tokenProbabilities[i];
255
+ const scoreMap = tokenProbabilities[i] ?? new Map(defaultScoreMap);
157
256
  if (model.plattScaling) {
158
257
  const platt = model.plattScaling;
159
258
  const unsupportedScore = applyPlattScaling(
@@ -183,12 +282,17 @@ export function convertGenerativesClaimToVerifyClaimResponse(
183
282
  id,
184
283
  startOffset: offset[1].startOffset,
185
284
  endOffset: offset[1].endOffset,
285
+ text: prompt.joinedContext.slice(
286
+ offset[1].startOffset,
287
+ offset[1].endOffset,
288
+ ),
186
289
  };
187
290
  }
188
291
 
189
292
  for (const generativeClaim of generativeClaims) {
190
293
  const segmentIds = [];
191
294
  for (const seg of generativeClaim.segments) {
295
+ if (!seg) continue;
192
296
  segmentIds.push(seg.toString());
193
297
  }
194
298
 
@@ -77,17 +77,29 @@ describe('applyPlattScaling', () => {
77
77
  describe('getVerifyClaimResponse', () => {
78
78
  it('returns empty response when sources is empty', async () => {
79
79
  const result = await getVerifyClaimResponse({}, [], 'claims');
80
- expect(result).toEqual({ claims: [], segments: {} });
80
+ expect(result).toEqual({
81
+ claims: [],
82
+ segments: {},
83
+ reason: 'Context is empty',
84
+ });
81
85
  });
82
86
 
83
87
  it('returns empty response when sources is null', async () => {
84
88
  const result = await getVerifyClaimResponse({}, null, 'claims');
85
- expect(result).toEqual({ claims: [], segments: {} });
89
+ expect(result).toEqual({
90
+ claims: [],
91
+ segments: {},
92
+ reason: 'Context is empty',
93
+ });
86
94
  });
87
95
 
88
96
  it('returns empty response when claims is falsy', async () => {
89
97
  const result = await getVerifyClaimResponse({}, ['source'], null);
90
- expect(result).toEqual({ claims: [], segments: {} });
98
+ expect(result).toEqual({
99
+ claims: [],
100
+ segments: {},
101
+ reason: 'Context is empty',
102
+ });
91
103
  });
92
104
  });
93
105
 
@@ -257,7 +269,7 @@ describe('halloumiGenerativeAPI via real fetch', () => {
257
269
  expect(callHeaders.Authorization).toBeUndefined();
258
270
  });
259
271
 
260
- it('throws when token probabilities and claims do not match', async () => {
272
+ it('defaults to 0.5 when token probabilities and claims do not match', async () => {
261
273
  jest.doMock('./postprocessing', () => ({
262
274
  getClaimsFromResponse: jest.fn(() => [
263
275
  { claimId: 1, claimString: 'Claim 1' },
@@ -290,9 +302,11 @@ describe('halloumiGenerativeAPI via real fetch', () => {
290
302
  responseOffsets: new Map(),
291
303
  };
292
304
 
293
- await expect(halloumiGenerativeAPI(model, prompt)).rejects.toThrow(
294
- 'Token probabilities and claims do not match',
295
- );
305
+ const result = await halloumiGenerativeAPI(model, prompt);
306
+ // First claim gets the available probability
307
+ expect(result[0].probabilities.get('supported')).toBe(0.9);
308
+ // Second claim defaults to 0.5
309
+ expect(result[1].probabilities.get('supported')).toBe(0.5);
296
310
  });
297
311
  });
298
312
 
@@ -316,6 +330,7 @@ describe('convertGenerativesClaimToVerifyClaimResponse', () => {
316
330
  const prompt = {
317
331
  contextOffsets: new Map([[1, { startOffset: 0, endOffset: 10 }]]),
318
332
  responseOffsets: new Map([[1, { startOffset: 100, endOffset: 120 }]]),
333
+ joinedContext: 'Test conte',
319
334
  };
320
335
 
321
336
  const result = convertGenerativesClaimToVerifyClaimResponse(
@@ -336,7 +351,12 @@ describe('convertGenerativesClaimToVerifyClaimResponse', () => {
336
351
  },
337
352
  ],
338
353
  segments: {
339
- 1: { id: '1', startOffset: 0, endOffset: 10 },
354
+ 1: {
355
+ id: '1',
356
+ startOffset: 0,
357
+ endOffset: 10,
358
+ text: 'Test conte',
359
+ },
340
360
  },
341
361
  });
342
362
  });
@@ -0,0 +1,172 @@
1
+ import nlp from 'compromise';
2
+
3
+ const MIN_SENTENCE_LENGTH = 15;
4
+
5
/**
 * Splits markdown text into segments meaningful for fact-checking.
 *
 * The text is first broken into markdown blocks by `splitIntoBlocks`.
 * Structural blocks (table rows, headers, list items) are already atomic and
 * are emitted unchanged, unless they are blank. Every other block is treated
 * as prose and handed to `splitProse`, which splits it into sentences and
 * merges fragments that are too short to stand alone.
 *
 * @param {string} text Markdown text to split.
 * @returns {string[]} Array of segment strings, in document order.
 */
export function splitMarkdown(text) {
  return splitIntoBlocks(text).flatMap((block) => {
    if (!isStructuralBlock(block)) {
      return splitProse(block);
    }
    return block.trim().length > 0 ? [block] : [];
  });
}
35
+
36
/**
 * Splits markdown text into structural blocks, one line at a time.
 *
 * - Table rows, headers and list items each become their own block (the
 *   original line is kept, including its leading whitespace).
 * - Table separator rows, horizontal rules and blank lines produce no block;
 *   they only terminate the paragraph being accumulated.
 * - Any other line is prose: consecutive prose lines are trimmed and joined
 *   with a single space into one paragraph block.
 *
 * @param {string} text Markdown text.
 * @returns {string[]} Blocks in document order.
 */
function splitIntoBlocks(text) {
  const blocks = [];
  let paragraph = '';

  const flushParagraph = () => {
    if (paragraph) {
      blocks.push(paragraph);
      paragraph = '';
    }
  };

  // Checks run in the same order as the line categories are prioritised.
  const classify = (candidate) => {
    if (isTableRow(candidate)) return 'keep';
    if (isTableSeparator(candidate)) return 'skip'; // not verifiable content
    if (isHeader(candidate)) return 'keep';
    if (isHorizontalRule(candidate)) return 'skip';
    if (isListItem(candidate)) return 'keep';
    return candidate === '' ? 'skip' : 'prose';
  };

  for (const rawLine of text.split('\n')) {
    const kind = classify(rawLine.trimStart());

    if (kind === 'prose') {
      // Prose continuation: extend the current paragraph.
      const piece = rawLine.trim();
      paragraph = paragraph ? `${paragraph} ${piece}` : piece;
      continue;
    }

    flushParagraph();
    if (kind === 'keep') {
      blocks.push(rawLine);
    }
  }

  flushParagraph();
  return blocks;
}
82
+
83
/**
 * Returns true when the line looks like a markdown table row: it starts with
 * a pipe, contains a later pipe, and is not a table separator row.
 *
 * @param {string} line Line with leading whitespace already removed.
 * @returns {boolean}
 */
function isTableRow(line) {
  if (isTableSeparator(line)) {
    return false;
  }
  return /^\|.*\|/.test(line);
}
86
+
87
/**
 * Returns true when the line is a markdown table separator row, i.e. a
 * pipe-delimited row made only of dashes, colons, pipes and whitespace
 * (for example `|---|:--:|`).
 *
 * Trailing whitespace is tolerated: callers strip only leading whitespace,
 * so a trailing space or a `\r` left over from CRLF input would otherwise
 * make the separator look like an ordinary table row.
 *
 * @param {string} line Line with leading whitespace already removed.
 * @returns {boolean}
 */
function isTableSeparator(line) {
  return /^\|[\s\-:|]+\|\s*$/.test(line);
}
90
+
91
/**
 * Returns true when the line starts with one to six `#` characters followed
 * by a whitespace character.
 *
 * @param {string} line Line with leading whitespace already removed.
 * @returns {boolean}
 */
function isHeader(line) {
  const headerPrefix = /^#{1,6}\s/;
  return headerPrefix.test(line);
}
94
+
95
/**
 * Returns true when the line consists of three or more `*`, `-` or `_`
 * characters (all the same), optionally followed by whitespace.
 *
 * @param {string} line Line with leading whitespace already removed.
 * @returns {boolean}
 */
function isHorizontalRule(line) {
  const rulePattern = /^(\*{3,}|-{3,}|_{3,})\s*$/;
  return rulePattern.test(line);
}
98
+
99
/**
 * Returns true when the line starts with a list marker: either digits
 * followed by a dot, or one of `-`, `*`, `+`, in both cases followed by at
 * least one whitespace character.
 *
 * @param {string} line Line with leading whitespace already removed.
 * @returns {boolean}
 */
function isListItem(line) {
  const listMarker = /^(\d+\.\s+|[-*+]\s+)/;
  return listMarker.test(line);
}
102
+
103
/**
 * Tells whether a block is structural (table row, header or list item).
 * Such blocks are kept whole instead of being split into sentences.
 *
 * @param {string} block Block text; leading whitespace is ignored.
 * @returns {boolean}
 */
function isStructuralBlock(block) {
  const candidate = block.trimStart();
  return [isTableRow, isHeader, isListItem].some((matches) =>
    matches(candidate),
  );
}
111
+
112
/**
 * Splits a prose paragraph into sentences using the `compromise` NLP library
 * (imported as `nlp`), then merges fragments that are too short to be
 * verified on their own.
 *
 * A sentence shorter than MIN_SENTENCE_LENGTH (newlines not counted) is
 * merged into the sentence that follows it; the last sentence is always
 * emitted, even when short. Segments are sliced from the original text, so
 * the separators between merged sentences are preserved.
 *
 * @param {string} text Prose text to split.
 * @param {number} [maxSentences=0] When truthy and more segments than this
 *   are produced, consecutive segments are grouped into runs of
 *   `ceil(count / maxSentences)`, again by slicing the original text.
 * @returns {string[]} Sentence (or grouped-sentence) strings in text order.
 */
export function splitProse(text, maxSentences = 0) {
  const rawSentences = nlp(text).sentences().out('array');

  // Locate every detected sentence inside the original text.
  const located = getPositions(text, rawSentences);
  const lastIndex = rawSentences.length - 1;

  // Each span covers one emitted segment: a sentence plus any short
  // sentences that were waiting to be merged into it.
  const spans = [];
  let spanStart = null;
  rawSentences.forEach((sentence, idx) => {
    if (spanStart === null) {
      spanStart = located[idx].start;
    }
    const isShort = sentence.replaceAll('\n', '').length < MIN_SENTENCE_LENGTH;
    if (isShort && idx < lastIndex) {
      // Too short: keep the span open so the next sentence absorbs it.
      return;
    }
    spans.push({ start: spanStart, end: located[idx].end });
    spanStart = null;
  });

  if (!maxSentences || !(spans.length > maxSentences)) {
    return spans.map(({ start, end }) => text.slice(start, end));
  }

  // Too many segments: merge consecutive spans into evenly sized groups.
  const perGroup = Math.ceil(spans.length / maxSentences);
  const grouped = [];
  for (let from = 0; from < spans.length; from += perGroup) {
    const last = Math.min(from + perGroup, spans.length) - 1;
    grouped.push(text.slice(spans[from].start, spans[last].end));
  }
  return grouped;
}
161
+
162
/**
 * Finds the position of each sentence inside the original text.
 *
 * Sentences are searched left to right, each search starting where the
 * previous sentence ended, so repeated sentences map to successive
 * occurrences.
 *
 * If a sentence cannot be found verbatim (for example because the splitter
 * altered its whitespace), it is assumed to start at the current search
 * cursor and both offsets are clamped to the text length. This keeps every
 * offset inside the text instead of propagating the -1 from `indexOf`.
 *
 * @param {string} text Original text.
 * @param {string[]} sentences Sentences in the order they occur in `text`.
 * @returns {Array<{start: number, end: number}>} One entry per sentence;
 *   `end` is exclusive.
 */
function getPositions(text, sentences) {
  const positions = [];
  let searchFrom = 0;

  for (const sentence of sentences) {
    const found = text.indexOf(sentence, searchFrom);
    // Fall back to the cursor when the sentence is not a verbatim substring.
    const start = found === -1 ? Math.min(searchFrom, text.length) : found;
    const end = Math.min(start + sentence.length, text.length);
    positions.push({ start, end });
    searchFrom = end;
  }

  return positions;
}
@@ -0,0 +1,133 @@
1
+ import { splitMarkdown } from './markdown-splitter';
2
+
3
+ const DUMMY_LLM_RESPONSE = `## European species most at risk
4
+
5
+ Below is a concise, evidence-based snapshot of the taxonomic groups and flagship species. The list combines information on trend magnitude and risk status.
6
+
7
+ | Taxonomic group | Species at risk | Main drivers | Signal |
8
+ |-----------------|-----------------|--------------|--------|
9
+ | **Birds** | Skylark (*Alauda arvensis*), European turtle-dove | Intensive agriculture, pesticide use | Farmland-bird index down 27% (1990-2019) |
10
+ | **Fish** | Houting (*Coregonus oxyrhynchus*), Danube salmon | River regulation, dam construction | Both listed as critically endangered |
11
+ | **Butterflies** | Large blue (*Phengaris arion*), Adonis blue | Loss of semi-natural grasslands | Grassland-butterfly index down 25% |
12
+
13
+ ### Key take-aways
14
+
15
+ 1. **Agricultural intensification** is the single biggest pressure across taxa.
16
+ 2. **Habitat loss and degradation** underlie the perilous status of many specialists.
17
+ 3. **Chemical stressors** affect both insects and the birds that depend on them.
18
+ 4. **Climate change** amplifies existing pressures such as drought stress on amphibians.
19
+
20
+ ---
21
+
22
+ ### Conservation priorities
23
+
24
+ - **Protect and restore semi-natural habitats** including high-diversity grasslands and river floodplains.
25
+ - **Implement agri-environment schemes** that limit pesticide use and maintain field margins.
26
+ - **Focus monitoring on flagship species** to gauge the effectiveness of policy actions.
27
+
28
+ If you need a deeper dive into a particular taxon, just let me know!`;
29
+
30
+ describe('splitMarkdown', () => {
31
+ it('should split the dummy LLM response into meaningful segments', () => {
32
+ const segments = splitMarkdown(DUMMY_LLM_RESPONSE);
33
+
34
+ // Headers are separate segments
35
+ expect(segments).toContain('## European species most at risk');
36
+ expect(segments).toContain('### Key take-aways');
37
+ expect(segments).toContain('### Conservation priorities');
38
+
39
+ // Table rows are individual segments (separator skipped)
40
+ const tableRows = segments.filter((s) => s.startsWith('|'));
41
+ expect(tableRows.length).toBe(4); // header row + 3 data rows
42
+
43
+ // Numbered list items are separate
44
+ const numberedItems = segments.filter((s) => /^\d+\./.test(s.trimStart()));
45
+ expect(numberedItems.length).toBe(4);
46
+
47
+ // Bullet list items are separate
48
+ const bulletItems = segments.filter((s) => /^-\s/.test(s.trimStart()));
49
+ expect(bulletItems.length).toBe(3);
50
+
51
+ // Prose paragraphs are split into sentences
52
+ expect(segments).toContainEqual(
53
+ expect.stringContaining('evidence-based snapshot'),
54
+ );
55
+ expect(segments).toContainEqual(expect.stringContaining('trend magnitude'));
56
+
57
+ // Horizontal rules are skipped
58
+ expect(segments).not.toContainEqual(expect.stringMatching(/^-{3,}$/));
59
+
60
+ // No empty segments
61
+ expect(segments.every((s) => s.trim().length > 0)).toBe(true);
62
+ });
63
+
64
+ it('should handle plain prose without markdown', () => {
65
+ const text =
66
+ 'This is sentence one. This is sentence two. This is sentence three.';
67
+ const segments = splitMarkdown(text);
68
+ expect(segments).toEqual([
69
+ 'This is sentence one.',
70
+ 'This is sentence two.',
71
+ 'This is sentence three.',
72
+ ]);
73
+ });
74
+
75
+ it('should handle headers followed by prose', () => {
76
+ const text = '## My Header\n\nSome prose paragraph here.';
77
+ const segments = splitMarkdown(text);
78
+ expect(segments).toEqual(['## My Header', 'Some prose paragraph here.']);
79
+ });
80
+
81
+ it('should skip table separator rows', () => {
82
+ const text = '| A | B |\n|---|---|\n| 1 | 2 |';
83
+ const segments = splitMarkdown(text);
84
+ expect(segments).toEqual(['| A | B |', '| 1 | 2 |']);
85
+ });
86
+
87
+ it('should handle numbered lists', () => {
88
+ const text =
89
+ '1. First item with enough text.\n2. Second item also long enough.';
90
+ const segments = splitMarkdown(text);
91
+ expect(segments).toEqual([
92
+ '1. First item with enough text.',
93
+ '2. Second item also long enough.',
94
+ ]);
95
+ });
96
+
97
+ it('should handle bullet lists', () => {
98
+ const text =
99
+ '- First bullet point.\n- Second bullet point.\n- Third bullet point.';
100
+ const segments = splitMarkdown(text);
101
+ expect(segments).toEqual([
102
+ '- First bullet point.',
103
+ '- Second bullet point.',
104
+ '- Third bullet point.',
105
+ ]);
106
+ });
107
+
108
+ it('should skip horizontal rules', () => {
109
+ const text = 'Before the rule.\n\n---\n\nAfter the rule.';
110
+ const segments = splitMarkdown(text);
111
+ expect(segments).toEqual(['Before the rule.', 'After the rule.']);
112
+ });
113
+
114
+ it('should return an empty array for empty string', () => {
115
+ expect(splitMarkdown('')).toEqual([]);
116
+ });
117
+
118
+ it('should merge short prose fragments', () => {
119
+ const text = 'Hi. This is a longer sentence here.';
120
+ const segments = splitMarkdown(text);
121
+ expect(segments).toEqual(['Hi. This is a longer sentence here.']);
122
+ });
123
+
124
+ it('should join multi-line prose into a single paragraph before splitting', () => {
125
+ const text =
126
+ 'This is the first line of a paragraph.\nThis is the second line of the same paragraph.';
127
+ const segments = splitMarkdown(text);
128
+ // Should be joined and then split into sentences by splitProse
129
+ expect(segments.length).toBeGreaterThanOrEqual(1);
130
+ expect(segments.join('')).toContain('first line');
131
+ expect(segments.join('')).toContain('second line');
132
+ });
133
+ });