npm - @eeacms/volto-eea-chatbot - Versions diffs - 1.0.11 → 1.0.13 - Mend

@eeacms/volto-eea-chatbot 1.0.11 → 1.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/CHANGELOG.md +26 -2
package/README.md +8 -8
package/jest-addon.config.js +1 -1
package/package.json +2 -1
package/src/ChatBlock/chat/AIMessage.tsx +32 -25
package/src/ChatBlock/chat/ChatWindow.tsx +1 -0
package/src/ChatBlock/components/HalloumiFeedback.jsx +8 -4
package/src/ChatBlock/components/markdown/ClaimModal.jsx +1 -1
package/src/ChatBlock/components/markdown/ClaimSegments.jsx +2 -3
package/src/ChatBlock/components/markdown/RenderClaimView.jsx +1 -1
package/src/ChatBlock/components/markdown/index.js +41 -15
package/src/ChatBlock/hooks/useChatController.ts +0 -15
package/src/ChatBlock/hooks/useQualityMarkers.js +0 -11
package/src/ChatBlock/style.less +47 -0
package/src/ChatBlock/types/interfaces.ts +1 -0
package/src/halloumi/filtering.js +149 -0
package/src/halloumi/filtering.test.js +44 -0
package/src/halloumi/generative.js +157 -53
package/src/halloumi/generative.test.js +28 -8
package/src/halloumi/markdown-splitter.js +172 -0
package/src/halloumi/markdown-splitter.test.js +133 -0
package/src/halloumi/middleware.js +6 -6
package/src/halloumi/postprocessing.js +0 -26
package/src/halloumi/preprocessing.js +78 -76
package/src/halloumi/preprocessing.test.js +87 -148
package/src/middleware.js +3 -0
package/src/middleware.test.js +2 -0

package/src/halloumi/generative.js CHANGED Viewed

@@ -1,16 +1,17 @@
 import debug from 'debug';
-import fetch from 'node-fetch';
 import fs from 'fs';
 import {
   getClaimsFromResponse,
   getTokenProbabilitiesFromLogits,
 } from './postprocessing';
-import { createHalloumiPrompt } from './preprocessing';
-// const CONTEXT_SEPARTOR = '\n---\n';
+import { createChunkedHalloumiPrompts, getOffsets } from './preprocessing';
+import { splitMarkdown, splitProse } from './markdown-splitter';
+import { callLLM, excludeClaimSentences } from './filtering';
 const log = debug('halloumi');
+const tokenChoices = new Set(['supported', 'unsupported']);
 function sigmoid(x) {
   return 1 / (1 + Math.exp(-x));
 }
@@ -21,45 +22,143 @@ export function applyPlattScaling(platt, probability) {
   return sigmoid(-1 * (platt.a * log_prob + platt.b));
 }
+/**
+ * Merges claims from multiple chunked HallOumi responses.
+ *
+ * Claims are grouped by `claimId`. The merged claim collects the segment
+ * citations of every chunk and keeps the probabilities, explanation and
+ * supported flag of the chunk with the highest 'supported' probability.
+ * The input claim objects are left untouched.
+ *
+ * @param {Array<Array<Object>>} chunkResults One list of claims per chunk; each
+ *   claim is read for `claimId`, `segments`, `probabilities` (a Map),
+ *   `explanation` and `supported`.
+ * @returns {Array<Object>} Merged claims, ordered by first appearance.
+ */
+function mergeChunkClaims(chunkResults) {
+  const claimMap = new Map();
+  for (const claims of chunkResults) {
+    for (const claim of claims) {
+      const existing = claimMap.get(claim.claimId);
+      if (!existing) {
+        // The spread is shallow, so give the merged claim its own segments
+        // array; otherwise later pushes would grow the first chunk's claim.
+        claimMap.set(claim.claimId, {
+          ...claim,
+          segments: [...(claim.segments ?? [])],
+        });
+        continue;
+      }
+      existing.segments.push(...(claim.segments ?? []));
+      // Keep the result with the higher supported score
+      const existingScore = existing.probabilities.get('supported') || 0;
+      const newScore = claim.probabilities.get('supported') || 0;
+      if (newScore > existingScore) {
+        existing.probabilities = claim.probabilities;
+        existing.explanation = claim.explanation;
+        existing.supported = claim.supported;
+      }
+    }
+  }
+  return Array.from(claimMap.values());
+}
 export async function getVerifyClaimResponse(
   model,
   sources,
-  claims,
-  maxContextSegments = 0,
+  answer,
+  { ip } = {},
 ) {
-  // const contextSeparator = CONTEXT_SEPARTOR;
-  // const joinedContext = sources.join(contextSeparator);
+  const emptyResponse = {
+    claims: [],
+    segments: {},
+  };
+  if (!sources?.length || !answer) {
+    return { ...emptyResponse, reason: 'Context is empty' };
+  }
+  // Split sentences
+  const responseSentences = splitMarkdown(answer);
+  const responseOffsets = getOffsets(answer, responseSentences);
+  // Determine which response sentences to exclude from verification
+  const [excludeResponseIndices] = await Promise.all([
+    excludeClaimSentences(responseSentences, { ip }),
+  ]);
-  if (!sources?.length || !claims) {
-    const response = {
-      claims: [],
-      segments: {},
+  const contextSentences = [];
+  const indexedContextSentences = sources.reduce((acc, text, sourceIdx) => {
+    const sentences = splitProse(text, 50).map((sentence, sentenceIdx) => {
+      const globalId = acc.length + sentenceIdx + 1;
+      contextSentences.push(sentence);
+      return {
+        sentence,
+        sourceId: sourceIdx + 1,
+        globalId,
+      };
+    });
+    acc.push(...sentences);
+    return acc;
+  }, []);
+  const joinedContext = sources.join('');
+  const contextOffsets = getOffsets(joinedContext, contextSentences);
+  if (excludeResponseIndices.size === responseSentences.length) {
+    log('All response sentences excluded');
+    return {
+      ...emptyResponse,
+      empty: 'Claims in the document could not be verified',
     };
-    return response;
   }
-  const prompt = createHalloumiPrompt({
-    sources,
-    response: claims,
-    maxContextSegments,
-    request: undefined,
+  log('Excluded response indices', excludeResponseIndices);
+  const { prompts } = createChunkedHalloumiPrompts({
+    indexedContextSentences,
+    responseSentences,
+    responseOffsets,
+    request: null,
+    excludeResponseIndices,
   });
-  log('Halloumi prompt', JSON.stringify(prompt, null, 2));
+  log(`Split into ${prompts.length} chunk(s)`);
+  // Run all chunks in parallel
+  const chunkResults = await Promise.all(
+    prompts.map((chunkPrompt, i) => {
+      log(`Chunk ${i + 1} request`);
+      return halloumiGenerativeAPI(model, chunkPrompt, { ip });
+    }),
+  );
+  // Merge raw claims across chunks
+  const rawClaims = mergeChunkClaims(chunkResults);
+  const mergedPrompt = {
+    contextOffsets,
+    responseOffsets,
+    joinedContext,
+  };
+  const converted = convertGenerativesClaimToVerifyClaimResponse(
+    rawClaims,
+    mergedPrompt,
+  );
+  if (excludeResponseIndices.size > 0) {
+    for (const idx of excludeResponseIndices) {
+      if (responseOffsets.has(idx)) {
+        const offsets = responseOffsets.get(idx);
+        converted.claims.push({
+          claimId: idx,
+          claimString: responseSentences[idx - 1],
+          startOffset: offsets.startOffset,
+          endOffset: offsets.endOffset,
+          skipped: true,
+          score: null,
+        });
+      }
+    }
+    converted.claims.sort((a, b) => a.startOffset - b.startOffset);
+  }
-  const rawClaims = await halloumiGenerativeAPI(model, prompt);
-  log('Raw claims', rawClaims);
   const result = {
-    ...convertGenerativesClaimToVerifyClaimResponse(rawClaims, prompt),
+    ...converted,
     rawClaims,
-    halloumiPrompt: prompt,
+    ...(prompts.length === 1
+      ? { halloumiPrompt: prompts[0] }
+      : { halloumiPrompts: prompts }),
   };
   return result;
 }
-const tokenChoices = new Set(['supported', 'unsupported']);
 /**
  * Fetches a response from the LLM.
  *
@@ -72,7 +171,7 @@ const tokenChoices = new Set(['supported', 'unsupported']);
  * - `DUMP_HALLOUMI_REQ_FILE_PATH`: If set, the LLM request (URL and parameters) is dumped to the specified file path.
  * - `DUMP_HALLOUMI_FILE_PATH`: If set, the LLM response is dumped to the specified file path.
  */
-async function getLLMResponse(model, prompt) {
+async function getLLMResponse(model, prompt, { ip } = {}) {
   let jsonData;
   if (process.env.MOCK_HALLOUMI_FILE_PATH) {
@@ -90,34 +189,17 @@ async function getLLMResponse(model, prompt) {
     logprobs: true,
     top_logprobs: 3,
   };
-  const headers = {
-    'Content-Type': 'application/json',
-    accept: 'application/json',
-  };
-  if (model.apiKey) {
-    headers['Authorization'] = `Bearer ${model.apiKey}`;
-  }
-  const params = {
-    method: 'POST',
-    headers: headers,
-    body: JSON.stringify(data),
-  };
   if (process.env.DUMP_HALLOUMI_REQ_FILE_PATH) {
     const filePath = process.env.DUMP_HALLOUMI_REQ_FILE_PATH;
     fs.writeFileSync(
       filePath,
-      JSON.stringify(
-        { url: model.apiUrl, params: { ...params, body: data } },
-        null,
-        2,
-      ),
+      JSON.stringify({ url: model.apiUrl, body: data }, null, 2),
     );
-    log(`Dumped halloumi response: ${filePath}`);
+    log(`Dumped halloumi request: ${filePath}`);
   }
-  const response = await fetch(model.apiUrl, params);
-  jsonData = await response.json();
+  jsonData = await callLLM(model.apiUrl, model.apiKey, data, { ip });
   if (process.env.DUMP_HALLOUMI_FILE_PATH) {
     const filePath = process.env.DUMP_HALLOUMI_FILE_PATH;
@@ -133,11 +215,19 @@ async function getLLMResponse(model, prompt) {
  * @param response A string containing all claims and their information.
  * @returns A list of claim objects.
  */
-export async function halloumiGenerativeAPI(model, prompt) {
-  const jsonData = await getLLMResponse(model, prompt);
+export async function halloumiGenerativeAPI(model, prompt, { ip } = {}) {
+  const jsonData = await getLLMResponse(model, prompt, { ip });
+  // Todo: restore log
+  // log('Generative response', jsonData);
-  log('Generative response', jsonData);
-  log('Logprobs', jsonData.choices[0].logprobs.content);
+  const finishReason = jsonData.choices?.[0]?.finish_reason;
+  if (finishReason === 'length') {
+    throw new Error('HallOumi response truncated (finish_reason: length)');
+  }
+  // Todo: restore log
+  // log('Logprobs', jsonData.choices[0].logprobs.content);
   const logits = jsonData.choices[0].logprobs.content;
   const tokenProbabilities = getTokenProbabilitiesFromLogits(
@@ -149,11 +239,20 @@ export async function halloumiGenerativeAPI(model, prompt) {
   );
   if (parsedResponse.length !== tokenProbabilities.length) {
-    throw new Error('Token probabilities and claims do not match.');
+    log(
+      'Warning: token probabilities (%d) and claims (%d) do not match — using available probabilities, defaulting remainder to 0.5',
+      tokenProbabilities.length,
+      parsedResponse.length,
+    );
   }
+  const defaultScoreMap = new Map([
+    ['supported', 0.5],
+    ['unsupported', 0.5],
+  ]);
   for (let i = 0; i < parsedResponse.length; i++) {
-    const scoreMap = tokenProbabilities[i];
+    const scoreMap = tokenProbabilities[i] ?? new Map(defaultScoreMap);
     if (model.plattScaling) {
       const platt = model.plattScaling;
       const unsupportedScore = applyPlattScaling(
@@ -183,12 +282,17 @@ export function convertGenerativesClaimToVerifyClaimResponse(
       id,
       startOffset: offset[1].startOffset,
       endOffset: offset[1].endOffset,
+      text: prompt.joinedContext.slice(
+        offset[1].startOffset,
+        offset[1].endOffset,
+      ),
     };
   }
   for (const generativeClaim of generativeClaims) {
     const segmentIds = [];
     for (const seg of generativeClaim.segments) {
+      if (!seg) continue;
       segmentIds.push(seg.toString());
     }

package/src/halloumi/generative.test.js CHANGED Viewed

@@ -77,17 +77,29 @@ describe('applyPlattScaling', () => {
 describe('getVerifyClaimResponse', () => {
   it('returns empty response when sources is empty', async () => {
     const result = await getVerifyClaimResponse({}, [], 'claims');
-    expect(result).toEqual({ claims: [], segments: {} });
+    expect(result).toEqual({
+      claims: [],
+      segments: {},
+      reason: 'Context is empty',
+    });
   });
   it('returns empty response when sources is null', async () => {
     const result = await getVerifyClaimResponse({}, null, 'claims');
-    expect(result).toEqual({ claims: [], segments: {} });
+    expect(result).toEqual({
+      claims: [],
+      segments: {},
+      reason: 'Context is empty',
+    });
   });
   it('returns empty response when claims is falsy', async () => {
     const result = await getVerifyClaimResponse({}, ['source'], null);
-    expect(result).toEqual({ claims: [], segments: {} });
+    expect(result).toEqual({
+      claims: [],
+      segments: {},
+      reason: 'Context is empty',
+    });
   });
 });
@@ -257,7 +269,7 @@ describe('halloumiGenerativeAPI via real fetch', () => {
     expect(callHeaders.Authorization).toBeUndefined();
   });
-  it('throws when token probabilities and claims do not match', async () => {
+  it('defaults to 0.5 when token probabilities and claims do not match', async () => {
     jest.doMock('./postprocessing', () => ({
       getClaimsFromResponse: jest.fn(() => [
         { claimId: 1, claimString: 'Claim 1' },
@@ -290,9 +302,11 @@ describe('halloumiGenerativeAPI via real fetch', () => {
       responseOffsets: new Map(),
     };
-    await expect(halloumiGenerativeAPI(model, prompt)).rejects.toThrow(
-      'Token probabilities and claims do not match',
-    );
+    const result = await halloumiGenerativeAPI(model, prompt);
+    // First claim gets the available probability
+    expect(result[0].probabilities.get('supported')).toBe(0.9);
+    // Second claim defaults to 0.5
+    expect(result[1].probabilities.get('supported')).toBe(0.5);
   });
 });
@@ -316,6 +330,7 @@ describe('convertGenerativesClaimToVerifyClaimResponse', () => {
     const prompt = {
       contextOffsets: new Map([[1, { startOffset: 0, endOffset: 10 }]]),
       responseOffsets: new Map([[1, { startOffset: 100, endOffset: 120 }]]),
+      joinedContext: 'Test conte',
     };
     const result = convertGenerativesClaimToVerifyClaimResponse(
@@ -336,7 +351,12 @@ describe('convertGenerativesClaimToVerifyClaimResponse', () => {
         },
       ],
       segments: {
-        1: { id: '1', startOffset: 0, endOffset: 10 },
+        1: {
+          id: '1',
+          startOffset: 0,
+          endOffset: 10,
+          text: 'Test conte',
+        },
       },
     });
   });

package/src/halloumi/markdown-splitter.js ADDED Viewed

@@ -0,0 +1,172 @@
+import nlp from 'compromise';
+const MIN_SENTENCE_LENGTH = 15;
+/**
+ * Breaks markdown text into the segments that are fact-checked one by one.
+ *
+ * How it works:
+ * 1. The text is cut into blocks by splitIntoBlocks (prose paragraphs,
+ *    list items, table rows, headers).
+ * 2. Structural blocks (table rows, headers, list items) are emitted
+ *    unchanged, because each one is already an atomic unit.
+ * 3. Every other block is prose and is handed to splitProse, which splits
+ *    it into sentences and merges fragments shorter than
+ *    MIN_SENTENCE_LENGTH into the following sentence.
+ *
+ * @param {string} text Markdown text to split.
+ * @returns {string[]} Array of segment strings.
+ */
+export function splitMarkdown(text) {
+  return splitIntoBlocks(text).flatMap((block) => {
+    if (!isStructuralBlock(block)) {
+      return splitProse(block);
+    }
+    // Whitespace-only structural blocks carry nothing to verify.
+    return block.trim().length > 0 ? [block] : [];
+  });
+}
+/**
+ * Cuts markdown text into blocks, line by line.
+ *
+ * - Table rows, headers and list items each become a block of their own
+ *   (the original line, untrimmed).
+ * - Table separator rows, horizontal rules and blank lines end the current
+ *   paragraph and are dropped.
+ * - Any other line is prose: consecutive prose lines are trimmed and joined
+ *   with single spaces into one paragraph block.
+ */
+function splitIntoBlocks(text) {
+  const blocks = [];
+  let paragraphLines = [];
+  const closeParagraph = () => {
+    if (paragraphLines.length > 0) {
+      blocks.push(paragraphLines.join(' '));
+      paragraphLines = [];
+    }
+  };
+  for (const rawLine of text.split('\n')) {
+    const candidate = rawLine.trimStart();
+    // The line kinds are mutually exclusive, so they can be tested in groups.
+    const keepsLine =
+      isTableRow(candidate) || isHeader(candidate) || isListItem(candidate);
+    const dropsLine =
+      isTableSeparator(candidate) ||
+      isHorizontalRule(candidate) ||
+      candidate === '';
+    if (!keepsLine && !dropsLine) {
+      // Prose continuation: collected now, joined when the paragraph ends
+      paragraphLines.push(rawLine.trim());
+      continue;
+    }
+    closeParagraph();
+    if (keepsLine) {
+      blocks.push(rawLine);
+    }
+  }
+  closeParagraph();
+  return blocks;
+}
+/**
+ * Returns true for a line that starts with a pipe and contains a second
+ * one, unless the line is a table separator row.
+ */
+function isTableRow(line) {
+  if (isTableSeparator(line)) {
+    return false;
+  }
+  return /^\|.*\|/.test(line);
+}
+/**
+ * Returns true for a table separator row such as `|---|:--:|`: a line that
+ * starts and ends with a pipe and otherwise holds only dashes, colons,
+ * pipes and whitespace.
+ *
+ * Trailing whitespace (including the `\r` left behind when CRLF text is
+ * split on `\n`) is tolerated, so such rows are not mistaken for table rows.
+ */
+function isTableSeparator(line) {
+  return /^\|[\s\-:|]+\|\s*$/.test(line);
+}
+/**
+ * Returns true when the line opens with one to six `#` characters followed
+ * by whitespace.
+ */
+function isHeader(line) {
+  const headerPrefix = /^#{1,6}\s/;
+  return headerPrefix.test(line);
+}
+/**
+ * Returns true when the line consists of three or more `*`, `-` or `_`
+ * characters (all the same kind), optionally followed by whitespace.
+ */
+function isHorizontalRule(line) {
+  const rulePattern = /^(\*{3,}|-{3,}|_{3,})\s*$/;
+  return rulePattern.test(line);
+}
+/**
+ * Returns true when the line opens with a list marker: digits and a dot
+ * (`1.`), or one of `-`, `*`, `+`, followed in either case by whitespace.
+ */
+function isListItem(line) {
+  const listMarker = /^(\d+\.\s+|[-*+]\s+)/;
+  return listMarker.test(line);
+}
+/**
+ * Tells whether a block is a table row, a header or a list item, ignoring
+ * leading whitespace. Such blocks are kept whole instead of being split
+ * into sentences.
+ */
+function isStructuralBlock(block) {
+  const candidate = block.trimStart();
+  return [isTableRow, isHeader, isListItem].some((matches) =>
+    matches(candidate),
+  );
+}
+/**
+ * Splits a prose paragraph into sentences with the `compromise` NLP library
+ * and folds fragments shorter than MIN_SENTENCE_LENGTH into the sentence
+ * that follows them. Every returned string is sliced out of `text`, so the
+ * text between merged sentences is kept exactly as written.
+ *
+ * @param {string} text Prose to split.
+ * @param {number} [maxSentences=0] When non-zero and more sentences than
+ *   this are produced, consecutive sentences are grouped so that at most
+ *   `maxSentences` strings are returned.
+ * @returns {string[]} Sentences (or sentence groups) in document order.
+ */
+export function splitProse(text, maxSentences = 0) {
+  const sentences = nlp(text).sentences().out('array');
+  // Where each detected sentence sits in the original text
+  const spans = getPositions(text, sentences);
+  // Walk the sentences, remembering where the current run of too-short
+  // fragments began, so the run is emitted together with the next sentence
+  // that is long enough (or with the final sentence).
+  const mergedSpans = [];
+  let runStart = null;
+  sentences.forEach((sentence, idx) => {
+    runStart ??= spans[idx].start;
+    const isShort = sentence.replaceAll('\n', '').length < MIN_SENTENCE_LENGTH;
+    const isLast = idx === sentences.length - 1;
+    if (isShort && !isLast) {
+      return;
+    }
+    mergedSpans.push({ start: runStart, end: spans[idx].end });
+    runStart = null;
+  });
+  const merged = mergedSpans.map(({ start, end }) => text.slice(start, end));
+  if (!maxSentences || !(merged.length > maxSentences)) {
+    return merged;
+  }
+  // Too many sentences: slice whole groups from the original text so the
+  // separators inside each group are preserved.
+  const groupSize = Math.ceil(merged.length / maxSentences);
+  const grouped = [];
+  for (let first = 0; first < mergedSpans.length; first += groupSize) {
+    const last = Math.min(first + groupSize, mergedSpans.length) - 1;
+    grouped.push(text.slice(mergedSpans[first].start, mergedSpans[last].end));
+  }
+  return grouped;
+}
+/**
+ * Locates each sentence inside `text`, scanning forward so that repeated
+ * sentences map to successive occurrences.
+ *
+ * When a sentence cannot be found verbatim after the previous one, it is
+ * placed at the next non-whitespace character and clamped to the end of
+ * the text. Positions therefore never go negative or move backwards.
+ *
+ * @param {string} text Text the sentences were taken from.
+ * @param {string[]} sentences Sentences in document order.
+ * @returns {Array<{start: number, end: number}>} One span per sentence.
+ */
+function getPositions(text, sentences) {
+  const positions = [];
+  let searchFrom = 0;
+  for (const sentence of sentences) {
+    let start = text.indexOf(sentence, searchFrom);
+    if (start === -1) {
+      // Best effort: anchor the sentence at the cursor rather than letting
+      // -1 leak into the slice offsets.
+      start = searchFrom;
+      while (start < text.length && /\s/.test(text[start])) {
+        start += 1;
+      }
+    }
+    const end = Math.min(start + sentence.length, text.length);
+    positions.push({ start, end });
+    searchFrom = end;
+  }
+  return positions;
+}

package/src/halloumi/markdown-splitter.test.js ADDED Viewed

@@ -0,0 +1,133 @@
+import { splitMarkdown } from './markdown-splitter';
+const DUMMY_LLM_RESPONSE = `## European species most at risk
+Below is a concise, evidence-based snapshot of the taxonomic groups and flagship species. The list combines information on trend magnitude and risk status.
+| Taxonomic group | Species at risk | Main drivers | Signal |
+|-----------------|-----------------|--------------|--------|
+| **Birds** | Skylark (*Alauda arvensis*), European turtle-dove | Intensive agriculture, pesticide use | Farmland-bird index down 27% (1990-2019) |
+| **Fish** | Houting (*Coregonus oxyrhynchus*), Danube salmon | River regulation, dam construction | Both listed as critically endangered |
+| **Butterflies** | Large blue (*Phengaris arion*), Adonis blue | Loss of semi-natural grasslands | Grassland-butterfly index down 25% |
+### Key take-aways
+1. **Agricultural intensification** is the single biggest pressure across taxa.
+2. **Habitat loss and degradation** underlie the perilous status of many specialists.
+3. **Chemical stressors** affect both insects and the birds that depend on them.
+4. **Climate change** amplifies existing pressures such as drought stress on amphibians.
+---
+### Conservation priorities
+- **Protect and restore semi-natural habitats** including high-diversity grasslands and river floodplains.
+- **Implement agri-environment schemes** that limit pesticide use and maintain field margins.
+- **Focus monitoring on flagship species** to gauge the effectiveness of policy actions.
+If you need a deeper dive into a particular taxon, just let me know!`;
+describe('splitMarkdown', () => {
+  it('should split the dummy LLM response into meaningful segments', () => {
+    const segments = splitMarkdown(DUMMY_LLM_RESPONSE);
+    // Headers are separate segments
+    expect(segments).toContain('## European species most at risk');
+    expect(segments).toContain('### Key take-aways');
+    expect(segments).toContain('### Conservation priorities');
+    // Table rows are individual segments (separator skipped)
+    const tableRows = segments.filter((s) => s.startsWith('|'));
+    expect(tableRows.length).toBe(4); // header row + 3 data rows
+    // Numbered list items are separate
+    const numberedItems = segments.filter((s) => /^\d+\./.test(s.trimStart()));
+    expect(numberedItems.length).toBe(4);
+    // Bullet list items are separate
+    const bulletItems = segments.filter((s) => /^-\s/.test(s.trimStart()));
+    expect(bulletItems.length).toBe(3);
+    // Prose paragraphs are split into sentences
+    expect(segments).toContainEqual(
+      expect.stringContaining('evidence-based snapshot'),
+    );
+    expect(segments).toContainEqual(expect.stringContaining('trend magnitude'));
+    // Horizontal rules are skipped
+    expect(segments).not.toContainEqual(expect.stringMatching(/^-{3,}$/));
+    // No empty segments
+    expect(segments.every((s) => s.trim().length > 0)).toBe(true);
+  });
+  it('should handle plain prose without markdown', () => {
+    const text =
+      'This is sentence one. This is sentence two. This is sentence three.';
+    const segments = splitMarkdown(text);
+    expect(segments).toEqual([
+      'This is sentence one.',
+      'This is sentence two.',
+      'This is sentence three.',
+    ]);
+  });
+  it('should handle headers followed by prose', () => {
+    const text = '## My Header\n\nSome prose paragraph here.';
+    const segments = splitMarkdown(text);
+    expect(segments).toEqual(['## My Header', 'Some prose paragraph here.']);
+  });
+  it('should skip table separator rows', () => {
+    const text = '| A | B |\n|---|---|\n| 1 | 2 |';
+    const segments = splitMarkdown(text);
+    expect(segments).toEqual(['| A | B |', '| 1 | 2 |']);
+  });
+  it('should handle numbered lists', () => {
+    const text =
+      '1. First item with enough text.\n2. Second item also long enough.';
+    const segments = splitMarkdown(text);
+    expect(segments).toEqual([
+      '1. First item with enough text.',
+      '2. Second item also long enough.',
+    ]);
+  });
+  it('should handle bullet lists', () => {
+    const text =
+      '- First bullet point.\n- Second bullet point.\n- Third bullet point.';
+    const segments = splitMarkdown(text);
+    expect(segments).toEqual([
+      '- First bullet point.',
+      '- Second bullet point.',
+      '- Third bullet point.',
+    ]);
+  });
+  it('should skip horizontal rules', () => {
+    const text = 'Before the rule.\n\n---\n\nAfter the rule.';
+    const segments = splitMarkdown(text);
+    expect(segments).toEqual(['Before the rule.', 'After the rule.']);
+  });
+  it('should return an empty array for empty string', () => {
+    expect(splitMarkdown('')).toEqual([]);
+  });
+  it('should merge short prose fragments', () => {
+    const text = 'Hi. This is a longer sentence here.';
+    const segments = splitMarkdown(text);
+    expect(segments).toEqual(['Hi. This is a longer sentence here.']);
+  });
+  it('should join multi-line prose into a single paragraph before splitting', () => {
+    const text =
+      'This is the first line of a paragraph.\nThis is the second line of the same paragraph.';
+    const segments = splitMarkdown(text);
+    // Should be joined into one paragraph and then split into sentences
+    expect(segments.length).toBeGreaterThanOrEqual(1);
+    expect(segments.join('')).toContain('first line');
+    expect(segments.join('')).toContain('second line');
+  });
+});