npm - @eeacms/volto-eea-chatbot - Versions diffs - 1.0.11 → 1.0.13 - Mend

@eeacms/volto-eea-chatbot 1.0.11 → 1.0.13

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (27)

package/CHANGELOG.md +26 -2
package/README.md +8 -8
package/jest-addon.config.js +1 -1
package/package.json +2 -1
package/src/ChatBlock/chat/AIMessage.tsx +32 -25
package/src/ChatBlock/chat/ChatWindow.tsx +1 -0
package/src/ChatBlock/components/HalloumiFeedback.jsx +8 -4
package/src/ChatBlock/components/markdown/ClaimModal.jsx +1 -1
package/src/ChatBlock/components/markdown/ClaimSegments.jsx +2 -3
package/src/ChatBlock/components/markdown/RenderClaimView.jsx +1 -1
package/src/ChatBlock/components/markdown/index.js +41 -15
package/src/ChatBlock/hooks/useChatController.ts +0 -15
package/src/ChatBlock/hooks/useQualityMarkers.js +0 -11
package/src/ChatBlock/style.less +47 -0
package/src/ChatBlock/types/interfaces.ts +1 -0
package/src/halloumi/filtering.js +149 -0
package/src/halloumi/filtering.test.js +44 -0
package/src/halloumi/generative.js +157 -53
package/src/halloumi/generative.test.js +28 -8
package/src/halloumi/markdown-splitter.js +172 -0
package/src/halloumi/markdown-splitter.test.js +133 -0
package/src/halloumi/middleware.js +6 -6
package/src/halloumi/postprocessing.js +0 -26
package/src/halloumi/preprocessing.js +78 -76
package/src/halloumi/preprocessing.test.js +87 -148
package/src/middleware.js +3 -0
package/src/middleware.test.js +2 -0

package/src/halloumi/middleware.js CHANGED Viewed

@@ -1,7 +1,7 @@
-import debug from 'debug';
+// import debug from 'debug';
 import { getVerifyClaimResponse } from './generative';
-const log = debug('halloumi');
+// const log = debug('halloumi');
 const MSG_INVALID_CONFIGURATION =
   'Invalid configuration: missing LLMGW_TOKEN or LLMGW_URL';
@@ -45,8 +45,8 @@ export default async function middleware(req, res, next) {
   };
   const body = req.body;
-  log('Halloumi body', body);
-  const { sources, answer, maxContextSegments = 0 } = body;
+  // log('Halloumi body', body);
+  const { sources, answer } = body;
   res.set('Content-Type', 'application/json');
@@ -56,9 +56,9 @@ export default async function middleware(req, res, next) {
       // TODO: map with citation id
       sources,
       answer,
-      maxContextSegments,
+      { ip: req.headers['x-forwarded-for'] || req.ip },
     );
-    log('Halloumi response', resp);
+    // log('Halloumi response', resp);
     res.send(resp);
   } catch (error) {
     res.status(500).send({

package/src/halloumi/postprocessing.js CHANGED Viewed

@@ -1,29 +1,3 @@
-// /**
-//  * Represents a claim object with all relevant information.
-//  */
-// export interface GenerativeClaim {
-//     claimId: number;
-//     claimString: string;
-//     subclaims: string[];
-//     segments: number[];
-//     explanation: string;
-//     supported: boolean;
-//     probabilities: Map<string, number>;
-// }
-//
-// export interface OpenAITokenLogProb {
-//     token: string;
-//     bytes: number[];
-//     logprob: number;
-// }
-//
-// export interface OpenAILogProb {
-//     token: string;
-//     bytes: number[];
-//     logprob: number;
-//     top_logprobs: OpenAITokenLogProb[];
-// }
-//
 /**
  * Gets the claim id from a subsegment.
  * @param subsegment A subsegment string of the form "<|r1|".

package/src/halloumi/preprocessing.js CHANGED Viewed

@@ -1,46 +1,4 @@
-const DEFAULT_HALLOUMI_REQUEST =
-  'Make one or more claims about information in the documents.';
-/**
- * Splits a given text into sentences using sentence-splitter.
- * @param text The input string to split.
- * @returns An array of sentence strings.
- */
-export function splitIntoSentences(text, maxSegments = 0) {
-  const segmenter = new Intl.Segmenter('en', { granularity: 'sentence' });
-  const segments = Array.from(segmenter.segment(text)).map((s) => s.segment);
-  const initialSentences = [];
-  let currentSentence = '';
-  for (const segment of segments) {
-    currentSentence += segment;
-    if (currentSentence.trim().length > 8) {
-      initialSentences.push(currentSentence);
-      currentSentence = '';
-    }
-  }
-  // Push any remaining part that didn't make it to 8 characters
-  if (currentSentence) {
-    initialSentences.push(currentSentence);
-  }
-  if (maxSegments <= 0) {
-    return initialSentences;
-  }
-  if (initialSentences.length > maxSegments) {
-    const groupSize = Math.ceil(initialSentences.length / maxSegments);
-    const mergedSentences = [];
-    for (let i = 0; i < initialSentences.length; i += groupSize) {
-      const group = initialSentences.slice(i, i + groupSize);
-      mergedSentences.push(group.join(''));
-    }
-    return mergedSentences;
-  }
-  return initialSentences;
-}
+const MAX_CONTEXT_SEGMENTS_PER_CHUNK = 100;
 /**
  * Annotate a set of sentences with a given annotation character.
@@ -48,11 +6,26 @@ export function splitIntoSentences(text, maxSegments = 0) {
  * @param annotationChar The character to use for annotation.
  * @returns The annotated string with annotation characters + sentence number.
  */
-export function annotate(sentences, annotationChar) {
+export function annotate(sentences, annotationChar, excludeIndices) {
   return sentences
+    .map((sentence, i) => {
+      const id = i + 1;
+      if (excludeIndices && excludeIndices.has(id)) {
+        return '';
+      }
+      return `<|${annotationChar}${id}|><${sentence}><end||${annotationChar}>`;
+    })
+    .join('');
+}
+/**
+ * Annotates a chunk of indexed sentences with their global IDs.
+ */
+function annotateChunk(chunk, annotationChar) {
+  return chunk
     .map(
-      (sentence, i) =>
-        `<|${annotationChar}${i + 1}|><${sentence}><end||${annotationChar}>`,
+      ({ sentence, globalId }) =>
+        `<|${annotationChar}${globalId}|><${sentence}><end||${annotationChar}>`,
     )
     .join('');
 }
@@ -74,42 +47,71 @@ export function getOffsets(originalString, sentences) {
 }
 /**
- * Creates a Halloumi prompt from a given context, request and response.
- * @param context The context or document to reference.
- * @param response The response to the request.
- * @param request The request or question that was used to produce the response.
- * @returns The Halloumi prompt.
+ * Creates multiple HallOumi prompts by chunking context segments.
+ * Each chunk uses global segment IDs (s5, s42, ...) so no local-to-global
+ * mapping is needed when merging results.
+ *
+ * @returns {{ prompts: Array<{prompt, responseOffsets}> }}
  */
-export function createHalloumiPrompt({
-  sources,
-  response,
-  request,
-  maxContextSegments = 0,
+export function createChunkedHalloumiPrompts({
+  indexedContextSentences,
+  responseSentences,
+  responseOffsets,
+  request = 'Make one or more claims about information in the documents.',
+  excludeResponseIndices,
 }) {
-  const finalRequest = request || DEFAULT_HALLOUMI_REQUEST;
-  const contextSentences = sources.flatMap((text) =>
-    splitIntoSentences(text, maxContextSegments),
+  // Build response annotation (same for all chunks)
+  const annotatedResponseSentences = annotate(
+    responseSentences || [],
+    'r',
+    excludeResponseIndices,
   );
-  const joinedContext = sources.join('\n');
-  // const contextSentences = splitIntoSentences(sources, maxContextSegments);
-  const contextOffsets = getOffsets(joinedContext, contextSentences);
+  const annotatedRequest = `<|request|><${request}><end||request>`;
+  const annotatedResponse = `<|response|>${annotatedResponseSentences}<end||response>`;
-  const annotatedContextSentences = annotate(contextSentences, 's');
+  // Group sentences by source
+  const sourceGroups = [];
+  let currentSourceId = null;
+  for (const s of indexedContextSentences) {
+    if (s.sourceId !== currentSourceId) {
+      sourceGroups.push([]);
+      currentSourceId = s.sourceId;
+    }
+    sourceGroups[sourceGroups.length - 1].push(s);
+  }
-  const responseSentences = splitIntoSentences(response, maxContextSegments);
-  const responseOffsets = getOffsets(response, responseSentences);
-  const annotatedResponseSentences = annotate(responseSentences, 'r');
+  // Pack whole sources into chunks (first-fit decreasing bin packing)
+  // Sort by size descending so large sources get placed first
+  const sorted = [...sourceGroups].sort((a, b) => b.length - a.length);
+  const chunks = [];
+  const chunkSizes = [];
+  for (const group of sorted) {
+    let placed = false;
+    for (let c = 0; c < chunks.length; c++) {
+      if (chunkSizes[c] + group.length <= MAX_CONTEXT_SEGMENTS_PER_CHUNK) {
+        chunks[c].push(...group);
+        chunkSizes[c] += group.length;
+        placed = true;
+        break;
+      }
+    }
+    if (!placed) {
+      chunks.push([...group]);
+      chunkSizes.push(group.length);
+    }
+  }
+  if (chunks.length === 0) chunks.push([]);
-  const annotatedContext = `<|context|>${annotatedContextSentences}<end||context>`;
-  const annotatedRequest = `<|request|><${finalRequest.trim()}><end||request>`;
-  const annotatedResponse = `<|response|>${annotatedResponseSentences}<end||response>`;
+  // Build one prompt per chunk with global segment IDs
+  const prompts = chunks.map((chunk) => {
+    const annotatedContext = `<|context|>${annotateChunk(
+      chunk,
+      's',
+    )}<end||context>`;
+    const prompt = `${annotatedContext}${annotatedRequest}${annotatedResponse}`;
-  const prompt = `${annotatedContext}${annotatedRequest}${annotatedResponse}`;
-  const halloumiPrompt = {
-    prompt,
-    contextOffsets, // used by convertGenerativesClaimToVerifyClaimResponse
-    responseOffsets,
-  };
+    return { prompt, responseOffsets };
+  });
-  return halloumiPrompt;
+  return { prompts };
 }

package/src/halloumi/preprocessing.test.js CHANGED Viewed

@@ -1,69 +1,9 @@
 import {
-  createHalloumiPrompt,
-  splitIntoSentences,
+  createChunkedHalloumiPrompts,
   annotate,
   getOffsets,
 } from './preprocessing';
-describe('splitIntoSentences', () => {
-  it('should split a basic text into sentences', () => {
-    const text =
-      'This is sentence one. This is sentence two. This is sentence three.';
-    const expected = [
-      'This is sentence one. ',
-      'This is sentence two. ',
-      'This is sentence three.',
-    ];
-    expect(splitIntoSentences(text)).toEqual(expected);
-  });
-  it('should handle short sentences by merging them', () => {
-    const text = 'Short. This is a longer sentence. Also short.';
-    const expected = ['Short. This is a longer sentence. ', 'Also short.'];
-    expect(splitIntoSentences(text)).toEqual(expected);
-  });
-  it('should return an empty array for an empty string', () => {
-    expect(splitIntoSentences('')).toEqual([]);
-  });
-  it('should handle a single sentence', () => {
-    const text = 'This is a single sentence.';
-    expect(splitIntoSentences(text)).toEqual(['This is a single sentence.']);
-  });
-  it('should handle text without punctuation', () => {
-    const text = 'This is a sentence without punctuation';
-    expect(splitIntoSentences(text)).toEqual([
-      'This is a sentence without punctuation',
-    ]);
-  });
-  it('should not merge sentences when maxSegments is 0', () => {
-    const text = 'One. Two. Three. Four. Five.';
-    const expected = ['One. Two. ', 'Three. Four. ', 'Five.'];
-    expect(splitIntoSentences(text, 0)).toEqual(expected);
-  });
-  it('should not merge sentences when finalSentences.length <= maxSegments', () => {
-    const text = 'One. Two. Three.';
-    const expected = ['One. Two. ', 'Three.'];
-    expect(splitIntoSentences(text, 3)).toEqual(expected);
-  });
-  it('should merge sentences when finalSentences.length > maxSegments', () => {
-    const text = 'One. Two. Three. Four. Five.';
-    const expected = ['One. Two. Three. Four. ', 'Five.'];
-    expect(splitIntoSentences(text, 2)).toEqual(expected);
-  });
-  it('should merge sentences into a single segment if maxSegments is 1', () => {
-    const text = 'One. Two. Three. Four. Five.';
-    const expected = ['One. Two. Three. Four. Five.'];
-    expect(splitIntoSentences(text, 1)).toEqual(expected);
-  });
-});
 describe('annotate', () => {
   it('should annotate multiple sentences correctly', () => {
     const sentences = ['Sentence one.', 'Sentence two.'];
@@ -141,105 +81,104 @@ describe('getOffsets', () => {
   });
 });
-describe('createHalloumiPrompt', () => {
-  it('should create a Halloumi prompt with annotated context and response', () => {
-    const sources = [
-      'This is the first source. This is its second sentence.',
-      'This is the second source.',
+describe('createChunkedHalloumiPrompts', () => {
+  it('should create a single chunk for small input', () => {
+    const indexedContextSentences = [
+      { sentence: 'Context one.', sourceId: 1, globalId: 1 },
+      { sentence: 'Context two.', sourceId: 1, globalId: 2 },
     ];
-    const response = 'This is the response. It has two sentences.';
-    const request = 'Test request.';
-    const result = createHalloumiPrompt({ sources, response, request });
-    // Expect the prompt to contain annotated context and response
-    expect(result.prompt).toContain(
-      '<|context|><|s1|><This is the first source. ><end||s><|s2|><This is its second sentence.><end||s><|s3|><This is the second source.><end||s><end||context>',
-    );
-    expect(result.prompt).toContain('<|request|><Test request.><end||request>');
-    expect(result.prompt).toContain(
-      '<|response|><|r1|><This is the response. ><end||r><|r2|><It has two sentences.><end||r><end||response>',
-    );
-    // Expect contextOffsets and responseOffsets to be correctly populated
-    expect(result.contextOffsets).toBeInstanceOf(Map);
-    const s1 = 'This is the first source. ';
-    const s2 = 'This is its second sentence.';
-    const s3 = 'This is the second source.';
-    expect(result.contextOffsets.get(1)).toEqual({
-      startOffset: 0,
-      endOffset: s1.length,
-    });
-    expect(result.contextOffsets.get(2)).toEqual({
-      startOffset: s1.length,
-      endOffset: s1.length + s2.length,
-    });
-    expect(result.contextOffsets.get(3)).toEqual({
-      startOffset: s1.length + s2.length + 1, // +1 for the space between sentences
-      endOffset: s1.length + s2.length + 1 + s3.length,
-    });
-    expect(result.responseOffsets).toBeInstanceOf(Map);
-    const r1 = 'This is the response. ';
-    const r2 = 'It has two sentences.';
+    const responseSentences = ['Response one.', 'Response two.'];
+    const responseOffsets = new Map([
+      [1, { startOffset: 0, endOffset: 13 }],
+      [2, { startOffset: 14, endOffset: 28 }],
+    ]);
-    expect(result.responseOffsets.get(1)).toEqual({
-      startOffset: 0,
-      endOffset: r1.length,
-    });
-    expect(result.responseOffsets.get(2)).toEqual({
-      startOffset: r1.length,
-      endOffset: r1.length + r2.length,
+    const { prompts } = createChunkedHalloumiPrompts({
+      indexedContextSentences,
+      responseSentences,
+      responseOffsets,
     });
+    expect(prompts).toHaveLength(1);
+    expect(prompts[0].prompt).toContain('<|s1|><Context one.><end||s>');
+    expect(prompts[0].prompt).toContain('<|s2|><Context two.><end||s>');
+    expect(prompts[0].prompt).toContain('<|r1|><Response one.><end||r>');
+    expect(prompts[0].prompt).toContain('<|r2|><Response two.><end||r>');
   });
-  it('should handle empty sources, response, and request', () => {
-    const sources = [];
-    const response = '';
-    const request = '';
+  it('should handle empty context', () => {
+    const responseSentences = ['Response.'];
+    const responseOffsets = new Map([[1, { startOffset: 0, endOffset: 9 }]]);
-    const result = createHalloumiPrompt({ sources, response, request });
+    const { prompts } = createChunkedHalloumiPrompts({
+      indexedContextSentences: [],
+      responseSentences,
+      responseOffsets,
+    });
-    expect(result.prompt).toBe(
-      '<|context|><end||context><|request|><Make one or more claims about information in the documents.><end||request><|response|><end||response>',
-    );
-    expect(result.contextOffsets).toBeInstanceOf(Map);
-    expect(result.contextOffsets.size).toBe(0);
-    expect(result.responseOffsets).toBeInstanceOf(Map);
-    expect(result.responseOffsets.size).toBe(0);
+    expect(prompts).toHaveLength(1);
+    expect(prompts[0].prompt).toContain('<|context|><end||context>');
   });
-  it('should handle maxContextSegments correctly', () => {
-    const sources = [
-      'Sentence one. Sentence two. Sentence three. Sentence four.',
+  it('should exclude response sentences based on excludeResponseIndices', () => {
+    const indexedContextSentences = [
+      { sentence: 'Context.', sourceId: 1, globalId: 1 },
     ];
-    const response = 'Response one. Response two.';
-    const request = 'Test request.';
-    const maxContextSegments = 2;
-    const result = createHalloumiPrompt({
-      sources,
-      response,
-      request,
-      maxContextSegments,
-    });
+    const responseSentences = ['Keep this.', 'Skip this.', 'Keep too.'];
+    const responseOffsets = new Map([
+      [1, { startOffset: 0, endOffset: 10 }],
+      [2, { startOffset: 11, endOffset: 21 }],
+      [3, { startOffset: 22, endOffset: 31 }],
+    ]);
+    const excludeResponseIndices = new Set([2]);
-    // With maxContextSegments = 2, the 4 sentences should be merged into 2.
-    // "Sentence one. Sentence two." and "Sentence three. Sentence four."
-    expect(result.prompt).toContain(
-      '<|context|><|s1|><Sentence one. Sentence two. ><end||s><|s2|><Sentence three. Sentence four.><end||s><end||context>',
-    );
-    const mergedS1 = 'Sentence one. Sentence two. ';
-    const mergedS2 = 'Sentence three. Sentence four.';
-    expect(result.contextOffsets.get(1)).toEqual({
-      startOffset: 0,
-      endOffset: mergedS1.length,
+    const { prompts } = createChunkedHalloumiPrompts({
+      indexedContextSentences,
+      responseSentences,
+      responseOffsets,
+      excludeResponseIndices,
     });
-    expect(result.contextOffsets.get(2)).toEqual({
-      startOffset: mergedS1.length,
-      endOffset: mergedS1.length + mergedS2.length,
+    expect(prompts[0].prompt).toContain('<|r1|><Keep this.><end||r>');
+    expect(prompts[0].prompt).not.toContain('Skip this.');
+    expect(prompts[0].prompt).toContain('<|r3|><Keep too.><end||r>');
+  });
+  it('should keep sources together with bin packing', () => {
+    // 3 sources: 60 + 60 + 30 = 150 sentences
+    // Should pack into 2 chunks: (60+30) and (60) or similar
+    const indexed = [];
+    let gid = 1;
+    // Source 1: 60 sentences
+    for (let i = 0; i < 60; i++) {
+      indexed.push({ sentence: `S1-${i}`, sourceId: 1, globalId: gid++ });
+    }
+    // Source 2: 60 sentences
+    for (let i = 0; i < 60; i++) {
+      indexed.push({ sentence: `S2-${i}`, sourceId: 2, globalId: gid++ });
+    }
+    // Source 3: 30 sentences
+    for (let i = 0; i < 30; i++) {
+      indexed.push({ sentence: `S3-${i}`, sourceId: 3, globalId: gid++ });
+    }
+    const { prompts } = createChunkedHalloumiPrompts({
+      indexedContextSentences: indexed,
+      responseSentences: ['Claim.'],
+      responseOffsets: new Map([[1, { startOffset: 0, endOffset: 6 }]]),
     });
+    expect(prompts).toHaveLength(2);
+    // Each source should be entirely within one chunk
+    for (const prompt of prompts) {
+      const s1Count = (prompt.prompt.match(/S1-/g) || []).length;
+      const s2Count = (prompt.prompt.match(/S2-/g) || []).length;
+      const s3Count = (prompt.prompt.match(/S3-/g) || []).length;
+      if (s1Count > 0) expect(s1Count).toBe(60);
+      if (s2Count > 0) expect(s2Count).toBe(60);
+      if (s3Count > 0) expect(s3Count).toBe(30);
+    }
   });
 });

package/src/middleware.js CHANGED Viewed

@@ -151,6 +151,7 @@ async function send_onyx_request(
   res,
   { username, password, api_key, url, is_related_question },
 ) {
+  const forwardedFor = req.headers['x-forwarded-for'] || req.ip;
   let headers = {};
   if (!api_key) {
     await login(username, password);
@@ -166,11 +167,13 @@ async function send_onyx_request(
     headers = {
       Cookie: cached_auth_cookie,
       'Content-Type': 'application/json',
+      'X-Forwarded-For': forwardedFor,
     };
   } else {
     headers = {
       Authorization: 'Bearer ' + api_key,
       'Content-Type': 'application/json',
+      'X-Forwarded-For': forwardedFor,
     };
   }

package/src/middleware.test.js CHANGED Viewed

@@ -59,6 +59,8 @@ describe('src/middleware', () => {
       url: '/_da/chat/send-message',
       method: 'POST',
       body: { message: 'hello' },
+      ip: '127.0.0.1',
+      headers: {},
     };
     res = {
       send: jest.fn(),