npm - @deepcitation/deepcitation-js - Versions diffs - 1.1.18 → 1.1.19 - Mend

@deepcitation/deepcitation-js 1.1.18 → 1.1.19

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/lib/parsing/normalizeCitation.d.ts +1 -1
package/lib/parsing/normalizeCitation.js +28 -18
package/lib/parsing/parseCitation.js +4 -4
package/lib/prompts/citationPrompts.d.ts +3 -2
package/lib/prompts/citationPrompts.js +4 -3
package/lib/prompts/promptCompression.js +30 -16
package/package.json +1 -1

package/lib/parsing/normalizeCitation.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-export declare const removeCitations: (pageText: string, leaveValueBehind?: boolean) => string;
+export declare const removeCitations: (pageText: string, leaveKeySpanBehind?: boolean) => string;
 export declare const removePageNumberMetadata: (pageText: string) => string;
 export declare const removeLineIdMetadata: (pageText: string) => string;
 export declare const getCitationPageNumber: (startPageKey?: string | null) => number | null;

package/lib/parsing/normalizeCitation.js CHANGED Viewed

@@ -1,9 +1,9 @@
-export const removeCitations = (pageText, leaveValueBehind) => {
-    const citationRegex = /<cite\s+(?:fileId|attachmentId)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
-    return pageText.replace(citationRegex, (match, attachmentId, pageNumber, index, fullPhrase, lineIds, value) => {
+export const removeCitations = (pageText, leaveKeySpanBehind) => {
+    const citationRegex = /<cite\s+(?:fileId|attachmentId)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+key_span='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
+    return pageText.replace(citationRegex, (match, attachmentId, pageNumber, index, fullPhrase, keySpan, lineIds, value) => {
         //it is still value= so we need to remove the value=
-        if (leaveValueBehind) {
-            return value?.replace(/value=['"]|['"]/g, "") || "";
+        if (leaveKeySpanBehind) {
+            return keySpan?.replace(/key_span=['"]|['"]/g, "") || "";
         }
         else {
             return "";
@@ -45,19 +45,29 @@ const normalizeCitationContent = (input) => {
     // Replace ></cite> with /> for consistency
     normalized = normalized.replace(/><\/cite>/g, "/>");
     const canonicalizeCiteAttributeKey = (key) => {
-        if (key === "fullPhrase" || key === "full_phrase")
+        const lowerKey = key.toLowerCase();
+        if (lowerKey === "fullphrase" || lowerKey === "full_phrase")
             return "full_phrase";
-        if (key === "lineIds" || key === "line_ids")
+        if (lowerKey === "lineids" || lowerKey === "line_ids")
             return "line_ids";
-        if (key === "startPageKey" ||
-            key === "start_pageKey" ||
-            key === "start_page_key")
+        if (lowerKey === "startpagekey" ||
+            lowerKey === "start_pagekey" ||
+            lowerKey === "start_page_key")
             return "start_page_key";
-        if (key === "fileID" || key === "fileId" || key === "file_id" || key === "attachmentId" || key === "attachment_id")
-            return "file_id";
-        if (key === "keySpan" || key === "key_span")
+        if (lowerKey === "fileid" ||
+            lowerKey === "file_id" ||
+            lowerKey === "attachmentid" ||
+            lowerKey === "attachment_id")
+            return "attachment_id";
+        if (lowerKey === "keyspan" || lowerKey === "key_span")
             return "key_span";
-        return key;
+        if (lowerKey === "reasoning" || lowerKey === "value")
+            return lowerKey;
+        if (lowerKey === "timestamps" ||
+            lowerKey === "timestamp" ||
+            lowerKey === "timestamps")
+            return "timestamps";
+        return lowerKey;
     };
     // Helper to decode HTML entities (simple implementation, expand if needed)
     const decodeHtmlEntities = (str) => {
@@ -146,16 +156,16 @@ const normalizeCitationContent = (input) => {
         const startPageKeys = keys.filter((k) => k.startsWith("start_page"));
         const ordered = [];
         // Shared first
-        if (attrs.file_id)
-            ordered.push("file_id");
+        if (attrs.attachment_id)
+            ordered.push("attachment_id");
         if (hasTimestamps) {
-            // AV citations: fileId, full_phrase, timestamps, (optional reasoning/value), then any extras
+            // AV citations: attachment_id, full_phrase, timestamps, (optional reasoning/value), then any extras
             if (attrs.full_phrase)
                 ordered.push("full_phrase");
             ordered.push("timestamps");
         }
         else {
-            // Document citations: fileId, start_page*, full_phrase, key_span, line_ids, (optional reasoning/value), then any extras
+            // Document citations: attachment_id, start_page*, full_phrase, key_span, line_ids, (optional reasoning/value), then any extras
             if (startPageKeys.includes("start_page_key"))
                 ordered.push("start_page_key");
             startPageKeys

package/lib/parsing/parseCitation.js CHANGED Viewed

@@ -94,9 +94,9 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
     // 4: full_phrase content (escaped)
     // 5: key_span content (escaped)
     // 6: line_ids content
-    // 6: Optional Key (value|reasoning)
-    // 7: Optional Value content (escaped)
-    const citationRegex = /<cite\s+file(?:_id|Id)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+key_span='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
+    // 7: Optional Key (value|reasoning)
+    // 8: Optional Value content (escaped)
+    const citationRegex = /<cite\s+(?:attachment_id|attachmentId|file_id|fileId)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+key_span='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
     const citationMatches = [...middleCite.matchAll(citationRegex)];
     const match = citationMatches?.[0];
     const pageNumber = match?.[2] ? parseInt(match?.[2]) : undefined;
@@ -133,7 +133,7 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
     // 3: timestamps content
     // 4: Optional Key (value|reasoning)
     // 5: Optional Value content (escaped)
-    const avCitationRegex = /<cite\s+file(?:_id|Id)='(\w{0,25})'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+timestamps='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
+    const avCitationRegex = /<cite\s+(?:attachment_id|attachmentId|file_id|fileId)='(\w{0,25})'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+timestamps='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
     const avCitationMatches = [...middleCite.matchAll(avCitationRegex)];
     const avMatch = avCitationMatches?.[0];
     let timestamps;

package/lib/prompts/citationPrompts.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-export declare const CITATION_MARKDOWN_SYNTAX_PROMPT = "\nCitation syntax to use within Markdown:\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite file_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <file_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />\n\n\u2022 Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.\n\u2022 start_page_key, full_phrase, and line_ids are required for each citation.\n\u2022 Infer line_ids, as we only provide the first, last, and every 5th line. When copying a previous <cite />, use the full info from the previous citation without changing the start_page_key, line_ids, or any other <cite /> attributes.\n\u2022 Use refer to line_ids inclusively, and use a range (or single) for each citation, split multiple sequential line_ids into multiple citations.\n\u2022 These citations will be replaced and displayed in-line as a numeric element (e.g. [1]), the markdown preceding <cite /> should read naturally with only one <cite /> per sentence with rare exceptions for two <cite /> in a sentence. <cite /> often present best at the end of the sentence, and are not grouped at the end of the document.\n\u2022 The full_phrase should be the exact verbatim text of the phrase or paragraph from the source document to support the insight or idea.\n\u2022 We do NOT put the full_phrase inside <cite ...></cite>; we only use full_phrase inside the full_phrase attribute.\n";
-export declare const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = "\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite file_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />\n\u2022 These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.\n";
+export declare const CITATION_MARKDOWN_SYNTAX_PROMPT = "\nCitation syntax to use within Markdown:\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite attachment_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <attachment_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />\n\n\u2022 Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.\n\u2022 start_page_key, full_phrase, and line_ids are required for each citation.\n\u2022 Infer line_ids, as we only provide the first, last, and every 5th line. When copying a previous <cite />, use the full info from the previous citation without changing the start_page_key, line_ids, or any other <cite /> attributes.\n\u2022 Use refer to line_ids inclusively, and use a range (or single) for each citation, split multiple sequential line_ids into multiple citations.\n\u2022 These citations will be replaced and displayed in-line as a numeric element (e.g. [1]), the markdown preceding <cite /> should read naturally with only one <cite /> per sentence with rare exceptions for two <cite /> in a sentence. <cite /> often present best at the end of the sentence, and are not grouped at the end of the document.\n\u2022 The full_phrase should be the exact verbatim text of the phrase or paragraph from the source document to support the insight or idea.\n\u2022 We do NOT put the full_phrase inside <cite ...></cite>; we only use full_phrase inside the full_phrase attribute.\n";
+export declare const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = "\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite attachment_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />\n\u2022 These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.\n";
 export interface WrapSystemPromptOptions {
     /** The original system prompt to wrap with citation instructions */
     systemPrompt: string;
@@ -134,4 +134,5 @@ export declare const CITATION_AV_BASED_JSON_OUTPUT_FORMAT: {
             description: string;
         };
     };
+    required: string[];
 };

package/lib/prompts/citationPrompts.js CHANGED Viewed

@@ -1,7 +1,7 @@
 export const CITATION_MARKDOWN_SYNTAX_PROMPT = `
 Citation syntax to use within Markdown:
 • To support any ideas or information that requires a citation from the provided content, use the following citation syntax:
-<cite file_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <file_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />
+<cite attachment_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <attachment_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />
 • Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.
 • start_page_key, full_phrase, and line_ids are required for each citation.
@@ -13,7 +13,7 @@ Citation syntax to use within Markdown:
 `;
 export const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = `
 • To support any ideas or information that requires a citation from the provided content, use the following citation syntax:
-<cite file_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />
+<cite attachment_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />
 • These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.
 `;
 /**
@@ -121,7 +121,7 @@ export const CITATION_JSON_OUTPUT_FORMAT = {
         },
         fullPhrase: {
             type: "string",
-            description: "The verbatim text of the terse phrase inside <file_text /> to support the citation (if there is a detected OCR correction, use the corrected text)",
+            description: "The verbatim text of the terse phrase inside <attachment_text /> to support the citation (if there is a detected OCR correction, use the corrected text)",
         },
         keySpan: {
             type: "string",
@@ -164,4 +164,5 @@ export const CITATION_AV_BASED_JSON_OUTPUT_FORMAT = {
             description: "The timestamp of the audio or video frame including milliseconds formatted as: HH:MM:SS.SSS",
         },
     },
+    required: ["attachmentId", "startPageKey", "fullPhrase", "timestamps"],
 };

package/lib/prompts/promptCompression.js CHANGED Viewed

@@ -14,17 +14,20 @@ function buildSafePrefixMap(ids, prompt) {
             const digitCount = (prefix.match(/\d/g) || []).length;
             const letterCount = (prefix.match(/[a-zA-Z]/g) || []).length;
             if (prefix.length < MIN_PREFIX_LENGTH ||
-                (digitCount > 0 && letterCount < MIN_CHARACTERS_PER_PREFIX_WITH_AT_LEAST_ONE_DIGIT) ||
-                (digitCount === 0 && letterCount < MIN_CHARACTERS_PER_PREFIX_WITH_NO_DIGITS)) {
+                (digitCount > 0 &&
+                    letterCount < MIN_CHARACTERS_PER_PREFIX_WITH_AT_LEAST_ONE_DIGIT) ||
+                (digitCount === 0 &&
+                    letterCount < MIN_CHARACTERS_PER_PREFIX_WITH_NO_DIGITS)) {
                 continue;
             }
             // 1) Unique among IDs
-            if (ids.some(other => other !== id && other.startsWith(prefix))) {
+            if (ids.some((other) => other !== id && other.startsWith(prefix))) {
                 continue;
             }
             // 2) Only appears in prompt as part of the full ID
             const esc = (s) => s.replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&");
-            const prefixCount = (prompt.match(new RegExp(esc(prefix), "g")) || []).length;
+            const prefixCount = (prompt.match(new RegExp(esc(prefix), "g")) || [])
+                .length;
             const fullCount = (prompt.match(new RegExp(esc(id), "g")) || []).length;
             if (prefixCount !== fullCount) {
                 continue;
@@ -91,18 +94,29 @@ export function decompressPromptIds(compressed, prefixMap) {
         const escPrefix = prefix.replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&");
         text = text.replace(new RegExp(escPrefix, "g"), full);
     }
-    //this is for citation attachmentId, file_id, or fileId
-    if (entries.length === 1 && (text.includes("file_id='") || text.includes('file_id="'))) {
-        const fullId = entries[0][1];
-        text = text.replace(/file_id='[^']*'|file_id="[^"]*"/g, `file_id='${fullId}'`);
-    }
-    else if (entries.length === 1 && (text.includes("fileId='") || text.includes('fileId="'))) {
-        const fullId = entries[0][1];
-        text = text.replace(/fileId='[^']*'|fileId="[^"]*"/g, `fileId='${fullId}'`);
-    }
-    else if (entries.length === 1 && (text.includes("attachmentId='") || text.includes('attachmentId="'))) {
-        const fullId = entries[0][1];
-        text = text.replace(/attachmentId='[^']*'|attachmentId="[^"]*"/g, `attachmentId='${fullId}'`);
+    // Handle cases where the LLM may output ID in a different attribute format
+    // We look for common ID attribute patterns and replace compressed prefixes within them
+    // Note: fileId variants are supported for backwards compatibility with legacy citations
+    const idAttributeKeys = [
+        "attachmentId",
+        "attachment_id",
+        "attachment_ID",
+        "attachmentID",
+        "fileId",
+        "file_id",
+        "file_ID",
+        "fileID",
+        "fileid",
+    ];
+    // For each prefix, look for it within ID attribute values and replace with full ID
+    for (const [prefix, full] of entries) {
+        const escPrefix = prefix.replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&");
+        const keyPattern = idAttributeKeys.join("|");
+        const quotePattern = "([\"'`])";
+        // Match: attributeName = 'prefix' or attributeName="prefix" etc.
+        // Only replace the prefix part, preserving the attribute name and quotes
+        const re = new RegExp(`(${keyPattern})(\\s*=\\s*)${quotePattern}${escPrefix}\\3`, "g");
+        text = text.replace(re, `$1$2$3${full}$3`);
     }
     const newLength = text?.length;
     const diff = originalLength - newLength;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@deepcitation/deepcitation-js",
-    "version": "1.1.18",
+    "version": "1.1.19",
     "description": "DeepCitation JavaScript SDK for deterministic AI citation verification",
     "type": "module",
     "private": false,