@deepcitation/deepcitation-js 1.1.18 → 1.1.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- export declare const removeCitations: (pageText: string, leaveValueBehind?: boolean) => string;
1
+ export declare const removeCitations: (pageText: string, leaveKeySpanBehind?: boolean) => string;
2
2
  export declare const removePageNumberMetadata: (pageText: string) => string;
3
3
  export declare const removeLineIdMetadata: (pageText: string) => string;
4
4
  export declare const getCitationPageNumber: (startPageKey?: string | null) => number | null;
@@ -1,9 +1,9 @@
1
- export const removeCitations = (pageText, leaveValueBehind) => {
2
- const citationRegex = /<cite\s+(?:fileId|attachmentId)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
3
- return pageText.replace(citationRegex, (match, attachmentId, pageNumber, index, fullPhrase, lineIds, value) => {
1
+ export const removeCitations = (pageText, leaveKeySpanBehind) => {
2
+ const citationRegex = /<cite\s+(?:fileId|attachmentId)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+key_span='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
3
+ return pageText.replace(citationRegex, (match, attachmentId, pageNumber, index, fullPhrase, keySpan, lineIds, value) => {
4
4
  //it is still value= so we need to remove the value=
5
- if (leaveValueBehind) {
6
- return value?.replace(/value=['"]|['"]/g, "") || "";
5
+ if (leaveKeySpanBehind) {
6
+ return keySpan?.replace(/key_span=['"]|['"]/g, "") || "";
7
7
  }
8
8
  else {
9
9
  return "";
@@ -45,19 +45,29 @@ const normalizeCitationContent = (input) => {
45
45
  // Replace ></cite> with /> for consistency
46
46
  normalized = normalized.replace(/><\/cite>/g, "/>");
47
47
  const canonicalizeCiteAttributeKey = (key) => {
48
- if (key === "fullPhrase" || key === "full_phrase")
48
+ const lowerKey = key.toLowerCase();
49
+ if (lowerKey === "fullphrase" || lowerKey === "full_phrase")
49
50
  return "full_phrase";
50
- if (key === "lineIds" || key === "line_ids")
51
+ if (lowerKey === "lineids" || lowerKey === "line_ids")
51
52
  return "line_ids";
52
- if (key === "startPageKey" ||
53
- key === "start_pageKey" ||
54
- key === "start_page_key")
53
+ if (lowerKey === "startpagekey" ||
54
+ lowerKey === "start_pagekey" ||
55
+ lowerKey === "start_page_key")
55
56
  return "start_page_key";
56
- if (key === "fileID" || key === "fileId" || key === "file_id" || key === "attachmentId" || key === "attachment_id")
57
- return "file_id";
58
- if (key === "keySpan" || key === "key_span")
57
+ if (lowerKey === "fileid" ||
58
+ lowerKey === "file_id" ||
59
+ lowerKey === "attachmentid" ||
60
+ lowerKey === "attachment_id")
61
+ return "attachment_id";
62
+ if (lowerKey === "keyspan" || lowerKey === "key_span")
59
63
  return "key_span";
60
- return key;
64
+ if (lowerKey === "reasoning" || lowerKey === "value")
65
+ return lowerKey;
66
+ if (lowerKey === "timestamps" ||
67
+ lowerKey === "timestamp" ||
68
+ lowerKey === "timestamps")
69
+ return "timestamps";
70
+ return lowerKey;
61
71
  };
62
72
  // Helper to decode HTML entities (simple implementation, expand if needed)
63
73
  const decodeHtmlEntities = (str) => {
@@ -146,16 +156,16 @@ const normalizeCitationContent = (input) => {
146
156
  const startPageKeys = keys.filter((k) => k.startsWith("start_page"));
147
157
  const ordered = [];
148
158
  // Shared first
149
- if (attrs.file_id)
150
- ordered.push("file_id");
159
+ if (attrs.attachment_id)
160
+ ordered.push("attachment_id");
151
161
  if (hasTimestamps) {
152
- // AV citations: fileId, full_phrase, timestamps, (optional reasoning/value), then any extras
162
+ // AV citations: attachment_id, full_phrase, timestamps, (optional reasoning/value), then any extras
153
163
  if (attrs.full_phrase)
154
164
  ordered.push("full_phrase");
155
165
  ordered.push("timestamps");
156
166
  }
157
167
  else {
158
- // Document citations: fileId, start_page*, full_phrase, key_span, line_ids, (optional reasoning/value), then any extras
168
+ // Document citations: attachment_id, start_page*, full_phrase, key_span, line_ids, (optional reasoning/value), then any extras
159
169
  if (startPageKeys.includes("start_page_key"))
160
170
  ordered.push("start_page_key");
161
171
  startPageKeys
@@ -94,9 +94,9 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
94
94
  // 4: full_phrase content (escaped)
95
95
  // 5: key_span content (escaped)
96
96
  // 6: line_ids content
97
- // 6: Optional Key (value|reasoning)
98
- // 7: Optional Value content (escaped)
99
- const citationRegex = /<cite\s+file(?:_id|Id)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+key_span='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
97
+ // 7: Optional Key (value|reasoning)
98
+ // 8: Optional Value content (escaped)
99
+ const citationRegex = /<cite\s+(?:attachment_id|attachmentId|file_id|fileId)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+key_span='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
100
100
  const citationMatches = [...middleCite.matchAll(citationRegex)];
101
101
  const match = citationMatches?.[0];
102
102
  const pageNumber = match?.[2] ? parseInt(match?.[2]) : undefined;
@@ -133,7 +133,7 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
133
133
  // 3: timestamps content
134
134
  // 4: Optional Key (value|reasoning)
135
135
  // 5: Optional Value content (escaped)
136
- const avCitationRegex = /<cite\s+file(?:_id|Id)='(\w{0,25})'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+timestamps='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
136
+ const avCitationRegex = /<cite\s+(?:attachment_id|attachmentId|file_id|fileId)='(\w{0,25})'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+timestamps='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
137
137
  const avCitationMatches = [...middleCite.matchAll(avCitationRegex)];
138
138
  const avMatch = avCitationMatches?.[0];
139
139
  let timestamps;
@@ -1,5 +1,5 @@
1
- export declare const CITATION_MARKDOWN_SYNTAX_PROMPT = "\nCitation syntax to use within Markdown:\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite file_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <file_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />\n\n\u2022 Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.\n\u2022 start_page_key, full_phrase, and line_ids are required for each citation.\n\u2022 Infer line_ids, as we only provide the first, last, and every 5th line. When copying a previous <cite />, use the full info from the previous citation without changing the start_page_key, line_ids, or any other <cite /> attributes.\n\u2022 Use refer to line_ids inclusively, and use a range (or single) for each citation, split multiple sequential line_ids into multiple citations.\n\u2022 These citations will be replaced and displayed in-line as a numeric element (e.g. [1]), the markdown preceding <cite /> should read naturally with only one <cite /> per sentence with rare exceptions for two <cite /> in a sentence. <cite /> often present best at the end of the sentence, and are not grouped at the end of the document.\n\u2022 The full_phrase should be the exact verbatim text of the phrase or paragraph from the source document to support the insight or idea.\n\u2022 We do NOT put the full_phrase inside <cite ...></cite>; we only use full_phrase inside the full_phrase attribute.\n";
2
- export declare const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = "\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite file_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />\n\u2022 These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.\n";
1
+ export declare const CITATION_MARKDOWN_SYNTAX_PROMPT = "\nCitation syntax to use within Markdown:\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite attachment_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <attachment_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />\n\n\u2022 Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.\n\u2022 start_page_key, full_phrase, and line_ids are required for each citation.\n\u2022 Infer line_ids, as we only provide the first, last, and every 5th line. When copying a previous <cite />, use the full info from the previous citation without changing the start_page_key, line_ids, or any other <cite /> attributes.\n\u2022 Use refer to line_ids inclusively, and use a range (or single) for each citation, split multiple sequential line_ids into multiple citations.\n\u2022 These citations will be replaced and displayed in-line as a numeric element (e.g. [1]), the markdown preceding <cite /> should read naturally with only one <cite /> per sentence with rare exceptions for two <cite /> in a sentence. <cite /> often present best at the end of the sentence, and are not grouped at the end of the document.\n\u2022 The full_phrase should be the exact verbatim text of the phrase or paragraph from the source document to support the insight or idea.\n\u2022 We do NOT put the full_phrase inside <cite ...></cite>; we only use full_phrase inside the full_phrase attribute.\n";
2
+ export declare const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = "\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite attachment_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />\n\u2022 These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.\n";
3
3
  export interface WrapSystemPromptOptions {
4
4
  /** The original system prompt to wrap with citation instructions */
5
5
  systemPrompt: string;
@@ -134,4 +134,5 @@ export declare const CITATION_AV_BASED_JSON_OUTPUT_FORMAT: {
134
134
  description: string;
135
135
  };
136
136
  };
137
+ required: string[];
137
138
  };
@@ -1,7 +1,7 @@
1
1
  export const CITATION_MARKDOWN_SYNTAX_PROMPT = `
2
2
  Citation syntax to use within Markdown:
3
3
  • To support any ideas or information that requires a citation from the provided content, use the following citation syntax:
4
- <cite file_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <file_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />
4
+ <cite attachment_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <attachment_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />
5
5
 
6
6
  • Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.
7
7
  • start_page_key, full_phrase, and line_ids are required for each citation.
@@ -13,7 +13,7 @@ Citation syntax to use within Markdown:
13
13
  `;
14
14
  export const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = `
15
15
  • To support any ideas or information that requires a citation from the provided content, use the following citation syntax:
16
- <cite file_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />
16
+ <cite attachment_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />
17
17
  • These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.
18
18
  `;
19
19
  /**
@@ -121,7 +121,7 @@ export const CITATION_JSON_OUTPUT_FORMAT = {
121
121
  },
122
122
  fullPhrase: {
123
123
  type: "string",
124
- description: "The verbatim text of the terse phrase inside <file_text /> to support the citation (if there is a detected OCR correction, use the corrected text)",
124
+ description: "The verbatim text of the terse phrase inside <attachment_text /> to support the citation (if there is a detected OCR correction, use the corrected text)",
125
125
  },
126
126
  keySpan: {
127
127
  type: "string",
@@ -164,4 +164,5 @@ export const CITATION_AV_BASED_JSON_OUTPUT_FORMAT = {
164
164
  description: "The timestamp of the audio or video frame including milliseconds formatted as: HH:MM:SS.SSS",
165
165
  },
166
166
  },
167
+ required: ["attachmentId", "startPageKey", "fullPhrase", "timestamps"],
167
168
  };
@@ -14,17 +14,20 @@ function buildSafePrefixMap(ids, prompt) {
14
14
  const digitCount = (prefix.match(/\d/g) || []).length;
15
15
  const letterCount = (prefix.match(/[a-zA-Z]/g) || []).length;
16
16
  if (prefix.length < MIN_PREFIX_LENGTH ||
17
- (digitCount > 0 && letterCount < MIN_CHARACTERS_PER_PREFIX_WITH_AT_LEAST_ONE_DIGIT) ||
18
- (digitCount === 0 && letterCount < MIN_CHARACTERS_PER_PREFIX_WITH_NO_DIGITS)) {
17
+ (digitCount > 0 &&
18
+ letterCount < MIN_CHARACTERS_PER_PREFIX_WITH_AT_LEAST_ONE_DIGIT) ||
19
+ (digitCount === 0 &&
20
+ letterCount < MIN_CHARACTERS_PER_PREFIX_WITH_NO_DIGITS)) {
19
21
  continue;
20
22
  }
21
23
  // 1) Unique among IDs
22
- if (ids.some(other => other !== id && other.startsWith(prefix))) {
24
+ if (ids.some((other) => other !== id && other.startsWith(prefix))) {
23
25
  continue;
24
26
  }
25
27
  // 2) Only appears in prompt as part of the full ID
26
28
  const esc = (s) => s.replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&");
27
- const prefixCount = (prompt.match(new RegExp(esc(prefix), "g")) || []).length;
29
+ const prefixCount = (prompt.match(new RegExp(esc(prefix), "g")) || [])
30
+ .length;
28
31
  const fullCount = (prompt.match(new RegExp(esc(id), "g")) || []).length;
29
32
  if (prefixCount !== fullCount) {
30
33
  continue;
@@ -91,18 +94,29 @@ export function decompressPromptIds(compressed, prefixMap) {
91
94
  const escPrefix = prefix.replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&");
92
95
  text = text.replace(new RegExp(escPrefix, "g"), full);
93
96
  }
94
- //this is for citation attachmentId, file_id, or fileId
95
- if (entries.length === 1 && (text.includes("file_id='") || text.includes('file_id="'))) {
96
- const fullId = entries[0][1];
97
- text = text.replace(/file_id='[^']*'|file_id="[^"]*"/g, `file_id='${fullId}'`);
98
- }
99
- else if (entries.length === 1 && (text.includes("fileId='") || text.includes('fileId="'))) {
100
- const fullId = entries[0][1];
101
- text = text.replace(/fileId='[^']*'|fileId="[^"]*"/g, `fileId='${fullId}'`);
102
- }
103
- else if (entries.length === 1 && (text.includes("attachmentId='") || text.includes('attachmentId="'))) {
104
- const fullId = entries[0][1];
105
- text = text.replace(/attachmentId='[^']*'|attachmentId="[^"]*"/g, `attachmentId='${fullId}'`);
97
+ // Handle cases where the LLM may output ID in a different attribute format
98
+ // We look for common ID attribute patterns and replace compressed prefixes within them
99
+ // Note: fileId variants are supported for backwards compatibility with legacy citations
100
+ const idAttributeKeys = [
101
+ "attachmentId",
102
+ "attachment_id",
103
+ "attachment_ID",
104
+ "attachmentID",
105
+ "fileId",
106
+ "file_id",
107
+ "file_ID",
108
+ "fileID",
109
+ "fileid",
110
+ ];
111
+ // For each prefix, look for it within ID attribute values and replace with full ID
112
+ for (const [prefix, full] of entries) {
113
+ const escPrefix = prefix.replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&");
114
+ const keyPattern = idAttributeKeys.join("|");
115
+ const quotePattern = "([\"'`])";
116
+ // Match: attributeName = 'prefix' or attributeName="prefix" etc.
117
+ // Only replace the prefix part, preserving the attribute name and quotes
118
+ const re = new RegExp(`(${keyPattern})(\\s*=\\s*)${quotePattern}${escPrefix}\\3`, "g");
119
+ text = text.replace(re, `$1$2$3${full}$3`);
106
120
  }
107
121
  const newLength = text?.length;
108
122
  const diff = originalLength - newLength;
@@ -8,15 +8,18 @@ export declare function deterministicIdFromVerification(verification: Verificati
8
8
  export interface Verification {
9
9
  attachmentId?: string | null;
10
10
  label?: string | null;
11
- pageNumber?: number | null;
12
- timestamp?: number | null;
13
11
  citation?: Citation;
14
12
  searchState?: SearchState | null;
15
- hitIndexWithinPage?: number | null;
16
13
  highlightColor?: string | null;
14
+ pageNumber?: number | null;
15
+ lineIds?: number[] | null;
16
+ timestamps?: {
17
+ startTime?: string;
18
+ endTime?: string;
19
+ } | null;
20
+ hitIndexWithinPage?: number | null;
17
21
  matchSnippet?: string | null;
18
22
  pdfSpaceItem?: PdfSpaceItem;
19
23
  verificationImageBase64?: string | null;
20
- source?: string | null;
21
24
  verifiedAt?: Date;
22
25
  }
@@ -5,16 +5,12 @@ export const BLANK_VERIFICATION = {
5
5
  attachmentId: null,
6
6
  pageNumber: NOT_FOUND_VERIFICATION_INDEX,
7
7
  matchSnippet: null,
8
- source: null,
9
8
  citation: {
10
- attachmentId: undefined,
11
- startPageKey: null,
12
- fullPhrase: null,
13
- keySpan: null,
14
- lineIds: null,
15
- reasoning: null,
16
9
  pageNumber: NOT_FOUND_VERIFICATION_INDEX,
17
10
  },
11
+ searchState: {
12
+ status: "not_found",
13
+ },
18
14
  };
19
15
  export function deterministicIdFromVerification(verification) {
20
16
  return sha1Hash(`${verification.label}-${verification.attachmentId}-${verification.pageNumber}-${verification.hitIndexWithinPage}-${verification.matchSnippet}-${verification?.hitIndexWithinPage}`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@deepcitation/deepcitation-js",
3
- "version": "1.1.18",
3
+ "version": "1.1.20",
4
4
  "description": "DeepCitation JavaScript SDK for deterministic AI citation verification",
5
5
  "type": "module",
6
6
  "private": false,