@deepcitation/deepcitation-js 1.1.18 → 1.1.19
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/lib/parsing/normalizeCitation.d.ts +1 -1
- package/lib/parsing/normalizeCitation.js +28 -18
- package/lib/parsing/parseCitation.js +4 -4
- package/lib/prompts/citationPrompts.d.ts +3 -2
- package/lib/prompts/citationPrompts.js +4 -3
- package/lib/prompts/promptCompression.js +30 -16
- package/package.json +1 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export declare const removeCitations: (pageText: string,
|
|
1
|
+
export declare const removeCitations: (pageText: string, leaveKeySpanBehind?: boolean) => string;
|
|
2
2
|
export declare const removePageNumberMetadata: (pageText: string) => string;
|
|
3
3
|
export declare const removeLineIdMetadata: (pageText: string) => string;
|
|
4
4
|
export declare const getCitationPageNumber: (startPageKey?: string | null) => number | null;
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
export const removeCitations = (pageText,
|
|
2
|
-
const citationRegex = /<cite\s+(?:fileId|attachmentId)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
|
|
3
|
-
return pageText.replace(citationRegex, (match, attachmentId, pageNumber, index, fullPhrase, lineIds, value) => {
|
|
1
|
+
export const removeCitations = (pageText, leaveKeySpanBehind) => {
|
|
2
|
+
const citationRegex = /<cite\s+(?:fileId|attachmentId)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+key_span='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
|
|
3
|
+
return pageText.replace(citationRegex, (match, attachmentId, pageNumber, index, fullPhrase, keySpan, lineIds, value) => {
|
|
4
4
|
//it is still value= so we need to remove the value=
|
|
5
|
-
if (
|
|
6
|
-
return
|
|
5
|
+
if (leaveKeySpanBehind) {
|
|
6
|
+
return keySpan?.replace(/key_span=['"]|['"]/g, "") || "";
|
|
7
7
|
}
|
|
8
8
|
else {
|
|
9
9
|
return "";
|
|
@@ -45,19 +45,29 @@ const normalizeCitationContent = (input) => {
|
|
|
45
45
|
// Replace ></cite> with /> for consistency
|
|
46
46
|
normalized = normalized.replace(/><\/cite>/g, "/>");
|
|
47
47
|
const canonicalizeCiteAttributeKey = (key) => {
|
|
48
|
-
|
|
48
|
+
const lowerKey = key.toLowerCase();
|
|
49
|
+
if (lowerKey === "fullphrase" || lowerKey === "full_phrase")
|
|
49
50
|
return "full_phrase";
|
|
50
|
-
if (
|
|
51
|
+
if (lowerKey === "lineids" || lowerKey === "line_ids")
|
|
51
52
|
return "line_ids";
|
|
52
|
-
if (
|
|
53
|
-
|
|
54
|
-
|
|
53
|
+
if (lowerKey === "startpagekey" ||
|
|
54
|
+
lowerKey === "start_pagekey" ||
|
|
55
|
+
lowerKey === "start_page_key")
|
|
55
56
|
return "start_page_key";
|
|
56
|
-
if (
|
|
57
|
-
|
|
58
|
-
|
|
57
|
+
if (lowerKey === "fileid" ||
|
|
58
|
+
lowerKey === "file_id" ||
|
|
59
|
+
lowerKey === "attachmentid" ||
|
|
60
|
+
lowerKey === "attachment_id")
|
|
61
|
+
return "attachment_id";
|
|
62
|
+
if (lowerKey === "keyspan" || lowerKey === "key_span")
|
|
59
63
|
return "key_span";
|
|
60
|
-
|
|
64
|
+
if (lowerKey === "reasoning" || lowerKey === "value")
|
|
65
|
+
return lowerKey;
|
|
66
|
+
if (lowerKey === "timestamps" ||
|
|
67
|
+
lowerKey === "timestamp" ||
|
|
68
|
+
lowerKey === "timestamps")
|
|
69
|
+
return "timestamps";
|
|
70
|
+
return lowerKey;
|
|
61
71
|
};
|
|
62
72
|
// Helper to decode HTML entities (simple implementation, expand if needed)
|
|
63
73
|
const decodeHtmlEntities = (str) => {
|
|
@@ -146,16 +156,16 @@ const normalizeCitationContent = (input) => {
|
|
|
146
156
|
const startPageKeys = keys.filter((k) => k.startsWith("start_page"));
|
|
147
157
|
const ordered = [];
|
|
148
158
|
// Shared first
|
|
149
|
-
if (attrs.
|
|
150
|
-
ordered.push("
|
|
159
|
+
if (attrs.attachment_id)
|
|
160
|
+
ordered.push("attachment_id");
|
|
151
161
|
if (hasTimestamps) {
|
|
152
|
-
// AV citations:
|
|
162
|
+
// AV citations: attachment_id, full_phrase, timestamps, (optional reasoning/value), then any extras
|
|
153
163
|
if (attrs.full_phrase)
|
|
154
164
|
ordered.push("full_phrase");
|
|
155
165
|
ordered.push("timestamps");
|
|
156
166
|
}
|
|
157
167
|
else {
|
|
158
|
-
// Document citations:
|
|
168
|
+
// Document citations: attachment_id, start_page*, full_phrase, key_span, line_ids, (optional reasoning/value), then any extras
|
|
159
169
|
if (startPageKeys.includes("start_page_key"))
|
|
160
170
|
ordered.push("start_page_key");
|
|
161
171
|
startPageKeys
|
|
@@ -94,9 +94,9 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
|
|
|
94
94
|
// 4: full_phrase content (escaped)
|
|
95
95
|
// 5: key_span content (escaped)
|
|
96
96
|
// 6: line_ids content
|
|
97
|
-
//
|
|
98
|
-
//
|
|
99
|
-
const citationRegex = /<cite\s+
|
|
97
|
+
// 7: Optional Key (value|reasoning)
|
|
98
|
+
// 8: Optional Value content (escaped)
|
|
99
|
+
const citationRegex = /<cite\s+(?:attachment_id|attachmentId|file_id|fileId)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+key_span='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
|
|
100
100
|
const citationMatches = [...middleCite.matchAll(citationRegex)];
|
|
101
101
|
const match = citationMatches?.[0];
|
|
102
102
|
const pageNumber = match?.[2] ? parseInt(match?.[2]) : undefined;
|
|
@@ -133,7 +133,7 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
|
|
|
133
133
|
// 3: timestamps content
|
|
134
134
|
// 4: Optional Key (value|reasoning)
|
|
135
135
|
// 5: Optional Value content (escaped)
|
|
136
|
-
const avCitationRegex = /<cite\s+
|
|
136
|
+
const avCitationRegex = /<cite\s+(?:attachment_id|attachmentId|file_id|fileId)='(\w{0,25})'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+timestamps='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
|
|
137
137
|
const avCitationMatches = [...middleCite.matchAll(avCitationRegex)];
|
|
138
138
|
const avMatch = avCitationMatches?.[0];
|
|
139
139
|
let timestamps;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
export declare const CITATION_MARKDOWN_SYNTAX_PROMPT = "\nCitation syntax to use within Markdown:\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite
|
|
2
|
-
export declare const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = "\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite
|
|
1
|
+
export declare const CITATION_MARKDOWN_SYNTAX_PROMPT = "\nCitation syntax to use within Markdown:\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite attachment_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <attachment_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />\n\n\u2022 Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.\n\u2022 start_page_key, full_phrase, and line_ids are required for each citation.\n\u2022 Infer line_ids, as we only provide the first, last, and every 5th line. When copying a previous <cite />, use the full info from the previous citation without changing the start_page_key, line_ids, or any other <cite /> attributes.\n\u2022 Use refer to line_ids inclusively, and use a range (or single) for each citation, split multiple sequential line_ids into multiple citations.\n\u2022 These citations will be replaced and displayed in-line as a numeric element (e.g. [1]), the markdown preceding <cite /> should read naturally with only one <cite /> per sentence with rare exceptions for two <cite /> in a sentence. <cite /> often present best at the end of the sentence, and are not grouped at the end of the document.\n\u2022 The full_phrase should be the exact verbatim text of the phrase or paragraph from the source document to support the insight or idea.\n\u2022 We do NOT put the full_phrase inside <cite ...></cite>; we only use full_phrase inside the full_phrase attribute.\n";
|
|
2
|
+
export declare const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = "\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite attachment_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />\n\u2022 These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.\n";
|
|
3
3
|
export interface WrapSystemPromptOptions {
|
|
4
4
|
/** The original system prompt to wrap with citation instructions */
|
|
5
5
|
systemPrompt: string;
|
|
@@ -134,4 +134,5 @@ export declare const CITATION_AV_BASED_JSON_OUTPUT_FORMAT: {
|
|
|
134
134
|
description: string;
|
|
135
135
|
};
|
|
136
136
|
};
|
|
137
|
+
required: string[];
|
|
137
138
|
};
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
export const CITATION_MARKDOWN_SYNTAX_PROMPT = `
|
|
2
2
|
Citation syntax to use within Markdown:
|
|
3
3
|
• To support any ideas or information that requires a citation from the provided content, use the following citation syntax:
|
|
4
|
-
<cite
|
|
4
|
+
<cite attachment_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <attachment_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />
|
|
5
5
|
|
|
6
6
|
• Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.
|
|
7
7
|
• start_page_key, full_phrase, and line_ids are required for each citation.
|
|
@@ -13,7 +13,7 @@ Citation syntax to use within Markdown:
|
|
|
13
13
|
`;
|
|
14
14
|
export const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = `
|
|
15
15
|
• To support any ideas or information that requires a citation from the provided content, use the following citation syntax:
|
|
16
|
-
<cite
|
|
16
|
+
<cite attachment_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />
|
|
17
17
|
• These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.
|
|
18
18
|
`;
|
|
19
19
|
/**
|
|
@@ -121,7 +121,7 @@ export const CITATION_JSON_OUTPUT_FORMAT = {
|
|
|
121
121
|
},
|
|
122
122
|
fullPhrase: {
|
|
123
123
|
type: "string",
|
|
124
|
-
description: "The verbatim text of the terse phrase inside <
|
|
124
|
+
description: "The verbatim text of the terse phrase inside <attachment_text /> to support the citation (if there is a detected OCR correction, use the corrected text)",
|
|
125
125
|
},
|
|
126
126
|
keySpan: {
|
|
127
127
|
type: "string",
|
|
@@ -164,4 +164,5 @@ export const CITATION_AV_BASED_JSON_OUTPUT_FORMAT = {
|
|
|
164
164
|
description: "The timestamp of the audio or video frame including milliseconds formatted as: HH:MM:SS.SSS",
|
|
165
165
|
},
|
|
166
166
|
},
|
|
167
|
+
required: ["attachmentId", "startPageKey", "fullPhrase", "timestamps"],
|
|
167
168
|
};
|
|
@@ -14,17 +14,20 @@ function buildSafePrefixMap(ids, prompt) {
|
|
|
14
14
|
const digitCount = (prefix.match(/\d/g) || []).length;
|
|
15
15
|
const letterCount = (prefix.match(/[a-zA-Z]/g) || []).length;
|
|
16
16
|
if (prefix.length < MIN_PREFIX_LENGTH ||
|
|
17
|
-
(digitCount > 0 &&
|
|
18
|
-
|
|
17
|
+
(digitCount > 0 &&
|
|
18
|
+
letterCount < MIN_CHARACTERS_PER_PREFIX_WITH_AT_LEAST_ONE_DIGIT) ||
|
|
19
|
+
(digitCount === 0 &&
|
|
20
|
+
letterCount < MIN_CHARACTERS_PER_PREFIX_WITH_NO_DIGITS)) {
|
|
19
21
|
continue;
|
|
20
22
|
}
|
|
21
23
|
// 1) Unique among IDs
|
|
22
|
-
if (ids.some(other => other !== id && other.startsWith(prefix))) {
|
|
24
|
+
if (ids.some((other) => other !== id && other.startsWith(prefix))) {
|
|
23
25
|
continue;
|
|
24
26
|
}
|
|
25
27
|
// 2) Only appears in prompt as part of the full ID
|
|
26
28
|
const esc = (s) => s.replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&");
|
|
27
|
-
const prefixCount = (prompt.match(new RegExp(esc(prefix), "g")) || [])
|
|
29
|
+
const prefixCount = (prompt.match(new RegExp(esc(prefix), "g")) || [])
|
|
30
|
+
.length;
|
|
28
31
|
const fullCount = (prompt.match(new RegExp(esc(id), "g")) || []).length;
|
|
29
32
|
if (prefixCount !== fullCount) {
|
|
30
33
|
continue;
|
|
@@ -91,18 +94,29 @@ export function decompressPromptIds(compressed, prefixMap) {
|
|
|
91
94
|
const escPrefix = prefix.replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&");
|
|
92
95
|
text = text.replace(new RegExp(escPrefix, "g"), full);
|
|
93
96
|
}
|
|
94
|
-
//
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
97
|
+
// Handle cases where the LLM may output ID in a different attribute format
|
|
98
|
+
// We look for common ID attribute patterns and replace compressed prefixes within them
|
|
99
|
+
// Note: fileId variants are supported for backwards compatibility with legacy citations
|
|
100
|
+
const idAttributeKeys = [
|
|
101
|
+
"attachmentId",
|
|
102
|
+
"attachment_id",
|
|
103
|
+
"attachment_ID",
|
|
104
|
+
"attachmentID",
|
|
105
|
+
"fileId",
|
|
106
|
+
"file_id",
|
|
107
|
+
"file_ID",
|
|
108
|
+
"fileID",
|
|
109
|
+
"fileid",
|
|
110
|
+
];
|
|
111
|
+
// For each prefix, look for it within ID attribute values and replace with full ID
|
|
112
|
+
for (const [prefix, full] of entries) {
|
|
113
|
+
const escPrefix = prefix.replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&");
|
|
114
|
+
const keyPattern = idAttributeKeys.join("|");
|
|
115
|
+
const quotePattern = "([\"'`])";
|
|
116
|
+
// Match: attributeName = 'prefix' or attributeName="prefix" etc.
|
|
117
|
+
// Only replace the prefix part, preserving the attribute name and quotes
|
|
118
|
+
const re = new RegExp(`(${keyPattern})(\\s*=\\s*)${quotePattern}${escPrefix}\\3`, "g");
|
|
119
|
+
text = text.replace(re, `$1$2$3${full}$3`);
|
|
106
120
|
}
|
|
107
121
|
const newLength = text?.length;
|
|
108
122
|
const diff = originalLength - newLength;
|