@deepcitation/deepcitation-js 1.1.27 → 1.1.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +253 -253
- package/lib/chunk-2IZXUOQR.js +66 -0
- package/lib/chunk-4FGOHQFP.cjs +66 -0
- package/lib/chunk-CFXDRAJL.cjs +1 -0
- package/lib/chunk-DEUSSEFH.js +2 -0
- package/lib/chunk-F2MMVEVC.cjs +1 -0
- package/lib/chunk-J7U6YFOI.cjs +2 -0
- package/lib/chunk-O2XFH626.js +1 -0
- package/lib/chunk-RQPZSRID.js +1 -0
- package/lib/client/index.cjs +1 -0
- package/lib/client/{DeepCitation.d.ts → index.d.cts} +159 -3
- package/lib/client/index.d.ts +342 -2
- package/lib/client/index.js +1 -1
- package/lib/index.cjs +1 -0
- package/lib/index.d.cts +127 -0
- package/lib/index.d.ts +126 -22
- package/lib/index.js +1 -20
- package/lib/prompts/index.cjs +1 -0
- package/lib/prompts/index.d.cts +196 -0
- package/lib/prompts/index.d.ts +196 -3
- package/lib/prompts/index.js +1 -3
- package/lib/react/index.cjs +4 -0
- package/lib/react/index.js +4 -20
- package/lib/types/index.cjs +1 -0
- package/lib/types/index.d.cts +96 -0
- package/lib/types/index.d.ts +96 -11
- package/lib/types/index.js +1 -7
- package/package.json +46 -11
- package/lib/client/DeepCitation.js +0 -374
- package/lib/client/types.d.ts +0 -154
- package/lib/client/types.js +0 -1
- package/lib/parsing/normalizeCitation.d.ts +0 -5
- package/lib/parsing/normalizeCitation.js +0 -198
- package/lib/parsing/parseCitation.d.ts +0 -79
- package/lib/parsing/parseCitation.js +0 -431
- package/lib/parsing/parseWorkAround.d.ts +0 -2
- package/lib/parsing/parseWorkAround.js +0 -73
- package/lib/prompts/citationPrompts.d.ts +0 -138
- package/lib/prompts/citationPrompts.js +0 -168
- package/lib/prompts/promptCompression.d.ts +0 -14
- package/lib/prompts/promptCompression.js +0 -127
- package/lib/prompts/types.d.ts +0 -4
- package/lib/prompts/types.js +0 -1
- package/lib/react/CitationComponent.d.ts +0 -106
- package/lib/react/CitationComponent.js +0 -419
- package/lib/react/CitationVariants.d.ts +0 -132
- package/lib/react/CitationVariants.js +0 -277
- package/lib/react/DiffDisplay.d.ts +0 -10
- package/lib/react/DiffDisplay.js +0 -33
- package/lib/react/Popover.d.ts +0 -15
- package/lib/react/Popover.js +0 -20
- package/lib/react/UrlCitationComponent.d.ts +0 -83
- package/lib/react/UrlCitationComponent.js +0 -224
- package/lib/react/VerificationTabs.d.ts +0 -10
- package/lib/react/VerificationTabs.js +0 -36
- package/lib/react/icons.d.ts +0 -22
- package/lib/react/icons.js +0 -16
- package/lib/react/index.d.ts +0 -17
- package/lib/react/primitives.d.ts +0 -99
- package/lib/react/primitives.js +0 -187
- package/lib/react/types.d.ts +0 -315
- package/lib/react/types.js +0 -1
- package/lib/react/useSmartDiff.d.ts +0 -16
- package/lib/react/useSmartDiff.js +0 -64
- package/lib/react/utils.d.ts +0 -44
- package/lib/react/utils.js +0 -88
- package/lib/types/boxes.d.ts +0 -11
- package/lib/types/boxes.js +0 -1
- package/lib/types/citation.d.ts +0 -39
- package/lib/types/citation.js +0 -1
- package/lib/types/search.d.ts +0 -19
- package/lib/types/search.js +0 -1
- package/lib/types/verification.d.ts +0 -27
- package/lib/types/verification.js +0 -11
- package/lib/utils/diff.d.ts +0 -60
- package/lib/utils/diff.js +0 -414
- package/lib/utils/sha.d.ts +0 -10
- package/lib/utils/sha.js +0 -108
|
@@ -1,431 +0,0 @@
|
|
|
1
|
-
import { normalizeCitations } from "./normalizeCitation.js";
|
|
2
|
-
import { generateCitationKey } from "../react/utils.js";
|
|
3
|
-
/**
 * Parses a line_ids string that may contain individual numbers, ranges, or both.
 * Examples: "1,2,3", "5-10", "1,5-7,10", "20-20"
 *
 * Parts that cannot be parsed are skipped. A range whose end is missing,
 * smaller than its start, or so far from its start that expanding it would
 * create more than MAX_RANGE_SPAN ids contributes only its start value.
 *
 * @param lineIdsString - The raw line_ids string (e.g., "1,5-7,10")
 * @returns Sorted array of unique line IDs, or undefined if empty/invalid
 */
function parseLineIds(lineIdsString) {
    if (!lineIdsString)
        return undefined;
    // The input comes from model output; without a cap a part like
    // "1-999999999" would allocate hundreds of millions of entries.
    const MAX_RANGE_SPAN = 10000;
    const lineIds = [];
    for (const part of lineIdsString.split(",")) {
        const trimmed = part.trim();
        if (!trimmed)
            continue;
        if (!trimmed.includes("-")) {
            // Single number
            const num = parseInt(trimmed, 10);
            if (!isNaN(num)) {
                lineIds.push(num);
            }
            continue;
        }
        // Range such as "5-10"; only the first two dash-separated pieces are used.
        const [startStr, endStr] = trimmed.split("-");
        const start = parseInt(startStr, 10);
        const end = parseInt(endStr, 10);
        if (isNaN(start))
            continue;
        const isExpandable = !isNaN(end) && start <= end && end - start < MAX_RANGE_SPAN;
        if (!isExpandable) {
            // Malformed, reversed or implausibly large range: keep just the start.
            lineIds.push(start);
            continue;
        }
        for (let id = start; id <= end; id++) {
            lineIds.push(id);
        }
    }
    if (lineIds.length === 0)
        return undefined;
    // Deduplicate, then sort numerically.
    return [...new Set(lineIds)].sort((a, b) => a - b);
}
|
|
48
|
-
/**
 * Translates a verification result's `status` string into boolean flags.
 *
 * - `isPartialMatch`: status is one of the partial/relocated match statuses.
 * - `isVerified`: status is "found", "found_key_span_only",
 *   "found_phrase_missed_value", or any partial match status.
 * - `isMiss`: status is "not_found".
 * - `isPending`: status is "pending", "loading", or missing/falsy.
 *
 * @param verification - Object carrying a `status` field; may be null/undefined
 * @returns `{ isVerified, isMiss, isPartialMatch, isPending }`
 */
export function getCitationStatus(verification) {
    const status = verification?.status;
    const partialStatuses = [
        "partial_text_found",
        "found_on_other_page",
        "found_on_other_line",
        "first_word_found",
    ];
    const fullyVerifiedStatuses = [
        "found",
        "found_key_span_only",
        "found_phrase_missed_value",
    ];
    const isPartialMatch = partialStatuses.includes(status);
    const isVerified = isPartialMatch || fullyVerifiedStatuses.includes(status);
    const isMiss = status === "not_found";
    const isPending = !status || status === "pending" || status === "loading";
    return { isVerified, isMiss, isPartialMatch, isPending };
}
|
|
70
|
-
export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVerbose) => {
|
|
71
|
-
// Helper: Remove wrapper quotes and unescape internal single quotes (e.g. It\'s -> It's)
|
|
72
|
-
const cleanAndUnescape = (str) => {
|
|
73
|
-
if (!str)
|
|
74
|
-
return undefined;
|
|
75
|
-
// Remove surrounding quotes if present (regex usually handles this, but safety first)
|
|
76
|
-
const trimmed = str.replace(/^['"]|['"]$/g, "");
|
|
77
|
-
// Replace escaped single quotes with actual single quotes
|
|
78
|
-
return trimmed.replace(/\\'/g, "'");
|
|
79
|
-
};
|
|
80
|
-
const citationNumber = citationCounterRef?.current
|
|
81
|
-
? citationCounterRef.current++
|
|
82
|
-
: undefined;
|
|
83
|
-
const beforeCite = fragment.substring(0, fragment.indexOf("<cite"));
|
|
84
|
-
const afterCite = fragment.includes("/>")
|
|
85
|
-
? fragment.slice(fragment.indexOf("/>") + 2)
|
|
86
|
-
: "";
|
|
87
|
-
const middleCite = fragment.substring(fragment.indexOf("<cite"), fragment.indexOf("/>") + 2);
|
|
88
|
-
// GROUPS:
|
|
89
|
-
// 1: attachmentId
|
|
90
|
-
// 2: start_page number
|
|
91
|
-
// 3: index number
|
|
92
|
-
// 4: full_phrase content (escaped)
|
|
93
|
-
// 5: key_span content (escaped)
|
|
94
|
-
// 6: line_ids content
|
|
95
|
-
// 7: Optional Key (value|reasoning)
|
|
96
|
-
// 8: Optional Value content (escaped)
|
|
97
|
-
const citationRegex = /<cite\s+(?:attachment_id|attachmentId|file_id|fileId)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+key_span='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
|
|
98
|
-
const citationMatches = [...middleCite.matchAll(citationRegex)];
|
|
99
|
-
const match = citationMatches?.[0];
|
|
100
|
-
const pageNumber = match?.[2] ? parseInt(match?.[2]) : undefined;
|
|
101
|
-
const pageIndex = match?.[3] ? parseInt(match?.[3]) : undefined;
|
|
102
|
-
let rawAttachmentId = match?.[1];
|
|
103
|
-
let attachmentId = rawAttachmentId?.length === 20 ? rawAttachmentId : mdAttachmentId || rawAttachmentId;
|
|
104
|
-
// Use helper to handle escaped quotes inside the phrase
|
|
105
|
-
let fullPhrase = cleanAndUnescape(match?.[4]);
|
|
106
|
-
let keySpan = cleanAndUnescape(match?.[5]);
|
|
107
|
-
// Handle the optional attribute (value or reasoning)
|
|
108
|
-
let value;
|
|
109
|
-
let reasoning;
|
|
110
|
-
const optionalKey = match?.[7]; // "value" or "reasoning"
|
|
111
|
-
const optionalContent = cleanAndUnescape(match?.[8]);
|
|
112
|
-
if (optionalKey === "value") {
|
|
113
|
-
value = optionalContent;
|
|
114
|
-
}
|
|
115
|
-
else if (optionalKey === "reasoning") {
|
|
116
|
-
reasoning = optionalContent;
|
|
117
|
-
}
|
|
118
|
-
let lineIds;
|
|
119
|
-
try {
|
|
120
|
-
// match[6] is line_ids
|
|
121
|
-
const lineIdsString = match?.[6]?.replace(/[A-Za-z_[\](){}:]/g, "");
|
|
122
|
-
lineIds = lineIdsString ? parseLineIds(lineIdsString) : undefined;
|
|
123
|
-
}
|
|
124
|
-
catch (e) {
|
|
125
|
-
if (isVerbose)
|
|
126
|
-
console.error("Error parsing lineIds", e);
|
|
127
|
-
}
|
|
128
|
-
// GROUPS for AV:
|
|
129
|
-
// 1: attachmentId
|
|
130
|
-
// 2: full_phrase content (escaped)
|
|
131
|
-
// 3: timestamps content
|
|
132
|
-
// 4: Optional Key (value|reasoning)
|
|
133
|
-
// 5: Optional Value content (escaped)
|
|
134
|
-
const avCitationRegex = /<cite\s+(?:attachment_id|attachmentId|file_id|fileId)='(\w{0,25})'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+timestamps='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
|
|
135
|
-
const avCitationMatches = [...middleCite.matchAll(avCitationRegex)];
|
|
136
|
-
const avMatch = avCitationMatches?.[0];
|
|
137
|
-
let timestamps;
|
|
138
|
-
if (avMatch) {
|
|
139
|
-
rawAttachmentId = avMatch?.[1];
|
|
140
|
-
attachmentId = rawAttachmentId?.length === 20 ? rawAttachmentId : mdAttachmentId || rawAttachmentId;
|
|
141
|
-
fullPhrase = cleanAndUnescape(avMatch?.[2]);
|
|
142
|
-
const timestampsString = avMatch?.[3]?.replace(/timestamps=['"]|['"]/g, "");
|
|
143
|
-
const [startTime, endTime] = timestampsString?.split("-") || [];
|
|
144
|
-
const avOptionalKey = avMatch?.[4];
|
|
145
|
-
const avOptionalContent = cleanAndUnescape(avMatch?.[5]);
|
|
146
|
-
if (avOptionalKey === "value") {
|
|
147
|
-
value = avOptionalContent;
|
|
148
|
-
}
|
|
149
|
-
else if (avOptionalKey === "reasoning") {
|
|
150
|
-
reasoning = avOptionalContent;
|
|
151
|
-
}
|
|
152
|
-
timestamps = { startTime, endTime };
|
|
153
|
-
}
|
|
154
|
-
const citation = {
|
|
155
|
-
attachmentId: attachmentId,
|
|
156
|
-
pageNumber,
|
|
157
|
-
startPageKey: `page_number_${pageNumber || 1}_index_${pageIndex || 0}`,
|
|
158
|
-
fullPhrase,
|
|
159
|
-
keySpan: keySpan || value,
|
|
160
|
-
citationNumber,
|
|
161
|
-
lineIds,
|
|
162
|
-
beforeCite,
|
|
163
|
-
timestamps,
|
|
164
|
-
reasoning,
|
|
165
|
-
};
|
|
166
|
-
return {
|
|
167
|
-
beforeCite,
|
|
168
|
-
afterCite,
|
|
169
|
-
citation,
|
|
170
|
-
};
|
|
171
|
-
};
|
|
172
|
-
/**
|
|
173
|
-
* Parses a JSON-based citation object into a Citation.
|
|
174
|
-
* Supports both camelCase and snake_case property names.
|
|
175
|
-
*
|
|
176
|
-
* @param jsonCitation - The JSON citation object (can have camelCase or snake_case properties)
|
|
177
|
-
* @param citationNumber - Optional citation number for ordering
|
|
178
|
-
* @returns Parsed Citation object
|
|
179
|
-
*/
|
|
180
|
-
const parseJsonCitation = (jsonCitation, citationNumber) => {
|
|
181
|
-
if (!jsonCitation) {
|
|
182
|
-
return null;
|
|
183
|
-
}
|
|
184
|
-
// Support both camelCase and snake_case property names
|
|
185
|
-
const fullPhrase = jsonCitation.fullPhrase ?? jsonCitation.full_phrase;
|
|
186
|
-
const startPageKey = jsonCitation.startPageKey ?? jsonCitation.start_page_key;
|
|
187
|
-
const keySpan = jsonCitation.keySpan ?? jsonCitation.key_span;
|
|
188
|
-
const rawLineIds = jsonCitation.lineIds ?? jsonCitation.line_ids;
|
|
189
|
-
const attachmentId = jsonCitation.attachmentId ?? jsonCitation.attachment_id ?? jsonCitation.fileId ?? jsonCitation.file_id;
|
|
190
|
-
const reasoning = jsonCitation.reasoning;
|
|
191
|
-
const value = jsonCitation.value;
|
|
192
|
-
if (!fullPhrase) {
|
|
193
|
-
return null;
|
|
194
|
-
}
|
|
195
|
-
// Parse startPageKey format: "page_number_PAGE_index_INDEX" or simple "PAGE_INDEX"
|
|
196
|
-
let pageNumber;
|
|
197
|
-
if (startPageKey) {
|
|
198
|
-
// Try full format first: page_number_5_index_2 or pageKey_5_index_2
|
|
199
|
-
const pageMatch = startPageKey.match(/page[_a-zA-Z]*(\d+)_index_(\d+)/i);
|
|
200
|
-
if (pageMatch) {
|
|
201
|
-
pageNumber = parseInt(pageMatch[1], 10);
|
|
202
|
-
}
|
|
203
|
-
else {
|
|
204
|
-
// Try simple n_m format: 5_4 (page 5, index 4)
|
|
205
|
-
const simpleMatch = startPageKey.match(/^(\d+)_(\d+)$/);
|
|
206
|
-
if (simpleMatch) {
|
|
207
|
-
pageNumber = parseInt(simpleMatch[1], 10);
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
}
|
|
211
|
-
// Sort lineIds if present
|
|
212
|
-
const lineIds = rawLineIds?.length
|
|
213
|
-
? [...rawLineIds].sort((a, b) => a - b)
|
|
214
|
-
: undefined;
|
|
215
|
-
const citation = {
|
|
216
|
-
attachmentId,
|
|
217
|
-
pageNumber,
|
|
218
|
-
fullPhrase,
|
|
219
|
-
citationNumber,
|
|
220
|
-
lineIds,
|
|
221
|
-
keySpan: keySpan || value,
|
|
222
|
-
reasoning,
|
|
223
|
-
};
|
|
224
|
-
return citation;
|
|
225
|
-
};
|
|
226
|
-
/**
 * Tells whether a value is a non-null object that exposes at least one
 * citation field name, in either camelCase or snake_case spelling.
 */
const hasCitationProperties = (item) => {
    if (typeof item !== "object" || item === null) {
        return false;
    }
    const citationFieldNames = [
        "fullPhrase",
        "full_phrase",
        "startPageKey",
        "start_page_key",
        "keySpan",
        "key_span",
        "lineIds",
        "line_ids",
    ];
    // `in` also sees inherited and undefined-valued properties.
    return citationFieldNames.some((fieldName) => fieldName in item);
};
|
|
239
|
-
/**
 * Decides whether the input looks like JSON-based citations: either a single
 * object with citation-like properties, or an array in which at least one
 * element has them (camelCase and snake_case names both count).
 * Empty arrays and non-object values are rejected.
 */
const isJsonCitationFormat = (data) => {
    // A lone value is checked exactly like a one-element array; primitives and
    // null are rejected by hasCitationProperties itself.
    const candidates = Array.isArray(data) ? data : [data];
    return candidates.some((candidate) => hasCitationProperties(candidate));
};
|
|
252
|
-
/**
 * Builds a citation dictionary from JSON citation data (one object or an array).
 * Items are numbered from 1 by position; items that do not parse into a
 * citation with a full phrase are left out but still consume their number.
 * Entries are keyed by generateCitationKey, so a later item with the same key
 * replaces an earlier one.
 */
const extractJsonCitations = (data) => {
    const items = Array.isArray(data) ? data : [data];
    const citations = {};
    items.forEach((item, position) => {
        const citation = parseJsonCitation(item, position + 1);
        if (!citation || !citation.fullPhrase) {
            return;
        }
        citations[generateCitationKey(citation)] = citation;
    });
    return citations;
};
|
|
268
|
-
/**
 * Walks an object tree and appends to `found` every value stored under a
 * `citation` or `citations` property that matches the JSON citation format.
 * Arrays are searched element by element; for plain objects every property
 * except `citation` / `citations` is searched recursively.
 *
 * @param obj - Value to search; non-objects are ignored
 * @param found - Array that receives the matching citation items (mutated)
 */
const findJsonCitationsInObject = (obj, found) => {
    if (!obj || typeof obj !== "object")
        return;
    // Collect matches held directly on this node.
    for (const propertyName of ["citation", "citations"]) {
        const candidate = obj[propertyName];
        if (candidate && isJsonCitationFormat(candidate)) {
            found.push(...(Array.isArray(candidate) ? candidate : [candidate]));
        }
    }
    // Descend into children; the citation properties themselves are not traversed.
    const children = Array.isArray(obj)
        ? obj
        : Object.keys(obj)
            .filter((key) => key !== "citation" && key !== "citations")
            .map((key) => obj[key]);
    for (const child of children) {
        findJsonCitationsInObject(child, found);
    }
};
|
|
300
|
-
/**
 * Extracts XML citations from text using <cite ... /> tags.
 *
 * The text is first passed through normalizeCitations, then every
 * self-closing <cite ... /> tag is parsed with parseCitation. Citations are
 * numbered from 1 in order of appearance; tags that do not yield a full
 * phrase are skipped. Entries are keyed by generateCitationKey.
 *
 * @param text - Text (e.g. markdown) that may contain <cite ... /> tags
 * @returns Dictionary of parsed citations keyed by citation key
 */
const extractXmlCitations = (text) => {
    const normalizedText = normalizeCitations(text);
    // Find all <cite ... /> tags. Quoted attribute values are consumed as a
    // unit (honouring backslash escapes), so a ">" inside a value such as
    // full_phrase='a > b' no longer ends the tag early and drops the citation.
    const citeRegex = /<cite\s+(?:[^>'"]|'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")*\/>/g;
    const matches = normalizedText.match(citeRegex);
    if (!matches)
        return {};
    const citations = {};
    const citationCounterRef = { current: 1 };
    for (const tag of matches) {
        const { citation } = parseCitation(tag, undefined, citationCounterRef);
        if (citation && citation.fullPhrase) {
            citations[generateCitationKey(citation)] = citation;
        }
    }
    return citations;
};
|
|
321
|
-
/**
 * Collects every citation found in an LLM response.
 *
 * String input is scanned for XML <cite ... /> tags.
 *
 * Object input is handled in two passes whose results are merged (XML entries
 * win when keys collide):
 * - JSON citations: taken from the root when the root itself is in JSON
 *   citation format, otherwise from nested `citation` / `citations` properties;
 * - XML citations: found by stringifying the whole object and scanning the
 *   resulting text for <cite ... /> tags.
 *
 * @param llmOutput - The LLM output (string or object)
 * @returns Dictionary of parsed Citation objects keyed by citation key; empty
 *   for falsy input or input that is neither a string nor an object
 */
export const getAllCitationsFromLlmOutput = (llmOutput) => {
    if (!llmOutput)
        return {};
    if (typeof llmOutput === "string") {
        return Object.assign({}, extractXmlCitations(llmOutput));
    }
    if (typeof llmOutput !== "object") {
        return {};
    }
    const collected = {};
    // Pick the JSON citation source: the root itself, or whatever the traversal finds.
    let jsonItems = llmOutput;
    if (!isJsonCitationFormat(llmOutput)) {
        jsonItems = [];
        findJsonCitationsInObject(llmOutput, jsonItems);
    }
    if (jsonItems === llmOutput || jsonItems.length > 0) {
        Object.assign(collected, extractJsonCitations(jsonItems));
    }
    // Embedded XML citations (e.g. inside markdown string fields).
    Object.assign(collected, extractXmlCitations(JSON.stringify(llmOutput)));
    return collected;
};
|
|
363
|
-
/**
|
|
364
|
-
* Groups citations by their attachmentId for multi-file verification scenarios.
|
|
365
|
-
* This is useful when you have citations from multiple files and need to
|
|
366
|
-
* verify them against their respective source documents.
|
|
367
|
-
*
|
|
368
|
-
* @param citations - Array of Citation objects or a dictionary of citations
|
|
369
|
-
* @returns Map of attachmentId to dictionary of citations from that file
|
|
370
|
-
*
|
|
371
|
-
* @example
|
|
372
|
-
* ```typescript
|
|
373
|
-
* const citations = getAllCitationsFromLlmOutput(response.content);
|
|
374
|
-
* const citationsByAttachment = groupCitationsByAttachmentId(citations);
|
|
375
|
-
*
|
|
376
|
-
* // Verify citations for each file
|
|
377
|
-
* for (const [attachmentId, fileCitations] of citationsByAttachment) {
|
|
378
|
-
* const verified = await dc.verifyCitations(attachmentId, fileCitations);
|
|
379
|
-
* // Process verification results...
|
|
380
|
-
* }
|
|
381
|
-
* ```
|
|
382
|
-
*/
|
|
383
|
-
export function groupCitationsByAttachmentId(citations) {
|
|
384
|
-
const grouped = new Map();
|
|
385
|
-
// Normalize input to entries
|
|
386
|
-
const entries = Array.isArray(citations)
|
|
387
|
-
? citations.map((c, idx) => [generateCitationKey(c) || String(idx + 1), c])
|
|
388
|
-
: Object.entries(citations);
|
|
389
|
-
for (const [key, citation] of entries) {
|
|
390
|
-
const attachmentId = citation.attachmentId || "";
|
|
391
|
-
if (!grouped.has(attachmentId)) {
|
|
392
|
-
grouped.set(attachmentId, {});
|
|
393
|
-
}
|
|
394
|
-
grouped.get(attachmentId)[key] = citation;
|
|
395
|
-
}
|
|
396
|
-
return grouped;
|
|
397
|
-
}
|
|
398
|
-
/**
|
|
399
|
-
* Groups citations by their attachmentId and returns as a plain object.
|
|
400
|
-
* Alternative to groupCitationsByAttachmentId that returns a plain object instead of a Map.
|
|
401
|
-
*
|
|
402
|
-
* @param citations - Array of Citation objects or a dictionary of citations
|
|
403
|
-
* @returns Object with attachmentId keys mapping to citation dictionaries
|
|
404
|
-
*
|
|
405
|
-
* @example
|
|
406
|
-
* ```typescript
|
|
407
|
-
* const citations = getAllCitationsFromLlmOutput(response.content);
|
|
408
|
-
* const citationsByAttachment = groupCitationsByAttachmentIdObject(citations);
|
|
409
|
-
*
|
|
410
|
-
* // Verify citations for each file using Promise.all
|
|
411
|
-
* const verificationPromises = Object.entries(citationsByAttachment).map(
|
|
412
|
-
* ([attachmentId, fileCitations]) => dc.verifyCitations(attachmentId, fileCitations)
|
|
413
|
-
* );
|
|
414
|
-
* const results = await Promise.all(verificationPromises);
|
|
415
|
-
* ```
|
|
416
|
-
*/
|
|
417
|
-
export function groupCitationsByAttachmentIdObject(citations) {
|
|
418
|
-
const grouped = {};
|
|
419
|
-
// Normalize input to entries
|
|
420
|
-
const entries = Array.isArray(citations)
|
|
421
|
-
? citations.map((c, idx) => [generateCitationKey(c) || String(idx + 1), c])
|
|
422
|
-
: Object.entries(citations);
|
|
423
|
-
for (const [key, citation] of entries) {
|
|
424
|
-
const attachmentId = citation.attachmentId || "";
|
|
425
|
-
if (!grouped[attachmentId]) {
|
|
426
|
-
grouped[attachmentId] = {};
|
|
427
|
-
}
|
|
428
|
-
grouped[attachmentId][key] = citation;
|
|
429
|
-
}
|
|
430
|
-
return grouped;
|
|
431
|
-
}
|
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
// Gemini flash and flash lite get confused when asked for a Markdown table and
// can loop forever; content shorter than this is never treated as garbage.
const MIN_CONTENT_LENGTH_FOR_GEMINI_GARBAGE = 64;
/**
 * Detects degenerate output that consists of one character repeated over and
 * over: after trimming, the content must be at least
 * MIN_CONTENT_LENGTH_FOR_GEMINI_GARBAGE characters long and every UTF-16 code
 * unit must equal the first one.
 *
 * @param content - Text to inspect; falsy values are never garbage
 * @returns true when the trimmed content is a single repeated character
 */
export const isGeminiGarbage = (content) => {
    if (!content) {
        return false;
    }
    const body = content.trim();
    if (body.length < MIN_CONTENT_LENGTH_FOR_GEMINI_GARBAGE) {
        return false;
    }
    const firstCharacter = body[0];
    // split("") yields UTF-16 code units, matching an index-based comparison.
    return body.split("").every((character) => character === firstCharacter);
};
|
|
16
|
-
// helps clean up infinite rambling bug output from gemini
/**
 * Collapses a final sentence that is repeated back-to-back at the end of the
 * text into a single occurrence.
 *
 * The "repeating unit" is the text between the last two sentence terminators
 * (`.`, `?` or `!` followed by whitespace or end of input), including its
 * leading whitespace. Copies of that unit are counted backwards from the last
 * terminator; when at least MIN_REPETITIONS consecutive copies are found they
 * are replaced by one copy. Any unterminated text after the last terminator is
 * dropped in that case. The input is returned (trimmed) unchanged when there
 * are fewer than two terminators, when the sentence is shorter than
 * MIN_SENTENCE_CONTENT_LENGTH characters, or when too few copies are found.
 *
 * @param text - Text to clean
 * @returns The trimmed text with the trailing repetition collapsed
 */
export function cleanRepeatingLastSentence(text) {
    text = text.trim();
    const MIN_REPETITIONS = 2;
    const MIN_SENTENCE_CONTENT_LENGTH = 10;
    const sentenceEndIndices = Array.from(text.matchAll(/[.?!](?=\s+|$)/g), (terminator) => terminator.index);
    if (sentenceEndIndices.length < 2) {
        return text;
    }
    const lastTerminatorIndex = sentenceEndIndices[sentenceEndIndices.length - 1];
    const secondLastTerminatorIndex = sentenceEndIndices[sentenceEndIndices.length - 2];
    const repeatingUnit = text.substring(secondLastTerminatorIndex + 1, lastTerminatorIndex + 1);
    const unitLength = repeatingUnit.length;
    // Sentence text without surrounding whitespace and without its terminator.
    const sentenceContent = repeatingUnit.trim().slice(0, -1);
    if (sentenceContent.length < MIN_SENTENCE_CONTENT_LENGTH) {
        return text;
    }
    if (text.length < unitLength * MIN_REPETITIONS) {
        return text;
    }
    // Walk backwards from the last terminator, one unit at a time, while the
    // preceding chunk is another exact copy of the unit.
    let repetitionsFound = 0;
    let firstRepetitionStartIndex = -1;
    let checkEndIndex = lastTerminatorIndex + 1;
    while (checkEndIndex - unitLength >= 0 &&
        text.substring(checkEndIndex - unitLength, checkEndIndex) === repeatingUnit) {
        repetitionsFound++;
        checkEndIndex -= unitLength;
        firstRepetitionStartIndex = checkEndIndex;
    }
    if (repetitionsFound < MIN_REPETITIONS) {
        return text;
    }
    const textBeforeRepetitions = text.substring(0, firstRepetitionStartIndex);
    // When the repetition starts at the very beginning of the text, the first
    // copy has no leading whitespace and so is not matched by the loop above.
    // Without this check the result would still contain the sentence twice.
    if (textBeforeRepetitions === repeatingUnit.trimStart()) {
        return textBeforeRepetitions;
    }
    return textBeforeRepetitions + repeatingUnit;
}
|
|
@@ -1,138 +0,0 @@
|
|
|
1
|
-
export declare const CITATION_MARKDOWN_SYNTAX_PROMPT = "\nCitation syntax to use within Markdown:\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite attachment_id='attachment_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <attachment_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim 1-3 words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />\n\n\u2022 Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.\n\u2022 start_page_key, full_phrase, and line_ids are required for each citation.\n\u2022 Infer line_ids, as we only provide the first, last, and every 5th line. When copying a previous <cite />, use the full info from the previous citation without changing the start_page_key, line_ids, or any other <cite /> attributes.\n\u2022 Use refer to line_ids inclusively, and use a range (or single) for each citation, split multiple sequential line_ids into multiple citations.\n\u2022 These citations will be replaced and displayed in-line as a numeric element (e.g. [1]), the markdown preceding <cite /> should read naturally with only one <cite /> per sentence with rare exceptions for two <cite /> in a sentence. <cite /> often present best at the end of the sentence, and are not grouped at the end of the document.\n\u2022 The full_phrase should be the exact verbatim text of the phrase or paragraph from the source document to support the insight or idea.\n\u2022 We do NOT put the full_phrase inside <cite ...></cite>; we only use full_phrase inside the full_phrase attribute.\n";
|
|
2
|
-
export declare const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = "\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite attachment_id='attachment_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />\n\u2022 These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.\n";
|
|
3
|
-
/** Options accepted by wrapSystemCitationPrompt. */
export interface WrapSystemPromptOptions {
|
|
4
|
-
/** The original system prompt to wrap with citation instructions */
|
|
5
|
-
systemPrompt: string;
|
|
6
|
-
/** Whether to use audio/video citation format (with timestamps) instead of text-based (with line IDs) */
|
|
7
|
-
isAudioVideo?: boolean;
|
|
8
|
-
/** NOTE(review): undocumented in the original. Presumably selects whether the citation instructions are placed before the system prompt rather than after it — confirm against the implementation of wrapSystemCitationPrompt, including the default. */
prependCitationInstructions?: boolean;
|
|
9
|
-
}
|
|
10
|
-
/** Options accepted by wrapCitationPrompt. */
export interface WrapCitationPromptOptions {
|
|
11
|
-
/** The original system prompt to wrap with citation instructions */
|
|
12
|
-
systemPrompt: string;
|
|
13
|
-
/** The original user prompt */
|
|
14
|
-
userPrompt: string;
|
|
15
|
-
/** The extracted file text with metadata (from uploadFile response). Can be a single string or array for multiple files. */
|
|
16
|
-
deepTextPromptPortion?: string | string[];
|
|
17
|
-
/** Whether to use audio/video citation format (with timestamps) instead of text-based (with line IDs) */
|
|
18
|
-
isAudioVideo?: boolean;
|
|
19
|
-
}
|
|
20
|
-
/** Pair of prompts returned by wrapCitationPrompt. */
export interface WrapCitationPromptResult {
|
|
21
|
-
/** Enhanced system prompt with citation instructions */
|
|
22
|
-
enhancedSystemPrompt: string;
|
|
23
|
-
/** Enhanced user prompt (currently passed through unchanged) */
|
|
24
|
-
enhancedUserPrompt: string;
|
|
25
|
-
}
|
|
26
|
-
/**
|
|
27
|
-
* Wraps your existing system prompt with DeepCitation's citation syntax instructions.
|
|
28
|
-
* This enables LLMs to output verifiable citations that can be checked against source documents.
|
|
29
|
-
*
|
|
30
|
-
* @example
|
|
31
|
-
* ```typescript
|
|
32
|
-
* import { wrapSystemCitationPrompt } from '@deepcitation/deepcitation-js';
|
|
33
|
-
*
|
|
34
|
-
* const systemPrompt = "You are a helpful assistant that analyzes documents.";
|
|
35
|
-
* const enhanced = wrapSystemCitationPrompt({ systemPrompt });
|
|
36
|
-
*
|
|
37
|
-
* // Use enhanced prompt with your LLM
|
|
38
|
-
* const response = await openai.chat.completions.create({
|
|
39
|
-
* messages: [{ role: "system", content: enhanced }],
|
|
40
|
-
* // ...
|
|
41
|
-
* });
|
|
42
|
-
* ```
|
|
43
|
-
*/
|
|
44
|
-
export declare function wrapSystemCitationPrompt(options: WrapSystemPromptOptions): string;
|
|
45
|
-
/**
|
|
46
|
-
* Wraps both system and user prompts with DeepCitation's citation syntax instructions.
|
|
47
|
-
* This is the recommended way to prepare prompts for citation verification.
|
|
48
|
-
*
|
|
49
|
-
* @example
|
|
50
|
-
* ```typescript
|
|
51
|
-
* import { wrapCitationPrompt } from '@deepcitation/deepcitation-js';
|
|
52
|
-
*
|
|
53
|
-
* // Single file
|
|
54
|
-
* const { enhancedSystemPrompt, enhancedUserPrompt } = wrapCitationPrompt({
|
|
55
|
-
* systemPrompt: "You are a helpful assistant.",
|
|
56
|
-
* userPrompt: "Analyze this document and summarize it.",
|
|
57
|
-
* deepTextPromptPortion, // from uploadFile response
|
|
58
|
-
* });
|
|
59
|
-
*
|
|
60
|
-
* // Multiple files
|
|
61
|
-
* const { enhancedSystemPrompt, enhancedUserPrompt } = wrapCitationPrompt({
|
|
62
|
-
* systemPrompt: "You are a helpful assistant.",
|
|
63
|
-
* userPrompt: "Compare these documents.",
|
|
64
|
-
* deepTextPromptPortion: [deepTextPromptPortion1, deepTextPromptPortion2], // array of file texts
|
|
65
|
-
* });
|
|
66
|
-
*
|
|
67
|
-
* // Use enhanced prompts with your LLM
|
|
68
|
-
* const response = await llm.chat({
|
|
69
|
-
* messages: [
|
|
70
|
-
* { role: "system", content: enhancedSystemPrompt },
|
|
71
|
-
* { role: "user", content: enhancedUserPrompt },
|
|
72
|
-
* ],
|
|
73
|
-
* });
|
|
74
|
-
* ```
|
|
75
|
-
*/
|
|
76
|
-
export declare function wrapCitationPrompt(options: WrapCitationPromptOptions): WrapCitationPromptResult;
|
|
77
|
-
/**
 * Declared shape of the exported CITATION_JSON_OUTPUT_FORMAT constant: an
 * object with `type`, a `properties` map (attachmentId, startPageKey,
 * reasoning, fullPhrase, keySpan, lineIds) and a `required` list.
 * NOTE(review): this looks like a JSON-Schema-style description of one
 * text-based citation object; the literal values are not visible in this
 * declaration — confirm against the implementation.
 */
export declare const CITATION_JSON_OUTPUT_FORMAT: {
|
|
78
|
-
type: string;
|
|
79
|
-
properties: {
|
|
80
|
-
attachmentId: {
|
|
81
|
-
type: string;
|
|
82
|
-
};
|
|
83
|
-
startPageKey: {
|
|
84
|
-
type: string;
|
|
85
|
-
description: string;
|
|
86
|
-
};
|
|
87
|
-
reasoning: {
|
|
88
|
-
type: string;
|
|
89
|
-
description: string;
|
|
90
|
-
};
|
|
91
|
-
fullPhrase: {
|
|
92
|
-
type: string;
|
|
93
|
-
description: string;
|
|
94
|
-
};
|
|
95
|
-
keySpan: {
|
|
96
|
-
type: string;
|
|
97
|
-
description: string;
|
|
98
|
-
};
|
|
99
|
-
lineIds: {
|
|
100
|
-
type: string;
|
|
101
|
-
items: {
|
|
102
|
-
type: string;
|
|
103
|
-
};
|
|
104
|
-
description: string;
|
|
105
|
-
};
|
|
106
|
-
};
|
|
107
|
-
required: string[];
|
|
108
|
-
};
|
|
109
|
-
/**
 * Declared shape of the exported CITATION_AV_BASED_JSON_OUTPUT_FORMAT
 * constant: an object with `type`, a `properties` map (attachmentId,
 * startPageKey, fullPhrase, and a nested `timestamps` object with startTime
 * and endTime) and a `required` list.
 * NOTE(review): this looks like the audio/video counterpart of
 * CITATION_JSON_OUTPUT_FORMAT, using timestamps where that one uses line ids;
 * the literal values are not visible in this declaration — confirm against
 * the implementation.
 */
export declare const CITATION_AV_BASED_JSON_OUTPUT_FORMAT: {
|
|
110
|
-
type: string;
|
|
111
|
-
properties: {
|
|
112
|
-
attachmentId: {
|
|
113
|
-
type: string;
|
|
114
|
-
};
|
|
115
|
-
startPageKey: {
|
|
116
|
-
type: string;
|
|
117
|
-
description: string;
|
|
118
|
-
};
|
|
119
|
-
fullPhrase: {
|
|
120
|
-
type: string;
|
|
121
|
-
description: string;
|
|
122
|
-
};
|
|
123
|
-
timestamps: {
|
|
124
|
-
type: string;
|
|
125
|
-
properties: {
|
|
126
|
-
startTime: {
|
|
127
|
-
type: string;
|
|
128
|
-
};
|
|
129
|
-
endTime: {
|
|
130
|
-
type: string;
|
|
131
|
-
};
|
|
132
|
-
};
|
|
133
|
-
required: string[];
|
|
134
|
-
description: string;
|
|
135
|
-
};
|
|
136
|
-
};
|
|
137
|
-
required: string[];
|
|
138
|
-
};
|