@deepcitation/deepcitation-js 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/lib/client/DeepCitation.d.ts +12 -10
- package/lib/client/DeepCitation.js +67 -132
- package/lib/client/index.d.ts +1 -1
- package/lib/client/types.d.ts +4 -15
- package/lib/index.d.ts +7 -8
- package/lib/index.js +5 -6
- package/lib/parsing/normalizeCitation.js +17 -11
- package/lib/parsing/parseCitation.js +35 -14
- package/lib/prompts/citationPrompts.d.ts +9 -5
- package/lib/prompts/citationPrompts.js +27 -12
- package/lib/types/citation.d.ts +1 -3
- package/lib/types/citation.js +0 -1
- package/lib/types/foundHighlight.d.ts +2 -2
- package/lib/types/index.d.ts +1 -1
- package/lib/types/index.js +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -41,7 +41,7 @@ Get a free API key at [deepcitation.com](https://deepcitation.com/signup) — no
|
|
|
41
41
|
|
|
42
42
|
```bash
|
|
43
43
|
# .env
|
|
44
|
-
DEEPCITATION_API_KEY=
|
|
44
|
+
DEEPCITATION_API_KEY=sk-dc-your_api_key_here
|
|
45
45
|
```
|
|
46
46
|
|
|
47
47
|
---
|
|
@@ -60,7 +60,7 @@ import { DeepCitation, wrapCitationPrompt } from "@deepcitation/deepcitation-js"
|
|
|
60
60
|
const dc = new DeepCitation({ apiKey: process.env.DEEPCITATION_API_KEY });
|
|
61
61
|
|
|
62
62
|
// Upload source files
|
|
63
|
-
const { fileDataParts, fileDeepTexts } = await dc.prepareFiles([
|
|
63
|
+
const { fileDataParts, deepTextPromptPortion } = await dc.prepareFiles([
|
|
64
64
|
{ file: pdfBuffer, filename: "report.pdf" },
|
|
65
65
|
]);
|
|
66
66
|
|
|
@@ -68,7 +68,7 @@ const { fileDataParts, fileDeepTexts } = await dc.prepareFiles([
|
|
|
68
68
|
const { enhancedSystemPrompt, enhancedUserPrompt } = wrapCitationPrompt({
|
|
69
69
|
systemPrompt: "You are a helpful assistant...",
|
|
70
70
|
userPrompt: "Analyze this document",
|
|
71
|
-
|
|
71
|
+
deepTextPromptPortion,
|
|
72
72
|
});
|
|
73
73
|
|
|
74
74
|
// Call your LLM
|
|
@@ -123,7 +123,7 @@ function Response({ citations, verifications }) {
|
|
|
123
123
|
|
|
124
124
|
```typescript
|
|
125
125
|
const dc = new DeepCitation({
|
|
126
|
-
apiKey: string, // Your API key (
|
|
126
|
+
apiKey: string, // Your API key (sk-dc-*)
|
|
127
127
|
apiUrl?: string, // Optional: Custom API URL
|
|
128
128
|
});
|
|
129
129
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { Citation } from "../types/index";
|
|
2
|
-
import type { CitationInput, ConvertFileInput, ConvertFileResponse, DeepCitationConfig, FileInput, PrepareConvertedFileOptions, PrepareFilesResult, UploadFileOptions, UploadFileResponse,
|
|
2
|
+
import type { CitationInput, ConvertFileInput, ConvertFileResponse, DeepCitationConfig, FileInput, PrepareConvertedFileOptions, PrepareFilesResult, UploadFileOptions, UploadFileResponse, VerifyCitationsFromLlmOutput, VerifyCitationsOptions, VerifyCitationsResponse } from "./types";
|
|
3
3
|
/**
|
|
4
4
|
* DeepCitation client for file upload and citation verification.
|
|
5
5
|
*
|
|
@@ -33,6 +33,8 @@ export declare class DeepCitation {
|
|
|
33
33
|
* This allows users to reference files by their own IDs
|
|
34
34
|
*/
|
|
35
35
|
private fileIdMap;
|
|
36
|
+
/** Store file mapping and return public response */
|
|
37
|
+
private storeAndReturnResponse;
|
|
36
38
|
/**
|
|
37
39
|
* Create a new DeepCitation client instance.
|
|
38
40
|
*
|
|
@@ -91,7 +93,7 @@ export declare class DeepCitation {
|
|
|
91
93
|
* });
|
|
92
94
|
*
|
|
93
95
|
* // Then prepare the file for verification
|
|
94
|
-
* const {
|
|
96
|
+
* const { deepTextPromptPortion, fileId } = await dc.prepareConvertedFile({
|
|
95
97
|
* fileId: result.fileId
|
|
96
98
|
* });
|
|
97
99
|
* ```
|
|
@@ -99,7 +101,7 @@ export declare class DeepCitation {
|
|
|
99
101
|
convertToPdf(input: ConvertFileInput | string): Promise<ConvertFileResponse>;
|
|
100
102
|
/**
|
|
101
103
|
* Prepare a previously converted file for citation verification.
|
|
102
|
-
* Use this after calling convertToPdf() to extract text and get
|
|
104
|
+
* Use this after calling convertToPdf() to extract text and get deepTextPromptPortion.
|
|
103
105
|
*
|
|
104
106
|
* @param options - Options with fileId from convertFile
|
|
105
107
|
* @returns Upload response with fileId and extracted text
|
|
@@ -110,11 +112,11 @@ export declare class DeepCitation {
|
|
|
110
112
|
* const converted = await dc.convertToPdf({ url: "https://example.com/article" });
|
|
111
113
|
*
|
|
112
114
|
* // Then prepare it for verification
|
|
113
|
-
* const {
|
|
115
|
+
* const { deepTextPromptPortion, fileId } = await dc.prepareConvertedFile({
|
|
114
116
|
* fileId: converted.fileId
|
|
115
117
|
* });
|
|
116
118
|
*
|
|
117
|
-
* // Use
|
|
119
|
+
* // Use deepTextPromptPortion in your LLM prompt...
|
|
118
120
|
* ```
|
|
119
121
|
*/
|
|
120
122
|
prepareConvertedFile(options: PrepareConvertedFileOptions): Promise<UploadFileResponse>;
|
|
@@ -123,20 +125,20 @@ export declare class DeepCitation {
|
|
|
123
125
|
* This is the recommended way to prepare files for LLM prompts.
|
|
124
126
|
*
|
|
125
127
|
* @param files - Array of files to upload with optional filenames and fileIds
|
|
126
|
-
* @returns Object containing fileDataParts for verification and
|
|
128
|
+
* @returns Object containing fileDataParts for verification and deepTextPromptPortion for LLM
|
|
127
129
|
*
|
|
128
130
|
* @example
|
|
129
131
|
* ```typescript
|
|
130
|
-
* const { fileDataParts,
|
|
132
|
+
* const { fileDataParts, deepTextPromptPortion } = await dc.prepareFiles([
|
|
131
133
|
* { file: pdfBuffer, filename: "report.pdf" },
|
|
132
134
|
* { file: invoiceBuffer, filename: "invoice.pdf" },
|
|
133
135
|
* ]);
|
|
134
136
|
*
|
|
135
|
-
* // Use
|
|
137
|
+
* // Use deepTextPromptPortion in wrapCitationPrompt
|
|
136
138
|
* const { enhancedSystemPrompt, enhancedUserPrompt } = wrapCitationPrompt({
|
|
137
139
|
* systemPrompt,
|
|
138
140
|
* userPrompt,
|
|
139
|
-
*
|
|
141
|
+
* deepTextPromptPortion
|
|
140
142
|
* });
|
|
141
143
|
*
|
|
142
144
|
* // Use fileDataParts later for verification
|
|
@@ -185,7 +187,7 @@ export declare class DeepCitation {
|
|
|
185
187
|
* }
|
|
186
188
|
* ```
|
|
187
189
|
*/
|
|
188
|
-
verifyCitationsFromLlmOutput(input:
|
|
190
|
+
verifyCitationsFromLlmOutput(input: VerifyCitationsFromLlmOutput, citations?: {
|
|
189
191
|
[key: string]: Citation;
|
|
190
192
|
}): Promise<VerifyCitationsResponse>;
|
|
191
193
|
/**
|
|
@@ -1,6 +1,25 @@
|
|
|
1
1
|
import { getAllCitationsFromLlmOutput } from "../parsing/parseCitation";
|
|
2
2
|
import { generateCitationKey } from "../react/utils";
|
|
3
3
|
const DEFAULT_API_URL = "https://api.deepcitation.com";
|
|
4
|
+
/** Convert File/Blob/Buffer to a Blob suitable for FormData */
|
|
5
|
+
function toBlob(file, filename) {
|
|
6
|
+
if (typeof Buffer !== "undefined" && Buffer.isBuffer(file)) {
|
|
7
|
+
const uint8 = Uint8Array.from(file);
|
|
8
|
+
return { blob: new Blob([uint8]), name: filename || "document" };
|
|
9
|
+
}
|
|
10
|
+
if (file instanceof Blob) {
|
|
11
|
+
return {
|
|
12
|
+
blob: file,
|
|
13
|
+
name: filename || (file instanceof File ? file.name : "document"),
|
|
14
|
+
};
|
|
15
|
+
}
|
|
16
|
+
throw new Error("Invalid file type. Expected File, Blob, or Buffer.");
|
|
17
|
+
}
|
|
18
|
+
/** Extract error message from API response */
|
|
19
|
+
async function extractErrorMessage(response, fallbackAction) {
|
|
20
|
+
const error = await response.json().catch(() => ({}));
|
|
21
|
+
return error?.error?.message || `${fallbackAction} failed with status ${response.status}`;
|
|
22
|
+
}
|
|
4
23
|
/**
|
|
5
24
|
* DeepCitation client for file upload and citation verification.
|
|
6
25
|
*
|
|
@@ -34,6 +53,12 @@ export class DeepCitation {
|
|
|
34
53
|
* This allows users to reference files by their own IDs
|
|
35
54
|
*/
|
|
36
55
|
fileIdMap = new Map();
|
|
56
|
+
/** Store file mapping and return public response */
|
|
57
|
+
storeAndReturnResponse(apiResponse) {
|
|
58
|
+
this.fileIdMap.set(apiResponse.fileId, { attachmentId: apiResponse.attachmentId });
|
|
59
|
+
const { attachmentId: _, ...publicResponse } = apiResponse;
|
|
60
|
+
return publicResponse;
|
|
61
|
+
}
|
|
37
62
|
/**
|
|
38
63
|
* Create a new DeepCitation client instance.
|
|
39
64
|
*
|
|
@@ -71,51 +96,22 @@ export class DeepCitation {
|
|
|
71
96
|
* ```
|
|
72
97
|
*/
|
|
73
98
|
async uploadFile(file, options) {
|
|
99
|
+
const { blob, name } = toBlob(file, options?.filename);
|
|
74
100
|
const formData = new FormData();
|
|
75
|
-
|
|
76
|
-
if (
|
|
77
|
-
// Node.js Buffer - copy to a new ArrayBuffer for Blob compatibility
|
|
78
|
-
const filename = options?.filename || "document";
|
|
79
|
-
// Use Uint8Array.from to create a copy that's definitely backed by ArrayBuffer (not SharedArrayBuffer)
|
|
80
|
-
const uint8 = Uint8Array.from(file);
|
|
81
|
-
const blob = new Blob([uint8]);
|
|
82
|
-
formData.append("file", blob, filename);
|
|
83
|
-
}
|
|
84
|
-
else if (file instanceof Blob) {
|
|
85
|
-
// File or Blob
|
|
86
|
-
const filename = options?.filename || (file instanceof File ? file.name : "document");
|
|
87
|
-
formData.append("file", file, filename);
|
|
88
|
-
}
|
|
89
|
-
else {
|
|
90
|
-
throw new Error("Invalid file type. Expected File, Blob, or Buffer.");
|
|
91
|
-
}
|
|
92
|
-
// Add optional fields
|
|
93
|
-
if (options?.fileId) {
|
|
101
|
+
formData.append("file", blob, name);
|
|
102
|
+
if (options?.fileId)
|
|
94
103
|
formData.append("fileId", options.fileId);
|
|
95
|
-
|
|
96
|
-
if (options?.filename) {
|
|
104
|
+
if (options?.filename)
|
|
97
105
|
formData.append("filename", options.filename);
|
|
98
|
-
}
|
|
99
106
|
const response = await fetch(`${this.apiUrl}/prepareFile`, {
|
|
100
107
|
method: "POST",
|
|
101
|
-
headers: {
|
|
102
|
-
Authorization: `Bearer ${this.apiKey}`,
|
|
103
|
-
},
|
|
108
|
+
headers: { Authorization: `Bearer ${this.apiKey}` },
|
|
104
109
|
body: formData,
|
|
105
110
|
});
|
|
106
111
|
if (!response.ok) {
|
|
107
|
-
|
|
108
|
-
throw new Error(error?.error?.message || `Upload failed with status ${response.status}`);
|
|
112
|
+
throw new Error(await extractErrorMessage(response, "Upload"));
|
|
109
113
|
}
|
|
110
|
-
|
|
111
|
-
const apiResponse = (await response.json());
|
|
112
|
-
// Store the mapping for later verification calls
|
|
113
|
-
this.fileIdMap.set(apiResponse.fileId, {
|
|
114
|
-
attachmentId: apiResponse.attachmentId,
|
|
115
|
-
});
|
|
116
|
-
// Return public response without internal fields
|
|
117
|
-
const { attachmentId: _attachmentId, ...publicResponse } = apiResponse;
|
|
118
|
-
return publicResponse;
|
|
114
|
+
return this.storeAndReturnResponse(await response.json());
|
|
119
115
|
}
|
|
120
116
|
/**
|
|
121
117
|
* Convert a URL or Office file to PDF for citation verification.
|
|
@@ -144,85 +140,53 @@ export class DeepCitation {
|
|
|
144
140
|
* });
|
|
145
141
|
*
|
|
146
142
|
* // Then prepare the file for verification
|
|
147
|
-
* const {
|
|
143
|
+
* const { deepTextPromptPortion, fileId } = await dc.prepareConvertedFile({
|
|
148
144
|
* fileId: result.fileId
|
|
149
145
|
* });
|
|
150
146
|
* ```
|
|
151
147
|
*/
|
|
152
148
|
async convertToPdf(input) {
|
|
153
|
-
// Handle string URL shorthand
|
|
154
149
|
const inputObj = typeof input === "string" ? { url: input } : input;
|
|
155
|
-
const { url, file, filename, fileId, singlePage } = inputObj;
|
|
150
|
+
const { url, file, filename, fileId } = inputObj;
|
|
156
151
|
if (!url && !file) {
|
|
157
152
|
throw new Error("Either url or file must be provided");
|
|
158
153
|
}
|
|
159
154
|
let response;
|
|
160
155
|
if (url) {
|
|
161
|
-
// URL conversion - send as JSON
|
|
162
156
|
response = await fetch(`${this.apiUrl}/convertFile`, {
|
|
163
157
|
method: "POST",
|
|
164
158
|
headers: {
|
|
165
159
|
Authorization: `Bearer ${this.apiKey}`,
|
|
166
160
|
"Content-Type": "application/json",
|
|
167
161
|
},
|
|
168
|
-
body: JSON.stringify({
|
|
169
|
-
url,
|
|
170
|
-
filename,
|
|
171
|
-
fileId,
|
|
172
|
-
singlePage,
|
|
173
|
-
}),
|
|
162
|
+
body: JSON.stringify({ url, filename, fileId }),
|
|
174
163
|
});
|
|
175
164
|
}
|
|
176
|
-
else
|
|
177
|
-
|
|
165
|
+
else {
|
|
166
|
+
const { blob, name } = toBlob(file, filename);
|
|
178
167
|
const formData = new FormData();
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
const uint8 = Uint8Array.from(file);
|
|
182
|
-
const blob = new Blob([uint8]);
|
|
183
|
-
formData.append("file", blob, fname);
|
|
184
|
-
}
|
|
185
|
-
else if (file instanceof Blob) {
|
|
186
|
-
const fname = filename || (file instanceof File ? file.name : "document");
|
|
187
|
-
formData.append("file", file, fname);
|
|
188
|
-
}
|
|
189
|
-
else {
|
|
190
|
-
throw new Error("Invalid file type. Expected File, Blob, or Buffer.");
|
|
191
|
-
}
|
|
192
|
-
if (fileId) {
|
|
168
|
+
formData.append("file", blob, name);
|
|
169
|
+
if (fileId)
|
|
193
170
|
formData.append("fileId", fileId);
|
|
194
|
-
|
|
195
|
-
if (filename) {
|
|
171
|
+
if (filename)
|
|
196
172
|
formData.append("filename", filename);
|
|
197
|
-
}
|
|
198
173
|
response = await fetch(`${this.apiUrl}/convertFile`, {
|
|
199
174
|
method: "POST",
|
|
200
|
-
headers: {
|
|
201
|
-
Authorization: `Bearer ${this.apiKey}`,
|
|
202
|
-
},
|
|
175
|
+
headers: { Authorization: `Bearer ${this.apiKey}` },
|
|
203
176
|
body: formData,
|
|
204
177
|
});
|
|
205
178
|
}
|
|
206
|
-
else {
|
|
207
|
-
throw new Error("Either url or file must be provided");
|
|
208
|
-
}
|
|
209
179
|
if (!response.ok) {
|
|
210
|
-
|
|
211
|
-
throw new Error(error?.error?.message || `Conversion failed with status ${response.status}`);
|
|
180
|
+
throw new Error(await extractErrorMessage(response, "Conversion"));
|
|
212
181
|
}
|
|
213
|
-
// Internal response includes attachmentId which we need for the two-step flow
|
|
214
182
|
const apiResponse = (await response.json());
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
attachmentId: apiResponse.attachmentId,
|
|
218
|
-
});
|
|
219
|
-
// Return public response without internal fields
|
|
220
|
-
const { attachmentId: _attachmentId, ...publicResponse } = apiResponse;
|
|
183
|
+
this.fileIdMap.set(apiResponse.fileId, { attachmentId: apiResponse.attachmentId });
|
|
184
|
+
const { attachmentId: _, ...publicResponse } = apiResponse;
|
|
221
185
|
return publicResponse;
|
|
222
186
|
}
|
|
223
187
|
/**
|
|
224
188
|
* Prepare a previously converted file for citation verification.
|
|
225
|
-
* Use this after calling convertToPdf() to extract text and get
|
|
189
|
+
* Use this after calling convertToPdf() to extract text and get deepTextPromptPortion.
|
|
226
190
|
*
|
|
227
191
|
* @param options - Options with fileId from convertFile
|
|
228
192
|
* @returns Upload response with fileId and extracted text
|
|
@@ -233,15 +197,14 @@ export class DeepCitation {
|
|
|
233
197
|
* const converted = await dc.convertToPdf({ url: "https://example.com/article" });
|
|
234
198
|
*
|
|
235
199
|
* // Then prepare it for verification
|
|
236
|
-
* const {
|
|
200
|
+
* const { deepTextPromptPortion, fileId } = await dc.prepareConvertedFile({
|
|
237
201
|
* fileId: converted.fileId
|
|
238
202
|
* });
|
|
239
203
|
*
|
|
240
|
-
* // Use
|
|
204
|
+
* // Use deepTextPromptPortion in your LLM prompt...
|
|
241
205
|
* ```
|
|
242
206
|
*/
|
|
243
207
|
async prepareConvertedFile(options) {
|
|
244
|
-
// Look up the internal attachmentId from the fileId
|
|
245
208
|
const fileInfo = this.fileIdMap.get(options.fileId);
|
|
246
209
|
if (!fileInfo) {
|
|
247
210
|
throw new Error(`File ID "${options.fileId}" not found. Make sure to call convertToPdf() first.`);
|
|
@@ -258,38 +221,29 @@ export class DeepCitation {
|
|
|
258
221
|
}),
|
|
259
222
|
});
|
|
260
223
|
if (!response.ok) {
|
|
261
|
-
|
|
262
|
-
throw new Error(error?.error?.message || `Prepare failed with status ${response.status}`);
|
|
224
|
+
throw new Error(await extractErrorMessage(response, "Prepare"));
|
|
263
225
|
}
|
|
264
|
-
|
|
265
|
-
const apiResponse = (await response.json());
|
|
266
|
-
// Update the mapping (attachmentId should remain the same)
|
|
267
|
-
this.fileIdMap.set(apiResponse.fileId, {
|
|
268
|
-
attachmentId: apiResponse.attachmentId,
|
|
269
|
-
});
|
|
270
|
-
// Return public response without internal fields
|
|
271
|
-
const { attachmentId: _attachmentId, ...publicResponse } = apiResponse;
|
|
272
|
-
return publicResponse;
|
|
226
|
+
return this.storeAndReturnResponse(await response.json());
|
|
273
227
|
}
|
|
274
228
|
/**
|
|
275
229
|
* Upload multiple files for citation verification and get structured content.
|
|
276
230
|
* This is the recommended way to prepare files for LLM prompts.
|
|
277
231
|
*
|
|
278
232
|
* @param files - Array of files to upload with optional filenames and fileIds
|
|
279
|
-
* @returns Object containing fileDataParts for verification and
|
|
233
|
+
* @returns Object containing fileDataParts for verification and deepTextPromptPortion for LLM
|
|
280
234
|
*
|
|
281
235
|
* @example
|
|
282
236
|
* ```typescript
|
|
283
|
-
* const { fileDataParts,
|
|
237
|
+
* const { fileDataParts, deepTextPromptPortion } = await dc.prepareFiles([
|
|
284
238
|
* { file: pdfBuffer, filename: "report.pdf" },
|
|
285
239
|
* { file: invoiceBuffer, filename: "invoice.pdf" },
|
|
286
240
|
* ]);
|
|
287
241
|
*
|
|
288
|
-
* // Use
|
|
242
|
+
* // Use deepTextPromptPortion in wrapCitationPrompt
|
|
289
243
|
* const { enhancedSystemPrompt, enhancedUserPrompt } = wrapCitationPrompt({
|
|
290
244
|
* systemPrompt,
|
|
291
245
|
* userPrompt,
|
|
292
|
-
*
|
|
246
|
+
* deepTextPromptPortion
|
|
293
247
|
* });
|
|
294
248
|
*
|
|
295
249
|
* // Use fileDataParts later for verification
|
|
@@ -298,17 +252,17 @@ export class DeepCitation {
|
|
|
298
252
|
*/
|
|
299
253
|
async prepareFiles(files) {
|
|
300
254
|
if (files.length === 0) {
|
|
301
|
-
return { fileDataParts: [],
|
|
255
|
+
return { fileDataParts: [], deepTextPromptPortion: [] };
|
|
302
256
|
}
|
|
303
257
|
// Upload all files in parallel
|
|
304
258
|
const uploadPromises = files.map(({ file, filename, fileId }) => this.uploadFile(file, { filename, fileId }));
|
|
305
259
|
const results = await Promise.all(uploadPromises);
|
|
306
260
|
// Extract file data parts and file deep texts
|
|
307
|
-
const fileDataParts = results.map(result => ({
|
|
261
|
+
const fileDataParts = results.map((result) => ({
|
|
308
262
|
fileId: result.fileId,
|
|
309
263
|
}));
|
|
310
|
-
const
|
|
311
|
-
return { fileDataParts,
|
|
264
|
+
const deepTextPromptPortion = results.map((result) => result.deepTextPromptPortion);
|
|
265
|
+
return { fileDataParts, deepTextPromptPortion };
|
|
312
266
|
}
|
|
313
267
|
/**
|
|
314
268
|
* Verify citations against a previously uploaded file.
|
|
@@ -376,8 +330,7 @@ export class DeepCitation {
|
|
|
376
330
|
}),
|
|
377
331
|
});
|
|
378
332
|
if (!response.ok) {
|
|
379
|
-
|
|
380
|
-
throw new Error(error?.error?.message || `Verification failed with status ${response.status}`);
|
|
333
|
+
throw new Error(await extractErrorMessage(response, "Verification"));
|
|
381
334
|
}
|
|
382
335
|
return (await response.json());
|
|
383
336
|
}
|
|
@@ -412,7 +365,7 @@ export class DeepCitation {
|
|
|
412
365
|
// Note: fileDataParts is now only used to identify which files to verify
|
|
413
366
|
// The mapping from fileId to attachmentId must be registered via uploadFile() or prepareFiles()
|
|
414
367
|
// in the same session. For Zero Data Retention scenarios, use verifyCitations() directly.
|
|
415
|
-
// Group citations by fileId
|
|
368
|
+
// Group citations by fileId
|
|
416
369
|
const citationsByFile = new Map();
|
|
417
370
|
for (const [key, citation] of Object.entries(citations)) {
|
|
418
371
|
const fileId = citation.fileId || "";
|
|
@@ -421,34 +374,16 @@ export class DeepCitation {
|
|
|
421
374
|
}
|
|
422
375
|
citationsByFile.get(fileId)[key] = citation;
|
|
423
376
|
}
|
|
424
|
-
//
|
|
425
|
-
const
|
|
377
|
+
// Filter to only registered files and verify in parallel
|
|
378
|
+
const verificationPromises = [];
|
|
426
379
|
for (const [fileId, fileCitations] of citationsByFile) {
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
if (!fileInfo) {
|
|
430
|
-
// Skip citations for unregistered files
|
|
431
|
-
continue;
|
|
380
|
+
if (this.fileIdMap.has(fileId)) {
|
|
381
|
+
verificationPromises.push(this.verifyCitations(fileId, fileCitations, { outputImageFormat }));
|
|
432
382
|
}
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
"Content-Type": "application/json",
|
|
438
|
-
},
|
|
439
|
-
body: JSON.stringify({
|
|
440
|
-
data: {
|
|
441
|
-
attachmentId: fileInfo.attachmentId,
|
|
442
|
-
citations: fileCitations,
|
|
443
|
-
outputImageFormat,
|
|
444
|
-
},
|
|
445
|
-
}),
|
|
446
|
-
});
|
|
447
|
-
if (!response.ok) {
|
|
448
|
-
const error = await response.json().catch(() => ({}));
|
|
449
|
-
throw new Error(error?.error?.message || `Verification failed with status ${response.status}`);
|
|
450
|
-
}
|
|
451
|
-
const result = (await response.json());
|
|
383
|
+
}
|
|
384
|
+
const results = await Promise.all(verificationPromises);
|
|
385
|
+
const allHighlights = {};
|
|
386
|
+
for (const result of results) {
|
|
452
387
|
Object.assign(allHighlights, result.foundHighlights);
|
|
453
388
|
}
|
|
454
389
|
return { foundHighlights: allHighlights };
|
package/lib/client/index.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
export { DeepCitation } from "./DeepCitation";
|
|
2
|
-
export type { DeepCitationConfig, UploadFileResponse, UploadFileOptions, VerifyCitationsResponse, VerifyCitationsOptions, CitationInput, FileInput, FileDataPart, PrepareFilesResult,
|
|
2
|
+
export type { DeepCitationConfig, UploadFileResponse, UploadFileOptions, VerifyCitationsResponse, VerifyCitationsOptions, CitationInput, FileInput, FileDataPart, PrepareFilesResult, VerifyCitationsFromLlmOutput, ConvertFileInput, ConvertFileResponse, PrepareConvertedFileOptions, } from "./types";
|
package/lib/client/types.d.ts
CHANGED
|
@@ -3,7 +3,7 @@ import type { Citation, FoundHighlightLocation } from "../types/index";
|
|
|
3
3
|
* Configuration options for the DeepCitation client
|
|
4
4
|
*/
|
|
5
5
|
export interface DeepCitationConfig {
|
|
6
|
-
/** Your DeepCitation API key (starts with
|
|
6
|
+
/** Your DeepCitation API key (starts with sk-dc-) */
|
|
7
7
|
apiKey: string;
|
|
8
8
|
/** Optional custom API base URL. Defaults to https://api.deepcitation.com */
|
|
9
9
|
apiUrl?: string;
|
|
@@ -15,7 +15,7 @@ export interface UploadFileResponse {
|
|
|
15
15
|
/** The file ID assigned by DeepCitation (custom or auto-generated) */
|
|
16
16
|
fileId: string;
|
|
17
17
|
/** The full text content formatted for LLM prompts with page markers and line IDs. Use this in your user prompts. */
|
|
18
|
-
|
|
18
|
+
deepTextPromptPortion: string;
|
|
19
19
|
/** Form fields extracted from PDF forms */
|
|
20
20
|
formFields?: Array<{
|
|
21
21
|
name: string;
|
|
@@ -89,12 +89,12 @@ export interface PrepareFilesResult {
|
|
|
89
89
|
/** Array of file references for verification */
|
|
90
90
|
fileDataParts: FileDataPart[];
|
|
91
91
|
/** Array of formatted text content for LLM prompts (with page markers and line IDs) */
|
|
92
|
-
|
|
92
|
+
deepTextPromptPortion: string[];
|
|
93
93
|
}
|
|
94
94
|
/**
|
|
95
95
|
* Input for verifyCitationsFromLlmOutput
|
|
96
96
|
*/
|
|
97
|
-
export interface
|
|
97
|
+
export interface VerifyCitationsFromLlmOutput {
|
|
98
98
|
/** The LLM response containing citations */
|
|
99
99
|
llmOutput: string;
|
|
100
100
|
/** Optional file references (required for Zero Data Retention or after storage expires) */
|
|
@@ -114,8 +114,6 @@ export interface ConvertFileInput {
|
|
|
114
114
|
filename?: string;
|
|
115
115
|
/** Optional custom file ID */
|
|
116
116
|
fileId?: string;
|
|
117
|
-
/** For URLs: render as single long page instead of paginated */
|
|
118
|
-
singlePage?: boolean;
|
|
119
117
|
}
|
|
120
118
|
/**
|
|
121
119
|
* Response from convertFile
|
|
@@ -146,12 +144,3 @@ export interface PrepareConvertedFileOptions {
|
|
|
146
144
|
/** The file ID from a previous convertFile call */
|
|
147
145
|
fileId: string;
|
|
148
146
|
}
|
|
149
|
-
/**
|
|
150
|
-
* @deprecated Use PrepareConvertedFileOptions instead
|
|
151
|
-
*/
|
|
152
|
-
export interface PrepareFileFromAttachmentOptions {
|
|
153
|
-
/** The attachment ID from a previous convertFile call */
|
|
154
|
-
attachmentId: string;
|
|
155
|
-
/** Optional custom file ID */
|
|
156
|
-
fileId?: string;
|
|
157
|
-
}
|
package/lib/index.d.ts
CHANGED
|
@@ -3,15 +3,15 @@
|
|
|
3
3
|
* @packageDocumentation
|
|
4
4
|
*/
|
|
5
5
|
export { DeepCitation } from "./client/index.js";
|
|
6
|
-
export type { DeepCitationConfig, UploadFileResponse, UploadFileOptions, VerifyCitationsResponse, VerifyCitationsOptions, CitationInput, FileInput, FileDataPart, PrepareFilesResult,
|
|
6
|
+
export type { DeepCitationConfig, UploadFileResponse, UploadFileOptions, VerifyCitationsResponse, VerifyCitationsOptions, CitationInput, FileInput, FileDataPart, PrepareFilesResult, VerifyCitationsFromLlmOutput, } from "./client/index.js";
|
|
7
7
|
export { parseCitation, getCitationStatus, getAllCitationsFromLlmOutput, groupCitationsByFileId, groupCitationsByFileIdObject, } from "./parsing/parseCitation.js";
|
|
8
|
-
export { normalizeCitations, getCitationPageNumber } from "./parsing/normalizeCitation.js";
|
|
9
|
-
export { isGeminiGarbage, cleanRepeatingLastSentence } from "./parsing/parseWorkAround.js";
|
|
8
|
+
export { normalizeCitations, getCitationPageNumber, } from "./parsing/normalizeCitation.js";
|
|
9
|
+
export { isGeminiGarbage, cleanRepeatingLastSentence, } from "./parsing/parseWorkAround.js";
|
|
10
10
|
export type { Citation, CitationStatus, VerifyCitationRequest, VerifyCitationResponse, OutputImageFormat, } from "./types/citation.js";
|
|
11
|
-
export {
|
|
11
|
+
export { DEFAULT_OUTPUT_IMAGE_FORMAT } from "./types/citation.js";
|
|
12
12
|
export type { FoundHighlightLocation } from "./types/foundHighlight.js";
|
|
13
13
|
export { NOT_FOUND_HIGHLIGHT_INDEX, PENDING_HIGHLIGHT_INDEX, BLANK_HIGHLIGHT_LOCATION, deterministicIdFromHighlightLocation, } from "./types/foundHighlight.js";
|
|
14
|
-
export type { SearchState, SearchStatus, SearchMethod, SearchAttempt } from "./types/search.js";
|
|
14
|
+
export type { SearchState, SearchStatus, SearchMethod, SearchAttempt, } from "./types/search.js";
|
|
15
15
|
export type { ScreenBox, PdfSpaceItem, IVertex } from "./types/boxes.js";
|
|
16
16
|
export { sha1Hash } from "./utils/sha.js";
|
|
17
17
|
export { generateCitationKey } from "./react/utils.js";
|
|
@@ -19,7 +19,6 @@ export { generateCitationInstanceId } from "./react/utils.js";
|
|
|
19
19
|
export { CITATION_X_PADDING, CITATION_Y_PADDING } from "./react/utils.js";
|
|
20
20
|
export { CITATION_JSON_OUTPUT_FORMAT, CITATION_MARKDOWN_SYNTAX_PROMPT, AV_CITATION_MARKDOWN_SYNTAX_PROMPT, CITATION_AV_BASED_JSON_OUTPUT_FORMAT, wrapSystemCitationPrompt, wrapCitationPrompt, } from "./prompts/citationPrompts.js";
|
|
21
21
|
export type { WrapSystemPromptOptions, WrapCitationPromptOptions, WrapCitationPromptResult, } from "./prompts/citationPrompts.js";
|
|
22
|
-
export { removeLineIdMetadata, removePageNumberMetadata, removeCitations } from "./parsing/normalizeCitation.js";
|
|
23
|
-
export { compressPromptIds, decompressPromptIds } from "./prompts/promptCompression.js";
|
|
22
|
+
export { removeLineIdMetadata, removePageNumberMetadata, removeCitations, } from "./parsing/normalizeCitation.js";
|
|
23
|
+
export { compressPromptIds, decompressPromptIds, } from "./prompts/promptCompression.js";
|
|
24
24
|
export type { CompressedResult } from "./prompts/types.js";
|
|
25
|
-
export { CitationComponent } from "./react/CitationComponent.js";
|
package/lib/index.js
CHANGED
|
@@ -6,9 +6,9 @@
|
|
|
6
6
|
export { DeepCitation } from "./client/index.js";
|
|
7
7
|
// Parsing
|
|
8
8
|
export { parseCitation, getCitationStatus, getAllCitationsFromLlmOutput, groupCitationsByFileId, groupCitationsByFileIdObject, } from "./parsing/parseCitation.js";
|
|
9
|
-
export { normalizeCitations, getCitationPageNumber } from "./parsing/normalizeCitation.js";
|
|
10
|
-
export { isGeminiGarbage, cleanRepeatingLastSentence } from "./parsing/parseWorkAround.js";
|
|
11
|
-
export {
|
|
9
|
+
export { normalizeCitations, getCitationPageNumber, } from "./parsing/normalizeCitation.js";
|
|
10
|
+
export { isGeminiGarbage, cleanRepeatingLastSentence, } from "./parsing/parseWorkAround.js";
|
|
11
|
+
export { DEFAULT_OUTPUT_IMAGE_FORMAT } from "./types/citation.js";
|
|
12
12
|
export { NOT_FOUND_HIGHLIGHT_INDEX, PENDING_HIGHLIGHT_INDEX, BLANK_HIGHLIGHT_LOCATION, deterministicIdFromHighlightLocation, } from "./types/foundHighlight.js";
|
|
13
13
|
// Utilities
|
|
14
14
|
export { sha1Hash } from "./utils/sha.js";
|
|
@@ -17,6 +17,5 @@ export { generateCitationInstanceId } from "./react/utils.js";
|
|
|
17
17
|
export { CITATION_X_PADDING, CITATION_Y_PADDING } from "./react/utils.js";
|
|
18
18
|
// Prompts
|
|
19
19
|
export { CITATION_JSON_OUTPUT_FORMAT, CITATION_MARKDOWN_SYNTAX_PROMPT, AV_CITATION_MARKDOWN_SYNTAX_PROMPT, CITATION_AV_BASED_JSON_OUTPUT_FORMAT, wrapSystemCitationPrompt, wrapCitationPrompt, } from "./prompts/citationPrompts.js";
|
|
20
|
-
export { removeLineIdMetadata, removePageNumberMetadata, removeCitations } from "./parsing/normalizeCitation.js";
|
|
21
|
-
export { compressPromptIds, decompressPromptIds } from "./prompts/promptCompression.js";
|
|
22
|
-
export { CitationComponent } from "./react/CitationComponent.js";
|
|
20
|
+
export { removeLineIdMetadata, removePageNumberMetadata, removeCitations, } from "./parsing/normalizeCitation.js";
|
|
21
|
+
export { compressPromptIds, decompressPromptIds, } from "./prompts/promptCompression.js";
|
|
@@ -35,7 +35,7 @@ export const normalizeCitations = (response) => {
|
|
|
35
35
|
return normalizeCitationContent(trimmedResponse);
|
|
36
36
|
}
|
|
37
37
|
trimmedResponse = citationParts
|
|
38
|
-
.map(part =>
|
|
38
|
+
.map((part) => part.startsWith("<cite") ? normalizeCitationContent(part) : part)
|
|
39
39
|
.join("");
|
|
40
40
|
return trimmedResponse;
|
|
41
41
|
};
|
|
@@ -49,10 +49,14 @@ const normalizeCitationContent = (input) => {
|
|
|
49
49
|
return "full_phrase";
|
|
50
50
|
if (key === "lineIds" || key === "line_ids")
|
|
51
51
|
return "line_ids";
|
|
52
|
-
if (key === "startPageKey" ||
|
|
52
|
+
if (key === "startPageKey" ||
|
|
53
|
+
key === "start_pageKey" ||
|
|
54
|
+
key === "start_page_key")
|
|
53
55
|
return "start_page_key";
|
|
54
56
|
if (key === "fileID" || key === "fileId" || key === "file_id")
|
|
55
57
|
return "file_id";
|
|
58
|
+
if (key === "keySpan" || key === "key_span")
|
|
59
|
+
return "key_span";
|
|
56
60
|
return key;
|
|
57
61
|
};
|
|
58
62
|
// Helper to decode HTML entities (simple implementation, expand if needed)
|
|
@@ -67,7 +71,7 @@ const normalizeCitationContent = (input) => {
|
|
|
67
71
|
// 2. ROBUST TEXT ATTRIBUTE PARSING (reasoning, value, full_phrase)
|
|
68
72
|
// This regex matches: Key = Quote -> Content (lazy) -> Lookahead for (Next Attribute OR End of Tag)
|
|
69
73
|
// It effectively ignores quotes inside the content during the initial capture.
|
|
70
|
-
const textAttributeRegex = /(fullPhrase|full_phrase|reasoning|value)\s*=\s*(['"])([\s\S]*?)(?=\s+(?:line_ids|lineIds|timestamps|fileId|file_id|start_page_key|start_pageKey|startPageKey|reasoning|value|full_phrase)|\s*\/?>)/gm;
|
|
74
|
+
const textAttributeRegex = /(fullPhrase|full_phrase|keySpan|key_span|reasoning|value)\s*=\s*(['"])([\s\S]*?)(?=\s+(?:line_ids|lineIds|timestamps|fileId|file_id|start_page_key|start_pageKey|startPageKey|keySpan|key_span|reasoning|value|full_phrase)|\s*\/?>)/gm;
|
|
71
75
|
normalized = normalized.replace(textAttributeRegex, (_match, key, openQuote, rawContent) => {
|
|
72
76
|
let content = rawContent;
|
|
73
77
|
// The lazy match usually captures the closing quote because the lookahead
|
|
@@ -139,7 +143,7 @@ const normalizeCitationContent = (input) => {
|
|
|
139
143
|
if (keys.length === 0)
|
|
140
144
|
return tag;
|
|
141
145
|
const hasTimestamps = typeof attrs.timestamps === "string" && attrs.timestamps.length > 0;
|
|
142
|
-
const startPageKeys = keys.filter(k => k.startsWith("start_page"));
|
|
146
|
+
const startPageKeys = keys.filter((k) => k.startsWith("start_page"));
|
|
143
147
|
const ordered = [];
|
|
144
148
|
// Shared first
|
|
145
149
|
if (attrs.file_id)
|
|
@@ -151,15 +155,17 @@ const normalizeCitationContent = (input) => {
|
|
|
151
155
|
ordered.push("timestamps");
|
|
152
156
|
}
|
|
153
157
|
else {
|
|
154
|
-
// Document citations: fileId, start_page*, full_phrase, line_ids, (optional reasoning/value), then any extras
|
|
158
|
+
// Document citations: fileId, start_page*, full_phrase, key_span, line_ids, (optional reasoning/value), then any extras
|
|
155
159
|
if (startPageKeys.includes("start_page_key"))
|
|
156
160
|
ordered.push("start_page_key");
|
|
157
161
|
startPageKeys
|
|
158
|
-
.filter(k => k !== "start_page_key")
|
|
162
|
+
.filter((k) => k !== "start_page_key")
|
|
159
163
|
.sort()
|
|
160
|
-
.forEach(k => ordered.push(k));
|
|
164
|
+
.forEach((k) => ordered.push(k));
|
|
161
165
|
if (attrs.full_phrase)
|
|
162
166
|
ordered.push("full_phrase");
|
|
167
|
+
if (attrs.key_span)
|
|
168
|
+
ordered.push("key_span");
|
|
163
169
|
if (attrs.line_ids)
|
|
164
170
|
ordered.push("line_ids");
|
|
165
171
|
}
|
|
@@ -171,12 +177,12 @@ const normalizeCitationContent = (input) => {
|
|
|
171
177
|
// Any remaining attributes, stable + deterministic (alpha)
|
|
172
178
|
const used = new Set(ordered);
|
|
173
179
|
keys
|
|
174
|
-
.filter(k => !used.has(k))
|
|
180
|
+
.filter((k) => !used.has(k))
|
|
175
181
|
.sort()
|
|
176
|
-
.forEach(k => ordered.push(k));
|
|
177
|
-
const rebuiltAttrs = ordered.map(k => `${k}='${attrs[k]}'`).join(" ");
|
|
182
|
+
.forEach((k) => ordered.push(k));
|
|
183
|
+
const rebuiltAttrs = ordered.map((k) => `${k}='${attrs[k]}'`).join(" ");
|
|
178
184
|
return `<cite ${rebuiltAttrs} />`;
|
|
179
185
|
};
|
|
180
|
-
normalized = normalized.replace(/<cite\b[\s\S]*?\/>/gm, tag => reorderCiteTagAttributes(tag));
|
|
186
|
+
normalized = normalized.replace(/<cite\b[\s\S]*?\/>/gm, (tag) => reorderCiteTagAttributes(tag));
|
|
181
187
|
return normalized;
|
|
182
188
|
};
|
|
@@ -16,8 +16,13 @@ export function getCitationStatus(foundHighlight) {
|
|
|
16
16
|
searchState?.status === "found_on_other_page" ||
|
|
17
17
|
searchState?.status === "found_on_other_line" ||
|
|
18
18
|
searchState?.status === "first_word_found";
|
|
19
|
-
const isVerified = searchState?.status === "found" ||
|
|
20
|
-
|
|
19
|
+
const isVerified = searchState?.status === "found" ||
|
|
20
|
+
isFoundValueMissedFullMatch ||
|
|
21
|
+
isPartialMatch ||
|
|
22
|
+
isFullMatchWithMissedValue;
|
|
23
|
+
const isPending = searchState?.status === "pending" ||
|
|
24
|
+
searchState?.status === "loading" ||
|
|
25
|
+
!searchState;
|
|
21
26
|
return { isVerified, isMiss, isPartialMatch, isPending };
|
|
22
27
|
}
|
|
23
28
|
export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVerbose) => {
|
|
@@ -30,19 +35,24 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
|
|
|
30
35
|
// Replace escaped single quotes with actual single quotes
|
|
31
36
|
return trimmed.replace(/\\'/g, "'");
|
|
32
37
|
};
|
|
33
|
-
const citationNumber = citationCounterRef?.current
|
|
38
|
+
const citationNumber = citationCounterRef?.current
|
|
39
|
+
? citationCounterRef.current++
|
|
40
|
+
: undefined;
|
|
34
41
|
const beforeCite = fragment.substring(0, fragment.indexOf("<cite"));
|
|
35
|
-
const afterCite = fragment.includes("/>")
|
|
42
|
+
const afterCite = fragment.includes("/>")
|
|
43
|
+
? fragment.slice(fragment.indexOf("/>") + 2)
|
|
44
|
+
: "";
|
|
36
45
|
const middleCite = fragment.substring(fragment.indexOf("<cite"), fragment.indexOf("/>") + 2);
|
|
37
46
|
// GROUPS:
|
|
38
47
|
// 1: fileId
|
|
39
48
|
// 2: start_page number
|
|
40
49
|
// 3: index number
|
|
41
50
|
// 4: full_phrase content (escaped)
|
|
42
|
-
// 5:
|
|
51
|
+
// 5: key_span content (escaped)
|
|
52
|
+
// 6: line_ids content
|
|
43
53
|
// 7: Optional Key (value|reasoning)
|
|
44
54
|
// 8: Optional Value content (escaped)
|
|
45
|
-
const citationRegex = /<cite\s+file(?:_id|Id)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
|
|
55
|
+
const citationRegex = /<cite\s+file(?:_id|Id)='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+key_span='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
|
|
46
56
|
const citationMatches = [...middleCite.matchAll(citationRegex)];
|
|
47
57
|
const match = citationMatches?.[0];
|
|
48
58
|
const rawCitationMd = match?.[0];
|
|
@@ -51,11 +61,12 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
|
|
|
51
61
|
let attachmentId = fileId?.length === 20 ? fileId : mdAttachmentId || match?.[1];
|
|
52
62
|
// Use helper to handle escaped quotes inside the phrase
|
|
53
63
|
let fullPhrase = cleanAndUnescape(match?.[4]);
|
|
64
|
+
let keySpan = cleanAndUnescape(match?.[5]);
|
|
54
65
|
// Handle the optional attribute (value or reasoning)
|
|
55
66
|
let value;
|
|
56
67
|
let reasoning;
|
|
57
|
-
const optionalKey = match?.[
|
|
58
|
-
const optionalContent = cleanAndUnescape(match?.[
|
|
68
|
+
const optionalKey = match?.[7]; // "value" or "reasoning"
|
|
69
|
+
const optionalContent = cleanAndUnescape(match?.[8]);
|
|
59
70
|
if (optionalKey === "value") {
|
|
60
71
|
value = optionalContent;
|
|
61
72
|
}
|
|
@@ -65,12 +76,12 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
|
|
|
65
76
|
let lineIds;
|
|
66
77
|
try {
|
|
67
78
|
// match[6] is line_ids (match[5] is key_span)
|
|
68
|
-
const lineIdsString = match?.[
|
|
79
|
+
const lineIdsString = match?.[6]?.replace(/[A-Za-z_[\](){}:]/g, "");
|
|
69
80
|
lineIds = lineIdsString
|
|
70
81
|
? lineIdsString
|
|
71
82
|
.split(",")
|
|
72
|
-
.map(id => (isNaN(parseInt(id)) ? undefined : parseInt(id)))
|
|
73
|
-
.filter(id => id !== undefined)
|
|
83
|
+
.map((id) => (isNaN(parseInt(id)) ? undefined : parseInt(id)))
|
|
84
|
+
.filter((id) => id !== undefined)
|
|
74
85
|
.sort((a, b) => a - b)
|
|
75
86
|
: undefined;
|
|
76
87
|
}
|
|
@@ -90,7 +101,8 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
|
|
|
90
101
|
let timestamps;
|
|
91
102
|
if (avMatch) {
|
|
92
103
|
fileId = avMatch?.[1];
|
|
93
|
-
attachmentId =
|
|
104
|
+
attachmentId =
|
|
105
|
+
fileId?.length === 20 ? fileId : mdAttachmentId || avMatch?.[1];
|
|
94
106
|
fullPhrase = cleanAndUnescape(avMatch?.[2]);
|
|
95
107
|
const timestampsString = avMatch?.[3]?.replace(/timestamps=['"]|['"]/g, "");
|
|
96
108
|
const [startTime, endTime] = timestampsString?.split("-") || [];
|
|
@@ -110,6 +122,7 @@ export const parseCitation = (fragment, mdAttachmentId, citationCounterRef, isVe
|
|
|
110
122
|
fileId: attachmentId,
|
|
111
123
|
pageNumber,
|
|
112
124
|
fullPhrase,
|
|
125
|
+
keySpan,
|
|
113
126
|
citationNumber,
|
|
114
127
|
lineIds,
|
|
115
128
|
rawCitationMd,
|
|
@@ -139,6 +152,7 @@ const parseJsonCitation = (jsonCitation, citationNumber) => {
|
|
|
139
152
|
// Support both camelCase and snake_case property names
|
|
140
153
|
const fullPhrase = jsonCitation.fullPhrase ?? jsonCitation.full_phrase;
|
|
141
154
|
const startPageKey = jsonCitation.startPageKey ?? jsonCitation.start_page_key;
|
|
155
|
+
const keySpan = jsonCitation.keySpan ?? jsonCitation.key_span;
|
|
142
156
|
const rawLineIds = jsonCitation.lineIds ?? jsonCitation.line_ids;
|
|
143
157
|
const fileId = jsonCitation.fileId ?? jsonCitation.file_id;
|
|
144
158
|
const reasoning = jsonCitation.reasoning;
|
|
@@ -155,13 +169,16 @@ const parseJsonCitation = (jsonCitation, citationNumber) => {
|
|
|
155
169
|
}
|
|
156
170
|
}
|
|
157
171
|
// Sort lineIds if present
|
|
158
|
-
const lineIds = rawLineIds?.length
|
|
172
|
+
const lineIds = rawLineIds?.length
|
|
173
|
+
? [...rawLineIds].sort((a, b) => a - b)
|
|
174
|
+
: undefined;
|
|
159
175
|
const citation = {
|
|
160
176
|
fileId,
|
|
161
177
|
pageNumber,
|
|
162
178
|
fullPhrase,
|
|
163
179
|
citationNumber,
|
|
164
180
|
lineIds,
|
|
181
|
+
keySpan,
|
|
165
182
|
reasoning,
|
|
166
183
|
value,
|
|
167
184
|
};
|
|
@@ -176,6 +193,8 @@ const hasCitationProperties = (item) => typeof item === "object" &&
|
|
|
176
193
|
"full_phrase" in item ||
|
|
177
194
|
"startPageKey" in item ||
|
|
178
195
|
"start_page_key" in item ||
|
|
196
|
+
"keySpan" in item ||
|
|
197
|
+
"key_span" in item ||
|
|
179
198
|
"lineIds" in item ||
|
|
180
199
|
"line_ids" in item);
|
|
181
200
|
/**
|
|
@@ -220,7 +239,9 @@ const findJsonCitationsInObject = (obj, found) => {
|
|
|
220
239
|
found.push(...items);
|
|
221
240
|
}
|
|
222
241
|
if (obj.citations && isJsonCitationFormat(obj.citations)) {
|
|
223
|
-
const items = Array.isArray(obj.citations)
|
|
242
|
+
const items = Array.isArray(obj.citations)
|
|
243
|
+
? obj.citations
|
|
244
|
+
: [obj.citations];
|
|
224
245
|
found.push(...items);
|
|
225
246
|
}
|
|
226
247
|
// Recurse into object properties
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
export declare const CITATION_MARKDOWN_SYNTAX_PROMPT = "\nCitation syntax to use within Markdown:\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite file_id='file_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <file_text
|
|
2
|
-
export declare const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = "\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite file_id='file_id' full_phrase='the verbatim text of the phrase
|
|
1
|
+
export declare const CITATION_MARKDOWN_SYNTAX_PROMPT = "\nCitation syntax to use within Markdown:\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite file_id='file_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <file_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim value or words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />\n\n\u2022 Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.\n\u2022 start_page_key, full_phrase, and line_ids are required for each citation.\n\u2022 Infer line_ids, as we only provide the first, last, and every 5th line. When copying a previous <cite />, use the full info from the previous citation without changing the start_page_key, line_ids, or any other <cite /> attributes.\n\u2022 Use refer to line_ids inclusively, and use a range (or single) for each citation, split multiple sequential line_ids into multiple citations.\n\u2022 These citations will be replaced and displayed in-line as a numeric element (e.g. [1]), the markdown preceding <cite /> should read naturally with only one <cite /> per sentence with rare exceptions for two <cite /> in a sentence. <cite /> often present best at the end of the sentence, and are not grouped at the end of the document.\n\u2022 The full_phrase should be the exact verbatim text of the phrase or paragraph from the source document to support the insight or idea.\n\u2022 We do NOT put the full_phrase inside <cite ...></cite>; we only use full_phrase inside the full_phrase attribute.\n";
|
|
2
|
+
export declare const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = "\n\u2022 To support any ideas or information that requires a citation from the provided content, use the following citation syntax:\n<cite file_id='file_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />\n\u2022 These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.\n";
|
|
3
3
|
export interface WrapSystemPromptOptions {
|
|
4
4
|
/** The original system prompt to wrap with citation instructions */
|
|
5
5
|
systemPrompt: string;
|
|
@@ -13,7 +13,7 @@ export interface WrapCitationPromptOptions {
|
|
|
13
13
|
/** The original user prompt */
|
|
14
14
|
userPrompt: string;
|
|
15
15
|
/** The extracted file text with metadata (from uploadFile response). Can be a single string or array for multiple files. */
|
|
16
|
-
|
|
16
|
+
deepTextPromptPortion?: string | string[];
|
|
17
17
|
/** Whether to use audio/video citation format (with timestamps) instead of text-based (with line IDs) */
|
|
18
18
|
isAudioVideo?: boolean;
|
|
19
19
|
}
|
|
@@ -54,14 +54,14 @@ export declare function wrapSystemCitationPrompt(options: WrapSystemPromptOption
|
|
|
54
54
|
* const { enhancedSystemPrompt, enhancedUserPrompt } = wrapCitationPrompt({
|
|
55
55
|
* systemPrompt: "You are a helpful assistant.",
|
|
56
56
|
* userPrompt: "Analyze this document and summarize it.",
|
|
57
|
-
*
|
|
57
|
+
* deepTextPromptPortion, // from uploadFile response
|
|
58
58
|
* });
|
|
59
59
|
*
|
|
60
60
|
* // Multiple files
|
|
61
61
|
* const { enhancedSystemPrompt, enhancedUserPrompt } = wrapCitationPrompt({
|
|
62
62
|
* systemPrompt: "You are a helpful assistant.",
|
|
63
63
|
* userPrompt: "Compare these documents.",
|
|
64
|
-
*
|
|
64
|
+
* deepTextPromptPortion: [deepTextPromptPortion1, deepTextPromptPortion2], // array of file texts
|
|
65
65
|
* });
|
|
66
66
|
*
|
|
67
67
|
* // Use enhanced prompts with your LLM
|
|
@@ -92,6 +92,10 @@ export declare const CITATION_JSON_OUTPUT_FORMAT: {
|
|
|
92
92
|
type: string;
|
|
93
93
|
description: string;
|
|
94
94
|
};
|
|
95
|
+
keySpan: {
|
|
96
|
+
type: string;
|
|
97
|
+
description: string;
|
|
98
|
+
};
|
|
95
99
|
lineIds: {
|
|
96
100
|
type: string;
|
|
97
101
|
items: {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
export const CITATION_MARKDOWN_SYNTAX_PROMPT = `
|
|
2
2
|
Citation syntax to use within Markdown:
|
|
3
3
|
• To support any ideas or information that requires a citation from the provided content, use the following citation syntax:
|
|
4
|
-
<cite file_id='file_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <file_text
|
|
4
|
+
<cite file_id='file_id' start_page_key='page_number_PAGE_index_INDEX' full_phrase='the verbatim text of the terse phrase inside <file_text />; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' key_span='the verbatim value or words within full_phrase that best support the citation' line_ids='2-6' reasoning='the terse logic used to conclude the citation' />
|
|
5
5
|
|
|
6
6
|
• Very important: for page numbers, only use the page number and page index info from the page_number_PAGE_index_INDEX format (e.g. <page_number_1_index_0>) and never from the contents inside the page.
|
|
7
7
|
• start_page_key, full_phrase, and line_ids are required for each citation.
|
|
@@ -13,7 +13,7 @@ Citation syntax to use within Markdown:
|
|
|
13
13
|
`;
|
|
14
14
|
export const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = `
|
|
15
15
|
• To support any ideas or information that requires a citation from the provided content, use the following citation syntax:
|
|
16
|
-
<cite file_id='file_id' full_phrase='the verbatim text of the phrase
|
|
16
|
+
<cite file_id='file_id' full_phrase='the verbatim text of the phrase; remember to escape quotes and newlines inside the full_phrase to remain as valid JSON' timestamps='HH:MM:SS.SSS-HH:MM:SS.SSS' reasoning='the logic connecting the form section requirements to the supporting source citation' />
|
|
17
17
|
• These citations are displayed in-line or in the relevant list item, and are not grouped at the end of the document.
|
|
18
18
|
`;
|
|
19
19
|
/**
|
|
@@ -35,8 +35,10 @@ export const AV_CITATION_MARKDOWN_SYNTAX_PROMPT = `
|
|
|
35
35
|
* ```
|
|
36
36
|
*/
|
|
37
37
|
export function wrapSystemCitationPrompt(options) {
|
|
38
|
-
const { systemPrompt, isAudioVideo = false, prependCitationInstructions = false } = options;
|
|
39
|
-
const citationPrompt = isAudioVideo
|
|
38
|
+
const { systemPrompt, isAudioVideo = false, prependCitationInstructions = false, } = options;
|
|
39
|
+
const citationPrompt = isAudioVideo
|
|
40
|
+
? AV_CITATION_MARKDOWN_SYNTAX_PROMPT
|
|
41
|
+
: CITATION_MARKDOWN_SYNTAX_PROMPT;
|
|
40
42
|
if (prependCitationInstructions) {
|
|
41
43
|
return `${citationPrompt.trim()}
|
|
42
44
|
|
|
@@ -59,14 +61,14 @@ ${citationPrompt.trim()}`;
|
|
|
59
61
|
* const { enhancedSystemPrompt, enhancedUserPrompt } = wrapCitationPrompt({
|
|
60
62
|
* systemPrompt: "You are a helpful assistant.",
|
|
61
63
|
* userPrompt: "Analyze this document and summarize it.",
|
|
62
|
-
*
|
|
64
|
+
* deepTextPromptPortion, // from uploadFile response
|
|
63
65
|
* });
|
|
64
66
|
*
|
|
65
67
|
* // Multiple files
|
|
66
68
|
* const { enhancedSystemPrompt, enhancedUserPrompt } = wrapCitationPrompt({
|
|
67
69
|
* systemPrompt: "You are a helpful assistant.",
|
|
68
70
|
* userPrompt: "Compare these documents.",
|
|
69
|
-
*
|
|
71
|
+
* deepTextPromptPortion: [deepTextPromptPortion1, deepTextPromptPortion2], // array of file texts
|
|
70
72
|
* });
|
|
71
73
|
*
|
|
72
74
|
* // Use enhanced prompts with your LLM
|
|
@@ -79,21 +81,23 @@ ${citationPrompt.trim()}`;
|
|
|
79
81
|
* ```
|
|
80
82
|
*/
|
|
81
83
|
export function wrapCitationPrompt(options) {
|
|
82
|
-
const { systemPrompt, userPrompt,
|
|
84
|
+
const { systemPrompt, userPrompt, deepTextPromptPortion, isAudioVideo = false, } = options;
|
|
83
85
|
const enhancedSystemPrompt = wrapSystemCitationPrompt({
|
|
84
86
|
systemPrompt,
|
|
85
87
|
isAudioVideo,
|
|
86
88
|
});
|
|
87
89
|
// Build enhanced user prompt with file content if provided
|
|
88
90
|
let enhancedUserPrompt = userPrompt;
|
|
89
|
-
if (
|
|
90
|
-
const fileTexts = Array.isArray(
|
|
91
|
+
if (deepTextPromptPortion) {
|
|
92
|
+
const fileTexts = Array.isArray(deepTextPromptPortion)
|
|
93
|
+
? deepTextPromptPortion
|
|
94
|
+
: [deepTextPromptPortion];
|
|
91
95
|
const fileContent = fileTexts
|
|
92
96
|
.map((text, index) => {
|
|
93
97
|
if (fileTexts.length === 1) {
|
|
94
|
-
return
|
|
98
|
+
return `\n${text}`;
|
|
95
99
|
}
|
|
96
|
-
return
|
|
100
|
+
return `\n${text}`;
|
|
97
101
|
})
|
|
98
102
|
.join("\n\n");
|
|
99
103
|
enhancedUserPrompt = `${fileContent}\n\n${userPrompt}`;
|
|
@@ -119,13 +123,24 @@ export const CITATION_JSON_OUTPUT_FORMAT = {
|
|
|
119
123
|
type: "string",
|
|
120
124
|
description: "The verbatim text of the terse phrase inside <file_text /> to support the value description (if there is a detected OCR correction, use the corrected text)",
|
|
121
125
|
},
|
|
126
|
+
keySpan: {
|
|
127
|
+
type: "string",
|
|
128
|
+
description: "the verbatim value or words within fullPhrase that best support the citation",
|
|
129
|
+
},
|
|
122
130
|
lineIds: {
|
|
123
131
|
type: "array",
|
|
124
132
|
items: { type: "number" },
|
|
125
133
|
description: "Infer lineIds, as we only provide the first, last, and every 5th line. Provide inclusive lineIds for the fullPhrase.",
|
|
126
134
|
},
|
|
127
135
|
},
|
|
128
|
-
required: [
|
|
136
|
+
required: [
|
|
137
|
+
"fileId",
|
|
138
|
+
"startPageKey",
|
|
139
|
+
"reasoning",
|
|
140
|
+
"fullPhrase",
|
|
141
|
+
"keySpan",
|
|
142
|
+
"lineIds",
|
|
143
|
+
],
|
|
129
144
|
};
|
|
130
145
|
export const CITATION_AV_BASED_JSON_OUTPUT_FORMAT = {
|
|
131
146
|
type: "object",
|
package/lib/types/citation.d.ts
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { type ScreenBox } from "./boxes";
|
|
2
2
|
import { type FoundHighlightLocation } from "./foundHighlight";
|
|
3
|
-
export declare const VERIFICATION_VERSION_NUMBER = "0.4.37";
|
|
4
3
|
export type OutputImageFormat = "jpeg" | "png" | "avif" | undefined | null;
|
|
5
4
|
export declare const DEFAULT_OUTPUT_IMAGE_FORMAT: "avif";
|
|
6
5
|
export interface VerifyCitationResponse {
|
|
@@ -19,6 +18,7 @@ export interface VerifyCitationRequest {
|
|
|
19
18
|
export interface Citation {
|
|
20
19
|
fileId?: string;
|
|
21
20
|
fullPhrase?: string | null;
|
|
21
|
+
keySpan?: string | null;
|
|
22
22
|
value?: string | null;
|
|
23
23
|
startPageKey?: string | null;
|
|
24
24
|
pageNumber?: number | null;
|
|
@@ -33,8 +33,6 @@ export interface Citation {
|
|
|
33
33
|
fragmentContext?: string | null;
|
|
34
34
|
rawCitationMd?: string;
|
|
35
35
|
beforeCite?: string;
|
|
36
|
-
formFieldName?: string | null;
|
|
37
|
-
formFieldValue?: string | null;
|
|
38
36
|
}
|
|
39
37
|
export interface CitationStatus {
|
|
40
38
|
isVerified: boolean;
|
package/lib/types/citation.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { type Citation } from "./citation";
|
|
2
2
|
import { type SearchState } from "./search";
|
|
3
3
|
import { type PdfSpaceItem } from "./boxes";
|
|
4
4
|
export declare const NOT_FOUND_HIGHLIGHT_INDEX = -1;
|
|
@@ -18,6 +18,6 @@ export interface FoundHighlightLocation {
|
|
|
18
18
|
matchSnippet?: string | null;
|
|
19
19
|
pdfSpaceItem?: PdfSpaceItem;
|
|
20
20
|
verificationImageBase64?: string | null;
|
|
21
|
-
source?:
|
|
21
|
+
source?: string | null;
|
|
22
22
|
verifiedAt?: Date;
|
|
23
23
|
}
|
package/lib/types/index.d.ts
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* @packageDocumentation
|
|
5
5
|
*/
|
|
6
6
|
export type { Citation, CitationStatus, VerifyCitationRequest, VerifyCitationResponse, OutputImageFormat, } from "./citation.js";
|
|
7
|
-
export {
|
|
7
|
+
export { DEFAULT_OUTPUT_IMAGE_FORMAT } from "./citation.js";
|
|
8
8
|
export type { FoundHighlightLocation } from "./foundHighlight.js";
|
|
9
9
|
export { NOT_FOUND_HIGHLIGHT_INDEX, PENDING_HIGHLIGHT_INDEX, BLANK_HIGHLIGHT_LOCATION, deterministicIdFromHighlightLocation, } from "./foundHighlight.js";
|
|
10
10
|
export type { SearchState, SearchStatus } from "./search.js";
|
package/lib/types/index.js
CHANGED
|
@@ -3,5 +3,5 @@
|
|
|
3
3
|
*
|
|
4
4
|
* @packageDocumentation
|
|
5
5
|
*/
|
|
6
|
-
export {
|
|
6
|
+
export { DEFAULT_OUTPUT_IMAGE_FORMAT } from "./citation.js";
|
|
7
7
|
export { NOT_FOUND_HIGHLIGHT_INDEX, PENDING_HIGHLIGHT_INDEX, BLANK_HIGHLIGHT_LOCATION, deterministicIdFromHighlightLocation, } from "./foundHighlight.js";
|