@deepcitation/deepcitation-js 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +71 -1197
- package/lib/client/DeepCitation.d.ts +204 -0
- package/lib/client/DeepCitation.js +473 -0
- package/lib/client/index.d.ts +2 -0
- package/lib/client/index.js +1 -0
- package/lib/client/types.d.ts +157 -0
- package/lib/client/types.js +1 -0
- package/lib/index.d.ts +25 -0
- package/lib/index.js +22 -0
- package/lib/parsing/normalizeCitation.d.ts +5 -0
- package/lib/parsing/normalizeCitation.js +182 -0
- package/lib/parsing/parseCitation.d.ts +79 -0
- package/lib/parsing/parseCitation.js +371 -0
- package/lib/parsing/parseWorkAround.d.ts +2 -0
- package/lib/parsing/parseWorkAround.js +73 -0
- package/lib/prompts/citationPrompts.d.ts +133 -0
- package/lib/prompts/citationPrompts.js +152 -0
- package/lib/prompts/index.d.ts +3 -0
- package/lib/prompts/index.js +3 -0
- package/lib/prompts/promptCompression.d.ts +14 -0
- package/lib/prompts/promptCompression.js +109 -0
- package/lib/prompts/types.d.ts +4 -0
- package/lib/prompts/types.js +1 -0
- package/lib/react/CitationComponent.d.ts +134 -0
- package/lib/react/CitationComponent.js +376 -0
- package/lib/react/CitationVariants.d.ts +135 -0
- package/lib/react/CitationVariants.js +283 -0
- package/lib/react/DiffDisplay.d.ts +10 -0
- package/lib/react/DiffDisplay.js +33 -0
- package/lib/react/UrlCitationComponent.d.ts +83 -0
- package/lib/react/UrlCitationComponent.js +224 -0
- package/lib/react/VerificationTabs.d.ts +10 -0
- package/lib/react/VerificationTabs.js +36 -0
- package/lib/react/icons.d.ts +8 -0
- package/lib/react/icons.js +9 -0
- package/lib/react/index.d.ts +16 -0
- package/lib/react/index.js +18 -0
- package/lib/react/primitives.d.ts +104 -0
- package/lib/react/primitives.js +190 -0
- package/lib/react/types.d.ts +192 -0
- package/lib/react/types.js +1 -0
- package/lib/react/useSmartDiff.d.ts +16 -0
- package/lib/react/useSmartDiff.js +64 -0
- package/lib/react/utils.d.ts +34 -0
- package/lib/react/utils.js +59 -0
- package/lib/types/boxes.d.ts +11 -0
- package/lib/types/boxes.js +1 -0
- package/lib/types/citation.d.ts +44 -0
- package/lib/types/citation.js +2 -0
- package/lib/types/foundHighlight.d.ts +23 -0
- package/lib/types/foundHighlight.js +22 -0
- package/lib/types/index.d.ts +11 -0
- package/lib/types/index.js +7 -0
- package/lib/types/search.d.ts +30 -0
- package/lib/types/search.js +1 -0
- package/lib/utils/sha.d.ts +10 -0
- package/lib/utils/sha.js +108 -0
- package/package.json +6 -5
- /package/{src → lib}/react/styles.css +0 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import type { Citation, FoundHighlightLocation } from "../types/index";
|
|
2
|
+
/**
 * Settings accepted when constructing the DeepCitation client.
 */
export interface DeepCitationConfig {
  /** DeepCitation API key; keys start with `dc_live_` or `dc_test_`. */
  apiKey: string;

  /** Custom API base URL. When omitted, https://api.deepcitation.com is used. */
  apiUrl?: string;
}
|
|
11
|
+
/**
 * Result returned after a file has been uploaded for citation verification.
 */
export interface UploadFileResponse {
  /** File ID assigned by DeepCitation (either the custom one or an auto-generated one). */
  fileId: string;

  /**
   * Full text of the file, formatted for LLM prompts with page markers and line IDs.
   * Use this in your user prompts.
   */
  fileDeepText: string;

  /** Form fields extracted from PDF forms. */
  formFields?: {
    name: string;
    value?: string;
    pageIndex?: number;
    type?: string;
  }[];

  /** Metadata describing the processed file. */
  metadata: {
    filename: string;
    mimeType: string;
    pageCount: number;
    textByteSize: number;
  };

  /** Processing status. */
  status: "ready" | "error";

  /** How long processing took, in milliseconds. */
  processingTimeMs?: number;

  /** Error message, present when `status` is "error". */
  error?: string;
}
|
|
40
|
+
/**
 * Optional settings for a file upload.
 */
export interface UploadFileOptions {
  /** Custom file ID to use in place of the auto-generated one. */
  fileId?: string;

  /** Custom filename; `File.name` is used when this is not provided. */
  filename?: string;
}
|
|
49
|
+
/**
 * Result returned by a citation verification request.
 */
export interface VerifyCitationsResponse {
  /** Verification results, keyed by citation key. */
  foundHighlights: Record<string, FoundHighlightLocation>;
}
|
|
56
|
+
/**
 * Optional settings for citation verification.
 */
export interface VerifyCitationsOptions {
  /** Image format to use for the verification screenshots. */
  outputImageFormat?: "jpeg" | "png" | "avif";
}
|
|
63
|
+
/**
 * Citation input accepted for verification: either a single citation or a
 * dictionary of citations keyed by string.
 */
export type CitationInput = Citation | Record<string, Citation>;
|
|
67
|
+
/**
 * One file to upload through prepareFiles.
 */
export interface FileInput {
  /** File content, given as a File, Blob, or Buffer. */
  file: File | Blob | Buffer;

  /** Filename to use for the upload (optional). */
  filename?: string;

  /** Custom file ID (optional). */
  fileId?: string;
}
|
|
78
|
+
/**
 * Reference to an uploaded file, as returned from prepareFiles.
 */
export interface FileDataPart {
  /** File ID assigned by DeepCitation. */
  fileId: string;
}
|
|
85
|
+
/**
 * Value returned by prepareFiles.
 */
export interface PrepareFilesResult {
  /** File references to pass along for verification. */
  fileDataParts: FileDataPart[];

  /** Formatted text content for LLM prompts (includes page markers and line IDs). */
  fileDeepTexts: string[];
}
|
|
94
|
+
/**
 * Arguments for verifyCitationsFromLlmOutput.
 */
export interface VerifyCitationsFromLlmOutputInput {
  /** LLM response text that contains the citations. */
  llmOutput: string;

  /**
   * File references. Optional in general, but required for Zero Data Retention
   * or once storage has expired.
   */
  fileDataParts?: FileDataPart[];

  /** Image format to use for the verification screenshots. */
  outputImageFormat?: "jpeg" | "png" | "avif";
}
|
|
105
|
+
/**
 * Arguments for convertFile, which converts a URL or an Office file to PDF.
 */
export interface ConvertFileInput {
  /** URL to convert to PDF (a web page or a direct PDF link). */
  url?: string;

  /** Office file to convert (doc, docx, xls, xlsx, ppt, pptx, odt, ods, odp). */
  file?: File | Blob | Buffer;

  /** Custom filename for the converted PDF (optional). */
  filename?: string;

  /** Custom file ID (optional). */
  fileId?: string;

  /** URL input only: render one long page rather than a paginated document. */
  singlePage?: boolean;
}
|
|
120
|
+
/**
 * Result returned by convertFile.
 */
export interface ConvertFileResponse {
  /** File ID assigned by DeepCitation. Pass this to prepareConvertedFile(). */
  fileId: string;

  /** Metadata describing the conversion. */
  metadata: {
    /** Filename of the input before conversion. */
    originalFilename: string;

    /** MIME type of the input before conversion. */
    originalMimeType: string;

    /** MIME type after conversion (always application/pdf). */
    convertedMimeType: string;

    /** How long the conversion took, in milliseconds. */
    conversionTimeMs: number;
  };

  /** Conversion status. */
  status: "converted" | "error";

  /** Error message, present when `status` is "error". */
  error?: string;
}
|
|
142
|
+
/**
 * Settings for processing a file that was previously converted.
 */
export interface PrepareConvertedFileOptions {
  /** File ID obtained from an earlier convertFile call. */
  fileId: string;
}
|
|
149
|
+
/**
 * Legacy options shape that identifies a converted file by attachment ID.
 *
 * @deprecated Use PrepareConvertedFileOptions instead
 */
export interface PrepareFileFromAttachmentOptions {
  /** Attachment ID obtained from an earlier convertFile call. */
  attachmentId: string;

  /** Custom file ID (optional). */
  fileId?: string;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/lib/index.d.ts
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DeepCitation - Citation parsing, verification, and rendering library
|
|
3
|
+
* @packageDocumentation
|
|
4
|
+
*/
|
|
5
|
+
export { DeepCitation } from "./client/index.js";
|
|
6
|
+
export type { DeepCitationConfig, UploadFileResponse, UploadFileOptions, VerifyCitationsResponse, VerifyCitationsOptions, CitationInput, FileInput, FileDataPart, PrepareFilesResult, VerifyCitationsFromLlmOutputInput, } from "./client/index.js";
|
|
7
|
+
export { parseCitation, getCitationStatus, getAllCitationsFromLlmOutput, groupCitationsByFileId, groupCitationsByFileIdObject, } from "./parsing/parseCitation.js";
|
|
8
|
+
export { normalizeCitations, getCitationPageNumber } from "./parsing/normalizeCitation.js";
|
|
9
|
+
export { isGeminiGarbage, cleanRepeatingLastSentence } from "./parsing/parseWorkAround.js";
|
|
10
|
+
export type { Citation, CitationStatus, VerifyCitationRequest, VerifyCitationResponse, OutputImageFormat, } from "./types/citation.js";
|
|
11
|
+
export { VERIFICATION_VERSION_NUMBER, DEFAULT_OUTPUT_IMAGE_FORMAT } from "./types/citation.js";
|
|
12
|
+
export type { FoundHighlightLocation } from "./types/foundHighlight.js";
|
|
13
|
+
export { NOT_FOUND_HIGHLIGHT_INDEX, PENDING_HIGHLIGHT_INDEX, BLANK_HIGHLIGHT_LOCATION, deterministicIdFromHighlightLocation, } from "./types/foundHighlight.js";
|
|
14
|
+
export type { SearchState, SearchStatus, SearchMethod, SearchAttempt } from "./types/search.js";
|
|
15
|
+
export type { ScreenBox, PdfSpaceItem, IVertex } from "./types/boxes.js";
|
|
16
|
+
export { sha1Hash } from "./utils/sha.js";
|
|
17
|
+
export { generateCitationKey } from "./react/utils.js";
|
|
18
|
+
export { generateCitationInstanceId } from "./react/utils.js";
|
|
19
|
+
export { CITATION_X_PADDING, CITATION_Y_PADDING } from "./react/utils.js";
|
|
20
|
+
export { CITATION_JSON_OUTPUT_FORMAT, CITATION_MARKDOWN_SYNTAX_PROMPT, AV_CITATION_MARKDOWN_SYNTAX_PROMPT, CITATION_AV_BASED_JSON_OUTPUT_FORMAT, wrapSystemCitationPrompt, wrapCitationPrompt, } from "./prompts/citationPrompts.js";
|
|
21
|
+
export type { WrapSystemPromptOptions, WrapCitationPromptOptions, WrapCitationPromptResult, } from "./prompts/citationPrompts.js";
|
|
22
|
+
export { removeLineIdMetadata, removePageNumberMetadata, removeCitations } from "./parsing/normalizeCitation.js";
|
|
23
|
+
export { compressPromptIds, decompressPromptIds } from "./prompts/promptCompression.js";
|
|
24
|
+
export type { CompressedResult } from "./prompts/types.js";
|
|
25
|
+
export { CitationComponent } from "./react/CitationComponent.js";
|
package/lib/index.js
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DeepCitation - Citation parsing, verification, and rendering library
|
|
3
|
+
* @packageDocumentation
|
|
4
|
+
*/
|
|
5
|
+
// Client
|
|
6
|
+
export { DeepCitation } from "./client/index.js";
|
|
7
|
+
// Parsing
|
|
8
|
+
export { parseCitation, getCitationStatus, getAllCitationsFromLlmOutput, groupCitationsByFileId, groupCitationsByFileIdObject, } from "./parsing/parseCitation.js";
|
|
9
|
+
export { normalizeCitations, getCitationPageNumber } from "./parsing/normalizeCitation.js";
|
|
10
|
+
export { isGeminiGarbage, cleanRepeatingLastSentence } from "./parsing/parseWorkAround.js";
|
|
11
|
+
export { VERIFICATION_VERSION_NUMBER, DEFAULT_OUTPUT_IMAGE_FORMAT } from "./types/citation.js";
|
|
12
|
+
export { NOT_FOUND_HIGHLIGHT_INDEX, PENDING_HIGHLIGHT_INDEX, BLANK_HIGHLIGHT_LOCATION, deterministicIdFromHighlightLocation, } from "./types/foundHighlight.js";
|
|
13
|
+
// Utilities
|
|
14
|
+
export { sha1Hash } from "./utils/sha.js";
|
|
15
|
+
export { generateCitationKey } from "./react/utils.js";
|
|
16
|
+
export { generateCitationInstanceId } from "./react/utils.js";
|
|
17
|
+
export { CITATION_X_PADDING, CITATION_Y_PADDING } from "./react/utils.js";
|
|
18
|
+
// Prompts
|
|
19
|
+
export { CITATION_JSON_OUTPUT_FORMAT, CITATION_MARKDOWN_SYNTAX_PROMPT, AV_CITATION_MARKDOWN_SYNTAX_PROMPT, CITATION_AV_BASED_JSON_OUTPUT_FORMAT, wrapSystemCitationPrompt, wrapCitationPrompt, } from "./prompts/citationPrompts.js";
|
|
20
|
+
export { removeLineIdMetadata, removePageNumberMetadata, removeCitations } from "./parsing/normalizeCitation.js";
|
|
21
|
+
export { compressPromptIds, decompressPromptIds } from "./prompts/promptCompression.js";
|
|
22
|
+
export { CitationComponent } from "./react/CitationComponent.js";
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
/**
 * Removes canonical `<cite ... />` tags from a block of text.
 *
 * @param pageText - Text that may contain `<cite ... />` tags
 * @param leaveValueBehind - When truthy, a matched tag is replaced using its
 *   optional trailing attribute capture (intended: the `value` text) rather
 *   than with an empty string
 * @returns The text with every matched citation tag replaced
 */
export declare const removeCitations: (pageText: string, leaveValueBehind?: boolean) => string;
|
|
2
|
+
/**
 * Removes `<page_number_N_index_M>` and `</page_number_N_index_M>` markers
 * from the text and trims surrounding whitespace.
 *
 * @param pageText - Text containing page-number markers
 * @returns The trimmed text without page-number markers
 */
export declare const removePageNumberMetadata: (pageText: string) => string;
|
|
3
|
+
/**
 * Removes `<line id="...">` opening tags and `</line>` closing tags from the
 * text, keeping the content between them.
 *
 * @param pageText - Text containing line-ID markers
 * @returns The text without line-ID markers
 */
export declare const removeLineIdMetadata: (pageText: string) => string;
|
|
4
|
+
/**
 * Extracts the page number from a start-page key such as
 * `page_number_{page_number}_index_{page_index}`: the first run of digits in
 * the key is parsed as an integer.
 *
 * @param startPageKey - The start-page key; may be `null` or omitted
 * @returns The parsed page number, or `null` when the key is empty or has no digits
 */
export declare const getCitationPageNumber: (startPageKey?: string | null) => number | null;
|
|
5
|
+
/**
 * Normalizes the citation markup found in an LLM response. The response is
 * trimmed, and each `<cite ...>` segment is rewritten into a canonical form
 * (canonical attribute names, single-quoted values, fixed attribute order).
 *
 * @param response - Raw LLM response text
 * @returns The response with its citation tags normalized
 */
export declare const normalizeCitations: (response: string) => string;
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/**
 * Removes canonical `<cite ... />` tags from `pageText`.
 *
 * Only tags in the strict attribute order `fileId`, `start_page*`,
 * `full_phrase`, `line_ids`/`lineIds`, followed by at most one trailing
 * `value` or `reasoning` attribute, are matched.
 *
 * NOTE(review): the pattern requires the `fileId` spelling, while
 * `normalizeCitationContent` in this file emits `file_id`; confirm whether
 * normalized tags are expected to be removable here.
 *
 * @param pageText - Text that may contain citation tags.
 * @param leaveValueBehind - When truthy, a tag carrying a `value` attribute is
 *   replaced by that attribute's text; every other tag is removed entirely.
 * @returns The text with all matched citation tags replaced.
 */
export const removeCitations = (pageText, leaveValueBehind) => {
    const citationRegex = /<cite\s+fileId='(\w{0,25})'\s+start_page[\_a-zA-Z]*='page[\_a-zA-Z]*(\d+)_index_(\d+)'\s+full_phrase='((?:[^'\\]|\\.)*)'\s+line(?:_ids|Ids)='([^']+)'(?:\s+(value|reasoning)='((?:[^'\\]|\\.)*)')?\s*\/>/g;
    // Capture groups: 1 fileId, 2 page number, 3 page index, 4 full phrase,
    // 5 line ids, 6 trailing attribute NAME, 7 trailing attribute CONTENT.
    return pageText.replace(citationRegex, (_match, _fileId, _pageNumber, _index, _fullPhrase, _lineIds, attrName, attrContent) => {
        // Group 6 is only the attribute name; the text to keep is group 7,
        // and only when the trailing attribute really is `value`.
        if (!leaveValueBehind || attrName !== "value" || !attrContent) {
            return "";
        }
        // The content is stored with backslash-escaped quotes; unescape them.
        return attrContent.replace(/\\(['"])/g, "$1");
    });
};
|
|
13
|
+
/**
 * Deletes the `<page_number_N_index_M>` / `</page_number_N_index_M>` wrapper
 * markers from `pageText`, then trims leading and trailing whitespace.
 *
 * @param pageText - Text containing page-number markers.
 * @returns The trimmed text with the markers removed.
 */
export const removePageNumberMetadata = (pageText) => {
    // Opening markers first, then the matching closing markers.
    const withoutOpeners = pageText.replace(/<page_number_\d+_index_\d+>/g, "");
    const withoutClosers = withoutOpeners.replace(/<\/page_number_\d+_index_\d+>/g, "");
    return withoutClosers.trim();
};
|
|
19
|
+
/**
 * Deletes `<line id="...">` opening tags and `</line>` closing tags from
 * `pageText`, leaving the wrapped content (and all whitespace) in place.
 *
 * @param pageText - Text containing line-ID markers.
 * @returns The text with the markers removed.
 */
export const removeLineIdMetadata = (pageText) => pageText.replace(/<line id="[^"]*">|<\/line>/g, "");
|
|
23
|
+
/**
 * Reads the page number out of a start-page key. Expected key shapes are
 * `page_number_{page_number}_index_{page_index}`, `page_number_{page_number}`
 * and `page_key_{page_number}_index_{page_index}`; the first run of digits in
 * the key is taken as the page number.
 *
 * @param startPageKey - The start-page key; may be null, undefined or empty.
 * @returns The page number as an integer, or `null` when the key is missing
 *   or contains no digits.
 */
export const getCitationPageNumber = (startPageKey) => {
    const firstDigits = startPageKey ? startPageKey.match(/\d+/) : null;
    if (firstDigits === null) {
        return null;
    }
    // The match is digits only, so a base-10 parse always yields an integer.
    return parseInt(firstDigits[0], 10);
};
|
|
31
|
+
/**
 * Normalizes the citation markup inside an LLM response.
 *
 * The response is trimmed and split around `<cite ...>` tags (both
 * self-closing `/>` and `</cite>` forms). Each cite segment is passed through
 * `normalizeCitationContent`; the text between tags is left untouched. When
 * the split finds no cite tag, the whole trimmed response is passed through
 * `normalizeCitationContent` instead.
 *
 * @param response - Raw LLM response; a nullish or empty value is treated as "".
 * @returns The response with its citation tags normalized.
 */
export const normalizeCitations = (response) => {
    const text = response?.trim() || "";
    // The capturing group keeps the cite tags themselves in the split output.
    const segments = text.split(/(<cite[\s\S]*?(?:\/>|<\/cite>))/gm);
    if (segments.length <= 1) {
        return normalizeCitationContent(text);
    }
    let rebuilt = "";
    for (const segment of segments) {
        rebuilt += segment.startsWith("<cite") ? normalizeCitationContent(segment) : segment;
    }
    return rebuilt;
};
|
|
42
|
+
/**
 * Rewrites the citation markup found in `input` into a canonical form:
 *
 *  1. `></cite>` endings become self-closing `/>`.
 *  2. Text attributes (`full_phrase`, `reasoning`, `value`) are flattened to
 *     one line, HTML-entity-decoded, stripped of Markdown emphasis markers
 *     and `*`, and re-emitted single-quoted with inner quotes
 *     backslash-escaped.
 *  3. `line_ids` / `timestamps` values are cleaned and numeric ranges such as
 *     `1-3` are expanded to `1,2,3`.
 *  4. Every `<cite ... />` tag is rebuilt with canonical snake_case attribute
 *     names in a fixed order.
 *
 * @param input - A cite tag, or text containing cite-style attributes.
 * @returns The normalized text.
 */
const normalizeCitationContent = (input) => {
    let normalized = input;
    // 1. Standardize self-closing tags
    // Replace ></cite> with /> for consistency
    normalized = normalized.replace(/><\/cite>/g, "/>");
    // Maps the accepted attribute spellings onto their canonical snake_case key.
    const canonicalizeCiteAttributeKey = (key) => {
        if (key === "fullPhrase" || key === "full_phrase")
            return "full_phrase";
        if (key === "lineIds" || key === "line_ids")
            return "line_ids";
        if (key === "startPageKey" || key === "start_pageKey" || key === "start_page_key")
            return "start_page_key";
        if (key === "fileID" || key === "fileId" || key === "file_id")
            return "file_id";
        return key;
    };
    // Named entities handled by decodeHtmlEntities (simple set, expand if needed).
    const HTML_ENTITY_CHARS = { quot: '"', apos: "'", lt: "<", gt: ">", amp: "&" };
    // Decodes the named entities quot, apos, lt, gt and amp in a single pass,
    // so an escaped ampersand followed by "lt;" is not decoded twice.
    const decodeHtmlEntities = (str) => {
        return str.replace(/&(quot|apos|lt|gt|amp);/g, (_entity, name) => HTML_ENTITY_CHARS[name]);
    };
    // 2. ROBUST TEXT ATTRIBUTE PARSING (reasoning, value, full_phrase)
    // This regex matches: Key = Quote -> Content (lazy) -> Lookahead for (Next Attribute OR End of Tag)
    // It effectively ignores quotes inside the content during the initial capture.
    // The lookahead lists every spelling the key group accepts (including
    // `fullPhrase`) so a following camelCase attribute ends the capture.
    const textAttributeRegex = /(fullPhrase|full_phrase|reasoning|value)\s*=\s*(['"])([\s\S]*?)(?=\s+(?:line_ids|lineIds|timestamps|fileId|file_id|start_page_key|start_pageKey|startPageKey|reasoning|value|full_phrase|fullPhrase)|\s*\/?>)/gm;
    normalized = normalized.replace(textAttributeRegex, (_match, key, openQuote, rawContent) => {
        let content = rawContent;
        // The lazy match usually captures the closing quote because the lookahead
        // starts at the space *after* the attribute. We must strip it.
        if (content.endsWith(openQuote)) {
            content = content.slice(0, -1);
        }
        // 1. Normalization: Flatten newlines to spaces
        content = content.replace(/(\r?\n)+/g, " ");
        // 2. Decode entities to get raw text (e.g. the apos entity becomes ')
        content = decodeHtmlEntities(content);
        // 3. Remove Markdown bold/italic markers often hallucinated by LLMs inside attributes
        content = content.replace(/(\*|_){2,}/g, "");
        // 4. Sanitize Quotes:
        // First, unescape existing backslashed quotes to avoid double escaping (e.g. \\' -> ')
        content = content.replace(/\\\\'/g, "'");
        content = content.replace(/\\'/g, "'");
        content = content.replace(/'/g, "\\'");
        content = content.replace(/\\\\"/g, '"');
        content = content.replace(/\\"/g, '"');
        content = content.replace(/"/g, '\\"');
        // 5. Remove * from the content, sometimes a md list will really mess things up here so we remove it
        content = content.replace(/\*/g, ""); //this is a hack to remove the * from the content
        return `${canonicalizeCiteAttributeKey(key)}='${content}'`;
    });
    // 3. ROBUST LINE_ID / TIMESTAMP PARSING
    // Handles unquoted, single quoted, or double quoted numbers/ranges.
    // Can handle line_ids appearing anywhere in the tag, not just at the end.
    normalized = normalized.replace(/(line_ids|lineIds|timestamps)=['"]?([\[\]\(\){}A-Za-z0-9_\-, ]+)['"]?(\s*\/?>|\s+)/gm, (_match, key, rawValue, trailingChars) => {
        // Clean up the value (remove generic text, keep numbers/separators)
        let cleanedValue = rawValue.replace(/[A-Za-z\[\]\(\){}]/g, "");
        // Expand ranges (e.g., "1-3" -> "1,2,3")
        cleanedValue = cleanedValue.replace(/(\d+)-(\d+)/g, (_rangeMatch, start, end) => {
            const startNum = parseInt(start, 10);
            const endNum = parseInt(end, 10);
            const range = [];
            // Handle ascending range
            if (startNum <= endNum) {
                for (let i = startNum; i <= endNum; i++) {
                    range.push(i);
                }
            }
            else {
                // Fallback for weird descending ranges or just return start
                range.push(startNum);
            }
            return range.join(",");
        });
        // Normalize commas
        cleanedValue = cleanedValue.replace(/,+/g, ",").replace(/^,|,$/g, "");
        // Return standardized format: key='value' + preserved trailing characters (space or />)
        return `${canonicalizeCiteAttributeKey(key)}='${cleanedValue}'${trailingChars}`;
    });
    // 4. Re-order <cite ... /> attributes into one canonical order. The original
    // comment states that the citation parser's regexes assume this order.
    const reorderCiteTagAttributes = (tag) => {
        // Match both single-quoted and double-quoted attributes
        const attrRegex = /([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(['"])((?:[^'"\\\n]|\\.)*)(?:\2)/g;
        const attrs = {};
        let match;
        while ((match = attrRegex.exec(tag))) {
            const rawKey = match[1];
            const value = match[3]; // match[2] is the quote character
            const key = canonicalizeCiteAttributeKey(rawKey);
            attrs[key] = value;
        }
        // If we didn't find any parsable attrs, don't touch the tag.
        const keys = Object.keys(attrs);
        if (keys.length === 0)
            return tag;
        const hasTimestamps = typeof attrs.timestamps === "string" && attrs.timestamps.length > 0;
        const startPageKeys = keys.filter(k => k.startsWith("start_page"));
        const ordered = [];
        // Shared first
        if (attrs.file_id)
            ordered.push("file_id");
        if (hasTimestamps) {
            // AV citations: fileId, full_phrase, timestamps, (optional reasoning/value), then any extras
            if (attrs.full_phrase)
                ordered.push("full_phrase");
            ordered.push("timestamps");
        }
        else {
            // Document citations: fileId, start_page*, full_phrase, line_ids, (optional reasoning/value), then any extras
            if (startPageKeys.includes("start_page_key"))
                ordered.push("start_page_key");
            startPageKeys
                .filter(k => k !== "start_page_key")
                .sort()
                .forEach(k => ordered.push(k));
            if (attrs.full_phrase)
                ordered.push("full_phrase");
            if (attrs.line_ids)
                ordered.push("line_ids");
        }
        // Optional attrs supported by the parser (but not required)
        if (attrs.reasoning)
            ordered.push("reasoning");
        if (attrs.value)
            ordered.push("value");
        // Any remaining attributes, stable + deterministic (alpha)
        const used = new Set(ordered);
        keys
            .filter(k => !used.has(k))
            .sort()
            .forEach(k => ordered.push(k));
        const rebuiltAttrs = ordered.map(k => `${k}='${attrs[k]}'`).join(" ");
        return `<cite ${rebuiltAttrs} />`;
    };
    normalized = normalized.replace(/<cite\b[\s\S]*?\/>/gm, tag => reorderCiteTagAttributes(tag));
    return normalized;
};
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import { type FoundHighlightLocation } from "../types/foundHighlight.js";
|
|
2
|
+
import { type Citation, type CitationStatus } from "../types/citation.js";
|
|
3
|
+
/**
 * Works out a citation's verification status from its found highlight and
 * search state.
 *
 * @param foundHighlight - Highlight location found for the citation; pass
 *   `null` or `undefined` when nothing was found
 * @returns An object of boolean flags describing the verification status
 */
export declare function getCitationStatus(
  foundHighlight: FoundHighlightLocation | null | undefined,
): CitationStatus;
|
|
10
|
+
/**
 * Parses a text fragment into a citation plus the text surrounding it.
 *
 * NOTE(review): the implementation is not part of this declaration file; the
 * parameter descriptions below are inferred from names only and should be
 * confirmed against `parseCitation.js`.
 *
 * @param fragment - Text fragment to parse (presumably containing one `<cite ... />` tag)
 * @param mdAttachmentId - Optional attachment ID (presumably a fallback file identifier — confirm)
 * @param citationCounterRef - Optional counter reference (shape not declared here — confirm)
 * @param isVerbose - Optional verbosity flag
 * @returns The text before the citation, the text after it, and the parsed citation
 */
export declare const parseCitation: (
  fragment: string,
  mdAttachmentId?: string | null,
  citationCounterRef?: any | null,
  isVerbose?: boolean,
) => {
  beforeCite: string;
  afterCite: string;
  citation: Citation;
};
|
|
15
|
+
/**
 * Collects every citation present in an LLM output.
 * Both XML `<cite ... />` tags (embedded in strings/markdown) and JSON-based
 * citation formats are supported.
 *
 * When the input is an object:
 * - it is traversed for `citation` or `citations` properties that match the JSON format
 * - it is also stringified so that XML citations embedded in markdown content are found
 *
 * @param llmOutput - The LLM output (string or object)
 * @returns Dictionary of parsed Citation objects keyed by citation key
 */
export declare const getAllCitationsFromLlmOutput: (
  llmOutput: any,
) => Record<string, Citation>;
|
|
29
|
+
/**
 * Buckets citations by `fileId` for multi-file verification, so that the
 * citations belonging to each file can be verified against that file's
 * source document.
 *
 * @param citations - Array of Citation objects or a dictionary of citations
 * @returns Map of fileId to dictionary of citations from that file
 *
 * @example
 * ```typescript
 * const citations = getAllCitationsFromLlmOutput(response.content);
 * const citationsByFile = groupCitationsByFileId(citations);
 *
 * // Verify citations for each file
 * for (const [fileId, fileCitations] of citationsByFile) {
 *   const verified = await dc.verifyCitations(fileId, fileCitations);
 *   // Process verification results...
 * }
 * ```
 */
export declare function groupCitationsByFileId(
  citations: Citation[] | Record<string, Citation>,
): Map<string, Record<string, Citation>>;
|
|
54
|
+
/**
 * Buckets citations by `fileId` and returns the result as a plain object.
 * Use this instead of groupCitationsByFileId when a plain object is preferred
 * over a Map.
 *
 * @param citations - Array of Citation objects or a dictionary of citations
 * @returns Object with fileId keys mapping to citation dictionaries
 *
 * @example
 * ```typescript
 * const citations = getAllCitationsFromLlmOutput(response.content);
 * const citationsByFile = groupCitationsByFileIdObject(citations);
 *
 * // Verify citations for each file using Promise.all
 * const verificationPromises = Object.entries(citationsByFile).map(
 *   ([fileId, fileCitations]) => dc.verifyCitations(fileId, fileCitations)
 * );
 * const results = await Promise.all(verificationPromises);
 * ```
 */
export declare function groupCitationsByFileIdObject(
  citations: Citation[] | Record<string, Citation>,
): Record<string, Record<string, Citation>>;
|