@xevos117/mcp-zotero 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +24 -0
- package/README.md +127 -0
- package/build/citation-injector/citation-formatter.js +30 -0
- package/build/citation-injector/field-codes.js +33 -0
- package/build/citation-injector/injector.js +145 -0
- package/build/citation-injector/xml-utils.js +19 -0
- package/build/citation-injector/zcite-normalizer.js +232 -0
- package/build/server.js +75 -0
- package/build/tools/add-items-by-doi.js +140 -0
- package/build/tools/add-linked-url-attachment.js +92 -0
- package/build/tools/add-web-item.js +97 -0
- package/build/tools/create-collection.js +63 -0
- package/build/tools/find-and-attach-pdfs.js +208 -0
- package/build/tools/get-collection-items.js +84 -0
- package/build/tools/get-collections.js +39 -0
- package/build/tools/get-item-fulltext.js +101 -0
- package/build/tools/get-items-details.js +73 -0
- package/build/tools/get-user-id.js +12 -0
- package/build/tools/import-pdf-to-zotero.js +129 -0
- package/build/tools/index.js +60 -0
- package/build/tools/inject-citations.js +83 -0
- package/build/tools/search-library.js +94 -0
- package/build/types/csl-types.js +1 -0
- package/build/types/zotero-types.js +9 -0
- package/build/utils/concurrency.js +28 -0
- package/build/utils/csl-to-zotero.js +77 -0
- package/build/utils/doi-resolver.js +30 -0
- package/build/utils/error-formatter.js +13 -0
- package/build/utils/fetch-retry.js +34 -0
- package/build/utils/item-formatter.js +9 -0
- package/build/utils/logger.js +14 -0
- package/build/utils/pdf-text-extractor.js +9 -0
- package/build/utils/pdf-uploader.js +230 -0
- package/build/utils/unpaywall.js +116 -0
- package/build/utils/zotero-fulltext.js +22 -0
- package/package.json +58 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Xevos117
|
|
4
|
+
|
|
5
|
+
Based on mcp-zotero by Abhishek Kalia (https://github.com/kaliaboi/mcp-zotero),
|
|
6
|
+
Copyright (c) 2024 Abhishek Kalia, licensed under the MIT License.
|
|
7
|
+
|
|
8
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
9
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
10
|
+
in the Software without restriction, including without limitation the rights
|
|
11
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
12
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
13
|
+
furnished to do so, subject to the following conditions:
|
|
14
|
+
|
|
15
|
+
The above copyright notice and this permission notice shall be included in all
|
|
16
|
+
copies or substantial portions of the Software.
|
|
17
|
+
|
|
18
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
19
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
20
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
21
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
22
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
23
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
24
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# MCP Zotero
|
|
2
|
+
|
|
3
|
+
> **Note:** This is an unofficial community project and is not affiliated with, endorsed by, or supported by the Zotero team or the Corporation for Digital Scholarship. "Zotero" is a registered trademark of the Corporation for Digital Scholarship.
|
|
4
|
+
|
|
5
|
+
A Model Context Protocol server for Zotero integration. It gives any LLM full access to your Zotero library: search, organize, add papers by DOI, import PDFs, read full-text content, and inject live citations into Word documents.
|
|
6
|
+
|
|
7
|
+
> Originally based on [mcp-zotero](https://github.com/kaliaboi/mcp-zotero) by Abhishek Kalia.
|
|
8
|
+
> This project has since been extensively rewritten with a new architecture, 13 tools (up from 5), citation injection, PDF management, and Claude skill support.
|
|
9
|
+
|
|
10
|
+
## How it works
|
|
11
|
+
|
|
12
|
+
The server is designed to be **usable by any LLM without external documentation**. On connection, it sends workflow instructions via the MCP `instructions` field, and each tool description includes cross-references and usage guidance. An LLM that has never seen this server before can discover the full workflow — from adding papers to producing a cited Word document — directly from the tool listing.
|
|
13
|
+
|
|
14
|
+
For advanced use cases (PDF upload policy, citation style guidance, source transparency), a **Claude skill** is included for Claude.ai Projects. But the skill is optional: the MCP server is fully self-documenting.
|
|
15
|
+
|
|
16
|
+
## Local vs Remote LLMs
|
|
17
|
+
|
|
18
|
+
| Scenario | MCP server | Skill needed? |
|
|
19
|
+
|---|---|---|
|
|
20
|
+
| Local LLM (Claude Code, LM Studio, etc.) | All 13 tools | No |
|
|
21
|
+
| Remote/sandboxed LLM (Claude.ai Projects) | API tools (search, add, metadata) | Yes, for citation injection |
|
|
22
|
+
|
|
23
|
+
Local LLMs with filesystem access can use all tools directly, including `inject_citations` which reads and writes `.docx` files on disk.
|
|
24
|
+
|
|
25
|
+
Remote LLMs without filesystem access can use the included **Claude skill** (`skills/zotero-skill-mcp-integrations/`), which runs citation injection entirely inside the sandbox. MCP tools handle all Zotero API operations; the skill handles document assembly.
|
|
26
|
+
|
|
27
|
+
## Setup
|
|
28
|
+
|
|
29
|
+
1. Get your Zotero credentials:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
# Create an API key at https://www.zotero.org/settings/keys
|
|
33
|
+
# (enable library read/write + file access)
|
|
34
|
+
# Then retrieve your user ID:
|
|
35
|
+
curl -H "Zotero-API-Key: YOUR_API_KEY" https://api.zotero.org/keys/current
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
2. Set environment variables:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
export ZOTERO_API_KEY="your-api-key"
|
|
42
|
+
export ZOTERO_USER_ID="user-id-from-curl"
|
|
43
|
+
export UNPAYWALL_EMAIL="your@email.edu" # Optional: enables OA PDF lookup via Unpaywall
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Environment Variables
|
|
47
|
+
|
|
48
|
+
| Variable | Required | Description |
|
|
49
|
+
|---|---|---|
|
|
50
|
+
| `ZOTERO_API_KEY` | Yes | API key for Zotero Web API v3. Create one at [zotero.org/settings/keys](https://www.zotero.org/settings/keys) with library read/write and file access permissions. |
|
|
51
|
+
| `ZOTERO_USER_ID` | Yes | Your Zotero numeric user ID. Retrieve it with `curl -H "Zotero-API-Key: KEY" https://api.zotero.org/keys/current`. |
|
|
52
|
+
| `UNPAYWALL_EMAIL` | No | Email for Unpaywall API requests ([rate-limit policy](https://unpaywall.org/products/api)). Enables OA PDF lookup in `add_items_by_doi` and `find_and_attach_pdfs`. If not set, OA PDF features are silently skipped. |
|
|
53
|
+
|
|
54
|
+
## Integration with Claude Desktop
|
|
55
|
+
|
|
56
|
+
Add to your Claude Desktop configuration:
|
|
57
|
+
|
|
58
|
+
```json
|
|
59
|
+
{
|
|
60
|
+
"mcpServers": {
|
|
61
|
+
"zotero": {
|
|
62
|
+
"command": "npx",
|
|
63
|
+
"args": ["-y", "@xevos117/mcp-zotero"],
|
|
64
|
+
"env": {
|
|
65
|
+
"ZOTERO_API_KEY": "YOUR_API_KEY",
|
|
66
|
+
"ZOTERO_USER_ID": "YOUR_USER_ID",
|
|
67
|
+
"UNPAYWALL_EMAIL": "YOUR_EMAIL"
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Integration with Claude Code
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
claude mcp add-json "zotero" '{"command":"npx","args":["-y","@xevos117/mcp-zotero"],"env":{"ZOTERO_API_KEY":"...","ZOTERO_USER_ID":"..."}}'
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Available Tools
|
|
81
|
+
|
|
82
|
+
### Library browsing
|
|
83
|
+
|
|
84
|
+
| Tool | Description |
|
|
85
|
+
|---|---|
|
|
86
|
+
| `get_collections` | List all collections (folders) with keys, names, and parent relationships |
|
|
87
|
+
| `get_collection_items` | Get items in a specific collection with keys, titles, authors, dates |
|
|
88
|
+
| `search_library` | Search by query, or list items sorted by field (date, title, etc.) |
|
|
89
|
+
| `get_items_details` | Batch metadata retrieval for multiple items in a single call |
|
|
90
|
+
| `get_item_fulltext` | Get full-text content of a PDF attachment via Zotero's fulltext index |
|
|
91
|
+
|
|
92
|
+
### Adding content
|
|
93
|
+
|
|
94
|
+
| Tool | Description |
|
|
95
|
+
|---|---|
|
|
96
|
+
| `add_items_by_doi` | Add papers by DOI with automatic metadata resolution. Auto-attaches OA PDFs via Unpaywall |
|
|
97
|
+
| `add_web_item` | Save a web page as a Zotero item (for articles without DOI) |
|
|
98
|
+
| `create_collection` | Create a new collection, optionally nested under a parent |
|
|
99
|
+
| `import_pdf_to_zotero` | Download a PDF from URL, upload to Zotero storage, auto-index full text |
|
|
100
|
+
| `find_and_attach_pdfs` | Batch OA PDF lookup and auto-attach via Unpaywall (by item keys or collection) |
|
|
101
|
+
| `add_linked_url_attachment` | Attach a URL to an existing item or create a standalone link |
|
|
102
|
+
|
|
103
|
+
### Citation & documents
|
|
104
|
+
|
|
105
|
+
| Tool | Description |
|
|
106
|
+
|---|---|
|
|
107
|
+
| `inject_citations` | Inject live Zotero citations into a Word document. Supports APA, IEEE, Vancouver, Harvard, Chicago |
|
|
108
|
+
| `get_user_id` | Returns the configured Zotero user ID |
|
|
109
|
+
|
|
110
|
+
## Development
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
npm install
|
|
114
|
+
npm run build # Compile TypeScript
|
|
115
|
+
npm test # Run tests (vitest, 299 tests)
|
|
116
|
+
npx tsx src/server.ts # Run directly without building
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Debug with MCP Inspector
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
npx @modelcontextprotocol/inspector npx tsx src/server.ts
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## License
|
|
126
|
+
|
|
127
|
+
MIT - see [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Build the visible placeholder text for one in-text citation.
 *
 * Numbered styles ("ieee", "vancouver") render as `[num]`, or `[?]` when no
 * number was supplied. Every other style renders author-date text of the form
 * `(Author, Year; Author & Author, Year; Author et al., Year)`.
 *
 * @param {Array<object>} items - CSL-JSON item data, one entry per cited work.
 * @param {string} style - Citation style identifier.
 * @param {string|number} [num] - Citation number for numbered styles.
 * @returns {string} The formatted citation text.
 */
export function formatCitationText(items, style, num) {
    if (style === "ieee" || style === "vancouver") {
        return num ? `[${num}]` : "[?]";
    }
    // CSL names carry `family` for persons and `literal` for institutions;
    // fall back to "Unknown" so a sparse name never renders as "undefined".
    const displayName = (name) => name?.family ?? name?.literal ?? "Unknown";
    const parts = items.map((item) => {
        const authors = item.author;
        const year = item.issued?.["date-parts"]?.[0]?.[0]?.toString() ?? "n.d.";
        let authorText;
        if (!authors || authors.length === 0) {
            // No authors: cite by (possibly truncated) quoted title instead.
            const title = item.title;
            if (!title) {
                authorText = "Unknown";
            }
            else if (title.length > 30) {
                authorText = `"${title.substring(0, 30)}..."`;
            }
            else {
                authorText = `"${title}"`;
            }
        }
        else if (authors.length > 2) {
            authorText = `${displayName(authors[0])} et al.`;
        }
        else if (authors.length === 2) {
            authorText = `${displayName(authors[0])} & ${displayName(authors[1])}`;
        }
        else {
            authorText = displayName(authors[0]);
        }
        return `${authorText}, ${year}`;
    });
    return `(${parts.join("; ")})`;
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { randomUUID } from "node:crypto";
|
|
2
|
+
import { escapeXml } from "./xml-utils.js";
|
|
3
|
+
/**
 * Render one citation as a Word complex field (begin / instrText / separate /
 * result / end runs) carrying an `ADDIN ZOTERO_ITEM CSL_CITATION` payload.
 *
 * @param {Array<object>} citationItems - Citation items embedded in the JSON payload.
 * @param {string} formattedText - Text shown as the field result and stored
 *   as `properties.formattedCitation`.
 * @returns {string} Concatenated `<w:r>` elements, with no wrapping paragraph.
 */
export function generateZoteroFieldCode(citationItems, formattedText) {
    // Short random identifier: the first 8 characters of a UUID.
    const payload = JSON.stringify({
        citationID: randomUUID().slice(0, 8),
        properties: { formattedCitation: formattedText },
        citationItems,
        schema: "https://github.com/citation-style-language/schema/raw/master/csl-citation.json",
    });
    const instruction = escapeXml(` ADDIN ZOTERO_ITEM CSL_CITATION ${payload} `);
    const visibleText = escapeXml(formattedText);
    const fieldBegin = '<w:r><w:fldChar w:fldCharType="begin"/></w:r>';
    const fieldSeparate = '<w:r><w:fldChar w:fldCharType="separate"/></w:r>';
    const fieldEnd = '<w:r><w:fldChar w:fldCharType="end"/></w:r>';
    const instructionRun = `<w:r><w:instrText xml:space="preserve">${instruction}</w:instrText></w:r>`;
    const resultRun = `<w:r><w:rPr><w:noProof/></w:rPr><w:t>${visibleText}</w:t></w:r>`;
    return `${fieldBegin}${instructionRun}${fieldSeparate}${resultRun}${fieldEnd}`;
}
|
|
21
|
+
/**
 * Build a paragraph containing an `ADDIN ZOTERO_BIBL` complex field with a
 * placeholder result text.
 *
 * The JSON options inside the instruction keep their double quotes as
 * `&quot;` entities, matching what `escapeXml` produces for citation fields.
 *
 * @returns {string} A complete `<w:p>` element as an XML string.
 */
export function generateBibliographyFieldCode() {
    const instruction = " ADDIN ZOTERO_BIBL {&quot;uncited&quot;:[],&quot;omitted&quot;:[],&quot;custom&quot;:[]} CSL_BIBLIOGRAPHY ";
    return [
        "<w:p>",
        '<w:r><w:fldChar w:fldCharType="begin"/></w:r>',
        '<w:r><w:instrText xml:space="preserve">',
        instruction,
        "</w:instrText></w:r>",
        '<w:r><w:fldChar w:fldCharType="separate"/></w:r>',
        "<w:r><w:rPr><w:noProof/></w:rPr><w:t>[Bibliography will be generated by Zotero]</w:t></w:r>",
        '<w:r><w:fldChar w:fldCharType="end"/></w:r>',
        "</w:p>",
    ].join("");
}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import JSZip from "jszip";
|
|
2
|
+
import { readFile, writeFile } from "node:fs/promises";
|
|
3
|
+
import { generateZoteroFieldCode, generateBibliographyFieldCode } from "./field-codes.js";
|
|
4
|
+
import { formatCitationText } from "./citation-formatter.js";
|
|
5
|
+
import { regexEscape, unescapeXml } from "./xml-utils.js";
|
|
6
|
+
import { normalizeZciteTags } from "./zcite-normalizer.js";
|
|
7
|
+
import { zoteroItemToCsl } from "../utils/csl-to-zotero.js";
|
|
8
|
+
/**
 * Locate every zcite placeholder in document.xml and parse its attributes.
 *
 * Placeholders are typed as text, so inside `<w:t>` their angle brackets are
 * stored as `&lt;` / `&gt;` entities; the search pattern targets that encoded
 * form and `fullMatch` keeps it so the exact substring can be replaced later.
 *
 * @param {string} documentXml - Raw contents of word/document.xml.
 * @returns {Array<{fullMatch: string, keys: string[], locator?: string,
 *   prefix?: string, suffix?: string, num?: string}>} One entry per tag that
 *   has at least one non-empty key.
 */
function parseZciteMatches(documentXml) {
    const tagRegex = /&lt;zcite\s+[\s\S]*?\/&gt;/g;
    const matches = [];
    for (const [fullMatch] of documentXml.matchAll(tagRegex)) {
        // First unescape: recover the literal tag text the author typed.
        const cleanTag = unescapeXml(fullMatch);
        const attrs = {};
        for (const [, name, rawValue] of cleanTag.matchAll(/(\w+)="([^"]*)"/g)) {
            // Second unescape: the tag is itself XML, so a value typed as
            // "pp. 12 &amp; 15" must become "pp. 12 & 15".
            attrs[name] = unescapeXml(rawValue);
        }
        // Tolerate "A, B" and stray commas instead of emitting blank or padded keys.
        const keys = (attrs["keys"] ?? "")
            .split(",")
            .map((key) => key.trim())
            .filter((key) => key !== "");
        if (keys.length === 0) {
            continue;
        }
        matches.push({
            fullMatch,
            keys,
            locator: attrs["locator"] || undefined,
            prefix: attrs["prefix"] || undefined,
            suffix: attrs["suffix"] || undefined,
            num: attrs["num"] || undefined,
        });
    }
    return matches;
}
|
|
36
|
+
/**
 * Fetch each item from the user's library, one request at a time, and convert
 * it with `zoteroItemToCsl`.
 *
 * @param {Iterable<string>} keys - Item keys to fetch.
 * @param {object} zoteroApi - API client exposing `library().items().get()`.
 * @param {string} userId - Library owner passed to `library("user", …)`.
 * @returns {Promise<Map<string, object>>} Converted item data keyed by item key.
 */
async function fetchCslData(keys, zoteroApi, userId) {
    const byKey = new Map();
    for (const itemKey of keys) {
        const request = zoteroApi.library("user", userId).items(itemKey);
        const reply = await request.get();
        byKey.set(itemKey, zoteroItemToCsl(reply.getData()));
    }
    return byKey;
}
|
|
48
|
+
/**
 * Turn one parsed zcite match into the `citationItems` array of a citation.
 *
 * Each key becomes an entry whose `id` is its position in the tag, whose
 * `uris`/`uri` point at the user's library item, and whose `itemData` comes
 * from `cslData` (or a bare "article-journal" stub when the key is missing).
 * Truthy `locator`, `prefix` and `suffix` values from the match are copied
 * onto every entry.
 *
 * @param {{keys: string[], locator?: string, prefix?: string, suffix?: string}} match
 * @param {Map<string, object>} cslData - Item data keyed by item key.
 * @param {string} userId - User id used to build the item URIs.
 * @returns {Array<object>} One citation item per key, in tag order.
 */
function buildCitationItems(match, cslData, userId) {
    const optionalFields = ["locator", "prefix", "suffix"];
    return match.keys.map((key, position) => {
        const itemUri = `http://zotero.org/users/${userId}/items/${key}`;
        const entry = {
            id: position,
            uris: [itemUri],
            uri: [itemUri],
            itemData: cslData.get(key) ?? { type: "article-journal" },
        };
        for (const field of optionalFields) {
            if (match[field]) {
                entry[field] = match[field];
            }
        }
        return entry;
    });
}
|
|
66
|
+
/**
 * Replace the first occurrence of one zcite placeholder with field-code XML.
 *
 * Case A: the placeholder is the whole text of a run, so the run is swapped
 * for the field code. Case B: the placeholder shares a run with other text,
 * so the run is split into "text before" + field code + "text after", each
 * text part keeping the run's `<w:rPr>` formatting.
 *
 * All replacements use a function replacer. With a string replacement,
 * `String.prototype.replace` would expand `$&`, `$1`…`$5`, `$$` and similar
 * sequences found in the citation JSON or surrounding text (e.g. a title
 * containing "$1"), corrupting the output.
 *
 * @param {string} xml - Current document XML.
 * @param {string} escapedZciteTag - The placeholder exactly as it appears in the XML.
 * @param {string} fieldCodeXml - Replacement run sequence.
 * @returns {string} Updated XML, or `xml` unchanged when no run matches.
 */
function replaceZciteInXml(xml, escapedZciteTag, fieldCodeXml) {
    const tagPattern = regexEscape(escapedZciteTag);
    // Case A (preferred): tag is the sole content of a <w:r>
    // Use [^<]*(?:<(?!/w:rPr>)[^<]*)* instead of .*? inside <w:rPr> to prevent
    // matching across element boundaries in minified (single-line) XML.
    const soloRunRegex = new RegExp(`<w:r>(?:<w:rPr>[^<]*(?:<(?!/w:rPr>)[^<]*)*</w:rPr>)?<w:t[^>]*>${tagPattern}</w:t></w:r>`);
    if (soloRunRegex.test(xml)) {
        return xml.replace(soloRunRegex, () => fieldCodeXml);
    }
    // Case B (fallback): tag is inline with other text
    const inlineRegex = new RegExp(`(<w:r>(?:<w:rPr>([^<]*(?:<(?!/w:rPr>)[^<]*)*)</w:rPr>)?<w:t[^>]*>)([^<]*)${tagPattern}([^<]*)(</w:t></w:r>)`);
    const inlineMatch = xml.match(inlineRegex);
    if (!inlineMatch) {
        return xml;
    }
    const [, , rPrContent, textBefore, textAfter] = inlineMatch;
    const rPr = rPrContent ? `<w:rPr>${rPrContent}</w:rPr>` : "";
    let replacement = "";
    if (textBefore) {
        replacement += `<w:r>${rPr}<w:t xml:space="preserve">${textBefore}</w:t></w:r>`;
    }
    replacement += fieldCodeXml;
    if (textAfter) {
        replacement += `<w:r>${rPr}<w:t xml:space="preserve">${textAfter}</w:t></w:r>`;
    }
    return xml.replace(inlineRegex, () => replacement);
}
|
|
95
|
+
/**
 * Derive the output path for a cited copy of `filePath`.
 *
 * Only a trailing ".docx" extension (any letter case) is rewritten. A path
 * without that extension gets "_cited.docx" appended, so the result can never
 * equal the input path and the source document is never overwritten.
 */
function citedOutputPath(filePath) {
    const extension = /\.docx$/i;
    return extension.test(filePath)
        ? filePath.replace(extension, "_cited.docx")
        : `${filePath}_cited.docx`;
}
/**
 * Replace zcite placeholders in a .docx file with Zotero field codes and
 * write the result next to the input as `<name>_cited.docx`.
 *
 * @param {string} filePath - Path of the source .docx file.
 * @param {object} zoteroApi - API client handed to `fetchCslData`.
 * @param {string} userId - User id for item lookups and item URIs.
 * @param {string} style - Citation style; "ieee" and "vancouver" are numbered.
 * @returns {Promise<{outputPath: string, found: number, injected: number,
 *   warnings: string[]}>} Summary of the run.
 * @throws {Error} When the archive has no word/document.xml entry.
 */
export async function injectCitations(filePath, zoteroApi, userId, style) {
    const fileBuffer = await readFile(filePath);
    const zip = await JSZip.loadAsync(fileBuffer);
    const documentEntry = zip.file("word/document.xml");
    if (!documentEntry) {
        throw new Error("Invalid .docx file: word/document.xml not found");
    }
    const outputPath = citedOutputPath(filePath);
    let documentXml = await documentEntry.async("string");
    documentXml = normalizeZciteTags(documentXml);
    const matches = parseZciteMatches(documentXml);
    if (matches.length === 0) {
        // Nothing to inject: write the archive back out unchanged.
        const buffer = await zip.generateAsync({ type: "nodebuffer" });
        await writeFile(outputPath, buffer);
        return { outputPath, found: 0, injected: 0, warnings: [] };
    }
    // Warn if using a numbered style but tags are missing the num attribute
    const warnings = [];
    if (style === "ieee" || style === "vancouver") {
        const withNum = matches.filter((m) => m.num !== undefined).length;
        if (withNum < matches.length) {
            warnings.push(`Style '${style}' requires 'num' attribute on <zcite> tags. Found ${withNum}/${matches.length} tags with num.`);
        }
    }
    // Collect all unique item keys
    const uniqueKeys = new Set(matches.flatMap((m) => m.keys));
    // Fetch CSL data from Zotero
    const cslData = await fetchCslData(uniqueKeys, zoteroApi, userId);
    // Replace each zcite tag with a field code
    let injected = 0;
    for (const match of matches) {
        const citationItems = buildCitationItems(match, cslData, userId);
        const itemDataList = match.keys.map((k) => cslData.get(k) ?? { type: "article-journal" });
        const formattedText = formatCitationText(itemDataList, style, match.num);
        const fieldCodeXml = generateZoteroFieldCode(citationItems, formattedText);
        const newXml = replaceZciteInXml(documentXml, match.fullMatch, fieldCodeXml);
        // An unchanged string means the tag was not found in a replaceable run.
        if (newXml !== documentXml) {
            documentXml = newXml;
            injected++;
        }
    }
    // Append bibliography before </w:body>
    const biblXml = generateBibliographyFieldCode();
    documentXml = documentXml.replace("</w:body>", () => `${biblXml}</w:body>`);
    // Save
    zip.file("word/document.xml", documentXml);
    const buffer = await zip.generateAsync({ type: "nodebuffer" });
    await writeFile(outputPath, buffer);
    return { outputPath, found: matches.length, injected, warnings };
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Escape the five XML special characters so `text` can be embedded in element
 * content or attribute values.
 *
 * `&` is replaced first so the ampersands introduced by the other
 * replacements are not escaped a second time.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with `& < > " '` replaced by their XML entities.
 */
export function escapeXml(text) {
    return text
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&apos;");
}
|
|
9
|
+
/**
 * Reverse one level of XML escaping for the five predefined entities.
 *
 * @param {string} text - Entity-encoded text.
 * @returns {string} Text with `&lt; &gt; &quot; &apos; &amp;` decoded once.
 */
export function unescapeXml(text) {
    return text
        .replace(/&lt;/g, "<")
        .replace(/&gt;/g, ">")
        .replace(/&quot;/g, '"')
        .replace(/&apos;/g, "'")
        .replace(/&amp;/g, "&"); // MUST be last to avoid double-unescaping
}
|
|
17
|
+
/**
 * Backslash-escape every regular-expression metacharacter in `text` so the
 * result matches the input literally when embedded in a `RegExp` source.
 *
 * @param {string} text - Literal text.
 * @returns {string} Text that is safe to interpolate into a pattern.
 */
export function regexEscape(text) {
    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
    return text.replace(metaCharacters, (character) => `\\${character}`);
}
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
import { XMLParser, XMLBuilder } from "fast-xml-parser";
|
|
2
|
+
// Pattern to detect a complete zcite tag in entity-encoded text.
// The parser below runs with processEntities: false, so run text keeps
// &lt; and &gt; as literal entity strings; the pattern must match that form.
const ZCITE_PATTERN = /&lt;zcite\s+[\s\S]*?\/&gt;/;
// preserveOrder keeps children as ordered arrays; attributes are retained and
// entities and whitespace are left untouched.
const PARSER_OPTIONS = {
    preserveOrder: true,
    ignoreAttributes: false,
    processEntities: false,
    trimValues: false,
    // Prevent fast-xml-parser from parsing numeric/boolean text
    parseTagValue: false,
    parseAttributeValue: false,
};
// Builder settings mirror the parser: same ordering and attribute handling,
// no entity processing, no pretty-printing.
const BUILDER_OPTIONS = {
    preserveOrder: true,
    ignoreAttributes: false,
    processEntities: false,
    format: false,
    suppressEmptyNode: false,
    suppressBooleanAttributes: false,
};
|
|
22
|
+
/**
 * Gather the text carried by a run.
 *
 * Walks the run's children, and for every `w:t` child appends each `#text`
 * entry (converted with `String`) in document order.
 *
 * @param {Array<object>} runChildren - Children of a `w:r` node in the parsed tree.
 * @returns {string} The concatenated text, or "" when the run has none.
 */
function extractRunText(runChildren) {
    const pieces = [];
    for (const child of runChildren) {
        if (!("w:t" in child)) {
            continue;
        }
        for (const textNode of child["w:t"]) {
            if ("#text" in textNode) {
                pieces.push(String(textNode["#text"]));
            }
        }
    }
    return pieces.join("");
}
|
|
40
|
+
/**
 * Tell whether a parsed-tree node represents a run, i.e. has a `w:r` property.
 *
 * @param {object} node - A node from the parsed tree.
 * @returns {boolean} True when the node is a `w:r` element.
 */
function isRunNode(node) {
    return Reflect.has(node, "w:r");
}
|
|
46
|
+
/**
 * Tell whether a run carries at least one `w:t` (text) child.
 *
 * @param {Array<object>} runChildren - Children of a `w:r` node.
 * @returns {boolean} True as soon as one `w:t` child is found.
 */
function hasTextContent(runChildren) {
    for (const child of runChildren) {
        if ("w:t" in child) {
            return true;
        }
    }
    return false;
}
|
|
52
|
+
/**
 * Find stretches of adjacent text-bearing runs among a paragraph's children.
 *
 * A stretch ends at any child that is not a `w:r`, or is a `w:r` without a
 * `w:t`. Only stretches of two or more runs are reported.
 *
 * @param {Array<object>} children - Children of a `w:p` node.
 * @returns {Array<{startIndex: number, endIndex: number, texts: string[]}>}
 *   Index range within `children` and the text of each run in the stretch.
 */
function collectConsecutiveRunGroups(children) {
    const groups = [];
    let open = null;
    // Close the stretch being built, keeping it only when it spans 2+ runs.
    const flush = () => {
        if (open !== null && open.texts.length >= 2) {
            groups.push(open);
        }
        open = null;
    };
    for (const [index, child] of children.entries()) {
        const runChildren = isRunNode(child) ? child["w:r"] : null;
        if (runChildren === null || !hasTextContent(runChildren)) {
            flush();
            continue;
        }
        const text = extractRunText(runChildren);
        if (open === null) {
            open = { startIndex: index, endIndex: index, texts: [text] };
        }
        else {
            open.endIndex = index;
            open.texts.push(text);
        }
    }
    flush();
    return groups;
}
|
|
87
|
+
/**
 * Search a stretch of consecutive runs for a zcite tag that straddles a run
 * boundary.
 *
 * For every starting run, later runs are appended one at a time and the
 * joined text is tested against `ZCITE_PATTERN`. A hit counts only when it
 * begins before the most recently appended run and reaches into it; a tag
 * that sits wholly inside earlier runs of the window is ignored.
 *
 * @param {{texts: string[]}} group - A group from `collectConsecutiveRunGroups`.
 * @returns {{startOffset: number, endOffset: number}|null} Inclusive run
 *   offsets within the group (0-indexed), or null when no split tag exists.
 */
function findSplitZcite(group) {
    const runTexts = group.texts;
    for (let first = 0; first < runTexts.length; first += 1) {
        let joined = "";
        joined += runTexts[first];
        // A single run cannot be "split", so windows start at two runs.
        for (let last = first + 1; last < runTexts.length; last += 1) {
            // Offset at which the newly appended run begins in `joined`.
            const tailStart = joined.length;
            joined += runTexts[last];
            const hit = ZCITE_PATTERN.exec(joined);
            if (hit === null) {
                continue;
            }
            const hitEnd = hit.index + hit[0].length;
            if (hit.index < tailStart && hitEnd > tailStart) {
                return { startOffset: first, endOffset: last };
            }
        }
    }
    return null;
}
|
|
123
|
+
/**
 * Collapse a range of runs into a single run, mutating `children`.
 *
 * The replacement run keeps the first run's `w:rPr` (if any) and its element
 * attributes (if any), and holds one `w:t` with `xml:space="preserve"`
 * containing the text of every merged run.
 *
 * @param {Array<object>} children - Paragraph children; modified via `splice`.
 * @param {{startIndex: number}} group - Group the offsets are relative to.
 * @param {number} startOffset - First run to merge, relative to the group.
 * @param {number} endOffset - Last run to merge (inclusive), relative to the group.
 * @returns {void}
 */
function mergeRunsInPlace(children, group, startOffset, endOffset) {
    const firstIdx = group.startIndex + startOffset;
    const lastIdx = group.startIndex + endOffset;
    const mergedText = children
        .slice(firstIdx, lastIdx + 1)
        .map((run) => extractRunText(run["w:r"]))
        .join("");
    const leadRun = children[firstIdx];
    // Formatting is inherited from the first run only.
    const formatting = leadRun["w:r"].find((child) => "w:rPr" in child);
    const mergedChildren = formatting === undefined ? [] : [formatting];
    mergedChildren.push({
        "w:t": [{ "#text": mergedText }],
        ":@": { "@_xml:space": "preserve" },
    });
    const mergedRun = { "w:r": mergedChildren };
    const leadAttrs = leadRun[":@"];
    if (leadAttrs) {
        mergedRun[":@"] = leadAttrs;
    }
    // Swap the whole range for the single merged run.
    children.splice(firstIdx, lastIdx - firstIdx + 1, mergedRun);
}
|
|
162
|
+
/**
 * Merge every split zcite tag found among a paragraph's runs.
 *
 * Each merge shifts the indices of later children, so the run groups are
 * recomputed from scratch after every merge until a full pass finds nothing.
 *
 * @param {Array<object>} children - Children of a `w:p` node; mutated in place.
 * @returns {boolean} True when at least one merge happened.
 */
function normalizeParagraphRuns(children) {
    let mergedAny = false;
    for (;;) {
        let owner = null;
        let split = null;
        for (const group of collectConsecutiveRunGroups(children)) {
            split = findSplitZcite(group);
            if (split) {
                owner = group;
                break;
            }
        }
        if (!split) {
            return mergedAny;
        }
        mergeRunsInPlace(children, owner, split.startOffset, split.endOffset);
        mergedAny = true;
    }
}
|
|
185
|
+
/**
 * Depth-first walk over the parsed tree that normalizes the runs of every
 * `w:p` element it meets.
 *
 * A paragraph is normalized first; then every array-valued property of the
 * node except `:@` (attributes) and `#text` is descended into.
 *
 * @param {Array<object>} nodes - Sibling nodes from the parsed tree.
 * @returns {boolean} True when any paragraph below `nodes` was modified.
 */
function walkAndNormalize(nodes) {
    let touched = false;
    for (const node of nodes) {
        if ("w:p" in node && normalizeParagraphRuns(node["w:p"])) {
            touched = true;
        }
        const childLists = Object.entries(node)
            .filter(([name, value]) => name !== ":@" && name !== "#text" && Array.isArray(value))
            .map(([, value]) => value);
        for (const list of childLists) {
            if (walkAndNormalize(list)) {
                touched = true;
            }
        }
    }
    return touched;
}
|
|
212
|
+
/**
 * Repair zcite tags whose text has been split across several `w:r` runs by
 * merging the affected runs.
 *
 * The document is parsed only when it mentions "zcite" at all, and it is
 * rebuilt only when a merge actually happened; in every other case the
 * original string is returned untouched, so there are no round-trip
 * encoding differences.
 *
 * @param {string} documentXml - Raw contents of word/document.xml.
 * @returns {string} The input string, or rebuilt XML with merged runs.
 */
export function normalizeZciteTags(documentXml) {
    if (documentXml.includes("zcite")) {
        const tree = new XMLParser(PARSER_OPTIONS).parse(documentXml);
        if (walkAndNormalize(tree)) {
            return new XMLBuilder(BUILDER_OPTIONS).build(tree);
        }
    }
    return documentXml;
}
|