@xevos117/mcp-zotero 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/LICENSE +24 -0
  2. package/README.md +127 -0
  3. package/build/citation-injector/citation-formatter.js +30 -0
  4. package/build/citation-injector/field-codes.js +33 -0
  5. package/build/citation-injector/injector.js +145 -0
  6. package/build/citation-injector/xml-utils.js +19 -0
  7. package/build/citation-injector/zcite-normalizer.js +232 -0
  8. package/build/server.js +75 -0
  9. package/build/tools/add-items-by-doi.js +140 -0
  10. package/build/tools/add-linked-url-attachment.js +92 -0
  11. package/build/tools/add-web-item.js +97 -0
  12. package/build/tools/create-collection.js +63 -0
  13. package/build/tools/find-and-attach-pdfs.js +208 -0
  14. package/build/tools/get-collection-items.js +84 -0
  15. package/build/tools/get-collections.js +39 -0
  16. package/build/tools/get-item-fulltext.js +101 -0
  17. package/build/tools/get-items-details.js +73 -0
  18. package/build/tools/get-user-id.js +12 -0
  19. package/build/tools/import-pdf-to-zotero.js +129 -0
  20. package/build/tools/index.js +60 -0
  21. package/build/tools/inject-citations.js +83 -0
  22. package/build/tools/search-library.js +94 -0
  23. package/build/types/csl-types.js +1 -0
  24. package/build/types/zotero-types.js +9 -0
  25. package/build/utils/concurrency.js +28 -0
  26. package/build/utils/csl-to-zotero.js +77 -0
  27. package/build/utils/doi-resolver.js +30 -0
  28. package/build/utils/error-formatter.js +13 -0
  29. package/build/utils/fetch-retry.js +34 -0
  30. package/build/utils/item-formatter.js +9 -0
  31. package/build/utils/logger.js +14 -0
  32. package/build/utils/pdf-text-extractor.js +9 -0
  33. package/build/utils/pdf-uploader.js +230 -0
  34. package/build/utils/unpaywall.js +116 -0
  35. package/build/utils/zotero-fulltext.js +22 -0
  36. package/package.json +58 -0
package/LICENSE ADDED
@@ -0,0 +1,24 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Xevos117
4
+
5
+ Based on mcp-zotero by Abhishek Kalia (https://github.com/kaliaboi/mcp-zotero),
6
+ Copyright (c) 2024 Abhishek Kalia, licensed under the MIT License.
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ of this software and associated documentation files (the "Software"), to deal
10
+ in the Software without restriction, including without limitation the rights
11
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ copies of the Software, and to permit persons to whom the Software is
13
+ furnished to do so, subject to the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be included in all
16
+ copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,127 @@
1
+ # MCP Zotero
2
+
3
+ > **Note:** This is an unofficial community project and is not affiliated with, endorsed by, or supported by the Zotero team or the Corporation for Digital Scholarship. "Zotero" is a registered trademark of the Corporation for Digital Scholarship.
4
+
5
+ A Model Context Protocol server for Zotero integration. It gives any LLM full access to your Zotero library: search, organize, add papers by DOI, import PDFs, read full-text content, and inject live citations into Word documents.
6
+
7
+ > Originally based on [mcp-zotero](https://github.com/kaliaboi/mcp-zotero) by Abhishek Kalia.
8
+ > This project has since been extensively rewritten with a new architecture, 13 tools (up from 5), citation injection, PDF management, and Claude skill support.
9
+
10
+ ## How it works
11
+
12
+ The server is designed to be **usable by any LLM without external documentation**. On connection, it sends workflow instructions via the MCP `instructions` field, and each tool description includes cross-references and usage guidance. An LLM that has never seen this server before can discover the full workflow — from adding papers to producing a cited Word document — directly from the tool listing.
13
+
14
+ For advanced use cases (PDF upload policy, citation style guidance, source transparency), a **Claude skill** is included for Claude.ai Projects. But the skill is optional: the MCP server is fully self-documenting.
15
+
16
+ ## Local vs Remote LLMs
17
+
18
+ | Scenario | MCP server | Skill needed? |
19
+ |---|---|---|
20
+ | Local LLM (Claude Code, LM Studio, etc.) | All 13 tools | No |
21
+ | Remote/sandboxed LLM (Claude.ai Projects) | API tools (search, add, metadata) | Yes, for citation injection |
22
+
23
+ Local LLMs with filesystem access can use all tools directly, including `inject_citations` which reads and writes `.docx` files on disk.
24
+
25
+ Remote LLMs without filesystem access can use the included **Claude skill** (`skills/zotero-skill-mcp-integrations/`), which runs citation injection entirely inside the sandbox. MCP tools handle all Zotero API operations; the skill handles document assembly.
26
+
27
+ ## Setup
28
+
29
+ 1. Get your Zotero credentials:
30
+
31
+ ```bash
32
+ # Create an API key at https://www.zotero.org/settings/keys
33
+ # (enable library read/write + file access)
34
+ # Then retrieve your user ID:
35
+ curl -H "Zotero-API-Key: YOUR_API_KEY" https://api.zotero.org/keys/current
36
+ ```
37
+
38
+ 2. Set environment variables:
39
+
40
+ ```bash
41
+ export ZOTERO_API_KEY="your-api-key"
42
+ export ZOTERO_USER_ID="user-id-from-curl"
43
+ export UNPAYWALL_EMAIL="your@email.edu" # Optional: enables OA PDF lookup via Unpaywall
44
+ ```
45
+
46
+ ## Environment Variables
47
+
48
+ | Variable | Required | Description |
49
+ |---|---|---|
50
+ | `ZOTERO_API_KEY` | Yes | API key for Zotero Web API v3. Create one at [zotero.org/settings/keys](https://www.zotero.org/settings/keys) with library read/write and file access permissions. |
51
+ | `ZOTERO_USER_ID` | Yes | Your Zotero numeric user ID. Retrieve it with `curl -H "Zotero-API-Key: KEY" https://api.zotero.org/keys/current`. |
52
+ | `UNPAYWALL_EMAIL` | No | Email for Unpaywall API requests ([rate-limit policy](https://unpaywall.org/products/api)). Enables OA PDF lookup in `add_items_by_doi` and `find_and_attach_pdfs`. If not set, OA PDF features are silently skipped. |
53
+
54
+ ## Integration with Claude Desktop
55
+
56
+ Add to your Claude Desktop configuration:
57
+
58
+ ```json
59
+ {
60
+ "mcpServers": {
61
+ "zotero": {
62
+ "command": "npx",
63
+ "args": ["-y", "@xevos117/mcp-zotero"],
64
+ "env": {
65
+ "ZOTERO_API_KEY": "YOUR_API_KEY",
66
+ "ZOTERO_USER_ID": "YOUR_USER_ID",
67
+ "UNPAYWALL_EMAIL": "YOUR_EMAIL"
68
+ }
69
+ }
70
+ }
71
+ }
72
+ ```
73
+
74
+ ## Integration with Claude Code
75
+
76
+ ```bash
77
+ claude mcp add-json "zotero" '{"command":"npx","args":["-y","@xevos117/mcp-zotero"],"env":{"ZOTERO_API_KEY":"...","ZOTERO_USER_ID":"..."}}'
78
+ ```
79
+
80
+ ## Available Tools
81
+
82
+ ### Library browsing
83
+
84
+ | Tool | Description |
85
+ |---|---|
86
+ | `get_collections` | List all collections (folders) with keys, names, and parent relationships |
87
+ | `get_collection_items` | Get items in a specific collection with keys, titles, authors, dates |
88
+ | `search_library` | Search by query, or list items sorted by field (date, title, etc.) |
89
+ | `get_items_details` | Batch metadata retrieval for multiple items in a single call |
90
+ | `get_item_fulltext` | Get full-text content of a PDF attachment via Zotero's fulltext index |
91
+
92
+ ### Adding content
93
+
94
+ | Tool | Description |
95
+ |---|---|
96
+ | `add_items_by_doi` | Add papers by DOI with automatic metadata resolution. Auto-attaches OA PDFs via Unpaywall |
97
+ | `add_web_item` | Save a web page as a Zotero item (for articles without DOI) |
98
+ | `create_collection` | Create a new collection, optionally nested under a parent |
99
+ | `import_pdf_to_zotero` | Download a PDF from URL, upload to Zotero storage, auto-index full text |
100
+ | `find_and_attach_pdfs` | Batch OA PDF lookup and auto-attach via Unpaywall (by item keys or collection) |
101
+ | `add_linked_url_attachment` | Attach a URL to an existing item or create a standalone link |
102
+
103
+ ### Citation & documents
104
+
105
+ | Tool | Description |
106
+ |---|---|
107
+ | `inject_citations` | Inject live Zotero citations into a Word document. Supports APA, IEEE, Vancouver, Harvard, Chicago |
108
+ | `get_user_id` | Returns the configured Zotero user ID |
109
+
110
+ ## Development
111
+
112
+ ```bash
113
+ npm install
114
+ npm run build # Compile TypeScript
115
+ npm test # Run tests (vitest, 299 tests)
116
+ npx tsx src/server.ts # Run directly without building
117
+ ```
118
+
119
+ ### Debug with MCP Inspector
120
+
121
+ ```bash
122
+ npx @modelcontextprotocol/inspector npx tsx src/server.ts
123
+ ```
124
+
125
+ ## License
126
+
127
+ MIT - see [LICENSE](LICENSE) for details.
@@ -0,0 +1,30 @@
1
/**
 * Build the visible placeholder text shown for an in-text citation.
 *
 * Numbered styles ("ieee", "vancouver") render as `[num]`, or `[?]` when no
 * number was supplied. Every other style renders as an author-date list:
 * `(Author, Year; Author & Author, Year; Author et al., Year)`.
 *
 * @param {Array<object>} items - CSL-JSON item objects (reads `author`, `issued`, `title`).
 * @param {string} style - Citation style identifier.
 * @param {string} [num] - Citation number used by numbered styles.
 * @returns {string} The formatted citation text.
 */
export function formatCitationText(items, style, num) {
  if (style === "ieee" || style === "vancouver") {
    return num ? `[${num}]` : "[?]";
  }

  // A CSL name may carry `family` (personal name) or only `literal`
  // (institutional name). Never let a missing field leak out as the string
  // "undefined".
  const displayName = (creator) =>
    creator?.family ?? creator?.literal ?? "Unknown";

  const parts = items.map((item) => {
    const authors = item.author;
    const year = item.issued?.["date-parts"]?.[0]?.[0]?.toString() ?? "n.d.";

    let authorText;
    if (!authors || authors.length === 0) {
      // No creators: fall back to a (possibly truncated) quoted title.
      const title = item.title;
      if (!title) {
        authorText = "Unknown";
      } else if (title.length > 30) {
        authorText = `"${title.substring(0, 30)}..."`;
      } else {
        authorText = `"${title}"`;
      }
    } else if (authors.length > 2) {
      authorText = `${displayName(authors[0])} et al.`;
    } else if (authors.length === 2) {
      authorText = `${displayName(authors[0])} & ${displayName(authors[1])}`;
    } else {
      authorText = displayName(authors[0]);
    }
    return `${authorText}, ${year}`;
  });

  return `(${parts.join("; ")})`;
}
@@ -0,0 +1,33 @@
1
+ import { randomUUID } from "node:crypto";
2
+ import { escapeXml } from "./xml-utils.js";
3
/**
 * Produce the WordprocessingML runs for one Zotero citation field.
 *
 * The field consists of a begin marker, the `ADDIN ZOTERO_ITEM CSL_CITATION`
 * instruction carrying the CSL citation as JSON, a separator, the visible
 * (XML-escaped) citation text, and an end marker.
 *
 * @param {Array<object>} citationItems - Citation item descriptors embedded in the JSON payload.
 * @param {string} formattedText - Text shown in the document for this citation.
 * @returns {string} Concatenated `<w:r>` elements forming the field.
 */
export function generateZoteroFieldCode(citationItems, formattedText) {
  // Short identifier for this citation: first 8 characters of a fresh UUID.
  const citationID = randomUUID().slice(0, 8);
  const payload = JSON.stringify({
    citationID,
    properties: { formattedCitation: formattedText },
    citationItems,
    schema: "https://github.com/citation-style-language/schema/raw/master/csl-citation.json",
  });
  const instruction = escapeXml(` ADDIN ZOTERO_ITEM CSL_CITATION ${payload} `);
  const visibleText = escapeXml(formattedText);

  const fieldBegin = '<w:r><w:fldChar w:fldCharType="begin"/></w:r>';
  const fieldInstruction = `<w:r><w:instrText xml:space="preserve">${instruction}</w:instrText></w:r>`;
  const fieldSeparator = '<w:r><w:fldChar w:fldCharType="separate"/></w:r>';
  const fieldResult = `<w:r><w:rPr><w:noProof/></w:rPr><w:t>${visibleText}</w:t></w:r>`;
  const fieldEnd = '<w:r><w:fldChar w:fldCharType="end"/></w:r>';

  return fieldBegin + fieldInstruction + fieldSeparator + fieldResult + fieldEnd;
}
21
/**
 * Produce a paragraph containing the Zotero bibliography field.
 *
 * The field instruction is `ADDIN ZOTERO_BIBL … CSL_BIBLIOGRAPHY` with empty
 * `uncited`, `omitted` and `custom` lists; the visible result is a fixed
 * placeholder sentence.
 *
 * @returns {string} A complete `<w:p>` element.
 */
export function generateBibliographyFieldCode() {
  const fieldMarker = (kind) => `<w:r><w:fldChar w:fldCharType="${kind}"/></w:r>`;

  const instruction =
    '<w:r><w:instrText xml:space="preserve">' +
    " ADDIN ZOTERO_BIBL {&quot;uncited&quot;:[],&quot;omitted&quot;:[],&quot;custom&quot;:[]} CSL_BIBLIOGRAPHY " +
    "</w:instrText></w:r>";
  const placeholder =
    "<w:r><w:rPr><w:noProof/></w:rPr><w:t>[Bibliography will be generated by Zotero]</w:t></w:r>";

  return `<w:p>${fieldMarker("begin")}${instruction}${fieldMarker("separate")}${placeholder}${fieldMarker("end")}</w:p>`;
}
@@ -0,0 +1,145 @@
1
+ import JSZip from "jszip";
2
+ import { readFile, writeFile } from "node:fs/promises";
3
+ import { generateZoteroFieldCode, generateBibliographyFieldCode } from "./field-codes.js";
4
+ import { formatCitationText } from "./citation-formatter.js";
5
+ import { regexEscape, unescapeXml } from "./xml-utils.js";
6
+ import { normalizeZciteTags } from "./zcite-normalizer.js";
7
+ import { zoteroItemToCsl } from "../utils/csl-to-zotero.js";
8
/**
 * Find every entity-encoded `<zcite … />` placeholder in the document XML and
 * parse its attributes.
 *
 * @param {string} documentXml - Contents of word/document.xml.
 * @returns {Array<{fullMatch: string, keys: string[], locator?: string,
 *   prefix?: string, suffix?: string, num?: string}>} One entry per tag that
 *   names at least one item key. `fullMatch` is the tag exactly as it appears
 *   in the XML (still entity-encoded).
 */
function parseZciteMatches(documentXml) {
  const matches = [];
  for (const found of documentXml.matchAll(/&lt;zcite\s+[\s\S]*?\/&gt;/g)) {
    const fullMatch = found[0];
    const cleanTag = unescapeXml(fullMatch);

    const attrs = {};
    for (const [, name, rawValue] of cleanTag.matchAll(/(\w+)="([^"]*)"/g)) {
      // Unescape attribute values since the zcite tag is XML
      // and values like "pp. 12 &amp; 15" need a second unescape
      attrs[name] = unescapeXml(rawValue);
    }

    // Tolerate `keys="A, B"` and stray/trailing commas: whitespace or empty
    // entries would otherwise be treated as (non-existent) item keys.
    const keys = (attrs["keys"] ?? "")
      .split(",")
      .map((key) => key.trim())
      .filter((key) => key !== "");
    if (keys.length === 0) {
      continue;
    }

    matches.push({
      fullMatch,
      keys,
      locator: attrs["locator"] || undefined,
      prefix: attrs["prefix"] || undefined,
      suffix: attrs["suffix"] || undefined,
      num: attrs["num"] || undefined,
    });
  }
  return matches;
}
36
/**
 * Load each requested item from the user's library and convert it to CSL.
 *
 * Items are requested one after another, in iteration order of `keys`.
 *
 * @param {Iterable<string>} keys - Item keys to fetch.
 * @param {object} zoteroApi - API client exposing `library(type, id).items(key).get()`.
 * @param {string} userId - Library owner's user ID.
 * @returns {Promise<Map<string, object>>} Map from item key to the converted item.
 */
async function fetchCslData(keys, zoteroApi, userId) {
  const byKey = new Map();
  for (const itemKey of keys) {
    const request = zoteroApi.library("user", userId).items(itemKey);
    const reply = await request.get();
    byKey.set(itemKey, zoteroItemToCsl(reply.getData()));
  }
  return byKey;
}
48
/**
 * Build the `citationItems` array for one parsed zcite tag.
 *
 * Each key becomes an entry with a positional `id`, the item's URI (under both
 * `uris` and `uri`) and its CSL data; keys missing from `cslData` get a bare
 * `{ type: "article-journal" }` record. The tag's locator/prefix/suffix are
 * copied onto every entry when they are set.
 *
 * @param {{keys: string[], locator?: string, prefix?: string, suffix?: string}} match
 * @param {Map<string, object>} cslData - CSL data by item key.
 * @param {string} userId - User ID used to form the item URI.
 * @returns {Array<object>} Citation item descriptors, in key order.
 */
function buildCitationItems(match, cslData, userId) {
  const { keys, locator, prefix, suffix } = match;
  const citationItems = [];
  keys.forEach((key, position) => {
    const itemUri = `http://zotero.org/users/${userId}/items/${key}`;
    citationItems.push({
      id: position,
      uris: [itemUri],
      uri: [itemUri],
      itemData: cslData.get(key) ?? { type: "article-journal" },
      // Optional attributes are only present when the tag supplied them.
      ...(locator ? { locator } : {}),
      ...(prefix ? { prefix } : {}),
      ...(suffix ? { suffix } : {}),
    });
  });
  return citationItems;
}
66
/**
 * Replace the first occurrence of one entity-encoded zcite tag with field-code
 * XML.
 *
 * Case A (preferred): the tag is the sole text of a `<w:r>`; the whole run is
 * replaced by the field code.
 * Case B (fallback): the tag sits inline with other text; the run is split
 * into "text before" / field code / "text after", each text part keeping the
 * original run properties.
 *
 * @param {string} xml - Document XML to search.
 * @param {string} escapedZciteTag - The tag exactly as it appears in `xml`.
 * @param {string} fieldCodeXml - Replacement XML for the tag.
 * @returns {string} Updated XML, or `xml` unchanged when the tag is not found
 *   in either shape.
 */
function replaceZciteInXml(xml, escapedZciteTag, fieldCodeXml) {
  const tagPattern = regexEscape(escapedZciteTag);
  // Use [^<]*(?:<(?!/w:rPr>)[^<]*)* instead of .*? inside <w:rPr> to prevent
  // matching across element boundaries in minified (single-line) XML.
  const rPrBody = "[^<]*(?:<(?!/w:rPr>)[^<]*)*";

  // All replacements below go through a replacer FUNCTION. A replacement
  // string would have `$&`, `$$`, `$1`… expanded by String.prototype.replace,
  // and the field code embeds arbitrary item metadata (e.g. a title containing
  // "$5"), which would silently corrupt the document.

  // Case A: tag is the sole content of a <w:r>
  const soloRunRegex = new RegExp(
    `<w:r>(?:<w:rPr>${rPrBody}</w:rPr>)?<w:t[^>]*>${tagPattern}</w:t></w:r>`,
  );
  if (soloRunRegex.test(xml)) {
    return xml.replace(soloRunRegex, () => fieldCodeXml);
  }

  // Case B: tag is inline with other text
  const inlineRegex = new RegExp(
    `(<w:r>(?:<w:rPr>(${rPrBody})</w:rPr>)?<w:t[^>]*>)([^<]*)${tagPattern}([^<]*)(</w:t></w:r>)`,
  );
  // When nothing matches, replace() returns `xml` untouched.
  return xml.replace(inlineRegex, (_whole, _runOpen, rPrInner, textBefore, textAfter) => {
    const rPr = rPrInner ? `<w:rPr>${rPrInner}</w:rPr>` : "";
    let replacement = "";
    if (textBefore) {
      replacement += `<w:r>${rPr}<w:t xml:space="preserve">${textBefore}</w:t></w:r>`;
    }
    replacement += fieldCodeXml;
    if (textAfter) {
      replacement += `<w:r>${rPr}<w:t xml:space="preserve">${textAfter}</w:t></w:r>`;
    }
    return replacement;
  });
}
95
/**
 * Derive the output path for a cited copy of `filePath`.
 *
 * Only a trailing `.docx` extension (any letter case) is rewritten; a path
 * without that extension gets `_cited.docx` appended. This guarantees the
 * result differs from the input, so the source document is never overwritten,
 * and that a `.docx` occurring earlier in the path is left alone.
 */
function buildCitedOutputPath(filePath) {
  const docxExtension = /\.docx$/i;
  return docxExtension.test(filePath)
    ? filePath.replace(docxExtension, "_cited.docx")
    : `${filePath}_cited.docx`;
}

/**
 * Replace `<zcite … />` placeholders in a .docx file with live Zotero citation
 * fields and write the result next to the input as `<name>_cited.docx`.
 *
 * @param {string} filePath - Path of the .docx file to read.
 * @param {object} zoteroApi - API client used to fetch item metadata.
 * @param {string} userId - Library owner's user ID.
 * @param {string} style - Citation style identifier (numbered styles "ieee"
 *   and "vancouver" expect a `num` attribute on every tag).
 * @returns {Promise<{outputPath: string, found: number, injected: number,
 *   warnings: string[]}>} Where the file was written, how many tags were found
 *   and how many were actually replaced.
 * @throws {Error} If the archive has no word/document.xml entry.
 */
export async function injectCitations(filePath, zoteroApi, userId, style) {
  const fileBuffer = await readFile(filePath);
  const zip = await JSZip.loadAsync(fileBuffer);
  const documentEntry = zip.file("word/document.xml");
  if (!documentEntry) {
    throw new Error("Invalid .docx file: word/document.xml not found");
  }
  const outputPath = buildCitedOutputPath(filePath);

  let documentXml = await documentEntry.async("string");
  documentXml = normalizeZciteTags(documentXml);
  const matches = parseZciteMatches(documentXml);
  if (matches.length === 0) {
    // Nothing to inject: emit a copy of the archive as loaded.
    const buffer = await zip.generateAsync({ type: "nodebuffer" });
    await writeFile(outputPath, buffer);
    return { outputPath, found: 0, injected: 0, warnings: [] };
  }

  // Warn if using a numbered style but tags are missing the num attribute
  const warnings = [];
  if (style === "ieee" || style === "vancouver") {
    const withNum = matches.filter((m) => m.num !== undefined).length;
    if (withNum < matches.length) {
      warnings.push(`Style '${style}' requires 'num' attribute on <zcite> tags. Found ${withNum}/${matches.length} tags with num.`);
    }
  }

  // Collect all unique item keys
  const uniqueKeys = new Set(matches.flatMap((m) => m.keys));
  // Fetch CSL data from Zotero
  const cslData = await fetchCslData(uniqueKeys, zoteroApi, userId);

  // Replace each zcite tag with a field code. A tag counts as injected only
  // if the replacement actually changed the XML.
  let injected = 0;
  for (const match of matches) {
    const citationItems = buildCitationItems(match, cslData, userId);
    const itemDataList = match.keys.map((k) => cslData.get(k) ?? { type: "article-journal" });
    const formattedText = formatCitationText(itemDataList, style, match.num);
    const fieldCodeXml = generateZoteroFieldCode(citationItems, formattedText);
    const newXml = replaceZciteInXml(documentXml, match.fullMatch, fieldCodeXml);
    if (newXml !== documentXml) {
      documentXml = newXml;
      injected++;
    }
  }

  // Append bibliography before </w:body>
  // NOTE(review): if the body ends with a <w:sectPr>, this places the
  // bibliography paragraph after it — confirm Word accepts that ordering.
  const biblXml = generateBibliographyFieldCode();
  documentXml = documentXml.replace("</w:body>", `${biblXml}</w:body>`);

  // Save
  zip.file("word/document.xml", documentXml);
  const buffer = await zip.generateAsync({ type: "nodebuffer" });
  await writeFile(outputPath, buffer);
  return { outputPath, found: matches.length, injected, warnings };
}
@@ -0,0 +1,19 @@
1
/**
 * Escape the five XML special characters (`&`, `<`, `>`, `"`, `'`) as their
 * predefined entities.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text safe to embed in XML content or attribute values.
 */
export function escapeXml(text) {
  const entityFor = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;",
  };
  // One pass over the input, so ampersands produced here are never re-escaped.
  return text.replace(/[&<>"']/g, (ch) => entityFor[ch]);
}
9
/**
 * Decode one level of the five predefined XML entities (`&lt;`, `&gt;`,
 * `&quot;`, `&apos;`, `&amp;`).
 *
 * Decoding happens in a single pass, so a decoded `&` is never combined with
 * following text into a new entity (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param {string} text - Entity-encoded text.
 * @returns {string} Text with one level of entities decoded.
 */
export function unescapeXml(text) {
  const charFor = { lt: "<", gt: ">", quot: '"', apos: "'", amp: "&" };
  return text.replace(/&(lt|gt|quot|apos|amp);/g, (_entity, name) => charFor[name]);
}
17
/**
 * Backslash-escape regular-expression metacharacters so that `text` can be
 * embedded in a pattern and matched literally.
 *
 * @param {string} text - Literal text.
 * @returns {string} Pattern source matching exactly `text`.
 */
export function regexEscape(text) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return text.replace(metaChars, (ch) => `\\${ch}`);
}
@@ -0,0 +1,232 @@
1
+ import { XMLParser, XMLBuilder } from "fast-xml-parser";
2
// Matches one complete zcite tag as it appears in entity-encoded run text.
// The parser is configured with processEntities: false, so `&lt;` / `&gt;`
// are still literal character sequences when this pattern is applied.
const ZCITE_PATTERN = /&lt;zcite\s+[\s\S]*?\/&gt;/;

// Settings the parser and the builder must agree on so that a parse → build
// round trip keeps element order, attributes and entities as they were.
const SHARED_XML_OPTIONS = {
  preserveOrder: true,
  ignoreAttributes: false,
  processEntities: false,
};

const PARSER_OPTIONS = {
  ...SHARED_XML_OPTIONS,
  trimValues: false,
  // Keep text and attribute values as strings: no numeric/boolean coercion.
  parseTagValue: false,
  parseAttributeValue: false,
};

const BUILDER_OPTIONS = {
  ...SHARED_XML_OPTIONS,
  format: false,
  suppressEmptyNode: false,
  suppressBooleanAttributes: false,
};
22
/**
 * Concatenate the text of a run.
 *
 * @param {Array<object>} runChildren - Child nodes of a w:r element in the
 *   parsed (order-preserving) tree.
 * @returns {string} The `#text` values of every w:t child joined in document
 *   order, or "" when the run has none.
 */
function extractRunText(runChildren) {
  return runChildren
    .filter((child) => "w:t" in child)
    .flatMap((child) => child["w:t"])
    .filter((piece) => "#text" in piece)
    .map((piece) => String(piece["#text"]))
    .join("");
}
40
/**
 * Tell whether a parsed-tree node represents a w:r element (a run).
 *
 * @param {object} node - Node from the parsed tree.
 * @returns {boolean} True when the node has a "w:r" entry.
 */
function isRunNode(node) {
  return Reflect.has(node, "w:r");
}
46
/**
 * Tell whether a run contains at least one w:t (text) child.
 *
 * @param {Array<object>} runChildren - Child nodes of a w:r element.
 * @returns {boolean} True when any child has a "w:t" entry.
 */
function hasTextContent(runChildren) {
  for (const child of runChildren) {
    if ("w:t" in child) {
      return true;
    }
  }
  return false;
}
52
/**
 * Find stretches of adjacent text-bearing runs among a paragraph's children.
 *
 * A stretch ends at any node that is not a w:r, or at a w:r without a w:t
 * child. Only stretches of two or more runs are reported, since a single run
 * cannot contain a split tag.
 *
 * @param {Array<object>} children - Child nodes of a w:p element.
 * @returns {Array<{startIndex: number, endIndex: number, texts: string[]}>}
 *   For each stretch: the inclusive index range within `children` and the
 *   text of each run in order.
 */
function collectConsecutiveRunGroups(children) {
  const groups = [];
  let open = null;

  // Close the stretch in progress, keeping it only if it spans 2+ runs.
  const closeOpenGroup = () => {
    if (open !== null && open.texts.length >= 2) {
      groups.push(open);
    }
    open = null;
  };

  for (const [index, node] of children.entries()) {
    const runChildren = isRunNode(node) ? node["w:r"] : null;
    if (runChildren === null || !hasTextContent(runChildren)) {
      closeOpenGroup();
      continue;
    }
    const text = extractRunText(runChildren);
    if (open === null) {
      open = { startIndex: index, endIndex: index, texts: [text] };
    } else {
      open.endIndex = index;
      open.texts.push(text);
    }
  }
  closeOpenGroup();

  return groups;
}
87
/**
 * Given a group of consecutive runs, find a zcite tag that spans multiple runs.
 *
 * The run texts are joined once and every zcite match in the joined text is
 * examined (not only the leftmost one), so a split tag is still found when it
 * begins in a run that also holds a complete tag. The reported range is the
 * minimal one: it starts at the run where the tag begins and ends at the run
 * where it ends, so unrelated neighbouring runs are not included.
 *
 * @param {{texts: string[]}} group - Group whose `texts` holds each run's text.
 * @returns {{startOffset: number, endOffset: number} | null} Inclusive run
 *   offsets within the group (0-indexed) of the first split tag, or null if no
 *   tag spans more than one run.
 */
function findSplitZcite(group) {
  const { texts } = group;

  // runStarts[i] is where run i begins in the joined text; the extra final
  // entry is the total length.
  const runStarts = [0];
  let total = 0;
  for (const text of texts) {
    total += text.length;
    runStarts.push(total);
  }
  const joined = texts.join("");

  // Index of the run containing character position `pos` (pos < total).
  // Empty runs occupy no positions and are skipped naturally.
  const runIndexAt = (pos) => {
    let index = 0;
    while (runStarts[index + 1] <= pos) {
      index++;
    }
    return index;
  };

  // Fresh global copy: iterating needs the `g` flag and must not share
  // lastIndex state with the module-level pattern.
  const everyZcite = new RegExp(ZCITE_PATTERN, "g");
  for (const match of joined.matchAll(everyZcite)) {
    const startOffset = runIndexAt(match.index);
    const endOffset = runIndexAt(match.index + match[0].length - 1);
    if (startOffset !== endOffset) {
      return { startOffset, endOffset };
    }
  }
  return null;
}
123
/**
 * Collapse a range of runs inside `children` into a single run (mutates
 * `children`).
 *
 * The replacement run carries the first w:rPr found in the first run (if
 * any), the first run's attributes (if any), and one w:t marked
 * xml:space="preserve" holding the concatenated text of all runs in the range.
 *
 * @param {Array<object>} children - Paragraph children containing the runs.
 * @param {{startIndex: number}} group - Group the offsets are relative to.
 * @param {number} startOffset - First run to merge, relative to the group.
 * @param {number} endOffset - Last run to merge (inclusive), relative to the group.
 */
function mergeRunsInPlace(children, group, startOffset, endOffset) {
  const first = group.startIndex + startOffset;
  const last = group.startIndex + endOffset;

  const mergedText = children
    .slice(first, last + 1)
    .map((runNode) => extractRunText(runNode["w:r"]))
    .join("");

  const leadRun = children[first];
  const textNode = {
    "w:t": [{ "#text": mergedText }],
    ":@": { "@_xml:space": "preserve" },
  };
  // Formatting is inherited from the first run only.
  const inheritedProps = leadRun["w:r"].find((child) => "w:rPr" in child);
  const mergedRun = {
    "w:r": inheritedProps === undefined ? [textNode] : [inheritedProps, textNode],
  };

  const leadAttrs = leadRun[":@"];
  if (leadAttrs) {
    mergedRun[":@"] = leadAttrs;
  }

  // Swap the whole range for the single merged run.
  children.splice(first, last - first + 1, mergedRun);
}
162
/**
 * Repeatedly merge runs in one paragraph until no zcite tag is split across
 * runs any more (mutates `children`).
 *
 * @param {Array<object>} children - Child nodes of a w:p element.
 * @returns {boolean} True if at least one merge was performed.
 */
function normalizeParagraphRuns(children) {
  let mergedAny = false;
  for (;;) {
    // Indices shift after every merge, so groups are recomputed each round
    // and only the first split found in a round is acted on.
    let pending = null;
    for (const group of collectConsecutiveRunGroups(children)) {
      const split = findSplitZcite(group);
      if (split) {
        pending = { group, split };
        break;
      }
    }
    if (pending === null) {
      return mergedAny;
    }
    mergeRunsInPlace(children, pending.group, pending.split.startOffset, pending.split.endOffset);
    mergedAny = true;
  }
}
185
/**
 * Depth-first walk over the parsed tree that normalizes the runs of every
 * w:p element it encounters (mutates the tree).
 *
 * @param {Array<object>} nodes - Sibling nodes of the parsed tree.
 * @returns {boolean} True if any paragraph below `nodes` was modified.
 */
function walkAndNormalize(nodes) {
  let touched = false;
  for (const node of nodes) {
    if ("w:p" in node) {
      // Call first, then combine: the normalization must always run.
      touched = normalizeParagraphRuns(node["w:p"]) || touched;
    }
    // Descend into every child list; ":@" (attributes) and "#text" are not
    // element children.
    const childLists = Object.entries(node)
      .filter(([tag, value]) => tag !== ":@" && tag !== "#text" && Array.isArray(value))
      .map(([, value]) => value);
    for (const list of childLists) {
      touched = walkAndNormalize(list) || touched;
    }
  }
  return touched;
}
212
/**
 * Pre-process document.xml so that every zcite tag lives inside a single w:r
 * run, merging runs where a tag has been split across several of them
 * (e.g. due to language or font changes mid-tag).
 *
 * The input string is returned untouched — without a parse/build round trip —
 * when it does not mention "zcite" at all or when no split tag is found, so
 * unaffected documents cannot pick up serialization differences.
 *
 * @param {string} documentXml - Contents of word/document.xml.
 * @returns {string} The original string, or rebuilt XML with split tags merged.
 */
export function normalizeZciteTags(documentXml) {
  // Cheap pre-check that avoids parsing documents with no placeholders.
  if (documentXml.includes("zcite") === false) {
    return documentXml;
  }
  const tree = new XMLParser(PARSER_OPTIONS).parse(documentXml);
  return walkAndNormalize(tree)
    ? new XMLBuilder(BUILDER_OPTIONS).build(tree)
    : documentXml;
}