openrxiv-cli 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/api-client.d.ts +96 -0
- package/dist/api/api-client.d.ts.map +1 -0
- package/dist/api/api-client.js +257 -0
- package/dist/aws/bucket-explorer.d.ts +26 -0
- package/dist/aws/bucket-explorer.d.ts.map +1 -0
- package/dist/aws/bucket-explorer.js +220 -0
- package/dist/aws/config.d.ts +5 -0
- package/dist/aws/config.d.ts.map +1 -0
- package/dist/aws/config.js +36 -0
- package/dist/aws/downloader.d.ts +13 -0
- package/dist/aws/downloader.d.ts.map +1 -0
- package/dist/aws/downloader.js +115 -0
- package/dist/aws/month-lister.d.ts +18 -0
- package/dist/aws/month-lister.d.ts.map +1 -0
- package/dist/aws/month-lister.js +90 -0
- package/dist/commands/batch-info.d.ts +3 -0
- package/dist/commands/batch-info.d.ts.map +1 -0
- package/dist/commands/batch-info.js +213 -0
- package/dist/commands/batch-process.d.ts +3 -0
- package/dist/commands/batch-process.d.ts.map +1 -0
- package/dist/commands/batch-process.js +557 -0
- package/dist/commands/download.d.ts +3 -0
- package/dist/commands/download.d.ts.map +1 -0
- package/dist/commands/download.js +76 -0
- package/dist/commands/index.d.ts +6 -0
- package/dist/commands/index.d.ts.map +1 -0
- package/dist/commands/index.js +5 -0
- package/dist/commands/list.d.ts +3 -0
- package/dist/commands/list.d.ts.map +1 -0
- package/dist/commands/list.js +18 -0
- package/dist/commands/summary.d.ts +3 -0
- package/dist/commands/summary.d.ts.map +1 -0
- package/dist/commands/summary.js +249 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35 -0
- package/dist/utils/batches.d.ts +9 -0
- package/dist/utils/batches.d.ts.map +1 -0
- package/dist/utils/batches.js +61 -0
- package/dist/utils/batches.test.d.ts +2 -0
- package/dist/utils/batches.test.d.ts.map +1 -0
- package/dist/utils/batches.test.js +119 -0
- package/dist/utils/default-server.d.ts +3 -0
- package/dist/utils/default-server.d.ts.map +1 -0
- package/dist/utils/default-server.js +20 -0
- package/dist/utils/index.d.ts +5 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +5 -0
- package/dist/utils/meca-processor.d.ts +28 -0
- package/dist/utils/meca-processor.d.ts.map +1 -0
- package/dist/utils/meca-processor.js +503 -0
- package/dist/utils/meca-processor.test.d.ts +2 -0
- package/dist/utils/meca-processor.test.d.ts.map +1 -0
- package/dist/utils/meca-processor.test.js +123 -0
- package/dist/utils/months.d.ts +36 -0
- package/dist/utils/months.d.ts.map +1 -0
- package/dist/utils/months.js +135 -0
- package/dist/utils/months.test.d.ts +2 -0
- package/dist/utils/months.test.d.ts.map +1 -0
- package/dist/utils/months.test.js +209 -0
- package/dist/utils/requester-pays-error.d.ts +6 -0
- package/dist/utils/requester-pays-error.d.ts.map +1 -0
- package/dist/utils/requester-pays-error.js +20 -0
- package/dist/version.d.ts +3 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +2 -0
- package/package.json +67 -0
|
@@ -0,0 +1,503 @@
|
|
|
1
|
+
import fs from 'fs';
import path from 'path';
import { fromXml } from 'xast-util-from-xml';
import axios from 'axios';
import AdmZip from 'adm-zip';
import { characterEntities } from 'character-entities';
import { exec, execFile } from 'child_process';
import { promisify } from 'util';
|
|
9
|
+
const execAsync = promisify(exec);
|
|
10
|
+
/**
 * Run the whole ingestion pipeline for a single MECA archive:
 * extract it, read the manifest, locate and parse the JATS XML, then POST the
 * resulting paper record to the API.
 *
 * Never rejects: every failure is caught and reported through the result object.
 *
 * @param mecaPath Local path of the MECA file
 * @param options Processing options; this function reads `output`, `selective`,
 *   `batch`, `server`, `s3Key`, `apiUrl` and `apiKey`
 * @returns `{ success: true, paper }` carrying the API response body, or
 *   `{ success: false, error }` carrying the failure message
 */
export async function processMecaFile(mecaPath, options) {
    try {
        console.log(`🔍 Processing MECA file: ${mecaPath}`);
        // Make sure the requested output directory exists before extracting into it
        const outputDir = options.output;
        if (outputDir && !fs.existsSync(outputDir)) {
            fs.mkdirSync(outputDir, { recursive: true });
        }
        // Extraction strategy is picked by extractMecaAuto (file size + `selective` flag)
        const extractedDir = await extractMecaAuto(mecaPath, outputDir, options.selective);
        console.log(`📂 Extracted to: ${extractedDir}`);
        const manifest = await parseManifest(extractedDir);
        console.log(`📋 Found ${manifest.item.length} items`);
        const jatsFile = findJATSFile(manifest, extractedDir);
        if (!jatsFile) {
            throw new Error('No JATS XML file found in manifest');
        }
        console.log(`📄 JATS file found: ${jatsFile}`);
        console.log(`🔍 Starting JATS parsing...`);
        let jatsData;
        try {
            jatsData = await parseJATS(jatsFile);
            console.log(`🔍 JATS parsed - DOI: ${jatsData.doi}, Version: ${jatsData.version}, Received Date: ${jatsData.receivedDate}`);
        }
        catch (jatsError) {
            // Log with context, then let the outer handler turn it into a failure result
            console.error('❌ Error parsing JATS:', jatsError);
            throw jatsError;
        }
        // Dates are sent as ISO-8601 timestamps; the accepted date is optional
        const receivedIso = new Date(jatsData.receivedDate).toISOString();
        let acceptedIso;
        if (jatsData.acceptedDate) {
            acceptedIso = new Date(jatsData.acceptedDate).toISOString();
        }
        const archiveSize = fs.statSync(mecaPath).size;
        const paperData = {
            doi: jatsData.doi,
            version: jatsData.articleVersion,
            receivedDate: receivedIso,
            acceptedDate: acceptedIso,
            batch: options.batch,
            server: options.server,
            s3Key: options.s3Key,
            fileSize: archiveSize,
            title: jatsData.title,
        };
        const paper = await postToAPI(paperData, options.apiUrl, options.apiKey);
        console.log('✅ Paper added to database');
        return { success: true, paper };
    }
    catch (error) {
        let message = 'Unknown error';
        if (error instanceof Error) {
            message = error.message;
        }
        return { success: false, error: message };
    }
}
|
|
75
|
+
/**
 * Pick an extraction strategy for a MECA archive and run it.
 *
 * Archives above 1.9 GB always go through `extractMeca` (full extraction).
 * Smaller archives use `extractMecaSelective` when `selective` is truthy and
 * `extractMeca` otherwise.
 *
 * @param mecaPath Local path of the MECA file
 * @param outputDir Directory to extract into (defaults to `./downloads`)
 * @param selective Whether to extract only the manifest and JATS file when possible
 * @returns The directory the archive was extracted to
 */
async function extractMecaAuto(mecaPath, outputDir = './downloads', selective = true) {
    const LARGE_FILE_THRESHOLD_GB = 1.9;
    const BYTES_PER_GB = 1024 * 1024 * 1024;
    const sizeInGB = fs.statSync(mecaPath).size / BYTES_PER_GB;
    const isLargeArchive = sizeInGB > LARGE_FILE_THRESHOLD_GB;
    if (isLargeArchive) {
        console.log(`🚨 File is larger than ${LARGE_FILE_THRESHOLD_GB} GB, using unzip for efficiency`);
    }
    if (!isLargeArchive && selective) {
        return await extractMecaSelective(mecaPath, outputDir);
    }
    return await extractMeca(mecaPath, outputDir);
}
|
|
91
|
+
/**
 * Fully extract a MECA (ZIP) archive into
 * `<outputDir>/<archive file name without extension>`.
 *
 * The system `unzip` binary is tried first; if it is not installed (spawn
 * fails with `ENOENT`) the archive is extracted with AdmZip instead.
 *
 * @param mecaPath Local path of the MECA file
 * @param outputDir Parent directory for the extraction directory
 * @returns The directory the archive was extracted to
 * @throws Error when `unzip` is present but exits with a failure
 */
async function extractMeca(mecaPath, outputDir) {
    const extractedDir = path.join(outputDir, path.basename(mecaPath, path.extname(mecaPath)));
    // `recursive: true` makes this a no-op when the directory already exists
    fs.mkdirSync(extractedDir, { recursive: true });
    const execFileAsync = promisify(execFile);
    try {
        // execFile passes the paths as discrete arguments (no shell), so quotes,
        // `$` or backticks in a file name cannot break or inject into the command.
        // `-o` overwrites existing files without prompting, matching the AdmZip
        // fallback below and avoiding a hang on unzip's interactive prompt.
        const { stderr } = await execFileAsync('unzip', ['-q', '-o', mecaPath, '-d', extractedDir]);
        // NOTE(review): this reports stderr only when it does NOT mention
        // "warning"; kept as-is, but the condition looks inverted — confirm intent.
        if (stderr && !stderr.includes('warning')) {
            console.warn(` ⚠️ Unzip warnings: ${stderr}`);
        }
    }
    catch (error) {
        if (error && error.code === 'ENOENT') {
            // `unzip` is not available on this system: fall back to AdmZip
            const zip = new AdmZip(mecaPath);
            zip.extractAllTo(extractedDir, true);
        }
        else {
            console.error(` ❌ Unzip failed: ${error}`);
            throw new Error(`Failed to extract MECA file with unzip: ${error}`);
        }
    }
    return extractedDir;
}
|
|
118
|
+
/**
 * Extract only `manifest.xml` and the JATS article XML from a MECA archive
 * into `<outputDir>/<archive file name without extension>`.
 *
 * The JATS location is taken from the manifest: the first `application/xml`
 * instance of an `article` item whose href ends in `.xml` and mentions neither
 * "manifest" nor "directives".
 *
 * @param mecaPath Local path of the MECA file
 * @param outputDir Parent directory for the extraction directory
 * @returns The directory the two files were extracted to
 * @throws Error when the manifest or the JATS entry is missing, or when the
 *   manifest href would resolve outside the extraction directory
 */
async function extractMecaSelective(mecaPath, outputDir) {
    const extractedDir = path.join(outputDir, path.basename(mecaPath, path.extname(mecaPath)));
    // `recursive: true` makes this a no-op when the directory already exists
    fs.mkdirSync(extractedDir, { recursive: true });
    console.log(' 📦 Using selective extraction (manifest + JATS only)...');
    const zip = new AdmZip(mecaPath);
    if (!zip.getEntry('manifest.xml')) {
        throw new Error('Manifest not found in MECA file');
    }
    zip.extractEntryTo('manifest.xml', extractedDir, false, true);
    console.log(' 📋 Manifest extracted');
    const manifest = await parseManifest(extractedDir);
    // Locate the JATS href (relative to the archive root) in the manifest
    const isJatsInstance = (instance) => instance['@_media-type'] === 'application/xml' &&
        instance['@_href'].endsWith('.xml') &&
        !instance['@_href'].includes('manifest') &&
        !instance['@_href'].includes('directives');
    let jatsRelativePath = null;
    for (const item of manifest.item) {
        if (item['@_type'] !== 'article') {
            continue;
        }
        const match = item.instance.find((instance) => isJatsInstance(instance));
        if (match) {
            // Normalize path separators to Unix style
            jatsRelativePath = match['@_href'].replace(/\\/g, '/');
            break;
        }
    }
    if (!jatsRelativePath) {
        throw new Error('No JATS XML file found in manifest');
    }
    console.log(` 📄 Found JATS file in manifest: ${jatsRelativePath}`);
    // The href comes from inside the archive, so treat it as untrusted: refuse
    // anything (e.g. "../../x.xml") that would be written outside extractedDir.
    const jatsTargetPath = path.join(extractedDir, jatsRelativePath);
    const relativeToRoot = path.relative(path.resolve(extractedDir), path.resolve(jatsTargetPath));
    if (relativeToRoot === '..' ||
        relativeToRoot.startsWith(`..${path.sep}`) ||
        path.isAbsolute(relativeToRoot)) {
        throw new Error(`Refusing to extract JATS file outside of ${extractedDir}: ${jatsRelativePath}`);
    }
    // Entries may be stored with either separator style inside the ZIP
    const jatsEntry = zip.getEntry(jatsRelativePath) || zip.getEntry(jatsRelativePath.replace(/\//g, '\\'));
    if (!jatsEntry) {
        throw new Error(`Could not extract JATS file: ${jatsRelativePath}`);
    }
    console.log(` 🔍 Extracting JATS file: ${jatsRelativePath} to ${extractedDir}`);
    const jatsTargetDir = path.dirname(jatsTargetPath);
    if (!fs.existsSync(jatsTargetDir)) {
        fs.mkdirSync(jatsTargetDir, { recursive: true });
        console.log(` 📁 Created directory: ${jatsTargetDir}`);
    }
    // Write the entry's bytes ourselves so the manifest's directory layout is kept
    fs.writeFileSync(jatsTargetPath, jatsEntry.getData());
    console.log(` 📄 JATS file extracted: ${path.basename(jatsRelativePath)}`);
    console.log(` 🔍 Verifying file exists at: ${jatsTargetPath}`);
    if (!fs.existsSync(jatsTargetPath)) {
        throw new Error(`JATS file was not extracted successfully to: ${jatsTargetPath}`);
    }
    console.log(` ✅ JATS file verified at: ${jatsTargetPath}`);
    return extractedDir;
}
|
|
204
|
+
/**
 * Read `manifest.xml` from an extraction directory and convert it into the
 * plain-object structure produced by `parseManifestStructure`.
 *
 * @param extractedDir Directory that contains `manifest.xml`
 * @returns The parsed manifest (`{ item: [...] }`)
 * @throws Error when `manifest.xml` does not exist
 */
async function parseManifest(extractedDir) {
    const manifestPath = path.join(extractedDir, 'manifest.xml');
    const manifestExists = fs.existsSync(manifestPath);
    if (!manifestExists) {
        throw new Error('Manifest file not found');
    }
    const rawXml = fs.readFileSync(manifestPath, 'utf-8');
    // Repair entities / declaration placement before handing the text to the XML parser
    const tree = fromXml(preprocessXMLContent(rawXml));
    return parseManifestStructure(tree);
}
|
|
217
|
+
/**
 * Convert a parsed manifest XAST into plain objects.
 *
 * Each `<item>` becomes `{ '@_id', '@_type', '@_title', instance: [...] }` and
 * each `<instance>` becomes `{ '@_media-type', '@_href' }`. Missing values are
 * stored as empty strings; `'@_title'` is the item's concatenated text content.
 *
 * @param ast XAST tree of the manifest document
 * @returns `{ item: [...] }`
 * @throws Error when no `<manifest>` element is present
 */
function parseManifestStructure(ast) {
    const manifest = findElement(ast, 'manifest');
    if (!manifest) {
        throw new Error('Manifest element not found');
    }
    const toInstance = (instanceNode) => ({
        '@_media-type': getAttribute(instanceNode, 'media-type') || '',
        '@_href': getAttribute(instanceNode, 'href') || '',
    });
    const toItem = (itemNode) => ({
        '@_id': getAttribute(itemNode, 'id') || '',
        '@_type': getAttribute(itemNode, 'type') || '',
        '@_title': getTextContent(itemNode) || '',
        instance: findElements(itemNode, 'instance').map((node) => toInstance(node)),
    });
    return {
        item: findElements(manifest, 'item').map((node) => toItem(node)),
    };
}
|
|
248
|
+
function findJATSFile(manifest, extractedDir) {
|
|
249
|
+
for (const item of manifest.item) {
|
|
250
|
+
if (item['@_type'] === 'article') {
|
|
251
|
+
for (const instance of item.instance) {
|
|
252
|
+
if (instance['@_media-type'] === 'application/xml' &&
|
|
253
|
+
instance['@_href'].endsWith('.xml') &&
|
|
254
|
+
!instance['@_href'].includes('manifest') &&
|
|
255
|
+
!instance['@_href'].includes('directives')) {
|
|
256
|
+
// Normalize path separators to Unix style
|
|
257
|
+
const normalizedPath = instance['@_href'].replace(/\\/g, '/');
|
|
258
|
+
return path.join(extractedDir, normalizedPath);
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
return null;
|
|
264
|
+
}
|
|
265
|
+
/**
 * Parse a JATS XML file and pull out the metadata used for the paper record.
 *
 * @param jatsFile Path of the JATS XML file on disk
 * @returns `{ doi, version, articleVersion, receivedDate, acceptedDate, title }`
 * @throws Whatever the XML parser or the individual extractors throw
 *   (e.g. missing DOI, no usable dates)
 */
async function parseJATS(jatsFile) {
    const rawXml = fs.readFileSync(jatsFile, 'utf-8');
    // Repair entities / declaration placement before handing the text to the XML parser
    const tree = fromXml(preprocessXMLContent(rawXml));
    const doi = extractDOI(tree);
    const { version, articleVersion } = extractVersion(tree);
    const { receivedDate, acceptedDate } = extractDates(tree);
    const title = extractTitle(tree);
    return { doi, version, articleVersion, receivedDate, acceptedDate, title };
}
|
|
285
|
+
/**
|
|
286
|
+
* Preprocess XML content to fix common HTML entities that cause parsing errors
|
|
287
|
+
* @param xmlContent Raw XML content
|
|
288
|
+
* @returns Preprocessed XML content with entities replaced
|
|
289
|
+
*/
|
|
290
|
+
export function preprocessXMLContent(xmlContent) {
|
|
291
|
+
// Handle cases where XML declaration is not on the first line or has leading whitespace
|
|
292
|
+
// Split content into lines and look for XML declaration
|
|
293
|
+
const lines = xmlContent.split('\n');
|
|
294
|
+
const xmlDeclarationIndex = lines.findIndex((line) => line.trim().startsWith('<?xml'));
|
|
295
|
+
if (xmlDeclarationIndex >= 0) {
|
|
296
|
+
if (xmlDeclarationIndex === 0) {
|
|
297
|
+
// XML declaration is on first line but may have leading whitespace
|
|
298
|
+
// Trim the first line to remove leading whitespace
|
|
299
|
+
lines[0] = lines[0].trim();
|
|
300
|
+
// Reconstruct the content
|
|
301
|
+
xmlContent = lines.join('\n');
|
|
302
|
+
}
|
|
303
|
+
else if (xmlDeclarationIndex < 5) {
|
|
304
|
+
// XML declaration is found but not on first line, reorder lines
|
|
305
|
+
const xmlDeclaration = lines[xmlDeclarationIndex];
|
|
306
|
+
// Remove the XML declaration from its current position
|
|
307
|
+
lines.splice(xmlDeclarationIndex, 1);
|
|
308
|
+
// Insert it at the beginning (without leading whitespace)
|
|
309
|
+
lines.unshift(xmlDeclaration.trim());
|
|
310
|
+
// Reconstruct the content
|
|
311
|
+
xmlContent = lines.join('\n');
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
// One specific case in January 2019
|
|
315
|
+
xmlContent = xmlContent.replace('<fn id="n1"fn-type="equal">', '<fn id="n1" fn-type="equal">');
|
|
316
|
+
// Define all valid HTML entities that we recognize
|
|
317
|
+
const validEntities = Object.keys(characterEntities);
|
|
318
|
+
// First, escape any unescaped ampersands that cause "Unterminated reference" errors
|
|
319
|
+
// This handles cases like "Bill & Melinda" where & is not properly escaped
|
|
320
|
+
const validEntityPattern = validEntities.join('|');
|
|
321
|
+
let processedContent = xmlContent.replace(new RegExp(`&(?!(?:${validEntityPattern}|#\\d+);)`, 'g'), '&');
|
|
322
|
+
// Now replace HTML entities with their Unicode equivalents
|
|
323
|
+
// This handles cases like –, <, etc.
|
|
324
|
+
// Note: We do NOT convert & to & to avoid circular problems
|
|
325
|
+
const entityReplacements = {
|
|
326
|
+
...Object.fromEntries(Object.entries(characterEntities).map(([key, value]) => [`&${key};`, value])),
|
|
327
|
+
'<': '<', // less than
|
|
328
|
+
'>': '>', // greater than
|
|
329
|
+
'&': '&', // ampersand
|
|
330
|
+
};
|
|
331
|
+
// Replace HTML entities
|
|
332
|
+
for (const [entity, replacement] of Object.entries(entityReplacements)) {
|
|
333
|
+
processedContent = processedContent.replace(new RegExp(entity, 'g'), replacement);
|
|
334
|
+
}
|
|
335
|
+
return processedContent;
|
|
336
|
+
}
|
|
337
|
+
// Helper functions to navigate the XAST
|
|
338
|
+
function findElement(node, name) {
|
|
339
|
+
if (node.type === 'element' && node.name === name) {
|
|
340
|
+
return node;
|
|
341
|
+
}
|
|
342
|
+
if (node.children) {
|
|
343
|
+
for (const child of node.children) {
|
|
344
|
+
if (child.type === 'element') {
|
|
345
|
+
const found = findElement(child, name);
|
|
346
|
+
if (found)
|
|
347
|
+
return found;
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
return null;
|
|
352
|
+
}
|
|
353
|
+
function findElements(node, name) {
|
|
354
|
+
const results = [];
|
|
355
|
+
if (node.type === 'element' && node.name === name) {
|
|
356
|
+
results.push(node);
|
|
357
|
+
}
|
|
358
|
+
if (node.children) {
|
|
359
|
+
for (const child of node.children) {
|
|
360
|
+
if (child.type === 'element') {
|
|
361
|
+
results.push(...findElements(child, name));
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
return results;
|
|
366
|
+
}
|
|
367
|
+
/**
 * Read an attribute from an XAST element as a string.
 *
 * @param node Node whose `attributes` map is consulted
 * @param name Attribute name
 * @returns The attribute value converted with `String()`, or `null` when the
 *   node has no attributes, the attribute is absent, or its value is falsy
 *   (an empty string counts as absent)
 */
function getAttribute(node, name) {
    const attributes = node.attributes;
    if (!attributes) {
        return null;
    }
    const raw = attributes[name];
    if (!raw) {
        return null;
    }
    return String(raw);
}
|
|
374
|
+
/**
 * Extract the article DOI from a JATS tree.
 *
 * Uses the first `<article-id pub-id-type="doi">` element that has non-empty
 * text content.
 *
 * @param ast XAST tree of the JATS document
 * @returns The DOI, trimmed
 * @throws Error when no such element exists
 */
function extractDOI(ast) {
    const doiNodes = findElements(ast, 'article-id').filter((node) => getAttribute(node, 'pub-id-type') === 'doi');
    for (const doiNode of doiNodes) {
        const text = getTextContent(doiNode);
        if (text) {
            return text.trim();
        }
    }
    throw new Error('DOI not found in JATS XML');
}
|
|
389
|
+
function extractVersion(ast) {
|
|
390
|
+
// Look for article-version
|
|
391
|
+
const versionElement = findElement(ast, 'article-version');
|
|
392
|
+
if (versionElement) {
|
|
393
|
+
const textContent = getTextContent(versionElement);
|
|
394
|
+
if (textContent) {
|
|
395
|
+
const version = textContent.trim();
|
|
396
|
+
// Extract the version number from strings like "1.4" -> articleVersion = 4
|
|
397
|
+
const versionParts = version.split('.');
|
|
398
|
+
const articleVersion = versionParts.length > 1 ? parseInt(versionParts[1]) : 1;
|
|
399
|
+
return { version, articleVersion };
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
return { version: '1.1', articleVersion: 1 }; // Default version
|
|
403
|
+
}
|
|
404
|
+
function extractDates(ast) {
|
|
405
|
+
let receivedDate;
|
|
406
|
+
let acceptedDate;
|
|
407
|
+
// Look for dates in history section
|
|
408
|
+
const history = findElement(ast, 'history');
|
|
409
|
+
if (history) {
|
|
410
|
+
const dates = findElements(history, 'date');
|
|
411
|
+
for (const date of dates) {
|
|
412
|
+
const dateType = getAttribute(date, 'date-type');
|
|
413
|
+
const yearElement = findElement(date, 'year');
|
|
414
|
+
const monthElement = findElement(date, 'month');
|
|
415
|
+
const dayElement = findElement(date, 'day');
|
|
416
|
+
if (yearElement && monthElement && dayElement) {
|
|
417
|
+
const year = getTextContent(yearElement);
|
|
418
|
+
const month = getTextContent(monthElement);
|
|
419
|
+
const day = getTextContent(dayElement);
|
|
420
|
+
if (year && month && day) {
|
|
421
|
+
const dateString = `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
|
|
422
|
+
if (dateType === 'received') {
|
|
423
|
+
receivedDate = dateString;
|
|
424
|
+
}
|
|
425
|
+
else if (dateType === 'accepted') {
|
|
426
|
+
acceptedDate = dateString;
|
|
427
|
+
}
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
// Fallback: look for pub-date with pub-type="epub" (bioRxiv format)
|
|
433
|
+
if (!receivedDate) {
|
|
434
|
+
const pubDates = findElements(ast, 'pub-date');
|
|
435
|
+
for (const date of pubDates) {
|
|
436
|
+
const pubType = getAttribute(date, 'pub-type');
|
|
437
|
+
if (pubType === 'epub') {
|
|
438
|
+
const yearElement = findElement(date, 'year');
|
|
439
|
+
const monthElement = findElement(date, 'month');
|
|
440
|
+
const dayElement = findElement(date, 'day');
|
|
441
|
+
if (yearElement && monthElement && dayElement) {
|
|
442
|
+
const year = getTextContent(yearElement);
|
|
443
|
+
const month = getTextContent(monthElement);
|
|
444
|
+
const day = getTextContent(dayElement);
|
|
445
|
+
if (year && month && day) {
|
|
446
|
+
receivedDate = `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
|
|
447
|
+
break;
|
|
448
|
+
}
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
// If no received date found, fall back to accepted date
|
|
454
|
+
if (!receivedDate) {
|
|
455
|
+
if (acceptedDate) {
|
|
456
|
+
console.log(`⚠️ No received date found, falling back to accepted date: ${acceptedDate}`);
|
|
457
|
+
receivedDate = acceptedDate;
|
|
458
|
+
}
|
|
459
|
+
else {
|
|
460
|
+
throw new Error('Neither received date nor accepted date found in JATS XML');
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
return { receivedDate, acceptedDate };
|
|
464
|
+
}
|
|
465
|
+
/**
 * Extract the article title from a JATS tree.
 *
 * @param ast XAST tree of the JATS document
 * @returns Trimmed text content of the first `<article-title>` element, or
 *   `undefined` when the element is missing or has no text
 */
function extractTitle(ast) {
    const titleElement = findElement(ast, 'article-title');
    const rawTitle = titleElement ? getTextContent(titleElement) : null;
    return rawTitle ? rawTitle.trim() : undefined;
}
|
|
476
|
+
function getTextContent(node) {
|
|
477
|
+
if (node.children) {
|
|
478
|
+
let text = '';
|
|
479
|
+
for (const child of node.children) {
|
|
480
|
+
if (child.type === 'text' && child.value) {
|
|
481
|
+
text += child.value;
|
|
482
|
+
}
|
|
483
|
+
else if (child.type === 'element' && child.children) {
|
|
484
|
+
const childText = getTextContent(child);
|
|
485
|
+
if (childText) {
|
|
486
|
+
text += childText;
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
}
|
|
490
|
+
return text || null;
|
|
491
|
+
}
|
|
492
|
+
return null;
|
|
493
|
+
}
|
|
494
|
+
/**
 * POST a paper record to `<apiUrl>/v1/works` as JSON.
 *
 * @param paperData Record to send as the request body
 * @param apiUrl Base URL of the API (no trailing `/v1/works`)
 * @param apiKey Optional key; when truthy it is sent as a `Bearer` token
 * @returns The response body (`response.data`)
 */
async function postToAPI(paperData, apiUrl, apiKey) {
    const authHeader = apiKey ? { Authorization: `Bearer ${apiKey}` } : {};
    const headers = { 'Content-Type': 'application/json', ...authHeader };
    const endpoint = `${apiUrl}/v1/works`;
    const response = await axios.post(endpoint, paperData, { headers });
    return response.data;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"meca-processor.test.d.ts","sourceRoot":"","sources":["../../src/utils/meca-processor.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
2
|
+
import { preprocessXMLContent } from './meca-processor.js';
|
|
3
|
+
// Replace the `fs` default export with stubs so importing the module under
// test never touches the real file system.
vi.mock('fs', () => {
    // Built inside the factory: vitest hoists vi.mock above the imports, so
    // the factory must not reference outer variables.
    const fsStub = {
        statSync: vi.fn(),
        existsSync: vi.fn(),
        readFileSync: vi.fn(),
        writeFileSync: vi.fn(),
        mkdirSync: vi.fn(),
    };
    return { default: fsStub };
});
|
|
13
|
+
// Ampersand / entity handling of preprocessXMLContent.
// NOTE(review): the entity spellings in these literals were reconstructed from
// the test names (the literals had been entity-decoded, which made every input
// equal to its expected value) — confirm against the original test source.
describe('preprocessXMLContent - Ampersand Handling', () => {
    it.each([
        {
            name: 'should escape unescaped ampersands',
            input: 'Bill & Melinda Gates Foundation',
            expected: 'Bill &amp; Melinda Gates Foundation',
        },
        {
            name: 'should preserve already escaped ampersands',
            input: 'Bill &amp; Melinda Gates Foundation',
            expected: 'Bill &amp; Melinda Gates Foundation',
        },
        {
            name: 'should preserve numeric entities for ampersands',
            input: 'Bill &#38; Melinda Gates Foundation',
            expected: 'Bill &#38; Melinda Gates Foundation',
        },
        {
            name: 'should handle mixed ampersand scenarios',
            input: 'Company & Associates &amp; Partners &lt; 100',
            expected: 'Company &amp; Associates &amp; Partners &lt; 100',
        },
        {
            name: 'should handle complex academic text with various ampersands',
            input: 'The Bill & Melinda Gates Foundation &amp; European Commission&#39;s Horizon 2020',
            expected: 'The Bill &amp; Melinda Gates Foundation &amp; European Commission&#39;s Horizon 2020',
        },
        {
            name: 'should handle multiple unescaped ampersands in sequence',
            input: 'A & B & C & D',
            expected: 'A &amp; B &amp; C &amp; D',
        },
        {
            name: 'should preserve valid HTML entities while escaping unescaped ampersands',
            input: 'Temperature &lt; 5 &deg;C & Pressure &gt; 100 &mu;Pa',
            expected: 'Temperature &lt; 5 °C &amp; Pressure &gt; 100 μPa',
        },
        {
            name: 'rsquor',
            input: 'dataset&rsquor;s terms of use.',
            expected: 'dataset’s terms of use.',
        },
    ])('$name', ({ input, expected }) => {
        const result = preprocessXMLContent(input);
        expect(result).toBe(expected);
    });
});
|
|
60
|
+
// XML-declaration placement of preprocessXMLContent: a declaration found on
// lines 1-5 (indices 0-4) is moved/trimmed to the very start of the document;
// one found later is left where it is.
describe('preprocessXMLContent - XML Declaration Reordering', () => {
    it('should reorder XML declaration to first line when it appears after DOCTYPE', () => {
        const input = `<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.2d1 20170631//EN" "JATS-archivearticle1.dtd">
<?xml version="1.0" encoding="UTF-8"?>
<article xmlns:mml="http://www.w3.org/1998/Math/MathML">
<title>Test Article</title>
</article>`;
        const result = preprocessXMLContent(input);
        // Check that XML declaration is now first
        expect(result.startsWith('<?xml version="1.0" encoding="UTF-8"?>')).toBe(true);
        // Check that DOCTYPE is still present
        expect(result.includes('<!DOCTYPE article PUBLIC')).toBe(true);
        // Check that the content is preserved
        expect(result).toContain('<article xmlns:mml="http://www.w3.org/1998/Math/MathML">');
        expect(result).toContain('<title>Test Article</title>');
    });
    it('should trim leading whitespace from XML declaration on first line', () => {
        const input = ` <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.2d1 20170631//EN" "JATS-archivearticle1.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML">
<title>Test Article</title>
</article>`;
        const result = preprocessXMLContent(input);
        // Check that XML declaration is first and has no leading whitespace
        expect(result.startsWith('<?xml version="1.0" encoding="UTF-8"?>')).toBe(true);
        // Check that DOCTYPE is still present
        expect(result.includes('<!DOCTYPE article PUBLIC')).toBe(true);
        // Check that the content is preserved
        expect(result).toContain('<article xmlns:mml="http://www.w3.org/1998/Math/MathML">');
        expect(result).toContain('<title>Test Article</title>');
    });
    it('should not reorder when XML declaration is already first', () => {
        const input = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.2d1 20170631//EN" "JATS-archivearticle1.dtd">
<article>
<title>Test Article</title>
</article>`;
        const result = preprocessXMLContent(input);
        // Should remain unchanged
        expect(result).toBe(input);
    });
    it('should not reorder when XML declaration is beyond the 5th line', () => {
        const input = `<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.2d1 20170631//EN" "JATS-archivearticle1.dtd">
<article>
<title>Test Article</title>
<abstract>This is a test abstract</abstract>
<body>Test body content</body>
<?xml version="1.0" encoding="UTF-8"?>
</article>`;
        const result = preprocessXMLContent(input);
        // Should not reorder: the XML declaration is on the 6th line (index 5)
        // and reordering only happens within the first 5 lines (indices 0-4)
        expect(result).toBe(input);
    });
    it('should handle case with no XML declaration', () => {
        const input = `<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.2d1 20170631//EN" "JATS-archivearticle1.dtd">
<article>
<title>Test Article</title>
</article>`;
        const result = preprocessXMLContent(input);
        // Should remain unchanged
        expect(result).toBe(input);
    });
});
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Month utility functions for batch processing
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Generate a range of months to process backwards from current month to 2018-12
 *
 * @returns Array of month strings (presumably `YYYY-MM`, newest first — confirm against the implementation)
 */
export declare function generateMonthRange(): string[];
/**
 * Parse month input and return array of months to process
 *
 * @param monthInput User-supplied month specification (accepted forms are defined by the implementation, not visible here)
 * @returns Array of month strings to process
 */
export declare function parseMonthInput(monthInput: string): string[];
/**
 * Parse wildcard pattern like "2025-*" to get all months in that year
 *
 * @param pattern Wildcard pattern such as "2025-*"
 * @returns Array of month strings matched by the pattern
 */
export declare function parseWildcardPattern(pattern: string): string[];
/**
 * Validate month format (YYYY-MM)
 *
 * @param month String to validate
 * @returns Whether the string is accepted as a `YYYY-MM` month
 */
export declare function validateMonthFormat(month: string): boolean;
/**
 * Sort months chronologically (oldest first)
 *
 * @param months Month strings to sort
 * @returns The months ordered oldest first (whether the input array is mutated is not visible here — TODO confirm)
 */
export declare function sortMonthsChronologically(months: string[]): string[];
/**
 * Get current month in YYYY-MM format
 *
 * @returns The current month as `YYYY-MM`
 */
export declare function getCurrentMonth(): string;
/**
 * Get previous month in YYYY-MM format
 *
 * @returns The month before the current one as `YYYY-MM`
 */
export declare function getPreviousMonth(): string;
/**
 * Check if a month is in the future
 *
 * @param month Month string (presumably `YYYY-MM` — confirm against the implementation)
 * @returns Whether the month is in the future
 */
export declare function isFutureMonth(month: string): boolean;
|
|
36
|
+
//# sourceMappingURL=months.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"months.d.ts","sourceRoot":"","sources":["../../src/utils/months.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,wBAAgB,kBAAkB,IAAI,MAAM,EAAE,CAuB7C;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,EAAE,CA6B5D;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAwB9D;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAW1D;AAED;;GAEG;AACH,wBAAgB,yBAAyB,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,EAAE,CAUpE;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,MAAM,CAKxC;AAED;;GAEG;AACH,wBAAgB,gBAAgB,IAAI,MAAM,CAMzC;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAOpD"}
|