openrxiv-cli 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/dist/api/api-client.d.ts +96 -0
  2. package/dist/api/api-client.d.ts.map +1 -0
  3. package/dist/api/api-client.js +257 -0
  4. package/dist/aws/bucket-explorer.d.ts +26 -0
  5. package/dist/aws/bucket-explorer.d.ts.map +1 -0
  6. package/dist/aws/bucket-explorer.js +220 -0
  7. package/dist/aws/config.d.ts +5 -0
  8. package/dist/aws/config.d.ts.map +1 -0
  9. package/dist/aws/config.js +36 -0
  10. package/dist/aws/downloader.d.ts +13 -0
  11. package/dist/aws/downloader.d.ts.map +1 -0
  12. package/dist/aws/downloader.js +115 -0
  13. package/dist/aws/month-lister.d.ts +18 -0
  14. package/dist/aws/month-lister.d.ts.map +1 -0
  15. package/dist/aws/month-lister.js +90 -0
  16. package/dist/commands/batch-info.d.ts +3 -0
  17. package/dist/commands/batch-info.d.ts.map +1 -0
  18. package/dist/commands/batch-info.js +213 -0
  19. package/dist/commands/batch-process.d.ts +3 -0
  20. package/dist/commands/batch-process.d.ts.map +1 -0
  21. package/dist/commands/batch-process.js +557 -0
  22. package/dist/commands/download.d.ts +3 -0
  23. package/dist/commands/download.d.ts.map +1 -0
  24. package/dist/commands/download.js +76 -0
  25. package/dist/commands/index.d.ts +6 -0
  26. package/dist/commands/index.d.ts.map +1 -0
  27. package/dist/commands/index.js +5 -0
  28. package/dist/commands/list.d.ts +3 -0
  29. package/dist/commands/list.d.ts.map +1 -0
  30. package/dist/commands/list.js +18 -0
  31. package/dist/commands/summary.d.ts +3 -0
  32. package/dist/commands/summary.d.ts.map +1 -0
  33. package/dist/commands/summary.js +249 -0
  34. package/dist/index.d.ts +7 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +35 -0
  37. package/dist/utils/batches.d.ts +9 -0
  38. package/dist/utils/batches.d.ts.map +1 -0
  39. package/dist/utils/batches.js +61 -0
  40. package/dist/utils/batches.test.d.ts +2 -0
  41. package/dist/utils/batches.test.d.ts.map +1 -0
  42. package/dist/utils/batches.test.js +119 -0
  43. package/dist/utils/default-server.d.ts +3 -0
  44. package/dist/utils/default-server.d.ts.map +1 -0
  45. package/dist/utils/default-server.js +20 -0
  46. package/dist/utils/index.d.ts +5 -0
  47. package/dist/utils/index.d.ts.map +1 -0
  48. package/dist/utils/index.js +5 -0
  49. package/dist/utils/meca-processor.d.ts +28 -0
  50. package/dist/utils/meca-processor.d.ts.map +1 -0
  51. package/dist/utils/meca-processor.js +503 -0
  52. package/dist/utils/meca-processor.test.d.ts +2 -0
  53. package/dist/utils/meca-processor.test.d.ts.map +1 -0
  54. package/dist/utils/meca-processor.test.js +123 -0
  55. package/dist/utils/months.d.ts +36 -0
  56. package/dist/utils/months.d.ts.map +1 -0
  57. package/dist/utils/months.js +135 -0
  58. package/dist/utils/months.test.d.ts +2 -0
  59. package/dist/utils/months.test.d.ts.map +1 -0
  60. package/dist/utils/months.test.js +209 -0
  61. package/dist/utils/requester-pays-error.d.ts +6 -0
  62. package/dist/utils/requester-pays-error.d.ts.map +1 -0
  63. package/dist/utils/requester-pays-error.js +20 -0
  64. package/dist/version.d.ts +3 -0
  65. package/dist/version.d.ts.map +1 -0
  66. package/dist/version.js +2 -0
  67. package/package.json +67 -0
@@ -0,0 +1,503 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import { fromXml } from 'xast-util-from-xml';
4
+ import axios from 'axios';
5
+ import AdmZip from 'adm-zip';
6
+ import { characterEntities } from 'character-entities';
7
+ import { exec } from 'child_process';
8
+ import { promisify } from 'util';
9
+ const execAsync = promisify(exec);
10
/**
 * Ingest one MECA archive end to end: extract it, read the manifest, parse the
 * JATS metadata and register the resulting paper record with the API.
 *
 * Any failure along the way is caught and reported through the return value
 * rather than thrown.
 * @param mecaPath Path to the MECA file (local file path)
 * @param options Processing options (output, selective, batch, server, s3Key, apiUrl, apiKey)
 * @returns ProcessMecaResult: `{ success: true, paper }` or `{ success: false, error }`
 */
export async function processMecaFile(mecaPath, options) {
    try {
        console.log(`🔍 Processing MECA file: ${mecaPath}`);
        // The output directory is optional; only create it when one was requested
        const outputMissing = options.output && !fs.existsSync(options.output);
        if (outputMissing) {
            fs.mkdirSync(options.output, { recursive: true });
        }
        // Extraction strategy (unzip / selective / full) is chosen by extractMecaAuto
        const extractedDir = await extractMecaAuto(mecaPath, options.output, options.selective);
        console.log(`📂 Extracted to: ${extractedDir}`);
        const manifest = await parseManifest(extractedDir);
        console.log(`📋 Found ${manifest.item.length} items`);
        const jatsFile = findJATSFile(manifest, extractedDir);
        if (!jatsFile) {
            throw new Error('No JATS XML file found in manifest');
        }
        console.log(`📄 JATS file found: ${jatsFile}`);
        console.log(`🔍 Starting JATS parsing...`);
        let jatsData;
        try {
            jatsData = await parseJATS(jatsFile);
            console.log(`🔍 JATS parsed - DOI: ${jatsData.doi}, Version: ${jatsData.version}, Received Date: ${jatsData.receivedDate}`);
        }
        catch (jatsError) {
            // Log with context, then let the outer handler turn it into a failure result
            console.error('❌ Error parsing JATS:', jatsError);
            throw jatsError;
        }
        // Dates are sent as ISO timestamps; acceptedDate is optional
        const toIso = (dateString) => new Date(dateString).toISOString();
        const paperData = {
            doi: jatsData.doi,
            version: jatsData.articleVersion,
            receivedDate: toIso(jatsData.receivedDate),
            acceptedDate: jatsData.acceptedDate ? toIso(jatsData.acceptedDate) : undefined,
            batch: options.batch,
            server: options.server,
            s3Key: options.s3Key,
            fileSize: fs.statSync(mecaPath).size,
            title: jatsData.title,
        };
        const paper = await postToAPI(paperData, options.apiUrl, options.apiKey);
        console.log('✅ Paper added to database');
        return { success: true, paper };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown error';
        return { success: false, error: message };
    }
}
75
/**
 * Pick an extraction strategy for a MECA archive and run it.
 *
 * Archives above the size threshold always go through extractMeca; smaller ones
 * use selective extraction unless `selective` is false.
 * @param mecaPath Path to the MECA file
 * @param outputDir Directory to extract into (defaults to './downloads')
 * @param selective Whether to extract only the manifest and JATS file (defaults to true)
 * @returns Path of the directory the archive was extracted to
 */
async function extractMecaAuto(mecaPath, outputDir = './downloads', selective = true) {
    const LARGE_FILE_THRESHOLD_GB = 1.9;
    const BYTES_PER_GB = 1024 * 1024 * 1024;
    const fileSizeGB = fs.statSync(mecaPath).size / BYTES_PER_GB;
    if (fileSizeGB > LARGE_FILE_THRESHOLD_GB) {
        console.log(`🚨 File is larger than ${LARGE_FILE_THRESHOLD_GB} GB, using unzip for efficiency`);
        return await extractMeca(mecaPath, outputDir);
    }
    const extract = selective ? extractMecaSelective : extractMeca;
    return await extract(mecaPath, outputDir);
}
91
/**
 * Fully extract a MECA archive into `<outputDir>/<archive basename>`.
 *
 * Uses the system `unzip` binary when it is on the PATH and falls back to
 * AdmZip otherwise.
 * @param mecaPath Path to the MECA file
 * @param outputDir Parent directory for the extraction directory
 * @returns Path of the directory the archive was extracted to
 * @throws Error when `unzip` is available but exits with a failure
 */
async function extractMeca(mecaPath, outputDir) {
    const extractedDir = path.join(outputDir, path.basename(mecaPath, path.extname(mecaPath)));
    if (!fs.existsSync(extractedDir)) {
        fs.mkdirSync(extractedDir, { recursive: true });
    }
    // execFile passes arguments straight to the program without a shell, so
    // quotes, `$` or backticks in a path cannot break or inject into the command.
    const { execFile } = await import('child_process');
    const execFileAsync = promisify(execFile);
    // Check if unzip is available
    const unzipAvailable = await execFileAsync('which', ['unzip']).then(() => true, () => false);
    if (unzipAvailable) {
        try {
            // -q: quiet; -o: overwrite without prompting, matching the AdmZip fallback below
            // (a prompt would block forever because nothing is attached to stdin).
            const { stderr } = await execFileAsync('unzip', ['-q', '-o', mecaPath, '-d', extractedDir]);
            if (stderr && !stderr.includes('warning')) {
                console.warn(` ⚠️ Unzip warnings: ${stderr}`);
            }
        }
        catch (error) {
            console.error(` ❌ Unzip failed: ${error}`);
            throw new Error(`Failed to extract MECA file with unzip: ${error}`);
        }
    }
    else {
        // Fallback to AdmZip for full extraction (second argument: overwrite existing files)
        const zip = new AdmZip(mecaPath);
        zip.extractAllTo(extractedDir, true);
    }
    return extractedDir;
}
118
/**
 * Extract only the manifest and the JATS article XML from a MECA archive.
 *
 * The JATS location is taken from the manifest inside the archive, so it is
 * checked against the extraction directory before anything is written.
 * @param mecaPath Path to the MECA file
 * @param outputDir Parent directory for the extraction directory
 * @returns Path of the directory holding manifest.xml and the JATS file
 * @throws Error when the manifest or JATS entry is missing, or when the manifest
 *         points outside the extraction directory
 */
async function extractMecaSelective(mecaPath, outputDir) {
    const extractedDir = path.join(outputDir, path.basename(mecaPath, path.extname(mecaPath)));
    if (!fs.existsSync(extractedDir)) {
        fs.mkdirSync(extractedDir, { recursive: true });
    }
    console.log(' 📦 Using selective extraction (manifest + JATS only)...');
    const zip = new AdmZip(mecaPath);
    // The manifest tells us where the JATS file lives, so it has to come out first
    if (!zip.getEntry('manifest.xml')) {
        throw new Error('Manifest not found in MECA file');
    }
    zip.extractEntryTo('manifest.xml', extractedDir, false, true);
    console.log(' 📋 Manifest extracted');
    const manifest = await parseManifest(extractedDir);
    const jatsRelativePath = findJatsHrefInManifest(manifest);
    if (!jatsRelativePath) {
        throw new Error('No JATS XML file found in manifest');
    }
    console.log(` 📄 Found JATS file in manifest: ${jatsRelativePath}`);
    // The href is archive-controlled: refuse paths such as "../../x.xml" that
    // would make us write outside the extraction directory.
    const jatsTargetPath = path.join(extractedDir, jatsRelativePath);
    const relativeToRoot = path.relative(path.resolve(extractedDir), path.resolve(jatsTargetPath));
    const escapesRoot = relativeToRoot === '..' ||
        relativeToRoot.startsWith(`..${path.sep}`) ||
        path.isAbsolute(relativeToRoot);
    if (escapesRoot) {
        throw new Error(`Refusing to extract JATS file outside of ${extractedDir}: ${jatsRelativePath}`);
    }
    // Archives may store the entry with either separator style
    const jatsEntry = zip.getEntry(jatsRelativePath) || zip.getEntry(jatsRelativePath.replace(/\//g, '\\'));
    if (!jatsEntry) {
        throw new Error(`Could not extract JATS file: ${jatsRelativePath}`);
    }
    console.log(` 🔍 Extracting JATS file: ${jatsRelativePath} to ${extractedDir}`);
    const jatsTargetDir = path.dirname(jatsTargetPath);
    if (!fs.existsSync(jatsTargetDir)) {
        fs.mkdirSync(jatsTargetDir, { recursive: true });
        console.log(` 📁 Created directory: ${jatsTargetDir}`);
    }
    // Write the entry's bytes ourselves so the manifest's directory layout is preserved
    fs.writeFileSync(jatsTargetPath, jatsEntry.getData());
    console.log(` 📄 JATS file extracted: ${path.basename(jatsRelativePath)}`);
    console.log(` 🔍 Verifying file exists at: ${jatsTargetPath}`);
    // writeFileSync throws on failure, so this is only a final sanity check
    if (!fs.existsSync(jatsTargetPath)) {
        throw new Error(`JATS file was not extracted successfully to: ${jatsTargetPath}`);
    }
    console.log(` ✅ JATS file verified at: ${jatsTargetPath}`);
    return extractedDir;
}
/**
 * Find the href of the JATS article XML declared in a parsed manifest.
 * @returns The href with backslashes normalised to '/', or null when no
 *          'article' item has a matching XML instance
 */
function findJatsHrefInManifest(manifest) {
    for (const item of manifest.item) {
        if (item['@_type'] !== 'article') {
            continue;
        }
        for (const instance of item.instance) {
            const href = instance['@_href'];
            if (instance['@_media-type'] === 'application/xml' &&
                href.endsWith('.xml') &&
                !href.includes('manifest') &&
                !href.includes('directives')) {
                return href.replace(/\\/g, '/');
            }
        }
    }
    return null;
}
204
/**
 * Read `manifest.xml` from an extraction directory and convert it into the
 * manifest object produced by parseManifestStructure.
 * @param extractedDir Directory that contains manifest.xml
 * @throws Error when manifest.xml does not exist
 */
async function parseManifest(extractedDir) {
    const manifestPath = path.join(extractedDir, 'manifest.xml');
    if (!fs.existsSync(manifestPath)) {
        throw new Error('Manifest file not found');
    }
    const rawXml = fs.readFileSync(manifestPath, 'utf-8');
    // Repair entities / declaration placement before handing the text to the XML parser
    const cleanedXml = preprocessXMLContent(rawXml);
    return parseManifestStructure(fromXml(cleanedXml));
}
217
+ function parseManifestStructure(ast) {
218
+ // Navigate through the AST to find manifest content
219
+ const manifest = findElement(ast, 'manifest');
220
+ if (!manifest) {
221
+ throw new Error('Manifest element not found');
222
+ }
223
+ const items = findElements(manifest, 'item');
224
+ const itemData = items.map((item) => {
225
+ const id = getAttribute(item, 'id');
226
+ const type = getAttribute(item, 'type');
227
+ const title = getTextContent(item);
228
+ const instances = findElements(item, 'instance');
229
+ const instanceData = instances.map((instance) => {
230
+ const mediaType = getAttribute(instance, 'media-type');
231
+ const href = getAttribute(instance, 'href');
232
+ return {
233
+ '@_media-type': mediaType || '',
234
+ '@_href': href || '',
235
+ };
236
+ });
237
+ return {
238
+ '@_id': id || '',
239
+ '@_type': type || '',
240
+ '@_title': title || '',
241
+ instance: instanceData,
242
+ };
243
+ });
244
+ return {
245
+ item: itemData,
246
+ };
247
+ }
248
+ function findJATSFile(manifest, extractedDir) {
249
+ for (const item of manifest.item) {
250
+ if (item['@_type'] === 'article') {
251
+ for (const instance of item.instance) {
252
+ if (instance['@_media-type'] === 'application/xml' &&
253
+ instance['@_href'].endsWith('.xml') &&
254
+ !instance['@_href'].includes('manifest') &&
255
+ !instance['@_href'].includes('directives')) {
256
+ // Normalize path separators to Unix style
257
+ const normalizedPath = instance['@_href'].replace(/\\/g, '/');
258
+ return path.join(extractedDir, normalizedPath);
259
+ }
260
+ }
261
+ }
262
+ }
263
+ return null;
264
+ }
265
/**
 * Read a JATS XML file and pull out the metadata needed for a paper record.
 * @param jatsFile Path to the JATS XML file
 * @returns `{ doi, version, articleVersion, receivedDate, acceptedDate, title }`
 */
async function parseJATS(jatsFile) {
    const rawXml = fs.readFileSync(jatsFile, 'utf-8');
    // Entities and declaration placement are repaired before parsing
    const ast = fromXml(preprocessXMLContent(rawXml));
    // Extraction order is kept: DOI first, then version, dates and title
    const doi = extractDOI(ast);
    const { version, articleVersion } = extractVersion(ast);
    const { receivedDate, acceptedDate } = extractDates(ast);
    const title = extractTitle(ast);
    return { doi, version, articleVersion, receivedDate, acceptedDate, title };
}
285
+ /**
286
+ * Preprocess XML content to fix common HTML entities that cause parsing errors
287
+ * @param xmlContent Raw XML content
288
+ * @returns Preprocessed XML content with entities replaced
289
+ */
290
+ export function preprocessXMLContent(xmlContent) {
291
+ // Handle cases where XML declaration is not on the first line or has leading whitespace
292
+ // Split content into lines and look for XML declaration
293
+ const lines = xmlContent.split('\n');
294
+ const xmlDeclarationIndex = lines.findIndex((line) => line.trim().startsWith('<?xml'));
295
+ if (xmlDeclarationIndex >= 0) {
296
+ if (xmlDeclarationIndex === 0) {
297
+ // XML declaration is on first line but may have leading whitespace
298
+ // Trim the first line to remove leading whitespace
299
+ lines[0] = lines[0].trim();
300
+ // Reconstruct the content
301
+ xmlContent = lines.join('\n');
302
+ }
303
+ else if (xmlDeclarationIndex < 5) {
304
+ // XML declaration is found but not on first line, reorder lines
305
+ const xmlDeclaration = lines[xmlDeclarationIndex];
306
+ // Remove the XML declaration from its current position
307
+ lines.splice(xmlDeclarationIndex, 1);
308
+ // Insert it at the beginning (without leading whitespace)
309
+ lines.unshift(xmlDeclaration.trim());
310
+ // Reconstruct the content
311
+ xmlContent = lines.join('\n');
312
+ }
313
+ }
314
+ // One specific case in January 2019
315
+ xmlContent = xmlContent.replace('<fn id="n1"fn-type="equal">', '<fn id="n1" fn-type="equal">');
316
+ // Define all valid HTML entities that we recognize
317
+ const validEntities = Object.keys(characterEntities);
318
+ // First, escape any unescaped ampersands that cause "Unterminated reference" errors
319
+ // This handles cases like "Bill & Melinda" where & is not properly escaped
320
+ const validEntityPattern = validEntities.join('|');
321
+ let processedContent = xmlContent.replace(new RegExp(`&(?!(?:${validEntityPattern}|#\\d+);)`, 'g'), '&#38;');
322
+ // Now replace HTML entities with their Unicode equivalents
323
+ // This handles cases like &ndash;, &lt;, etc.
324
+ // Note: We do NOT convert &amp; to & to avoid circular problems
325
+ const entityReplacements = {
326
+ ...Object.fromEntries(Object.entries(characterEntities).map(([key, value]) => [`&${key};`, value])),
327
+ '&lt;': '&#60;', // less than
328
+ '&gt;': '&#62;', // greater than
329
+ '&amp;': '&#38;', // ampersand
330
+ };
331
+ // Replace HTML entities
332
+ for (const [entity, replacement] of Object.entries(entityReplacements)) {
333
+ processedContent = processedContent.replace(new RegExp(entity, 'g'), replacement);
334
+ }
335
+ return processedContent;
336
+ }
337
+ // Helper functions to navigate the XAST
338
+ function findElement(node, name) {
339
+ if (node.type === 'element' && node.name === name) {
340
+ return node;
341
+ }
342
+ if (node.children) {
343
+ for (const child of node.children) {
344
+ if (child.type === 'element') {
345
+ const found = findElement(child, name);
346
+ if (found)
347
+ return found;
348
+ }
349
+ }
350
+ }
351
+ return null;
352
+ }
353
+ function findElements(node, name) {
354
+ const results = [];
355
+ if (node.type === 'element' && node.name === name) {
356
+ results.push(node);
357
+ }
358
+ if (node.children) {
359
+ for (const child of node.children) {
360
+ if (child.type === 'element') {
361
+ results.push(...findElements(child, name));
362
+ }
363
+ }
364
+ }
365
+ return results;
366
+ }
367
+ function getAttribute(node, name) {
368
+ if (node.attributes && node.attributes[name]) {
369
+ const value = node.attributes[name];
370
+ return value ? String(value) : null;
371
+ }
372
+ return null;
373
+ }
374
+ function extractDOI(ast) {
375
+ // Look for article-id with pub-id-type="doi"
376
+ const articleIds = findElements(ast, 'article-id');
377
+ for (const id of articleIds) {
378
+ const pubIdType = getAttribute(id, 'pub-id-type');
379
+ if (pubIdType === 'doi') {
380
+ // Get the text content
381
+ const textContent = getTextContent(id);
382
+ if (textContent) {
383
+ return textContent.trim();
384
+ }
385
+ }
386
+ }
387
+ throw new Error('DOI not found in JATS XML');
388
+ }
389
+ function extractVersion(ast) {
390
+ // Look for article-version
391
+ const versionElement = findElement(ast, 'article-version');
392
+ if (versionElement) {
393
+ const textContent = getTextContent(versionElement);
394
+ if (textContent) {
395
+ const version = textContent.trim();
396
+ // Extract the version number from strings like "1.4" -> articleVersion = 4
397
+ const versionParts = version.split('.');
398
+ const articleVersion = versionParts.length > 1 ? parseInt(versionParts[1]) : 1;
399
+ return { version, articleVersion };
400
+ }
401
+ }
402
+ return { version: '1.1', articleVersion: 1 }; // Default version
403
+ }
404
+ function extractDates(ast) {
405
+ let receivedDate;
406
+ let acceptedDate;
407
+ // Look for dates in history section
408
+ const history = findElement(ast, 'history');
409
+ if (history) {
410
+ const dates = findElements(history, 'date');
411
+ for (const date of dates) {
412
+ const dateType = getAttribute(date, 'date-type');
413
+ const yearElement = findElement(date, 'year');
414
+ const monthElement = findElement(date, 'month');
415
+ const dayElement = findElement(date, 'day');
416
+ if (yearElement && monthElement && dayElement) {
417
+ const year = getTextContent(yearElement);
418
+ const month = getTextContent(monthElement);
419
+ const day = getTextContent(dayElement);
420
+ if (year && month && day) {
421
+ const dateString = `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
422
+ if (dateType === 'received') {
423
+ receivedDate = dateString;
424
+ }
425
+ else if (dateType === 'accepted') {
426
+ acceptedDate = dateString;
427
+ }
428
+ }
429
+ }
430
+ }
431
+ }
432
+ // Fallback: look for pub-date with pub-type="epub" (bioRxiv format)
433
+ if (!receivedDate) {
434
+ const pubDates = findElements(ast, 'pub-date');
435
+ for (const date of pubDates) {
436
+ const pubType = getAttribute(date, 'pub-type');
437
+ if (pubType === 'epub') {
438
+ const yearElement = findElement(date, 'year');
439
+ const monthElement = findElement(date, 'month');
440
+ const dayElement = findElement(date, 'day');
441
+ if (yearElement && monthElement && dayElement) {
442
+ const year = getTextContent(yearElement);
443
+ const month = getTextContent(monthElement);
444
+ const day = getTextContent(dayElement);
445
+ if (year && month && day) {
446
+ receivedDate = `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
447
+ break;
448
+ }
449
+ }
450
+ }
451
+ }
452
+ }
453
+ // If no received date found, fall back to accepted date
454
+ if (!receivedDate) {
455
+ if (acceptedDate) {
456
+ console.log(`⚠️ No received date found, falling back to accepted date: ${acceptedDate}`);
457
+ receivedDate = acceptedDate;
458
+ }
459
+ else {
460
+ throw new Error('Neither received date nor accepted date found in JATS XML');
461
+ }
462
+ }
463
+ return { receivedDate, acceptedDate };
464
+ }
465
+ function extractTitle(ast) {
466
+ // Look for article-title
467
+ const titleElement = findElement(ast, 'article-title');
468
+ if (titleElement) {
469
+ const textContent = getTextContent(titleElement);
470
+ if (textContent) {
471
+ return textContent.trim();
472
+ }
473
+ }
474
+ return undefined;
475
+ }
476
+ function getTextContent(node) {
477
+ if (node.children) {
478
+ let text = '';
479
+ for (const child of node.children) {
480
+ if (child.type === 'text' && child.value) {
481
+ text += child.value;
482
+ }
483
+ else if (child.type === 'element' && child.children) {
484
+ const childText = getTextContent(child);
485
+ if (childText) {
486
+ text += childText;
487
+ }
488
+ }
489
+ }
490
+ return text || null;
491
+ }
492
+ return null;
493
+ }
494
+ async function postToAPI(paperData, apiUrl, apiKey) {
495
+ const headers = {
496
+ 'Content-Type': 'application/json',
497
+ };
498
+ if (apiKey) {
499
+ headers['Authorization'] = `Bearer ${apiKey}`;
500
+ }
501
+ const response = await axios.post(`${apiUrl}/v1/works`, paperData, { headers });
502
+ return response.data;
503
+ }
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=meca-processor.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"meca-processor.test.d.ts","sourceRoot":"","sources":["../../src/utils/meca-processor.test.ts"],"names":[],"mappings":""}
@@ -0,0 +1,123 @@
1
+ import { describe, it, expect, vi } from 'vitest';
2
+ import { preprocessXMLContent } from './meca-processor.js';
3
+ // Mock fs module for testing
4
+ vi.mock('fs', () => ({
5
+ default: {
6
+ statSync: vi.fn(),
7
+ existsSync: vi.fn(),
8
+ readFileSync: vi.fn(),
9
+ writeFileSync: vi.fn(),
10
+ mkdirSync: vi.fn(),
11
+ },
12
+ }));
13
+ // Table-driven cases: each entry's `input` is passed to preprocessXMLContent and the
+ // result must equal `expected` exactly. Note that an already-escaped '&amp;' is
+ // expected to come back in numeric form ('&#38;'), not verbatim.
+ describe('preprocessXMLContent - Ampersand Handling', () => {
14
+ it.each([
15
+ {
16
+ name: 'should escape unescaped ampersands',
17
+ input: 'Bill & Melinda Gates Foundation',
18
+ expected: 'Bill &#38; Melinda Gates Foundation',
19
+ },
20
+ {
21
+ name: 'should preserve already escaped ampersands',
22
+ input: 'Bill &amp; Melinda Gates Foundation',
23
+ expected: 'Bill &#38; Melinda Gates Foundation',
24
+ },
25
+ {
26
+ name: 'should preserve numeric entities for ampersands',
27
+ input: 'Bill &#38; Melinda Gates Foundation',
28
+ expected: 'Bill &#38; Melinda Gates Foundation',
29
+ },
30
+ {
31
+ name: 'should handle mixed ampersand scenarios',
32
+ input: 'Company & Associates &amp; Partners &lt; 100',
33
+ expected: 'Company &#38; Associates &#38; Partners &#60; 100',
34
+ },
35
+ {
36
+ name: 'should handle complex academic text with various ampersands',
37
+ input: 'The Bill & Melinda Gates Foundation &amp; European Commission&#39;s Horizon 2020',
38
+ expected: 'The Bill &#38; Melinda Gates Foundation &#38; European Commission&#39;s Horizon 2020',
39
+ },
40
+ {
41
+ name: 'should handle multiple unescaped ampersands in sequence',
42
+ input: 'A & B & C & D',
43
+ expected: 'A &#38; B &#38; C &#38; D',
44
+ },
45
+ {
46
+ name: 'should preserve valid HTML entities while escaping unescaped ampersands',
47
+ input: 'Temperature &lt; 5 &deg;C &#38; Pressure &gt; 100 &mu;Pa',
48
+ expected: 'Temperature &#60; 5 °C &#38; Pressure &#62; 100 μPa',
49
+ },
50
+ {
51
+ name: 'rsquor',
52
+ input: 'dataset&rsquor;s terms of use.',
53
+ expected: 'dataset’s terms of use.',
54
+ },
55
+ ])('$name', ({ input, expected }) => {
56
+ const result = preprocessXMLContent(input);
57
+ expect(result).toBe(expected);
58
+ });
59
+ });
60
+ describe('preprocessXMLContent - XML Declaration Reordering', () => {
61
+ it('should reorder XML declaration to first line when it appears after DOCTYPE', () => {
62
+ const input = `<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.2d1 20170631//EN" "JATS-archivearticle1.dtd">
63
+ <?xml version="1.0" encoding="UTF-8"?>
64
+ <article xmlns:mml="http://www.w3.org/1998/Math/MathML">
65
+ <title>Test Article</title>
66
+ </article>`;
67
+ const result = preprocessXMLContent(input);
68
+ // Check that XML declaration is now first
69
+ expect(result.startsWith('<?xml version="1.0" encoding="UTF-8"?>')).toBe(true);
70
+ // Check that DOCTYPE is still present (its position is not asserted)
71
+ expect(result.includes('<!DOCTYPE article PUBLIC')).toBe(true);
72
+ // Check that the content is preserved
73
+ expect(result).toContain('<article xmlns:mml="http://www.w3.org/1998/Math/MathML">');
74
+ expect(result).toContain('<title>Test Article</title>');
75
+ });
76
+ it('should trim leading whitespace from XML declaration on first line', () => {
77
+ const input = ` <?xml version="1.0" encoding="UTF-8"?>
78
+ <!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.2d1 20170631//EN" "JATS-archivearticle1.dtd">
79
+ <article xmlns:mml="http://www.w3.org/1998/Math/MathML">
80
+ <title>Test Article</title>
81
+ </article>`;
82
+ const result = preprocessXMLContent(input);
83
+ // Check that XML declaration is first and has no leading whitespace
84
+ expect(result.startsWith('<?xml version="1.0" encoding="UTF-8"?>')).toBe(true);
85
+ // Check that DOCTYPE is still present (its position is not asserted)
86
+ expect(result.includes('<!DOCTYPE article PUBLIC')).toBe(true);
87
+ // Check that the content is preserved
88
+ expect(result).toContain('<article xmlns:mml="http://www.w3.org/1998/Math/MathML">');
89
+ expect(result).toContain('<title>Test Article</title>');
90
+ });
91
+ it('should not reorder when XML declaration is already first', () => {
92
+ const input = `<?xml version="1.0" encoding="UTF-8"?>
93
+ <!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.2d1 20170631//EN" "JATS-archivearticle1.dtd">
94
+ <article>
95
+ <title>Test Article</title>
96
+ </article>`;
97
+ const result = preprocessXMLContent(input);
98
+ // Should remain unchanged
99
+ expect(result).toBe(input);
100
+ });
101
+ it('should not reorder when XML declaration is beyond 4th line', () => {
102
+ const input = `<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.2d1 20170631//EN" "JATS-archivearticle1.dtd">
103
+ <article>
104
+ <title>Test Article</title>
105
+ <abstract>This is a test abstract</abstract>
106
+ <body>Test body content</body>
107
+ <?xml version="1.0" encoding="UTF-8"?>
108
+ </article>`;
109
+ const result = preprocessXMLContent(input);
110
+ // Should not reorder since XML declaration is on the 6th line (index 5)
111
+ // and it is only moved when found within the first 5 lines (indices 1-4; index 0 is only trimmed)
112
+ expect(result).toBe(input);
113
+ });
114
+ it('should handle case with no XML declaration', () => {
115
+ const input = `<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.2d1 20170631//EN" "JATS-archivearticle1.dtd">
116
+ <article>
117
+ <title>Test Article</title>
118
+ </article>`;
119
+ const result = preprocessXMLContent(input);
120
+ // Should remain unchanged
121
+ expect(result).toBe(input);
122
+ });
123
+ });
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Month utility functions for batch processing
3
+ */
4
+ /**
5
+ * Generate a range of months to process backwards from current month to 2018-12
6
+ */
7
+ export declare function generateMonthRange(): string[];
8
+ /**
9
+ * Parse month input and return array of months to process
10
+ */
11
+ export declare function parseMonthInput(monthInput: string): string[];
12
+ /**
13
+ * Parse wildcard pattern like "2025-*" to get all months in that year
14
+ */
15
+ export declare function parseWildcardPattern(pattern: string): string[];
16
+ /**
17
+ * Validate month format (YYYY-MM)
18
+ */
19
+ export declare function validateMonthFormat(month: string): boolean;
20
+ /**
21
+ * Sort months chronologically (oldest first)
22
+ */
23
+ export declare function sortMonthsChronologically(months: string[]): string[];
24
+ /**
25
+ * Get current month in YYYY-MM format
26
+ */
27
+ export declare function getCurrentMonth(): string;
28
+ /**
29
+ * Get previous month in YYYY-MM format
30
+ */
31
+ export declare function getPreviousMonth(): string;
32
+ /**
33
+ * Check if a month is in the future
34
+ */
35
+ export declare function isFutureMonth(month: string): boolean;
36
+ //# sourceMappingURL=months.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"months.d.ts","sourceRoot":"","sources":["../../src/utils/months.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,wBAAgB,kBAAkB,IAAI,MAAM,EAAE,CAuB7C;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,EAAE,CA6B5D;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAwB9D;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAW1D;AAED;;GAEG;AACH,wBAAgB,yBAAyB,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,EAAE,CAUpE;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,MAAM,CAKxC;AAED;;GAEG;AACH,wBAAgB,gBAAgB,IAAI,MAAM,CAMzC;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAOpD"}