@wonderwhy-er/desktop-commander 0.2.23 → 0.2.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -55
- package/dist/config-manager.d.ts +5 -0
- package/dist/config-manager.js +9 -0
- package/dist/custom-stdio.d.ts +1 -0
- package/dist/custom-stdio.js +19 -0
- package/dist/handlers/filesystem-handlers.d.ts +4 -0
- package/dist/handlers/filesystem-handlers.js +120 -14
- package/dist/handlers/node-handlers.d.ts +6 -0
- package/dist/handlers/node-handlers.js +73 -0
- package/dist/index.js +5 -3
- package/dist/search-manager.d.ts +25 -0
- package/dist/search-manager.js +212 -0
- package/dist/server.d.ts +11 -0
- package/dist/server.js +188 -73
- package/dist/terminal-manager.d.ts +56 -2
- package/dist/terminal-manager.js +169 -13
- package/dist/tools/edit.d.ts +28 -4
- package/dist/tools/edit.js +87 -4
- package/dist/tools/filesystem.d.ts +23 -12
- package/dist/tools/filesystem.js +201 -416
- package/dist/tools/improved-process-tools.d.ts +2 -2
- package/dist/tools/improved-process-tools.js +244 -214
- package/dist/tools/mime-types.d.ts +1 -0
- package/dist/tools/mime-types.js +7 -0
- package/dist/tools/pdf/extract-images.d.ts +34 -0
- package/dist/tools/pdf/extract-images.js +132 -0
- package/dist/tools/pdf/index.d.ts +6 -0
- package/dist/tools/pdf/index.js +3 -0
- package/dist/tools/pdf/lib/pdf2md.d.ts +36 -0
- package/dist/tools/pdf/lib/pdf2md.js +76 -0
- package/dist/tools/pdf/manipulations.d.ts +13 -0
- package/dist/tools/pdf/manipulations.js +96 -0
- package/dist/tools/pdf/markdown.d.ts +7 -0
- package/dist/tools/pdf/markdown.js +37 -0
- package/dist/tools/pdf/utils.d.ts +12 -0
- package/dist/tools/pdf/utils.js +34 -0
- package/dist/tools/schemas.d.ts +167 -12
- package/dist/tools/schemas.js +54 -5
- package/dist/types.d.ts +2 -1
- package/dist/utils/ab-test.d.ts +8 -0
- package/dist/utils/ab-test.js +76 -0
- package/dist/utils/capture.js +5 -0
- package/dist/utils/feature-flags.js +7 -4
- package/dist/utils/files/base.d.ts +167 -0
- package/dist/utils/files/base.js +5 -0
- package/dist/utils/files/binary.d.ts +21 -0
- package/dist/utils/files/binary.js +65 -0
- package/dist/utils/files/excel.d.ts +24 -0
- package/dist/utils/files/excel.js +416 -0
- package/dist/utils/files/factory.d.ts +40 -0
- package/dist/utils/files/factory.js +101 -0
- package/dist/utils/files/image.d.ts +21 -0
- package/dist/utils/files/image.js +78 -0
- package/dist/utils/files/index.d.ts +10 -0
- package/dist/utils/files/index.js +13 -0
- package/dist/utils/files/pdf.d.ts +32 -0
- package/dist/utils/files/pdf.js +142 -0
- package/dist/utils/files/text.d.ts +63 -0
- package/dist/utils/files/text.js +357 -0
- package/dist/utils/open-browser.d.ts +9 -0
- package/dist/utils/open-browser.js +43 -0
- package/dist/utils/ripgrep-resolver.js +3 -2
- package/dist/utils/system-info.d.ts +5 -0
- package/dist/utils/system-info.js +71 -3
- package/dist/utils/usageTracker.js +6 -0
- package/dist/utils/welcome-onboarding.d.ts +9 -0
- package/dist/utils/welcome-onboarding.js +37 -0
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/package.json +14 -3
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import { getDocumentProxy, extractImages } from 'unpdf';
|
|
2
|
+
/**
 * Optimized image extraction from PDF using unpdf's built-in extractImages method.
 * Pages are handled in small parallel batches; per-image conversion failures and
 * per-page extraction failures are logged and skipped rather than aborting.
 * @param pdfBuffer PDF file as Uint8Array
 * @param pageNumbers Optional array of specific page numbers to process (1-based)
 * @param compressionOptions Image compression settings forwarded to the converter
 * @returns Record of page numbers to extracted images
 */
export async function extractImagesFromPdf(pdfBuffer, pageNumbers, compressionOptions = {}) {
    const doc = await getDocumentProxy(pdfBuffer);
    const targetPages = pageNumbers || Array.from({ length: doc.numPages }, (_, i) => i + 1);
    const resultsByPage = {};
    try {
        // Bound concurrency by walking the page list five entries at a time.
        const BATCH_SIZE = 5;
        for (let start = 0; start < targetPages.length; start += BATCH_SIZE) {
            const chunk = targetPages.slice(start, start + BATCH_SIZE);
            const settled = await Promise.all(chunk.map(async (pageNum) => {
                // Out-of-range pages simply contribute no images.
                if (pageNum < 1 || pageNum > doc.numPages) {
                    return { pageNum, images: [] };
                }
                try {
                    // Use unpdf's built-in extractImages
                    const rawImages = await extractImages(doc, pageNum);
                    const converted = [];
                    for (let index = 0; index < rawImages.length; index++) {
                        const raw = rawImages[index];
                        const originalSize = raw.data.length;
                        try {
                            const result = await convertRawImageToBase64(raw.data, raw.width, raw.height, raw.channels, compressionOptions);
                            if (result) {
                                converted.push({
                                    // unpdf does not expose the original object id; the index stands in.
                                    objId: index,
                                    width: raw.width,
                                    height: raw.height,
                                    data: result.data,
                                    mimeType: result.mimeType,
                                    originalSize,
                                    // Approximate decoded byte count from the base64 payload length.
                                    compressedSize: Math.round(result.data.length * 0.75)
                                });
                            }
                        }
                        catch (err) {
                            // Conversion problems are non-fatal; skip the image.
                            console.warn(`Failed to convert image ${index} on page ${pageNum}:`, err instanceof Error ? err.message : String(err));
                        }
                    }
                    return { pageNum, images: converted };
                }
                catch (error) {
                    console.warn(`Failed to extract images from page ${pageNum}:`, error instanceof Error ? error.message : String(error));
                    return { pageNum, images: [] };
                }
            }));
            // Record the finished batch before starting the next one.
            for (const { pageNum, images } of settled) {
                resultsByPage[pageNum] = images;
            }
        }
    }
    finally {
        // Release pdf.js resources; cleanup failures are not actionable here.
        try {
            if (typeof doc.cleanup === 'function') {
                await doc.cleanup(false);
            }
        }
        catch (e) { /* Ignore cleanup errors */ }
        try {
            if (typeof doc.destroy === 'function') {
                await doc.destroy();
            }
        }
        catch (e) { /* Ignore cleanup errors */ }
    }
    return resultsByPage;
}
|
|
84
|
+
/**
 * Convert raw image data to compressed base64 using sharp.
 * Images larger than maxDimension on either side are downscaled
 * proportionally. Returns null when conversion fails (e.g. sharp is not
 * installed or the raw data is invalid).
 */
async function convertRawImageToBase64(data, width, height, channels, options = {}) {
    const { format = 'webp', quality = 85, maxDimension = 1200 } = options;
    // Smart resizing: only shrink images exceeding the size cap.
    let outWidth = width;
    let outHeight = height;
    const largest = Math.max(width, height);
    if (largest > maxDimension) {
        const ratio = maxDimension / largest;
        outWidth = Math.round(width * ratio);
        outHeight = Math.round(height * ratio);
    }
    try {
        // sharp is loaded lazily so its absence only disables this feature.
        const sharp = (await import('sharp')).default;
        // unpdf returns Uint8ClampedArray, which Buffer.from() accepts.
        let pipeline = sharp(Buffer.from(data), {
            raw: { width, height, channels }
        });
        if (outWidth !== width || outHeight !== height) {
            pipeline = pipeline.resize(outWidth, outHeight);
        }
        const useJpeg = format === 'jpeg';
        const outputBuffer = useJpeg
            ? await pipeline.jpeg({ quality }).toBuffer()
            : await pipeline.webp({ quality }).toBuffer(); // default to webp
        return {
            data: outputBuffer.toString('base64'),
            mimeType: useJpeg ? 'image/jpeg' : 'image/webp'
        };
    }
    catch (error) {
        console.warn('Image conversion failed (likely missing sharp or invalid data):', error instanceof Error ? error.message : String(error));
        return null;
    }
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
// Public barrel for the PDF toolset: page manipulation (insert/delete),
// markdown conversion in both directions, and image extraction, re-exported
// from their implementing modules.
export { editPdf } from './manipulations.js';
export type { PdfOperations, PdfInsertOperation, PdfDeleteOperation } from './manipulations.js';
export { parsePdfToMarkdown, parseMarkdownToPdf } from './markdown.js';
export type { PdfMetadata, PdfPageItem } from './lib/pdf2md.js';
export { extractImagesFromPdf } from './extract-images.js';
export type { ImageInfo, PageImages } from './extract-images.js';
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { ImageInfo } from '../extract-images.js';
|
|
2
|
+
/**
 * PDF metadata structure. Document-info fields are optional because not
 * every PDF carries them.
 */
export interface PdfMetadata {
    fileSize?: number;
    totalPages: number;
    title?: string;
    author?: string;
    creator?: string;
    producer?: string;
    version?: string;
    creationDate?: string;
    modificationDate?: string;
    isEncrypted?: boolean;
}
/** A single parsed page: extracted text, embedded images, and its 1-based number. */
export interface PdfPageItem {
    text: string;
    images: ImageInfo[];
    pageNumber: number;
}
/** Result of parsing a PDF: the selected pages plus document metadata. */
export interface PdfParseResult {
    pages: PdfPageItem[];
    metadata: PdfMetadata;
}
/** Page-window selection: zero-based offset (negative counts from the end) and page count. */
export type PageRange = {
    offset: number;
    length: number;
};
/**
 * Reads a PDF and converts it to Markdown, returning structured data.
 * @param pdfBuffer The PDF buffer to convert.
 * @param pageNumbers The page numbers to extract. If empty, all pages are extracted.
 * @returns A Promise that resolves to a PdfParseResult object containing the parsed data.
 */
export declare function pdf2md(pdfBuffer: Uint8Array, pageNumbers?: number[] | PageRange): Promise<PdfParseResult>;
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { createRequire } from 'module';
|
|
2
|
+
import { generatePageNumbers } from '../utils.js';
|
|
3
|
+
import { extractImagesFromPdf } from '../extract-images.js';
|
|
4
|
+
const require = createRequire(import.meta.url);
|
|
5
|
+
const { parse } = require('@opendocsg/pdf2md/lib/util/pdf');
|
|
6
|
+
const { makeTransformations, transform } = require('@opendocsg/pdf2md/lib/util/transformations');
|
|
7
|
+
/**
 * Extracts metadata from a PDF document.
 * Maps pdf.js document-info keys (Title, Author, ...) onto the camelCase
 * PdfMetadata shape; missing info fields simply come through as undefined.
 * @param pdfDocument The PDF document to extract metadata from.
 * @returns A PdfMetadata object containing the extracted metadata.
 */
const extractMetadata = ({ pdfDocument, metadata }) => {
    return {
        totalPages: pdfDocument.numPages,
        title: metadata.Title,
        author: metadata.Author,
        creator: metadata.Creator,
        producer: metadata.Producer,
        version: metadata.PDFFormatVersion,
        creationDate: metadata.CreationDate,
        modificationDate: metadata.ModDate,
        isEncrypted: metadata.IsEncrypted,
    };
};
|
|
23
|
+
/**
 * Reads a PDF and converts it to Markdown, returning structured data.
 * @param pdfBuffer The PDF buffer to convert.
 * @param pageNumbers The page numbers to extract (array, or {offset, length}
 *   range). If empty, all pages are extracted.
 * @returns A Promise that resolves to a PdfParseResult object containing the parsed data.
 */
export async function pdf2md(pdfBuffer, pageNumbers = []) {
    const result = await parse(pdfBuffer);
    const { fonts, pages, pdfDocument } = result;
    // Resolve the selection to an explicit list of 1-based page numbers.
    const filterPageNumbers = Array.isArray(pageNumbers) ?
        pageNumbers :
        generatePageNumbers(pageNumbers.offset, pageNumbers.length, pages.length);
    // BUGFIX: build the page subset and its page-number map from the SAME
    // ascending traversal. Previously pagesToProcess was filtered in document
    // order while pageNumberMap kept the caller's order, so an unsorted list
    // (e.g. [3, 1]) attached the wrong page numbers (and images) to the text.
    const selected = filterPageNumbers.length === 0 ?
        pages.map((page, index) => ({ page, pageNumber: index + 1 })) :
        pages
            .map((page, index) => ({ page, pageNumber: index + 1 }))
            .filter(({ pageNumber }) => filterPageNumbers.includes(pageNumber));
    const pagesToProcess = selected.map(({ page }) => page);
    const pageNumberMap = selected.map(({ pageNumber }) => pageNumber);
    const transformations = makeTransformations(fonts.map);
    const parseResult = transform(pagesToProcess, transformations);
    // Extract images only for the pages that were kept.
    const imagesByPage = await extractImagesFromPdf(pdfBuffer, pageNumberMap, { format: 'webp', quality: 85 });
    const processedPages = parseResult.pages.map((page, index) => {
        const pageNumber = pageNumberMap[index];
        return {
            pageNumber,
            text: page.items.join('\n') + '\n',
            images: imagesByPage[pageNumber] || [],
        };
    });
    const metadata = extractMetadata(result);
    try {
        return { pages: processedPages, metadata };
    }
    finally {
        // Always release pdf.js resources, even if result assembly throws.
        if (pdfDocument) {
            try {
                if (typeof pdfDocument.cleanup === 'function') {
                    await pdfDocument.cleanup(false);
                }
            }
            catch (e) { /* ignore cleanup errors */ }
            try {
                if (typeof pdfDocument.destroy === 'function') {
                    await pdfDocument.destroy();
                }
            }
            catch (e) { /* ignore cleanup errors */ }
        }
    }
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { PdfInsertOperationSchema, PdfDeleteOperationSchema, PdfOperationSchema } from '../schemas.js';
import { z } from 'zod';
// Operation types are inferred from the zod schemas so the runtime
// validators and the static types cannot drift apart.
type PdfInsertOperation = z.infer<typeof PdfInsertOperationSchema>;
type PdfDeleteOperation = z.infer<typeof PdfDeleteOperationSchema>;
type PdfOperations = z.infer<typeof PdfOperationSchema>;
export type { PdfOperations, PdfInsertOperation, PdfDeleteOperation };
/**
 * Edit an existing PDF by deleting or inserting pages
 * @param pdfPath Path to the PDF file to edit
 * @param operations List of operations to perform
 * @returns The modified PDF as a Uint8Array
 */
export declare function editPdf(pdfPath: string, operations: PdfOperations[]): Promise<Uint8Array>;
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import fs from 'fs/promises';
|
|
2
|
+
import { PDFDocument } from 'pdf-lib';
|
|
3
|
+
import { normalizePageIndexes } from './utils.js';
|
|
4
|
+
import { parseMarkdownToPdf } from './markdown.js';
|
|
5
|
+
/**
 * Load a pdf-lib PDFDocument from either a filesystem path or raw bytes.
 * @param filePathOrBuffer Path to a PDF on disk, or an in-memory buffer.
 * @returns The loaded PDFDocument.
 */
async function loadPdfDocumentFromBuffer(filePathOrBuffer) {
    const raw = typeof filePathOrBuffer === 'string'
        ? await fs.readFile(filePathOrBuffer)
        : filePathOrBuffer;
    // pdf-lib accepts a Uint8Array view of the bytes.
    return await PDFDocument.load(new Uint8Array(raw));
}
|
|
10
|
+
/**
 * Delete pages from a PDF document
 * @param pdfDoc PDF document to delete pages from
 * @param pageIndexes Page indices to delete, negative indices are from end
 * @returns The same document, with the pages removed.
 */
function deletePages(pdfDoc, pageIndexes) {
    const total = pdfDoc.getPageCount();
    // Normalize (negatives resolved, invalid dropped, duplicates removed),
    // then delete from the highest index down so earlier removals don't
    // shift the indices still pending deletion.
    const descending = normalizePageIndexes(pageIndexes, total).sort((a, b) => b - a);
    descending.forEach(idx => pdfDoc.removePage(idx));
    return pdfDoc;
}
|
|
24
|
+
/**
 * Derive a Puppeteer-compatible page layout (size and margins, in inches)
 * from an existing pdf-lib page. Margins come from the CropBox/MediaBox
 * difference; when the boxes coincide, a 72pt (1in) margin is assumed on
 * every side.
 */
function getPageLayout(page) {
    const { width, height } = page.getSize();
    const mediaBox = page.getMediaBox(); // Full page area
    const cropBox = page.getCropBox(); // Visible area (may indicate margins)
    // Margins are the gaps between the visible area and the full page.
    let left = cropBox.x - mediaBox.x;
    let bottom = cropBox.y - mediaBox.y;
    let right = (mediaBox.x + mediaBox.width) - (cropBox.x + cropBox.width);
    let top = (mediaBox.y + mediaBox.height) - (cropBox.y + cropBox.height);
    if (left === 0 && right === 0 && top === 0 && bottom === 0) {
        // No crop information: fall back to a standard 1-inch margin.
        left = 72;
        bottom = 72;
        right = 72;
        top = 72;
    }
    // Convert points to inches (1 inch = 72 points).
    // Puppeteer requires standard units and doesn't accept decimal points.
    const inches = (pts) => `${(pts / 72).toFixed(4)}in`;
    return {
        format: undefined, // Explicitly disable format to use custom dimensions
        width: inches(width),
        height: inches(height),
        margin: {
            top: inches(top),
            right: inches(right),
            bottom: inches(bottom),
            left: inches(left)
        }
    };
}
|
|
54
|
+
/**
 * Copy every page of sourcePdfDocument into destPdfDocument starting at
 * pageIndex (negative values count back from the destination's end).
 * @throws Error when the resolved position lies outside the document.
 * @returns The destination document, for chaining.
 */
async function insertPages(destPdfDocument, pageIndex, sourcePdfDocument) {
    const pageCount = destPdfDocument.getPageCount();
    const position = pageIndex < 0 ? pageCount + pageIndex : pageIndex;
    if (position < 0 || position > pageCount) {
        throw new Error('Invalid page index');
    }
    // Pages must be copied into the destination document before insertion.
    const copied = await destPdfDocument.copyPages(sourcePdfDocument, sourcePdfDocument.getPageIndices());
    copied.forEach((page, offset) => destPdfDocument.insertPage(position + offset, page));
    return destPdfDocument;
}
|
|
65
|
+
/**
 * Edit an existing PDF by deleting or inserting pages
 * @param pdfPath Path to the PDF file to edit
 * @param operations List of operations to perform. Each is either a delete
 *   (with pageIndexes) or an insert (with pageIndex and either inline
 *   markdown or a sourcePdfPath).
 * @returns The modified PDF as a Uint8Array
 */
export async function editPdf(pdfPath, operations) {
    const pdfDoc = await loadPdfDocumentFromBuffer(pdfPath);
    // Capture the layout of the ORIGINAL first page before any mutation, so
    // markdown-rendered insertions match the document's size and margins.
    const pageLayout = pdfDoc.getPageCount() > 0 ? getPageLayout(pdfDoc.getPage(0)) : undefined;
    for (const op of operations) {
        if (op.type === 'delete') {
            deletePages(pdfDoc, op.pageIndexes);
        }
        else if (op.type === 'insert') { // was a loose '==' — use strict equality
            let sourcePdfDocument;
            if (op.markdown !== undefined) {
                // Render the markdown to a throwaway PDF mirroring the original layout.
                const pdfOptions = pageLayout ? { pdf_options: pageLayout } : undefined;
                const pdfBuffer = await parseMarkdownToPdf(op.markdown, pdfOptions);
                sourcePdfDocument = await loadPdfDocumentFromBuffer(pdfBuffer);
            }
            else if (op.sourcePdfPath) {
                sourcePdfDocument = await loadPdfDocumentFromBuffer(op.sourcePdfPath);
            }
            else {
                throw new Error('No source provided for insert operation');
            }
            await insertPages(pdfDoc, op.pageIndex, sourcePdfDocument);
        }
    }
    return await pdfDoc.save();
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { PageRange } from './lib/pdf2md.js';
import { PdfParseResult } from './lib/pdf2md.js';
/**
 * Convert PDF to Markdown using @opendocsg/pdf2md
 * @param source Local file path or http(s) URL of the PDF.
 * @param pageNumbers Explicit page list or an {offset, length} range; empty selects all pages.
 */
export declare function parsePdfToMarkdown(source: string, pageNumbers?: number[] | PageRange): Promise<PdfParseResult>;
/** Render a markdown string to a PDF buffer; options are forwarded to md-to-pdf. */
export declare function parseMarkdownToPdf(markdown: string, options?: any): Promise<Buffer>;
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import fs from 'fs/promises';
|
|
2
|
+
import { mdToPdf } from 'md-to-pdf';
|
|
3
|
+
import { pdf2md } from './lib/pdf2md.js';
|
|
4
|
+
// Only explicit http(s) locations are treated as remote sources.
const isUrl = (source) => source.startsWith('http://') || source.startsWith('https://');
/**
 * Load PDF bytes from a local path or an http(s) URL.
 * @param source File path or URL.
 * @returns The raw bytes (ArrayBuffer for URLs, Buffer for files).
 * @throws Error when a remote fetch responds with a non-2xx status.
 */
async function loadPdfToBuffer(source) {
    if (isUrl(source)) {
        const response = await fetch(source);
        // BUGFIX: previously an HTTP error response (404/500) had its body
        // returned as if it were the PDF; fail fast with context instead.
        if (!response.ok) {
            throw new Error(`Failed to fetch PDF from ${source}: ${response.status} ${response.statusText}`);
        }
        return await response.arrayBuffer();
    }
    else {
        return await fs.readFile(source);
    }
}
|
|
14
|
+
/**
 * Convert PDF to Markdown using @opendocsg/pdf2md
 * Errors are logged with context and re-thrown to the caller.
 */
export async function parsePdfToMarkdown(source, pageNumbers = []) {
    try {
        const bytes = new Uint8Array(await loadPdfToBuffer(source));
        // @ts-ignore: Type definition mismatch for ESM usage
        return await pdf2md(bytes, pageNumbers);
    }
    catch (error) {
        console.error("Error converting PDF to Markdown (v3):", error);
        throw error;
    }
}
|
|
28
|
+
/**
 * Render a markdown string to a PDF buffer via md-to-pdf.
 * Errors are logged with context and re-thrown to the caller.
 */
export async function parseMarkdownToPdf(markdown, options = {}) {
    try {
        const rendered = await mdToPdf({ content: markdown }, options);
        return rendered.content;
    }
    catch (error) {
        console.error('Error creating PDF:', error);
        throw error;
    }
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
 * Normalize page indexes, handling negative indices (counted from the end)
 * and removing duplicates; out-of-range indices are dropped.
 */
export declare const normalizePageIndexes: (pageIndexes: number[], pageCount: number) => number[];
/**
 * Generate page numbers based on offset and length
 * @param offset Zero-based offset or negative for counting from end
 * @param length Number of pages to generate
 * @param totalPages Total number of pages in the document
 * @returns Array of 1-based page numbers, clamped to the document bounds
 */
export declare function generatePageNumbers(offset: number, length: number, totalPages: number): number[];
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
 * Normalize page indexes, handling negative indices and removing duplicates.
 * Negative values count back from the end; anything still out of range after
 * resolution is discarded. First-seen order is preserved.
 */
export const normalizePageIndexes = (pageIndexes, pageCount) => {
    const seen = new Set();
    for (const raw of pageIndexes) {
        const idx = raw < 0 ? pageCount + raw : raw;
        if (idx >= 0 && idx < pageCount) {
            seen.add(idx); // Set de-duplicates while keeping insertion order
        }
    }
    return Array.from(seen);
};
|
|
11
|
+
/**
 * Generate page numbers based on offset and length
 * @param offset Zero-based offset or negative for counting from end
 * @param length Number of pages to generate
 * @param totalPages Total number of pages in the document
 * @returns Array of page numbers
 */
export function generatePageNumbers(offset, length, totalPages) {
    // Resolve the 1-based first page (negative offsets count from the end).
    const firstPage = offset < 0
        ? totalPages + offset + 1
        : offset + 1;
    // A window starting beyond the document yields nothing.
    if (firstPage > totalPages) {
        return [];
    }
    const start = Math.max(1, firstPage);
    // Last page (inclusive), never past the end of the document.
    const end = Math.min(start + length - 1, totalPages);
    if (end < start) {
        return [];
    }
    const numbers = [];
    for (let page = start; page <= end; page++) {
        numbers.push(page);
    }
    return numbers;
}
|