@vertesia/converters 0.80.0-dev.20251121 → 0.80.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -0
- package/package.json +12 -1
package/README.md
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# @vertesia/converters
|
|
2
|
+
|
|
3
|
+
Image and document conversion utilities for Node.js. Provides functions for image transformation, PDF to text extraction, and document to Markdown conversion.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Image Transformation**: Resize and convert images using Sharp
|
|
8
|
+
- **PDF to Text**: Extract text from PDF files using MuTool
|
|
9
|
+
- **Document to Markdown**: Convert various document formats to Markdown using Pandoc
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
npm install @vertesia/converters
|
|
15
|
+
# or
|
|
16
|
+
pnpm add @vertesia/converters
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### System Dependencies
|
|
20
|
+
|
|
21
|
+
Some converters require external tools to be installed:
|
|
22
|
+
|
|
23
|
+
- **Image conversion**: No external dependencies (uses Sharp)
|
|
24
|
+
- **PDF to text**: Requires [MuTool](https://mupdf.com/docs/mutool.html) (`mutool` command)
|
|
25
|
+
- **Document to Markdown**: Requires [Pandoc](https://pandoc.org/) (`pandoc` command)
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
### Image Transformation
|
|
30
|
+
|
|
31
|
+
Transform images with resizing and format conversion:
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
import {
|
|
35
|
+
transformImage,
|
|
36
|
+
transformImageToBuffer,
|
|
37
|
+
transformImageToFile
|
|
38
|
+
} from '@vertesia/converters';
|
|
39
|
+
|
|
40
|
+
// Transform an image from a readable stream to a writable stream
|
|
41
|
+
import { createReadStream, createWriteStream } from 'fs';
|
|
42
|
+
|
|
43
|
+
const input = createReadStream('input.jpg');
|
|
44
|
+
const output = createWriteStream('output.webp');
|
|
45
|
+
|
|
46
|
+
await transformImage(input, output, {
|
|
47
|
+
max_hw: 1024, // Max width/height (maintains aspect ratio)
|
|
48
|
+
format: 'webp' // Output format
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
// Transform image to buffer
|
|
52
|
+
const buffer = await transformImageToBuffer(inputBuffer, {
|
|
53
|
+
max_hw: 800,
|
|
54
|
+
format: 'png'
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
// Transform image to file
|
|
58
|
+
await transformImageToFile(inputBuffer, 'output.jpg', {
|
|
59
|
+
max_hw: 1200,
|
|
60
|
+
format: 'jpeg'
|
|
61
|
+
});
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### PDF to Text
|
|
65
|
+
|
|
66
|
+
Extract text content from PDF files:
|
|
67
|
+
|
|
68
|
+
```typescript
|
|
69
|
+
import { pdfToText, pdfToTextBuffer, pdfFileToText } from '@vertesia/converters';
|
|
70
|
+
|
|
71
|
+
// From buffer to string
|
|
72
|
+
const text = await pdfToText(pdfBuffer);
|
|
73
|
+
|
|
74
|
+
// From buffer to buffer
|
|
75
|
+
const textBuffer = await pdfToTextBuffer(pdfBuffer);
|
|
76
|
+
|
|
77
|
+
// From file to file
|
|
78
|
+
await pdfFileToText('input.pdf', 'output.txt');
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Document to Markdown
|
|
82
|
+
|
|
83
|
+
Convert documents to Markdown format using Pandoc:
|
|
84
|
+
|
|
85
|
+
```typescript
|
|
86
|
+
import { manyToMarkdown } from '@vertesia/converters';
|
|
87
|
+
import { createReadStream } from 'fs';
|
|
88
|
+
|
|
89
|
+
// Convert DOCX to Markdown
|
|
90
|
+
const stream = createReadStream('document.docx');
|
|
91
|
+
const markdown = await manyToMarkdown(stream, 'docx');
|
|
92
|
+
|
|
93
|
+
// Convert HTML to Markdown
|
|
94
|
+
const htmlStream = createReadStream('page.html');
|
|
95
|
+
const md = await manyToMarkdown(htmlStream, 'html');
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Supported input formats include all formats supported by Pandoc: `docx`, `html`, `latex`, `rst`, `textile`, `org`, `mediawiki`, and many more.
|
|
99
|
+
|
|
100
|
+
## API Reference
|
|
101
|
+
|
|
102
|
+
### Image Functions
|
|
103
|
+
|
|
104
|
+
| Function | Description |
|
|
105
|
+
|----------|-------------|
|
|
106
|
+
| `transformImage(input, output, opts)` | Transform image from stream to stream |
|
|
107
|
+
| `transformImageToBuffer(input, opts)` | Transform image to buffer |
|
|
108
|
+
| `transformImageToFile(input, output, opts)` | Transform image to file |
|
|
109
|
+
|
|
110
|
+
#### TransformOptions
|
|
111
|
+
|
|
112
|
+
| Option | Type | Description |
|
|
113
|
+
|--------|------|-------------|
|
|
114
|
+
| `max_hw` | `number` | Maximum width/height (maintains aspect ratio, no upscaling) |
|
|
115
|
+
| `format` | `string` | Output format (`jpeg`, `png`, `webp`, `avif`, etc.) |
|
|
116
|
+
|
|
117
|
+
### PDF Functions
|
|
118
|
+
|
|
119
|
+
| Function | Description |
|
|
120
|
+
|----------|-------------|
|
|
121
|
+
| `pdfToText(buffer)` | Convert PDF buffer to text string |
|
|
122
|
+
| `pdfToTextBuffer(buffer)` | Convert PDF buffer to text buffer |
|
|
123
|
+
| `pdfFileToText(input, output)` | Convert PDF file to text file |
|
|
124
|
+
|
|
125
|
+
### Document Functions
|
|
126
|
+
|
|
127
|
+
| Function | Description |
|
|
128
|
+
|----------|-------------|
|
|
129
|
+
| `manyToMarkdown(stream, format)` | Convert document stream to Markdown |
|
|
130
|
+
|
|
131
|
+
## Requirements
|
|
132
|
+
|
|
133
|
+
- Node.js 18+
|
|
134
|
+
- MuTool (for PDF conversion)
|
|
135
|
+
- Pandoc (for document conversion)
|
|
136
|
+
|
|
137
|
+
## License
|
|
138
|
+
|
|
139
|
+
Apache-2.0
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vertesia/converters",
|
|
3
|
-
"version": "0.80.0-dev.20251121",
|
|
3
|
+
"version": "0.80.0",
|
|
4
4
|
"description": "Image and content converters",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"types": "./lib/types/index.d.ts",
|
|
@@ -29,6 +29,17 @@
|
|
|
29
29
|
"url": "https://github.com/vertesia/composableai.git",
|
|
30
30
|
"directory": "packages/converters"
|
|
31
31
|
},
|
|
32
|
+
"keywords": [
|
|
33
|
+
"vertesia",
|
|
34
|
+
"converters",
|
|
35
|
+
"image",
|
|
36
|
+
"pdf",
|
|
37
|
+
"markdown",
|
|
38
|
+
"sharp",
|
|
39
|
+
"pandoc",
|
|
40
|
+
"mutool",
|
|
41
|
+
"typescript"
|
|
42
|
+
],
|
|
32
43
|
"ts_dual_module": {
|
|
33
44
|
"outDir": "lib"
|
|
34
45
|
},
|