utilitas 2000.3.16 → 2000.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/utilitas.lite.mjs +1 -1
- package/dist/utilitas.lite.mjs.map +1 -1
- package/lib/manifest.mjs +2 -1
- package/lib/storage.mjs +1 -1
- package/lib/vision.mjs +84 -14
- package/package.json +2 -1
package/lib/manifest.mjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
const manifest = {
|
|
2
2
|
"name": "utilitas",
|
|
3
3
|
"description": "Just another common utility for JavaScript.",
|
|
4
|
-
"version": "2000.3.16",
|
|
4
|
+
"version": "2000.3.17",
|
|
5
5
|
"private": false,
|
|
6
6
|
"homepage": "https://github.com/Leask/utilitas",
|
|
7
7
|
"main": "index.mjs",
|
|
@@ -51,6 +51,7 @@ const manifest = {
|
|
|
51
51
|
"office-text-extractor": "^3.0.3",
|
|
52
52
|
"openai": "^6.9.1",
|
|
53
53
|
"pdfjs-dist": "^5.4.394",
|
|
54
|
+
"pdf-lib": "^1.17.1",
|
|
54
55
|
"pg": "^8.16.3",
|
|
55
56
|
"pgvector": "^0.2.1",
|
|
56
57
|
"ping": "^1.0.0",
|
package/lib/storage.mjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
log as _log,
|
|
3
3
|
base64Decode, base64Encode, ensureString, extract, ignoreErrFunc,
|
|
4
|
-
mergeAtoB, need, throwError, trim,
|
|
4
|
+
mergeAtoB, need, throwError, trim, which,
|
|
5
5
|
} from './utilitas.mjs';
|
|
6
6
|
|
|
7
7
|
import { fileTypeFromBuffer } from 'file-type';
|
package/lib/vision.mjs
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
import { convert, DATAURL, BUFFER, FILE } from './storage.mjs';
|
|
10
10
|
import fs from 'node:fs';
|
|
11
11
|
|
|
12
|
-
const _NEED = ['office-text-extractor', 'pdfjs-dist', 'tesseract.js'];
|
|
12
|
+
const _NEED = ['office-text-extractor', 'pdfjs-dist', 'pdf-lib', 'tesseract.js'];
|
|
13
13
|
const clients = {};
|
|
14
14
|
const errorMessage = 'Invalid input data.';
|
|
15
15
|
const log = content => _log(content, import.meta.url);
|
|
@@ -144,27 +144,96 @@ const ocr = async (file, options = {}) => {
|
|
|
144
144
|
const client = clients?.[provider];
|
|
145
145
|
assert(client, 'No available OCR provider.');
|
|
146
146
|
const model = options?.model || client.model;
|
|
147
|
-
const document_url = await convert(file, { ...options, expected: DATAURL });
|
|
148
147
|
switch (provider) {
|
|
149
148
|
case GOOGLE_MISTRAL:
|
|
150
149
|
const key = await getGoogleAuthTokenByAuth(client.auth);
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
150
|
+
const inputPdfs = await splitPdf(file, {
|
|
151
|
+
...options, expected: DATAURL, size: 2,
|
|
152
|
+
});
|
|
153
|
+
const resps = (await Promise.all(inputPdfs.map(
|
|
154
|
+
async document_url => await (await fetch(
|
|
155
|
+
`https://${client.region}-aiplatform.googleapis.com/v1/`
|
|
156
|
+
+ `projects/${client.project}/locations/${client.region}/`
|
|
157
|
+
+ `publishers/mistralai/models/${model}:rawPredict`, {
|
|
158
|
+
method: 'POST', headers: {
|
|
159
|
+
'Content-Type': 'application/json',
|
|
160
|
+
'Authorization': `Bearer ${key}`
|
|
161
|
+
}, body: JSON.stringify({
|
|
162
|
+
model, include_image_base64: true,
|
|
163
|
+
document: { type: 'document_url', document_url },
|
|
164
|
+
})
|
|
165
|
+
})).json()
|
|
166
|
+
))).filter(x => x?.pages?.length);
|
|
167
|
+
const resp = {
|
|
168
|
+
pages: [], usage_info: { pages_processed: 0, doc_size_bytes: 0 }
|
|
169
|
+
};
|
|
170
|
+
resps.map(x => {
|
|
171
|
+
x.pages.map(p => {
|
|
172
|
+
p.index = resp.pages.length;
|
|
173
|
+
resp.pages.push(p);
|
|
174
|
+
p.images.map(i => {
|
|
175
|
+
const oId = i.id;
|
|
176
|
+
i.id = `page-${p.index}-${oId}`;
|
|
177
|
+
p.markdown = p.markdown.replaceAll(
|
|
178
|
+
`![${oId}](${oId})`, `![${i.id}](${i.id})`
|
|
179
|
+
);
|
|
180
|
+
});
|
|
181
|
+
});
|
|
182
|
+
resp.model = x.model;
|
|
183
|
+
resp.usage_info.pages_processed += x.usage_info.pages_processed;
|
|
184
|
+
resp.usage_info.doc_size_bytes += x.usage_info.doc_size_bytes;
|
|
185
|
+
});
|
|
186
|
+
if (options?.raw) { return resp; }
|
|
187
|
+
else if (options?.paging) { return resp.pages; }
|
|
188
|
+
const markdown = [];
|
|
189
|
+
resp.images = {};
|
|
190
|
+
for (const p of resp.pages) {
|
|
191
|
+
markdown.push(p.markdown);
|
|
192
|
+
await Promise.all(p.images.map(async i => {
|
|
193
|
+
const id = i.id;
|
|
194
|
+
i.width = i.bottom_right_x - i.top_left_x;
|
|
195
|
+
i.height = i.bottom_right_y - i.top_left_y;
|
|
196
|
+
i.annotation = i.image_annotation;
|
|
197
|
+
i.data = await convert(i.image_base64, {
|
|
198
|
+
...options, input: 'DATAURL',
|
|
199
|
+
});
|
|
200
|
+
[
|
|
201
|
+
'id', 'image_annotation', 'image_base64', 'top_left_x',
|
|
202
|
+
'top_left_y', 'bottom_right_x', 'bottom_right_y',
|
|
203
|
+
].map(k => delete i[k]);
|
|
204
|
+
resp.images[id] = i;
|
|
205
|
+
}));
|
|
206
|
+
}
|
|
207
|
+
resp.text = markdown.join('\n\n');
|
|
208
|
+
delete resp.pages;
|
|
209
|
+
return resp;
|
|
163
210
|
default:
|
|
164
211
|
throw new Error('Invalid provider.');
|
|
165
212
|
}
|
|
166
213
|
};
|
|
167
214
|
|
|
215
|
+
const splitPdf = async (file, options) => {
|
|
216
|
+
const [content, { PDFDocument }] = await Promise.all([
|
|
217
|
+
convert(file, { ...options, expected: BUFFER }), need('pdf-lib')
|
|
218
|
+
]);
|
|
219
|
+
const [doc, result] = [await PDFDocument.load(content), []];
|
|
220
|
+
const count = doc.getPageCount();
|
|
221
|
+
const size = ~~options?.size || Infinity;
|
|
222
|
+
for (let i = 0; i < count; i += size) {
|
|
223
|
+
result.push((async () => {
|
|
224
|
+
const sub = await PDFDocument.create();
|
|
225
|
+
const copied = await sub.copyPages(doc, Array.from(
|
|
226
|
+
{ length: Math.min(size, count - i) }, (_, j) => i + j
|
|
227
|
+
));
|
|
228
|
+
copied.forEach(page => sub.addPage(page));
|
|
229
|
+
return await convert(Buffer.from(await sub.save()), {
|
|
230
|
+
...options, input: 'BUFFER',
|
|
231
|
+
});
|
|
232
|
+
})());
|
|
233
|
+
}
|
|
234
|
+
return await Promise.all(result);
|
|
235
|
+
};
|
|
236
|
+
|
|
168
237
|
export default init;
|
|
169
238
|
export {
|
|
170
239
|
_NEED,
|
|
@@ -174,4 +243,5 @@ export {
|
|
|
174
243
|
ocr,
|
|
175
244
|
ocrImage,
|
|
176
245
|
parseOfficeFile,
|
|
246
|
+
splitPdf,
|
|
177
247
|
};
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "utilitas",
|
|
3
3
|
"description": "Just another common utility for JavaScript.",
|
|
4
|
-
"version": "2000.3.16",
|
|
4
|
+
"version": "2000.3.17",
|
|
5
5
|
"private": false,
|
|
6
6
|
"homepage": "https://github.com/Leask/utilitas",
|
|
7
7
|
"main": "index.mjs",
|
|
@@ -62,6 +62,7 @@
|
|
|
62
62
|
"office-text-extractor": "^3.0.3",
|
|
63
63
|
"openai": "^6.9.1",
|
|
64
64
|
"pdfjs-dist": "^5.4.394",
|
|
65
|
+
"pdf-lib": "^1.17.1",
|
|
65
66
|
"pg": "^8.16.3",
|
|
66
67
|
"pgvector": "^0.2.1",
|
|
67
68
|
"ping": "^1.0.0",
|