utilitas 2000.3.16 → 2000.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/manifest.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  const manifest = {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "2000.3.16",
4
+ "version": "2000.3.17",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
@@ -51,6 +51,7 @@ const manifest = {
51
51
  "office-text-extractor": "^3.0.3",
52
52
  "openai": "^6.9.1",
53
53
  "pdfjs-dist": "^5.4.394",
54
+ "pdf-lib": "^1.17.1",
54
55
  "pg": "^8.16.3",
55
56
  "pgvector": "^0.2.1",
56
57
  "ping": "^1.0.0",
package/lib/storage.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  log as _log,
3
3
  base64Decode, base64Encode, ensureString, extract, ignoreErrFunc,
4
- mergeAtoB, need, throwError, trim, voidFunc, which,
4
+ mergeAtoB, need, throwError, trim, which,
5
5
  } from './utilitas.mjs';
6
6
 
7
7
  import { fileTypeFromBuffer } from 'file-type';
package/lib/vision.mjs CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  import { convert, DATAURL, BUFFER, FILE } from './storage.mjs';
10
10
  import fs from 'node:fs';
11
11
 
12
- const _NEED = ['office-text-extractor', 'pdfjs-dist', 'tesseract.js'];
12
+ const _NEED = ['office-text-extractor', 'pdfjs-dist', 'pdf-lib', 'tesseract.js'];
13
13
  const clients = {};
14
14
  const errorMessage = 'Invalid input data.';
15
15
  const log = content => _log(content, import.meta.url);
@@ -144,27 +144,96 @@ const ocr = async (file, options = {}) => {
144
144
  const client = clients?.[provider];
145
145
  assert(client, 'No available OCR provider.');
146
146
  const model = options?.model || client.model;
147
- const document_url = await convert(file, { ...options, expected: DATAURL });
148
147
  switch (provider) {
149
148
  case GOOGLE_MISTRAL:
150
149
  const key = await getGoogleAuthTokenByAuth(client.auth);
151
- return await (await fetch(
152
- `https://${client.region}-aiplatform.googleapis.com/v1/`
153
- + `projects/${client.project}/locations/${client.region}/`
154
- + `publishers/mistralai/models/${model}:rawPredict`, {
155
- method: 'POST', headers: {
156
- 'Content-Type': 'application/json',
157
- 'Authorization': `Bearer ${key}`
158
- }, body: JSON.stringify({
159
- model, include_image_base64: true,
160
- document: { type: 'document_url', document_url },
161
- })
162
- })).json();
150
+ const inputPdfs = await splitPdf(file, {
151
+ ...options, expected: DATAURL, size: 2,
152
+ });
153
+ const resps = (await Promise.all(inputPdfs.map(
154
+ async document_url => await (await fetch(
155
+ `https://${client.region}-aiplatform.googleapis.com/v1/`
156
+ + `projects/${client.project}/locations/${client.region}/`
157
+ + `publishers/mistralai/models/${model}:rawPredict`, {
158
+ method: 'POST', headers: {
159
+ 'Content-Type': 'application/json',
160
+ 'Authorization': `Bearer ${key}`
161
+ }, body: JSON.stringify({
162
+ model, include_image_base64: true,
163
+ document: { type: 'document_url', document_url },
164
+ })
165
+ })).json()
166
+ ))).filter(x => x?.pages?.length);
167
+ const resp = {
168
+ pages: [], usage_info: { pages_processed: 0, doc_size_bytes: 0 }
169
+ };
170
+ resps.map(x => {
171
+ x.pages.map(p => {
172
+ p.index = resp.pages.length;
173
+ resp.pages.push(p);
174
+ p.images.map(i => {
175
+ const oId = i.id;
176
+ i.id = `page-${p.index}-${oId}`;
177
+ p.markdown = p.markdown.replaceAll(
178
+ `![${oId}](${oId})`, `![${i.id}](${i.id})`
179
+ );
180
+ });
181
+ });
182
+ resp.model = x.model;
183
+ resp.usage_info.pages_processed += x.usage_info.pages_processed;
184
+ resp.usage_info.doc_size_bytes += x.usage_info.doc_size_bytes;
185
+ });
186
+ if (options?.raw) { return resp; }
187
+ else if (options?.paging) { return resp.pages; }
188
+ const markdown = [];
189
+ resp.images = {};
190
+ for (const p of resp.pages) {
191
+ markdown.push(p.markdown);
192
+ await Promise.all(p.images.map(async i => {
193
+ const id = i.id;
194
+ i.width = i.bottom_right_x - i.top_left_x;
195
+ i.height = i.bottom_right_y - i.top_left_y;
196
+ i.annotation = i.image_annotation;
197
+ i.data = await convert(i.image_base64, {
198
+ ...options, input: 'DATAURL',
199
+ });
200
+ [
201
+ 'id', 'image_annotation', 'image_base64', 'top_left_x',
202
+ 'top_left_y', 'bottom_right_x', 'bottom_right_y',
203
+ ].map(k => delete i[k]);
204
+ resp.images[id] = i;
205
+ }));
206
+ }
207
+ resp.text = markdown.join('\n\n');
208
+ delete resp.pages;
209
+ return resp;
163
210
  default:
164
211
  throw new Error('Invalid provider.');
165
212
  }
166
213
  };
167
214
 
215
/**
 * Split a PDF into consecutive chunks of at most `options.size` pages each.
 *
 * The input is first normalized to a buffer via `convert`, loaded with
 * pdf-lib, and every chunk is re-serialized and passed back through `convert`
 * with `input: 'BUFFER'` and the caller's remaining options (so the output
 * form presumably follows `options.expected` — confirm against `convert`).
 *
 * @param {*} file - Anything `convert` accepts as a PDF source.
 * @param {object} [options] - Forwarded to `convert`; `options.size` is the
 *   maximum number of pages per chunk. Values that are missing, non-numeric,
 *   zero or negative disable splitting (a single chunk with all pages).
 * @returns {Promise<Array>} One converted document per chunk, in page order;
 *   empty when the source document reports zero pages.
 */
const splitPdf = async (file, options) => {
    const [content, { PDFDocument }] = await Promise.all([
        convert(file, { ...options, expected: BUFFER }), need('pdf-lib')
    ]);
    const doc = await PDFDocument.load(content);
    const count = doc.getPageCount();
    // Only a positive whole page count is a usable step. The previous `~~`
    // coercion let negative values (and values >= 2^31, which wrap negative
    // under ToInt32) through, making the loop below step backwards forever.
    const requested = Math.trunc(Number(options?.size));
    const size = requested > 0 ? requested : Infinity;
    const tasks = [];
    for (let start = 0; start < count; start += size) {
        // Zero-based indices of the source pages that belong to this chunk.
        const indices = Array.from(
            { length: Math.min(size, count - start) }, (_, j) => start + j
        );
        // Chunks are built concurrently; Promise.all keeps them in page order.
        tasks.push((async () => {
            const sub = await PDFDocument.create();
            const copied = await sub.copyPages(doc, indices);
            copied.forEach(page => sub.addPage(page));
            return await convert(Buffer.from(await sub.save()), {
                ...options, input: 'BUFFER',
            });
        })());
    }
    return await Promise.all(tasks);
};
236
+
168
237
  export default init;
169
238
  export {
170
239
  _NEED,
@@ -174,4 +243,5 @@ export {
174
243
  ocr,
175
244
  ocrImage,
176
245
  parseOfficeFile,
246
+ splitPdf,
177
247
  };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "2000.3.16",
4
+ "version": "2000.3.17",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
@@ -62,6 +62,7 @@
62
62
  "office-text-extractor": "^3.0.3",
63
63
  "openai": "^6.9.1",
64
64
  "pdfjs-dist": "^5.4.394",
65
+ "pdf-lib": "^1.17.1",
65
66
  "pg": "^8.16.3",
66
67
  "pgvector": "^0.2.1",
67
68
  "ping": "^1.0.0",