@truto/truto-jsonata 1.0.9 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -881,6 +881,23 @@ expression.evaluate({ file}).then(result => { console.log(result); });
881
881
 
882
882
  </details>
883
883
 
884
+ <details>
885
+ <summary>getDataUri(file)</summary>
886
+
887
+ Converts a `Blob`, `Buffer`, or `ReadableStream` to a data URI.
888
+
889
+ **Example:**
890
+
891
+ ```javascript
892
+ import trutoJsonata from '@truto/truto-jsonata';
893
+ const file = new Blob(['Hello, World!'], { type: 'text/plain' });
894
+ const expression = trutoJsonata("$getDataUri(file)");
895
+ expression.evaluate({ file}).then(result => { console.log(result); });
896
+ // Output: 'data:text/plain;base64,SGVsbG8sIFdvcmxkIQ=='
897
+ ```
898
+
899
+ </details>
900
+
884
901
  <details>
885
902
  <summary>blob(content, options)</summary>
886
903
 
@@ -1241,6 +1258,25 @@ expression3.evaluate({ jsonData: jsonData3, options: options3 }).then(result =>
1241
1258
 
1242
1259
  </details>
1243
1260
 
1261
+ <details>
1262
+ <summary>parseDocument(file)</summary>
1263
+
1264
+ Parses a document file (e.g., PDF, DOCX) and extracts text content.
1265
+
1266
+ **Example:**
1267
+
1268
+ ```javascript
1269
+ import trutoJsonata from '@truto/truto-jsonata';
1270
+ const file = new Blob(['Hello, World!'], { type: 'application/pdf' });
1271
+ const buffer = await file.arrayBuffer();
1272
+ const expression = trutoJsonata("$parseDocument(buffer)");
1273
+ expression.evaluate({ buffer }).then(result => { console.log(result); });
1274
+ // Output: 'Hello, World!'
1275
+
1276
+ ```
1277
+
1278
+ </details>
1279
+
1244
1280
  ### Markdown and Text Conversion
1245
1281
 
1246
1282
  <details>
@@ -1855,6 +1891,35 @@ expression.evaluate({ body, api_key }).then(result => { console.log(result); });
1855
1891
 
1856
1892
  </details>
1857
1893
 
1894
+ <details>
1895
+ <summary>recursiveCharacterTextSplitter(text, options)</summary>
1896
+
1897
+ Recursively splits a text into an array of smaller text chunks, with an optional overlap between consecutive chunks.
1898
+
1899
+ **Parameters:**
1900
+
1901
+ - **text**: The input text to split.
1902
+ - **options**: An object containing the following properties:
1903
+ - **chunkSize**: The maximum number of characters, words, or sentences per chunk (default is `200`).
1904
+ - **chunkOverlap**: The number of characters, words, or sentences to overlap between chunks (default is `60`).
1905
+
1906
+ **Example Usage:**
1907
+
1908
+ ```javascript
1909
+ import trutoJsonata from '@truto/truto-jsonata';
1910
+
1911
+ const text = "Hello, World! This is a sample text.";
1912
+ const options = {
1913
+ chunkSize: 10,
1914
+ chunkOverlap: 3
1915
+ };
1916
+ const expression = trutoJsonata("$recursiveCharacterTextSplitter(text, options)");
1917
+ expression.evaluate({ text, options }).then(result => { console.log(result); });
1918
+ // Output: ["Hello, Wo", "lo, World", "rld! This", "is a samp", "ample text", "text."]
1919
+ ```
1920
+
1921
+ </details>
1922
+
1858
1923
 
1859
1924
  ### Miscellaneous
1860
1925
 
@@ -2015,3 +2080,23 @@ expression.evaluate({}).then(result => { console.log(result); });
2015
2080
  ```
2016
2081
 
2017
2082
  </details>
2083
+
2084
+
2085
+
2086
+ <details>
2087
+ <summary>chunk(arr, size)</summary>
2088
+
2089
+ Chunks an array into smaller arrays of a specified size.
2090
+
2091
+ **Example:**
2092
+
2093
+ ```javascript
2094
+ import trutoJsonata from '@truto/truto-jsonata';
2095
+
2096
+
2097
+ const expression = trutoJsonata("$chunk([1, 2, 3, 4, 5], 2)");
2098
+ expression.evaluate({}).then(result => { console.log(result); });
2099
+ // Output: [[1,2],[3,4],[5]]
2100
+ ```
2101
+
2102
+ </details>
package/dist/main.cjs CHANGED
@@ -9,6 +9,13 @@ var $dxT2C$turndownplugingfm = require("turndown-plugin-gfm");
9
9
  var $dxT2C$xmljs = require("xml-js");
10
10
  var $dxT2C$json2md = require("json2md");
11
11
  var $dxT2C$mime = require("mime");
12
+ var $dxT2C$pmap = require("p-map");
13
+ var $dxT2C$pretry = require("p-retry");
14
+ var $dxT2C$officeparser = require("officeparser");
15
+ var $dxT2C$filetype = require("file-type");
16
+ var $dxT2C$pdfjsserverless = require("pdfjs-serverless");
17
+ var $dxT2C$langchaintextsplitters = require("@langchain/textsplitters");
18
+ var $dxT2C$nodestreamweb = require("node:stream/web");
12
19
 
13
20
  function $parcel$interopDefault(a) {
14
21
  return a && a.__esModule ? a.default : a;
@@ -1335,6 +1342,13 @@ var $77081a2d6d46cd50$export$2e2bcd8739ae039 = $77081a2d6d46cd50$var$base64decod
1335
1342
 
1336
1343
 
1337
1344
 
1345
+ function $49ae26210d3b6a4c$var$toNumber(value) {
1346
+ return (0, $dxT2C$lodashes.toNumber)(value);
1347
+ }
1348
+ var $49ae26210d3b6a4c$export$2e2bcd8739ae039 = $49ae26210d3b6a4c$var$toNumber;
1349
+
1350
+
1351
+
1338
1352
 
1339
1353
  async function $2c639152271462a6$var$convertMarkdownToHtml(markdown, options) {
1340
1354
  return (0, $dxT2C$marked.marked).parse(markdown, options);
@@ -1342,23 +1356,129 @@ async function $2c639152271462a6$var$convertMarkdownToHtml(markdown, options) {
1342
1356
  var $2c639152271462a6$export$2e2bcd8739ae039 = $2c639152271462a6$var$convertMarkdownToHtml;
1343
1357
 
1344
1358
 
1359
+
1360
+
1361
+
1345
1362
  async function $caaee789061bb8bb$var$generateEmbeddingsCohere(body, api_key) {
1346
- const response = await fetch("https://api.cohere.com/v2/embed", {
1347
- method: "POST",
1348
- headers: {
1349
- accept: "application/json",
1350
- "content-type": "application/json",
1351
- "user-agent": "truto",
1352
- Authorization: `Bearer ${api_key}`
1353
- },
1354
- body: JSON.stringify(body)
1363
+ if (!(0, $dxT2C$lodashes.isEmpty)(body.texts)) {
1364
+ const chunks = (0, $dxT2C$lodashes.chunk)((0, $dxT2C$lodashes.castArray)(body.texts), 20);
1365
+ return await (0, ($parcel$interopDefault($dxT2C$pmap)))(chunks, async (chunk)=>{
1366
+ return await (0, ($parcel$interopDefault($dxT2C$pretry)))(async ()=>{
1367
+ const response = await fetch("https://api.cohere.com/v2/embed", {
1368
+ method: "POST",
1369
+ headers: {
1370
+ accept: "application/json",
1371
+ "content-type": "application/json",
1372
+ "user-agent": "truto",
1373
+ Authorization: `Bearer ${api_key}`
1374
+ },
1375
+ body: JSON.stringify({
1376
+ ...body,
1377
+ texts: chunk
1378
+ })
1379
+ });
1380
+ if (!response.ok) throw new Error(await response.text());
1381
+ return await response.json();
1382
+ }, {
1383
+ retries: 10,
1384
+ maxTimeout: 600000
1385
+ });
1386
+ }, {
1387
+ concurrency: 1
1388
+ });
1389
+ } else if (!(0, $dxT2C$lodashes.isEmpty)(body.images)) return await (0, ($parcel$interopDefault($dxT2C$pretry)))(async ()=>{
1390
+ const response = await fetch("https://api.cohere.com/v2/embed", {
1391
+ method: "POST",
1392
+ headers: {
1393
+ accept: "application/json",
1394
+ "content-type": "application/json",
1395
+ "user-agent": "truto",
1396
+ Authorization: `Bearer ${api_key}`
1397
+ },
1398
+ body: JSON.stringify(body)
1399
+ });
1400
+ if (!response.ok) throw new Error(await response.text());
1401
+ return await response.json();
1402
+ }, {
1403
+ retries: 10,
1404
+ maxTimeout: 600000
1355
1405
  });
1356
- if (!response.ok) throw new Error(await response.text());
1357
- return await response.json();
1358
1406
  }
1359
1407
  var $caaee789061bb8bb$export$2e2bcd8739ae039 = $caaee789061bb8bb$var$generateEmbeddingsCohere;
1360
1408
 
1361
1409
 
1410
+
1411
+
1412
+
1413
+
1414
+ async function $05e3378f7c17d263$var$parsePdf(buffer) {
1415
+ const data = buffer.buffer.slice(buffer?.byteOffset, buffer?.byteOffset + buffer?.byteLength);
1416
+ const { getDocument: getDocument } = await (0, $dxT2C$pdfjsserverless.resolvePDFJS)();
1417
+ const doc = await getDocument({
1418
+ data: data,
1419
+ useSystemFonts: true
1420
+ }).promise;
1421
+ // Get metadata and initialize output object
1422
+ const output = [];
1423
+ // Iterate through each page and fetch the text content
1424
+ for(let i = 1; i <= doc.numPages; i++){
1425
+ const page = await doc.getPage(i);
1426
+ const textContent = await page.getTextContent();
1427
+ const contents = textContent.items.map((item)=>(0, $dxT2C$lodashes.get)(item, "str")).join(" ");
1428
+ // Add page content to output
1429
+ if (contents) output.push((0, $dxT2C$lodashes.join)([
1430
+ `Page Number: ${i}`,
1431
+ contents
1432
+ ], "/\n"));
1433
+ }
1434
+ // Return the results as JSON
1435
+ return (0, $dxT2C$lodashes.join)(output, "\n");
1436
+ }
1437
+ async function $05e3378f7c17d263$var$parseDocument(file) {
1438
+ if (file) {
1439
+ const chunks = [];
1440
+ for await (const chunk of file)chunks.push(Buffer.from(chunk));
1441
+ const buffer = Buffer.concat(chunks);
1442
+ const fileExt = (await (0, $dxT2C$filetype.fileTypeFromBuffer)(buffer))?.ext;
1443
+ if (fileExt === "pdf") return await $05e3378f7c17d263$var$parsePdf(buffer);
1444
+ return await (0, $dxT2C$officeparser.parseOfficeAsync)(buffer);
1445
+ }
1446
+ return await (0, $dxT2C$officeparser.parseOfficeAsync)(file);
1447
+ }
1448
+ var $05e3378f7c17d263$export$2e2bcd8739ae039 = $05e3378f7c17d263$var$parseDocument;
1449
+
1450
+
1451
+
1452
+ function $fe4dcef142601b8c$export$e600492876ee595b(text, options = {
1453
+ chunkSize: 200,
1454
+ chunkOverlap: 60
1455
+ }) {
1456
+ const splitter = new (0, $dxT2C$langchaintextsplitters.RecursiveCharacterTextSplitter)(options);
1457
+ return splitter.splitText(text);
1458
+ }
1459
+
1460
+
1461
+
1462
+
1463
+ async function $15c432f5f036a88a$var$getDataUri(file) {
1464
+ if (file instanceof (0, $dxT2C$nodestreamweb.ReadableStream)) {
1465
+ const chunks = [];
1466
+ for await (const chunk of file)chunks.push(Buffer.from(chunk));
1467
+ const buffer = Buffer.concat(chunks);
1468
+ const base64Image = buffer.toString("base64");
1469
+ const mimeType = (await (0, $dxT2C$filetype.fileTypeFromBuffer)(buffer))?.mime;
1470
+ // Construct the data URI for a PNG image
1471
+ return `data:${mimeType};base64,${base64Image}`;
1472
+ }
1473
+ const arrayBuffer = file instanceof Blob ? await file.arrayBuffer() : file;
1474
+ const buffer = Buffer.from(arrayBuffer);
1475
+ const base64Image = buffer.toString("base64");
1476
+ const mimeType = (await (0, $dxT2C$filetype.fileTypeFromBuffer)(buffer))?.mime;
1477
+ return `data:${mimeType};base64,${base64Image}`;
1478
+ }
1479
+ var $15c432f5f036a88a$export$2e2bcd8739ae039 = $15c432f5f036a88a$var$getDataUri;
1480
+
1481
+
1362
1482
  function $af351c41b7fd6f79$export$2e2bcd8739ae039(expression) {
1363
1483
  expression.registerFunction("dtFromIso", (0, $bab42b5e4be720d3$export$2e2bcd8739ae039));
1364
1484
  expression.registerFunction("base64decode", (0, $77081a2d6d46cd50$export$2e2bcd8739ae039));
@@ -1381,6 +1501,7 @@ function $af351c41b7fd6f79$export$2e2bcd8739ae039(expression) {
1381
1501
  expression.registerFunction("difference", function(arr1, arr2) {
1382
1502
  return (0, $dxT2C$lodashes.difference)(arr1, arr2);
1383
1503
  });
1504
+ expression.registerFunction("toNumber", (0, $49ae26210d3b6a4c$export$2e2bcd8739ae039));
1384
1505
  expression.registerFunction("jsonParse", (0, $491a3e93ed46e445$export$2e2bcd8739ae039));
1385
1506
  expression.registerFunction("getMimeType", (0, $cff411daa1ea9f52$export$2e2bcd8739ae039));
1386
1507
  expression.registerFunction("uuid", (0, $b3c3220fc07098c9$export$2e2bcd8739ae039));
@@ -1425,6 +1546,9 @@ function $af351c41b7fd6f79$export$2e2bcd8739ae039(expression) {
1425
1546
  expression.registerFunction("values", function(obj) {
1426
1547
  return (0, $dxT2C$lodashes.values)(obj);
1427
1548
  });
1549
+ expression.registerFunction("chunk", function(arr, size) {
1550
+ return (0, $dxT2C$lodashes.chunk)((0, $dxT2C$lodashes.castArray)(arr), size);
1551
+ });
1428
1552
  expression.registerFunction("wrap", function(value, wrapper, endWrapper) {
1429
1553
  return (0, $dxT2C$lodashes.join)([
1430
1554
  wrapper,
@@ -1432,6 +1556,9 @@ function $af351c41b7fd6f79$export$2e2bcd8739ae039(expression) {
1432
1556
  endWrapper || wrapper
1433
1557
  ], "");
1434
1558
  });
1559
+ expression.registerFunction("parseDocument", (0, $05e3378f7c17d263$export$2e2bcd8739ae039));
1560
+ expression.registerFunction("recursiveCharacterTextSplitter", (0, $fe4dcef142601b8c$export$e600492876ee595b));
1561
+ expression.registerFunction("getDataUri", (0, $15c432f5f036a88a$export$2e2bcd8739ae039));
1435
1562
  return expression;
1436
1563
  }
1437
1564