@nocobase/ai 2.1.0-beta.13 → 2.1.0-beta.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+ import { Document } from '@langchain/core/documents';
10
/**
 * Parses `blob` in a worker thread and resolves with the documents the worker reports.
 *
 * @param extname - File extension the worker uses to choose a parser (e.g. `.pdf`).
 * @param blob - File content; its bytes and MIME type are posted to the worker.
 * @returns The parsed documents, or an empty array when the worker reports none.
 */
+ export declare const loadByWorker: (extname: string, blob: Blob) => Promise<Document[]>;
@@ -0,0 +1,90 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ var __create = Object.create;
11
+ var __defProp = Object.defineProperty;
12
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
13
+ var __getOwnPropNames = Object.getOwnPropertyNames;
14
+ var __getProtoOf = Object.getPrototypeOf;
15
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
16
+ var __name = (target, value) => __defProp(target, "name", { value, configurable: true });
17
+ var __export = (target, all) => {
18
+ for (var name in all)
19
+ __defProp(target, name, { get: all[name], enumerable: true });
20
+ };
21
+ var __copyProps = (to, from, except, desc) => {
22
+ if (from && typeof from === "object" || typeof from === "function") {
23
+ for (let key of __getOwnPropNames(from))
24
+ if (!__hasOwnProp.call(to, key) && key !== except)
25
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
26
+ }
27
+ return to;
28
+ };
29
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
30
+ // If the importer is in node compatibility mode or this is not an ESM
31
+ // file that has been converted to a CommonJS file using a Babel-
32
+ // compatible transform (i.e. "__esModule" has not been set), then set
33
+ // "default" to the CommonJS "module.exports" for node compatibility.
34
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
35
+ mod
36
+ ));
37
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
38
+ var document_loader_exports = {};
39
+ __export(document_loader_exports, {
40
+ loadByWorker: () => loadByWorker
41
+ });
42
+ module.exports = __toCommonJS(document_loader_exports);
43
+ var import_node_worker_threads = require("node:worker_threads");
44
+ var import_node_path = __toESM(require("node:path"));
45
/**
 * Parses a document in a dedicated worker thread.
 *
 * The blob's bytes, its MIME type and `extname` are posted to `loader.worker.{ts,js}`
 * (resolved next to this module). The worker answers with either `{ documents }` or
 * `{ error }`, and the worker is always terminated once the promise settles.
 *
 * @param extname File extension forwarded to the worker to select a parser.
 * @param blob File content to parse.
 * @returns Promise of the documents reported by the worker (empty array when none).
 * @throws Error when the worker reports an error, emits `error`, or exits before replying.
 */
const loadByWorker = /* @__PURE__ */ __name(async (extname, blob) => {
  // A plain view over the blob's bytes; postMessage clones it, so no extra copy is made here.
  const bytes = new Uint8Array(await blob.arrayBuffer());
  const isTsRuntime = __filename.endsWith(".ts");
  const workerPath = import_node_path.default.join(__dirname, `loader.worker.${isTsRuntime ? "ts" : "js"}`);
  const worker = new import_node_worker_threads.Worker(workerPath, {
    // When this module itself runs from a .ts file, preload tsx/cjs in the worker.
    execArgv: isTsRuntime ? ["--require", "tsx/cjs"] : void 0
  });
  return new Promise((resolve, reject) => {
    let settled = false;
    // Settles the promise at most once; any later event is ignored.
    const close = /* @__PURE__ */ __name((error, result) => {
      if (settled) {
        return;
      }
      settled = true;
      if (error) {
        reject(error);
        return;
      }
      resolve(result || []);
    }, "close");
    worker.once("message", (payload) => {
      if (payload == null ? void 0 : payload.error) {
        close(new Error(payload.error));
        return;
      }
      close(void 0, (payload == null ? void 0 : payload.documents) || []);
    });
    worker.once("error", (error) => close(error));
    worker.once("exit", (code) => {
      // Reject on every premature exit: a clean exit (code 0) without a reply
      // would otherwise leave this promise pending forever.
      close(
        new Error(
          code === 0 ? "Document loader worker exited before returning a result" : `Document loader worker exited with code ${code}`
        )
      );
    });
    worker.postMessage({
      extname,
      mimeType: blob.type,
      buffer: bytes
    });
  }).finally(() => {
    // Best-effort cleanup; a failing terminate() must not mask the real outcome.
    worker.terminate().catch(() => void 0);
  });
}, "loadByWorker");
87
+ // Annotate the CommonJS export names for ESM import in node:
88
+ 0 && (module.exports = {
89
+ loadByWorker
90
+ });
@@ -0,0 +1,9 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+ export {};
@@ -0,0 +1,83 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ var __defProp = Object.defineProperty;
11
+ var __name = (target, value) => __defProp(target, "name", { value, configurable: true });
12
+ var import_pdf = require("@langchain/community/document_loaders/fs/pdf");
13
+ var import_pptx = require("@langchain/community/document_loaders/fs/pptx");
14
+ var import_docx = require("@langchain/community/document_loaders/fs/docx");
15
+ var import_node_worker_threads = require("node:worker_threads");
16
+ var import_text = require("./vendor/langchain/document_loaders/fs/text");
17
+ var import_csv = require("@langchain/community/document_loaders/fs/csv");
18
+ var import_xlsx = require("./xlsx");
19
+ var _a;
20
/** Reads a PDF blob into documents using PDFLoader. */
const loadPdf = async (blob) => new import_pdf.PDFLoader(blob).load();
24
/**
 * Reads a Word blob into documents using DocxLoader.
 * `type` is passed through as the loader's `type` option ("doc" or "docx" in this file).
 */
const loadDoc = async (blob, type) => new import_docx.DocxLoader(blob, { type }).load();
28
/** Reads a PowerPoint blob into documents using PPTXLoader. */
const loadPpt = async (blob) => new import_pptx.PPTXLoader(blob).load();
32
/** Reads a plain-text blob into documents using the vendored TextLoader. */
const loadTxt = async (blob) => new import_text.TextLoader(blob).load();
36
/** Reads a CSV blob into documents using CSVLoader. */
const loadCsv = async (blob) => new import_csv.CSVLoader(blob).load();
40
/**
 * Rebuilds a Blob from the posted bytes and dispatches to the parser matching the extension.
 *
 * @param payload Message from the main thread: `extname`, optional `mimeType`, and `buffer` (file bytes).
 * @returns Promise of the parsed documents; an empty array for extensions without a parser.
 */
const loadByExtname = /* @__PURE__ */ __name(async (payload) => {
  const blob = new Blob([Buffer.from(payload.buffer)], { type: payload.mimeType ?? "application/octet-stream" });
  // Match case-insensitively so that e.g. ".PDF" is parsed instead of silently yielding no documents.
  const extname = typeof payload.extname === "string" ? payload.extname.toLowerCase() : payload.extname;
  switch (extname) {
    case ".pdf":
      return loadPdf(blob);
    case ".ppt":
    case ".pptx":
      return loadPpt(blob);
    case ".doc":
      return loadDoc(blob, "doc");
    case ".docx":
      return loadDoc(blob, "docx");
    case ".csv":
      return loadCsv(blob);
    case ".xls":
    case ".xlsx":
      return (0, import_xlsx.loadXlsx)(blob);
    case ".json":
    case ".md":
    case ".txt":
      return loadTxt(blob);
    default:
      // Unsupported extensions are not an error; they simply produce no documents.
      return [];
  }
}, "loadByExtname");
65
// Worker entry point: answer each parse request with `{ documents }` on success
// or `{ error }` (stack trace or stringified error) on failure.
const workerPort = import_node_worker_threads.parentPort;
if (workerPort != null) {
  workerPort.on("message", async (payload) => {
    try {
      const loaded = await loadByExtname(payload);
      // Post plain objects holding only the three fields the main thread consumes.
      const plainDocuments = loaded.map(({ pageContent, metadata, id }) => ({ pageContent, metadata, id }));
      workerPort.postMessage({ documents: plainDocuments });
    } catch (error) {
      const stack = error == null ? void 0 : error.stack;
      workerPort.postMessage({ error: String(stack || error) });
    }
  });
}
@@ -0,0 +1,20 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+ /// <reference types="node" />
10
+ import { Document } from '@langchain/core/documents';
11
+ import { BaseDocumentLoader } from '@langchain/core/document_loaders/base';
12
/**
 * Document loader for plain text, accepting either a file path (read as UTF-8)
 * or a Blob-like object exposing `text()` and `type`.
 */
+ export declare class TextLoader extends BaseDocumentLoader {
13
+ private filePathOrBlob;
14
+ constructor(filePathOrBlob: any);
15
/** Splits raw text into page contents; the base implementation returns the text as a single entry. */
+ parse(raw: any): Promise<any[]>;
16
/** Reads the source, runs `parse`, and wraps every resulting string in a Document. */
+ load(): Promise<Document<any>[]>;
17
/** Dynamically imports `readFile` from `node:fs/promises`; throws when the import fails. */
+ static imports(): Promise<{
18
+ readFile: typeof import("fs/promises").readFile;
19
+ }>;
20
+ }
@@ -0,0 +1,99 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ var __create = Object.create;
11
+ var __defProp = Object.defineProperty;
12
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
13
+ var __getOwnPropNames = Object.getOwnPropertyNames;
14
+ var __getProtoOf = Object.getPrototypeOf;
15
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
16
+ var __name = (target, value) => __defProp(target, "name", { value, configurable: true });
17
+ var __export = (target, all) => {
18
+ for (var name in all)
19
+ __defProp(target, name, { get: all[name], enumerable: true });
20
+ };
21
+ var __copyProps = (to, from, except, desc) => {
22
+ if (from && typeof from === "object" || typeof from === "function") {
23
+ for (let key of __getOwnPropNames(from))
24
+ if (!__hasOwnProp.call(to, key) && key !== except)
25
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
26
+ }
27
+ return to;
28
+ };
29
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
30
+ // If the importer is in node compatibility mode or this is not an ESM
31
+ // file that has been converted to a CommonJS file using a Babel-
32
+ // compatible transform (i.e. "__esModule" has not been set), then set
33
+ // "default" to the CommonJS "module.exports" for node compatibility.
34
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
35
+ mod
36
+ ));
37
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
38
+ var text_exports = {};
39
+ __export(text_exports, {
40
+ TextLoader: () => TextLoader
41
+ });
42
+ module.exports = __toCommonJS(text_exports);
43
+ var import_documents = require("@langchain/core/documents");
44
+ var import_env = require("@langchain/core/utils/env");
45
+ var import_base = require("@langchain/core/document_loaders/base");
46
/**
 * Document loader for plain text. Accepts either a file path (read as UTF-8) or a
 * Blob-like object exposing `text()` and `type`.
 */
const _TextLoader = class _TextLoader extends import_base.BaseDocumentLoader {
  filePathOrBlob;
  /**
   * @param filePathOrBlob Path of the file to read, or a Blob-like object holding the text.
   */
  constructor(filePathOrBlob) {
    super();
    this.filePathOrBlob = filePathOrBlob;
  }
  /**
   * Splits raw text into page contents. The default keeps the whole text as one entry.
   * @param raw Text read from the source.
   * @returns Promise of the page contents.
   */
  async parse(raw) {
    return [raw];
  }
  /**
   * Reads the source, runs `parse`, and wraps each resulting string in a Document.
   * @returns Promise of one Document per parsed entry; `metadata.line` (1-based) is
   *          added only when `parse` yields more than one entry.
   * @throws Error when `parse` yields a non-string entry.
   */
  async load() {
    let text;
    let metadata;
    if (typeof this.filePathOrBlob === "string") {
      const { readFile } = await _TextLoader.imports();
      text = await readFile(this.filePathOrBlob, "utf8");
      metadata = { source: this.filePathOrBlob };
    } else {
      text = await this.filePathOrBlob.text();
      metadata = { source: "blob", blobType: this.filePathOrBlob.type };
    }
    const parsed = await this.parse(text);
    // Validate everything up front so no Document is built from a bad entry.
    parsed.forEach((pageContent, i) => {
      if (typeof pageContent !== "string") {
        throw new Error(`Expected string, at position ${i} got ${typeof pageContent}`);
      }
    });
    return parsed.map(
      (pageContent, i) => new import_documents.Document({
        pageContent,
        metadata: parsed.length === 1 ? metadata : {
          ...metadata,
          line: i + 1
        }
      })
    );
  }
  /**
   * Dynamically imports `readFile` from `node:fs/promises`.
   * @returns Promise of `{ readFile }`.
   * @throws Error naming the detected environment when the import fails.
   */
  static async imports() {
    try {
      const { readFile } = await import("node:fs/promises");
      return { readFile };
    } catch (e) {
      console.error(e);
      // The unfilled "https://<link to docs>" placeholder was removed from this message.
      throw new Error(
        `Failed to load fs/promises. TextLoader available only on environment 'node'. It appears you are running environment '${(0, import_env.getEnv)()}'.`
      );
    }
  }
};
__name(_TextLoader, "TextLoader");
let TextLoader = _TextLoader;
96
+ // Annotate the CommonJS export names for ESM import in node:
97
+ 0 && (module.exports = {
98
+ TextLoader
99
+ });
@@ -0,0 +1,10 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+ import { Document } from '@langchain/core/documents';
10
/**
 * Parses a spreadsheet blob and resolves with one Document per sheet that has content.
 *
 * @param blob - Workbook bytes; `blob.type` is recorded in each document's metadata.
 * @returns Documents whose text is a `Sheet: <name>` header followed by one line per row.
 */
+ export declare const loadXlsx: (blob: Blob) => Promise<Document[]>;
@@ -0,0 +1,100 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ var __create = Object.create;
11
+ var __defProp = Object.defineProperty;
12
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
13
+ var __getOwnPropNames = Object.getOwnPropertyNames;
14
+ var __getProtoOf = Object.getPrototypeOf;
15
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
16
+ var __name = (target, value) => __defProp(target, "name", { value, configurable: true });
17
+ var __export = (target, all) => {
18
+ for (var name in all)
19
+ __defProp(target, name, { get: all[name], enumerable: true });
20
+ };
21
+ var __copyProps = (to, from, except, desc) => {
22
+ if (from && typeof from === "object" || typeof from === "function") {
23
+ for (let key of __getOwnPropNames(from))
24
+ if (!__hasOwnProp.call(to, key) && key !== except)
25
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
26
+ }
27
+ return to;
28
+ };
29
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
30
+ // If the importer is in node compatibility mode or this is not an ESM
31
+ // file that has been converted to a CommonJS file using a Babel-
32
+ // compatible transform (i.e. "__esModule" has not been set), then set
33
+ // "default" to the CommonJS "module.exports" for node compatibility.
34
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
35
+ mod
36
+ ));
37
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
38
+ var xlsx_exports = {};
39
+ __export(xlsx_exports, {
40
+ loadXlsx: () => loadXlsx
41
+ });
42
+ module.exports = __toCommonJS(xlsx_exports);
43
+ var import_documents = require("@langchain/core/documents");
44
+ var XLSX = __toESM(require("xlsx"));
45
/** Converts a cell value to text; `null` and `undefined` become the empty string. */
const normalizeCellValue = (value) => (value == null ? "" : String(value));
51
/** Returns a copy of `row` without its trailing cells whose text is blank. */
const trimTrailingEmptyCells = (row) => {
  // Walk backwards to the last cell that still has visible text.
  for (let last = row.length - 1; last >= 0; last -= 1) {
    if (normalizeCellValue(row[last]).trim() !== "") {
      return row.slice(0, last + 1);
    }
  }
  return row.slice(0, 0);
};
58
/**
 * Flattens a worksheet into text lines, one per non-empty row.
 *
 * @param sheet Worksheet to convert.
 * @returns Tab-separated row strings; rows without visible content are dropped.
 */
const sheetToLines = /* @__PURE__ */ __name((sheet) => {
  const rows = XLSX.utils.sheet_to_json(sheet, {
    header: 1,
    raw: false,
    defval: "",
    blankrows: false
  });
  return rows
    .map((row) => trimTrailingEmptyCells(Array.isArray(row) ? row : []))
    .filter((row) => row.length > 0)
    // Explicit "\t" escape (as in src/xlsx.ts) so the separator cannot be confused
    // with a space or altered by whitespace-normalizing tools.
    .map((row) => row.map((cell) => normalizeCellValue(cell)).join("\t"))
    .filter((line) => line.trim().length > 0);
}, "sheetToLines");
67
/**
 * Parses a spreadsheet blob into one Document per sheet that has content.
 *
 * @param blob Workbook bytes; `blob.type` is recorded in each document's metadata.
 * @returns Promise of documents whose text is a "Sheet: <name>" header followed by the sheet's lines.
 */
const loadXlsx = async (blob) => {
  const workbook = XLSX.read(await blob.arrayBuffer(), { type: "array", cellText: true });
  const documents = [];
  for (const [sheetIndex, sheetName] of workbook.SheetNames.entries()) {
    const worksheet = workbook.Sheets[sheetName];
    if (!worksheet) {
      continue;
    }
    const lines = sheetToLines(worksheet);
    if (lines.length === 0) {
      // Sheets without any visible content produce no document.
      continue;
    }
    const pageContent = [`Sheet: ${sheetName}`, ...lines].join("\n");
    const metadata = { source: "blob", blobType: blob.type, sheetName, sheetIndex };
    documents.push(new import_documents.Document({ pageContent, metadata }));
  }
  return documents;
};
97
+ // Annotate the CommonJS export names for ESM import in node:
98
+ 0 && (module.exports = {
99
+ loadXlsx
100
+ });
package/lib/index.d.ts CHANGED
@@ -10,3 +10,4 @@ export * from './ai-manager';
10
10
  export * from './document-manager';
11
11
  export * from './tools-manager';
12
12
  export * from './loader';
13
+ export * from './document-loader';
package/lib/index.js CHANGED
@@ -27,10 +27,12 @@ __reExport(src_exports, require("./ai-manager"), module.exports);
27
27
  __reExport(src_exports, require("./document-manager"), module.exports);
28
28
  __reExport(src_exports, require("./tools-manager"), module.exports);
29
29
  __reExport(src_exports, require("./loader"), module.exports);
30
+ __reExport(src_exports, require("./document-loader"), module.exports);
30
31
  // Annotate the CommonJS export names for ESM import in node:
31
32
  0 && (module.exports = {
32
33
  ...require("./ai-manager"),
33
34
  ...require("./document-manager"),
34
35
  ...require("./tools-manager"),
35
- ...require("./loader")
36
+ ...require("./loader"),
37
+ ...require("./document-loader")
36
38
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nocobase/ai",
3
- "version": "2.1.0-beta.13",
3
+ "version": "2.1.0-beta.15",
4
4
  "description": "",
5
5
  "license": "Apache-2.0",
6
6
  "main": "./lib/index.js",
@@ -16,21 +16,23 @@
16
16
  "@langchain/langgraph-checkpoint": "^1.0.0",
17
17
  "@langchain/ollama": "^1.2.2",
18
18
  "@langchain/openai": "^1.2.7",
19
- "@nocobase/logger": "2.1.0-beta.13",
20
- "@nocobase/resourcer": "2.1.0-beta.13",
21
- "@nocobase/utils": "2.1.0-beta.13",
19
+ "@nocobase/logger": "2.1.0-beta.15",
20
+ "@nocobase/resourcer": "2.1.0-beta.15",
21
+ "@nocobase/utils": "2.1.0-beta.15",
22
+ "d3-dsv": "2",
22
23
  "fast-glob": "^3.3.2",
23
24
  "flexsearch": "^0.8.2",
24
25
  "langchain": "^1.2.24",
25
26
  "mammoth": "^1.10.0",
26
27
  "officeparser": "^5.2.0",
27
28
  "pdf-parse": "^1.1.1",
28
- "word-extractor": "^1.0.4"
29
+ "word-extractor": "^1.0.4",
30
+ "xlsx": "^0.18.5"
29
31
  },
30
32
  "repository": {
31
33
  "type": "git",
32
34
  "url": "git+https://github.com/nocobase/nocobase.git",
33
35
  "directory": "packages/ai"
34
36
  },
35
- "gitHead": "691716e5f4e5f8bd3859d65bc8a29b4e3c32209b"
37
+ "gitHead": "dc1aceea6357e6ab149976c2a236fc4b6bee1370"
36
38
  }
@@ -0,0 +1,57 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ import { Document } from '@langchain/core/documents';
11
+ import { Worker } from 'node:worker_threads';
12
+ import path from 'node:path';
13
+
14
/**
 * Parses a document in a dedicated worker thread.
 *
 * The blob's bytes, its MIME type and `extname` are posted to `loader.worker.{ts,js}`
 * (resolved next to this module). The worker answers with either `{ documents }` or
 * `{ error }`, and the worker is always terminated once the promise settles.
 *
 * NOTE(review): results arrive over the worker message channel, so they are presumably
 * plain copies rather than `Document` class instances — confirm before relying on `instanceof`.
 *
 * @param extname - File extension forwarded to the worker to select a parser.
 * @param blob - File content to parse.
 * @returns The documents reported by the worker (empty array when none).
 * @throws Error when the worker reports an error, emits `error`, or exits before replying.
 */
export const loadByWorker = async (extname: string, blob: Blob): Promise<Document[]> => {
  // A plain view over the blob's bytes; postMessage clones it, so no extra copy is made here.
  const bytes = new Uint8Array(await blob.arrayBuffer());
  const isTsRuntime = __filename.endsWith('.ts');
  const workerPath = path.join(__dirname, `loader.worker.${isTsRuntime ? 'ts' : 'js'}`);
  const worker = new Worker(workerPath, {
    // When this module itself runs from a .ts file, preload tsx/cjs in the worker.
    execArgv: isTsRuntime ? ['--require', 'tsx/cjs'] : undefined,
  });

  return new Promise<Document[]>((resolve, reject) => {
    let settled = false;
    // Settles the promise at most once; any later event is ignored.
    const close = (error?: Error, result?: Document[]) => {
      if (settled) {
        return;
      }
      settled = true;
      if (error) {
        reject(error);
        return;
      }
      resolve(result || []);
    };

    worker.once('message', (payload: { documents?: Document[]; error?: string }) => {
      if (payload?.error) {
        close(new Error(payload.error));
        return;
      }
      close(undefined, payload?.documents || []);
    });
    worker.once('error', (error) => close(error));
    worker.once('exit', (code) => {
      // Reject on every premature exit: a clean exit (code 0) without a reply
      // would otherwise leave this promise pending forever.
      close(
        new Error(
          code === 0
            ? 'Document loader worker exited before returning a result'
            : `Document loader worker exited with code ${code}`,
        ),
      );
    });

    worker.postMessage({
      extname,
      mimeType: blob.type,
      buffer: bytes,
    });
  }).finally(() => {
    // Best-effort cleanup; a failing terminate() must not mask the real outcome.
    worker.terminate().catch(() => undefined);
  });
};
@@ -0,0 +1,100 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
11
+ import { PPTXLoader } from '@langchain/community/document_loaders/fs/pptx';
12
+ import { DocxLoader } from '@langchain/community/document_loaders/fs/docx';
13
+ import type { Document } from '@langchain/core/documents';
14
+ import { parentPort } from 'node:worker_threads';
15
+ import { TextLoader } from './vendor/langchain/document_loaders/fs/text';
16
+ import { CSVLoader } from '@langchain/community/document_loaders/fs/csv';
17
+ import { loadXlsx } from './xlsx';
18
+
19
+ type ParsePayload = {
20
+ extname: string;
21
+ mimeType?: string;
22
+ buffer: Uint8Array;
23
+ };
24
+
25
+ type WorkerResponse = {
26
+ documents?: Array<Pick<Document, 'pageContent' | 'metadata' | 'id'>>;
27
+ error?: string;
28
+ };
29
+
30
+ const loadPdf = async (blob: Blob): Promise<Document[]> => {
31
+ const loader = new PDFLoader(blob);
32
+ return loader.load();
33
+ };
34
+
35
+ const loadDoc = async (blob: Blob, type: 'docx' | 'doc'): Promise<Document[]> => {
36
+ const loader = new DocxLoader(blob, { type });
37
+ return loader.load();
38
+ };
39
+
40
+ const loadPpt = async (blob: Blob): Promise<Document[]> => {
41
+ const loader = new PPTXLoader(blob);
42
+ return loader.load();
43
+ };
44
+
45
+ const loadTxt = async (blob: Blob): Promise<Document[]> => {
46
+ const loader = new TextLoader(blob);
47
+ return loader.load();
48
+ };
49
+
50
+ const loadCsv = async (blob: Blob): Promise<Document[]> => {
51
+ const loader = new CSVLoader(blob);
52
+ return loader.load();
53
+ };
54
+
55
+ const loadByExtname = async (payload: ParsePayload): Promise<Document[]> => {
56
+ // @ts-ignore
57
+ const blob = new Blob([Buffer.from(payload.buffer)], { type: payload.mimeType ?? 'application/octet-stream' });
58
+
59
+ switch (payload.extname) {
60
+ case '.pdf':
61
+ return loadPdf(blob);
62
+ case '.ppt':
63
+ case '.pptx':
64
+ return loadPpt(blob);
65
+ case '.doc':
66
+ return loadDoc(blob, 'doc');
67
+ case '.docx':
68
+ return loadDoc(blob, 'docx');
69
+ case '.csv':
70
+ return loadCsv(blob);
71
+ case '.xls':
72
+ case '.xlsx':
73
+ return loadXlsx(blob);
74
+ case '.json':
75
+ case '.md':
76
+ case '.txt':
77
+ return loadTxt(blob);
78
+ default:
79
+ return [];
80
+ }
81
+ };
82
+
83
+ parentPort?.on('message', async (payload: ParsePayload) => {
84
+ try {
85
+ const documents = await loadByExtname(payload);
86
+ const response: WorkerResponse = {
87
+ documents: documents.map((doc) => ({
88
+ pageContent: doc.pageContent,
89
+ metadata: doc.metadata,
90
+ id: doc.id,
91
+ })),
92
+ };
93
+ parentPort?.postMessage(response);
94
+ } catch (error) {
95
+ const response: WorkerResponse = {
96
+ error: String(error?.stack || error),
97
+ };
98
+ parentPort?.postMessage(response);
99
+ }
100
+ });
@@ -0,0 +1,72 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ import { Document } from '@langchain/core/documents';
11
+ import { getEnv } from '@langchain/core/utils/env';
12
+ import { BaseDocumentLoader } from '@langchain/core/document_loaders/base';
13
+
14
+ export class TextLoader extends BaseDocumentLoader {
15
+ private filePathOrBlob: any;
16
+
17
+ constructor(filePathOrBlob) {
18
+ super();
19
+ this.filePathOrBlob = filePathOrBlob;
20
+ }
21
+
22
+ async parse(raw) {
23
+ return [raw];
24
+ }
25
+
26
+ async load() {
27
+ let text;
28
+ let metadata;
29
+
30
+ if (typeof this.filePathOrBlob === 'string') {
31
+ const { readFile } = await TextLoader.imports();
32
+ text = await readFile(this.filePathOrBlob, 'utf8');
33
+ metadata = { source: this.filePathOrBlob };
34
+ } else {
35
+ text = await this.filePathOrBlob.text();
36
+ metadata = { source: 'blob', blobType: this.filePathOrBlob.type };
37
+ }
38
+
39
+ const parsed = await this.parse(text);
40
+ parsed.forEach((pageContent, i) => {
41
+ if (typeof pageContent !== 'string') {
42
+ throw new Error(`Expected string, at position ${i} got ${typeof pageContent}`);
43
+ }
44
+ });
45
+
46
+ return parsed.map(
47
+ (pageContent, i) =>
48
+ new Document({
49
+ pageContent,
50
+ metadata:
51
+ parsed.length === 1
52
+ ? metadata
53
+ : {
54
+ ...metadata,
55
+ line: i + 1,
56
+ },
57
+ }),
58
+ );
59
+ }
60
+
61
+ static async imports() {
62
+ try {
63
+ const { readFile } = await import('node:fs/promises');
64
+ return { readFile };
65
+ } catch (e) {
66
+ console.error(e);
67
+ throw new Error(
68
+ `Failed to load fs/promises. TextLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https://<link to docs> for alternatives.`,
69
+ );
70
+ }
71
+ }
72
+ }
@@ -0,0 +1,82 @@
1
+ /**
2
+ * This file is part of the NocoBase (R) project.
3
+ * Copyright (c) 2020-2024 NocoBase Co., Ltd.
4
+ * Authors: NocoBase Team.
5
+ *
6
+ * This project is dual-licensed under AGPL-3.0 and NocoBase Commercial License.
7
+ * For more information, please refer to: https://www.nocobase.com/agreement.
8
+ */
9
+
10
+ import { Document } from '@langchain/core/documents';
11
+ import * as XLSX from 'xlsx';
12
+
13
+ const normalizeCellValue = (value: unknown): string => {
14
+ if (value === undefined || value === null) {
15
+ return '';
16
+ }
17
+
18
+ return String(value);
19
+ };
20
+
21
+ const trimTrailingEmptyCells = (row: unknown[]): unknown[] => {
22
+ let end = row.length;
23
+
24
+ while (end > 0 && normalizeCellValue(row[end - 1]).trim() === '') {
25
+ end -= 1;
26
+ }
27
+
28
+ return row.slice(0, end);
29
+ };
30
+
31
+ const sheetToLines = (sheet: XLSX.WorkSheet): string[] => {
32
+ const rows = XLSX.utils.sheet_to_json(sheet, {
33
+ header: 1,
34
+ raw: false,
35
+ defval: '',
36
+ blankrows: false,
37
+ }) as unknown[][];
38
+
39
+ return rows
40
+ .map((row) => trimTrailingEmptyCells(Array.isArray(row) ? row : []))
41
+ .filter((row) => row.length > 0)
42
+ .map((row) => row.map((cell) => normalizeCellValue(cell)).join('\t'))
43
+ .filter((line) => line.trim().length > 0);
44
+ };
45
+
46
+ export const loadXlsx = async (blob: Blob): Promise<Document[]> => {
47
+ const buffer = await blob.arrayBuffer();
48
+ const workbook = XLSX.read(buffer, {
49
+ type: 'array',
50
+ cellText: true,
51
+ });
52
+
53
+ const documents: Document[] = [];
54
+
55
+ workbook.SheetNames.forEach((sheetName, index) => {
56
+ const sheet = workbook.Sheets[sheetName];
57
+
58
+ if (!sheet) {
59
+ return;
60
+ }
61
+
62
+ const lines = sheetToLines(sheet);
63
+
64
+ if (!lines.length) {
65
+ return;
66
+ }
67
+
68
+ documents.push(
69
+ new Document({
70
+ pageContent: [`Sheet: ${sheetName}`, ...lines].join('\n'),
71
+ metadata: {
72
+ source: 'blob',
73
+ blobType: blob.type,
74
+ sheetName,
75
+ sheetIndex: index,
76
+ },
77
+ }),
78
+ );
79
+ });
80
+
81
+ return documents;
82
+ };
package/src/index.ts CHANGED
@@ -11,3 +11,4 @@ export * from './ai-manager';
11
11
  export * from './document-manager';
12
12
  export * from './tools-manager';
13
13
  export * from './loader';
14
+ export * from './document-loader';