botrun-horse 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/bin/bh.mjs +193 -0
- package/bin/commands/dag-cmd.mjs +74 -0
- package/bin/commands/db-cmd.mjs +73 -0
- package/bin/commands/doc.mjs +185 -0
- package/bin/commands/gemini.mjs +120 -0
- package/bin/commands/help.mjs +109 -0
- package/bin/commands/legal.mjs +174 -0
- package/bin/commands/nchc.mjs +212 -0
- package/bin/commands/openrouter.mjs +154 -0
- package/bin/commands/prompt.mjs +175 -0
- package/bin/commands/schema.mjs +258 -0
- package/bin/commands/search.mjs +46 -0
- package/bin/commands/writing.mjs +33 -0
- package/lib/core/adapters/base.mjs +52 -0
- package/lib/core/adapters/claude.mjs +13 -0
- package/lib/core/adapters/gemini-api.mjs +174 -0
- package/lib/core/adapters/gemini-shared.mjs +164 -0
- package/lib/core/adapters/gemini-vertex.mjs +232 -0
- package/lib/core/adapters/local.mjs +13 -0
- package/lib/core/adapters/nchc.mjs +236 -0
- package/lib/core/adapters/openai-shared.mjs +34 -0
- package/lib/core/adapters/openrouter.mjs +304 -0
- package/lib/core/ai-cache.mjs +277 -0
- package/lib/core/ai-router.mjs +217 -0
- package/lib/core/cli-utils.mjs +170 -0
- package/lib/core/dag.mjs +114 -0
- package/lib/core/db.mjs +412 -0
- package/lib/core/env.mjs +64 -0
- package/lib/core/llm.mjs +58 -0
- package/lib/core/paths.mjs +115 -0
- package/lib/core/proxy.mjs +46 -0
- package/lib/core/watermelon.mjs +9 -0
- package/lib/doc/index.mjs +419 -0
- package/lib/doc/office2text.mjs +234 -0
- package/lib/doc/pdf2text.mjs +133 -0
- package/lib/doc/split.mjs +132 -0
- package/lib/flows/draft-writing.mjs +29 -0
- package/lib/flows/gemini-ask.mjs +185 -0
- package/lib/flows/hatch-portal.mjs +13 -0
- package/lib/flows/legal-ask.mjs +325 -0
- package/lib/flows/openai-agent.mjs +167 -0
- package/lib/flows/opencode-agent.mjs +240 -0
- package/lib/flows/openrouter-ask.mjs +111 -0
- package/lib/flows/review-doc.mjs +18 -0
- package/lib/ocr/index.mjs +6 -0
- package/lib/portal/hatch.mjs +6 -0
- package/lib/portal/index.mjs +6 -0
- package/lib/prompt/prompt-search.mjs +55 -0
- package/lib/prompt/prompt-store.mjs +94 -0
- package/lib/prompt/prompts/zero-framework/coding.md +15 -0
- package/lib/prompt/prompts/zero-framework/search.md +12 -0
- package/lib/prompt/prompts/zero-framework/slice.md +11 -0
- package/lib/search/crawler.mjs +6 -0
- package/lib/search/index.mjs +7 -0
- package/lib/tools/fs-tools.mjs +268 -0
- package/lib/tools/index.mjs +27 -0
- package/lib/writing/generate.mjs +86 -0
- package/lib/writing/generators/nstc-generators.mjs +279 -0
- package/lib/writing/generators/nstc-top5.mjs +554 -0
- package/lib/writing/index.mjs +5 -0
- package/lib/writing/layouts/nstc-layout.mjs +249 -0
- package/lib/writing/renderer.mjs +61 -0
- package/package.json +35 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
// lib/core/paths.mjs — 統一路徑解析
|
|
2
|
+
// botrun-horse 架構:lib 放腦、projects 放料
|
|
3
|
+
|
|
4
|
+
import path from 'path';
|
|
5
|
+
import fs from 'fs';
|
|
6
|
+
import { fileURLToPath } from 'url';
|
|
7
|
+
import { execSync } from 'child_process';
|
|
8
|
+
|
|
9
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));

/**
 * Resolve the package root directory.
 *
 * The root is computed as two directory levels above the directory that
 * contains this module.
 *
 * @returns {string} Absolute path of the package root.
 */
export function root() {
  const twoLevelsUp = path.join('..', '..');
  return path.resolve(__dirname, twoLevelsUp);
}
|
|
15
|
+
|
|
16
|
+
/**
 * Resolve the root directory of a named project.
 *
 * @param {string} name - Project name (a directory under `projects/`).
 * @returns {string} `<root>/projects/<name>`.
 */
export function projectDir(name) {
  const projectsHome = path.join(root(), 'projects');
  return path.join(projectsHome, name);
}
|
|
20
|
+
|
|
21
|
+
/**
 * Resolve the path of a project's `config.json`.
 *
 * @param {string} name - Project name.
 * @returns {string} `<projectDir>/config.json`.
 */
export function projectConfig(name) {
  const baseDir = projectDir(name);
  return path.join(baseDir, 'config.json');
}
|
|
25
|
+
|
|
26
|
+
/**
 * Resolve a project's `data/` directory.
 *
 * @param {string} name - Project name.
 * @returns {string} `<projectDir>/data`.
 */
export function projectData(name) {
  const baseDir = projectDir(name);
  return path.join(baseDir, 'data');
}
|
|
30
|
+
|
|
31
|
+
/**
 * Resolve a project's `db/` directory.
 *
 * @param {string} name - Project name.
 * @returns {string} `<projectDir>/db`.
 */
export function projectDb(name) {
  const baseDir = projectDir(name);
  return path.join(baseDir, 'db');
}
|
|
35
|
+
|
|
36
|
+
/**
 * Resolve a project's `output/` directory.
 *
 * @param {string} name - Project name.
 * @returns {string} `<projectDir>/output`.
 */
export function projectOutput(name) {
  const baseDir = projectDir(name);
  return path.join(baseDir, 'output');
}
|
|
40
|
+
|
|
41
|
+
/**
 * Resolve the path of a project's DAG definition file.
 *
 * @param {string} name - Project name.
 * @returns {string} `<projectData>/dag-definition.json`.
 */
export function dagDefinitionPath(name) {
  const dataDir = projectData(name);
  return path.join(dataDir, 'dag-definition.json');
}
|
|
45
|
+
|
|
46
|
+
/**
 * Resolve the path of a project's DAG state file.
 *
 * @param {string} name - Project name.
 * @returns {string} `<projectData>/dag-state.json`.
 */
export function dagStatePath(name) {
  const dataDir = projectData(name);
  return path.join(dataDir, 'dag-state.json');
}
|
|
50
|
+
|
|
51
|
+
/**
 * Resolve the path of a project's SQLite database file.
 *
 * @param {string} name - Project name.
 * @returns {string} `<projectDb>/docs.db`.
 */
export function dbPath(name) {
  const dbDir = projectDb(name);
  return path.join(dbDir, 'docs.db');
}
|
|
55
|
+
|
|
56
|
+
// Memoized lookup results, keyed by tool name, for the lifetime of the process.
// A Map is used (instead of a plain object) so that names such as
// "constructor" cannot collide with members inherited from Object.prototype.
const _toolCache = new Map();

// Common install locations scanned when `which` does not find the tool.
const SEARCH_DIRS = [
  '/usr/local/bin',
  '/usr/bin',
  '/bin',
  '/opt/homebrew/bin', // macOS Homebrew (Apple Silicon)
  '/usr/local/opt/poppler/bin', // macOS Homebrew Poppler
  '/opt/local/bin', // MacPorts
  '/snap/bin', // Ubuntu Snap
  '/var/lib/flatpak/exports/bin', // Flatpak
];

/**
 * Quote a string so a POSIX shell treats it as one literal word.
 *
 * @param {string} value - Raw text to embed in a shell command line.
 * @returns {string} Single-quoted text with embedded single quotes escaped.
 */
function _shellQuote(value) {
  return `'${value.replace(/'/g, "'\\''")}'`;
}

/**
 * Run the lookup steps for a tool without touching the cache.
 *
 * @param {string} name - Tool name.
 * @returns {string|null} Located executable, or null when every step fails.
 */
function _findToolBin(name) {
  // 1. Environment override (highest priority), e.g. BH_PDFTOTEXT_BIN.
  const envKey = `BH_${name.toUpperCase().replace(/[^A-Z0-9]/g, '_')}_BIN`;
  const override = process.env[envKey];
  if (override) return override;

  // 2. `which` (Unix/Linux/macOS). The name is shell-quoted because it is
  //    interpolated into a command line; unquoted, a name containing `;`,
  //    `$()` or spaces would run arbitrary commands.
  try {
    const found = execSync(`which ${_shellQuote(name)} 2>/dev/null`, { stdio: 'pipe' })
      .toString()
      .trim();
    if (found) return found;
  } catch {
    // `which` is unavailable or exited non-zero; fall through to the scan.
  }

  // 3. Manual scan of common install directories.
  const hit = SEARCH_DIRS
    .map((dir) => path.join(dir, name))
    .find((candidate) => fs.existsSync(candidate));
  return hit ?? null;
}

/**
 * Locate an external tool executable in a cross-platform way.
 *
 * Lookup order:
 *   1. Environment variable `BH_{NAME}_BIN`
 *      (e.g. `BH_PDFTOTEXT_BIN=/usr/local/bin/pdftotext`).
 *   2. The system `which <name>` command.
 *   3. A scan of common install directories (`SEARCH_DIRS`).
 *   4. `fallback`, or the bare tool name (expected to be on PATH).
 *
 * The first result for a given name is cached; later calls return the cached
 * value even if `fallback` or the environment has changed since.
 *
 * @param {string} name - Tool name (e.g. 'pdftotext', 'qpdf', 'libreoffice').
 * @param {string|null} [fallback=null] - Value to return when the tool cannot
 *   be located; defaults to `name`.
 * @returns {string} Path of the executable, or the fallback / tool name.
 */
export function toolBin(name, fallback = null) {
  if (_toolCache.has(name)) return _toolCache.get(name);

  const resolved = _findToolBin(name) ?? (fallback || name);
  _toolCache.set(name, resolved);
  return resolved;
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
// lib/core/proxy.mjs — HTTP/HTTPS Proxy 全局設定
|
|
2
|
+
//
|
|
3
|
+
// 問題背景:
|
|
4
|
+
// 容器環境中 no_proxy 包含 *.googleapis.com,但容器本身沒有直連外網的 DNS,
|
|
5
|
+
// 導致 fetch('https://generativelanguage.googleapis.com/...') 出現 EAI_AGAIN 錯誤。
|
|
6
|
+
// 解法:用 undici ProxyAgent 直接接管全局 fetch dispatcher,繞過 no_proxy 限制,
|
|
7
|
+
// 強制所有請求走 HTTPS_PROXY(代理伺服器有完整 DNS 解析能力)。
|
|
8
|
+
//
|
|
9
|
+
// 使用方式(在 bin/bh.mjs 最頂端 import):
|
|
10
|
+
// import { setupHttpProxy } from '../lib/core/proxy.mjs';
|
|
11
|
+
// setupHttpProxy();
|
|
12
|
+
//
|
|
13
|
+
// 環境變數(優先順序):
|
|
14
|
+
// HTTPS_PROXY > https_proxy > HTTP_PROXY > http_proxy
|
|
15
|
+
|
|
16
|
+
import { setGlobalDispatcher, ProxyAgent } from 'undici';
|
|
17
|
+
|
|
18
|
+
/**
 * Detect a proxy from the environment and install it for all global fetches.
 *
 * The proxy URL is the first non-empty value of HTTPS_PROXY, https_proxy,
 * HTTP_PROXY, http_proxy (in that order). A plain `ProxyAgent` is installed
 * as the global undici dispatcher; `EnvHttpProxyAgent` is deliberately not
 * used because, per the original author's note, it honours `no_proxy`, which
 * in the target container lists hosts that cannot be resolved directly.
 *
 * When BH_DEBUG_PROXY is set, the proxy URL is written to stderr with any
 * credentials portion masked.
 *
 * @returns {boolean} true if a proxy was configured, false if no proxy
 *   environment variable is set.
 */
export function setupHttpProxy() {
  const { env } = process;
  const proxyUrl = [env.HTTPS_PROXY, env.https_proxy, env.HTTP_PROXY, env.http_proxy]
    .find(Boolean);

  if (!proxyUrl) {
    return false;
  }

  setGlobalDispatcher(new ProxyAgent({ uri: proxyUrl }));

  if (env.BH_DEBUG_PROXY) {
    // Mask the userinfo part (it may carry a token) before logging.
    const displayUrl = proxyUrl.replace(/:\/\/[^@]+@/, '://***@');
    process.stderr.write(`[proxy] 已啟用 ProxyAgent: ${displayUrl}\n`);
  }

  return true;
}
|
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
// lib/doc/index.mjs — 通用文件入庫流水線
|
|
2
|
+
//
|
|
3
|
+
// 支援格式:PDF、Office 全系列(.docx/.xlsx/.pptx/.odt/.odp/.ods 等)
|
|
4
|
+
//
|
|
5
|
+
// Pipeline(方案 E 混合最佳化):
|
|
6
|
+
// PDF: pdftotext 直接按頁萃取(不需先 split)→ SQLite FTS5
|
|
7
|
+
// Office: LibreOffice --convert-to txt → 換頁符號分頁 → SQLite FTS5
|
|
8
|
+
// 拆頁 PDF:可選(--split-pdf flag),僅在需要頁面級別 PDF 引證時執行
|
|
9
|
+
//
|
|
10
|
+
// 斷點續作:
|
|
11
|
+
// 每個檔案處理前先查 ingestion_log。
|
|
12
|
+
// status='done' 的檔案預設跳過(除非 --force 重新入庫)。
|
|
13
|
+
//
|
|
14
|
+
// 平行策略(類 GNU parallel -j N):
|
|
15
|
+
// - 多個文件以 concurrency semaphore 控制並發數
|
|
16
|
+
// - 單一文件內部的頁面萃取已在 pdf2text.mjs 內部平行化
|
|
17
|
+
// - SQLite WAL 模式支援並發寫入
|
|
18
|
+
|
|
19
|
+
import fs from 'fs';
|
|
20
|
+
import path from 'path';
|
|
21
|
+
import { splitPdf, getPdfPageCount } from './split.mjs';
|
|
22
|
+
import { extractAllPages } from './pdf2text.mjs';
|
|
23
|
+
import { extractOfficePages, extractTextFilePages, isOfficeFile, isTextFile } from './office2text.mjs';
|
|
24
|
+
import { DocStore } from '../core/db.mjs';
|
|
25
|
+
|
|
26
|
+
// ─────────────────────────────────────────────────────────
|
|
27
|
+
// 核心入庫函式
|
|
28
|
+
// ─────────────────────────────────────────────────────────
|
|
29
|
+
|
|
30
|
+
/**
 * Ingest a single PDF into the SQLite document store.
 *
 * Pipeline: extract per-page text, optionally split into per-page PDFs,
 * then insert the document and page rows.
 *
 * Files already recorded as ingested are skipped unless `opts.force` is set.
 * On failure the error is recorded through `store.logFailed` and rethrown.
 * The store is always closed exactly once.
 *
 * @param {string} filePath - Path of the PDF file.
 * @param {object} [opts]
 * @param {string} [opts.dbPath='./output/pdf_docs.db'] - SQLite DB path.
 * @param {string|null} [opts.splitDir] - Output directory for per-page PDFs
 *   (omit to skip splitting).
 * @param {string|null} [opts.docType] - Document type; overrides the value
 *   parsed from the filename.
 * @param {string|null} [opts.title] - Document title; overrides the value
 *   parsed from the filename.
 * @param {boolean} [opts.force=false] - Re-ingest even if already ingested.
 * @param {number} [opts.splitConcurrency=0] - Passed to `splitPdf` as
 *   `concurrency` (the original author documents the default as "unlimited").
 * @returns {Promise<{ docId: number, pages: number, skipped: boolean }>}
 *   On the skip path, `docId` is -1 and `pages` is 0 if no `documents` row
 *   is found.
 */
export async function ingestOnePdf(filePath, opts = {}) {
  const {
    dbPath = './output/pdf_docs.db',
    splitDir = null,
    docType = null,
    title = null,
    force = false,
    splitConcurrency = 0,
  } = opts;

  const absPath = path.resolve(filePath);
  const filename = path.basename(absPath);

  const store = new DocStore(dbPath);

  try {
    // Inside the try so the connection is closed even if schema setup fails.
    store.initSchema();

    // Resume support: skip files that were already ingested successfully.
    // No explicit close here; the finally block is the single close point.
    if (!force && store.isIngested(absPath)) {
      const doc = store.db
        .prepare('SELECT id, total_pages FROM documents WHERE source_path = ?')
        .get(absPath);
      return { docId: doc?.id ?? -1, pages: doc?.total_pages ?? 0, skipped: true };
    }

    const fileSize = fs.statSync(absPath).size;
    const parsed = parseFilename(filename);
    const finalDocType = docType || parsed.type;
    const finalTitle = title || parsed.title;

    // 1. Extract the text of every page (no prior split needed).
    const textResults = await extractAllPages(absPath);
    const totalPages = textResults.length;

    // 2. Optional: split into one PDF per page when splitDir is given.
    let splitMap = {};
    if (splitDir) {
      // Strip the extension case-insensitively so "X.PDF" and "x.pdf" both
      // get a suffix-free split directory.
      const ext = path.extname(absPath);
      const baseName = path.basename(absPath, ext.toLowerCase() === '.pdf' ? ext : '');
      const pageSplitDir = path.join(splitDir, baseName);
      const splitResults = await splitPdf(absPath, pageSplitDir, { concurrency: splitConcurrency });
      splitMap = Object.fromEntries(splitResults.map((s) => [s.page, s.path]));
    }

    // 3. Persist the document and its pages.
    const docId = store.insertDocument({
      sourcePath: absPath,
      filename,
      docType: finalDocType,
      title: finalTitle,
      totalPages,
      fileSize,
    });

    for (const { page, text } of textResults) {
      store.insertPage({
        docId,
        pageNumber: page,
        pageText: text,
        sourcePath: absPath,
        sourceFile: filename,
        sourcePage: page,
        splitPdf: splitMap[page] || null,
      });
    }

    store.logIngested(absPath, docId, totalPages);
    return { docId, pages: totalPages, skipped: false };
  } catch (err) {
    try {
      store.logFailed(absPath, err.message);
    } catch {
      // Recording the failure is best-effort; the original error below is
      // the one the caller needs to see.
    }
    throw err;
  } finally {
    store.close();
  }
}
|
|
118
|
+
|
|
119
|
+
/**
 * Ingest a single Office document into the SQLite document store.
 *
 * Pipeline: extract per-page text via `extractOfficePages`, then insert the
 * document and page rows.
 *
 * Files already recorded as ingested are skipped unless `opts.force` is set.
 * On failure the error is recorded through `store.logFailed` and rethrown.
 * The store is always closed exactly once.
 *
 * @param {string} filePath - Path of the Office document.
 * @param {object} [opts]
 * @param {string} [opts.dbPath='./output/pdf_docs.db'] - SQLite DB path.
 * @param {string|null} [opts.docType] - Document type; overrides the value
 *   parsed from the filename.
 * @param {string|null} [opts.title] - Document title; overrides the value
 *   parsed from the filename.
 * @param {boolean} [opts.force=false] - Re-ingest even if already ingested.
 * @returns {Promise<{ docId: number, pages: number, skipped: boolean }>}
 *   On the skip path, `docId` is -1 and `pages` is 0 if no `documents` row
 *   is found.
 */
export async function ingestOneOffice(filePath, opts = {}) {
  const {
    dbPath = './output/pdf_docs.db',
    docType = null,
    title = null,
    force = false,
  } = opts;

  const absPath = path.resolve(filePath);
  const filename = path.basename(absPath);

  const store = new DocStore(dbPath);

  try {
    // Inside the try so the connection is closed even if schema setup fails.
    store.initSchema();

    // Resume support: skip files that were already ingested successfully.
    // No explicit close here; the finally block is the single close point.
    if (!force && store.isIngested(absPath)) {
      const doc = store.db
        .prepare('SELECT id, total_pages FROM documents WHERE source_path = ?')
        .get(absPath);
      return { docId: doc?.id ?? -1, pages: doc?.total_pages ?? 0, skipped: true };
    }

    const fileSize = fs.statSync(absPath).size;
    const parsed = parseFilename(filename);
    const finalDocType = docType || parsed.type;
    const finalTitle = title || parsed.title;

    // 1. Extract the text of every page.
    const textResults = await extractOfficePages(absPath);
    const totalPages = textResults.length;

    // 2. Persist the document and its pages.
    const docId = store.insertDocument({
      sourcePath: absPath,
      filename,
      docType: finalDocType,
      title: finalTitle,
      totalPages,
      fileSize,
    });

    for (const { page, text } of textResults) {
      store.insertPage({
        docId,
        pageNumber: page,
        pageText: text,
        sourcePath: absPath,
        sourceFile: filename,
        sourcePage: page,
        splitPdf: null,
      });
    }

    store.logIngested(absPath, docId, totalPages);
    return { docId, pages: totalPages, skipped: false };
  } catch (err) {
    try {
      store.logFailed(absPath, err.message);
    } catch {
      // Recording the failure is best-effort; the original error below is
      // the one the caller needs to see.
    }
    throw err;
  } finally {
    store.close();
  }
}
|
|
189
|
+
|
|
190
|
+
/**
 * Ingest a single plain-text file (.txt / .md / .rst, etc.) into the SQLite
 * document store.
 *
 * Files already recorded as ingested are skipped unless `opts.force` is set.
 * On failure the error is recorded through `store.logFailed` and rethrown.
 * The store is always closed exactly once.
 *
 * @param {string} filePath - Path of the text file.
 * @param {object} [opts]
 * @param {string} [opts.dbPath='./output/pdf_docs.db'] - SQLite DB path.
 * @param {string|null} [opts.docType] - Document type; falls back to the type
 *   parsed from the filename, then to the literal '純文字'.
 * @param {string|null} [opts.title] - Document title; overrides the value
 *   parsed from the filename.
 * @param {boolean} [opts.force=false] - Re-ingest even if already ingested.
 * @returns {Promise<{ docId: number, pages: number, skipped: boolean }>}
 *   On the skip path, `docId` is -1 and `pages` is 0 if no `documents` row
 *   is found.
 */
export async function ingestOneText(filePath, opts = {}) {
  const {
    dbPath = './output/pdf_docs.db',
    docType = null,
    title = null,
    force = false,
  } = opts;

  const absPath = path.resolve(filePath);
  const filename = path.basename(absPath);

  const store = new DocStore(dbPath);

  try {
    // Inside the try so the connection is closed even if schema setup fails.
    store.initSchema();

    // Resume support: skip files that were already ingested successfully.
    // No explicit close here; the finally block is the single close point.
    if (!force && store.isIngested(absPath)) {
      const doc = store.db
        .prepare('SELECT id, total_pages FROM documents WHERE source_path = ?')
        .get(absPath);
      return { docId: doc?.id ?? -1, pages: doc?.total_pages ?? 0, skipped: true };
    }

    const fileSize = fs.statSync(absPath).size;
    const parsed = parseFilename(filename);
    const finalDocType = docType || parsed.type || '純文字';
    const finalTitle = title || parsed.title;

    const textResults = await extractTextFilePages(absPath);
    const totalPages = textResults.length;

    const docId = store.insertDocument({
      sourcePath: absPath,
      filename,
      docType: finalDocType,
      title: finalTitle,
      totalPages,
      fileSize,
    });

    for (const { page, text } of textResults) {
      store.insertPage({
        docId,
        pageNumber: page,
        pageText: text,
        sourcePath: absPath,
        sourceFile: filename,
        sourcePage: page,
        splitPdf: null,
      });
    }

    store.logIngested(absPath, docId, totalPages);
    return { docId, pages: totalPages, skipped: false };
  } catch (err) {
    try {
      store.logFailed(absPath, err.message);
    } catch {
      // Recording the failure is best-effort; the original error below is
      // the one the caller needs to see.
    }
    throw err;
  } finally {
    store.close();
  }
}
|
|
256
|
+
|
|
257
|
+
/**
 * Ingest a single file, choosing the pipeline from the file type.
 *
 * Text files go to `ingestOneText`, Office files to `ingestOneOffice`, and
 * everything else is treated as a PDF and goes to `ingestOnePdf`.
 *
 * @param {string} filePath - File path (.pdf / .docx / .xlsx / .txt / .md, etc.).
 * @param {object} [opts] - Options forwarded unchanged to the chosen pipeline.
 * @returns {Promise<{ docId: number, pages: number, skipped: boolean }>}
 */
export async function ingestOne(filePath, opts = {}) {
  let pipeline = ingestOnePdf;
  if (isTextFile(filePath)) {
    pipeline = ingestOneText;
  } else if (isOfficeFile(filePath)) {
    pipeline = ingestOneOffice;
  }
  return pipeline(filePath, opts);
}
|
|
273
|
+
|
|
274
|
+
/**
 * Ingest many files with bounded concurrency (similar to GNU parallel -j N).
 *
 * Up to `concurrency` files are processed at once. A failure of one file is
 * recorded and reported on stderr but does not stop the batch. One status
 * line per file is written: SKIP/OK to stdout, FAIL to stderr.
 *
 * @param {string[]} filePaths - Files to ingest.
 * @param {object} [opts] - Options forwarded to `ingestOne`.
 * @param {number} [concurrency=5] - Maximum number of files in flight.
 *   Values below 1 (or NaN) are treated as 1 so the batch always makes
 *   progress.
 * @returns {Promise<{ ok: number, fail: number, skip: number, results: object[] }>}
 *   Counters plus one result entry per file, in completion order.
 */
export async function ingestBatch(filePaths, opts = {}, concurrency = 5) {
  const results = [];
  let ok = 0;
  let fail = 0;
  let skip = 0;

  // A limit of 0, a negative number or NaN would start no job at all and
  // leave the batch pending forever, so clamp to at least one worker.
  const limit = concurrency >= 1 ? Math.ceil(concurrency) : 1;

  // Shared cursor: each worker claims the next unprocessed file. The claim is
  // synchronous, so no two workers can take the same index.
  let nextIndex = 0;

  async function worker() {
    while (nextIndex < filePaths.length) {
      const filePath = filePaths[nextIndex++];

      let outcome;
      try {
        outcome = await ingestOne(filePath, opts);
      } catch (err) {
        fail++;
        results.push({ path: filePath, error: err.message, status: 'fail' });
        process.stderr.write(`FAIL\t${path.basename(filePath)}\t${err.message}\n`);
        continue;
      }

      if (outcome.skipped) {
        skip++;
        results.push({ path: filePath, ...outcome, status: 'skip' });
        process.stdout.write(`SKIP\t${path.basename(filePath)}\t(已入庫)\n`);
      } else {
        ok++;
        results.push({ path: filePath, ...outcome, status: 'ok' });
        process.stdout.write(`OK\t${path.basename(filePath)}\t${outcome.pages}頁\n`);
      }
    }
  }

  const workerCount = Math.min(limit, filePaths.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  return { ok, fail, skip, results };
}
|
|
329
|
+
|
|
330
|
+
/**
 * Scan one or more directories and ingest every supported file found.
 *
 * Scanning is recursive only when `recursive` is true. Paths are
 * de-duplicated (so overlapping or repeated directories do not ingest the
 * same file twice concurrently) and sorted so the processing order is
 * predictable.
 *
 * @param {string|string[]} dirOrDirs - A directory path or an array of them.
 * @param {object} [opts] - Options forwarded to `ingestBatch` / `ingestOne`.
 * @param {number} [concurrency=5] - Maximum number of files in flight.
 * @param {boolean} [recursive=false] - Whether to descend into subdirectories.
 * @returns {Promise<{ ok: number, fail: number, skip: number, results: object[] }>}
 */
export async function ingestDir(dirOrDirs, opts = {}, concurrency = 5, recursive = false) {
  const dirs = Array.isArray(dirOrDirs) ? dirOrDirs : [dirOrDirs];

  // A Set removes duplicates; flatMap avoids spreading a potentially huge
  // array into push() arguments.
  const uniqueFiles = new Set(
    dirs.flatMap((dir) => scanDir(path.resolve(dir), recursive)),
  );

  // Sort by path so runs are reproducible and easier to debug.
  const allFiles = [...uniqueFiles].sort();

  return ingestBatch(allFiles, opts, concurrency);
}
|
|
356
|
+
|
|
357
|
+
// ─────────────────────────────────────────────────────────
|
|
358
|
+
// 工具函式
|
|
359
|
+
// ─────────────────────────────────────────────────────────
|
|
360
|
+
|
|
361
|
+
/**
 * Collect the paths of all supported documents in a directory.
 *
 * A file is supported when its extension is `.pdf` (case-insensitive) or
 * when `isOfficeFile` / `isTextFile` accepts its path. Entries that are
 * neither regular files nor directories are ignored.
 *
 * @param {string} dir - Directory to scan.
 * @param {boolean} [recursive=false] - Whether to descend into subdirectories.
 * @returns {string[]} Matching file paths; empty if `dir` does not exist.
 */
function scanDir(dir, recursive = false) {
  if (!fs.existsSync(dir)) {
    return [];
  }

  return fs.readdirSync(dir, { withFileTypes: true }).flatMap((entry) => {
    const fullPath = path.join(dir, entry.name);

    if (recursive && entry.isDirectory()) {
      return scanDir(fullPath, true);
    }
    if (!entry.isFile()) {
      return [];
    }

    const isPdf = path.extname(entry.name).toLowerCase() === '.pdf';
    const supported = isPdf || isOfficeFile(fullPath) || isTextFile(fullPath);
    return supported ? [fullPath] : [];
  });
}
|
|
388
|
+
|
|
389
|
+
/**
 * Best-effort parse of document metadata from a filename.
 *
 * Recognised naming conventions, in priority order:
 *   1. "{number}_{type}_{title}.ext" -> { id, type, title }
 *      (the title keeps any further underscores)
 *   2. "{number}_{title}.ext"        -> { id, type: null, title }
 *   3. anything else                 -> { id: null, type: null,
 *                                         title: filename without extension }
 *
 * Never throws for a string input; unrecognised names fall back to form 3.
 *
 * @param {string} filename - File name including its extension (no directory).
 * @returns {{ id: string|null, type: string|null, title: string }}
 */
export function parseFilename(filename) {
  const ext = path.extname(filename);
  // Only strip when there is an extension: slice(0, -0) is slice(0, 0) and
  // would turn an extension-less name such as "README" into ''.
  const base = ext ? filename.slice(0, -ext.length) : filename;

  const [first, ...rest] = base.split('_');
  const hasNumericId = /^\d+$/.test(first);

  // Form 1: {number}_{type}_{title}, at least three segments.
  if (hasNumericId && rest.length >= 2) {
    return { id: first, type: rest[0], title: rest.slice(1).join('_') };
  }

  // Form 2: {number}_{title}, exactly two segments.
  if (hasNumericId && rest.length === 1) {
    return { id: first, type: null, title: rest[0] };
  }

  // Form 3: any other naming.
  return { id: null, type: null, title: base };
}
|