@openclaw-china/shared 0.1.21 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@openclaw-china/shared",
3
- "version": "0.1.21",
3
+ "version": "0.1.22",
4
4
  "type": "module",
5
5
  "exports": {
6
6
  ".": "./src/index.ts"
package/src/index.ts CHANGED
@@ -6,3 +6,4 @@ export * from "./policy/index.js";
6
6
  export * from "./http/index.js";
7
7
  export * from "./types/common.js";
8
8
  export * from "./file/index.js";
9
+ export * from "./media/index.js";
@@ -0,0 +1,57 @@
1
+ /**
2
+ * 媒体处理模块
3
+ *
4
+ * 提供统一的媒体解析、路径处理和文件读取功能
5
+ *
6
+ * @module @openclaw-china/shared/media
7
+ */
8
+
9
+ // 媒体解析
10
+ export {
11
+ // 类型
12
+ type MediaType,
13
+ type ExtractedMedia,
14
+ type MediaParseResult,
15
+ type MediaParseOptions,
16
+ // 常量
17
+ IMAGE_EXTENSIONS,
18
+ AUDIO_EXTENSIONS,
19
+ VIDEO_EXTENSIONS,
20
+ NON_IMAGE_EXTENSIONS,
21
+ // 路径处理函数
22
+ isHttpUrl,
23
+ isFileUrl,
24
+ isLocalReference,
25
+ normalizeLocalPath,
26
+ stripTitleFromUrl,
27
+ getExtension,
28
+ isImagePath,
29
+ isNonImageFilePath,
30
+ detectMediaType,
31
+ // 媒体提取函数
32
+ extractMediaFromText,
33
+ extractImagesFromText,
34
+ extractFilesFromText,
35
+ } from "./media-parser.js";
36
+
37
+ // 媒体 IO
38
+ export {
39
+ // 类型
40
+ type MediaReadResult,
41
+ type MediaReadOptions,
42
+ type PathSecurityOptions,
43
+ // 错误类
44
+ FileSizeLimitError,
45
+ MediaTimeoutError,
46
+ PathSecurityError,
47
+ // 路径安全
48
+ validatePathSecurity,
49
+ getDefaultAllowedPrefixes,
50
+ // MIME 类型
51
+ getMimeType,
52
+ // 媒体读取函数
53
+ fetchMediaFromUrl,
54
+ readMediaFromLocal,
55
+ readMedia,
56
+ readMediaBatch,
57
+ } from "./media-io.js";
@@ -0,0 +1,423 @@
1
+ /**
2
+ * 媒体 IO 模块
3
+ *
4
+ * 提供统一的媒体文件下载和读取功能
5
+ *
6
+ * @module @openclaw-china/shared/media
7
+ */
8
+
9
+ import * as fs from "fs";
10
+ import * as fsPromises from "fs/promises";
11
+ import * as path from "path";
12
+ import * as os from "os";
13
+ import { isHttpUrl, normalizeLocalPath, getExtension } from "./media-parser.js";
14
+
15
+ // ============================================================================
16
+ // 类型定义
17
+ // ============================================================================
18
+
19
/**
 * Result of reading one media item.
 */
export interface MediaReadResult {
  /** File contents as a Buffer. */
  buffer: Buffer;
  /** File name. */
  fileName: string;
  /** File size in bytes. */
  size: number;
  /** MIME type, if it could be detected. */
  mimeType?: string;
}
32
+
33
/**
 * Options accepted by the media read functions.
 */
export interface MediaReadOptions {
  /** Timeout in milliseconds; defaults to 30000. */
  timeout?: number;
  /** Maximum file size in bytes; defaults to 100 MB. */
  maxSize?: number;
  /** Custom fetch implementation (for dependency injection). */
  fetch?: typeof globalThis.fetch;
}
44
+
45
/**
 * Options for the path security check.
 */
export interface PathSecurityOptions {
  /** Whitelist of allowed path prefixes. */
  allowedPrefixes?: string[];
  /** Maximum path length; defaults to 4096. */
  maxPathLength?: number;
  /** Whether path traversal is rejected; defaults to true. */
  preventTraversal?: boolean;
}
56
+
57
+ // ============================================================================
58
+ // 常量定义
59
+ // ============================================================================
60
+
61
/** Default timeout, in milliseconds. */
const DEFAULT_TIMEOUT = 30_000;

/** Default maximum file size (100 MB), in bytes. */
const DEFAULT_MAX_SIZE = 100 * 1024 ** 2;

/** Default maximum path length. */
const DEFAULT_MAX_PATH_LENGTH = 4096;

/** Default allowed path prefixes (Unix). */
const DEFAULT_UNIX_PREFIXES = ["/tmp", "/var/tmp", "/private/tmp", "/Users", "/home", "/root"];
79
+
80
/** Maps a lowercase file extension (without the dot) to its MIME type. */
const EXT_TO_MIME: Record<string, string> = Object.fromEntries([
  // Images
  ["jpg", "image/jpeg"],
  ["jpeg", "image/jpeg"],
  ["png", "image/png"],
  ["gif", "image/gif"],
  ["webp", "image/webp"],
  ["bmp", "image/bmp"],
  ["svg", "image/svg+xml"],
  ["ico", "image/x-icon"],
  // Audio
  ["mp3", "audio/mpeg"],
  ["wav", "audio/wav"],
  ["ogg", "audio/ogg"],
  ["m4a", "audio/x-m4a"],
  ["amr", "audio/amr"],
  // Video
  ["mp4", "video/mp4"],
  ["mov", "video/quicktime"],
  ["avi", "video/x-msvideo"],
  ["mkv", "video/x-matroska"],
  ["webm", "video/webm"],
  // Documents
  ["pdf", "application/pdf"],
  ["doc", "application/msword"],
  ["docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
  ["xls", "application/vnd.ms-excel"],
  ["xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"],
  ["ppt", "application/vnd.ms-powerpoint"],
  ["pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"],
  ["txt", "text/plain"],
  ["csv", "text/csv"],
  // Archives
  ["zip", "application/zip"],
  ["rar", "application/x-rar-compressed"],
  ["7z", "application/x-7z-compressed"],
  ["tar", "application/x-tar"],
  ["gz", "application/gzip"],
]);
120
+
121
+ // ============================================================================
122
+ // 错误类
123
+ // ============================================================================
124
+
125
/**
 * Raised when a file is larger than the configured size limit.
 */
export class FileSizeLimitError extends Error {
  /** Actual file size, in bytes. */
  public readonly actualSize: number;
  /** Size limit, in bytes. */
  public readonly limitSize: number;

  constructor(actualSize: number, limitSize: number) {
    const message = `File size ${actualSize} bytes exceeds limit ${limitSize} bytes`;
    super(message);
    this.name = "FileSizeLimitError";
    this.actualSize = actualSize;
    this.limitSize = limitSize;

    // Only invoked when the runtime provides Error.captureStackTrace.
    Error.captureStackTrace?.(this, FileSizeLimitError);
  }
}
145
+
146
/**
 * Raised when a download times out.
 */
export class MediaTimeoutError extends Error {
  /** Timeout that elapsed, in milliseconds. */
  public readonly timeoutMs: number;

  constructor(timeoutMs: number) {
    const message = `Operation timed out after ${timeoutMs}ms`;
    super(message);
    this.name = "MediaTimeoutError";
    this.timeoutMs = timeoutMs;

    // Only invoked when the runtime provides Error.captureStackTrace.
    Error.captureStackTrace?.(this, MediaTimeoutError);
  }
}
163
+
164
/**
 * Raised when a path fails the security check.
 */
export class PathSecurityError extends Error {
  /** The path that was rejected. */
  public readonly unsafePath: string;
  /** Why the path was rejected. */
  public readonly reason: string;

  constructor(unsafePath: string, reason: string) {
    const message = `Path security violation: ${reason} - ${unsafePath}`;
    super(message);
    this.name = "PathSecurityError";
    this.unsafePath = unsafePath;
    this.reason = reason;

    // Only invoked when the runtime provides Error.captureStackTrace.
    Error.captureStackTrace?.(this, PathSecurityError);
  }
}
184
+
185
+ // ============================================================================
186
+ // 路径安全检查
187
+ // ============================================================================
188
+
189
/**
 * Checks whether a path is safe to read.
 *
 * Three checks run in order: path length, parent-directory traversal, and
 * (only when `allowedPrefixes` is non-empty) membership in the prefix whitelist.
 *
 * @param filePath - Path to check
 * @param options - Security check options
 * @throws PathSecurityError if any check fails
 */
export function validatePathSecurity(
  filePath: string,
  options: PathSecurityOptions = {}
): void {
  const {
    allowedPrefixes,
    maxPathLength = DEFAULT_MAX_PATH_LENGTH,
    preventTraversal = true,
  } = options;

  // Path length
  if (filePath.length > maxPathLength) {
    throw new PathSecurityError(
      filePath,
      `Path length ${filePath.length} exceeds maximum ${maxPathLength}`
    );
  }

  // Path traversal. Inspect the raw segments rather than the normalized path:
  // path.normalize() collapses ".." inside absolute paths, which would hide
  // "/tmp/../etc/passwd", and a substring test would reject names like "a..b.txt".
  if (preventTraversal) {
    const hasParentSegment = filePath.split(/[\\/]+/).includes("..");
    if (hasParentSegment) {
      throw new PathSecurityError(filePath, "Path traversal detected");
    }
  }

  // Prefix whitelist. A prefix matches only on a directory boundary, so the
  // prefix "/tmp" allows "/tmp" and "/tmp/x" but not "/tmpevil/x".
  if (allowedPrefixes && allowedPrefixes.length > 0) {
    const normalizedPath = path.normalize(filePath);
    const isAllowed = allowedPrefixes.some((prefix) => {
      const base = path.normalize(prefix);
      if (normalizedPath === base) return true;
      const baseWithSep = base.endsWith(path.sep) ? base : base + path.sep;
      return normalizedPath.startsWith(baseWithSep);
    });
    if (!isAllowed) {
      throw new PathSecurityError(
        filePath,
        `Path not in allowed prefixes: ${allowedPrefixes.join(", ")}`
      );
    }
  }
}
236
+
237
/**
 * Returns the default path whitelist for the current platform.
 *
 * On win32 this is the OS temp directory and the user's home directory;
 * elsewhere it is a copy of the built-in Unix prefix list.
 *
 * @returns A fresh array the caller may modify freely
 */
export function getDefaultAllowedPrefixes(): string[] {
  if (process.platform === "win32") {
    return [os.tmpdir(), os.homedir()];
  }
  // Copy so a caller mutating the result cannot alter the shared defaults.
  return [...DEFAULT_UNIX_PREFIXES];
}
249
+
250
+ // ============================================================================
251
+ // MIME 类型检测
252
+ // ============================================================================
253
+
254
/**
 * Looks up the MIME type for a file based on its extension.
 *
 * @param filePath - File path or name
 * @returns The MIME type, or undefined when the extension is not in the table
 */
export function getMimeType(filePath: string): string | undefined {
  const ext = getExtension(filePath);
  // Own-property check: a bare index lookup would return inherited members
  // (e.g. a function for the extension "constructor") instead of undefined.
  return Object.prototype.hasOwnProperty.call(EXT_TO_MIME, ext)
    ? EXT_TO_MIME[ext]
    : undefined;
}
261
+
262
+ // ============================================================================
263
+ // 媒体读取函数
264
+ // ============================================================================
265
+
266
/**
 * Downloads media from an HTTP URL.
 *
 * The size limit is enforced twice: against the declared Content-Length (when
 * present) and against the bytes actually received while the body is read.
 *
 * @param url - Media URL
 * @param options - Read options (timeout, maxSize, custom fetch)
 * @returns The downloaded media
 * @throws MediaTimeoutError when the request is aborted by the timeout
 * @throws FileSizeLimitError when the declared or received size exceeds maxSize
 * @throws Error with the status and response text for non-OK responses
 */
export async function fetchMediaFromUrl(
  url: string,
  options: MediaReadOptions = {}
): Promise<MediaReadResult> {
  const {
    timeout = DEFAULT_TIMEOUT,
    maxSize = DEFAULT_MAX_SIZE,
    fetch: customFetch = globalThis.fetch,
  } = options;

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    const response = await customFetch(url, { signal: controller.signal });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`HTTP ${response.status}: ${errorText}`);
    }

    // Reject early when the server declares an oversized body.
    const contentLength = response.headers.get("content-length");
    if (contentLength) {
      const declaredSize = parseInt(contentLength, 10);
      if (declaredSize > maxSize) {
        throw new FileSizeLimitError(declaredSize, maxSize);
      }
    }

    const buffer = await readBodyWithLimit(response, maxSize);

    // Derive the file name from the URL path; fall back to "file".
    let fileName = "file";
    try {
      const urlPath = new URL(url).pathname;
      fileName = path.basename(urlPath) || "file";
    } catch {
      // Unparseable URL: keep the fallback name.
    }

    // Prefer the Content-Type header (parameters stripped), else guess from the name.
    const mimeType =
      response.headers.get("content-type")?.split(";")[0].trim() ||
      getMimeType(fileName);

    return {
      buffer,
      fileName,
      size: buffer.length,
      mimeType,
    };
  } catch (error) {
    if (error instanceof Error && error.name === "AbortError") {
      throw new MediaTimeoutError(timeout);
    }
    // Abort so a body we stopped reading does not keep the request alive.
    controller.abort();
    throw error;
  } finally {
    clearTimeout(timeoutId);
  }
}

/**
 * Reads a response body into a Buffer, failing as soon as more than `maxSize`
 * bytes have arrived. When the body is streamed, the FileSizeLimitError
 * reports the bytes received so far, not the full body size.
 */
async function readBodyWithLimit(response: Response, maxSize: number): Promise<Buffer> {
  const reader = response.body?.getReader?.();

  if (!reader) {
    // No readable stream exposed (e.g. an injected fetch): buffer, then check.
    const whole = Buffer.from(await response.arrayBuffer());
    if (whole.length > maxSize) {
      throw new FileSizeLimitError(whole.length, maxSize);
    }
    return whole;
  }

  const chunks: Uint8Array[] = [];
  let received = 0;
  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    if (!value) continue;
    received += value.byteLength;
    if (received > maxSize) {
      // Best-effort cancel; the size error is what the caller needs to see.
      await reader.cancel().catch(() => undefined);
      throw new FileSizeLimitError(received, maxSize);
    }
    chunks.push(value);
  }
  return Buffer.concat(chunks, received);
}
340
+
341
/**
 * Reads media from a local path.
 *
 * @param filePath - Local file path (file://, MEDIA: and attachment:// prefixes are accepted)
 * @param options - Read options plus path security options
 * @returns The file contents and metadata
 * @throws PathSecurityError when the normalized path fails the security check
 * @throws FileSizeLimitError when the file is larger than maxSize
 * @throws Error when the file does not exist or is not a regular file
 */
export async function readMediaFromLocal(
  filePath: string,
  options: MediaReadOptions & PathSecurityOptions = {}
): Promise<MediaReadResult> {
  const { maxSize = DEFAULT_MAX_SIZE } = options;

  // Normalize, then validate the normalized form.
  const localPath = normalizeLocalPath(filePath);
  validatePathSecurity(localPath, options);

  // One async stat replaces the blocking existsSync + stat pair.
  let stats: fs.Stats;
  try {
    stats = await fsPromises.stat(localPath);
  } catch (error) {
    const code = error instanceof Error ? (error as NodeJS.ErrnoException).code : undefined;
    if (code === "ENOENT" || code === "ENOTDIR") {
      throw new Error(`File not found: ${localPath}`);
    }
    throw error;
  }

  // Directories and special files do not report a meaningful size, so the
  // size check below would not protect against reading them.
  if (!stats.isFile()) {
    throw new Error(`Not a regular file: ${localPath}`);
  }

  if (stats.size > maxSize) {
    throw new FileSizeLimitError(stats.size, maxSize);
  }

  const buffer = await fsPromises.readFile(localPath);
  // Re-check in case the file grew between stat and read.
  if (buffer.length > maxSize) {
    throw new FileSizeLimitError(buffer.length, maxSize);
  }

  return {
    buffer,
    fileName: path.basename(localPath),
    size: buffer.length,
    mimeType: getMimeType(localPath),
  };
}
383
+
384
/**
 * Unified media reader: HTTP(S) sources are downloaded, everything else is
 * read as a local path.
 *
 * @param source - Media source (URL or local path)
 * @param options - Read options
 * @returns The media read result
 */
export async function readMedia(
  source: string,
  options: MediaReadOptions & PathSecurityOptions = {}
): Promise<MediaReadResult> {
  return isHttpUrl(source)
    ? fetchMediaFromUrl(source, options)
    : readMediaFromLocal(source, options);
}
401
+
402
/**
 * Reads several media sources concurrently.
 *
 * A failing source does not reject the batch: each entry carries either
 * `result` or `error`, in the same order as `sources`.
 *
 * @param sources - Media sources (URLs or local paths)
 * @param options - Read options applied to every source
 * @returns One entry per source, holding the read result or the error
 */
export async function readMediaBatch(
  sources: string[],
  options: MediaReadOptions & PathSecurityOptions = {}
): Promise<Array<{ source: string; result?: MediaReadResult; error?: Error }>> {
  const settled = await Promise.allSettled(
    sources.map((source) => readMedia(source, options))
  );

  return settled.map((outcome, index) => {
    const source = sources[index];
    if (outcome.status === "fulfilled") {
      return { source, result: outcome.value };
    }
    // A rejection reason is not guaranteed to be an Error (an injected fetch
    // may reject with anything), so wrap other values instead of casting.
    const reason: unknown = outcome.reason;
    return {
      source,
      error: reason instanceof Error ? reason : new Error(String(reason)),
    };
  });
}
@@ -0,0 +1,722 @@
1
+ /**
2
+ * 媒体解析模块
3
+ *
4
+ * 提供统一的媒体路径提取、解析和规范化功能
5
+ * 支持 Markdown 图片、HTML img 标签、MEDIA: 标记、本地路径等多种格式
6
+ *
7
+ * @module @openclaw-china/shared/media
8
+ */
9
+
10
+ import * as fs from "fs";
11
+ import * as os from "os";
12
+ import * as path from "path";
13
+ import { fileURLToPath } from "url";
14
+
15
+ // ============================================================================
16
+ // 类型定义
17
+ // ============================================================================
18
+
19
/**
 * Kind of media.
 */
export type MediaType = "image" | "audio" | "video" | "file";

/**
 * Syntax a media reference was found in.
 */
export type MediaSourceKind = "markdown" | "markdown-linked" | "html" | "bare";
28
+
29
/**
 * One media item extracted from text.
 */
export interface ExtractedMedia {
  /** Original path or URL. */
  source: string;
  /** Normalized local path (only set for local files). */
  localPath?: string;
  /** Media type. */
  type: MediaType;
  /** Whether the item is a local file. */
  isLocal: boolean;
  /** Whether the item is an HTTP URL. */
  isHttp: boolean;
  /** File name. */
  fileName?: string;
  /** Where the reference came from: markdown / html / bare. */
  sourceKind?: MediaSourceKind;
}
48
+
49
/**
 * Result of parsing text for media.
 */
export interface MediaParseResult {
  /** Cleaned text (media markers removed). */
  text: string;
  /** Extracted images. */
  images: ExtractedMedia[];
  /** Extracted non-image files. */
  files: ExtractedMedia[];
  /** All media (images followed by files). */
  all: ExtractedMedia[];
}
62
+
63
/**
 * Options controlling media parsing.
 */
export interface MediaParseOptions {
  /** Remove media markers from the text; defaults to true. */
  removeFromText?: boolean;
  /** Check that local files exist; defaults to false. */
  checkExists?: boolean;
  /** File existence check function (for dependency injection). */
  existsSync?: (path: string) => boolean;
  /** Parse line-leading MEDIA: directives; defaults to false. */
  parseMediaLines?: boolean;
  /** Parse Markdown images; defaults to true. */
  parseMarkdownImages?: boolean;
  /** Parse HTML img tags; defaults to true. */
  parseHtmlImages?: boolean;
  /** Parse bare local paths; defaults to true. */
  parseBarePaths?: boolean;
  /** Parse files referenced by Markdown links; defaults to true. */
  parseMarkdownLinks?: boolean;
}
84
+
85
+ // ============================================================================
86
+ // 常量定义
87
+ // ============================================================================
88
+
89
/**
 * Image file extensions (lowercase, without the dot).
 */
export const IMAGE_EXTENSIONS = new Set([
  "png",
  "jpg",
  "jpeg",
  "gif",
  "webp",
  "bmp",
  "tiff",
  "tif",
  "heic",
  "heif",
  "svg",
  "ico",
]);

/**
 * Audio file extensions.
 */
export const AUDIO_EXTENSIONS = new Set([
  "mp3",
  "wav",
  "ogg",
  "m4a",
  "amr",
  "flac",
  "aac",
  "wma",
]);

/**
 * Video file extensions.
 */
export const VIDEO_EXTENSIONS = new Set([
  "mp4",
  "mov",
  "avi",
  "mkv",
  "webm",
  "flv",
  "wmv",
  "m4v",
]);

/**
 * Non-image file extensions (used for file extraction).
 */
export const NON_IMAGE_EXTENSIONS = new Set([
  // Documents
  "pdf",
  "doc",
  "docx",
  "xls",
  "xlsx",
  "csv",
  "ppt",
  "pptx",
  "txt",
  "md",
  "rtf",
  "odt",
  "ods",
  // Archives
  "zip",
  "rar",
  "7z",
  "tar",
  "gz",
  "tgz",
  "bz2",
  // Audio
  ...AUDIO_EXTENSIONS,
  // Video
  ...VIDEO_EXTENSIONS,
  // Data
  "json",
  "xml",
  "yaml",
  "yml",
]);

// ============================================================================
// Regular expressions
// ============================================================================

/**
 * Joins extensions into a regex alternation, longest first.
 *
 * Regex alternation takes the first branch that lets the match succeed, so
 * "doc" listed before "docx" would capture "/tmp/a.docx" as "/tmp/a.doc".
 */
function buildExtensionAlternation(extensions: Iterable<string>): string {
  return Array.from(extensions)
    .sort((a, b) => b.length - a.length)
    .join("|");
}

/**
 * Markdown image syntax: ![alt](path)
 */
const MARKDOWN_IMAGE_RE =
  /!\[([^\]]*)\]\(([^)]+)\)/g;

/**
 * Image wrapped in a Markdown link: [![alt](img)](link)
 */
const MARKDOWN_LINKED_IMAGE_RE =
  /\[!\[([^\]]*)\]\(([^)]+)\)\]\(([^)]+)\)/g;

/**
 * HTML img tag; the src value may be double-quoted, single-quoted or unquoted.
 */
const HTML_IMAGE_RE =
  /<img\b[^>]*\bsrc\s*=\s*(?:"([^"]+)"|'([^']+)'|([^\s>]+))[^>]*>/gi;

/**
 * Markdown link syntax: [label](path)
 */
const MARKDOWN_LINK_RE = /\[([^\]]*)\]\(([^)]+)\)/g;

/**
 * Bare local image path (not in Markdown syntax), optionally wrapped in
 * backticks. Matches Unix paths under a fixed set of root directories and
 * Windows drive paths. The extension list is derived from IMAGE_EXTENSIONS so
 * the two stay in sync.
 */
const IMAGE_EXT_PATTERN = buildExtensionAlternation(IMAGE_EXTENSIONS);
const BARE_IMAGE_PATH_RE = new RegExp(
  String.raw`\`?((?:\/(?:tmp|var|private|Users|home|root)\/[^\s\`'",)]+|[A-Za-z]:[\\/][^\s\`'",)]+)\.(?:${IMAGE_EXT_PATTERN}))\`?`,
  "gi"
);

/**
 * Bare local file path (non-image), optionally wrapped in backticks.
 * Generated from NON_IMAGE_EXTENSIONS.
 */
const NON_IMAGE_EXT_PATTERN = buildExtensionAlternation(NON_IMAGE_EXTENSIONS);
const WINDOWS_PATH_SEP = String.raw`(?:\\\\|\\)`;
const WINDOWS_FILE_PATH = String.raw`[A-Za-z]:${WINDOWS_PATH_SEP}(?:[^\\/:*?"<>|\r\n]+${WINDOWS_PATH_SEP})*[^\\/:*?"<>|\r\n]+`;
const UNIX_FILE_PATH = String.raw`\/(?:tmp|var|private|Users|home|root)\/[^\s'",)]+`;
const BARE_FILE_PATH_RE = new RegExp(
  String.raw`\`?((?:${UNIX_FILE_PATH}|${WINDOWS_FILE_PATH})\.(?:${NON_IMAGE_EXT_PATTERN}))\`?`,
  "gi"
);
219
+
220
// Helpers for parsing MEDIA: lines
const MEDIA_LINE_PREFIX = "MEDIA:";

/**
 * Strips one pair of matching quotes (", ' or `) from around a payload.
 *
 * @returns The trimmed inner text, or undefined when the payload is not
 *          wrapped in a matching pair of quotes
 */
function unwrapMediaLinePayload(value: string): string | undefined {
  const candidate = value.trim();
  const opening = candidate.charAt(0);
  const closing = candidate.charAt(candidate.length - 1);
  const isQuoted =
    candidate.length >= 2 &&
    opening === closing &&
    (opening === `"` || opening === "'" || opening === "`");
  return isQuoted ? candidate.slice(1, -1).trim() : undefined;
}
232
+
233
/**
 * Strips wrapper characters from both ends of a candidate: leading quotes and
 * opening brackets, then trailing quotes, closing brackets and punctuation.
 */
function cleanMediaLineCandidate(value: string): string {
  const leadingWrappers = /^[`"'[{(<]+/;
  const trailingWrappers = /[`"'\])}>.,;]+$/;
  const withoutLeading = value.replace(leadingWrappers, "");
  return withoutLeading.replace(trailingWrappers, "");
}
236
+
237
/**
 * Splits a MEDIA: payload into candidates: a quoted payload with non-empty
 * inner text is one candidate, anything else is split on whitespace.
 */
function splitMediaLineCandidates(payload: string): string[] {
  const quoted = unwrapMediaLinePayload(payload);
  return quoted
    ? [quoted]
    : payload.split(/\s+/).filter((token) => token.length > 0);
}
242
+
243
+ // ============================================================================
244
+ // 路径处理函数
245
+ // ============================================================================
246
+
247
/**
 * Tells whether a value starts with an http:// or https:// scheme
 * (case-insensitive).
 */
export function isHttpUrl(value: string): boolean {
  return value.search(/^https?:\/\//i) === 0;
}
253
+
254
/**
 * Tells whether a value starts with the file:// scheme (case-insensitive).
 */
export function isFileUrl(value: string): boolean {
  return value.match(/^file:\/\//i) !== null;
}
260
+
261
/**
 * Tells whether a raw reference points at a local path.
 *
 * HTTP(S) URLs never qualify. Otherwise the value qualifies when it starts
 * with file://, MEDIA:, attachment://, "/" or "~", or with a Windows drive
 * prefix such as "C:\" or "C:/".
 */
export function isLocalReference(raw: string): boolean {
  if (isHttpUrl(raw)) return false;

  const localPrefixes = ["file://", "MEDIA:", "attachment://", "/", "~"];
  if (localPrefixes.some((prefix) => raw.startsWith(prefix))) {
    return true;
  }
  return /^[a-zA-Z]:[\\/]/.test(raw);
}
276
+
277
/**
 * Normalizes a local path reference to an absolute filesystem path.
 *
 * Steps, in order:
 *  1. strip a leading MEDIA: or attachment:// wrapper (and whitespace after it);
 *  2. convert file:// URLs with fileURLToPath, falling back to stripping the scheme;
 *  3. unescape "\ " to a space and URI-decode when the text decodes cleanly;
 *  4. expand "~" and "~/..." to the current user's home directory;
 *  5. resolve relative paths against the current working directory.
 *
 * @param raw - Raw path or reference
 * @returns The normalized absolute path
 */
export function normalizeLocalPath(raw: string): string {
  let p = raw.trim();

  // Strip wrappers before looking at the scheme so "MEDIA: /tmp/a.png" does not
  // keep a leading space (and get resolved against the cwd), and so
  // "MEDIA:file:///tmp/a.png" is still recognized as a file URL.
  if (p.startsWith("MEDIA:")) {
    p = p.slice("MEDIA:".length).trim();
  } else if (p.startsWith("attachment://")) {
    p = p.slice("attachment://".length).trim();
  }

  // file:// URL
  if (/^file:\/\//i.test(p)) {
    try {
      return fileURLToPath(p);
    } catch {
      // Strip only the scheme so "file:///tmp/x" keeps its root slash; drop the
      // slash in front of a Windows drive letter ("/C:/x" -> "C:/x").
      p = p.replace(/^file:\/\//i, "").replace(/^\/(?=[A-Za-z]:[\\/])/, "");
    }
  }

  // Escaped spaces
  p = p.replace(/\\ /g, " ");

  // Best-effort URI decoding
  try {
    p = decodeURIComponent(p);
  } catch {
    // Not valid percent-encoding: keep the text as-is.
  }

  // Home directory shorthand. The "~username" form is intentionally left untouched.
  if (p === "~" || p.startsWith("~/")) {
    p = path.join(os.homedir(), p.slice(1));
  }

  // Relative path
  if (!path.isAbsolute(p)) {
    p = path.resolve(process.cwd(), p);
  }

  return p;
}
325
+
326
/**
 * Removes the title part of a Markdown link target (`url "title"`).
 *
 * A title is stripped only when it is explicitly quoted. A double-quoted
 * title may contain apostrophes and a single-quoted title may contain double
 * quotes. Values without a quoted title are returned trimmed.
 *
 * @param value - Raw link target
 * @returns The URL or path without the title
 */
export function stripTitleFromUrl(value: string): string {
  const trimmed = value.trim();
  // The last alternative keeps the earlier lenient behavior of accepting a
  // title whose opening and closing quote characters differ.
  const match = trimmed.match(
    /^(\S+)\s+(?:"[^"]*"|'[^']*'|["'][^"']*["'])\s*$/
  );
  return match ? match[1] : trimmed;
}
335
+
336
/**
 * Returns the lowercase file extension without the leading dot, or an empty
 * string when the path has no extension.
 */
export function getExtension(filePath: string): string {
  return path.extname(filePath).toLowerCase().replace(/^\./, "");
}
343
+
344
/**
 * Tells whether the path's extension is one of IMAGE_EXTENSIONS.
 */
export function isImagePath(filePath: string): boolean {
  const ext = getExtension(filePath);
  return ext.length > 0 && IMAGE_EXTENSIONS.has(ext);
}
351
+
352
/**
 * Tells whether the path's extension is one of NON_IMAGE_EXTENSIONS.
 */
export function isNonImageFilePath(filePath: string): boolean {
  const ext = getExtension(filePath);
  return ext.length > 0 && NON_IMAGE_EXTENSIONS.has(ext);
}
359
+
360
/**
 * Detects the media type from the file extension. Image, audio and video
 * extensions are checked in that order; everything else is "file".
 */
export function detectMediaType(filePath: string): MediaType {
  const ext = getExtension(filePath);

  const categories: ReadonlyArray<readonly [ReadonlySet<string>, MediaType]> = [
    [IMAGE_EXTENSIONS, "image"],
    [AUDIO_EXTENSIONS, "audio"],
    [VIDEO_EXTENSIONS, "video"],
  ];
  for (const [extensions, mediaType] of categories) {
    if (extensions.has(ext)) return mediaType;
  }

  return "file";
}
372
+
373
+ // ============================================================================
374
+ // 媒体提取函数
375
+ // ============================================================================
376
+
377
/**
 * Builds an ExtractedMedia record for a raw source.
 *
 * HTTP/local classification uses the raw source; the stored `source`, the
 * local path and the file name use the source with any quoted title removed.
 * The `options` parameter is accepted but not read here.
 */
function createExtractedMedia(
  source: string,
  sourceKind: MediaSourceKind,
  options?: MediaParseOptions
): ExtractedMedia {
  const cleanSource = stripTitleFromUrl(source);
  const isHttp = isHttpUrl(source);
  const isLocal = !isHttp && isLocalReference(source);

  const localPath = isLocal ? normalizeLocalPath(cleanSource) : undefined;

  let fileName: string | undefined;
  if (localPath !== undefined) {
    fileName = path.basename(localPath);
  } else if (isHttp) {
    try {
      fileName = path.basename(new URL(cleanSource).pathname) || undefined;
    } catch {
      // Unparseable URL: leave the file name unset.
    }
  }

  return {
    source: cleanSource,
    localPath,
    // Prefer the file name for type detection; fall back to the source itself.
    type: detectMediaType(fileName || cleanSource),
    isLocal,
    isHttp,
    fileName,
    sourceKind,
  };
}
416
+
417
/**
 * Extracts all media references from a piece of text.
 *
 * Passes run in a fixed order, each on the text left by the previous pass:
 * MEDIA: lines (opt-in), linked Markdown images, Markdown images, HTML img
 * tags, Markdown links to local non-image files, bare image paths, bare file
 * paths. Items are de-duplicated by normalized local path (or source).
 *
 * @param text - Text to parse
 * @param options - Parse options
 * @returns The cleaned text plus the extracted images and files
 */
export function extractMediaFromText(
  text: string,
  options: MediaParseOptions = {}
): MediaParseResult {
  const {
    removeFromText = true,
    checkExists = false,
    existsSync,
    parseMediaLines = false,
    parseMarkdownImages = true,
    parseHtmlImages = true,
    parseBarePaths = true,
    parseMarkdownLinks = true,
  } = options;

  const images: ExtractedMedia[] = [];
  const files: ExtractedMedia[] = [];
  const seenSources = new Set<string>();
  let result = text;

  // Records a media item unless it is a duplicate or (with checkExists) a
  // missing local file. Returns true when the item was recorded.
  const addMedia = (media: ExtractedMedia): boolean => {
    const key = media.localPath || media.source;
    if (seenSources.has(key)) return false;

    if (checkExists && media.isLocal && media.localPath) {
      const exists = existsSync
        ? existsSync(media.localPath)
        : fs.existsSync(media.localPath);
      if (!exists) return false;
    }

    seenSources.add(key);

    if (media.type === "image") {
      images.push(media);
    } else {
      files.push(media);
    }
    return true;
  };

  // 0. Line-leading MEDIA: directives
  if (parseMediaLines) {
    const keptLines: string[] = [];
    for (const line of result.split("\n")) {
      const trimmedStart = line.trimStart();
      const payload = trimmedStart.startsWith(MEDIA_LINE_PREFIX)
        ? trimmedStart.slice(MEDIA_LINE_PREFIX.length).trim()
        : "";
      if (!payload) {
        keptLines.push(line);
        continue;
      }

      let addedAny = false;
      for (const raw of splitMediaLineCandidates(payload)) {
        const candidate = stripTitleFromUrl(cleanMediaLineCandidate(raw));
        if (!candidate) continue;
        if (!isHttpUrl(candidate) && !isLocalReference(candidate)) {
          continue;
        }
        if (addMedia(createExtractedMedia(candidate, "bare", options))) {
          addedAny = true;
        }
      }

      // Keep the line when nothing was extracted from it or removal is disabled.
      if (!addedAny || !removeFromText) {
        keptLines.push(line);
      }
    }

    if (removeFromText) {
      result = keptLines.join("\n");
    }
  }

  // Pending edits to `result`, expressed as offsets into the current `result`.
  type Replacement = { start: number; end: number; replacement: string };
  const replacements: Replacement[] = [];

  // Queues replacing a whole regex match, when text removal is enabled.
  const queueReplacement = (match: RegExpMatchArray, replacement: string): void => {
    if (!removeFromText || match.index === undefined) return;
    replacements.push({
      start: match.index,
      end: match.index + match[0].length,
      replacement,
    });
  };

  // Applies queued edits back to front so earlier offsets stay valid.
  const applyReplacements = (): void => {
    if (replacements.length === 0) return;
    replacements.sort((a, b) => b.start - a.start);
    for (const { start, end, replacement } of replacements) {
      result = result.slice(0, start) + replacement + result.slice(end);
    }
    replacements.length = 0;
  };

  // NOTE(review): in the image passes (1, 2, 3, 5) the matched text is removed
  // even when addMedia() rejects the item, unlike the file passes. Behavior is
  // kept as-is; confirm whether this is intended.

  // 1. Images wrapped in a Markdown link: [![alt](img)](link)
  if (parseMarkdownImages) {
    // Match against `result`, not the original `text`: pass 0 may already have
    // removed lines, and the offsets are applied to `result`.
    const linkedMatches = [...result.matchAll(MARKDOWN_LINKED_IMAGE_RE)];
    for (const match of linkedMatches) {
      const [, , imgSrc] = match;
      const media = createExtractedMedia(imgSrc, "markdown", options);
      if (media.type === "image") {
        addMedia(media);
        queueReplacement(match, "");
      }
    }
    applyReplacements();
  }

  // 2. Markdown images: ![alt](path)
  if (parseMarkdownImages) {
    const mdMatches = [...result.matchAll(MARKDOWN_IMAGE_RE)];
    for (const match of mdMatches) {
      const [, , src] = match;
      const media = createExtractedMedia(src, "markdown", options);
      if (media.type === "image") {
        addMedia(media);
        queueReplacement(match, "");
      }
    }
    applyReplacements();
  }

  // 3. HTML img tags
  if (parseHtmlImages) {
    const htmlMatches = [...result.matchAll(HTML_IMAGE_RE)];
    for (const match of htmlMatches) {
      const [, src1, src2, src3] = match;
      const src = src1 || src2 || src3;
      if (!src) continue;
      const media = createExtractedMedia(src, "html", options);
      if (media.type === "image") {
        addMedia(media);
        queueReplacement(match, "");
      }
    }
    applyReplacements();
  }

  // 4. Files referenced by Markdown links: [label](path)
  if (parseMarkdownLinks) {
    MARKDOWN_LINK_RE.lastIndex = 0;
    const linkMatches = [...result.matchAll(MARKDOWN_LINK_RE)];
    for (const match of linkMatches) {
      const [, , rawPath] = match;
      const idx = match.index ?? 0;

      // Skip image syntax ![...](...): the previous character is "!".
      if (idx > 0 && result[idx - 1] === "!") continue;

      // Only local references are extracted from links.
      if (!isLocalReference(rawPath)) continue;

      const media = createExtractedMedia(rawPath, "markdown", options);

      // Only non-image files with a known extension.
      if (media.type !== "image" && isNonImageFilePath(media.localPath || rawPath)) {
        if (addMedia(media)) {
          const fileName = media.fileName || path.basename(rawPath);
          queueReplacement(match, `[文件: ${fileName}]`);
        }
      }
    }
    applyReplacements();
  }

  // 5. Bare local image paths
  if (parseBarePaths && parseMarkdownImages) {
    BARE_IMAGE_PATH_RE.lastIndex = 0;
    const bareImageMatches = [...result.matchAll(BARE_IMAGE_PATH_RE)];

    // Drop paths that sit inside Markdown syntax: "](" within the preceding 10 characters.
    const newBareImageMatches = bareImageMatches.filter((m) => {
      const idx = m.index ?? 0;
      const before = result.slice(Math.max(0, idx - 10), idx);
      return !before.includes("](");
    });

    for (const match of newBareImageMatches) {
      const [, rawPath] = match;
      const media = createExtractedMedia(rawPath, "bare", options);
      if (media.type === "image") {
        addMedia(media);
        queueReplacement(match, "");
      }
    }
    applyReplacements();
  }

  // 6. Bare local file paths (non-image)
  if (parseBarePaths && parseMarkdownLinks) {
    BARE_FILE_PATH_RE.lastIndex = 0;
    const bareFileMatches = [...result.matchAll(BARE_FILE_PATH_RE)];

    for (const match of bareFileMatches) {
      const [, rawPath] = match;
      const media = createExtractedMedia(rawPath, "bare", options);

      if (media.type !== "image") {
        if (addMedia(media)) {
          const fileName = media.fileName || path.basename(rawPath);
          queueReplacement(match, `[文件: ${fileName}]`);
        }
      }
    }
    applyReplacements();
  }

  // Collapse runs of three or more newlines left behind by removals.
  if (removeFromText) {
    result = result.replace(/\n{3,}/g, "\n\n").trim();
  }

  return {
    text: result,
    images,
    files,
    all: [...images, ...files],
  };
}
688
+
689
/**
 * Convenience wrapper that extracts images only: runs extractMediaFromText
 * with Markdown link parsing disabled and returns just the text and images.
 */
export function extractImagesFromText(
  text: string,
  options: Omit<MediaParseOptions, "parseMarkdownLinks"> = {}
): { text: string; images: ExtractedMedia[] } {
  const { text: cleanedText, images } = extractMediaFromText(text, {
    ...options,
    parseMarkdownLinks: false,
  });
  return { text: cleanedText, images };
}
705
+
706
/**
 * Convenience wrapper that extracts files only: runs extractMediaFromText
 * with Markdown image and HTML image parsing disabled and returns just the
 * text and files.
 */
export function extractFilesFromText(
  text: string,
  options: Omit<MediaParseOptions, "parseMarkdownImages" | "parseHtmlImages"> = {}
): { text: string; files: ExtractedMedia[] } {
  const { text: cleanedText, files } = extractMediaFromText(text, {
    ...options,
    parseMarkdownImages: false,
    parseHtmlImages: false,
  });
  return { text: cleanedText, files };
}