video-pipeline 1.2.4 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +1 -1
- package/CHANGELOG.md +33 -0
- package/README.md +86 -9
- package/package.json +1 -1
- package/process_videos.js +151 -37
package/.env.example
CHANGED
|
@@ -170,4 +170,4 @@ AI_TIMEOUT=300
|
|
|
170
170
|
AI_TEMPERATURE=0.3
|
|
171
171
|
# 【关联】提示词模板,{content} 占位符会被识别文本替换
|
|
172
172
|
# 【自由】提示词内容可随意修改,但必须保留 {content} 占位符
|
|
173
|
-
AI_PROMPT_TPL
|
|
173
|
+
AI_PROMPT_TPL=帮我归纳总结一下提供内容的关键词,尽可能全面,无遗漏,无重复,无幻想,关键词之间用英文逗号分隔开。如果内容为英文,则关键词全部是英文,如果内容是中文,则关键词以中文为主,可以附带一些英文关键词。这是内容:{content}
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,38 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [1.2.6] - 2026-06-11
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
|
|
7
|
+
- 报告按 sheet/站点分目录存储 (`6610c57`)
|
|
8
|
+
- 统一三种来源报告格式 + 修复多处 bug (`2a6f606`)
|
|
9
|
+
|
|
10
|
+
### Bug Fixes
|
|
11
|
+
|
|
12
|
+
- groupBySheetMap 返回 Map 而非普通对象,修复 for...of 不可迭代错误 (`dfae532`)
|
|
13
|
+
|
|
14
|
+
### Documentation
|
|
15
|
+
|
|
16
|
+
- update (`6ddc48b`)
|
|
17
|
+
- 修正 --input 模式的 {sheet} 表述为固定 local (`be61e29`)
|
|
18
|
+
- 输出结构速查表 — 三来源×四环节对照 (`248168c`)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
## [1.2.5] - 2026-06-11
|
|
22
|
+
|
|
23
|
+
### Features
|
|
24
|
+
|
|
25
|
+
- 新增 --offset / --limit 参数,支持跳过和限量处理 Excel 数据 (`d691822`)
|
|
26
|
+
|
|
27
|
+
### Bug Fixes
|
|
28
|
+
|
|
29
|
+
- base_dir 改用 cwd 而非脚本安装目录,修复全局安装后路径解析错误 (`0fe5b1e`)
|
|
30
|
+
|
|
31
|
+
### Documentation
|
|
32
|
+
|
|
33
|
+
- update (`45a8d45`)
|
|
34
|
+
|
|
35
|
+
|
|
3
36
|
## [1.2.4] - 2026-06-11
|
|
4
37
|
|
|
5
38
|
### Features
|
package/README.md
CHANGED
|
@@ -158,8 +158,22 @@ WHISPER_LANGUAGE=zh # 空=多语言自动检测(默认),需要指
|
|
|
158
158
|
├── transcoded/ # ffmpeg 转码输出(wav 16kHz mono)
|
|
159
159
|
│ ├── YouTube视频/
|
|
160
160
|
│ └── 普诺赛中文站/
|
|
161
|
-
├── reports/ #
|
|
162
|
-
│
|
|
161
|
+
├── reports/ # 执行报告(按 sheet/站点分目录)
|
|
162
|
+
│ ├── YouTube视频/
|
|
163
|
+
│ │ ├── report_YYYYMMDD_HHMMSS.json # JSON 报告(机器可读,用于重跑)
|
|
164
|
+
│ │ └── tasks/ # 人类可读文本摘要
|
|
165
|
+
│ │ ├── 2143.txt
|
|
166
|
+
│ │ └── ...
|
|
167
|
+
│ ├── 普诺赛中文站/
|
|
168
|
+
│ │ ├── report_YYYYMMDD_HHMMSS.json
|
|
169
|
+
│ │ └── tasks/
|
|
170
|
+
│ │ └── ...
|
|
171
|
+
│ ├── youtube/ # --url 模式按平台名分目录
|
|
172
|
+
│ │ ├── report_YYYYMMDD_HHMMSS.json
|
|
173
|
+
│ │ └── tasks/
|
|
174
|
+
│ └── local/ # --input 模式默认目录
|
|
175
|
+
│ ├── report_YYYYMMDD_HHMMSS.json
|
|
176
|
+
│ └── tasks/
|
|
163
177
|
├── scripts/ # 辅助脚本
|
|
164
178
|
│ ├── release.js # 版本发布脚本
|
|
165
179
|
│ └── regenerate-changelog.js # CHANGELOG 重建脚本
|
|
@@ -244,17 +258,21 @@ node process_videos.js --sheet "YouTube视频" --concurrency 2 --retry 3
|
|
|
244
258
|
|
|
245
259
|
# 先干跑预览
|
|
246
260
|
node process_videos.js --dry-run
|
|
261
|
+
|
|
262
|
+
# Excel 数据量大时,偏移+限量调试
|
|
263
|
+
node process_videos.js --offset 10 --limit 5 --dry-run # 跳过前10条,预览5条
|
|
264
|
+
node process_videos.js --limit 3 --concurrency 1 # 只处理前3条
|
|
247
265
|
```
|
|
248
266
|
|
|
249
267
|
### 重跑失败
|
|
250
268
|
|
|
251
269
|
```bash
|
|
252
|
-
# 第一次跑完后生成 reports/report_xxx.json
|
|
270
|
+
# 第一次跑完后生成 reports/{sheet名称}/report_xxx.json
|
|
253
271
|
# 查看失败项:
|
|
254
|
-
node process_videos.js --retry-failed reports/report_20260610_143000.json --dry-run
|
|
272
|
+
node process_videos.js --retry-failed reports/YouTube视频/report_20260610_143000.json --dry-run
|
|
255
273
|
|
|
256
274
|
# 重跑:
|
|
257
|
-
node process_videos.js --retry-failed reports/report_20260610_143000.json --concurrency 2 --retry 3
|
|
275
|
+
node process_videos.js --retry-failed reports/YouTube视频/report_20260610_143000.json --concurrency 2 --retry 3
|
|
258
276
|
```
|
|
259
277
|
|
|
260
278
|
### 超时控制(防止任务卡死)
|
|
@@ -344,6 +362,8 @@ node process_videos.js --input "downloads/产品介绍.mp4" --step analyze
|
|
|
344
362
|
|---|---|---|---|
|
|
345
363
|
| `--sheet <name>` | str | 全部 | 指定 sheet 名称 |
|
|
346
364
|
| `--id <id>` | str | — | 指定 extra.id 或 title(单条测试) |
|
|
365
|
+
| `--offset <n>` | int | 0 | 跳过前 N 条任务(从 0 开始,按所有 sheet 合并后的任务列表计数),适合调试大量数据 |
|
|
366
|
+
| `--limit <n>` | int | 0 | 最多处理 N 条任务(所有 sheet 合计,在 --offset 跳过之后计数),0 表示无限制 |
|
|
347
367
|
| `--step <step>` | str | 全跑 | 只执行某步:`download` / `transcode` / `transcribe` / `analyze` |
|
|
348
368
|
| `--force` | flag | off | 强制重做下载+转码,忽略已有文件 |
|
|
349
369
|
| `--concurrency <n>` | int | 1 | 并发数,建议 2~3 |
|
|
@@ -354,7 +374,7 @@ node process_videos.js --input "downloads/产品介绍.mp4" --step analyze
|
|
|
354
374
|
| `--transcribe-timeout <n>` | int | 600 | 单个识别任务最长执行时间(秒) |
|
|
355
375
|
| `--analyze-timeout <n>` | int | 300 | 单个 AI 分析任务最长执行时间(秒) |
|
|
356
376
|
| `--dry-run` | flag | off | 干跑模式,只列任务不执行 |
|
|
357
|
-
| `--retry-failed <path>` | path | — | 从报告 JSON
|
|
377
|
+
| `--retry-failed <path>` | path | — | 从报告 JSON 重跑失败项(如 `reports/YouTube视频/report_xxx.json`) |
|
|
358
378
|
| `--init` | flag | off | 复制 .env.example 到当前目录并重命名为 .env |
|
|
359
379
|
| `--file <path>` | path | — | 指定 Excel 文件路径(优先级高于 EXCEL_FILE 环境变量) |
|
|
360
380
|
| `--input <path>` | path | — | 指定本地视频文件路径(跳过下载,直接转码→识别→分析) |
|
|
@@ -504,9 +524,64 @@ node process_videos.js --sheet "YouTube视频" --step analyze --concurrency 2
|
|
|
504
524
|
|
|
505
525
|
---
|
|
506
526
|
|
|
507
|
-
##
|
|
527
|
+
## 输出结构速查表
|
|
528
|
+
|
|
529
|
+
三种输入来源在不同处理环节的输出路径汇总如下。所有路径均以 `output/` 为根(可通过 `DOWNLOADS_DIR` / `TRANSCODED_DIR` / `REPORTS_DIR` 环境变量覆盖)。
|
|
530
|
+
|
|
531
|
+
> `{sheet}` = Excel 工作表名(如 `YouTube视频`、`普诺赛中文站`)
|
|
532
|
+
> `{platform}` = 视频平台标识(如 `youtube`、`bilibili`、`tencentVid`、`youku`)
|
|
533
|
+
> `{stem}` = 去重后的安全文件名(不含扩展名)
|
|
534
|
+
|
|
535
|
+
### ① Excel 批量模式(默认)
|
|
536
|
+
|
|
537
|
+
| 环节 | 输出路径 | 产物格式 | 说明 |
|
|
538
|
+
|------|---------|---------|------|
|
|
539
|
+
| 下载 | `output/downloads/{sheet}/{stem}.mp4` | 视频 | yt-dlp 下载原始视频 |
|
|
540
|
+
| 转码 | `output/transcoded/{sheet}/{stem}.wav` | 音频 | ffmpeg 转 16kHz mono WAV |
|
|
541
|
+
| JSON 报告 | `output/reports/{sheet}/report_YYYYMMDD_HHMMSS.json` | JSON | 机器可读,含 summary + failed_items,可供 --retry-failed 重跑 |
|
|
542
|
+
| 文本报告 | `output/reports/{sheet}/tasks/{stem}.txt` | 文本 | 人类可读,含语音识别原文 + AI 关键词分析 |
|
|
543
|
+
|
|
544
|
+
> 多 sheet 同时执行时,每个 sheet 独立一个子目录,互不干扰。
|
|
545
|
+
|
|
546
|
+
### ② --url 直链模式
|
|
547
|
+
|
|
548
|
+
| 环节 | 输出路径 | 产物格式 | 说明 |
|
|
549
|
+
|------|---------|---------|------|
|
|
550
|
+
| 下载 | `output/downloads/{platform}/{stem}.mp4` | 视频 | yt-dlp 下载单个视频 |
|
|
551
|
+
| 转码 | `output/transcoded/{platform}/{stem}.wav` | 音频 | ffmpeg 转 16kHz mono WAV |
|
|
552
|
+
| JSON 报告 | `output/reports/{platform}/report_YYYYMMDD_HHMMSS.json` | JSON | 格式与 Excel 模式一致 |
|
|
553
|
+
| 文本报告 | `output/reports/{platform}/tasks/{stem}.txt` | 文本 | 含识别原文 + AI 分析 |
|
|
554
|
+
|
|
555
|
+
> `{platform}` 由脚本自动从 URL 解析,如 `https://www.youtube.com/watch?v=xxx` → `youtube`。
|
|
556
|
+
|
|
557
|
+
### ③ --input 本地文件模式
|
|
558
|
+
|
|
559
|
+
| 环节 | 输出路径 | 产物格式 | 说明 |
|
|
560
|
+
|------|---------|---------|------|
|
|
561
|
+
| 下载 | —(跳过) | — | 本地文件无需下载 |
|
|
562
|
+
| 转码 | `output/transcoded/local/{stem}.wav` | 音频 | ffmpeg 转 16kHz mono WAV |
|
|
563
|
+
| JSON 报告 | `output/reports/local/report_YYYYMMDD_HHMMSS.json` | JSON | 格式与 Excel 模式一致 |
|
|
564
|
+
| 文本报告 | `output/reports/local/tasks/{stem}.txt` | 文本 | 含识别原文 + AI 分析 |
|
|
565
|
+
|
|
566
|
+
> `local` 是 `--input` 模式的固定目录名(与 Excel 模式的 sheet 名无关),所有本地文件处理结果统一归入此目录。
|
|
567
|
+
|
|
568
|
+
---
|
|
569
|
+
|
|
570
|
+
### 三种来源对比一览
|
|
571
|
+
|
|
572
|
+
| 维度 | Excel 批量 | --url 直链 | --input 本地文件 |
|
|
573
|
+
|------|-----------|-----------|-----------------|
|
|
574
|
+
| 输入 | Excel 行(多视频批量) | 单个视频 URL | 本地视频/音频文件 |
|
|
575
|
+
| 下载目录 | `downloads/{sheet}/` | `downloads/{platform}/` | 无 |
|
|
576
|
+
| 转码目录 | `transcoded/{sheet}/` | `transcoded/{platform}/` | `transcoded/local/` |
|
|
577
|
+
| 报告目录 | `reports/{sheet}/` | `reports/{platform}/` | `reports/local/` |
|
|
578
|
+
| 分组依据 | Excel sheet 名 | URL 解析的平台名 | 固定 `local` |
|
|
579
|
+
| 并发支持 | ✅ 多线程 | ❌ 单任务 | ❌ 单任务 |
|
|
580
|
+
| 支持 --retry-failed | ✅ | ❌ | ❌ |
|
|
508
581
|
|
|
509
|
-
|
|
582
|
+
---
|
|
583
|
+
|
|
584
|
+
### JSON 报告结构
|
|
510
585
|
|
|
511
586
|
```json
|
|
512
587
|
{
|
|
@@ -532,6 +607,8 @@ node process_videos.js --sheet "YouTube视频" --step analyze --concurrency 2
|
|
|
532
607
|
}
|
|
533
608
|
```
|
|
534
609
|
|
|
610
|
+
### 状态含义
|
|
611
|
+
|
|
535
612
|
- **success**:下载 + 转码 + 识别全部成功,且 AI 分析未失败(AI 分析被跳过或禁用时不影响此状态)
|
|
536
613
|
- **partial**:下载 + 转码成功,识别或 AI 分析失败
|
|
537
614
|
- **failed**:下载或转码失败
|
|
@@ -554,7 +631,7 @@ node process_videos.js --sheet "YouTube视频" --id 2143 --retry 2
|
|
|
554
631
|
node process_videos.js --concurrency 3 --retry 3
|
|
555
632
|
|
|
556
633
|
# 4. 查看报告,重跑失败项
|
|
557
|
-
node process_videos.js --retry-failed reports/report_xxx.json --concurrency 2 --retry 3
|
|
634
|
+
node process_videos.js --retry-failed reports/YouTube视频/report_xxx.json --concurrency 2 --retry 3
|
|
558
635
|
```
|
|
559
636
|
|
|
560
637
|
---
|
package/package.json
CHANGED
package/process_videos.js
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
* node process_videos.js --sheet "普诺赛中文站" --id 427
|
|
9
9
|
* node process_videos.js --step download
|
|
10
10
|
* node process_videos.js --dry-run
|
|
11
|
+
* node process_videos.js --offset 10 --limit 5 # 跳过前10条,只处理5条
|
|
11
12
|
*/
|
|
12
13
|
|
|
13
14
|
// ============================== 依赖 ==============================
|
|
@@ -34,7 +35,7 @@ dotenv.config({ path: _dotenvPath });
|
|
|
34
35
|
// ============================== 路径配置 ==============================
|
|
35
36
|
const __filename = fileURLToPath(import.meta.url);
|
|
36
37
|
const __dirname = path.dirname(__filename);
|
|
37
|
-
const BASE_DIR =
|
|
38
|
+
const BASE_DIR = process.cwd();
|
|
38
39
|
|
|
39
40
|
function envPath(key, defaultValue) {
|
|
40
41
|
const val = process.env[key] || defaultValue;
|
|
@@ -1156,11 +1157,11 @@ function writeAllContentsToExcel(results, keywordsDict = null) {
|
|
|
1156
1157
|
}
|
|
1157
1158
|
|
|
1158
1159
|
function groupBySheetMap(updates) {
|
|
1159
|
-
const result =
|
|
1160
|
+
const result = new Map();
|
|
1160
1161
|
for (const [compositeKey, text] of updates) {
|
|
1161
1162
|
const [sheetName, key] = compositeKey.split('|');
|
|
1162
|
-
if (!result
|
|
1163
|
-
result
|
|
1163
|
+
if (!result.has(sheetName)) result.set(sheetName, {});
|
|
1164
|
+
result.get(sheetName)[key] = text;
|
|
1164
1165
|
}
|
|
1165
1166
|
return result;
|
|
1166
1167
|
}
|
|
@@ -1177,10 +1178,31 @@ function computeSummary(results) {
|
|
|
1177
1178
|
return { total: results.length, success, partial, failed, no_video: noVideo };
|
|
1178
1179
|
}
|
|
1179
1180
|
|
|
1180
|
-
|
|
1181
|
-
|
|
1181
|
+
/**
|
|
1182
|
+
* 生成执行报告 JSON 文件。
|
|
1183
|
+
* - 提供 sheetName 时:报告存入 REPORTS_DIR/{sheetName}/report_{ts}.json
|
|
1184
|
+
* - 不提供时:按 r.sheet 分组,每 sheet 调用自身,返回路径数组
|
|
1185
|
+
*/
|
|
1186
|
+
function generateReport(results, config, sheetName) {
|
|
1187
|
+
if (!sheetName) {
|
|
1188
|
+
// ── 按 sheet 分组生成 ──
|
|
1189
|
+
const sheetGroups = new Map();
|
|
1190
|
+
for (const r of results) {
|
|
1191
|
+
if (!sheetGroups.has(r.sheet)) sheetGroups.set(r.sheet, []);
|
|
1192
|
+
sheetGroups.get(r.sheet).push(r);
|
|
1193
|
+
}
|
|
1194
|
+
const paths = [];
|
|
1195
|
+
for (const [sheet, items] of sheetGroups) {
|
|
1196
|
+
paths.push(generateReport(items, config, sheet));
|
|
1197
|
+
}
|
|
1198
|
+
return paths;
|
|
1199
|
+
}
|
|
1200
|
+
|
|
1201
|
+
// ── 单 sheet 报告 ──
|
|
1202
|
+
const dir = path.join(REPORTS_DIR, sheetName);
|
|
1203
|
+
fs.mkdirSync(dir, { recursive: true });
|
|
1182
1204
|
const ts = new Date().toISOString().replace(/[-:T]/g, '').slice(0, 15).replace(/(\d{8})(\d{6})/, '$1_$2');
|
|
1183
|
-
const reportFile = path.join(
|
|
1205
|
+
const reportFile = path.join(dir, `report_${ts}.json`);
|
|
1184
1206
|
|
|
1185
1207
|
const summary = computeSummary(results);
|
|
1186
1208
|
|
|
@@ -1527,62 +1549,106 @@ async function runInputTask(opts) {
|
|
|
1527
1549
|
|
|
1528
1550
|
console.log(c('dim', '\n── 开始执行 ──\n'));
|
|
1529
1551
|
|
|
1552
|
+
// ── 解决 stem 重名 ──
|
|
1553
|
+
let usedStem = stem;
|
|
1554
|
+
{
|
|
1555
|
+
let counter = 1;
|
|
1556
|
+
const tcDir = path.join(TRANSCODED_DIR, sheetName);
|
|
1557
|
+
fs.mkdirSync(tcDir, { recursive: true });
|
|
1558
|
+
let testPath = path.join(tcDir, usedStem + TRANSCODE_EXT);
|
|
1559
|
+
while (fs.existsSync(testPath) && !steps.includes('transcode')) {
|
|
1560
|
+
// 跳过转码但转码产物已存在 → 直接用
|
|
1561
|
+
break;
|
|
1562
|
+
}
|
|
1563
|
+
if (steps.includes('transcode') && !force) {
|
|
1564
|
+
while (fs.existsSync(testPath)) {
|
|
1565
|
+
usedStem = `${stem}_${counter}`;
|
|
1566
|
+
testPath = path.join(tcDir, usedStem + TRANSCODE_EXT);
|
|
1567
|
+
counter++;
|
|
1568
|
+
}
|
|
1569
|
+
}
|
|
1570
|
+
}
|
|
1571
|
+
if (usedStem !== stem) {
|
|
1572
|
+
console.log(` ⚠️ stem "${stem}" 已存在 → 使用 "${usedStem}"`);
|
|
1573
|
+
}
|
|
1574
|
+
|
|
1575
|
+
// ── 构建 TaskResult ──
|
|
1576
|
+
const result = new TaskResult(sheetName, usedStem, path.basename(inputPath), 'local', null, usedStem);
|
|
1577
|
+
result.download = new StepResult('skipped');
|
|
1578
|
+
|
|
1530
1579
|
// ── download: 跳过(本地文件)──
|
|
1531
|
-
console.log(` [${
|
|
1580
|
+
console.log(` [${usedStem}] 📥 下载: ${c('yellow', '已跳过 (本地文件)')}`);
|
|
1532
1581
|
|
|
1533
1582
|
// ── transcode ──
|
|
1534
1583
|
let tcFile = null;
|
|
1535
1584
|
if (steps.includes('transcode')) {
|
|
1536
|
-
console.log(` [${
|
|
1585
|
+
console.log(` [${usedStem}] 🎵 开始转码...`);
|
|
1537
1586
|
try {
|
|
1538
1587
|
const { file, error } = await stepTranscode(inputPath, sheetName, maxRetries, retryDelay, force, transcodeTimeout);
|
|
1539
1588
|
tcFile = file;
|
|
1540
1589
|
if (file && fs.existsSync(file)) {
|
|
1541
1590
|
const size = (fs.statSync(file).size / 1024 / 1024).toFixed(1);
|
|
1542
|
-
console.log(` [${
|
|
1591
|
+
console.log(` [${usedStem}] 🎵 转码完成: ${file} (${size} MB)`);
|
|
1592
|
+
result.transcode = new StepResult('success', file);
|
|
1543
1593
|
} else {
|
|
1544
|
-
console.log(` [${
|
|
1594
|
+
console.log(` [${usedStem}] 🎵 转码: ${c(file ? 'yellow' : 'red', file ? '已跳过 (文件已存在)' : '失败 — ' + (error || ''))}`);
|
|
1595
|
+
result.transcode = new StepResult(file ? 'skipped' : 'failed', file, error);
|
|
1545
1596
|
}
|
|
1546
1597
|
} catch (e) {
|
|
1547
|
-
console.log(` [${
|
|
1598
|
+
console.log(` [${usedStem}] 🎵 转码: ${c('red', '异常 — ' + (e.message || '').slice(0, 200))}`);
|
|
1599
|
+
result.transcode = new StepResult('failed', null, String(e.message).slice(0, 500));
|
|
1548
1600
|
}
|
|
1549
1601
|
if (!tcFile) {
|
|
1550
1602
|
console.log(c('yellow', '\n⚠️ 转码未产出文件,后续步骤将跳过\n'));
|
|
1603
|
+
result.overall_status = 'failed';
|
|
1604
|
+
result.error = 'transcode failed';
|
|
1605
|
+
return result;
|
|
1551
1606
|
}
|
|
1552
1607
|
} else if (steps.includes('transcribe')) {
|
|
1553
|
-
// 无 transcode 步骤但有 transcribe:优先使用已有转码文件
|
|
1554
1608
|
const tcDir = path.join(TRANSCODED_DIR, sheetName);
|
|
1555
|
-
const expectedTc = path.join(tcDir,
|
|
1609
|
+
const expectedTc = path.join(tcDir, usedStem + TRANSCODE_EXT);
|
|
1556
1610
|
if (fs.existsSync(expectedTc)) {
|
|
1557
1611
|
tcFile = expectedTc;
|
|
1558
|
-
|
|
1612
|
+
result.transcode = new StepResult('success', tcFile);
|
|
1613
|
+
console.log(` [${usedStem}] 🎵 转码: ${c('yellow', '使用已有文件 ' + path.basename(expectedTc))}`);
|
|
1559
1614
|
} else {
|
|
1560
|
-
console.log(` [${
|
|
1615
|
+
console.log(` [${usedStem}] 🎵 转码: ${c('red', '未找到转码文件,将尝试用原始文件识别(可能失败)')}`);
|
|
1561
1616
|
tcFile = inputPath;
|
|
1617
|
+
result.transcode = new StepResult('warning', inputPath, 'transcode file not found, using raw input');
|
|
1562
1618
|
}
|
|
1563
1619
|
} else {
|
|
1564
1620
|
tcFile = inputPath;
|
|
1621
|
+
result.transcode = new StepResult('success', inputPath);
|
|
1565
1622
|
}
|
|
1566
1623
|
|
|
1567
1624
|
// ── transcribe ──
|
|
1568
1625
|
let transcribeText = '';
|
|
1569
1626
|
if (steps.includes('transcribe') && tcFile) {
|
|
1570
1627
|
if (!whisperAvailable) {
|
|
1571
|
-
console.log(` [${
|
|
1628
|
+
console.log(` [${usedStem}] 📝 识别: ${c('red', 'whisper 不可用')}`);
|
|
1629
|
+
result.transcribe = new StepResult('failed', null, 'whisper unreachable');
|
|
1630
|
+
result.overall_status = 'failed';
|
|
1631
|
+
result.error = 'whisper unreachable';
|
|
1632
|
+
return result;
|
|
1572
1633
|
} else {
|
|
1573
|
-
console.log(` [${
|
|
1634
|
+
console.log(` [${usedStem}] 📝 开始语音识别...`);
|
|
1574
1635
|
try {
|
|
1575
1636
|
const { text, error } = await stepTranscribe(tcFile, maxRetries, retryDelay, transcribeTimeout);
|
|
1576
1637
|
if (text && typeof text === 'string') {
|
|
1577
1638
|
transcribeText = text;
|
|
1578
|
-
console.log(` [${
|
|
1639
|
+
console.log(` [${usedStem}] 📝 识别完成: ${text.length} 字符`);
|
|
1640
|
+
result.transcribe = new StepResult('success', text);
|
|
1579
1641
|
} else {
|
|
1580
|
-
console.log(` [${
|
|
1642
|
+
console.log(` [${usedStem}] 📝 识别: ${c('red', '失败 — ' + (error || ''))}`);
|
|
1643
|
+
result.transcribe = new StepResult('failed', null, error);
|
|
1581
1644
|
}
|
|
1582
1645
|
} catch (e) {
|
|
1583
|
-
console.log(` [${
|
|
1646
|
+
console.log(` [${usedStem}] 📝 识别: ${c('red', '异常 — ' + (e.message || '').slice(0, 200))}`);
|
|
1647
|
+
result.transcribe = new StepResult('failed', null, String(e.message).slice(0, 500));
|
|
1584
1648
|
}
|
|
1585
1649
|
}
|
|
1650
|
+
} else {
|
|
1651
|
+
result.transcribe = new StepResult('skipped');
|
|
1586
1652
|
}
|
|
1587
1653
|
|
|
1588
1654
|
// ── AI analyze ──
|
|
@@ -1590,28 +1656,45 @@ async function runInputTask(opts) {
|
|
|
1590
1656
|
if (steps.includes('analyze') && transcribeText) {
|
|
1591
1657
|
const aiEnabled = (process.env.AI_ENABLED || 'true').toLowerCase() === 'true';
|
|
1592
1658
|
if (aiEnabled) {
|
|
1593
|
-
console.log(` [${
|
|
1659
|
+
console.log(` [${usedStem}] 🤖 开始 AI 分析...`);
|
|
1594
1660
|
try {
|
|
1595
1661
|
const { text: kw, error } = await stepAnalyze(transcribeText, maxRetries, retryDelay, analyzeTimeout);
|
|
1596
1662
|
if (kw && typeof kw === 'string') {
|
|
1597
1663
|
analyzeText = kw;
|
|
1598
|
-
console.log(` [${
|
|
1664
|
+
console.log(` [${usedStem}] 🤖 AI分析完成: ${kw.length} 字符`);
|
|
1665
|
+
result.analyze = new StepResult('success', kw);
|
|
1599
1666
|
} else {
|
|
1600
|
-
console.log(` [${
|
|
1667
|
+
console.log(` [${usedStem}] 🤖 AI分析: ${c('red', '失败 — ' + (error || ''))}`);
|
|
1668
|
+
result.analyze = new StepResult('failed', null, error);
|
|
1601
1669
|
}
|
|
1602
1670
|
} catch (e) {
|
|
1603
|
-
console.log(` [${
|
|
1671
|
+
console.log(` [${usedStem}] 🤖 AI分析: ${c('red', '异常 — ' + (e.message || '').slice(0, 200))}`);
|
|
1672
|
+
result.analyze = new StepResult('failed', null, String(e.message).slice(0, 500));
|
|
1604
1673
|
}
|
|
1605
1674
|
} else {
|
|
1606
|
-
console.log(` [${
|
|
1675
|
+
console.log(` [${usedStem}] 🤖 AI分析: ${c('yellow', '已禁用 (AI_ENABLED=false)')}`);
|
|
1676
|
+
result.analyze = new StepResult('skipped');
|
|
1607
1677
|
}
|
|
1678
|
+
} else {
|
|
1679
|
+
result.analyze = new StepResult('skipped');
|
|
1680
|
+
}
|
|
1681
|
+
|
|
1682
|
+
// ── 判定整体状态 ──
|
|
1683
|
+
if (result.transcode.status === 'failed') {
|
|
1684
|
+
result.overall_status = 'failed';
|
|
1685
|
+
} else if (result.transcribe.status === 'failed' && steps.includes('transcribe')) {
|
|
1686
|
+
result.overall_status = 'partial';
|
|
1687
|
+
} else if (result.analyze.status === 'failed') {
|
|
1688
|
+
result.overall_status = 'partial';
|
|
1689
|
+
} else {
|
|
1690
|
+
result.overall_status = 'success';
|
|
1608
1691
|
}
|
|
1609
1692
|
|
|
1610
1693
|
// ── 保存文本结果 ──
|
|
1611
1694
|
if (transcribeText || analyzeText) {
|
|
1612
|
-
const outDir = path.join(REPORTS_DIR, '
|
|
1695
|
+
const outDir = path.join(REPORTS_DIR, sheetName, 'tasks');
|
|
1613
1696
|
fs.mkdirSync(outDir, { recursive: true });
|
|
1614
|
-
const outFile = path.join(outDir, `${
|
|
1697
|
+
const outFile = path.join(outDir, `${usedStem}.txt`);
|
|
1615
1698
|
const lines = [
|
|
1616
1699
|
`文件: ${inputPath}`,
|
|
1617
1700
|
`平台: local`,
|
|
@@ -1642,6 +1725,8 @@ async function runInputTask(opts) {
|
|
|
1642
1725
|
console.log(c('yellow', `⚠️ ${failed.length} 个步骤未成功: ${failed.join(', ')}`));
|
|
1643
1726
|
}
|
|
1644
1727
|
console.log('');
|
|
1728
|
+
|
|
1729
|
+
return result;
|
|
1645
1730
|
}
|
|
1646
1731
|
|
|
1647
1732
|
|
|
@@ -1737,7 +1822,7 @@ async function runUrlTask(opts) {
|
|
|
1737
1822
|
const analyzeText = (result.analyze && typeof result.analyze.file === 'string') ? result.analyze.file : '';
|
|
1738
1823
|
|
|
1739
1824
|
if (transcribeText || analyzeText) {
|
|
1740
|
-
const outDir = path.join(REPORTS_DIR, '
|
|
1825
|
+
const outDir = path.join(REPORTS_DIR, platform, 'tasks');
|
|
1741
1826
|
fs.mkdirSync(outDir, { recursive: true });
|
|
1742
1827
|
const outFile = path.join(outDir, `${stem}.txt`);
|
|
1743
1828
|
const lines = [
|
|
@@ -1757,12 +1842,14 @@ async function runUrlTask(opts) {
|
|
|
1757
1842
|
}
|
|
1758
1843
|
|
|
1759
1844
|
console.log(c('bold', c('green', `\n\uD83C\uDF89 \u5168\u90E8\u5B8C\u6210! (${successes.length}/${steps.length} \u6B65\u6210\u529F)\n`)));
|
|
1845
|
+
return result;
|
|
1760
1846
|
}
|
|
1761
1847
|
|
|
1762
1848
|
async function run({
|
|
1763
1849
|
targetSheet, targetId, steps, maxRetries, retryDelay,
|
|
1764
1850
|
concurrency, force, dryRun, retryFailed,
|
|
1765
1851
|
downloadTimeout, transcodeTimeout, transcribeTimeout, analyzeTimeout,
|
|
1852
|
+
offset = 0, rowLimit = 0,
|
|
1766
1853
|
}) {
|
|
1767
1854
|
// ── 重跑失败模式 ──
|
|
1768
1855
|
if (retryFailed) {
|
|
@@ -1772,7 +1859,7 @@ async function run({
|
|
|
1772
1859
|
|
|
1773
1860
|
// ── 构建任务列表 ──
|
|
1774
1861
|
const sheets = targetSheet ? [targetSheet] : VIDEO_SHEETS;
|
|
1775
|
-
|
|
1862
|
+
let tasks = [];
|
|
1776
1863
|
for (const sheetName of sheets) {
|
|
1777
1864
|
let rows = readExcelSheet(sheetName);
|
|
1778
1865
|
if (targetId) {
|
|
@@ -1796,6 +1883,15 @@ async function run({
|
|
|
1796
1883
|
}
|
|
1797
1884
|
}
|
|
1798
1885
|
|
|
1886
|
+
// ── 偏移/限量(全局,跨 sheet) ──
|
|
1887
|
+
if (offset > 0 || rowLimit > 0) {
|
|
1888
|
+
const start = offset;
|
|
1889
|
+
const end = rowLimit > 0 ? start + rowLimit : undefined;
|
|
1890
|
+
const originalLen = tasks.length;
|
|
1891
|
+
tasks = tasks.slice(start, end);
|
|
1892
|
+
logInfo(`applied offset=${start}, limit=${rowLimit || 'all'} → tasks: ${originalLen} → ${tasks.length}`);
|
|
1893
|
+
}
|
|
1894
|
+
|
|
1799
1895
|
logInfo(`tasks: ${tasks.length}, concurrency: ${concurrency}, max retries: ${maxRetries}`);
|
|
1800
1896
|
|
|
1801
1897
|
const envCheck = await checkEnvironmentAsync(steps);
|
|
@@ -1861,10 +1957,10 @@ async function run({
|
|
|
1861
1957
|
sheets, target_id: targetId, steps, max_retries: maxRetries,
|
|
1862
1958
|
retry_delay: retryDelay, concurrency, force,
|
|
1863
1959
|
};
|
|
1864
|
-
const
|
|
1960
|
+
const reportPaths = generateReport(results, config);
|
|
1865
1961
|
printReportSummary(results);
|
|
1866
1962
|
|
|
1867
|
-
logInfo(`all done!
|
|
1963
|
+
logInfo(`all done! reports: ${Array.isArray(reportPaths) ? reportPaths.join(', ') : reportPaths}`);
|
|
1868
1964
|
}
|
|
1869
1965
|
|
|
1870
1966
|
function printDryRun(tasks, steps, env) {
|
|
@@ -2057,9 +2153,9 @@ async function runFromReport(reportPath, steps, maxRetries, retryDelay, concurre
|
|
|
2057
2153
|
|
|
2058
2154
|
const config = { retry_from: reportPath, steps, max_retries: maxRetries,
|
|
2059
2155
|
retry_delay: retryDelay, concurrency, force };
|
|
2060
|
-
const
|
|
2156
|
+
const reportPaths = generateReport(results, config);
|
|
2061
2157
|
printReportSummary(results);
|
|
2062
|
-
logInfo(`all done!
|
|
2158
|
+
logInfo(`all done! reports: ${Array.isArray(reportPaths) ? reportPaths.join(', ') : reportPaths}`);
|
|
2063
2159
|
}
|
|
2064
2160
|
|
|
2065
2161
|
// ============================== CLI ==============================
|
|
@@ -2069,6 +2165,8 @@ if (process.argv[1] === __filename || process.argv[1]?.endsWith('process_videos.
|
|
|
2069
2165
|
.description('视频下载、转码、文本识别、AI分析一体化流程')
|
|
2070
2166
|
.option('--sheet <name>', '指定 sheet 名称')
|
|
2071
2167
|
.option('--id <id>', '指定 extra.id 或 title(单条测试)')
|
|
2168
|
+
.option('--offset <n>', '跳过前 N 条任务(从 0 开始),默认 0', parseInt, 0)
|
|
2169
|
+
.option('--limit <n>', '最多处理 N 条任务,默认无限制', parseInt, 0)
|
|
2072
2170
|
.option('--step <step>', '指定执行步骤(可多次指定),如 --step transcode --step transcribe', (val, prev) => {
|
|
2073
2171
|
const allowed = ['download', 'transcode', 'transcribe', 'analyze'];
|
|
2074
2172
|
if (!allowed.includes(val)) {
|
|
@@ -2086,7 +2184,7 @@ if (process.argv[1] === __filename || process.argv[1]?.endsWith('process_videos.
|
|
|
2086
2184
|
.option('--transcribe-timeout <n>', '识别超时(秒),默认 600', parseInt, 600)
|
|
2087
2185
|
.option('--analyze-timeout <n>', 'AI 分析超时(秒),默认 300', parseInt, 300)
|
|
2088
2186
|
.option('--dry-run', '干跑模式,只列任务不执行')
|
|
2089
|
-
.option('--retry-failed <path>', '从报告 JSON
|
|
2187
|
+
.option('--retry-failed <path>', '从报告 JSON 重跑失败项(output/reports/{sheet}/report_xxx.json)')
|
|
2090
2188
|
.option('--init', '复制 .env.example 到当前目录并重命名为 .env')
|
|
2091
2189
|
.option('--file <path>', '指定 Excel 文件路径(优先级高于 EXCEL_FILE 环境变量)')
|
|
2092
2190
|
.option('--input <path>', '指定本地视频文件路径(跳过下载,直接转码→识别→分析)')
|
|
@@ -2216,7 +2314,7 @@ if (process.argv[1] === __filename || process.argv[1]?.endsWith('process_videos.
|
|
|
2216
2314
|
}
|
|
2217
2315
|
|
|
2218
2316
|
// 执行流水线
|
|
2219
|
-
await runUrlTask({
|
|
2317
|
+
const urlResult = await runUrlTask({
|
|
2220
2318
|
watchUrl: parsed.watchUrl,
|
|
2221
2319
|
platform: parsed.platform,
|
|
2222
2320
|
pkey: parsed.pkey,
|
|
@@ -2234,6 +2332,13 @@ if (process.argv[1] === __filename || process.argv[1]?.endsWith('process_videos.
|
|
|
2234
2332
|
whisperAvailable,
|
|
2235
2333
|
});
|
|
2236
2334
|
|
|
2335
|
+
// 生成标准报告 JSON(与 Excel 模式格式一致)
|
|
2336
|
+
if (urlResult) {
|
|
2337
|
+
const config = { steps, max_retries: opts.retry, retry_delay: opts.retryDelay, concurrency: 1, force: opts.force || false };
|
|
2338
|
+
generateReport([urlResult], config, parsed.platform);
|
|
2339
|
+
printReportSummary([urlResult]);
|
|
2340
|
+
}
|
|
2341
|
+
|
|
2237
2342
|
process.exit(0);
|
|
2238
2343
|
}
|
|
2239
2344
|
|
|
@@ -2336,7 +2441,7 @@ if (process.argv[1] === __filename || process.argv[1]?.endsWith('process_videos.
|
|
|
2336
2441
|
}
|
|
2337
2442
|
|
|
2338
2443
|
// 执行流水线
|
|
2339
|
-
await runInputTask({
|
|
2444
|
+
const inputResult = await runInputTask({
|
|
2340
2445
|
inputPath,
|
|
2341
2446
|
stem,
|
|
2342
2447
|
sheetName,
|
|
@@ -2351,6 +2456,13 @@ if (process.argv[1] === __filename || process.argv[1]?.endsWith('process_videos.
|
|
|
2351
2456
|
fileInfo,
|
|
2352
2457
|
});
|
|
2353
2458
|
|
|
2459
|
+
// 生成标准报告 JSON(与 Excel 模式格式一致)
|
|
2460
|
+
if (inputResult) {
|
|
2461
|
+
const config = { steps, max_retries: opts.retry, retry_delay: opts.retryDelay, concurrency: 1, force: opts.force || false };
|
|
2462
|
+
generateReport([inputResult], config, sheetName);
|
|
2463
|
+
printReportSummary([inputResult]);
|
|
2464
|
+
}
|
|
2465
|
+
|
|
2354
2466
|
process.exit(0);
|
|
2355
2467
|
}
|
|
2356
2468
|
|
|
@@ -2359,6 +2471,8 @@ if (process.argv[1] === __filename || process.argv[1]?.endsWith('process_videos.
|
|
|
2359
2471
|
targetSheet: opts.sheet || null,
|
|
2360
2472
|
targetId: opts.id || null,
|
|
2361
2473
|
steps,
|
|
2474
|
+
offset: opts.offset || 0,
|
|
2475
|
+
rowLimit: opts.limit || 0,
|
|
2362
2476
|
maxRetries: opts.retry,
|
|
2363
2477
|
retryDelay: opts.retryDelay,
|
|
2364
2478
|
concurrency: opts.concurrency,
|