@xiping/subtitle 1.0.45 → 1.0.50
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -14
- package/lib/index.d.ts +1 -1
- package/lib/src/converter.d.ts +1 -1
- package/lib/src/converter.js +3 -3
- package/lib/src/json-converter.d.ts +5 -2
- package/lib/src/json-converter.js +46 -4
- package/lib/src/types.d.ts +5 -0
- package/package.json +9 -5
package/README.md
CHANGED
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
- 🎯 简单易用的API
|
|
12
12
|
- 🔄 支持SRT和WebVTT互相转换
|
|
13
13
|
- 📄 支持转换为JSON格式
|
|
14
|
+
- 🌐 支持中英文分词(中文使用jieba-wasm,英文使用compromise)
|
|
14
15
|
|
|
15
16
|
## 安装
|
|
16
17
|
|
|
@@ -92,16 +93,24 @@ const isVttValid = validateVTT(vttContent);
|
|
|
92
93
|
### JSON转换
|
|
93
94
|
|
|
94
95
|
```typescript
|
|
95
|
-
import { srtToJson, vttToJson } from '@xiping/subtitle';
|
|
96
|
+
import { srtToJson, vttToJson, SubtitleLanguage } from '@xiping/subtitle';
|
|
96
97
|
|
|
97
|
-
// SRT转JSON
|
|
98
|
+
// SRT转JSON(中文,使用jieba-wasm分词)
|
|
98
99
|
const srtContent = `1
|
|
99
100
|
00:00:00,000 --> 00:00:04,000
|
|
100
|
-
|
|
101
|
+
中华人民共和国`;
|
|
101
102
|
|
|
102
|
-
const jsonContent = srtToJson(srtContent);
|
|
103
|
+
const jsonContent = await srtToJson(srtContent, 'zh');
|
|
103
104
|
console.log(jsonContent);
|
|
104
|
-
//
|
|
105
|
+
// 输出包含分词结果: [{"index":1,"startTime":"00:00:00,000","endTime":"00:00:04,000","text":"中华人民共和国","words":["中华人民共和国",...]}]
|
|
106
|
+
|
|
107
|
+
// SRT转JSON(英文,使用compromise分词)
|
|
108
|
+
const englishSrt = `1
|
|
109
|
+
00:00:00,000 --> 00:00:04,000
|
|
110
|
+
Hello world, this is a test.`;
|
|
111
|
+
|
|
112
|
+
const englishJson = await srtToJson(englishSrt, 'en');
|
|
113
|
+
// 输出包含英文分词结果
|
|
105
114
|
|
|
106
115
|
// WebVTT转JSON
|
|
107
116
|
const vttContent = `WEBVTT
|
|
@@ -109,7 +118,7 @@ const vttContent = `WEBVTT
|
|
|
109
118
|
00:00:00.000 --> 00:00:04.000
|
|
110
119
|
字幕内容`;
|
|
111
120
|
|
|
112
|
-
const vttJson = vttToJson(vttContent);
|
|
121
|
+
const vttJson = await vttToJson(vttContent, 'zh');
|
|
113
122
|
```
|
|
114
123
|
|
|
115
124
|
### 文件操作示例
|
|
@@ -125,8 +134,8 @@ const srtContent = fs.readFileSync('subtitle.srt', 'utf-8');
|
|
|
125
134
|
const webvttContent = srtToVtt(srtContent);
|
|
126
135
|
fs.writeFileSync('subtitle.vtt', webvttContent, 'utf-8');
|
|
127
136
|
|
|
128
|
-
// 转换为JSON
|
|
129
|
-
const jsonContent = srtToJson(srtContent);
|
|
137
|
+
// 转换为JSON(需要指定语言类型)
|
|
138
|
+
const jsonContent = await srtToJson(srtContent, 'zh');
|
|
130
139
|
fs.writeFileSync('subtitle.json', jsonContent, 'utf-8');
|
|
131
140
|
```
|
|
132
141
|
|
|
@@ -192,25 +201,33 @@ fs.writeFileSync('subtitle.json', jsonContent, 'utf-8');
|
|
|
192
201
|
**返回值:**
|
|
193
202
|
- 是否为有效的WebVTT格式
|
|
194
203
|
|
|
195
|
-
### `srtToJson(srtContent: string): string`
|
|
204
|
+
### `srtToJson(srtContent: string, language?: SubtitleLanguage): Promise<string>`
|
|
196
205
|
|
|
197
|
-
将SRT字幕转换为JSON
|
|
206
|
+
将SRT字幕转换为JSON格式字符串,并自动进行分词处理。
|
|
198
207
|
|
|
199
208
|
**参数:**
|
|
200
209
|
- `srtContent` - SRT格式的字幕内容
|
|
210
|
+
- `language` - 字幕语言类型,可选值:`'zh'`(中文,默认)或 `'en'`(英文)
|
|
201
211
|
|
|
202
212
|
**返回值:**
|
|
203
|
-
- JSON格式字符串(格式化,缩进2空格)
|
|
213
|
+
- Promise,解析为JSON格式字符串(格式化,缩进2空格)
|
|
214
|
+
- JSON中包含 `words` 字段,为分词后的单词数组
|
|
215
|
+
- 中文使用 `jieba-wasm` 进行分词
|
|
216
|
+
- 英文使用 `compromise` 进行分词
|
|
204
217
|
|
|
205
|
-
### `vttToJson(vttContent: string): string`
|
|
218
|
+
### `vttToJson(vttContent: string, language?: SubtitleLanguage): Promise<string>`
|
|
206
219
|
|
|
207
|
-
将WebVTT字幕转换为JSON
|
|
220
|
+
将WebVTT字幕转换为JSON格式字符串,并自动进行分词处理。
|
|
208
221
|
|
|
209
222
|
**参数:**
|
|
210
223
|
- `vttContent` - WebVTT格式的字幕内容
|
|
224
|
+
- `language` - 字幕语言类型,可选值:`'zh'`(中文,默认)或 `'en'`(英文)
|
|
211
225
|
|
|
212
226
|
**返回值:**
|
|
213
|
-
- JSON格式字符串(格式化,缩进2空格)
|
|
227
|
+
- Promise,解析为JSON格式字符串(格式化,缩进2空格)
|
|
228
|
+
- JSON中包含 `words` 字段,为分词后的单词数组
|
|
229
|
+
- 中文使用 `jieba-wasm` 进行分词
|
|
230
|
+
- 英文使用 `compromise` 进行分词
|
|
214
231
|
|
|
215
232
|
### `convertTimeFormat(srtTime: string): string`
|
|
216
233
|
|
|
@@ -225,11 +242,15 @@ fs.writeFileSync('subtitle.json', jsonContent, 'utf-8');
|
|
|
225
242
|
## 类型定义
|
|
226
243
|
|
|
227
244
|
```typescript
|
|
245
|
+
// 字幕语言类型
|
|
246
|
+
type SubtitleLanguage = 'zh' | 'en';
|
|
247
|
+
|
|
228
248
|
interface SubtitleEntry {
|
|
229
249
|
index: number; // 字幕序号
|
|
230
250
|
startTime: string; // 开始时间
|
|
231
251
|
endTime: string; // 结束时间
|
|
232
252
|
text: string; // 字幕文本
|
|
253
|
+
words?: string[]; // 分词后的单词数组(可选,仅在转换为JSON时生成)
|
|
233
254
|
}
|
|
234
255
|
```
|
|
235
256
|
|
package/lib/index.d.ts
CHANGED
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* 00:00:00.000 --> 00:00:04.000
|
|
13
13
|
* 字幕内容
|
|
14
14
|
*/
|
|
15
|
-
export type { SubtitleEntry } from "./src/types.js";
|
|
15
|
+
export type { SubtitleEntry, SubtitleLanguage } from "./src/types.js";
|
|
16
16
|
export { parseSRT, parseVTT, validateSRT, validateVTT } from "./src/parser.js";
|
|
17
17
|
export { convertTimeFormat, generateWebVTT, srtToVtt, } from "./src/converter.js";
|
|
18
18
|
export { srtToJson, vttToJson } from "./src/json-converter.js";
|
package/lib/src/converter.d.ts
CHANGED
package/lib/src/converter.js
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import { parseSRT } from './parser.js';
|
|
1
|
+
import { parseSRT } from "./parser.js";
|
|
2
2
|
/**
|
|
3
3
|
* 将SRT时间格式转换为WebVTT时间格式
|
|
4
4
|
* @param srtTime SRT时间格式 (HH:MM:SS,mmm)
|
|
5
5
|
* @returns WebVTT时间格式 (HH:MM:SS.mmm)
|
|
6
6
|
*/
|
|
7
7
|
export function convertTimeFormat(srtTime) {
|
|
8
|
-
return srtTime.replace(',', '.');
|
|
8
|
+
return srtTime.replace(",", ".");
|
|
9
9
|
}
|
|
10
10
|
/**
|
|
11
11
|
* 将字幕条目数组转换为WebVTT格式
|
|
@@ -13,7 +13,7 @@ export function convertTimeFormat(srtTime) {
|
|
|
13
13
|
* @returns WebVTT格式字符串
|
|
14
14
|
*/
|
|
15
15
|
export function generateWebVTT(entries) {
|
|
16
|
-
let webvtt = 'WEBVTT\n\n';
|
|
16
|
+
let webvtt = "WEBVTT\n\n";
|
|
17
17
|
for (const entry of entries) {
|
|
18
18
|
const startTime = convertTimeFormat(entry.startTime);
|
|
19
19
|
const endTime = convertTimeFormat(entry.endTime);
|
|
@@ -1,12 +1,15 @@
|
|
|
1
|
+
import type { SubtitleLanguage } from './types.js';
|
|
1
2
|
/**
|
|
2
3
|
* 将SRT字幕转换为JSON格式
|
|
3
4
|
* @param srtContent SRT文件内容
|
|
5
|
+
* @param language 字幕语言类型,默认为中文
|
|
4
6
|
* @returns JSON格式字符串
|
|
5
7
|
*/
|
|
6
|
-
export declare function srtToJson(srtContent: string): string;
|
|
8
|
+
export declare function srtToJson(srtContent: string, language?: SubtitleLanguage): Promise<string>;
|
|
7
9
|
/**
|
|
8
10
|
* 将WebVTT字幕转换为JSON格式
|
|
9
11
|
* @param vttContent WebVTT文件内容
|
|
12
|
+
* @param language 字幕语言类型,默认为中文
|
|
10
13
|
* @returns JSON格式字符串
|
|
11
14
|
*/
|
|
12
|
-
export declare function vttToJson(vttContent: string): string;
|
|
15
|
+
export declare function vttToJson(vttContent: string, language?: SubtitleLanguage): Promise<string>;
|
|
@@ -1,19 +1,61 @@
|
|
|
1
1
|
import { parseSRT, parseVTT } from './parser.js';
|
|
2
|
+
import { cut } from 'jieba-wasm';
|
|
3
|
+
import nlp from 'compromise';
|
|
4
|
+
/**
|
|
5
|
+
* 对文本进行分词
|
|
6
|
+
* @param text 待分词的文本
|
|
7
|
+
* @param language 语言类型
|
|
8
|
+
* @returns 分词后的单词数组
|
|
9
|
+
*/
|
|
10
|
+
async function tokenizeText(text, language) {
|
|
11
|
+
if (!text || text.trim().length === 0) {
|
|
12
|
+
return [];
|
|
13
|
+
}
|
|
14
|
+
if (language === 'zh') {
|
|
15
|
+
// 使用 jieba-wasm 进行中文分词
|
|
16
|
+
return cut(text, true);
|
|
17
|
+
}
|
|
18
|
+
else if (language === 'en') {
|
|
19
|
+
// 使用 compromise 进行英文分词
|
|
20
|
+
const doc = nlp(text);
|
|
21
|
+
return doc.terms().out('array');
|
|
22
|
+
}
|
|
23
|
+
return [];
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* 对字幕条目进行分词处理
|
|
27
|
+
* @param entries 字幕条目数组
|
|
28
|
+
* @param language 语言类型
|
|
29
|
+
* @returns 添加了分词信息的字幕条目数组
|
|
30
|
+
*/
|
|
31
|
+
async function addTokenization(entries, language) {
|
|
32
|
+
return Promise.all(entries.map(async (entry) => {
|
|
33
|
+
const words = await tokenizeText(entry.text, language);
|
|
34
|
+
return {
|
|
35
|
+
...entry,
|
|
36
|
+
words,
|
|
37
|
+
};
|
|
38
|
+
}));
|
|
39
|
+
}
|
|
2
40
|
/**
|
|
3
41
|
* 将SRT字幕转换为JSON格式
|
|
4
42
|
* @param srtContent SRT文件内容
|
|
43
|
+
* @param language 字幕语言类型,默认为中文
|
|
5
44
|
* @returns JSON格式字符串
|
|
6
45
|
*/
|
|
7
|
-
export function srtToJson(srtContent) {
|
|
46
|
+
export async function srtToJson(srtContent, language = 'zh') {
|
|
8
47
|
const entries = parseSRT(srtContent);
|
|
9
|
-
|
|
48
|
+
const entriesWithWords = await addTokenization(entries, language);
|
|
49
|
+
return JSON.stringify(entriesWithWords, null, 2);
|
|
10
50
|
}
|
|
11
51
|
/**
|
|
12
52
|
* 将WebVTT字幕转换为JSON格式
|
|
13
53
|
* @param vttContent WebVTT文件内容
|
|
54
|
+
* @param language 字幕语言类型,默认为中文
|
|
14
55
|
* @returns JSON格式字符串
|
|
15
56
|
*/
|
|
16
|
-
export function vttToJson(vttContent) {
|
|
57
|
+
export async function vttToJson(vttContent, language = 'zh') {
|
|
17
58
|
const entries = parseVTT(vttContent);
|
|
18
|
-
|
|
59
|
+
const entriesWithWords = await addTokenization(entries, language);
|
|
60
|
+
return JSON.stringify(entriesWithWords, null, 2);
|
|
19
61
|
}
|
package/lib/src/types.d.ts
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@xiping/subtitle",
|
|
3
|
-
"version": "1.0.45",
|
|
3
|
+
"version": "1.0.50",
|
|
4
4
|
"description": "字幕文件处理工具,支持SRT和WebVTT格式的解析、转换和验证",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"author": "The-End-Hero <527409987@qq.com>",
|
|
@@ -18,15 +18,19 @@
|
|
|
18
18
|
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js",
|
|
19
19
|
"build": "tsc"
|
|
20
20
|
},
|
|
21
|
-
"gitHead": "
|
|
21
|
+
"gitHead": "35571812778b756cd83d9d21fc4b0b3f3a1926e3",
|
|
22
22
|
"publishConfig": {
|
|
23
23
|
"access": "public",
|
|
24
24
|
"registry": "https://registry.npmjs.org/"
|
|
25
25
|
},
|
|
26
26
|
"devDependencies": {
|
|
27
27
|
"@types/jest": "^30.0.0",
|
|
28
|
-
"jest": "^30.
|
|
29
|
-
"ts-jest": "^29.4.
|
|
30
|
-
"typescript": "^5.
|
|
28
|
+
"jest": "^30.2.0",
|
|
29
|
+
"ts-jest": "^29.4.6",
|
|
30
|
+
"typescript": "^5.9.3"
|
|
31
|
+
},
|
|
32
|
+
"dependencies": {
|
|
33
|
+
"compromise": "^14.14.5",
|
|
34
|
+
"jieba-wasm": "^2.4.0"
|
|
31
35
|
}
|
|
32
36
|
}
|