@dan-uni/dan-any-plugin-detaolu 1.4.8 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,615 +0,0 @@
1
- /**
2
- * @author: xmcp(代码主要逻辑来源)
3
- * @see: https://github.com/xmcp/pakku.js/blob/master/pakkujs/core/combine_worker.ts
4
- * @see: https://github.com/xmcp/pakku.js/blob/master/pakkujs/background/config.ts
5
- * @see: https://github.com/xmcp/pakku.js/blob/master/pakkujs/page/options.html
6
- * @see: https://github.com/xmcp/pakku.js/blob/master/pakkujs/page/options.ts
7
- * @license: GPL-3.0
8
- * 本文件内代码来源见上,经部分修改,并整合config注释
9
- */
10
-
11
- import fs from 'fs-extra'
12
- import type { DanmuChunk, DanmuClusterOutput, DanmuObject, int } from './types'
13
-
14
- import {
15
- begin_chunk,
16
- begin_index_lock,
17
- detect_similarity,
18
- init as sim_init,
19
- } from './similarity_stub'
20
- import { Queue, Stats } from './types'
21
-
22
export const DEFAULT_CONFIG = {
  // Danmaku merging
  /**
   * Time threshold: merge duplicate danmaku whose time difference is within n seconds.
   * A very long threshold (roughly above 60 seconds?) may make the program run slowly.
   */
  THRESHOLD: 30,
  /**
   * Edit-distance merge threshold:
   * uses edit distance to catch danmaku that are not identical but close in content (e.g. typos).
   * Effectively kills pairs such as "你指尖跃动的电光" and "你之间跃动的电光".
   * @example disabled (0), light (≤3), medium (≤5), strong (≤8)
   */
  MAX_DIST: 5,
  /**
   * Term-frequency-vector merge threshold:
   * uses the angle between 2-gram frequency vectors to catch danmaku that are not identical but similar.
   * Effectively kills pairs such as "yeah!~" and "yeah!~yeah!~yeah!~yeah!~".
   * @example disabled (1000), light (60%), medium (45%), strong (30%)
   */
  MAX_COSINE: 45,
  /**
   * Detect homophone danmaku:
   * converts common Chinese characters to pinyin before comparing.
   * Effectively kills pairs such as "布拉迪巴特福来" and "布拉迪·八德福莱".
   */
  TRIM_PINYIN: true,
  // When comparing text:
  TRIM_ENDING: true, // ignore trailing punctuation
  TRIM_SPACE: true, // ignore redundant spaces
  TRIM_WIDTH: true, // ignore fullwidth / halfwidth differences

  // Exception settings
  /**
   * Content replacement: danmaku matching these rules have their content replaced
   * before the merge decision is made. Each entry is [regex source, replacement].
   */
  FORCELIST: [
    ['^23{2,}$', '23333'],
    ['^6{3,}$', '66666'],
  ],
  /**
   * When a content-replacement rule hits: keep trying the following rules.
   */
  FORCELIST_CONTINUE_ON_MATCH: true,
  /**
   * When a content-replacement rule hits: use the replaced text even if no merge was triggered.
   */
  FORCELIST_APPLY_SINGULAR: false,
  /**
   * Force ignore: danmaku matching these rules are never merged;
   * takes priority over the content-replacement rules.
   */
  WHITELIST: [] as [string, string][],
  /**
   * Force delete: danmaku matching these rules are deleted outright
   * (the original comment marks this as "not implemented").
   * NOTE(review): blacklisted_meta in this file does read this list, treating
   * entry[0] as a truthy "is regex" flag and entry[1] as the pattern — confirm
   * the intended status and whether [string, string] is the right element type.
   */
  BLACKLIST: [] as [string, string][],
  /**
   * Merge danmaku of different types (when unchecked, bottom danmaku are not
   * merged together with scrolling danmaku).
   */
  CROSS_MODE: true,
  // Whether particular danmaku types are processed (false = left untouched):
  PROC_TYPE7: true, // advanced (special) danmaku
  PROC_TYPE4: true, // bottom danmaku
  PROC_POOL1: false, // subtitle danmaku (located in danmaku pool 1)

  // // Display settings
  // DANMU_MARK: 'prefix' as 'prefix' | 'suffix' | 'off', // danmaku count mark (prefix / suffix / off)
  // MARK_THRESHOLD: 1, // only shown when the number is greater than n
  // DANMU_SUBSCRIPT: true, // render the count mark as a subscript (₍₂₎/[x2])
  // // ENLARGE: true,
  // // SHRINK_THRESHOLD: 0,
  // /**
  //  * Automatic danmaku selection:
  //  * when the instantaneous danmaku density exceeds the threshold, delete low-weight
  //  * danmaku proportionally, unmerged danmaku first
  //  * @example disabled (0), light (>120), medium (>75), strong (>50)
  //  */
  // DROP_THRESHOLD: 0,
  // /**
  //  * Prefer showing merged danmaku as fixed danmaku:
  //  * when scrolling danmaku are merged with top / bottom danmaku, show the result at the top / bottom
  //  */
  // MODE_ELEVATION: true,
  // /**
  //  * Merged danmaku are shown at the time point of the n-th percentile danmaku
  //  * @example 0% (0), 20% (20), 50% (50)
  //  */
  // REPRESENTATIVE_PERCENT: 20,
}
110
-
111
/** User-facing options: any subset of DEFAULT_CONFIG's keys. */
export type Config = Partial<typeof DEFAULT_CONFIG>
/** Fully populated options, i.e. the exact shape of DEFAULT_CONFIG. */
type ResolvedConfig = typeof DEFAULT_CONFIG
113
-
114
/** Intermediate representation of one danmaku while it is being clustered. */
interface DanmuIr {
  /** The source danmaku object. */
  obj: DanmuObject
  /** Normalised text handed to the similarity algorithm. */
  str: string // for similarity algorithm
  /** Index produced by make_ptr_idx. */
  ptr_idx: int
  /** 'ORIG' initially; overwritten with the similarity reason when joined to a cluster. */
  sim_reason: string
}
120
-
121
/**
 * Trailing characters that detaolu_meta strips when TRIM_ENDING is enabled.
 *
 * The literal as found contained each of `,`, `?`, `!` and `~` twice, which is
 * what remains when the fullwidth variants are folded to ASCII by a
 * width-normalising text pipeline. The fullwidth forms are restored here and
 * every non-ASCII member is written as a \u escape so it cannot be folded again.
 * NOTE(review): reconstruction based on the duplicated characters — confirm
 * against the upstream pakku source.
 */
const ENDING_CHARS: Set<string> = new Set(
  '.\u3002' + // . and ideographic full stop
    ',\uFF0C' + // , and fullwidth comma
    '/' +
    '?\uFF1F' + // ? and fullwidth question mark
    '!\uFF01' + // ! and fullwidth exclamation mark
    '\u2026' + // horizontal ellipsis
    '~\uFF5E' + // ~ and fullwidth tilde
    '@^' +
    '\u3001' + // ideographic comma
    '+=-_' +
    '\u2642\u2640' + // male / female signs
    ' ',
)
122
- // const TRIM_EXTRA_SPACE_RE = /[ \u3000]+/g
123
- // const TRIM_CJK_SPACE_RE =
124
- // /([\u3000-\u9FFF\uFF00-\uFFEF]) (?=[\u3000-\u9FFF\uFF00-\uFFEF])/g
125
/**
 * Fullwidth -> halfwidth lookup used by detaolu_meta when TRIM_WIDTH is enabled.
 *
 * The table as found listed the space, the ten digits, all 32 ASCII punctuation
 * marks and the 52 Latin letters, but every key had been folded to its ASCII
 * form (leaving an identity map and the invalid literals `'''` and `'\'`).
 * Those 94 printable characters are exactly U+0021..U+007E, whose fullwidth
 * counterparts are U+FF01..U+FF5E at a constant offset, so the table is built
 * from code points instead of being spelled out. The ideographic space U+3000
 * maps to a plain space.
 */
const WIDTH_TABLE: Map<string, string> = new Map<string, string>([
  ['\u3000', ' '],
  ...Array.from(
    { length: 0x7e - 0x21 + 1 },
    (_, offset): [string, string] => [
      String.fromCharCode(0xff01 + offset),
      String.fromCharCode(0x21 + offset),
    ],
  ),
])
224
-
225
/**
 * Builds the text normaliser applied to every danmaku before similarity
 * comparison ("de-taolu").
 *
 * The returned function, in order:
 *  1. drops trailing ENDING_CHARS (TRIM_ENDING), unless the whole text consists of them;
 *  2. maps characters through WIDTH_TABLE (TRIM_WIDTH);
 *  3. collapses runs of spaces and removes a single space between two
 *     CJK / fullwidth characters (TRIM_SPACE);
 *  4. applies the FORCELIST replacement rules.
 *
 * @param config resolved configuration
 * @returns a function yielding `[any FORCELIST rule matched, normalised text]`
 */
function detaolu_meta(
  config: ResolvedConfig,
): (text: string) => [boolean, string] {
  const { TRIM_ENDING, TRIM_SPACE, TRIM_WIDTH } = config
  const stop_after_first_hit = !config.FORCELIST_CONTINUE_ON_MATCH
  const rules = config.FORCELIST.map<[RegExp, string]>(([pattern, repl]) => [
    new RegExp(pattern, 'giu'),
    repl,
  ])

  return (inp: string) => {
    // Number of leading UTF-16 units that survive ending-trim.
    let keep = inp.length
    if (TRIM_ENDING) {
      // charAt(-1) is '' which is never in the set, so the scan stops at 0.
      while (ENDING_CHARS.has(inp.charAt(keep - 1))) keep -= 1
      // Text made solely of ending characters is left untouched.
      if (keep === 0) keep = inp.length
    }

    const head = inp.slice(0, keep)
    let text = TRIM_WIDTH
      ? head
          .split('')
          .map((ch) => WIDTH_TABLE.get(ch) || ch)
          .join('')
      : head

    if (TRIM_SPACE) {
      const collapsed = text.replace(/[ \u3000]+/g, ' ')
      text = collapsed.replace(
        /([\u3000-\u9FFF\uFF00-\uFFEF]) (?=[\u3000-\u9FFF\uFF00-\uFFEF])/g,
        '$1',
      )
    }

    let hit = false
    for (const [re, repl] of rules) {
      if (!re.test(text)) continue
      text = text.replace(re, repl)
      hit = true
      if (stop_after_first_hit) break
    }

    return [hit, text]
  }
}
285
-
286
/**
 * Builds the whitelist predicate.
 *
 * Each WHITELIST entry's first element is compiled as a case-insensitive,
 * unicode-aware regular expression.
 *
 * @param config resolved configuration
 * @returns a function telling whether a text matches any whitelist pattern
 *          (always false when the list is empty)
 */
function whitelisted_meta(config: ResolvedConfig): (text: string) => boolean {
  const patterns: RegExp[] = []
  for (const entry of config.WHITELIST) {
    patterns.push(new RegExp(entry[0], 'iu'))
  }

  return patterns.length > 0
    ? (text: string) => patterns.some((re) => re.test(text))
    : () => false
}
296
-
297
/**
 * Builds the blacklist matcher.
 *
 * For each BLACKLIST entry, a truthy first element means the second element is
 * a regular-expression source (compiled without flags); otherwise the second
 * element is a plain substring compared case-insensitively.
 *
 * @param config resolved configuration
 * @returns a function yielding a label for the first rule that matches
 *          (` <substring>` or ` /<regex source>/`, both with a leading space),
 *          or null when nothing matches
 */
function blacklisted_meta(
  config: ResolvedConfig,
): (text: string) => string | null {
  const rules = config.BLACKLIST.map(([is_regex, pattern]) =>
    is_regex ? new RegExp(pattern) : pattern.toLowerCase(),
  )

  if (rules.length === 0) return () => null

  return (text: string) => {
    const lowered = text.toLowerCase()
    for (const rule of rules) {
      if (typeof rule === 'string') {
        if (lowered.includes(rule)) return ` ${rule}`
      } else if (rule.test(text)) {
        return ` /${rule.source}/`
      }
    }
    return null
  }
}
325
-
326
/**
 * Pulls the display text out of a JSON-array-encoded special danmaku.
 *
 * The text is taken from index 4 of the parsed array. The input is returned
 * unchanged when it is not valid JSON, does not parse to an array, or when
 * element 4 is missing or not a string — so the declared `string` return type
 * always holds and callers can safely apply string methods to the result.
 *
 * @param text raw danmaku content
 * @returns the embedded text, or `text` itself when nothing usable is found
 */
function extract_special_danmu(text: string): string {
  let parsed: unknown
  try {
    parsed = JSON.parse(text)
  } catch {
    // Not JSON: treat the content as plain text.
    return text
  }

  const inner: unknown = Array.isArray(parsed) ? parsed[4] : undefined
  return typeof inner === 'string' ? inner : text
}
332
-
333
/**
 * Removes every carriage return, line feed and tab from the text, then trims
 * leading and trailing whitespace.
 */
function trim_dispstr(text: string): string {
  const single_line = text.replace(/([\r\n\t])/g, '')
  return single_line.trim()
}
339
-
340
/**
 * Picks the string of median length from the candidates.
 *
 * The input array is not modified. With an even number of candidates the
 * upper-middle element of the length-sorted copy is returned.
 *
 * @param strs candidate strings
 * @returns the element at index floor(n / 2) after sorting by ascending length
 */
function select_median_length(strs: string[]): string {
  if (strs.length === 1) return strs[0]

  const by_length = [...strs]
  by_length.sort((lhs, rhs) => lhs.length - rhs.length)
  return by_length[Math.floor(by_length.length / 2)]
}
350
-
351
/**
 * Copies exactly the bytes viewed by `array` into a fresh buffer.
 *
 * Slicing by byteOffset / byteLength matters when the view covers only part
 * of a larger backing buffer.
 */
function u8array_to_arraybuffer(array: Uint8Array): ArrayBuffer {
  const { buffer, byteOffset, byteLength } = array
  const end = byteOffset + byteLength
  return buffer.slice(byteOffset, end) as ArrayBuffer
}
357
-
358
/**
 * Initialises the similarity WASM module.
 *
 * @param wasm_mod optional pre-loaded module bytes; when omitted,
 *                 `similarity-gen.wasm` is read from next to this module
 */
async function load_wasm(wasm_mod?: ArrayBuffer): Promise<void> {
  if (wasm_mod) {
    await sim_init(wasm_mod)
    return
  }

  // Pass the file: URL object itself to fs instead of its `.pathname`:
  // `.pathname` stays percent-encoded (spaces, non-ASCII directories) and on
  // Windows carries a leading slash before the drive letter, so it is not a
  // reliable filesystem path.
  const wasm_url = new URL('similarity-gen.wasm', import.meta.url)
  const wasm_u8 = await fs.readFile(wasm_url)
  await sim_init(u8array_to_arraybuffer(wasm_u8))
}
368
-
369
/**
 * Encodes a danmaku index as a cluster pointer: indices from the next chunk
 * are mapped to the negative range (-1 - idx), others are returned as is.
 */
function make_ptr_idx(idx: int, is_next_chunk: boolean): int {
  if (is_next_chunk) {
    return -1 - idx
  }
  return idx
}
372
-
373
/**
 * Clusters near-duplicate danmaku of one chunk.
 *
 * Each danmaku is filtered (pool / mode / blacklist / whitelist), normalised,
 * then compared against the clusters still inside the time window via
 * detect_similarity. Filtered-out danmaku that are merely ignored become
 * single-element clusters; blacklisted ones go to `deleted_chunk`.
 *
 * NOTE(review): the sliding window compares each danmaku's time_ms with the
 * first member of the oldest open cluster, which presumably requires
 * `chunk.objs` to be ordered by time_ms — confirm against callers.
 *
 * @param chunk  danmaku to process
 * @param config partial overrides of DEFAULT_CONFIG; keys that are missing or
 *               explicitly `undefined` fall back to the default value
 * @returns the clusters, run statistics and blacklisted danmaku
 */
async function merge(
  chunk: DanmuChunk<DanmuObject>,
  // next_chunk: DanmuChunk<DanmuObject>,
  config: Config = DEFAULT_CONFIG,
): Promise<DanmuClusterOutput> {
  // A plain spread would let `{ THRESHOLD: undefined }` overwrite the default
  // (yielding a NaN window or a crash on FORCELIST.map), so undefined
  // overrides are dropped first.
  const overrides = Object.fromEntries(
    Object.entries(config).filter(([, value]) => value !== undefined),
  ) as Config
  const local_config: ResolvedConfig = { ...DEFAULT_CONFIG, ...overrides }

  await load_wasm()

  begin_chunk(local_config)

  const ret: DanmuClusterOutput = {
    clusters: [],
    stats: new Stats(),
    deleted_chunk: [],
  }

  /** Emits a one-element cluster for a danmaku that was excluded from merging. */
  function apply_single_cluster(idx: int, obj: DanmuObject, desc: string) {
    ret.clusters.push({
      peers_ptr: [[idx, 'IGN']],
      desc: [desc],
      chosen_str: obj.content,
      // danuni
      danuni_count: 1,
      // danuni_senders: [obj.danuni_sender],
      danuni_dans: [obj],
    })
  }

  /** Emits a finished cluster, choosing its representative text. */
  function apply_cluster(irs: DanmuIr[]) {
    if (irs.length === 1) {
      ret.clusters.push({
        peers_ptr: irs.map((ir) => [ir.ptr_idx, ir.sim_reason]),
        desc: [],
        chosen_str: irs[0].obj.content,
        // danuni
        danuni_count: irs.length,
        // danuni_senders: irs.map((ir) => ir.obj.danuni_sender),
        danuni_dans: irs.map((ir) => ir.obj),
      })
    } else {
      // Count occurrences of each normalised text and track the most frequent ones.
      const text_cnts = new Map<string, number>()
      let most_texts: string[] = []
      let most_cnt = 0

      for (const ir of irs) {
        const text = ir.str
        const cnt = 1 + (text_cnts.get(text) ?? 0)
        text_cnts.set(text, cnt)

        if (cnt > most_cnt) {
          most_texts = [text]
          most_cnt = cnt
        } else if (cnt === most_cnt) {
          most_texts.push(text)
        }
      }

      // Ties between equally frequent texts are broken by median length.
      const most_text = select_median_length(most_texts)

      ret.clusters.push({
        peers_ptr: irs.map((ir) => [ir.ptr_idx, ir.sim_reason]),
        desc: most_cnt > 1 ? [`采用了出现 ${most_cnt} 次的文本`] : [],
        chosen_str: most_text,
        // danuni
        // NOTE(review): this is the frequency of the chosen text, not the
        // cluster size (irs.length) — confirm that is what consumers expect.
        danuni_count: most_cnt,
        // danuni_senders: irs.map((ir) => ir.obj.danuni_sender),
        danuni_dans: irs.map((ir) => ir.obj),
      })
    }
  }

  const detaolu = detaolu_meta(local_config)
  const whitelisted = whitelisted_meta(local_config)
  const blacklisted = blacklisted_meta(local_config)

  /**
   * Converts raw danmaku into IRs, dropping the ones that must not be merged.
   * Statistics and side outputs are recorded only when `s` is non-null.
   */
  function obj_to_ir(
    objs: DanmuObject[],
    s: Stats | null,
    is_next_chunk: boolean,
  ): DanmuIr[] {
    return objs
      .map((obj, idx): DanmuIr | null => {
        if (!local_config.PROC_POOL1 && obj.pool === 1) {
          if (s) {
            s.ignored_type++
            apply_single_cluster(idx, obj, '已忽略字幕弹幕,可以在选项中修改')
          }
          return null
        }
        // Mode numbers differ from upstream pakku, which tests mode === 7 here.
        if (!local_config.PROC_TYPE7 && obj.mode === 4) {
          if (s) {
            s.ignored_type++
            apply_single_cluster(idx, obj, '已忽略特殊弹幕,可以在选项中修改')
          }
          return null
        }
        // Upstream pakku tests mode === 4 here.
        if (!local_config.PROC_TYPE4 && obj.mode === 1) {
          if (s) {
            s.ignored_type++
            apply_single_cluster(idx, obj, '已忽略底部弹幕,可以在选项中修改')
          }
          return null
        }
        // Upstream additionally ignores mode 8 (code danmaku) and mode 9
        // (BAS danmaku) at this point; that handling is disabled in this port.

        const disp_str = trim_dispstr(
          // upstream: obj.mode === 7 && obj.content[0] === '['
          obj.mode === 4 && obj.content[0] === '['
            ? extract_special_danmu(obj.content)
            : obj.content,
        )

        // upstream: if (obj.mode !== 8 && obj.mode !== 9)
        if (obj.mode !== 4) {
          const matched = blacklisted(disp_str)
          if (matched) {
            if (s) {
              s.deleted_blacklist++
              s.deleted_blacklist_each[matched] =
                (s.deleted_blacklist_each[matched] || 0) + 1
              ret.deleted_chunk.push({
                ...obj,
                pakku: {
                  deleted_reason: `命中黑名单:${matched}`,
                },
              })
            }
            return null
          }
        }
        if (whitelisted(disp_str)) {
          if (s) {
            s.ignored_whitelist++
            apply_single_cluster(idx, obj, '命中白名单')
          }
          return null
        }

        const [matched_taolu, detaolued] = detaolu(disp_str)

        if (matched_taolu) {
          if (s) s.num_taolu_matched++
          // Optionally keep the replaced text even if the danmaku ends up alone.
          if (local_config.FORCELIST_APPLY_SINGULAR)
            obj = {
              ...obj,
              content: detaolued,
            }
        }

        return {
          obj,
          str: detaolued,
          ptr_idx: make_ptr_idx(idx, is_next_chunk),
          sim_reason: 'ORIG',
        }
      })
      .filter((ir): ir is DanmuIr => ir !== null)
  }

  const danmus = obj_to_ir(chunk.objs, ret.stats, false)
  // const next_chunk_danmus = obj_to_ir(next_chunk.objs, null, true)

  // Open clusters still inside the time window, oldest first.
  const nearby_danmus: Queue<DanmuIr[]> = new Queue()

  const THRESHOLD_MS = local_config.THRESHOLD * 1000

  for (const dm of danmus) {
    // Flush clusters whose first member is more than THRESHOLD_MS older than dm.
    while (true) {
      const peeked = nearby_danmus.peek()
      if (
        peeked === null ||
        dm.obj.time_ms - peeked[0].obj.time_ms <= THRESHOLD_MS
      )
        break
      apply_cluster(peeked)
      nearby_danmus.pop()
    }

    const sim = detect_similarity(
      dm.str,
      dm.obj.mode,
      nearby_danmus.index_l,
      ret.stats,
    )
    if (sim === null) {
      // Nothing similar in the window: dm starts a new cluster.
      nearby_danmus.push([dm])
    } else {
      // sim.idx_diff is an offset back from the queue's right index.
      const candidate =
        nearby_danmus.storage[nearby_danmus.index_r - sim.idx_diff]
      dm.sim_reason = sim.reason
      candidate.push(dm)
    }
  }

  // now process last few clusters with the next chunk
  begin_index_lock()
  // Upstream matches the danmaku of the next chunk against the remaining open
  // clusters here (without opening new ones); that step is disabled in this
  // port because no next chunk is passed in.

  // finally apply remaining clusters
  for (const candidate of nearby_danmus) {
    apply_cluster(candidate)
  }

  return ret
}

export default merge