koishi-plugin-best-cave 2.2.7 → 2.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/HashManager.d.ts +37 -55
- package/lib/index.d.ts +1 -1
- package/lib/index.js +186 -237
- package/package.json +1 -1
package/lib/HashManager.d.ts
CHANGED
|
@@ -7,12 +7,12 @@ import { FileManager } from './FileManager';
|
|
|
7
7
|
/**
 * One hash record belonging to a cave entry.
 */
export interface CaveHashObject {
  /** Id of the cave the hash was generated for. */
  cave: number;
  /** The hash value, stored as a string. */
  hash: string;
  /**
   * Which hash variant this record holds: `simhash`, `phash_g`, or one of
   * the four `phash_q1`..`phash_q4` variants.
   */
  type:
    | 'simhash'
    | 'phash_g'
    | 'phash_q1'
    | 'phash_q2'
    | 'phash_q3'
    | 'phash_q4';
}
|
|
12
12
|
/**
|
|
13
13
|
* @class HashManager
|
|
14
|
-
* @description
|
|
15
|
-
*
|
|
14
|
+
* @description 负责生成、存储和比较文本与图片的哈希值。
|
|
15
|
+
* 实现了基于 Simhash 的文本查重和基于 DCT 感知哈希 (pHash) 的图片查重方案。
|
|
16
16
|
*/
|
|
17
17
|
export declare class HashManager {
|
|
18
18
|
private ctx;
|
|
@@ -22,36 +22,40 @@ export declare class HashManager {
|
|
|
22
22
|
/**
|
|
23
23
|
* @constructor
|
|
24
24
|
* @param ctx - Koishi 上下文,用于数据库操作。
|
|
25
|
-
* @param config -
|
|
25
|
+
* @param config - 插件配置,用于获取相似度阈值等。
|
|
26
26
|
* @param logger - 日志记录器实例。
|
|
27
|
-
* @param fileManager -
|
|
27
|
+
* @param fileManager - 文件管理器实例,用于读取图片文件。
|
|
28
28
|
*/
|
|
29
29
|
constructor(ctx: Context, config: Config, logger: Logger, fileManager: FileManager);
|
|
30
30
|
/**
|
|
31
|
-
* @description
|
|
31
|
+
* @description 注册与哈希功能相关的 `.hash` 和 `.check` 子命令。
|
|
32
32
|
* @param cave - 主 `cave` 命令实例。
|
|
33
33
|
*/
|
|
34
34
|
registerCommands(cave: any): void;
|
|
35
35
|
/**
|
|
36
36
|
* @description 检查数据库中所有回声洞,为没有哈希记录的历史数据生成哈希。
|
|
37
|
-
* @returns
|
|
37
|
+
* @returns 一个包含操作结果的报告字符串。
|
|
38
38
|
*/
|
|
39
39
|
generateHashesForHistoricalCaves(): Promise<string>;
|
|
40
40
|
/**
|
|
41
|
-
* @description
|
|
41
|
+
* @description 为单个回声洞对象生成所有类型的哈希(文本+图片)。
|
|
42
42
|
* @param cave - 回声洞对象。
|
|
43
|
-
* @returns
|
|
43
|
+
* @returns 生成的哈希对象数组。
|
|
44
44
|
*/
|
|
45
45
|
generateAllHashesForCave(cave: Pick<CaveObject, 'id' | 'elements'>): Promise<CaveHashObject[]>;
|
|
46
46
|
/**
|
|
47
|
-
* @description
|
|
47
|
+
* @description 对数据库中所有哈希进行两两比较,找出相似度过高的内容。
|
|
48
|
+
* @returns 一个包含检查结果的报告字符串。
|
|
49
|
+
*/
|
|
50
|
+
checkForSimilarCaves(): Promise<string>;
|
|
51
|
+
/**
|
|
52
|
+
* @description 为单个图片Buffer生成全局pHash和四个象限的局部pHash。
|
|
48
53
|
* @param imageBuffer - 图片的Buffer数据。
|
|
49
|
-
* @returns
|
|
54
|
+
* @returns 包含全局哈希和四象限哈希的对象。
|
|
50
55
|
*/
|
|
51
56
|
generateAllImageHashes(imageBuffer: Buffer): Promise<{
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
subHashes: {
|
|
57
|
+
globalHash: string;
|
|
58
|
+
quadrantHashes: {
|
|
55
59
|
q1: string;
|
|
56
60
|
q2: string;
|
|
57
61
|
q3: string;
|
|
@@ -59,58 +63,36 @@ export declare class HashManager {
|
|
|
59
63
|
};
|
|
60
64
|
}>;
|
|
61
65
|
/**
|
|
62
|
-
* @description
|
|
63
|
-
* @
|
|
64
|
-
|
|
65
|
-
checkForSimilarCaves(): Promise<string>;
|
|
66
|
-
/**
|
|
67
|
-
* @description 从单通道原始像素数据计算pHash。
|
|
68
|
-
* @param channelData - 单通道的像素值数组。
|
|
69
|
-
* @param size - 图像的边长(例如16)。
|
|
70
|
-
* @returns {string} 该通道的二进制哈希字符串。
|
|
66
|
+
* @description 执行二维离散余弦变换 (DCT-II)。
|
|
67
|
+
* @param matrix - 输入的 N x N 像素亮度矩阵。
|
|
68
|
+
* @returns DCT变换后的 N x N 系数矩阵。
|
|
71
69
|
*/
|
|
72
|
-
private
|
|
70
|
+
private _dct2D;
|
|
73
71
|
/**
|
|
74
|
-
* @description
|
|
75
|
-
* @param imageBuffer - 图片的
|
|
76
|
-
* @
|
|
72
|
+
* @description pHash 算法核心实现。
|
|
73
|
+
* @param imageBuffer - 图片的Buffer。
|
|
74
|
+
* @param size - 期望的哈希位数 (必须是完全平方数, 如 64 或 256)。
|
|
75
|
+
* @returns 十六进制pHash字符串。
|
|
77
76
|
*/
|
|
78
|
-
|
|
79
|
-
/**
|
|
80
|
-
* @description 生成256位差异哈希(dHash)。
|
|
81
|
-
* @param imageBuffer - 图片的 Buffer 数据。
|
|
82
|
-
* @returns {Promise<string>} 256位二进制哈希对应的64位十六进制字符串。
|
|
83
|
-
*/
|
|
84
|
-
generateDHash(imageBuffer: Buffer): Promise<string>;
|
|
85
|
-
/**
|
|
86
|
-
* @description 将图片切割为4个象限并为每个象限生成Color pHash。
|
|
87
|
-
* @param imageBuffer - 图片的 Buffer 数据。
|
|
88
|
-
* @returns {Promise<object>} 包含四个象限哈希的对象。
|
|
89
|
-
*/
|
|
90
|
-
generateImageSubHashes(imageBuffer: Buffer): Promise<{
|
|
91
|
-
q1: string;
|
|
92
|
-
q2: string;
|
|
93
|
-
q3: string;
|
|
94
|
-
q4: string;
|
|
95
|
-
}>;
|
|
77
|
+
private _generatePHash;
|
|
96
78
|
/**
|
|
97
|
-
* @description
|
|
98
|
-
* @param hex1 -
|
|
99
|
-
* @param hex2 -
|
|
100
|
-
* @returns
|
|
79
|
+
* @description 计算两个十六进制哈希字符串之间的汉明距离 (不同位的数量)。
|
|
80
|
+
* @param hex1 - 第一个哈希。
|
|
81
|
+
* @param hex2 - 第二个哈希。
|
|
82
|
+
* @returns 汉明距离。
|
|
101
83
|
*/
|
|
102
84
|
calculateHammingDistance(hex1: string, hex2: string): number;
|
|
103
85
|
/**
|
|
104
|
-
* @description
|
|
105
|
-
* @param hex1 -
|
|
106
|
-
* @param hex2 -
|
|
107
|
-
* @returns
|
|
86
|
+
* @description 根据汉明距离计算相似度百分比。
|
|
87
|
+
* @param hex1 - 第一个哈希。
|
|
88
|
+
* @param hex2 - 第二个哈希。
|
|
89
|
+
* @returns 相似度 (0-100)。
|
|
108
90
|
*/
|
|
109
91
|
calculateSimilarity(hex1: string, hex2: string): number;
|
|
110
92
|
/**
|
|
111
|
-
* @description
|
|
93
|
+
* @description 为文本生成 64 位 Simhash 字符串。
|
|
112
94
|
* @param text - 需要处理的文本。
|
|
113
|
-
* @returns
|
|
95
|
+
* @returns 16位十六进制 Simhash 字符串。
|
|
114
96
|
*/
|
|
115
97
|
generateTextSimhash(text: string): string;
|
|
116
98
|
}
|
package/lib/index.d.ts
CHANGED
package/lib/index.js
CHANGED
|
@@ -431,62 +431,38 @@ async function handleFileUploads(ctx, config, fileManager, logger2, reviewManage
|
|
|
431
431
|
try {
|
|
432
432
|
const downloadedMedia = [];
|
|
433
433
|
const imageHashesToStore = [];
|
|
434
|
-
const
|
|
435
|
-
const
|
|
436
|
-
const
|
|
437
|
-
const existingSubHashObjects = existingHashes.filter((h4) => h4.type.startsWith("sub_phash_"));
|
|
434
|
+
const allExistingImageHashes = hashManager ? await ctx.database.get("cave_hash", { type: { $ne: "simhash" } }) : [];
|
|
435
|
+
const existingGlobalHashes = allExistingImageHashes.filter((h4) => h4.type === "phash_g");
|
|
436
|
+
const existingQuadrantHashes = allExistingImageHashes.filter((h4) => h4.type.startsWith("phash_q"));
|
|
438
437
|
for (const media of mediaToToSave) {
|
|
439
438
|
const buffer = Buffer.from(await ctx.http.get(media.sourceUrl, { responseType: "arraybuffer", timeout: 3e4 }));
|
|
440
439
|
downloadedMedia.push({ fileName: media.fileName, buffer });
|
|
441
440
|
if (hashManager && [".png", ".jpg", ".jpeg", ".webp"].includes(path2.extname(media.fileName).toLowerCase())) {
|
|
442
|
-
const {
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
similarityScores.get(existing.cave).colorSim = similarity;
|
|
441
|
+
const { globalHash, quadrantHashes } = await hashManager.generateAllImageHashes(buffer);
|
|
442
|
+
for (const existing of existingGlobalHashes) {
|
|
443
|
+
const similarity = hashManager.calculateSimilarity(globalHash, existing.hash);
|
|
444
|
+
if (similarity >= config.imageWholeThreshold) {
|
|
445
|
+
await session.send(`图片与回声洞(${existing.cave})的相似度为 ${similarity.toFixed(2)}%,超过阈值`);
|
|
446
|
+
await ctx.database.upsert("cave", [{ id: cave.id, status: "delete" }]);
|
|
447
|
+
cleanupPendingDeletions(ctx, fileManager, logger2, reusableIds);
|
|
448
|
+
return;
|
|
451
449
|
}
|
|
452
450
|
}
|
|
453
|
-
for (const existing of existingDHashes) {
|
|
454
|
-
const similarity = hashManager.calculateSimilarity(dHash, existing.hash);
|
|
455
|
-
if (similarity >= config.imageThreshold) {
|
|
456
|
-
if (!similarityScores.has(existing.cave)) similarityScores.set(existing.cave, {});
|
|
457
|
-
similarityScores.get(existing.cave).dSim = similarity;
|
|
458
|
-
}
|
|
459
|
-
}
|
|
460
|
-
for (const [caveId, scores] of similarityScores.entries()) {
|
|
461
|
-
if (scores.colorSim && scores.dSim) {
|
|
462
|
-
caveToDelete = caveId;
|
|
463
|
-
highestCombinedSimilarity = scores.colorSim;
|
|
464
|
-
break;
|
|
465
|
-
}
|
|
466
|
-
}
|
|
467
|
-
if (caveToDelete) {
|
|
468
|
-
await session.send(`图片与回声洞(${caveToDelete})的相似度为 ${(highestCombinedSimilarity * 100).toFixed(2)}%,超过阈值`);
|
|
469
|
-
await ctx.database.upsert("cave", [{ id: cave.id, status: "delete" }]);
|
|
470
|
-
cleanupPendingDeletions(ctx, fileManager, logger2, reusableIds);
|
|
471
|
-
return;
|
|
472
|
-
}
|
|
473
451
|
const notifiedPartialCaves = /* @__PURE__ */ new Set();
|
|
474
|
-
for (const newSubHash of Object.values(
|
|
475
|
-
for (const existing of
|
|
452
|
+
for (const newSubHash of Object.values(quadrantHashes)) {
|
|
453
|
+
for (const existing of existingQuadrantHashes) {
|
|
476
454
|
if (notifiedPartialCaves.has(existing.cave)) continue;
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
await session.send(`图片局部与回声洞(${existing.cave})的相似度为 ${(similarity * 100).toFixed(2)}%`);
|
|
455
|
+
if (newSubHash === existing.hash) {
|
|
456
|
+
await session.send(`图片局部与回声洞(${existing.cave})存在完全相同的区块`);
|
|
480
457
|
notifiedPartialCaves.add(existing.cave);
|
|
481
458
|
}
|
|
482
459
|
}
|
|
483
460
|
}
|
|
484
|
-
imageHashesToStore.push({ hash:
|
|
485
|
-
imageHashesToStore.push({ hash:
|
|
486
|
-
imageHashesToStore.push({ hash:
|
|
487
|
-
imageHashesToStore.push({ hash:
|
|
488
|
-
imageHashesToStore.push({ hash:
|
|
489
|
-
imageHashesToStore.push({ hash: subHashes.q4, type: "sub_phash_q4" });
|
|
461
|
+
imageHashesToStore.push({ hash: globalHash, type: "phash_g" });
|
|
462
|
+
imageHashesToStore.push({ hash: quadrantHashes.q1, type: "phash_q1" });
|
|
463
|
+
imageHashesToStore.push({ hash: quadrantHashes.q2, type: "phash_q2" });
|
|
464
|
+
imageHashesToStore.push({ hash: quadrantHashes.q3, type: "phash_q3" });
|
|
465
|
+
imageHashesToStore.push({ hash: quadrantHashes.q4, type: "phash_q4" });
|
|
490
466
|
}
|
|
491
467
|
}
|
|
492
468
|
await Promise.all(downloadedMedia.map((item) => fileManager.saveFile(item.fileName, item.buffer)));
|
|
@@ -606,9 +582,9 @@ var HashManager = class {
|
|
|
606
582
|
/**
|
|
607
583
|
* @constructor
|
|
608
584
|
* @param ctx - Koishi 上下文,用于数据库操作。
|
|
609
|
-
* @param config -
|
|
585
|
+
* @param config - 插件配置,用于获取相似度阈值等。
|
|
610
586
|
* @param logger - 日志记录器实例。
|
|
611
|
-
* @param fileManager -
|
|
587
|
+
* @param fileManager - 文件管理器实例,用于读取图片文件。
|
|
612
588
|
*/
|
|
613
589
|
constructor(ctx, config, logger2, fileManager) {
|
|
614
590
|
this.ctx = ctx;
|
|
@@ -627,7 +603,7 @@ var HashManager = class {
|
|
|
627
603
|
__name(this, "HashManager");
|
|
628
604
|
}
|
|
629
605
|
/**
|
|
630
|
-
* @description
|
|
606
|
+
* @description 注册与哈希功能相关的 `.hash` 和 `.check` 子命令。
|
|
631
607
|
* @param cave - 主 `cave` 命令实例。
|
|
632
608
|
*/
|
|
633
609
|
registerCommands(cave) {
|
|
@@ -637,7 +613,7 @@ var HashManager = class {
|
|
|
637
613
|
return "此指令仅限在管理群组中使用";
|
|
638
614
|
}
|
|
639
615
|
}, "adminCheck");
|
|
640
|
-
cave.subcommand(".hash", "校验回声洞").usage("
|
|
616
|
+
cave.subcommand(".hash", "校验回声洞").usage("校验缺失哈希的回声洞,补全哈希记录。").action(async (argv) => {
|
|
641
617
|
const checkResult = adminCheck(argv);
|
|
642
618
|
if (checkResult) return checkResult;
|
|
643
619
|
await argv.session.send("正在处理,请稍候...");
|
|
@@ -648,7 +624,7 @@ var HashManager = class {
|
|
|
648
624
|
return `操作失败: ${error.message}`;
|
|
649
625
|
}
|
|
650
626
|
});
|
|
651
|
-
cave.subcommand(".check", "
|
|
627
|
+
cave.subcommand(".check", "检查相似度").usage("检查所有回声洞,找出相似度过高的内容。").action(async (argv) => {
|
|
652
628
|
const checkResult = adminCheck(argv);
|
|
653
629
|
if (checkResult) return checkResult;
|
|
654
630
|
await argv.session.send("正在检查,请稍候...");
|
|
@@ -662,239 +638,160 @@ var HashManager = class {
|
|
|
662
638
|
}
|
|
663
639
|
/**
|
|
664
640
|
* @description 检查数据库中所有回声洞,为没有哈希记录的历史数据生成哈希。
|
|
665
|
-
* @returns
|
|
641
|
+
* @returns 一个包含操作结果的报告字符串。
|
|
666
642
|
*/
|
|
667
643
|
async generateHashesForHistoricalCaves() {
|
|
668
644
|
const allCaves = await this.ctx.database.get("cave", { status: "active" });
|
|
669
|
-
const existingHashes = await this.ctx.database.get("cave_hash", {}
|
|
645
|
+
const existingHashes = await this.ctx.database.get("cave_hash", {});
|
|
670
646
|
const existingHashSet = new Set(existingHashes.map((h4) => `${h4.cave}-${h4.hash}-${h4.type}`));
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
const totalToProcessCount = cavesToProcess.length;
|
|
674
|
-
if (totalToProcessCount === 0) {
|
|
675
|
-
return "无需补全回声洞哈希";
|
|
676
|
-
}
|
|
677
|
-
this.logger.info(`开始补全 ${totalToProcessCount} 个回声洞的哈希...`);
|
|
647
|
+
if (allCaves.length === 0) return "无需补全回声洞哈希";
|
|
648
|
+
this.logger.info(`开始补全 ${allCaves.length} 个回声洞的哈希...`);
|
|
678
649
|
let hashesToInsert = [];
|
|
679
|
-
const batchHashSet = /* @__PURE__ */ new Set();
|
|
680
650
|
let processedCaveCount = 0;
|
|
681
651
|
let totalHashesGenerated = 0;
|
|
682
652
|
let errorCount = 0;
|
|
683
653
|
const flushBatch = /* @__PURE__ */ __name(async () => {
|
|
684
|
-
|
|
685
|
-
if (batchSize === 0) return;
|
|
654
|
+
if (hashesToInsert.length === 0) return;
|
|
686
655
|
await this.ctx.database.upsert("cave_hash", hashesToInsert);
|
|
687
|
-
totalHashesGenerated +=
|
|
688
|
-
this.logger.info(
|
|
656
|
+
totalHashesGenerated += hashesToInsert.length;
|
|
657
|
+
this.logger.info(`[${processedCaveCount}/${allCaves.length}] 正在导入 ${hashesToInsert.length} 条回声洞哈希...`);
|
|
689
658
|
hashesToInsert = [];
|
|
690
|
-
batchHashSet.clear();
|
|
691
659
|
}, "flushBatch");
|
|
692
|
-
for (const cave of
|
|
660
|
+
for (const cave of allCaves) {
|
|
693
661
|
processedCaveCount++;
|
|
694
662
|
try {
|
|
695
663
|
const newHashesForCave = await this.generateAllHashesForCave(cave);
|
|
696
664
|
for (const hashObj of newHashesForCave) {
|
|
697
665
|
const uniqueKey = `${hashObj.cave}-${hashObj.hash}-${hashObj.type}`;
|
|
698
|
-
if (!existingHashSet.has(uniqueKey)
|
|
666
|
+
if (!existingHashSet.has(uniqueKey)) {
|
|
699
667
|
hashesToInsert.push(hashObj);
|
|
700
|
-
|
|
668
|
+
existingHashSet.add(uniqueKey);
|
|
701
669
|
}
|
|
702
670
|
}
|
|
671
|
+
if (hashesToInsert.length >= 100) {
|
|
672
|
+
await flushBatch();
|
|
673
|
+
}
|
|
703
674
|
} catch (error) {
|
|
704
675
|
errorCount++;
|
|
705
|
-
this.logger.warn(`补全回声洞(${cave.id}
|
|
706
|
-
continue;
|
|
707
|
-
}
|
|
708
|
-
if (hashesToInsert.length >= 100) {
|
|
709
|
-
await flushBatch();
|
|
676
|
+
this.logger.warn(`补全回声洞(${cave.id})哈希时发生错误: ${error.message}`);
|
|
710
677
|
}
|
|
711
678
|
}
|
|
712
679
|
await flushBatch();
|
|
713
|
-
return `已补全 ${
|
|
680
|
+
return `已补全 ${allCaves.length} 个回声洞的 ${totalHashesGenerated} 条哈希(失败 ${errorCount} 条)`;
|
|
714
681
|
}
|
|
715
682
|
/**
|
|
716
|
-
* @description
|
|
683
|
+
* @description 为单个回声洞对象生成所有类型的哈希(文本+图片)。
|
|
717
684
|
* @param cave - 回声洞对象。
|
|
718
|
-
* @returns
|
|
685
|
+
* @returns 生成的哈希对象数组。
|
|
719
686
|
*/
|
|
720
687
|
async generateAllHashesForCave(cave) {
|
|
721
|
-
const
|
|
688
|
+
const tempHashes = [];
|
|
689
|
+
const uniqueHashTracker = /* @__PURE__ */ new Set();
|
|
690
|
+
const addUniqueHash = /* @__PURE__ */ __name((hashObj) => {
|
|
691
|
+
const key = `${hashObj.hash}-${hashObj.type}`;
|
|
692
|
+
if (!uniqueHashTracker.has(key)) {
|
|
693
|
+
tempHashes.push(hashObj);
|
|
694
|
+
uniqueHashTracker.add(key);
|
|
695
|
+
}
|
|
696
|
+
}, "addUniqueHash");
|
|
722
697
|
const combinedText = cave.elements.filter((el) => el.type === "text" && el.content).map((el) => el.content).join(" ");
|
|
723
698
|
if (combinedText) {
|
|
724
699
|
const textHash = this.generateTextSimhash(combinedText);
|
|
725
|
-
if (textHash) {
|
|
726
|
-
allHashes.push({ cave: cave.id, hash: textHash, type: "simhash" });
|
|
727
|
-
}
|
|
700
|
+
if (textHash) addUniqueHash({ cave: cave.id, hash: textHash, type: "simhash" });
|
|
728
701
|
}
|
|
729
702
|
for (const el of cave.elements.filter((el2) => el2.type === "image" && el2.file)) {
|
|
730
703
|
try {
|
|
731
704
|
const imageBuffer = await this.fileManager.readFile(el.file);
|
|
732
|
-
const
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
allHashes.push({ cave: cave.id, hash: imageHashes.subHashes.q4, type: "sub_phash_q4" });
|
|
705
|
+
const { globalHash, quadrantHashes } = await this.generateAllImageHashes(imageBuffer);
|
|
706
|
+
addUniqueHash({ cave: cave.id, hash: globalHash, type: "phash_g" });
|
|
707
|
+
addUniqueHash({ cave: cave.id, hash: quadrantHashes.q1, type: "phash_q1" });
|
|
708
|
+
addUniqueHash({ cave: cave.id, hash: quadrantHashes.q2, type: "phash_q2" });
|
|
709
|
+
addUniqueHash({ cave: cave.id, hash: quadrantHashes.q3, type: "phash_q3" });
|
|
710
|
+
addUniqueHash({ cave: cave.id, hash: quadrantHashes.q4, type: "phash_q4" });
|
|
739
711
|
} catch (e) {
|
|
740
|
-
this.logger.warn(`无法为回声洞(${cave.id}
|
|
712
|
+
this.logger.warn(`无法为回声洞(${cave.id})的图片(${el.file})生成哈希:`, e);
|
|
741
713
|
}
|
|
742
714
|
}
|
|
743
|
-
return
|
|
744
|
-
}
|
|
745
|
-
/**
|
|
746
|
-
* @description 为单个图片Buffer生成所有类型的哈希。
|
|
747
|
-
* @param imageBuffer - 图片的Buffer数据。
|
|
748
|
-
* @returns {Promise<object>} 包含所有图片哈希的对象。
|
|
749
|
-
*/
|
|
750
|
-
async generateAllImageHashes(imageBuffer) {
|
|
751
|
-
const [colorPHash, dHash, subHashes] = await Promise.all([
|
|
752
|
-
this.generateColorPHash(imageBuffer),
|
|
753
|
-
this.generateDHash(imageBuffer),
|
|
754
|
-
this.generateImageSubHashes(imageBuffer)
|
|
755
|
-
]);
|
|
756
|
-
return { colorPHash, dHash, subHashes };
|
|
715
|
+
return tempHashes;
|
|
757
716
|
}
|
|
758
717
|
/**
|
|
759
|
-
* @description
|
|
760
|
-
* @returns
|
|
718
|
+
* @description 对数据库中所有哈希进行两两比较,找出相似度过高的内容。
|
|
719
|
+
* @returns 一个包含检查结果的报告字符串。
|
|
761
720
|
*/
|
|
762
721
|
async checkForSimilarCaves() {
|
|
763
722
|
const allHashes = await this.ctx.database.get("cave_hash", {});
|
|
764
|
-
const
|
|
765
|
-
const
|
|
766
|
-
const
|
|
767
|
-
|
|
768
|
-
phash_color: /* @__PURE__ */ new Map(),
|
|
769
|
-
dhash_gray: /* @__PURE__ */ new Map()
|
|
770
|
-
};
|
|
771
|
-
const subHashToCaves = /* @__PURE__ */ new Map();
|
|
723
|
+
const allCaveIds = [...new Set(allHashes.map((h4) => h4.cave))];
|
|
724
|
+
const textHashes = /* @__PURE__ */ new Map();
|
|
725
|
+
const globalHashes = /* @__PURE__ */ new Map();
|
|
726
|
+
const quadrantHashes = /* @__PURE__ */ new Map();
|
|
772
727
|
for (const hash of allHashes) {
|
|
773
|
-
if (
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
728
|
+
if (hash.type === "simhash") {
|
|
729
|
+
textHashes.set(hash.cave, hash.hash);
|
|
730
|
+
} else if (hash.type === "phash_g") {
|
|
731
|
+
globalHashes.set(hash.cave, hash.hash);
|
|
732
|
+
} else if (hash.type.startsWith("phash_q")) {
|
|
733
|
+
if (!quadrantHashes.has(hash.cave)) quadrantHashes.set(hash.cave, /* @__PURE__ */ new Set());
|
|
734
|
+
quadrantHashes.get(hash.cave).add(hash.hash);
|
|
779
735
|
}
|
|
780
736
|
}
|
|
781
737
|
const similarPairs = {
|
|
782
738
|
text: /* @__PURE__ */ new Set(),
|
|
783
|
-
|
|
784
|
-
|
|
739
|
+
global: /* @__PURE__ */ new Set(),
|
|
740
|
+
partial: /* @__PURE__ */ new Set()
|
|
785
741
|
};
|
|
786
742
|
for (let i = 0; i < allCaveIds.length; i++) {
|
|
787
743
|
for (let j = i + 1; j < allCaveIds.length; j++) {
|
|
788
744
|
const id1 = allCaveIds[i];
|
|
789
745
|
const id2 = allCaveIds[j];
|
|
790
|
-
const
|
|
791
|
-
const
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
746
|
+
const pair = [id1, id2].sort((a, b) => a - b).join(" & ");
|
|
747
|
+
const text1 = textHashes.get(id1);
|
|
748
|
+
const text2 = textHashes.get(id2);
|
|
749
|
+
if (text1 && text2) {
|
|
750
|
+
const similarity = this.calculateSimilarity(text1, text2);
|
|
751
|
+
if (similarity >= this.config.textThreshold) {
|
|
752
|
+
similarPairs.text.add(`${pair} = ${similarity.toFixed(2)}%`);
|
|
796
753
|
}
|
|
797
754
|
}
|
|
798
|
-
const
|
|
799
|
-
const
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
similarPairs.image_color.add(`${id1} & ${id2} = ${(sim * 100).toFixed(2)}%`);
|
|
805
|
-
}
|
|
755
|
+
const global1 = globalHashes.get(id1);
|
|
756
|
+
const global2 = globalHashes.get(id2);
|
|
757
|
+
if (global1 && global2) {
|
|
758
|
+
const similarity = this.calculateSimilarity(global1, global2);
|
|
759
|
+
if (similarity >= this.config.imageWholeThreshold) {
|
|
760
|
+
similarPairs.global.add(`${pair} = ${similarity.toFixed(2)}%`);
|
|
806
761
|
}
|
|
807
762
|
}
|
|
808
|
-
const
|
|
809
|
-
const
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
if (
|
|
814
|
-
|
|
763
|
+
const quads1 = quadrantHashes.get(id1);
|
|
764
|
+
const quads2 = quadrantHashes.get(id2);
|
|
765
|
+
if (quads1 && quads2 && quads1.size > 0 && quads2.size > 0) {
|
|
766
|
+
let matchFound = false;
|
|
767
|
+
for (const h1 of quads1) {
|
|
768
|
+
if (quads2.has(h1)) {
|
|
769
|
+
matchFound = true;
|
|
770
|
+
break;
|
|
815
771
|
}
|
|
816
772
|
}
|
|
773
|
+
if (matchFound) {
|
|
774
|
+
similarPairs.partial.add(pair);
|
|
775
|
+
}
|
|
817
776
|
}
|
|
818
777
|
}
|
|
819
778
|
}
|
|
820
|
-
const
|
|
821
|
-
subHashToCaves.forEach((caves2) => {
|
|
822
|
-
if (caves2.size > 1) {
|
|
823
|
-
const sortedCaves = [...caves2].sort((a, b) => a - b).join(", ");
|
|
824
|
-
subHashDuplicates.push(`[${sortedCaves}]`);
|
|
825
|
-
}
|
|
826
|
-
});
|
|
827
|
-
const totalFindings = similarPairs.text.size + similarPairs.image_color.size + similarPairs.image_dhash.size + subHashDuplicates.length;
|
|
779
|
+
const totalFindings = similarPairs.text.size + similarPairs.global.size + similarPairs.partial.size;
|
|
828
780
|
if (totalFindings === 0) return "未发现高相似度的内容";
|
|
829
|
-
let report = `已发现 ${totalFindings}
|
|
830
|
-
if (similarPairs.text.size > 0) report += "\n
|
|
831
|
-
if (similarPairs.
|
|
832
|
-
if (similarPairs.
|
|
833
|
-
if (subHashDuplicates.length > 0) report += "\n图片局部重复:\n" + [...new Set(subHashDuplicates)].join("\n");
|
|
781
|
+
let report = `已发现 ${totalFindings} 组高相似度的内容:`;
|
|
782
|
+
if (similarPairs.text.size > 0) report += "\n文本内容相似:\n" + [...similarPairs.text].join("\n");
|
|
783
|
+
if (similarPairs.global.size > 0) report += "\n图片整体相似:\n" + [...similarPairs.global].join("\n");
|
|
784
|
+
if (similarPairs.partial.size > 0) report += "\n图片局部相同:\n" + [...similarPairs.partial].join("\n");
|
|
834
785
|
return report.trim();
|
|
835
786
|
}
|
|
836
787
|
/**
|
|
837
|
-
* @description
|
|
838
|
-
* @param
|
|
839
|
-
* @
|
|
840
|
-
* @returns {string} 该通道的二进制哈希字符串。
|
|
841
|
-
*/
|
|
842
|
-
_calculateHashFromRawChannel(channelData, size) {
|
|
843
|
-
const totalLuminance = channelData.reduce((acc, val) => acc + val, 0);
|
|
844
|
-
const avgLuminance = totalLuminance / (size * size);
|
|
845
|
-
return channelData.map((lum) => lum > avgLuminance ? "1" : "0").join("");
|
|
846
|
-
}
|
|
847
|
-
/**
|
|
848
|
-
* @description 生成768位颜色感知哈希(Color pHash)。
|
|
849
|
-
* @param imageBuffer - 图片的 Buffer 数据。
|
|
850
|
-
* @returns {Promise<string>} 768位二进制哈希对应的192位十六进制字符串。
|
|
851
|
-
*/
|
|
852
|
-
async generateColorPHash(imageBuffer) {
|
|
853
|
-
const { data, info } = await (0, import_sharp.default)(imageBuffer).resize(16, 16, { fit: "fill" }).removeAlpha().raw().toBuffer({ resolveWithObject: true });
|
|
854
|
-
const { channels } = info;
|
|
855
|
-
const r = [], g = [], b = [];
|
|
856
|
-
for (let i = 0; i < data.length; i += channels) {
|
|
857
|
-
r.push(data[i]);
|
|
858
|
-
g.push(data[i + 1]);
|
|
859
|
-
b.push(data[i + 2]);
|
|
860
|
-
}
|
|
861
|
-
const rHash = this._calculateHashFromRawChannel(r, 16);
|
|
862
|
-
const gHash = this._calculateHashFromRawChannel(g, 16);
|
|
863
|
-
const bHash = this._calculateHashFromRawChannel(b, 16);
|
|
864
|
-
const combinedHash = rHash + gHash + bHash;
|
|
865
|
-
let hex = "";
|
|
866
|
-
for (let i = 0; i < combinedHash.length; i += 4) {
|
|
867
|
-
hex += parseInt(combinedHash.substring(i, i + 4), 2).toString(16);
|
|
868
|
-
}
|
|
869
|
-
return hex.padStart(192, "0");
|
|
870
|
-
}
|
|
871
|
-
/**
|
|
872
|
-
* @description 生成256位差异哈希(dHash)。
|
|
873
|
-
* @param imageBuffer - 图片的 Buffer 数据。
|
|
874
|
-
* @returns {Promise<string>} 256位二进制哈希对应的64位十六进制字符串。
|
|
875
|
-
*/
|
|
876
|
-
async generateDHash(imageBuffer) {
|
|
877
|
-
const pixels = await (0, import_sharp.default)(imageBuffer).grayscale().resize(17, 16, { fit: "fill" }).raw().toBuffer();
|
|
878
|
-
let hash = "";
|
|
879
|
-
for (let y = 0; y < 16; y++) {
|
|
880
|
-
for (let x = 0; x < 16; x++) {
|
|
881
|
-
const i = y * 17 + x;
|
|
882
|
-
hash += pixels[i] > pixels[i + 1] ? "1" : "0";
|
|
883
|
-
}
|
|
884
|
-
}
|
|
885
|
-
return BigInt("0b" + hash).toString(16).padStart(64, "0");
|
|
886
|
-
}
|
|
887
|
-
/**
|
|
888
|
-
* @description 将图片切割为4个象限并为每个象限生成Color pHash。
|
|
889
|
-
* @param imageBuffer - 图片的 Buffer 数据。
|
|
890
|
-
* @returns {Promise<object>} 包含四个象限哈希的对象。
|
|
788
|
+
* @description 为单个图片Buffer生成全局pHash和四个象限的局部pHash。
|
|
789
|
+
* @param imageBuffer - 图片的Buffer数据。
|
|
790
|
+
* @returns 包含全局哈希和四象限哈希的对象。
|
|
891
791
|
*/
|
|
892
|
-
async
|
|
792
|
+
async generateAllImageHashes(imageBuffer) {
|
|
793
|
+
const globalHash = await this._generatePHash(imageBuffer, 256);
|
|
893
794
|
const { width, height } = await (0, import_sharp.default)(imageBuffer).metadata();
|
|
894
|
-
if (!width || !height || width < 16 || height < 16) {
|
|
895
|
-
const fallbackHash = await this.generateColorPHash(imageBuffer);
|
|
896
|
-
return { q1: fallbackHash, q2: fallbackHash, q3: fallbackHash, q4: fallbackHash };
|
|
897
|
-
}
|
|
898
795
|
const w2 = Math.floor(width / 2), h22 = Math.floor(height / 2);
|
|
899
796
|
const regions = [
|
|
900
797
|
{ left: 0, top: 0, width: w2, height: h22 },
|
|
@@ -904,17 +801,73 @@ var HashManager = class {
|
|
|
904
801
|
];
|
|
905
802
|
const [q1, q2, q3, q4] = await Promise.all(
|
|
906
803
|
regions.map((region) => {
|
|
907
|
-
if (region.width <
|
|
908
|
-
return (0, import_sharp.default)(imageBuffer).extract(region).toBuffer().then((b) => this.
|
|
804
|
+
if (region.width < 16 || region.height < 16) return this._generatePHash(imageBuffer, 64);
|
|
805
|
+
return (0, import_sharp.default)(imageBuffer).extract(region).toBuffer().then((b) => this._generatePHash(b, 64));
|
|
909
806
|
})
|
|
910
807
|
);
|
|
911
|
-
return { q1, q2, q3, q4 };
|
|
808
|
+
return { globalHash, quadrantHashes: { q1, q2, q3, q4 } };
|
|
912
809
|
}
|
|
913
810
|
/**
|
|
914
|
-
* @description
|
|
915
|
-
* @param
|
|
916
|
-
* @
|
|
917
|
-
|
|
811
|
+
* @description 执行二维离散余弦变换 (DCT-II)。
|
|
812
|
+
* @param matrix - 输入的 N x N 像素亮度矩阵。
|
|
813
|
+
* @returns DCT变换后的 N x N 系数矩阵。
|
|
814
|
+
*/
|
|
815
|
+
_dct2D(matrix) {
|
|
816
|
+
const N = matrix.length;
|
|
817
|
+
if (N === 0) return [];
|
|
818
|
+
const cosines = Array.from(
|
|
819
|
+
{ length: N },
|
|
820
|
+
(_, i) => Array.from({ length: N }, (_2, j) => Math.cos(Math.PI * (2 * i + 1) * j / (2 * N)))
|
|
821
|
+
);
|
|
822
|
+
const applyDct1D = /* @__PURE__ */ __name((input) => {
|
|
823
|
+
const output = new Array(N).fill(0);
|
|
824
|
+
const scale = Math.sqrt(2 / N);
|
|
825
|
+
for (let k = 0; k < N; k++) {
|
|
826
|
+
let sum = 0;
|
|
827
|
+
for (let n = 0; n < N; n++) {
|
|
828
|
+
sum += input[n] * cosines[n][k];
|
|
829
|
+
}
|
|
830
|
+
output[k] = scale * sum;
|
|
831
|
+
}
|
|
832
|
+
output[0] /= Math.sqrt(2);
|
|
833
|
+
return output;
|
|
834
|
+
}, "applyDct1D");
|
|
835
|
+
const tempMatrix = matrix.map((row) => applyDct1D(row));
|
|
836
|
+
const transposed = tempMatrix[0].map((_, col) => tempMatrix.map((row) => row[col]));
|
|
837
|
+
const dctResult = transposed.map((row) => applyDct1D(row));
|
|
838
|
+
return dctResult[0].map((_, col) => dctResult.map((row) => row[col]));
|
|
839
|
+
}
|
|
840
|
+
/**
|
|
841
|
+
* @description pHash 算法核心实现。
|
|
842
|
+
* @param imageBuffer - 图片的Buffer。
|
|
843
|
+
* @param size - 期望的哈希位数 (必须是完全平方数, 如 64 或 256)。
|
|
844
|
+
* @returns 十六进制pHash字符串。
|
|
845
|
+
*/
|
|
846
|
+
async _generatePHash(imageBuffer, size) {
|
|
847
|
+
const dctSize = 32;
|
|
848
|
+
const hashGridSize = Math.sqrt(size);
|
|
849
|
+
if (!Number.isInteger(hashGridSize)) throw new Error("哈希位数必须是完全平方数");
|
|
850
|
+
const pixels = await (0, import_sharp.default)(imageBuffer).grayscale().resize(dctSize, dctSize, { fit: "fill" }).raw().toBuffer();
|
|
851
|
+
const matrix = [];
|
|
852
|
+
for (let y = 0; y < dctSize; y++) {
|
|
853
|
+
matrix.push(Array.from(pixels.slice(y * dctSize, (y + 1) * dctSize)));
|
|
854
|
+
}
|
|
855
|
+
const dctMatrix = this._dct2D(matrix);
|
|
856
|
+
const coefficients = [];
|
|
857
|
+
for (let y = 0; y < hashGridSize; y++) {
|
|
858
|
+
for (let x = 0; x < hashGridSize; x++) {
|
|
859
|
+
coefficients.push(dctMatrix[y][x]);
|
|
860
|
+
}
|
|
861
|
+
}
|
|
862
|
+
const median = [...coefficients.slice(1)].sort((a, b) => a - b)[Math.floor((coefficients.length - 1) / 2)];
|
|
863
|
+
const binaryHash = coefficients.map((val) => val > median ? "1" : "0").join("");
|
|
864
|
+
return BigInt("0b" + binaryHash).toString(16).padStart(size / 4, "0");
|
|
865
|
+
}
|
|
866
|
+
/**
|
|
867
|
+
* @description 计算两个十六进制哈希字符串之间的汉明距离 (不同位的数量)。
|
|
868
|
+
* @param hex1 - 第一个哈希。
|
|
869
|
+
* @param hex2 - 第二个哈希。
|
|
870
|
+
* @returns 汉明距离。
|
|
918
871
|
*/
|
|
919
872
|
calculateHammingDistance(hex1, hex2) {
|
|
920
873
|
let distance = 0;
|
|
@@ -927,26 +880,24 @@ var HashManager = class {
|
|
|
927
880
|
return distance;
|
|
928
881
|
}
|
|
929
882
|
/**
|
|
930
|
-
* @description
|
|
931
|
-
* @param hex1 -
|
|
932
|
-
* @param hex2 -
|
|
933
|
-
* @returns
|
|
883
|
+
* @description 根据汉明距离计算相似度百分比。
|
|
884
|
+
* @param hex1 - 第一个哈希。
|
|
885
|
+
* @param hex2 - 第二个哈希。
|
|
886
|
+
* @returns 相似度 (0-100)。
|
|
934
887
|
*/
|
|
935
888
|
calculateSimilarity(hex1, hex2) {
|
|
936
889
|
const distance = this.calculateHammingDistance(hex1, hex2);
|
|
937
890
|
const hashLength = Math.max(hex1.length, hex2.length) * 4;
|
|
938
|
-
return hashLength === 0 ?
|
|
891
|
+
return hashLength === 0 ? 100 : (1 - distance / hashLength) * 100;
|
|
939
892
|
}
|
|
940
893
|
/**
|
|
941
|
-
* @description
|
|
894
|
+
* @description 为文本生成 64 位 Simhash 字符串。
|
|
942
895
|
* @param text - 需要处理的文本。
|
|
943
|
-
* @returns
|
|
896
|
+
* @returns 16位十六进制 Simhash 字符串。
|
|
944
897
|
*/
|
|
945
898
|
generateTextSimhash(text) {
|
|
946
899
|
const cleanText = (text || "").toLowerCase().replace(/\s+/g, "");
|
|
947
|
-
if (!cleanText)
|
|
948
|
-
return "";
|
|
949
|
-
}
|
|
900
|
+
if (!cleanText) return "";
|
|
950
901
|
const n = 2;
|
|
951
902
|
const tokens = /* @__PURE__ */ new Set();
|
|
952
903
|
if (cleanText.length < n) {
|
|
@@ -957,9 +908,7 @@ var HashManager = class {
|
|
|
957
908
|
}
|
|
958
909
|
}
|
|
959
910
|
const tokenArray = Array.from(tokens);
|
|
960
|
-
if (tokenArray.length === 0)
|
|
961
|
-
return "";
|
|
962
|
-
}
|
|
911
|
+
if (tokenArray.length === 0) return "";
|
|
963
912
|
const vector = new Array(64).fill(0);
|
|
964
913
|
tokenArray.forEach((token) => {
|
|
965
914
|
const hash = crypto.createHash("md5").update(token).digest();
|
|
@@ -973,8 +922,8 @@ var HashManager = class {
|
|
|
973
922
|
};
|
|
974
923
|
/**
 * Expands a hexadecimal string into its binary representation, four
 * characters per input character (each digit is parsed with radix 16 and
 * left-padded with "0" to width four).
 *
 * @param hex - String of hexadecimal digits.
 * @returns The concatenated 4-character groups; "" for an empty input.
 */
function hexToBinary(hex) {
  return [...hex]
    .map((digit) => parseInt(digit, 16).toString(2).padStart(4, "0"))
    .join("");
}
|
|
@@ -1008,8 +957,8 @@ var Config = import_koishi3.Schema.intersect([
|
|
|
1008
957
|
import_koishi3.Schema.object({
|
|
1009
958
|
enableReview: import_koishi3.Schema.boolean().default(false).description("启用审核"),
|
|
1010
959
|
enableSimilarity: import_koishi3.Schema.boolean().default(false).description("启用查重"),
|
|
1011
|
-
textThreshold: import_koishi3.Schema.number().min(0).max(
|
|
1012
|
-
|
|
960
|
+
textThreshold: import_koishi3.Schema.number().min(0).max(100).step(0.01).default(95).description("文本相似度阈值 (%)"),
|
|
961
|
+
imageWholeThreshold: import_koishi3.Schema.number().min(0).max(100).step(0.01).default(95).description("图片相似度阈值 (%)")
|
|
1013
962
|
}).description("复核配置"),
|
|
1014
963
|
import_koishi3.Schema.object({
|
|
1015
964
|
localPath: import_koishi3.Schema.string().description("文件映射路径"),
|
|
@@ -1086,7 +1035,7 @@ function apply(ctx, config) {
|
|
|
1086
1035
|
for (const existing of existingTextHashes) {
|
|
1087
1036
|
const similarity = hashManager.calculateSimilarity(newSimhash, existing.hash);
|
|
1088
1037
|
if (similarity >= config.textThreshold) {
|
|
1089
|
-
return `文本与回声洞(${existing.cave})的相似度为 ${
|
|
1038
|
+
return `文本与回声洞(${existing.cave})的相似度为 ${similarity.toFixed(2)}%,超过阈值`;
|
|
1090
1039
|
}
|
|
1091
1040
|
}
|
|
1092
1041
|
textHashesToStore.push({ hash: newSimhash, type: "simhash" });
|