ai-code-detector 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +234 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +135 -0
- package/dist/detector.d.ts +23 -0
- package/dist/detector.d.ts.map +1 -0
- package/dist/detector.js +132 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +25 -0
- package/dist/perplexity.d.ts +22 -0
- package/dist/perplexity.d.ts.map +1 -0
- package/dist/perplexity.js +180 -0
- package/dist/report.d.ts +13 -0
- package/dist/report.d.ts.map +1 -0
- package/dist/report.js +134 -0
- package/dist/types.d.ts +44 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +4 -0
- package/package.json +63 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 CC
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
# @cc-tools/ai-code-detector
|
|
2
|
+
|
|
3
|
+
基于困惑度(Perplexity)检测 AI 生成代码的 npm 包,支持多种编程语言,提供 CLI 和 API 两种使用方式。
|
|
4
|
+
|
|
5
|
+
## 安装
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @cc-tools/ai-code-detector
|
|
9
|
+
# 或
|
|
10
|
+
yarn add @cc-tools/ai-code-detector
|
|
11
|
+
# 或
|
|
12
|
+
pnpm add @cc-tools/ai-code-detector
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## 快速开始
|
|
16
|
+
|
|
17
|
+
### CLI 使用
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# 全局安装后直接使用
|
|
21
|
+
ai-code-detector -i ./src/index.ts
|
|
22
|
+
|
|
23
|
+
# 或使用 npx
|
|
24
|
+
npx @cc-tools/ai-code-detector -i ./src/index.ts
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
#### CLI 参数
|
|
28
|
+
|
|
29
|
+
| 参数 | 说明 | 默认值 |
|
|
30
|
+
|------|------|--------|
|
|
31
|
+
| `-i, --input <path>` | 输入文件或目录路径 | 必填 |
|
|
32
|
+
| `-o, --output <path>` | 输出报告文件路径 | 可选 |
|
|
33
|
+
| `-t, --threshold <num>` | 困惑度阈值 | 15.0 |
|
|
34
|
+
| `-m, --model <name>` | 使用的模型名称 | Xenova/codebert-base |
|
|
35
|
+
| `-f, --format <type>` | 输出格式 (text/json) | text |
|
|
36
|
+
| `-h, --help` | 显示帮助信息 | - |
|
|
37
|
+
|
|
38
|
+
#### CLI 示例
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# 检测单个文件
|
|
42
|
+
ai-code-detector -i ./src/index.ts
|
|
43
|
+
|
|
44
|
+
# 检测目录下所有代码文件
|
|
45
|
+
ai-code-detector -i ./src/
|
|
46
|
+
|
|
47
|
+
# 输出 JSON 格式
|
|
48
|
+
ai-code-detector -i ./code.js -f json
|
|
49
|
+
|
|
50
|
+
# 自定义阈值并保存报告
|
|
51
|
+
ai-code-detector -i ./project/ -t 20 -o report.txt
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### API 使用
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
import { detectAICode, analyzeCodeSegments, calculatePerplexity } from '@cc-tools/ai-code-detector';
|
|
58
|
+
|
|
59
|
+
// 完整检测报告
|
|
60
|
+
const report = await detectAICode(codeString, {
|
|
61
|
+
threshold: 15.0, // 困惑度阈值
|
|
62
|
+
chunkSize: 512, // 代码块大小
|
|
63
|
+
overlapSize: 50, // 重叠大小
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
console.log(report.summary);
|
|
67
|
+
console.log(`是否为 AI 生成: ${report.isAIGenerated}`);
|
|
68
|
+
console.log(`整体评分: ${report.overallScore}/100`);
|
|
69
|
+
|
|
70
|
+
// 仅分析代码段
|
|
71
|
+
const segments = await analyzeCodeSegments(codeString);
|
|
72
|
+
segments.forEach(seg => {
|
|
73
|
+
console.log(`行 ${seg.segment.startLine}-${seg.segment.endLine}: 困惑度=${seg.perplexity.toFixed(2)}`);
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
// 仅计算困惑度
|
|
77
|
+
const perplexity = await calculatePerplexity(codeString);
|
|
78
|
+
console.log(`困惑度: ${perplexity}`);
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## 检测原理
|
|
82
|
+
|
|
83
|
+
### 困惑度 (Perplexity)
|
|
84
|
+
|
|
85
|
+
困惑度是衡量语言模型对文本"惊讶程度"的指标:
|
|
86
|
+
|
|
87
|
+
- **低困惑度** (< 阈值): 代码更加"可预测",模式规整,可能是 AI 生成
|
|
88
|
+
- **高困惑度** (> 阈值): 代码更加"多样/复杂",更可能是人类编写
|
|
89
|
+
|
|
90
|
+
### 检测流程
|
|
91
|
+
|
|
92
|
+
1. **代码分段**: 识别代码逻辑块(函数、类、条件语句等)
|
|
93
|
+
2. **困惑度计算**: 使用 CodeBERT 模型或启发式算法计算每段代码的困惑度
|
|
94
|
+
3. **阈值判断**: 与设定阈值比较,判断是否为 AI 生成
|
|
95
|
+
4. **报告生成**: 汇总结果,生成详细检测报告
|
|
96
|
+
|
|
97
|
+
### 支持的语言
|
|
98
|
+
|
|
99
|
+
- JavaScript / TypeScript
|
|
100
|
+
- Python
|
|
101
|
+
- Java
|
|
102
|
+
- Go
|
|
103
|
+
- Rust
|
|
104
|
+
- C / C++
|
|
105
|
+
- C#
|
|
106
|
+
- Ruby
|
|
107
|
+
- PHP
|
|
108
|
+
- Swift
|
|
109
|
+
- Kotlin
|
|
110
|
+
- Scala
|
|
111
|
+
- SQL
|
|
112
|
+
|
|
113
|
+
## API 文档
|
|
114
|
+
|
|
115
|
+
### `detectAICode(code, options?)`
|
|
116
|
+
|
|
117
|
+
执行完整的 AI 代码检测,返回检测报告。
|
|
118
|
+
|
|
119
|
+
```typescript
|
|
120
|
+
interface DetectionOptions {
|
|
121
|
+
model?: string; // 模型名称,默认 'Xenova/codebert-base'
|
|
122
|
+
threshold?: number; // 困惑度阈值,默认 15.0
|
|
123
|
+
chunkSize?: number; // 代码块大小,默认 512
|
|
124
|
+
overlapSize?: number; // 重叠大小,默认 50
|
|
125
|
+
language?: string; // 指定语言(可选)
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
interface DetectionReport {
|
|
129
|
+
overallScore: number; // 整体评分 (0-100)
|
|
130
|
+
isAIGenerated: boolean; // 是否为 AI 生成
|
|
131
|
+
totalSegments: number; // 总代码段数
|
|
132
|
+
aiGeneratedSegments: number; // AI 生成段数
|
|
133
|
+
humanWrittenSegments: number; // 人类编写段数
|
|
134
|
+
segments: SegmentResult[]; // 分段详细结果
|
|
135
|
+
summary: string; // 摘要
|
|
136
|
+
recommendations: string[]; // 建议
|
|
137
|
+
metadata: { // 元数据
|
|
138
|
+
model: string;
|
|
139
|
+
threshold: number;
|
|
140
|
+
timestamp: string;
|
|
141
|
+
language: string;
|
|
142
|
+
};
|
|
143
|
+
}
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### `analyzeCodeSegments(code, options?)`
|
|
147
|
+
|
|
148
|
+
分析代码段,返回每段的检测结果。
|
|
149
|
+
|
|
150
|
+
### `calculatePerplexity(code, options?)`
|
|
151
|
+
|
|
152
|
+
计算代码的困惑度值。
|
|
153
|
+
|
|
154
|
+
### `ReportGenerator`
|
|
155
|
+
|
|
156
|
+
报告生成器类,支持格式化输出。
|
|
157
|
+
|
|
158
|
+
```typescript
|
|
159
|
+
import { ReportGenerator } from '@cc-tools/ai-code-detector';
|
|
160
|
+
|
|
161
|
+
const generator = new ReportGenerator(options);
|
|
162
|
+
await generator.initialize();
|
|
163
|
+
|
|
164
|
+
const report = await generator.generateReport(code);
|
|
165
|
+
console.log(generator.formatReportAsText(report));
|
|
166
|
+
console.log(generator.formatReportAsJSON(report));
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## 示例输出
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
============================================================
|
|
173
|
+
AI 代码检测报告
|
|
174
|
+
============================================================
|
|
175
|
+
|
|
176
|
+
检测时间: 2024-01-15T10:30:00.000Z
|
|
177
|
+
使用模型: Xenova/codebert-base
|
|
178
|
+
检测语言: typescript
|
|
179
|
+
阈值设置: 15
|
|
180
|
+
|
|
181
|
+
------------------------------------------------------------
|
|
182
|
+
检测结果
|
|
183
|
+
------------------------------------------------------------
|
|
184
|
+
|
|
185
|
+
整体评分: 35/100
|
|
186
|
+
判定结果: ⚠️ 可能是AI生成
|
|
187
|
+
总代码段: 5
|
|
188
|
+
AI生成段: 4
|
|
189
|
+
人类编写段: 1
|
|
190
|
+
|
|
191
|
+
------------------------------------------------------------
|
|
192
|
+
摘要
|
|
193
|
+
------------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
该代码显示出明显的AI生成特征。约80%的代码段被判定为AI生成,
|
|
196
|
+
平均困惑度为8.52,低于设定的阈值。
|
|
197
|
+
|
|
198
|
+
------------------------------------------------------------
|
|
199
|
+
建议
|
|
200
|
+
------------------------------------------------------------
|
|
201
|
+
|
|
202
|
+
1. 所有代码段都显示出AI生成的特征,建议检查代码的原创性
|
|
203
|
+
2. AI生成置信度较高,建议添加更多人工注释和定制化逻辑
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## 注意事项
|
|
207
|
+
|
|
208
|
+
1. **模型加载**: 首次使用时会自动下载模型,可能需要一些时间
|
|
209
|
+
2. **阈值调整**: 默认阈值 15.0 适用于大多数场景,可根据实际情况调整
|
|
210
|
+
3. **检测结果**: 仅供参考,不能作为判断代码来源的唯一依据
|
|
211
|
+
4. **性能**: 大型代码文件可能需要较长时间处理
|
|
212
|
+
|
|
213
|
+
## 开发
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
# 克隆仓库
|
|
217
|
+
git clone https://github.com/cc-cc/cc-tools.git
|
|
218
|
+
|
|
219
|
+
# 进入包目录
|
|
220
|
+
cd cc-tools/packages/ai-code-detector
|
|
221
|
+
|
|
222
|
+
# 安装依赖
|
|
223
|
+
npm install
|
|
224
|
+
|
|
225
|
+
# 构建
|
|
226
|
+
npm run build
|
|
227
|
+
|
|
228
|
+
# 测试
|
|
229
|
+
node dist/cli.js -i ./src/index.ts
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
## License
|
|
233
|
+
|
|
234
|
+
MIT
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":""}
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { detectAICode, DEFAULT_OPTIONS } from './index.js';
|
|
3
|
+
import { readFileSync, readdirSync, statSync } from 'fs';
|
|
4
|
+
import { join, extname } from 'path';
|
|
5
|
+
/**
 * Parses the command-line flags found in `process.argv`.
 *
 * Recognized flags: `-i/--input`, `-o/--output`, `-t/--threshold`,
 * `-m/--model`, `-f/--format` (each consumes the following argument) and
 * `-h/--help` (prints the help text and exits with status 0).
 * Unknown arguments are ignored.
 *
 * The process exits with status 1 when a flag is given without a value or
 * when the threshold is not a finite number; a NaN threshold would make
 * every `perplexity < threshold` comparison false downstream.
 *
 * @returns {{input?: string, output?: string, threshold?: number, model?: string, format?: string}}
 *   Only the options that were actually supplied.
 */
function parseArgs() {
    const args = process.argv.slice(2);
    const options = {};
    // Fetches the value that must follow `flag`; aborts when it is missing.
    const takeValue = (flag, index) => {
        const value = args[index];
        if (value === undefined) {
            console.error(`Error: Missing value for ${flag}`);
            process.exit(1);
        }
        return value;
    };
    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        switch (arg) {
            case '-i':
            case '--input':
                options.input = takeValue(arg, ++i);
                break;
            case '-o':
            case '--output':
                options.output = takeValue(arg, ++i);
                break;
            case '-t':
            case '--threshold': {
                const raw = takeValue(arg, ++i);
                const threshold = parseFloat(raw);
                if (!Number.isFinite(threshold)) {
                    console.error(`Error: Invalid threshold "${raw}", expected a number`);
                    process.exit(1);
                }
                options.threshold = threshold;
                break;
            }
            case '-m':
            case '--model':
                options.model = takeValue(arg, ++i);
                break;
            case '-f':
            case '--format':
                options.format = takeValue(arg, ++i);
                break;
            case '-h':
            case '--help':
                printHelp();
                process.exit(0);
        }
    }
    return options;
}
|
|
39
|
+
/**
 * Prints the CLI usage text (available options and example invocations)
 * to standard output via `console.log`. Takes no arguments and returns
 * nothing.
 */
function printHelp() {
console.log(`
AI Code Detector - 基于困惑度的AI代码检测工具

用法: ai-code-detector [选项]

选项:
-i, --input <path> 输入文件或目录路径
-o, --output <path> 输出报告文件路径 (可选)
-t, --threshold <num> 困惑度阈值 (默认: 15.0)
-m, --model <name> 使用的模型名称 (默认: Xenova/codebert-base)
-f, --format <type> 输出格式: text 或 json (默认: text)
-h, --help 显示帮助信息

示例:
ai-code-detector --input ./src/index.ts
ai-code-detector -i ./code.js -t 20 -f json
ai-code-detector -i ./project/ -o report.txt
`);
}
|
|
59
|
+
/**
 * Reads a source file as UTF-8 text.
 *
 * On failure the error is reported on stderr, including the underlying
 * reason (missing file, permissions, ...), and the process exits with
 * status 1.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string} The file contents.
 */
function readCodeFromFile(filePath) {
    try {
        return readFileSync(filePath, 'utf-8');
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`Error reading file: ${filePath} (${reason})`);
        process.exit(1);
    }
}
|
|
68
|
+
// File extensions treated as analyzable source code. Built once instead of
// on every call. Besides the original list it accepts further common
// extensions of languages that were already accepted (.cc/.cxx/.hh for C++,
// .mjs/.cjs/.mts/.cts for JavaScript/TypeScript, .kts for Kotlin).
const VALID_CODE_EXTENSIONS = new Set([
    '.js', '.mjs', '.cjs', '.ts', '.mts', '.cts', '.jsx', '.tsx',
    '.py', '.java', '.go', '.rs',
    '.cpp', '.cc', '.cxx', '.c', '.h', '.hpp', '.hh',
    '.cs', '.rb', '.php', '.swift', '.kt', '.kts', '.scala', '.sql',
]);
/**
 * Tells whether a path looks like a source file this tool can analyze,
 * judged only by its (case-insensitive) file extension.
 *
 * @param {string} filePath - File name or path.
 * @returns {boolean} True when the extension is a known source extension.
 */
function isValidCodeFile(filePath) {
    return VALID_CODE_EXTENSIONS.has(extname(filePath).toLowerCase());
}
|
|
76
|
+
/**
 * CLI entry point: reads the input file (or every recognized source file
 * directly inside the input directory, concatenated), runs the detection
 * and prints the report or writes it to the `--output` file.
 *
 * Exit status is 0 after a completed detection and 1 when the input is
 * missing/unreadable, the directory holds no source files, or detection
 * throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const options = parseArgs();
    if (!options.input) {
        console.error('Error: Please specify input file or directory');
        printHelp();
        process.exit(1);
    }
    // A missing or inaccessible path must produce a clean error, not an
    // unhandled promise rejection.
    let stats;
    try {
        stats = statSync(options.input);
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`Error: Cannot access input path: ${options.input} (${reason})`);
        process.exit(1);
    }
    let code = '';
    let fileName = options.input;
    if (stats.isDirectory()) {
        const codeFiles = readdirSync(options.input)
            .map((entry) => join(options.input, entry))
            // throwIfNoEntry:false keeps a dangling symlink from aborting the scan.
            .filter((candidate) => statSync(candidate, { throwIfNoEntry: false })?.isFile() && isValidCodeFile(candidate));
        if (codeFiles.length === 0) {
            console.error('No valid code files found in directory');
            process.exit(1);
        }
        code = codeFiles.map((file) => readCodeFromFile(file)).join('\n\n');
        fileName = `${options.input} (${codeFiles.length} files)`;
    }
    else {
        code = readCodeFromFile(options.input);
    }
    console.log(`\n正在分析代码: ${fileName}`);
    console.log('加载模型中...\n');
    // Only override a default when the flag was actually given; an explicit
    // `undefined` would otherwise replace the default value.
    const detectOptions = { ...DEFAULT_OPTIONS };
    if (options.threshold !== undefined) {
        detectOptions.threshold = options.threshold;
    }
    if (options.model !== undefined) {
        detectOptions.model = options.model;
    }
    try {
        const report = await detectAICode(code, detectOptions);
        const format = options.format || 'text';
        let output;
        if (format === 'json') {
            output = JSON.stringify(report, null, 2);
        }
        else {
            // The text formatter is only needed (and loaded) for text output.
            const { ReportGenerator } = await import('./report.js');
            const generator = new ReportGenerator(detectOptions);
            output = generator.formatReportAsText(report);
        }
        if (options.output) {
            const { writeFileSync } = await import('fs');
            writeFileSync(options.output, output, 'utf-8');
            console.log(`\n报告已保存到: ${options.output}`);
        }
        else {
            console.log(output);
        }
        // The verdict does not influence the exit status.
        process.exit(0);
    }
    catch (error) {
        console.error('Error during detection:', error);
        process.exit(1);
    }
}
|
|
135
|
+
main();
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { CodeSegment, DetectionOptions, SegmentResult } from './types.js';
|
|
2
|
+
/**
 * Splits source text into line-ranged segments: logical blocks are found
 * from brace depth and block-opening keywords, and a block whose text is
 * longer than `chunkSize` characters is split into smaller pieces.
 */
export declare class CodeSegmenter {
    /** Maximum segment length in characters before a block is split. */
    private chunkSize;
    /** Overlap setting taken from the options. */
    private overlapSize;
    /**
     * @param options - `chunkSize` and `overlapSize` are read; package
     *   defaults are used when they are not set.
     */
    constructor(options?: DetectionOptions);
    /**
     * Segments the given code.
     * @param code - Full source text.
     * @returns Segments with their text and 1-based `startLine`/`endLine`.
     */
    segmentCode(code: string): CodeSegment[];
    /** Finds block line ranges and fills the gaps between them. */
    private identifyLogicalBlocks;
    /** Tests whether a line begins with a block keyword. */
    private isBlockStart;
    /** Splits one oversized line range into chunk-sized segments. */
    private splitLargeBlock;
}
|
|
11
|
+
/**
 * Classifies code segments as AI-generated or not by comparing each
 * segment's perplexity with a threshold.
 */
export declare class AICodeDetector {
    /** Computes the perplexity of each segment. */
    private perplexityCalculator;
    /** Splits the input code into segments. */
    private segmenter;
    /** Segments with a perplexity below this value are flagged as AI-generated. */
    private threshold;
    /** Options passed to the constructor. */
    private options;
    /**
     * @param options - Detection options, also forwarded to the perplexity
     *   calculator and the segmenter.
     */
    constructor(options?: DetectionOptions);
    /** Initializes the underlying perplexity calculator. */
    initialize(): Promise<void>;
    /**
     * Segments the code and evaluates every segment.
     * @param code - Full source text.
     * @returns One result per segment: the segment, its perplexity, the
     *   AI-generated flag and a confidence value from 0 to 100.
     */
    detect(code: string): Promise<SegmentResult[]>;
    /** Maps the distance between perplexity and threshold to 0-100. */
    private calculateConfidence;
    /** Returns the current threshold. */
    getThreshold(): number;
    /** Replaces the current threshold. */
    setThreshold(threshold: number): void;
}
|
|
23
|
+
//# sourceMappingURL=detector.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detector.d.ts","sourceRoot":"","sources":["../src/detector.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,gBAAgB,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAI/E,qBAAa,aAAa;IACxB,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;gBAEhB,OAAO,GAAE,gBAAqB;IAK1C,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,WAAW,EAAE;IAwBxC,OAAO,CAAC,qBAAqB;IAyC7B,OAAO,CAAC,YAAY;IAMpB,OAAO,CAAC,eAAe;CAgCxB;AAED,qBAAa,cAAc;IACzB,OAAO,CAAC,oBAAoB,CAAuB;IACnD,OAAO,CAAC,SAAS,CAAgB;IACjC,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,OAAO,CAAmB;gBAEtB,OAAO,GAAE,gBAAqB;IAOpC,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAI3B,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC;IAoBpD,OAAO,CAAC,mBAAmB;IAO3B,YAAY,IAAI,MAAM;IAItB,YAAY,CAAC,SAAS,EAAE,MAAM,GAAG,IAAI;CAGtC"}
|
package/dist/detector.js
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import { PerplexityCalculator } from './perplexity.js';
|
|
2
|
+
import { DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP_SIZE, DEFAULT_THRESHOLD } from './types.js';
|
|
3
|
+
/**
 * Splits source text into line-ranged segments.
 *
 * Logical blocks are detected from brace depth and block-opening keywords;
 * lines between blocks become their own segments. A block whose text is
 * longer than `chunkSize` characters is cut into chunks of whole lines, and
 * consecutive chunks share up to `overlapSize` characters of trailing lines.
 */
export class CodeSegmenter {
    // Matches a block keyword only as a whole word, so identifiers such as
    // `format(...)` or `iffy` are not mistaken for `for` / `if`.
    static #blockStartPattern = /^(?:function|class|interface|if|for|while|switch|try)\b/;
    chunkSize;
    overlapSize;
    /**
     * @param {{chunkSize?: number, overlapSize?: number}} [options]
     *   `chunkSize` must be positive and `overlapSize` non-negative;
     *   anything else falls back to the package default. An explicit
     *   `overlapSize: 0` is honored.
     */
    constructor(options = {}) {
        // A non-positive chunk size would keep splitLargeBlock from advancing.
        this.chunkSize = options.chunkSize > 0 ? options.chunkSize : DEFAULT_CHUNK_SIZE;
        this.overlapSize = options.overlapSize >= 0 ? options.overlapSize : DEFAULT_OVERLAP_SIZE;
    }
    /**
     * Segments the given code.
     *
     * @param {string} code - Full source text.
     * @returns {{content: string, startLine: number, endLine: number}[]}
     *   Segments in source order with 1-based inclusive line numbers.
     */
    segmentCode(code) {
        const lines = code.split('\n');
        const segments = [];
        const logicalBlocks = this.identifyLogicalBlocks(lines);
        for (const block of logicalBlocks) {
            const blockContent = lines.slice(block.start, block.end + 1).join('\n');
            if (blockContent.length > this.chunkSize) {
                segments.push(...this.splitLargeBlock(lines, block.start, block.end));
            }
            else {
                segments.push({
                    content: blockContent,
                    startLine: block.start + 1,
                    endLine: block.end + 1,
                });
            }
        }
        return segments;
    }
    /**
     * Finds block ranges (0-based, inclusive) covering every line.
     *
     * A block opens on a line containing `{` or starting with a block
     * keyword and closes on the line whose closing brace brings the brace
     * depth back to the level at which it opened. Braces inside strings or
     * comments are counted like any other. Lines not covered by a block are
     * returned as gap ranges; when no block is found the whole input is one
     * range.
     *
     * @param {string[]} lines
     * @returns {{start: number, end: number}[]}
     */
    identifyLogicalBlocks(lines) {
        const blocks = [];
        let currentBlock = null;
        let depth = 0;
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            const openBraces = (line.match(/{/g) || []).length;
            const closeBraces = (line.match(/}/g) || []).length;
            if (currentBlock === null && (openBraces > 0 || this.isBlockStart(line))) {
                currentBlock = { start: i, depth };
            }
            depth += openBraces - closeBraces;
            if (currentBlock !== null && depth === currentBlock.depth && closeBraces > 0) {
                blocks.push({ start: currentBlock.start, end: i });
                currentBlock = null;
            }
        }
        if (blocks.length === 0) {
            return [{ start: 0, end: lines.length - 1 }];
        }
        // Insert gap ranges so that the result covers every line exactly once.
        let lastEnd = -1;
        for (let i = 0; i < blocks.length; i++) {
            if (blocks[i].start > lastEnd + 1) {
                blocks.splice(i, 0, { start: lastEnd + 1, end: blocks[i].start - 1 });
            }
            lastEnd = blocks[i].end;
        }
        if (lastEnd < lines.length - 1) {
            blocks.push({ start: lastEnd + 1, end: lines.length - 1 });
        }
        return blocks;
    }
    /**
     * Tells whether a line starts (after indentation) with a block keyword.
     *
     * @param {string} line
     * @returns {boolean}
     */
    isBlockStart(line) {
        return CodeSegmenter.#blockStartPattern.test(line.trim());
    }
    /**
     * Cuts the line range `startLine..endLine` (0-based, inclusive) into
     * chunks of whole lines. Lines are added to a chunk until its length
     * reaches `chunkSize` characters. The next chunk starts far enough back
     * to repeat up to `overlapSize` characters of the previous chunk's last
     * lines, but always starts after the previous chunk's first line, so
     * the loop is guaranteed to advance.
     *
     * @param {string[]} lines
     * @param {number} startLine
     * @param {number} endLine
     * @returns {{content: string, startLine: number, endLine: number}[]}
     */
    splitLargeBlock(lines, startLine, endLine) {
        const segments = [];
        // Overlap must stay below the chunk size, otherwise a chunk could
        // consist solely of repeated lines.
        const maxOverlap = Math.min(this.overlapSize, this.chunkSize - 1);
        let currentStart = startLine;
        while (currentStart <= endLine) {
            let currentEnd = currentStart; // exclusive end index
            let currentLength = 0;
            while (currentEnd <= endLine && currentLength < this.chunkSize) {
                currentLength += lines[currentEnd].length + 1;
                currentEnd++;
            }
            segments.push({
                content: lines.slice(currentStart, currentEnd).join('\n'),
                startLine: currentStart + 1,
                endLine: currentEnd,
            });
            if (currentEnd > endLine) {
                break; // the block is fully covered
            }
            // Step back over whole lines while they fit into the overlap budget.
            let nextStart = currentEnd;
            let overlapLength = 0;
            while (nextStart - 1 > currentStart) {
                const lineLength = lines[nextStart - 1].length + 1;
                if (overlapLength + lineLength > maxOverlap) {
                    break;
                }
                overlapLength += lineLength;
                nextStart--;
            }
            currentStart = nextStart;
        }
        return segments;
    }
}
|
|
90
|
+
/**
 * Classifies code segments as AI-generated or human-written by comparing
 * each segment's perplexity with a threshold: a perplexity below the
 * threshold is flagged as AI-generated.
 */
export class AICodeDetector {
    perplexityCalculator;
    segmenter;
    threshold;
    options;
    /**
     * @param {object} [options] - Detection options; `threshold` is read
     *   here, and the whole object is forwarded to the perplexity
     *   calculator and the segmenter.
     */
    constructor(options = {}) {
        this.options = options;
        this.threshold = options.threshold || DEFAULT_THRESHOLD;
        this.perplexityCalculator = new PerplexityCalculator(options);
        this.segmenter = new CodeSegmenter(options);
    }
    /**
     * Initializes the perplexity calculator.
     *
     * @returns {Promise<void>}
     */
    async initialize() {
        await this.perplexityCalculator.initialize();
    }
    /**
     * Segments the code and evaluates the segments one after another.
     *
     * Whitespace-only segments (typically the blank lines between blocks)
     * are skipped: in this file's `PerplexityCalculator` empty text yields
     * a perplexity of 0, which is below any positive threshold, so they
     * would be reported as AI-generated. When every segment is blank, all
     * of them are evaluated so that non-empty input never produces an
     * empty result.
     *
     * @param {string} code - Full source text.
     * @returns {Promise<{segment: object, perplexity: number, isAIGenerated: boolean, confidence: number}[]>}
     */
    async detect(code) {
        const segments = this.segmenter.segmentCode(code);
        const meaningful = segments.filter((segment) => segment.content.trim().length > 0);
        const toAnalyze = meaningful.length > 0 ? meaningful : segments;
        const results = [];
        for (const segment of toAnalyze) {
            const perplexity = await this.perplexityCalculator.calculatePerplexity(segment.content);
            results.push({
                segment,
                perplexity,
                isAIGenerated: perplexity < this.threshold,
                confidence: this.calculateConfidence(perplexity),
            });
        }
        return results;
    }
    /**
     * Converts the distance between a perplexity and the threshold into a
     * confidence percentage; a distance of 20 or more maps to 100.
     *
     * @param {number} perplexity
     * @returns {number} Integer from 0 to 100.
     */
    calculateConfidence(perplexity) {
        const distance = Math.abs(perplexity - this.threshold);
        const maxDistance = 20;
        const confidence = Math.min(distance / maxDistance, 1) * 100;
        return Math.round(confidence);
    }
    /**
     * @returns {number} The current threshold.
     */
    getThreshold() {
        return this.threshold;
    }
    /**
     * @param {number} threshold - New threshold.
     */
    setThreshold(threshold) {
        this.threshold = threshold;
    }
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { ReportGenerator } from './report.js';
|
|
2
|
+
import { AICodeDetector } from './detector.js';
|
|
3
|
+
import { PerplexityCalculator } from './perplexity.js';
|
|
4
|
+
import type { DetectionOptions, DetectionReport, SegmentResult } from './types.js';
|
|
5
|
+
export { AICodeDetector, PerplexityCalculator, ReportGenerator, };
|
|
6
|
+
export type { DetectionOptions, DetectionReport, SegmentResult, CodeSegment, } from './types.js';
|
|
7
|
+
/**
 * Runs a full detection: creates a `ReportGenerator` with the options,
 * initializes it and returns the report it generates for `code`.
 */
export declare function detectAICode(code: string, options?: DetectionOptions): Promise<DetectionReport>;
/**
 * Creates and initializes an `AICodeDetector` and returns its per-segment
 * results for `code`.
 */
export declare function analyzeCodeSegments(code: string, options?: DetectionOptions): Promise<SegmentResult[]>;
/**
 * Creates and initializes a `PerplexityCalculator` and returns the
 * perplexity it computes for `code`.
 */
export declare function calculatePerplexity(code: string, options?: DetectionOptions): Promise<number>;
/**
 * Default option values: model `Xenova/codebert-base`, threshold 15,
 * chunkSize 512, overlapSize 50.
 */
export declare const DEFAULT_OPTIONS: DetectionOptions;
|
|
11
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAC/C,OAAO,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AACvD,OAAO,KAAK,EAAE,gBAAgB,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAEnF,OAAO,EACL,cAAc,EACd,oBAAoB,EACpB,eAAe,GAChB,CAAC;AAEF,YAAY,EACV,gBAAgB,EAChB,eAAe,EACf,aAAa,EACb,WAAW,GACZ,MAAM,YAAY,CAAC;AAEpB,wBAAsB,YAAY,CAChC,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,gBAAqB,GAC7B,OAAO,CAAC,eAAe,CAAC,CAI1B;AAED,wBAAsB,mBAAmB,CACvC,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,gBAAqB,GAC7B,OAAO,CAAC,aAAa,EAAE,CAAC,CAI1B;AAED,wBAAsB,mBAAmB,CACvC,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,gBAAqB,GAC7B,OAAO,CAAC,MAAM,CAAC,CAIjB;AAED,eAAO,MAAM,eAAe,EAAE,gBAK7B,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { ReportGenerator } from './report.js';
|
|
2
|
+
import { AICodeDetector } from './detector.js';
|
|
3
|
+
import { PerplexityCalculator } from './perplexity.js';
|
|
4
|
+
export { AICodeDetector, PerplexityCalculator, ReportGenerator, };
|
|
5
|
+
/**
 * Produces a full detection report for a piece of source code.
 *
 * @param {string} code - Source text to analyze.
 * @param {object} [options] - Detection options, handed to the report
 *   generator both at construction and at report generation.
 * @returns {Promise<object>} The report from `ReportGenerator.generateReport`.
 */
export async function detectAICode(code, options = {}) {
    const reportGenerator = new ReportGenerator(options);
    await reportGenerator.initialize();
    const report = await reportGenerator.generateReport(code, options);
    return report;
}
|
|
10
|
+
/**
 * Evaluates the segments of a piece of source code without building a
 * full report.
 *
 * @param {string} code - Source text to analyze.
 * @param {object} [options] - Detection options for the detector.
 * @returns {Promise<object[]>} The results from `AICodeDetector.detect`.
 */
export async function analyzeCodeSegments(code, options = {}) {
    const codeDetector = new AICodeDetector(options);
    await codeDetector.initialize();
    const segmentResults = await codeDetector.detect(code);
    return segmentResults;
}
|
|
15
|
+
/**
 * Computes the perplexity of a piece of source code.
 *
 * @param {string} code - Source text to score.
 * @param {object} [options] - Options for the perplexity calculator.
 * @returns {Promise<number>} The value from
 *   `PerplexityCalculator.calculatePerplexity`.
 */
export async function calculatePerplexity(code, options = {}) {
    const perplexityCalculator = new PerplexityCalculator(options);
    await perplexityCalculator.initialize();
    const value = await perplexityCalculator.calculatePerplexity(code);
    return value;
}
|
|
20
|
+
/**
 * Default detection options exported for callers; the CLI spreads this
 * object and then applies the command-line overrides.
 */
export const DEFAULT_OPTIONS = {
    // Model name handed to the fill-mask pipeline.
    model: 'Xenova/codebert-base',
    // Segments with a perplexity below this value are flagged as AI-generated.
    threshold: 15.0,
    // Segment length in characters above which a block is split.
    chunkSize: 512,
    // Overlap setting passed to the segmenter.
    overlapSize: 50,
};
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { DetectionOptions, ModelConfig } from './types.js';
|
|
2
|
+
/**
 * Estimates how "surprising" a piece of code is, using a fill-mask model
 * when one could be loaded and text heuristics otherwise.
 */
export declare class PerplexityCalculator {
    /** Name of the model to load. */
    private model;
    /** Initialized to null. */
    private tokenizer;
    /** Loaded fill-mask pipeline, or null when loading failed or has not happened. */
    private model_instance;
    /** True once `initialize()` has run, whether or not the model loaded. */
    private initialized;
    /** @param options - `model` is read; a default model name is used otherwise. */
    constructor(options?: DetectionOptions);
    /**
     * Loads the fill-mask pipeline once. A load failure is reported as a
     * warning and the calculator continues with the heuristic fallback.
     */
    initialize(): Promise<void>;
    /**
     * Computes the perplexity of `text`, initializing first if needed.
     * @returns 0 for blank text or text with fewer than two tokens,
     *   otherwise a value clamped to the range 1-100.
     */
    calculatePerplexity(text: string): Promise<number>;
    /** Splits code into whitespace/punctuation separated tokens. */
    private tokenizeCode;
    /** Masks sampled tokens and scores the model's predictions. */
    private computePerplexityWithMasking;
    /** Character-overlap similarity between two tokens, 0-1. */
    private tokenSimilarity;
    /** Model-free score built from the pattern measures below. */
    private heuristicPerplexity;
    /** Number of distinct adjacent token pairs occurring more than twice. */
    private countRepeatedPatterns;
    /** Number of tokens that are common keywords. */
    private countCommonKeywords;
    /** Indentation variance of non-blank lines, scaled and capped at 10. */
    private calculateStructureRegularity;
    /** Share of identifiers using the dominant naming style, times 5. */
    private calculateNamingConsistency;
    /** Fraction of lines that start with a comment marker. */
    private calculateCommentRatio;
    /** Standard deviation of the line lengths. */
    private calculateLineLengthVariance;
    /** Returns the model name, a max token count of 512 and the default threshold. */
    getModelInfo(): ModelConfig;
}
|
|
22
|
+
//# sourceMappingURL=perplexity.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"perplexity.d.ts","sourceRoot":"","sources":["../src/perplexity.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,gBAAgB,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAKhE,qBAAa,oBAAoB;IAC/B,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,SAAS,CAAa;IAC9B,OAAO,CAAC,cAAc,CAAa;IACnC,OAAO,CAAC,WAAW,CAAkB;gBAEzB,OAAO,GAAE,gBAAqB;IAIpC,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAgB3B,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAqBxD,OAAO,CAAC,YAAY;YASN,4BAA4B;IA8C1C,OAAO,CAAC,eAAe;IAWvB,OAAO,CAAC,mBAAmB;IA0B3B,OAAO,CAAC,qBAAqB;IAS7B,OAAO,CAAC,mBAAmB;IAS3B,OAAO,CAAC,4BAA4B;IAcpC,OAAO,CAAC,0BAA0B;IAalC,OAAO,CAAC,qBAAqB;IAY7B,OAAO,CAAC,2BAA2B;IAUnC,YAAY,IAAI,WAAW;CAO5B"}
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import { pipeline, env } from '@xenova/transformers';
|
|
2
|
+
import { DEFAULT_MODEL, DEFAULT_THRESHOLD } from './types.js';
|
|
3
|
+
env.allowLocalModels = false;
|
|
4
|
+
/**
 * Estimates how "surprising" a piece of code is.
 *
 * When the fill-mask model could be loaded, a sample of tokens is masked
 * and the model's top prediction is compared with the original token.
 * Otherwise, or when no prediction succeeds, a heuristic based on textual
 * patterns is used. Results for non-trivial input are clamped to 1-100.
 */
export class PerplexityCalculator {
    model;
    tokenizer = null;
    model_instance = null;
    initialized = false;
    /**
     * @param {{model?: string}} [options] - `model` names the fill-mask
     *   model; the package default is used when it is not set.
     */
    constructor(options = {}) {
        this.model = options.model || DEFAULT_MODEL;
    }
    /**
     * Loads the fill-mask pipeline once. A load failure is not fatal: the
     * reason is logged as a warning and the calculator continues with the
     * heuristic fallback.
     *
     * @returns {Promise<void>}
     */
    async initialize() {
        if (this.initialized) {
            return;
        }
        try {
            console.log(`Loading model: ${this.model}...`);
            this.model_instance = await pipeline('fill-mask', this.model, {
                quantized: true,
            });
            this.initialized = true;
            console.log('Model loaded successfully');
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            console.warn(`Failed to load primary model, using fallback approach: ${reason}`);
            this.initialized = true;
        }
    }
    /**
     * Computes the perplexity of `text`, initializing first if needed.
     *
     * @param {string} text - Code to score.
     * @returns {Promise<number>} 0 for blank text or text with fewer than
     *   two tokens, otherwise a value from 1 to 100.
     */
    async calculatePerplexity(text) {
        if (!this.initialized) {
            await this.initialize();
        }
        if (!text || text.trim().length === 0) {
            return 0;
        }
        const tokens = this.tokenizeCode(text);
        if (tokens.length < 2) {
            return 0;
        }
        try {
            return await this.computePerplexityWithMasking(text, tokens);
        }
        catch (error) {
            return this.heuristicPerplexity(text, tokens);
        }
    }
    /**
     * Splits code into tokens: brackets, semicolons and commas become
     * tokens of their own, everything else is split on whitespace.
     *
     * @param {string} code
     * @returns {string[]}
     */
    tokenizeCode(code) {
        return code
            .replace(/[{}()\[\];,]/g, ' $& ')
            .replace(/\s+/g, ' ')
            .trim()
            .split(' ')
            .filter(t => t.length > 0);
    }
    /**
     * Masks up to 10 randomly chosen inner tokens, one at a time, and
     * scores how close the model's top prediction is to the original.
     *
     * Only predictions that actually succeeded contribute to the average.
     * A failed model call is skipped rather than counted as a perfect
     * prediction, which would drag the result toward the minimum of 1.
     * When no prediction succeeds the heuristic result is returned.
     *
     * @param {string} code - Original text (used by the fallback).
     * @param {string[]} tokens - Tokens of `code`.
     * @returns {Promise<number>} Value from 1 to 100.
     */
    async computePerplexityWithMasking(code, tokens) {
        if (!this.model_instance) {
            return this.heuristicPerplexity(code, tokens);
        }
        // Use the mask token of the loaded tokenizer when it exposes one;
        // it depends on the model family. '[MASK]' is the previous literal.
        const maskToken = this.model_instance.tokenizer?.mask_token ?? '[MASK]';
        const sampleSize = Math.min(10, Math.floor(tokens.length / 3));
        const sampleIndices = new Set();
        for (let i = 0; i < sampleSize; i++) {
            // Never mask the first or last token; duplicates collapse in the Set.
            sampleIndices.add(Math.floor(Math.random() * (tokens.length - 2)) + 1);
        }
        let totalLogProb = 0;
        let validSamples = 0;
        for (const idx of sampleIndices) {
            const maskedTokens = [...tokens];
            const originalToken = maskedTokens[idx];
            maskedTokens[idx] = maskToken;
            const maskedText = maskedTokens.join(' ');
            try {
                const results = await this.model_instance(maskedText);
                const topResult = Array.isArray(results) ? results[0] : results;
                const predictedToken = topResult?.token_str || '';
                const similarity = this.tokenSimilarity(originalToken, predictedToken);
                // Floor at 0.01 so a complete miss stays finite.
                totalLogProb += Math.log(Math.max(similarity, 0.01));
                validSamples++;
            }
            catch {
                // This sample produced no prediction; leave it out of the average.
            }
        }
        if (validSamples === 0) {
            return this.heuristicPerplexity(code, tokens);
        }
        const avgLogProb = totalLogProb / validSamples;
        const perplexity = Math.exp(-avgLogProb);
        return Math.max(1, Math.min(100, perplexity));
    }
    /**
     * Similarity of two tokens: 1 when equal, 0.9 when equal ignoring
     * case, otherwise the share of `token1`'s characters that occur in
     * `token2`, relative to the longer token.
     *
     * @param {string} token1
     * @param {string} token2
     * @returns {number} Value from 0 to 1.
     */
    tokenSimilarity(token1, token2) {
        if (token1 === token2) {
            return 1.0;
        }
        const lower1 = token1.toLowerCase();
        const lower2 = token2.toLowerCase();
        if (lower1 === lower2) {
            return 0.9;
        }
        const commonChars = [...lower1].filter(c => lower2.includes(c)).length;
        return commonChars / Math.max(token1.length, token2.length, 1);
    }
    /**
     * Model-free estimate: starts from 15 and adds weighted pattern
     * measures (repetition, keywords, indentation variance, naming
     * consistency, line-length spread, token reuse) while subtracting for
     * comment density.
     *
     * @param {string} code
     * @param {string[]} tokens - Tokens of `code`; must not be empty.
     * @returns {number} Value from 1 to 100.
     */
    heuristicPerplexity(code, tokens) {
        let score = 0;
        score += this.countRepeatedPatterns(tokens) * 2;
        score += this.countCommonKeywords(tokens) * 1.5;
        score += this.calculateStructureRegularity(code) * 3;
        score += this.calculateNamingConsistency(tokens) * 2;
        score -= this.calculateCommentRatio(code) * 5;
        score += this.calculateLineLengthVariance(code) * 0.5;
        const uniquenessRatio = new Set(tokens).size / tokens.length;
        score += (1 - uniquenessRatio) * 10;
        return Math.max(1, Math.min(100, 15 + score));
    }
    /**
     * Counts the distinct adjacent token pairs that occur more than twice.
     *
     * @param {string[]} tokens
     * @returns {number}
     */
    countRepeatedPatterns(tokens) {
        const pairCounts = new Map();
        for (let i = 0; i < tokens.length - 1; i++) {
            const pair = `${tokens[i]} ${tokens[i + 1]}`;
            pairCounts.set(pair, (pairCounts.get(pair) || 0) + 1);
        }
        return [...pairCounts.values()].filter(c => c > 2).length;
    }
    /**
     * Counts the tokens that are common JavaScript/TypeScript keywords.
     *
     * @param {string[]} tokens
     * @returns {number}
     */
    countCommonKeywords(tokens) {
        const keywords = new Set([
            'function', 'const', 'let', 'var', 'if', 'else', 'for', 'while',
            'return', 'import', 'export', 'class', 'interface', 'type',
            'async', 'await', 'try', 'catch', 'throw', 'new', 'this'
        ]);
        return tokens.filter(t => keywords.has(t)).length;
    }
    /**
     * Variance of the indentation width over non-blank lines, divided by
     * 10 and capped at 10; 0 when there are fewer than two such lines.
     *
     * @param {string} code
     * @returns {number}
     */
    calculateStructureRegularity(code) {
        const lines = code.split('\n');
        const indentations = lines
            .filter(l => l.trim().length > 0)
            .map(l => l.match(/^\s*/)?.[0].length || 0);
        if (indentations.length < 2) {
            return 0;
        }
        const avgIndent = indentations.reduce((a, b) => a + b, 0) / indentations.length;
        const variance = indentations.reduce((sum, i) => sum + Math.pow(i - avgIndent, 2), 0) / indentations.length;
        return Math.min(10, variance / 10);
    }
    /**
     * Share of identifiers written in the dominant style (camelCase or
     * containing an underscore), scaled to 0-5; 0 when there are fewer
     * than three identifiers.
     *
     * @param {string[]} tokens
     * @returns {number}
     */
    calculateNamingConsistency(tokens) {
        const identifiers = tokens.filter(t => /^[a-zA-Z_][a-zA-Z0-9_]*$/.test(t));
        if (identifiers.length < 3) {
            return 0;
        }
        const camelCase = identifiers.filter(t => /[a-z][A-Z]/.test(t)).length;
        const snakeCase = identifiers.filter(t => t.includes('_')).length;
        const dominant = Math.max(camelCase, snakeCase);
        return (dominant / identifiers.length) * 5;
    }
    /**
     * Fraction of lines that start (after indentation) with a comment
     * marker: `//`, `#`, `/*` or `*`.
     *
     * @param {string} code
     * @returns {number} Value from 0 to 1.
     */
    calculateCommentRatio(code) {
        const lines = code.split('\n');
        const commentLines = lines.filter((l) => {
            const trimmed = l.trim();
            return trimmed.startsWith('//') ||
                trimmed.startsWith('#') ||
                trimmed.startsWith('/*') ||
                trimmed.startsWith('*');
        }).length;
        return commentLines / lines.length;
    }
    /**
     * Standard deviation of the line lengths; 0 for a single line.
     *
     * @param {string} code
     * @returns {number}
     */
    calculateLineLengthVariance(code) {
        const lengths = code.split('\n').map(l => l.length);
        if (lengths.length < 2) {
            return 0;
        }
        const avg = lengths.reduce((a, b) => a + b, 0) / lengths.length;
        const variance = lengths.reduce((sum, l) => sum + Math.pow(l - avg, 2), 0) / lengths.length;
        return Math.sqrt(variance);
    }
    /**
     * @returns {{name: string, maxTokens: number, defaultThreshold: number}}
     *   The model name, a fixed max token count of 512 and the package's
     *   default threshold.
     */
    getModelInfo() {
        return {
            name: this.model,
            maxTokens: 512,
            defaultThreshold: DEFAULT_THRESHOLD,
        };
    }
}
|
package/dist/report.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { DetectionReport, DetectionOptions } from './types.js';
|
|
2
|
+
/**
 * Type declaration for the report builder implemented in report.js.
 * It wraps a detector instance and turns its per-segment results into a
 * DetectionReport, which can be rendered as plain text or JSON.
 */
export declare class ReportGenerator {
|
|
3
|
+
/** Detector instance created in the constructor from the given options. */
private detector;
|
|
4
|
+
/** @param options Detection options forwarded to the underlying detector. */
constructor(options?: DetectionOptions);
|
|
5
|
+
/** Awaits the underlying detector's own initialize(). */
initialize(): Promise<void>;
|
|
6
|
+
/** Runs detection on `code` and aggregates the segment results into a report. */
generateReport(code: string, options?: DetectionOptions): Promise<DetectionReport>;
|
|
7
|
+
/** Builds the one-sentence summary text stored in the report. */
private generateSummary;
|
|
8
|
+
/** Builds the list of recommendation strings stored in the report. */
private generateRecommendations;
|
|
9
|
+
/** Regex-based guess of the source language; yields 'unknown' when nothing matches. */
private detectLanguage;
|
|
10
|
+
/** Renders a report as a multi-line plain-text document. */
formatReportAsText(report: DetectionReport): string;
|
|
11
|
+
/** Serializes a report as JSON indented with two spaces. */
formatReportAsJSON(report: DetectionReport): string;
|
|
12
|
+
}
|
|
13
|
+
//# sourceMappingURL=report.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"report.d.ts","sourceRoot":"","sources":["../src/report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAiB,eAAe,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAGnF,qBAAa,eAAe;IAC1B,OAAO,CAAC,QAAQ,CAAiB;gBAErB,OAAO,GAAE,gBAAqB;IAIpC,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAI3B,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,CAAC;IA8BxF,OAAO,CAAC,eAAe;IAQvB,OAAO,CAAC,uBAAuB;IA2B/B,OAAO,CAAC,cAAc;IAoBtB,kBAAkB,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM;IAsDnD,kBAAkB,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM;CAGpD"}
|
package/dist/report.js
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import { AICodeDetector } from './detector.js';
|
|
2
|
+
/**
 * Turns the per-segment results of an AICodeDetector into an aggregated
 * detection report and renders that report as plain text or JSON.
 */
export class ReportGenerator {
    /** Detector instance that performs the per-segment analysis. */
    detector;

    /**
     * @param {object} [options] - Detection options, forwarded unchanged to AICodeDetector.
     */
    constructor(options = {}) {
        this.detector = new AICodeDetector(options);
    }

    /**
     * Prepares the underlying detector. Must complete before generateReport is used.
     * @returns {Promise<void>}
     */
    async initialize() {
        await this.detector.initialize();
    }

    /**
     * Runs detection on `code` and aggregates the segment results.
     *
     * When the detector yields no segments the report is still well-formed:
     * the average perplexity and the overall score are reported as 0 and the
     * code is not flagged as AI generated (previously these values were NaN).
     *
     * @param {string} code - Source text to analyse.
     * @param {object} [options] - Only `threshold` is read here; it is echoed in the
     *   report metadata. The detector itself keeps using its own threshold.
     * @returns {Promise<object>} The aggregated detection report.
     */
    async generateReport(code, options) {
        const threshold = options?.threshold || this.detector.getThreshold();
        const results = await this.detector.detect(code);
        const total = results.length;
        const aiSegments = results.filter(r => r.isAIGenerated);
        const humanSegments = results.filter(r => !r.isAIGenerated);
        // Guard every per-segment average: dividing by zero would leak NaN into the report.
        const avgPerplexity = total > 0
            ? results.reduce((sum, r) => sum + r.perplexity, 0) / total
            : 0;
        const aiRatio = total > 0 ? aiSegments.length / total : 0;
        const isAIGenerated = aiRatio > 0.5;
        // Lower perplexity means more AI-like, so the score is the inverted perplexity.
        const overallScore = total > 0
            ? Math.round((1 - avgPerplexity / 100) * 100)
            : 0;
        return {
            overallScore: Math.max(0, Math.min(100, overallScore)),
            isAIGenerated,
            totalSegments: total,
            aiGeneratedSegments: aiSegments.length,
            humanWrittenSegments: humanSegments.length,
            segments: results,
            summary: this.generateSummary(isAIGenerated, aiRatio, avgPerplexity),
            recommendations: this.generateRecommendations(results, threshold),
            metadata: {
                model: 'Xenova/codebert-base',
                threshold,
                timestamp: new Date().toISOString(),
                language: this.detectLanguage(code),
            },
        };
    }

    /**
     * Builds the one-sentence summary shown in the report.
     *
     * @param {boolean} isAIGenerated - Overall verdict.
     * @param {number} aiRatio - Fraction (0..1) of segments flagged as AI generated.
     * @param {number} avgPerplexity - Mean perplexity over all segments.
     * @returns {string} Summary sentence.
     */
    generateSummary(isAIGenerated, aiRatio, avgPerplexity) {
        if (isAIGenerated) {
            return `该代码显示出明显的AI生成特征。约${Math.round(aiRatio * 100)}%的代码段被判定为AI生成,平均困惑度为${avgPerplexity.toFixed(2)},低于设定的阈值。`;
        }
        else {
            return `该代码更可能由人类编写。只有约${Math.round(aiRatio * 100)}%的代码段显示出AI生成的特征,平均困惑度为${avgPerplexity.toFixed(2)}。`;
        }
    }

    /**
     * Derives recommendation messages from the segment results.
     *
     * @param {Array} results - Segment results (each with `isAIGenerated` and `confidence`).
     * @param {number} threshold - Accepted for interface compatibility; not used.
     * @returns {string[]} At least one recommendation message.
     */
    generateRecommendations(results, threshold) {
        const recommendations = [];
        const aiSegments = results.filter(r => r.isAIGenerated);
        // An empty result list must not be reported as "every segment is AI generated".
        if (results.length > 0 && aiSegments.length === results.length) {
            recommendations.push('所有代码段都显示出AI生成的特征,建议检查代码的原创性');
        }
        if (aiSegments.length > 0) {
            const avgConfidence = aiSegments.reduce((sum, r) => sum + r.confidence, 0) / aiSegments.length;
            if (avgConfidence > 80) {
                recommendations.push('AI生成置信度较高,建议添加更多人工注释和定制化逻辑');
            }
        }
        const lowConfidenceResults = results.filter(r => r.confidence < 30);
        if (lowConfidenceResults.length > 0) {
            recommendations.push('部分代码段置信度较低,可能是混合代码或边界情况');
        }
        if (recommendations.length === 0) {
            recommendations.push('代码检测结果正常,未发现明显异常');
        }
        return recommendations;
    }

    /**
     * Guesses the programming language of `code` from a few regex heuristics.
     *
     * Every language is scored by the number of its patterns that match. The
     * language with the most matches wins, provided at least two match; ties
     * keep the earlier entry. Because TypeScript is a superset of JavaScript,
     * a JavaScript win is upgraded to TypeScript whenever at least two
     * TypeScript-specific patterns also match.
     *
     * @param {string} code - Source text to classify.
     * @returns {string} One of the known language keys, or 'unknown'.
     */
    detectLanguage(code) {
        const patterns = {
            // `=>` is matched without \b: a word boundary never sits next to the
            // spaces or parentheses that normally surround an arrow.
            javascript: [/\bconst\b/, /\blet\b/, /\bfunction\b/, /=>/, /\bimport\b.*\bfrom\b/],
            typescript: [/\binterface\b/, /\btype\b.*=/, /:\s*(string|number|boolean|any)\b/],
            python: [/\bdef\b/, /\bimport\b.*\n/, /\bclass\b.*:/, /\bself\b/],
            java: [/\bpublic\b.*\bclass\b/, /\bprivate\b/, /\bSystem\.out\.println\b/],
            go: [/\bfunc\b/, /\bpackage\b/, /\bimport\b.*\(/],
            rust: [/\bfn\b/, /\blet\s+mut\b/, /\bimpl\b/, /\buse\b.*::/],
        };
        const scores = Object.fromEntries(Object.entries(patterns).map(([lang, regexes]) => [lang, regexes.filter(r => r.test(code)).length]));
        let best = 'unknown';
        let bestScore = 1; // a language needs at least two matching patterns
        for (const [lang, score] of Object.entries(scores)) {
            if (score > bestScore) {
                best = lang;
                bestScore = score;
            }
        }
        if (best === 'javascript' && scores.typescript >= 2) {
            return 'typescript';
        }
        return best;
    }

    /**
     * Renders a report as a multi-line plain-text document.
     *
     * @param {object} report - Report produced by generateReport.
     * @returns {string} Text joined with '\n'.
     */
    formatReportAsText(report) {
        const lines = [];
        const heavyRule = '='.repeat(60);
        const lightRule = '-'.repeat(60);
        // Emits a section heading framed by light rules and followed by a blank line.
        const section = (title) => {
            lines.push(lightRule, title, lightRule, '');
        };
        lines.push(heavyRule);
        lines.push(' AI 代码检测报告');
        lines.push(heavyRule);
        lines.push('');
        lines.push(`检测时间: ${report.metadata.timestamp}`);
        lines.push(`使用模型: ${report.metadata.model}`);
        lines.push(`检测语言: ${report.metadata.language}`);
        lines.push(`阈值设置: ${report.metadata.threshold}`);
        lines.push('');
        section(' 检测结果');
        lines.push(`整体评分: ${report.overallScore}/100`);
        lines.push(`判定结果: ${report.isAIGenerated ? '⚠️ 可能是AI生成' : '✅ 可能是人类编写'}`);
        lines.push(`总代码段: ${report.totalSegments}`);
        lines.push(`AI生成段: ${report.aiGeneratedSegments}`);
        lines.push(`人类编写段: ${report.humanWrittenSegments}`);
        lines.push('');
        section(' 摘要');
        lines.push(report.summary);
        lines.push('');
        section(' 建议');
        report.recommendations.forEach((rec, idx) => {
            lines.push(`${idx + 1}. ${rec}`);
        });
        lines.push('');
        section(' 详细分段结果');
        report.segments.forEach((seg, idx) => {
            lines.push(`[${idx + 1}] 行 ${seg.segment.startLine}-${seg.segment.endLine}`);
            lines.push(` 困惑度: ${seg.perplexity.toFixed(2)}`);
            lines.push(` 结果: ${seg.isAIGenerated ? 'AI生成' : '人类编写'}`);
            lines.push(` 置信度: ${seg.confidence}%`);
            lines.push('');
        });
        lines.push(heavyRule);
        return lines.join('\n');
    }

    /**
     * Serializes a report as JSON indented with two spaces.
     *
     * @param {object} report - Report produced by generateReport.
     * @returns {string} JSON text.
     */
    formatReportAsJSON(report) {
        return JSON.stringify(report, null, 2);
    }
}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
 * Optional settings accepted by the detector and by ReportGenerator.
 * Every field may be omitted.
 */
export interface DetectionOptions {
|
|
2
|
+
/** Model identifier (presumably falls back to DEFAULT_MODEL when omitted; verify in detector.js). */
model?: string;
|
|
3
|
+
/** Perplexity threshold; ReportGenerator falls back to the detector's threshold when this is missing or falsy. */
threshold?: number;
|
|
4
|
+
/** Segment size (unit not visible here; presumably relates to DEFAULT_CHUNK_SIZE; TODO confirm). */
chunkSize?: number;
|
|
5
|
+
/** Overlap between neighbouring segments (presumably relates to DEFAULT_OVERLAP_SIZE; TODO confirm). */
overlapSize?: number;
|
|
6
|
+
/** Language hint for the analysed code (how it is consumed is not visible here). */
language?: string;
|
|
7
|
+
}
|
|
8
|
+
/** A contiguous slice of the analysed source together with its line range. */
export interface CodeSegment {
|
|
9
|
+
/** Text of the segment. */
content: string;
|
|
10
|
+
/** First line of the segment (numbering base not visible here; presumably 1-based). */
startLine: number;
|
|
11
|
+
/** Last line of the segment; the text report prints the range as "startLine-endLine". */
endLine: number;
|
|
12
|
+
}
|
|
13
|
+
/** Detection outcome for a single code segment. */
export interface SegmentResult {
|
|
14
|
+
/** The segment this result refers to. */
segment: CodeSegment;
|
|
15
|
+
/** Perplexity score computed for the segment. */
perplexity: number;
|
|
16
|
+
/** Whether the segment was classified as AI generated. */
isAIGenerated: boolean;
|
|
17
|
+
/** Confidence of the classification; the text report prints it with a '%' suffix. */
confidence: number;
|
|
18
|
+
}
|
|
19
|
+
/** Aggregated result produced by ReportGenerator.generateReport. */
export interface DetectionReport {
|
|
20
|
+
/** Score clamped to 0..100, derived from the average segment perplexity (lower perplexity gives a higher score). */
overallScore: number;
|
|
21
|
+
/** True when more than half of the segments were classified as AI generated. */
isAIGenerated: boolean;
|
|
22
|
+
/** Number of analysed segments. */
totalSegments: number;
|
|
23
|
+
/** Number of segments classified as AI generated. */
aiGeneratedSegments: number;
|
|
24
|
+
/** Number of segments not classified as AI generated. */
humanWrittenSegments: number;
|
|
25
|
+
/** Per-segment results, as returned by the detector. */
segments: SegmentResult[];
|
|
26
|
+
/** One-sentence summary of the verdict. */
summary: string;
|
|
27
|
+
/** Recommendation messages; generateReport always supplies at least one. */
recommendations: string[];
|
|
28
|
+
/** Context in which the report was produced. */
metadata: {
|
|
29
|
+
/** Model name recorded in the report. */
model: string;
|
|
30
|
+
/** Threshold recorded in the report. */
threshold: number;
|
|
31
|
+
/** Creation time as an ISO 8601 string (Date.prototype.toISOString). */
timestamp: string;
|
|
32
|
+
/** Language guessed from the code, or 'unknown'. */
language: string;
|
|
33
|
+
};
|
|
34
|
+
}
|
|
35
|
+
/** Static description of a model; matches the object shape returned by getModelInfo() in perplexity.js. */
export interface ModelConfig {
|
|
36
|
+
/** Model identifier. */
name: string;
|
|
37
|
+
/** Token limit reported for the model (getModelInfo reports 512). */
maxTokens: number;
|
|
38
|
+
/** Threshold reported as the model's default. */
defaultThreshold: number;
|
|
39
|
+
}
|
|
40
|
+
/** Default model identifier. */
export declare const DEFAULT_MODEL = "Xenova/codebert-base";
|
|
41
|
+
/** Default threshold; also reported by getModelInfo() as `defaultThreshold`. */
export declare const DEFAULT_THRESHOLD = 15;
|
|
42
|
+
/** Default chunk size (presumably the fallback for DetectionOptions.chunkSize; TODO confirm). */
export declare const DEFAULT_CHUNK_SIZE = 512;
|
|
43
|
+
/** Default overlap size (presumably the fallback for DetectionOptions.overlapSize; TODO confirm). */
export declare const DEFAULT_OVERLAP_SIZE = 50;
|
|
44
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,WAAW,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,OAAO,CAAC;IACvB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,eAAe;IAC9B,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,OAAO,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,mBAAmB,EAAE,MAAM,CAAC;IAC5B,oBAAoB,EAAE,MAAM,CAAC;IAC7B,QAAQ,EAAE,aAAa,EAAE,CAAC;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,QAAQ,EAAE;QACR,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,MAAM,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,QAAQ,EAAE,MAAM,CAAC;KAClB,CAAC;CACH;AAED,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;CAC1B;AAED,eAAO,MAAM,aAAa,yBAAyB,CAAC;AACpD,eAAO,MAAM,iBAAiB,KAAO,CAAC;AACtC,eAAO,MAAM,kBAAkB,MAAM,CAAC;AACtC,eAAO,MAAM,oBAAoB,KAAK,CAAC"}
|
package/dist/types.js
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "ai-code-detector",
|
|
3
|
+
"displayName": "AI Code Detector",
|
|
4
|
+
"description": "基于困惑度(Perplexity)检测AI生成代码的npm包,支持多种编程语言,提供CLI和API两种使用方式",
|
|
5
|
+
"version": "0.0.1",
|
|
6
|
+
"private": false,
|
|
7
|
+
"license": "MIT",
|
|
8
|
+
"author": {
|
|
9
|
+
"name": "CC",
|
|
10
|
+
"email": "463752412@qq.com"
|
|
11
|
+
},
|
|
12
|
+
"main": "./dist/index.js",
|
|
13
|
+
"types": "./dist/index.d.ts",
|
|
14
|
+
"type": "module",
|
|
15
|
+
"exports": {
|
|
16
|
+
".": {
|
|
17
|
+
"import": "./dist/index.js",
|
|
18
|
+
"types": "./dist/index.d.ts"
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"bin": {
|
|
22
|
+
"ai-code-detector": "./dist/cli.js"
|
|
23
|
+
},
|
|
24
|
+
"files": [
|
|
25
|
+
"dist",
|
|
26
|
+
"README.md",
|
|
27
|
+
"LICENSE"
|
|
28
|
+
],
|
|
29
|
+
"scripts": {
|
|
30
|
+
"build": "tsc",
|
|
31
|
+
"prepublishOnly": "npm run build"
|
|
32
|
+
},
|
|
33
|
+
"repository": {
|
|
34
|
+
"type": "git",
|
|
35
|
+
"url": "https://github.com/ccOfHome/ai-code-detector.git"
|
|
36
|
+
},
|
|
37
|
+
"homepage": "https://github.com/ccOfHome/ai-code-detector#readme",
|
|
38
|
+
"bugs": {
|
|
39
|
+
"url": "https://github.com/ccOfHome/ai-code-detector/issues"
|
|
40
|
+
},
|
|
41
|
+
"keywords": [
|
|
42
|
+
"ai",
|
|
43
|
+
"code",
|
|
44
|
+
"detection",
|
|
45
|
+
"perplexity",
|
|
46
|
+
"llm",
|
|
47
|
+
"ai-generated",
|
|
48
|
+
"code-analysis",
|
|
49
|
+
"chatgpt",
|
|
50
|
+
"copilot",
|
|
51
|
+
"transformers"
|
|
52
|
+
],
|
|
53
|
+
"engines": {
|
|
54
|
+
"node": ">=18.0.0"
|
|
55
|
+
},
|
|
56
|
+
"dependencies": {
|
|
57
|
+
"@xenova/transformers": "^2.17.2"
|
|
58
|
+
},
|
|
59
|
+
"devDependencies": {
|
|
60
|
+
"@types/node": "^22",
|
|
61
|
+
"typescript": "^5.9.2"
|
|
62
|
+
}
|
|
63
|
+
}
|