paperfit-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/.claude/commands/adjust-length.md +21 -0
  2. package/.claude/commands/check-visual.md +27 -0
  3. package/.claude/commands/fix-layout.md +31 -0
  4. package/.claude/commands/migrate-template.md +23 -0
  5. package/.claude/commands/repair-table.md +21 -0
  6. package/.claude/commands/show-status.md +32 -0
  7. package/.claude-plugin/README.md +77 -0
  8. package/.claude-plugin/marketplace.json +41 -0
  9. package/.claude-plugin/plugin.json +39 -0
  10. package/CLAUDE.md +266 -0
  11. package/CONTRIBUTING.md +131 -0
  12. package/LICENSE +21 -0
  13. package/README.md +164 -0
  14. package/agents/code-surgeon-agent.md +214 -0
  15. package/agents/layout-detective-agent.md +229 -0
  16. package/agents/orchestrator-agent.md +254 -0
  17. package/agents/quality-gatekeeper-agent.md +270 -0
  18. package/agents/rule-engine-agent.md +224 -0
  19. package/agents/semantic-polish-agent.md +250 -0
  20. package/bin/paperfit.js +176 -0
  21. package/config/agent_roles.yaml +56 -0
  22. package/config/layout_rules.yaml +54 -0
  23. package/config/templates.yaml +241 -0
  24. package/config/vto_taxonomy.yaml +489 -0
  25. package/config/writing_rules.yaml +64 -0
  26. package/install.sh +30 -0
  27. package/package.json +52 -0
  28. package/requirements.txt +5 -0
  29. package/scripts/benchmark_runner.py +629 -0
  30. package/scripts/compile.sh +244 -0
  31. package/scripts/config_validator.py +339 -0
  32. package/scripts/cv_detector.py +600 -0
  33. package/scripts/evidence_collector.py +167 -0
  34. package/scripts/float_fixers.py +861 -0
  35. package/scripts/inject_defects.py +549 -0
  36. package/scripts/install-claude-global.js +148 -0
  37. package/scripts/install.js +66 -0
  38. package/scripts/install.sh +106 -0
  39. package/scripts/overflow_fixers.py +656 -0
  40. package/scripts/package-for-opensource.sh +138 -0
  41. package/scripts/parse_log.py +260 -0
  42. package/scripts/postinstall.js +38 -0
  43. package/scripts/pre_tool_use.py +265 -0
  44. package/scripts/render_pages.py +244 -0
  45. package/scripts/session_logger.py +329 -0
  46. package/scripts/space_util_fixers.py +773 -0
  47. package/scripts/state_manager.py +352 -0
  48. package/scripts/test_commands.py +187 -0
  49. package/scripts/test_cv_detector.py +214 -0
  50. package/scripts/test_integration.py +290 -0
  51. package/skills/consistency-polisher/SKILL.md +337 -0
  52. package/skills/float-optimizer/SKILL.md +284 -0
  53. package/skills/latex_fixers/__init__.py +82 -0
  54. package/skills/latex_fixers/float_fixers.py +392 -0
  55. package/skills/latex_fixers/fullwidth_fixers.py +375 -0
  56. package/skills/latex_fixers/overflow_fixers.py +250 -0
  57. package/skills/latex_fixers/semantic_micro_tuning.py +362 -0
  58. package/skills/latex_fixers/space_util_fixers.py +389 -0
  59. package/skills/latex_fixers/utils.py +55 -0
  60. package/skills/overflow-repair/SKILL.md +304 -0
  61. package/skills/space-util-fixer/SKILL.md +307 -0
  62. package/skills/taxonomy-vto/SKILL.md +486 -0
  63. package/skills/template-migrator/SKILL.md +251 -0
  64. package/skills/visual-inspector/SKILL.md +217 -0
  65. package/skills/writing-polish/SKILL.md +289 -0
@@ -0,0 +1,138 @@
1
#!/usr/bin/env bash
#
# PaperFit open-source packaging script.
#
# Steps:
#   1. Copy the core project directories and files into a clean target dir
#   2. Skip all local data, build artifacts and personal configuration
#   3. Strip compilation leftovers from the copied tree
#
# Usage:
#   ./scripts/package-for-opensource.sh [target-dir]
#

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
TARGET_DIR="${1:-$PROJECT_ROOT/dist}"

echo "============================================================"
echo "PaperFit 开源打包脚本"
echo "============================================================"
echo ""
echo "源目录:$PROJECT_ROOT"
echo "目标目录:$TARGET_DIR"
echo ""

# Create the target directory
mkdir -p "$TARGET_DIR"

# Core directories to ship
CORE_DIRS=(
    "agents"
    "bin"
    "config"
    "docs"
    "scripts"
    "skills"
)

# Core top-level files to ship
CORE_FILES=(
    "CLAUDE.md"
    "README.md"
    "package.json"
    "package-lock.json"
    "requirements.txt"
    ".gitignore"
)

# Artifact patterns removed from the copied tree after the copy.
# FIX: this array now drives the cleanup loop below. Previously it was
# declared but never used — the cleanup duplicated the same list as a
# series of hard-coded `find` commands, which could silently drift.
EXCLUDE_PATTERNS=(
    "*.aux"
    "*.log"
    "*.out"
    "*.bbl"
    "*.blg"
    "*.fls"
    "*.fdb_latexmk"
    "*.pdf"
    "*.png"
    "*.jpg"
    "*.DS_Store"
    "__pycache__"
    "*.pyc"
    "*.pyo"
    ".DS_Store"
)

echo "正在复制核心文件..."

# Copy core directories (only those that exist)
for dir in "${CORE_DIRS[@]}"; do
    if [ -d "$PROJECT_ROOT/$dir" ]; then
        echo " → 复制 $dir/"
        cp -r "$PROJECT_ROOT/$dir" "$TARGET_DIR/"
    fi
done

# Copy core files (only those that exist)
for file in "${CORE_FILES[@]}"; do
    if [ -f "$PROJECT_ROOT/$file" ]; then
        echo " → 复制 $file"
        cp "$PROJECT_ROOT/$file" "$TARGET_DIR/"
    fi
done

# Copy the .claude directory, excluding personal configuration
echo " → 复制 .claude/ (排除敏感配置)..."
mkdir -p "$TARGET_DIR/.claude"
if [ -d "$PROJECT_ROOT/.claude/commands" ]; then
    cp -r "$PROJECT_ROOT/.claude/commands" "$TARGET_DIR/.claude/"
fi
# settings.json and settings.local.json are intentionally NOT copied (personal config)

# Recreate the data/ directory layout without any actual data
echo " → 创建 data/ 目录结构..."
mkdir -p "$TARGET_DIR/data"
mkdir -p "$TARGET_DIR/data/benchmarks/samples"

# Sample files can be copied selectively if desired:
# if [ -d "$PROJECT_ROOT/data/benchmarks/samples" ]; then
#     cp -r "$PROJECT_ROOT/data/benchmarks/samples" "$TARGET_DIR/data/benchmarks/"
# fi

# Clean build artifacts, driven by EXCLUDE_PATTERNS
echo ""
echo "正在清理编译产物..."

for pattern in "${EXCLUDE_PATTERNS[@]}"; do
    if [ "$pattern" = "__pycache__" ]; then
        # Directory pattern: remove whole trees; tolerate races with `|| true`
        find "$TARGET_DIR" -type d -name "$pattern" -exec rm -rf {} + 2>/dev/null || true
    else
        find "$TARGET_DIR" -type f -name "$pattern" -delete
    fi
done

echo ""
echo "============================================================"
echo "打包完成!"
echo "============================================================"
echo ""
echo "发布的文件位于:$TARGET_DIR"
echo ""
echo "下一步操作:"
echo " 1. cd $TARGET_DIR"
echo " 2. git init"
echo " 3. git add -A"
echo " 4. git commit -m 'Initial commit: PaperFit VTO System'"
echo " 5. git remote add origin <your-repo-url>"
echo " 6. git push -u origin main"
echo ""
@@ -0,0 +1,260 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LaTeX 编译日志解析器
4
+
5
+ 解析 LaTeX 编译生成的 .log 文件,提取错误、警告、Overfull/Underfull hbox
6
+ 等信息,并输出结构化的 JSON 报告,供 rule-engine-agent 使用。
7
+
8
+ 用法:
9
+ python parse_log.py <log_file> [--output <json_file>] [--verbose]
10
+
11
+ 示例:
12
+ python parse_log.py compile.log --output log_report.json
13
+ """
14
+
15
+ import re
16
+ import json
17
+ import argparse
18
+ import sys
19
+ from pathlib import Path
20
+ from typing import List, Dict, Any, Optional
21
+
22
+
23
class LogParser:
    """Parse a LaTeX compile log into a structured report.

    Extracts errors, Overfull/Underfull hbox warnings, undefined
    references and citations, float warnings and package warnings from
    a .log file produced by (pdf)latex, and exposes them as a JSON-able
    dict for the rule-engine-agent.
    """

    def __init__(self, log_path: str):
        self.log_path = Path(log_path)
        self.content = ""
        self.errors: List[Dict] = []
        self.warnings: List[Dict] = []
        self.overfull_hbox: List[Dict] = []
        self.underfull_hbox: List[Dict] = []
        self.undefined_refs: List[str] = []
        self.citation_issues: List[str] = []
        self.float_warnings: List[Dict] = []
        self.package_warnings: List[Dict] = []
        self.compile_success = True

    def parse(self) -> Dict[str, Any]:
        """Run the full parse and return the structured report."""
        if not self.log_path.exists():
            return {"error": f"Log file not found: {self.log_path}"}

        # LaTeX logs often contain mixed/invalid byte sequences; ignore them.
        with open(self.log_path, 'r', encoding='utf-8', errors='ignore') as f:
            self.content = f.read()

        self._check_compile_success()
        self._extract_errors()
        self._extract_overfull()
        self._extract_underfull()
        self._extract_undefined_references()
        self._extract_citation_warnings()
        self._extract_float_warnings()
        self._extract_package_warnings()

        return self._build_report()

    def _check_compile_success(self) -> None:
        """Decide whether compilation succeeded from well-known log markers."""
        if "Output written on" in self.content:
            self.compile_success = True
        elif "Fatal error" in self.content or "Emergency stop" in self.content:
            self.compile_success = False
        else:
            # Default to success — the log may simply be incomplete.
            self.compile_success = True

    def _extract_errors(self) -> None:
        """Collect lines starting with '!' (TeX error messages)."""
        lines = self.content.split('\n')
        for i, line in enumerate(lines):
            if line.startswith('!'):
                error = {
                    "type": "LaTeX Error",
                    # Strip the leading "! " marker.
                    "message": line[2:].strip(),
                    "line": self._find_line_number(lines, i),
                    "context": self._extract_context(lines, i)
                }
                self.errors.append(error)

    def _extract_overfull(self) -> None:
        """Collect 'Overfull hbox' warnings.

        FIX: the pattern is now anchored with '$' (under MULTILINE).
        Without the anchor, the lazy '(.*?)' group matched the empty
        string, so the context was always '' and the 'at lines' range
        was always None (and the alignment subtype was never detected).
        """
        pattern = r'Overfull \\hbox \(([0-9.]+)pt too wide\) (.*?)(?: at lines ([0-9]+(?:--[0-9]+)?))?$'
        matches = re.finditer(pattern, self.content, re.MULTILINE)

        for match in matches:
            overflow_pt = float(match.group(1))
            context = match.group(2).strip()
            lines_range = match.group(3) if match.group(3) else None

            # 'in alignment' marks overflow inside a tabular-like environment.
            is_alignment = 'in alignment' in context

            entry = {
                "type": "Overfull hbox",
                "subtype": "alignment" if is_alignment else "paragraph",
                "overflow_pt": overflow_pt,
                "context": context,
                "lines": lines_range,
                # 5pt is the threshold between cosmetic and visible overflow.
                "severity": "major" if overflow_pt >= 5.0 else "minor"
            }
            self.overfull_hbox.append(entry)
            self.warnings.append(entry)

    def _extract_underfull(self) -> None:
        """Collect 'Underfull hbox' warnings.

        Same '$' anchoring fix as _extract_overfull — see that method.
        """
        pattern = r'Underfull \\hbox \(badness [0-9]+\) (.*?)(?: at lines ([0-9]+(?:--[0-9]+)?))?$'
        matches = re.finditer(pattern, self.content, re.MULTILINE)

        for match in matches:
            context = match.group(1).strip()
            lines_range = match.group(2) if match.group(2) else None

            entry = {
                "type": "Underfull hbox",
                "context": context,
                "lines": lines_range,
                "severity": "minor"
            }
            self.underfull_hbox.append(entry)
            self.warnings.append(entry)

    def _extract_undefined_references(self) -> None:
        """Collect 'Reference ... undefined' warnings."""
        pattern = r'LaTeX Warning: Reference `([^`]+)\' undefined'
        matches = re.finditer(pattern, self.content)

        for match in matches:
            ref = match.group(1)
            self.undefined_refs.append(ref)
            self.warnings.append({
                "type": "Undefined reference",
                "reference": ref,
                "severity": "major"
            })

    def _extract_citation_warnings(self) -> None:
        """Collect 'Citation ... undefined' warnings."""
        pattern = r'LaTeX Warning: Citation `([^`]+)\' .*undefined'
        matches = re.finditer(pattern, self.content)

        for match in matches:
            cite = match.group(1)
            self.citation_issues.append(cite)
            self.warnings.append({
                "type": "Undefined citation",
                "citation": cite,
                "severity": "major"
            })

    def _extract_float_warnings(self) -> None:
        """Collect 'Float too large for page' warnings."""
        pattern = r'LaTeX Warning: Float too large for page by ([0-9.]+)pt'
        matches = re.finditer(pattern, self.content)

        for match in matches:
            overflow_pt = float(match.group(1))
            entry = {
                "type": "Float too large",
                "overflow_pt": overflow_pt,
                "severity": "major"
            }
            self.float_warnings.append(entry)
            self.warnings.append(entry)

    def _extract_package_warnings(self) -> None:
        """Collect package warnings (hyperref, caption, ...).

        Same '$' anchoring fix as _extract_overfull: previously the lazy
        message group always matched empty and the line number was lost.
        """
        pattern = r'Package (\w+) Warning: (.*?)(?: on input line ([0-9]+))?$'
        matches = re.finditer(pattern, self.content, re.MULTILINE)

        for match in matches:
            package = match.group(1)
            message = match.group(2).strip()
            line = match.group(3) if match.group(3) else None

            entry = {
                "type": "Package warning",
                "package": package,
                "message": message,
                "line": line,
                "severity": "minor"
            }
            self.package_warnings.append(entry)
            self.warnings.append(entry)

    def _find_line_number(self, lines: List[str], current_idx: int) -> Optional[int]:
        """Look for a TeX 'l.<n>' marker within a few lines of the error."""
        for offset in range(-3, 4):
            idx = current_idx + offset
            if 0 <= idx < len(lines):
                match = re.search(r'l\.([0-9]+)', lines[idx])
                if match:
                    return int(match.group(1))
        return None

    def _extract_context(self, lines: List[str], error_idx: int) -> str:
        """Return up to two non-empty lines following the error line."""
        context_lines = []
        for offset in range(1, 5):
            idx = error_idx + offset
            if idx < len(lines) and lines[idx].strip():
                context_lines.append(lines[idx].strip())
        return ' '.join(context_lines[:2])

    def _build_report(self) -> Dict[str, Any]:
        """Assemble the final JSON-serializable report."""
        summary = {
            "errors": len(self.errors),
            "warnings": len(self.warnings),
            "overfull_hbox_total": len(self.overfull_hbox),
            "underfull_hbox_total": len(self.underfull_hbox),
            "undefined_references": len(self.undefined_refs),
            "citation_issues": len(self.citation_issues),
            "float_warnings": len(self.float_warnings),
            "package_warnings": len(self.package_warnings)
        }

        return {
            "parse_version": "1.0",
            "log_file": str(self.log_path),
            "compile_success": self.compile_success,
            "summary": summary,
            "errors": self.errors,
            "warnings": self.warnings,
            "overfull_hbox": self.overfull_hbox,
            "underfull_hbox": self.underfull_hbox,
            "undefined_references": self.undefined_refs,
            "citation_issues": self.citation_issues,
            "float_warnings": self.float_warnings,
            "package_warnings": self.package_warnings,
            # Any hard error blocks compilation.
            "compilation_blockers": self.errors
        }
234
+
235
+
236
def main():
    """CLI entry point: parse a LaTeX log and emit a JSON report."""
    arg_parser = argparse.ArgumentParser(description="解析 LaTeX 编译日志")
    arg_parser.add_argument("log_file", help="LaTeX .log 文件路径")
    arg_parser.add_argument("--output", "-o", help="输出 JSON 文件路径")
    arg_parser.add_argument("--verbose", "-v", action="store_true", help="详细输出")
    args = arg_parser.parse_args()

    report = LogParser(args.log_file).parse()

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as fh:
            json.dump(report, fh, indent=2, ensure_ascii=False)
        print(f"Report saved to {args.output}")

    # Echo to stdout when asked, or when no output file was requested.
    if args.verbose or not args.output:
        print(json.dumps(report, indent=2, ensure_ascii=False))

    # Exit status mirrors compile success so shell pipelines can branch on it.
    sys.exit(0 if report.get("compile_success", False) else 1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,38 @@
1
#!/usr/bin/env node
/**
 * Postinstall: print next steps. Heavy setup (pip, data dirs) belongs in the user's project or `paperfit doctor`.
 */

const path = require('path');
const fs = require('fs');

// Package root is one level above scripts/.
const pkgRoot = path.join(__dirname, '..');
// True when installed as a dependency (any path segment is node_modules),
// as opposed to a direct git checkout of the repo.
const insideNodeModules = pkgRoot.includes(`${path.sep}node_modules${path.sep}`);
// npm sets npm_config_global='true' for `npm install -g`.
const isGlobal = process.env.npm_config_global === 'true';

console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
console.log(' PaperFit (paperfit-cli) installed');
console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');

if (!insideNodeModules) {
  // Git checkout: optional lightweight data scaffold (no pip)
  const dataDirs = ['data/backups', 'data/benchmarks/case', 'data/pages', 'data/evidence'];
  for (const rel of dataDirs) {
    const p = path.join(pkgRoot, rel);
    // Create only missing directories; recursive covers nested paths.
    if (!fs.existsSync(p)) {
      fs.mkdirSync(p, { recursive: true });
    }
  }
}

console.log('Global Claude Code integration (recommended):');
console.log(' paperfit-install # copy commands/skills/agents → ~/.claude');
console.log(' paperfit-install --force # overwrite existing files\n');

console.log('CLI:');
console.log(' paperfit doctor # check Python, poppler, latexmk');
console.log(' paperfit init # verify environment in current dir\n');

// Extra hint for global installs, where the wrapper is on PATH everywhere.
if (isGlobal) {
  console.log('Tip: run `paperfit-install` once so /fix-layout and skills are available in any project.\n');
}
@@ -0,0 +1,265 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ PaperFit Pre-Tool Use Security Hook
4
+
5
+ Detects secrets and sensitive patterns before tool execution.
6
+ Inspired by ECC's beforeSubmitPrompt hook with sk-, ghp_, AKIA patterns.
7
+
8
+ Usage:
9
+ python pre_tool_use.py --check-secrets <content>
10
+ python pre_tool_use.py --check-file <file_path>
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import re
16
+ import sys
17
+ from pathlib import Path
18
+
19
# Secret patterns based on ECC security implementation.
# Each entry is (regex, human-readable name); matched with re.IGNORECASE
# by check_for_secrets().
SECRET_PATTERNS = [
    # API Keys
    (r'sk-[a-zA-Z0-9]{20,}', 'OpenAI API Key (sk-...)'),
    (r'sk-proj-[a-zA-Z0-9]{20,}', 'OpenAI Project Key'),
    (r'api[_-]?key[\"\'\s]*[:=]\s*[\"\'][a-zA-Z0-9]{16,}[\"\']', 'Generic API Key'),

    # GitHub
    (r'ghp_[a-zA-Z0-9]{36}', 'GitHub Personal Access Token'),
    (r'gho_[a-zA-Z0-9]{36}', 'GitHub OAuth Token'),
    (r'ghu_[a-zA-Z0-9]{36}', 'GitHub User Token'),
    (r'ghs_[a-zA-Z0-9]{36}', 'GitHub Server Token'),
    (r'ghr_[a-zA-Z0-9]{36}', 'GitHub Refresh Token'),

    # AWS
    (r'AKIA[0-9A-Z]{16}', 'AWS Access Key ID'),
    # NOTE: matches any 40-char base64-ish run — deliberately broad,
    # false positives expected.
    (r'[0-9a-zA-Z/+]{40}={0,2}', 'Possible AWS Secret Key'),

    # Azure
    # NOTE: matches any GUID, so non-secret GUIDs are flagged too.
    (r'[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}', 'Azure/GUID (potential secret)'),

    # Google
    # FIX: these previously used doubled backslashes ('ya29\\.',
    # '[\\-_]') inside raw strings, requiring a literal backslash in the
    # input — they could never match real Google tokens.
    (r'AIza[0-9A-Za-z\-_]{35}', 'Google Cloud API Key'),
    (r'ya29\.[0-9A-Za-z\-_]+', 'Google OAuth Token'),

    # Stripe
    (r'sk_live_[0-9a-zA-Z]{24,}', 'Stripe Secret Key'),
    (r'pk_live_[0-9a-zA-Z]{24,}', 'Stripe Publishable Key'),

    # Slack
    (r'xox[baprs]-[0-9a-zA-Z]{10,48}', 'Slack Token'),

    # Private Keys
    (r'-----BEGIN (RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----', 'Private Key'),
    (r'-----BEGIN PGP PRIVATE KEY BLOCK-----', 'PGP Private Key'),

    # Passwords in config
    (r'password[\"\'\s]*[:=]\s*[\"\'][^\"\']{8,}[\"\']', 'Hardcoded Password'),
    (r'passwd[\"\'\s]*[:=]\s*[\"\'][^\"\']{8,}[\"\']', 'Hardcoded Password (passwd)'),

    # JWT Tokens (header.payload.signature, base64url segments)
    # FIX: '\\.' in a raw string demanded a literal backslash before each
    # dot, so real JWTs never matched; '\.' matches the separator dot.
    (r'eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*', 'JWT Token'),

    # NPM Tokens
    (r'npm_[a-zA-Z0-9]{36}', 'NPM Access Token'),

    # Hugging Face
    (r'hf_[a-zA-Z]{34}', 'Hugging Face Token'),

    # Generic secrets
    (r'secret[\"\'\s]*[:=]\s*[\"\'][^\"\']{8,}[\"\']', 'Hardcoded Secret'),
    # FIX: '[\\s]' in a raw string is a class of backslash-or-'s', not
    # whitespace; '\s+' correctly requires whitespace after 'bearer'.
    (r'bearer\s+[a-zA-Z0-9\-_.]{20,}', 'Bearer Token'),
]

# Files that commonly contain secrets (compared case-insensitively
# against the basename / path suffix by check_sensitive_file()).
SENSITIVE_FILES = [
    '.env',
    '.env.local',
    '.env.production',
    '.env.development',
    '.env.test',
    '.git-credentials',
    '.netrc',
    '.pgpass',
    '.my.cnf',
    'credentials',
    'credentials.json',
    'credentials.yaml',
    'credentials.yml',
    'config.json',
    'config.yaml',
    'config.yml',
    'secrets.json',
    'secrets.yaml',
    'secrets.yml',
    '.dsn',
    'id_rsa',
    'id_dsa',
    'id_ecdsa',
    'id_ed25519',
    '.pem',
    '.key',
    '.p12',
    '.pfx',
]
104
+
105
+
106
def check_for_secrets(content: str, file_path: str = None) -> list:
    """Scan *content* against SECRET_PATTERNS and report suspected secrets.

    Args:
        content: The text content to scan.
        file_path: Optional originating path, accepted for context only.

    Returns:
        A list of finding dicts with pattern name, (truncated) match,
        offset, and severity.
    """
    placeholder_markers = ('example', 'your_', '<', '>', 'xxx', '***', '${', '{{')
    results = []

    for regex, label in SECRET_PATTERNS:
        label_lower = label.lower()
        for hit in re.finditer(regex, content, re.IGNORECASE):
            text = hit.group(0)

            # Skip obvious placeholders / templated examples.
            if any(marker in text.lower() for marker in placeholder_markers):
                continue

            # Very short hits are usually noise unless the pattern targets a key.
            if len(text) < 10 and 'key' not in label_lower:
                continue

            shown = text if len(text) <= 50 else text[:50] + '...'
            is_critical = 'private key' in label_lower or 'secret' in label_lower
            results.append({
                'pattern': label,
                'match': shown,
                'position': hit.start(),
                'severity': 'CRITICAL' if is_critical else 'HIGH'
            })

    return results
142
+
143
+
144
def check_sensitive_file(file_path: str) -> dict:
    """Classify whether *file_path* looks like a secrets-bearing file.

    Args:
        file_path: The file path to check.

    Returns:
        Dict with ``is_sensitive``, ``reason`` and ``risk_level`` keys.
    """
    lowered_path = file_path.lower()
    lowered_name = Path(file_path).name.lower()

    def _verdict(sensitive, reason, risk):
        # Single construction point for the result shape.
        return {'is_sensitive': sensitive, 'reason': reason, 'risk_level': risk}

    # Known sensitive file names: exact basename or path-suffix match.
    for candidate in SENSITIVE_FILES:
        if lowered_name == candidate or lowered_path.endswith(f'/{candidate}'):
            return _verdict(True, f'Matches sensitive file pattern: {candidate}', 'HIGH')

    # Extensions commonly used for keys and credentials.
    for ext in ('.key', '.pem', '.p12', '.pfx', '.dsn', '.credentials'):
        if lowered_path.endswith(ext):
            return _verdict(True, 'Sensitive file extension detected', 'HIGH')

    # Any .env variant (.env, .env.local, ...).
    if lowered_name.startswith('.env'):
        return _verdict(True, 'Environment file detected', 'MEDIUM')

    return _verdict(False, None, 'LOW')
188
+
189
+
190
def main():
    """CLI entry point for the security hook.

    Dispatches on three mutually exclusive-ish flags; exit code 1 means
    a secret or sensitive file was detected, 0 means clean.
    """
    parser = argparse.ArgumentParser(description='PaperFit Security Hook')
    parser.add_argument('--check-secrets', type=str, help='Check string content for secrets')
    parser.add_argument('--check-file', type=str, help='Check file for secrets')
    parser.add_argument('--check-file-sensitive', type=str, help='Check if file path is sensitive')
    parser.add_argument('--json', action='store_true', help='Output in JSON format')

    args = parser.parse_args()

    findings = []
    sensitive_file_result = None

    # Mode 1: scan a literal string passed on the command line.
    if args.check_secrets:
        findings = check_for_secrets(args.check_secrets)

    # Mode 2: scan a file's contents AND its path.
    if args.check_file:
        try:
            file_path = args.check_file
            content = Path(file_path).read_text(encoding='utf-8', errors='ignore')
            findings = check_for_secrets(content, file_path)

            # Also check if the file itself is sensitive
            sensitive_file_result = check_sensitive_file(file_path)
            if sensitive_file_result['is_sensitive']:
                findings.append({
                    'pattern': 'Sensitive File',
                    'match': sensitive_file_result['reason'],
                    'position': 0,
                    'severity': sensitive_file_result['risk_level']
                })
        except (OSError, UnicodeDecodeError) as e:
            if args.json:
                print(json.dumps({'error': str(e)}))
            else:
                print(f'Error reading file: {e}')
            sys.exit(1)

    # Mode 3: path-only sensitivity check.
    # NOTE(review): this branch exits immediately, so combining it with
    # --check-secrets/--check-file discards any secret findings above.
    if args.check_file_sensitive:
        sensitive_file_result = check_sensitive_file(args.check_file_sensitive)
        if args.json:
            print(json.dumps(sensitive_file_result, indent=2))
        else:
            if sensitive_file_result['is_sensitive']:
                print(f"⚠️  {sensitive_file_result['reason']}")
                print(f"   Risk Level: {sensitive_file_result['risk_level']}")
            else:
                print("✅ File does not match sensitive patterns")
        sys.exit(0 if not sensitive_file_result['is_sensitive'] else 1)

    # Output results
    if args.json:
        output = {
            'findings': findings,
            'finding_count': len(findings),
            'has_secrets': len(findings) > 0
        }
        if sensitive_file_result:
            output['sensitive_file'] = sensitive_file_result
        print(json.dumps(output, indent=2))
    else:
        if findings:
            print(f"\n🚨 SECURITY ALERT: {len(findings)} potential secret(s) detected\n")
            for i, finding in enumerate(findings, 1):
                print(f"  {i}. {finding['pattern']}")
                print(f"     Match: {finding['match']}")
                print(f"     Severity: {finding['severity']}")
                print()
            print("⚠️  Remove secrets before committing!\n")
            # Non-zero exit blocks the calling hook.
            sys.exit(1)
        else:
            print("✅ No secrets detected")
            sys.exit(0)


if __name__ == '__main__':
    main()