create-ccc-tutor 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/bin/cli.js +70 -9
- package/package.json +1 -1
- package/template/.claude/commands/exam.md +13 -0
- package/template/.claude/commands/slide.md +24 -5
- package/template/.claude-plugin/plugin.json +13 -26
- package/template/.codex/skills/exam/SKILL.md +13 -0
- package/template/.codex/skills/slide/SKILL.md +27 -3
- package/template/.harness/scripts/pdf-rag.sh +40 -0
- package/template/.harness/scripts/pdf_rag.py +485 -0
- package/template/.harness/scripts/requirements-pdf.txt +6 -0
- package/template/.harness/scripts/tests/test_pdf_rag.py +228 -0
- package/template/.harness/state/install.json +1 -1
- package/template/constitution.md +1 -1
- package/template/course/README.md +1 -1
- package/template/docs/features/pdf-vision-implementation.md +109 -0
- package/template/docs/features/pdf-vision.md +226 -0
- package/template/docs/features/slide-query-implementation.md +2 -2
- package/template/docs/features/slide-query.md +2 -0
- package/template/gitignore +4 -0
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""pdf_rag.py — 课件 PDF 的检索/建档/渲染引擎(pdf-vision 功能)。
|
|
3
|
+
|
|
4
|
+
模型无关:Claude 与 Codex 两侧的 /slide、/exam 都调它。输出 JSON 供技能消费。
|
|
5
|
+
|
|
6
|
+
子命令:
|
|
7
|
+
index --subject <s> [--dpi N] 建/更新该科目课件档(幂等)
|
|
8
|
+
query --subject <s> -q "<问题>" [-k K] 先 reconcile 再检索,输出命中页 JSON
|
|
9
|
+
render --pdf <file> --page N [--dpi N] 按需渲染单页 PNG,输出路径
|
|
10
|
+
|
|
11
|
+
设计见 docs/features/pdf-vision-implementation.md。混合检索:建档只嵌入页文字
|
|
12
|
+
(文字过少且有 OCR 则 OCR),看图发生在回答阶段(技能读 PNG)。缓存按
|
|
13
|
+
「内容哈希 + 处理指纹」失效;每次 query 前对课件文件夹做轻量 reconcile(增/删/改)。
|
|
14
|
+
嵌入工具不可用时降级为关键词检索(仅服务文字类问题)。
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import hashlib
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
import shutil
|
|
23
|
+
import sys
|
|
24
|
+
import tempfile
|
|
25
|
+
|
|
26
|
+
# ── Constants ────────────────────────────────────────────────────────────────
# The original note says changing any of these feeds the processing fingerprint
# so that old caches expire automatically (spec §3.5).
# NOTE(review): processing_fingerprint() only records the schema version, embed
# model id/version, render dpi, vision prompt version and OCR engine; values such
# as TEXT_MIN_CHARS are not part of it — confirm whether that is intended.
PIPELINE_SCHEMA_VERSION = "1"
# Multilingual model, so questions in Chinese can match English slides.
EMBED_MODEL_ID = "intfloat/multilingual-e5-small"
EMBED_DIM = 384
DEFAULT_DPI = 150
# Version of the answer-stage vision prompt (recorded in the fingerprint).
VISION_PROMPT_VERSION = "1"
# Pages with fewer stripped characters than this are treated as scanned/image-only.
TEXT_MIN_CHARS = 40
# Cosine-similarity hit threshold (deliberately conservative: prefer a miss).
SIM_THRESHOLD = 0.74
# Keyword fallback: minimum share of query content words that must be found.
KEYWORD_MIN_RATIO = 0.34

# Stop words for the keyword fallback (English words plus common Chinese
# function characters); tokens that survive this filter count as content words.
_EN_STOP = set(
    "the a an is are was were be of to in "
    "on at for and or not what which how why "
    "when who this that these those it its as "
    "by with from do does did i you he she "
    "they we can could would should will about".split()
)
_ZH_STOP = set("的了是在和与或不也我你他她它这那个之把被就都要吗呢啊吧么呀有没和对")
CACHE_ROOT = ".cache/pdf-vision"
COURSE_ROOT = "course"
|
|
45
|
+
|
|
46
|
+
# One or more characters that are not ASCII letters or digits.
_SLUG_RE = re.compile(r"[^0-9A-Za-z]+")
# A token is either an ASCII word ([0-9A-Za-z_]+) or a single character in the
# range written in the second alternative.
_TOKEN_RE = re.compile(r"[0-9A-Za-z_]+|[一-鿿]")


def slugify(name):
    """Turn *name* into an ASCII slug.

    Every run of non-alphanumeric characters becomes a single "-", leading and
    trailing dashes are removed, and "pdf" is returned when nothing is left.
    """
    collapsed = _SLUG_RE.sub("-", name)
    trimmed = collapsed.strip("-")
    if not trimmed:
        return "pdf"
    return trimmed
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def file_slug(fn):
    """Return a collision-safe cache directory name for file name *fn*.

    Different names (e.g. "Lecture 1.pdf" vs "Lecture_1.pdf") can share the same
    slug, so the first 8 hex digits of the SHA-256 of the UTF-8 file name are
    appended. Per the original note (spec §3.7) this keeps each file in its own
    cache directory, so removing one never deletes another's data.
    """
    name_digest = hashlib.sha256(fn.encode("utf-8")).hexdigest()
    return f"{slugify(fn)}-{name_digest[:8]}"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def eprint(*a):
    """Print the arguments to standard error, formatted like ``print``."""
    stream = sys.stderr
    print(*a, file=stream)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ── 文件签名 / 指纹 ───────────────────────────────────────────────────────────
|
|
65
|
+
def quick_sig(path):
    """Return a cheap change signature for *path*: its size and mtime.

    The modification time is taken in nanoseconds so that a rewrite within the
    same second is still noticed.
    """
    info = os.stat(path)
    return dict(size=info.st_size, mtime_ns=info.st_mtime_ns)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def file_sha256(path):
    """Return the hex SHA-256 digest of the file at *path*, read in 64 KiB blocks."""
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def processing_fingerprint(dpi, ocr_engine):
    """Describe how the cache was produced; any field change expires old caches.

    Args:
        dpi: render resolution recorded as ``render_dpi``.
        ocr_engine: OCR engine name recorded as ``ocr_engine`` (e.g. "none").

    Returns:
        A JSON-serialisable dict that the manifest stores and compares on every
        reconcile (spec §3.5: both content and processing method must match).
    """
    # The embed fields are read from _embed_state, which only holds real values
    # after get_embedder() has run. Without this call the fingerprint is taken
    # before the embedder is initialised, so it records lib_version "none" in a
    # fresh process and a different value on a later call in the same process —
    # the cache would be wiped for no reason and a library upgrade would never
    # invalidate it. get_embedder() caches its result, so this is a one-time cost.
    get_embedder()
    return {
        "pipeline_schema_version": PIPELINE_SCHEMA_VERSION,
        "embed_model_id": _embed_state["model_id"],
        "embed_model_version": _embed_state["lib_version"],
        "render_dpi": dpi,
        "vision_prompt_version": VISION_PROMPT_VERSION,
        "ocr_engine": ocr_engine,
    }
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ── 嵌入器(fastembed 多语言;不可用则降级关键词)─────────────────────────────
|
|
91
|
+
# Process-wide embedder state: whether loading was attempted, the loaded model,
# the model id, the fastembed version string and the active search mode.
_embed_state = dict(tried=False, model=None, model_id=EMBED_MODEL_ID,
                    lib_version="none", mode="keyword")


def get_embedder():
    """Return the fastembed model, or None when search must fall back to keywords.

    Loading is attempted once per process; the outcome (model, library version
    and mode) is stored in ``_embed_state`` and reused by later calls. Setting
    the environment variable ``PDF_RAG_FORCE_KEYWORD=1`` skips loading entirely.
    """
    state = _embed_state
    if state["tried"]:
        return state["model"]
    state["tried"] = True

    if os.environ.get("PDF_RAG_FORCE_KEYWORD") == "1":
        state["mode"] = "keyword"
        return None

    try:
        import fastembed  # noqa
        from fastembed import TextEmbedding
        loaded = TextEmbedding(model_name=EMBED_MODEL_ID)
        state["model"] = loaded
        state["lib_version"] = getattr(fastembed, "__version__", "unknown")
        state["mode"] = "semantic"
        return loaded
    except Exception as e:  # not installed / download failed / offline first run
        eprint(f"[pdf_rag] embedding unavailable ({e}); degrading to keyword search")
        state["mode"] = "keyword"
        return None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def embed_texts(texts, kind):
    """Embed *texts* with the e5 prefix convention.

    Args:
        texts: iterable of strings to embed.
        kind: "query" selects the "query: " prefix; anything else uses
            "passage: " (the original note says recall degrades without it).

    Returns:
        A list of float32 numpy arrays, one per text, or None when no embedder
        is available (keyword mode).
    """
    embedder = get_embedder()
    if embedder is None:
        return None
    # numpy is only imported once an embedder exists.
    import numpy as np
    if kind == "query":
        tag = "query: "
    else:
        tag = "passage: "
    prefixed = [tag + t for t in texts]
    raw_vectors = list(embedder.embed(prefixed))
    return [np.asarray(vec, dtype="float32") for vec in raw_vectors]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ── PDF 解析(pymupdf)────────────────────────────────────────────────────────
|
|
129
|
+
def _open_pdf(path):
    """Open *path* with PyMuPDF, preferring the ``pymupdf`` module name (1.24+)
    and falling back to the legacy ``fitz`` alias when that import fails."""
    try:
        import pymupdf as backend
        return backend.open(path)
    except ImportError:
        import fitz as backend  # legacy alias
        return backend.open(path)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _ocr_available():
    """Guess whether OCR can be used.

    Returns False when ``PDF_RAG_DISABLE_OCR=1``. Otherwise OCR is considered
    available if ``TESSDATA_PREFIX`` is set or a ``tesseract`` executable is on
    PATH. This is only a probe; the extraction code still guards the real OCR
    call with its own try/except.
    """
    if os.environ.get("PDF_RAG_DISABLE_OCR") == "1":
        return False
    try:
        if os.environ.get("TESSDATA_PREFIX") is not None:
            return True
        return _which("tesseract")
    except Exception:
        return False
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _which(prog):
    """Return True when executable *prog* can be found on PATH."""
    located = shutil.which(prog)
    return located is not None
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def extract_pages(pdf_path, pdf_dir, dpi, ocr_engine):
    """Extract per-page text and image flags from a PDF.

    Args:
        pdf_path: path of the PDF to read.
        pdf_dir: accepted for interface compatibility; not used here.
        dpi: accepted for interface compatibility; not used here.
        ocr_engine: "none" disables OCR; any other value enables the OCR retry
            for pages with too little text.

    Returns:
        One dict per page with keys ``page`` (1-based), ``text``,
        ``visual_flag`` (page has embedded images), ``visual_only`` (text is
        still shorter than TEXT_MIN_CHARS after the optional OCR pass),
        ``unreadable`` (the page raised while being read; text stays "") and
        ``png_path`` (always None here).
    """
    records = []
    doc = _open_pdf(pdf_path)
    ocr_enabled = ocr_engine != "none"
    for page_no in range(1, len(doc) + 1):
        rec = dict(page=page_no, text="", visual_flag=False,
                   visual_only=False, unreadable=False, png_path=None)
        try:
            pg = doc[page_no - 1]
            text = pg.get_text() or ""
            # Image detection is best-effort: a failure just means "no images".
            try:
                has_images = len(pg.get_images(full=True)) > 0
            except Exception:
                has_images = False
            rec["visual_flag"] = has_images
            if len(text.strip()) < TEXT_MIN_CHARS:
                # Likely a scanned or image-only page: try OCR when enabled.
                if ocr_enabled:
                    try:
                        ocr_textpage = pg.get_textpage_ocr(full=True)
                        text = pg.get_text(textpage=ocr_textpage) or text
                    except Exception as oe:
                        eprint(f"[pdf_rag] OCR failed p{page_no} of {pdf_path}: {oe}")
                # Still too little text after OCR (or no OCR): image-only page.
                if len(text.strip()) < TEXT_MIN_CHARS:
                    rec["visual_only"] = True
            rec["text"] = text
        except Exception as pe:
            rec["unreadable"] = True
            eprint(f"[pdf_rag] page {page_no} unreadable in {pdf_path}: {pe}")
        records.append(rec)
    doc.close()
    return records
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def render_page(pdf_path, page_no, dpi, out_dir):
    """Render one page of a PDF to ``<out_dir>/page-NNN.png``.

    Args:
        pdf_path: path of the PDF.
        page_no: 1-based page number.
        dpi: render resolution.
        out_dir: directory for the PNG (created if missing).

    Returns:
        The PNG path, or None when the page cannot be rendered. An existing PNG
        is reused without opening the PDF.
    """
    # Page numbers are 1-based. Without this guard, 0 or a negative number would
    # index from the END of the document (negative indexing) and the wrong page
    # would be cached under this page's file name.
    if page_no < 1:
        eprint(f"[pdf_rag] render failed {pdf_path} p{page_no}: page numbers start at 1")
        return None
    os.makedirs(out_dir, exist_ok=True)
    out = os.path.join(out_dir, f"page-{page_no:03d}.png")
    if os.path.exists(out):
        return out
    doc = None
    try:
        doc = _open_pdf(pdf_path)
        pix = doc[page_no - 1].get_pixmap(dpi=dpi)
        pix.save(out)
        return out
    except Exception as e:
        eprint(f"[pdf_rag] render failed {pdf_path} p{page_no}: {e}")
        # A failed save may leave a truncated file; remove it so the next call
        # does not treat it as a valid cached render.
        try:
            os.remove(out)
        except OSError:
            pass
        return None
    finally:
        # Close the document on both the success and the failure path.
        if doc is not None:
            try:
                doc.close()
            except Exception:
                pass
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# ── 缓存读写(原子)───────────────────────────────────────────────────────────
|
|
207
|
+
def subject_paths(subject):
    """Return the cache file locations for *subject*.

    Args:
        subject: subject directory name, relative to the cache root.

    Returns:
        Dict with ``base`` (the subject's cache directory) and the paths of
        ``manifest`` (manifest.json), ``chunks`` (chunks.json) and ``vectors``
        (vectors.npy) inside it.

    Raises:
        ValueError: if *subject* is empty, absolute, or resolves to the cache
            root itself or to a location outside it (e.g. ".", "..", "../x").
    """
    # The base directory is deleted recursively when the cache is rebuilt, so a
    # subject that escapes the cache root must never be turned into a path.
    normalized = os.path.normpath(subject) if subject else ""
    escapes_root = (
        not subject
        or os.path.isabs(subject)
        or bool(os.path.splitdrive(subject)[0])
        or normalized in (".", "..")
        or normalized.startswith(".." + os.sep)
    )
    if escapes_root:
        raise ValueError(
            f"invalid subject {subject!r}: must name a directory inside the cache root")
    base = os.path.join(CACHE_ROOT, subject)
    return {
        "base": base,
        "manifest": os.path.join(base, "manifest.json"),
        "chunks": os.path.join(base, "chunks.json"),
        "vectors": os.path.join(base, "vectors.npy"),
    }
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _atomic_write_json(path, obj):
    """Write *obj* as JSON to *path* atomically.

    The data is written to a temporary file in the target directory and then
    moved over *path* with ``os.replace``, so readers never see a half-written
    file. If writing fails, the temporary file is removed, the existing *path*
    is left untouched and the exception is re-raised.
    """
    # dirname is "" for a bare file name; fall back to the current directory so
    # makedirs/mkstemp do not fail on an empty path.
    target_dir = os.path.dirname(path) or "."
    os.makedirs(target_dir, exist_ok=True)
    fd, tmp = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=1)
        os.replace(tmp, path)
    except BaseException:
        # Do not leave orphaned temp files behind (e.g. unserialisable obj).
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _atomic_write_npy(path, arr):
    """Write numpy array *arr* to *path* in .npy format atomically.

    The array is saved to a temporary file in the target directory and then
    moved over *path* with ``os.replace``. If saving fails, the temporary file
    is removed and the exception is re-raised.
    """
    import numpy as np
    target_dir = os.path.dirname(path) or "."
    os.makedirs(target_dir, exist_ok=True)
    fd, tmp = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
    try:
        # Save through the open file object: given a file NAME without a .npy
        # extension, np.save appends ".npy" and writes to a different file,
        # which left the empty mkstemp file behind on every call.
        with os.fdopen(fd, "wb") as f:
            np.save(f, arr)
        os.replace(tmp, path)
    except BaseException:
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def load_manifest(subject):
    """Load the subject's manifest.json, or return an empty manifest
    (``fingerprint`` None, no files) when the file does not exist."""
    manifest_file = subject_paths(subject)["manifest"]
    if not os.path.exists(manifest_file):
        return {"fingerprint": None, "files": {}}
    with open(manifest_file, encoding="utf-8") as fh:
        return json.load(fh)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# ── 建档 / reconcile ──────────────────────────────────────────────────────────
|
|
244
|
+
def slide_dir(subject):
    """Return the directory holding the subject's slide PDFs:
    ``<COURSE_ROOT>/<subject>/slide``."""
    parts = (COURSE_ROOT, subject, "slide")
    return os.path.join(*parts)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def list_pdfs(subject):
    """Return the sorted file names in the subject's slide directory whose name
    ends with ".pdf" (case-insensitive); an empty list if the directory is missing."""
    folder = slide_dir(subject)
    if not os.path.isdir(folder):
        return []
    names = [entry for entry in os.listdir(folder)
             if entry.lower().endswith(".pdf")]
    names.sort()
    return names
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def reconcile(subject, dpi):
    """Bring the subject's cache in line with its slide folder.

    New PDFs are indexed, PDFs that disappeared are dropped from the manifest
    (and their cache directory removed), modified PDFs are re-indexed. If the
    processing fingerprint differs from the one in the manifest, the whole
    subject cache is deleted and rebuilt (spec §3.5).

    Returns:
        Dict with the file-name lists ``added``, ``removed``, ``changed`` and
        ``unchanged``, plus ``mode`` (the embedder mode after the rebuild).
    """
    ocr_engine = "tesseract" if _ocr_available() else "none"
    fp = processing_fingerprint(dpi, ocr_engine)
    base = subject_paths(subject)["base"]
    manifest = load_manifest(subject)

    if manifest.get("fingerprint") != fp:
        # Different processing method: old PNGs/text cannot be trusted.
        if os.path.isdir(base):
            shutil.rmtree(base, ignore_errors=True)
        manifest = {"fingerprint": fp, "files": {}}

    files = manifest["files"]
    on_disk = set(list_pdfs(subject))
    previously_known = set(files.keys())
    summary = {"added": [], "removed": [], "changed": [], "unchanged": []}

    # Deleted PDFs: forget them and remove their cache directory.
    for gone in sorted(previously_known - on_disk):
        old_slug = files[gone].get("slug")
        if old_slug:
            shutil.rmtree(os.path.join(base, old_slug), ignore_errors=True)
        del files[gone]
        summary["removed"].append(gone)

    folder = slide_dir(subject)
    for name in sorted(on_disk):
        pdf_file = os.path.join(folder, name)
        sig = quick_sig(pdf_file)
        entry = files.get(name)
        reusable = entry and entry.get("indexed_complete")
        if reusable and entry.get("quick_sig") == sig:
            # Same name, size and mtime: skip without reading the file.
            summary["unchanged"].append(name)
            continue
        sha = file_sha256(pdf_file)
        if reusable and entry.get("sha256") == sha:
            # Only the mtime/size signature moved; content is identical.
            entry["quick_sig"] = sig
            summary["unchanged"].append(name)
            continue
        _index_file(subject, name, pdf_file, sha, sig, dpi, ocr_engine, manifest)
        summary["changed" if entry else "added"].append(name)

    # NOTE(review): chunks and vectors are rebuilt on every call, even when no
    # file was added, removed or changed — confirm whether that is intended.
    _rebuild_vectors(subject, manifest)
    _atomic_write_json(subject_paths(subject)["manifest"], manifest)
    summary["mode"] = _embed_state["mode"]
    return summary
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _index_file(subject, fn, path, sha, sig, dpi, ocr_engine, manifest):
    """Index one PDF into *manifest* and write its per-page text files.

    The manifest entry is first stored with ``indexed_complete`` False and only
    flipped to True after every page has been processed, so an interrupted run
    is never reused. Any previous cache directory for the file is removed first
    so that stale PNGs/text cannot survive a content change (spec §3.5).
    """
    slug = file_slug(fn)
    cache_dir = os.path.join(subject_paths(subject)["base"], slug)
    if os.path.isdir(cache_dir):
        shutil.rmtree(cache_dir, ignore_errors=True)

    entry = {"sha256": sha, "quick_sig": sig,
             "indexed_complete": False, "slug": slug, "pages": []}
    manifest["files"][fn] = entry

    meta_keys = ("page", "visual_flag", "visual_only", "unreadable")
    page_meta = []
    for rec in extract_pages(path, cache_dir, dpi, ocr_engine):
        page_meta.append({key: rec[key] for key in meta_keys})
        if rec["unreadable"]:
            continue
        # Cache the page text so the answer stage can load it without
        # re-extracting.
        os.makedirs(cache_dir, exist_ok=True)
        text_file = os.path.join(cache_dir, f"page-{rec['page']:03d}.text.txt")
        with open(text_file, "w", encoding="utf-8") as out:
            out.write(rec["text"])

    manifest["files"][fn]["pages"] = page_meta
    manifest["files"][fn]["indexed_complete"] = True
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _build_chunks(subject, manifest):
    """Collect one page-level chunk per readable page of every complete file.

    Files whose entry is not ``indexed_complete`` and pages marked
    ``unreadable`` are skipped. A page with no text is embedded as
    "<lecture label> <file name>" so it can still be found via its lecture.
    """
    chunks = []
    for fn, ent in manifest["files"].items():
        if not ent.get("indexed_complete"):
            continue  # half-built entries never take part in search (spec §3.4)
        slug = ent["slug"]
        page_dir = os.path.join(subject_paths(subject)["base"], slug)
        lecture = _lecture_label(fn)
        for pm in ent["pages"]:
            if pm["unreadable"]:
                continue
            page_no = pm["page"]
            text_file = os.path.join(page_dir, f"page-{page_no:03d}.text.txt")
            if os.path.exists(text_file):
                with open(text_file, encoding="utf-8") as fh:
                    text = fh.read()
            else:
                text = ""
            stripped = text.strip()
            chunks.append({
                "file": fn,
                "source_file_exact": fn,
                "lecture": lecture,
                "page": page_no,
                "visual_flag": pm["visual_flag"],
                "visual_only": pm["visual_only"],
                "slug": slug,
                "text": text,
                "embed_text": stripped if stripped else f"{lecture} {fn}",
            })
    return chunks
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def _rebuild_vectors(subject, manifest):
    """Rewrite chunks.json and vectors.npy for *subject* from *manifest*.

    With no chunks an empty (0, EMBED_DIM) matrix is written. When no embedder
    is available (keyword mode) any existing vectors file is removed instead.
    Otherwise the row-normalised passage embeddings are saved.
    """
    import numpy as np
    paths = subject_paths(subject)
    chunks = _build_chunks(subject, manifest)
    _atomic_write_json(paths["chunks"], chunks)

    if not chunks:
        empty = np.zeros((0, EMBED_DIM), dtype="float32")
        _atomic_write_npy(paths["vectors"], empty)
        return

    passages = [c["embed_text"] for c in chunks]
    vecs = embed_texts(passages, "passage")
    if vecs is None:
        # Keyword mode: there must be no vectors file.
        if os.path.exists(paths["vectors"]):
            os.remove(paths["vectors"])
        return

    stacked = np.vstack(vecs)
    norms = np.linalg.norm(stacked, axis=1, keepdims=True)
    # The small epsilon avoids division by zero for an all-zero row.
    unit_rows = stacked / (norms + 1e-9)
    _atomic_write_npy(paths["vectors"], unit_rows)
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
# "Lecture 3", "Week12" or "L3" (case-insensitive). The lookbehind requires that
# the match does not start in the middle of a word; without it "L\d+" also
# matched names such as "FINAL2.pdf" ("L2") or "HTML5.pdf" ("L5").
_LEC_RE = re.compile(r"(?<![A-Za-z])(Lecture\s*\d+|Week\s*\d+|L\d+)", re.IGNORECASE)


def _lecture_label(fn):
    """Return the lecture label found in file name *fn*.

    The label is the first "Lecture N" / "Week N" / "LN" occurrence exactly as
    written in the name; when there is none, the file name without its
    extension is returned.
    """
    m = _LEC_RE.search(fn)
    return m.group(1) if m else os.path.splitext(fn)[0]
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
# ── 检索 ──────────────────────────────────────────────────────────────────────
|
|
378
|
+
def _content_tokens(s):
    """Return the set of content tokens in *s*.

    The text is lower-cased and tokenised with ``_TOKEN_RE``; English and
    Chinese stop words are dropped so that words like "the"/"is" cannot make an
    unrelated page look like a hit.
    """
    tokens = set(_TOKEN_RE.findall(s.lower()))
    tokens -= _EN_STOP
    tokens -= _ZH_STOP
    return tokens
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def _keyword_score(query, text):
    """Return the share of the query's content tokens that occur in *text*.

    The result is in [0, 1]. It is 0.0 when the query has no content tokens
    (nothing to match on) or when the text has none.
    """
    wanted = _content_tokens(query)
    if not wanted:
        return 0.0
    available = _content_tokens(text)
    if not available:
        return 0.0
    matched = wanted & available
    return len(matched) / len(wanted)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def query(subject, question, k, dpi):
    """Reconcile the subject's cache, then return the top-*k* pages for *question*.

    Semantic search is used when the embedder mode is "semantic", a vectors file
    exists and its row count matches the chunk list; otherwise (or when that
    yields nothing) pages are ranked by keyword overlap.

    Returns:
        Dict with ``subject``, ``mode``, ``reconcile`` (the reconcile summary),
        ``miss`` and ``results``. When there is no index or no chunks, ``miss``
        is True and ``reason`` is "no_index" or "empty". Otherwise
        ``threshold`` and ``top_score`` are included, ``miss`` is True when the
        best score is below the threshold, and each result carries the source
        file, lecture, page, score, image flags, ``png_path`` (rendered only for
        pages flagged as visual) and ``text_path``.
    """
    rec = reconcile(subject, dpi)
    sp = subject_paths(subject)

    def miss_payload(reason):
        return {"subject": subject, "mode": rec["mode"], "reconcile": rec,
                "miss": True, "reason": reason, "results": []}

    if not os.path.exists(sp["chunks"]):
        return miss_payload("no_index")
    with open(sp["chunks"], encoding="utf-8") as fh:
        chunks = json.load(fh)
    if not chunks:
        return miss_payload("empty")

    mode = _embed_state["mode"]
    scored = []
    if mode == "semantic" and os.path.exists(sp["vectors"]):
        import numpy as np
        matrix = np.load(sp["vectors"])
        qvecs = embed_texts([question], "query")
        if qvecs is not None and matrix.shape[0] == len(chunks):
            unit_q = qvecs[0] / (np.linalg.norm(qvecs[0]) + 1e-9)
            sims = matrix @ unit_q
            best = np.argsort(-sims)[:k]
            scored = [(float(sims[i]), chunks[i]) for i in best]

    if not scored:
        # Keyword fallback: also covers an unusable or mismatched vectors file.
        mode = "keyword"
        pairs = [(_keyword_score(question, c["embed_text"]), c) for c in chunks]
        pairs.sort(key=lambda pair: -pair[0])
        scored = pairs[:k]

    if mode == "semantic":
        threshold = SIM_THRESHOLD
    else:
        threshold = KEYWORD_MIN_RATIO
    top = scored[0][0] if scored else 0.0
    miss = (not scored) or (top < threshold)

    def to_result(score, c):
        png = None
        # Only pages that contain images are rendered (top-k only, so cheap).
        if c["visual_flag"] or c["visual_only"]:
            png = render_page(os.path.join(slide_dir(subject), c["file"]), c["page"],
                              dpi, os.path.join(sp["base"], c["slug"]))
        return {
            "source_file_exact": c["source_file_exact"], "lecture": c["lecture"],
            "page": c["page"], "score": round(score, 4),
            "visual_flag": c["visual_flag"], "visual_only": c["visual_only"],
            "png_path": png,
            "text_path": os.path.join(sp["base"], c["slug"],
                                      f"page-{c['page']:03d}.text.txt"),
        }

    results = [to_result(score, c) for score, c in scored]
    return {"subject": subject, "mode": mode, "reconcile": rec,
            "miss": miss, "threshold": threshold, "top_score": round(top, 4),
            "results": results}
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
# ── CLI ───────────────────────────────────────────────────────────────────────
|
|
450
|
+
def main(argv=None):
    """Command-line entry point.

    Parses the ``index`` / ``query`` / ``render`` sub-commands from *argv*
    (``sys.argv`` when None), runs the matching operation, prints its result as
    one JSON document on stdout and returns 0.
    """
    parser = argparse.ArgumentParser(description="课件 PDF 检索/建档/渲染引擎")
    commands = parser.add_subparsers(dest="cmd", required=True)

    index_cmd = commands.add_parser("index", help="建/更新某科目课件档")
    index_cmd.add_argument("--subject", required=True)
    index_cmd.add_argument("--dpi", type=int, default=DEFAULT_DPI)

    query_cmd = commands.add_parser("query", help="检索(先 reconcile)")
    query_cmd.add_argument("--subject", required=True)
    query_cmd.add_argument("-q", "--question", required=True)
    query_cmd.add_argument("-k", type=int, default=5)
    query_cmd.add_argument("--dpi", type=int, default=DEFAULT_DPI)

    render_cmd = commands.add_parser("render", help="渲染单页 PNG")
    render_cmd.add_argument("--pdf", required=True, help="PDF 路径")
    render_cmd.add_argument("--page", type=int, required=True)
    render_cmd.add_argument("--dpi", type=int, default=DEFAULT_DPI)
    render_cmd.add_argument("--subject", help="可选,仅用于缓存归类")

    args = parser.parse_args(argv)
    if args.cmd == "render":
        # Same collision-safe slug as the index path; renders without a subject
        # go under "_adhoc".
        slug = file_slug(os.path.basename(args.pdf))
        out_dir = os.path.join(CACHE_ROOT, args.subject or "_adhoc", slug)
        png = render_page(args.pdf, args.page, args.dpi, out_dir)
        out = {"png_path": png, "ok": png is not None}
    elif args.cmd == "query":
        out = query(args.subject, args.question, args.k, args.dpi)
    elif args.cmd == "index":
        out = reconcile(args.subject, args.dpi)
    print(json.dumps(out, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|