create-ccc-tutor 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,485 @@
1
+ #!/usr/bin/env python3
2
+ """pdf_rag.py — 课件 PDF 的检索/建档/渲染引擎(pdf-vision 功能)。
3
+
4
+ 模型无关:Claude 与 Codex 两侧的 /slide、/exam 都调它。输出 JSON 供技能消费。
5
+
6
+ 子命令:
7
+ index --subject <s> [--dpi N] 建/更新该科目课件档(幂等)
8
+ query --subject <s> -q "<问题>" [-k K] 先 reconcile 再检索,输出命中页 JSON
9
+ render --pdf <file> --page N [--dpi N] 按需渲染单页 PNG,输出路径
10
+
11
+ 设计见 docs/features/pdf-vision-implementation.md。混合检索:建档只嵌入页文字
12
+ (文字过少且有 OCR 则 OCR),看图发生在回答阶段(技能读 PNG)。缓存按
13
+ 「内容哈希 + 处理指纹」失效;每次 query 前对课件文件夹做轻量 reconcile(增/删/改)。
14
+ 嵌入工具不可用时降级为关键词检索(仅服务文字类问题)。
15
+ """
16
+
17
+ import argparse
18
+ import hashlib
19
+ import json
20
+ import os
21
+ import re
22
+ import shutil
23
+ import sys
24
+ import tempfile
25
+
26
# ── Constants ────────────────────────────────────────────────────────────────
# The original note states that changing any of these enters the processing
# fingerprint so stale caches expire automatically (spec §3.5).
# NOTE(review): processing_fingerprint() reads only PIPELINE_SCHEMA_VERSION and
# VISION_PROMPT_VERSION directly (plus the model id via _embed_state); changes
# to the other values presumably need a PIPELINE_SCHEMA_VERSION bump — confirm.
PIPELINE_SCHEMA_VERSION = "1"
EMBED_MODEL_ID = "intfloat/multilingual-e5-small"  # multilingual: Chinese questions against English slides
EMBED_DIM = 384  # column count of the empty vector matrix; presumably the model's output width — confirm
DEFAULT_DPI = 150
VISION_PROMPT_VERSION = "1"  # version of the answer-stage vision prompt (recorded in the fingerprint)
TEXT_MIN_CHARS = 40  # pages with less stripped text are treated as likely scanned / image-only
SIM_THRESHOLD = 0.74  # cosine-similarity hit threshold (conservative: prefer a miss over a forced answer)
KEYWORD_MIN_RATIO = 0.34  # keyword fallback: minimum share of query content words that must match

# Stopwords for the keyword fallback (English words + common Chinese function
# characters); only tokens that survive this filter count as "content words".
_EN_STOP = {"the", "a", "an", "is", "are", "was", "were", "be", "of", "to", "in",
            "on", "at", "for", "and", "or", "not", "what", "which", "how", "why",
            "when", "who", "this", "that", "these", "those", "it", "its", "as",
            "by", "with", "from", "do", "does", "did", "i", "you", "he", "she",
            "they", "we", "can", "could", "would", "should", "will", "about"}
_ZH_STOP = set("的了是在和与或不也我你他她它这那个之把被就都要吗呢啊吧么呀有没和对")
CACHE_ROOT = ".cache/pdf-vision"  # per-subject cache directories live under this path
COURSE_ROOT = "course"  # slides are read from <COURSE_ROOT>/<subject>/slide

_SLUG_RE = re.compile(r"[^0-9A-Za-z]+")  # runs of anything that is not an ASCII letter or digit
_TOKEN_RE = re.compile(r"[0-9A-Za-z_]+|[一-鿿]")  # ASCII word runs, or one CJK ideograph at a time
48
+
49
+
50
def slugify(name):
    """Return *name* with each run of non-ASCII-alphanumeric characters collapsed to "-".

    Leading and trailing dashes are stripped; if nothing is left, "pdf" is
    returned so the result is never empty.
    """
    collapsed = _SLUG_RE.sub("-", name)
    trimmed = collapsed.strip("-")
    if trimmed:
        return trimmed
    return "pdf"
52
+
53
+
54
def file_slug(fn):
    """Return a cache-directory name for file name *fn* that is unique per name.

    Different file names (e.g. "Lecture 1.pdf" and "Lecture_1.pdf") can slugify
    to the same string, so the first 8 hex digits of the SHA-256 of the exact
    file name are appended. This keeps each file in its own cache directory
    and prevents one file's cleanup or citation from hitting another (spec §3.7).
    """
    name_digest = hashlib.sha256(fn.encode("utf-8")).hexdigest()
    return f"{slugify(fn)}-{name_digest[:8]}"
58
+
59
+
60
def eprint(*parts):
    """Print *parts* to standard error, formatted exactly as print() would."""
    print(*parts, file=sys.stderr)
62
+
63
+
64
+ # ── 文件签名 / 指纹 ───────────────────────────────────────────────────────────
65
def quick_sig(path):
    """Return a cheap change signature for *path*: its size and mtime.

    The modification time is taken in nanoseconds so that a file overwritten
    within the same second is still detected as changed.
    """
    info = os.stat(path)
    signature = {}
    signature["size"] = info.st_size
    signature["mtime_ns"] = info.st_mtime_ns
    return signature
68
+
69
+
70
def file_sha256(path):
    """Return the hex SHA-256 digest of the file at *path*, read in 64 KiB blocks."""
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(65536)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
76
+
77
+
78
def processing_fingerprint(dpi, ocr_engine):
    """Return the dict describing HOW the cache was produced.

    A cache is reused only when this fingerprint matches the stored one, so a
    change to any field expires the old index (spec §3.5: both the content and
    the way it was generated must match).

    Args:
        dpi: render resolution used for page PNGs.
        ocr_engine: name of the OCR engine in use ("none" when OCR is off).

    Returns:
        A JSON-serialisable dict of the pipeline version, embedding model id
        and library version, render DPI, vision prompt version and OCR engine.
    """
    # The embedder is initialised lazily, and reconcile() asks for the
    # fingerprint before anything else touches it. Without resolving it here,
    # _embed_state still holds its placeholder ("lib_version": "none"), so the
    # recorded embedding version would never reflect the installed library and
    # an upgrade could not invalidate old vectors.
    get_embedder()
    return {
        "pipeline_schema_version": PIPELINE_SCHEMA_VERSION,
        "embed_model_id": _embed_state["model_id"],
        "embed_model_version": _embed_state["lib_version"],
        "render_dpi": dpi,
        "vision_prompt_version": VISION_PROMPT_VERSION,
        "ocr_engine": ocr_engine,
    }
88
+
89
+
90
+ # ── 嵌入器(fastembed 多语言;不可用则降级关键词)─────────────────────────────
91
# Process-wide embedder state. "mode" is "semantic" once a fastembed model has
# loaded, and "keyword" otherwise (the degraded search path).
_embed_state = {
    "tried": False,
    "model": None,
    "model_id": EMBED_MODEL_ID,
    "lib_version": "none",
    "mode": "keyword",
}


def get_embedder():
    """Return the fastembed model, or None when keyword search must be used.

    The first call decides the outcome and stores it in _embed_state; later
    calls return the stored model without retrying. Setting the environment
    variable PDF_RAG_FORCE_KEYWORD=1 forces the keyword fallback.
    """
    state = _embed_state
    if state["tried"]:
        return state["model"]
    state["tried"] = True

    if os.environ.get("PDF_RAG_FORCE_KEYWORD") == "1":
        state["mode"] = "keyword"
        return None

    try:
        import fastembed  # noqa
        from fastembed import TextEmbedding
        loaded = TextEmbedding(model_name=EMBED_MODEL_ID)
        state.update(
            model=loaded,
            lib_version=getattr(fastembed, "__version__", "unknown"),
            mode="semantic",
        )
        return loaded
    except Exception as e:  # not installed / download failed / offline first run -> degrade
        eprint(f"[pdf_rag] embedding unavailable ({e}); degrading to keyword search")
        state["mode"] = "keyword"
        return None
115
+
116
+
117
def embed_texts(texts, kind):
    """Embed *texts* with the shared embedder.

    Args:
        texts: iterable of strings to embed.
        kind: "query" prefixes each text with "query: "; any other value uses
            "passage: " (the e5 prefix convention; recall drops without it).

    Returns:
        A list of float32 numpy arrays, one per text, or None when no embedder
        is available (keyword mode).
    """
    embedder = get_embedder()
    if embedder is None:
        return None
    tag = "passage: "
    if kind == "query":
        tag = "query: "
    import numpy as np
    prefixed = [tag + text for text in texts]
    return [np.asarray(vec, dtype="float32") for vec in embedder.embed(prefixed)]
126
+
127
+
128
+ # ── PDF 解析(pymupdf)────────────────────────────────────────────────────────
129
def _open_pdf(path):
    """Open the PDF at *path* with PyMuPDF and return the document object.

    Tries the `pymupdf` module name first (1.24+) and falls back to the legacy
    `fitz` alias when an ImportError is raised.
    """
    try:
        import pymupdf as _backend  # 1.24+
        document = _backend.open(path)
    except ImportError:
        import fitz as _legacy  # legacy alias for older installs
        document = _legacy.open(path)
    return document
136
+
137
+
138
def _ocr_available():
    """Return True when OCR looks usable on this machine.

    OCR is treated as available when it is not disabled through
    PDF_RAG_DISABLE_OCR=1 and either TESSDATA_PREFIX is set or a `tesseract`
    executable is on PATH. This is only a cheap probe: actual OCR calls are
    still guarded by try/except at extraction time.

    Returns:
        bool
    """
    if os.environ.get("PDF_RAG_DISABLE_OCR") == "1":
        return False
    # The previous version wrapped this in a blanket try/except around a no-op
    # reference to _open_pdf; that verified nothing and could only hide real
    # errors, so the probe is now just the two checks it always performed.
    if os.environ.get("TESSDATA_PREFIX") is not None:
        return True
    return _which("tesseract")
147
+
148
+
149
def _which(prog):
    """Return True when *prog* resolves to an executable via shutil.which."""
    import shutil as _shutil
    located = _shutil.which(prog)
    return located is not None
152
+
153
+
154
def extract_pages(pdf_path, pdf_dir, dpi, ocr_engine):
    """Extract per-page text and visual flags from one PDF.

    Args:
        pdf_path: path of the PDF to read.
        pdf_dir: cache directory of this PDF (accepted for interface
            compatibility; not used during extraction).
        dpi: render resolution (accepted for interface compatibility; no
            rendering happens here).
        ocr_engine: "none" disables OCR; any other value enables an OCR
            attempt on pages with too little embedded text.

    Returns:
        A list with one dict per page, in page order, with keys
        ``page`` (1-based), ``text``, ``visual_flag`` (page embeds images),
        ``visual_only`` (less than TEXT_MIN_CHARS of text even after OCR),
        ``unreadable`` (the page raised while being read; its text stays "")
        and ``png_path`` (always None here).

    Raises:
        Whatever _open_pdf raises when the document cannot be opened.
    """
    pages = []
    use_ocr = ocr_engine != "none"
    doc = _open_pdf(pdf_path)
    try:
        # Index-based access keeps each page load inside the per-page try
        # below, so a single damaged page is recorded as unreadable instead
        # of aborting the whole file.
        for i in range(len(doc)):
            page_no = i + 1
            rec = {"page": page_no, "text": "", "visual_flag": False,
                   "visual_only": False, "unreadable": False, "png_path": None}
            try:
                pg = doc[i]
                text = pg.get_text() or ""
                try:
                    rec["visual_flag"] = len(pg.get_images(full=True)) > 0
                except Exception:
                    rec["visual_flag"] = False
                if len(text.strip()) < TEXT_MIN_CHARS:
                    # Likely a scanned / image-only page: try OCR when
                    # available, otherwise mark it visual_only.
                    if use_ocr:
                        try:
                            tp = pg.get_textpage_ocr(full=True)
                            text = pg.get_text(textpage=tp) or text
                        except Exception as oe:
                            eprint(f"[pdf_rag] OCR failed p{page_no} of {pdf_path}: {oe}")
                    if len(text.strip()) < TEXT_MIN_CHARS:
                        rec["visual_only"] = True
                rec["text"] = text
            except Exception as pe:
                rec["unreadable"] = True
                eprint(f"[pdf_rag] page {page_no} unreadable in {pdf_path}: {pe}")
            pages.append(rec)
    finally:
        # Release the document even if something outside the per-page guard
        # raises (e.g. len(doc) on a damaged file).
        doc.close()
    return pages
187
+
188
+
189
def render_page(pdf_path, page_no, dpi, out_dir):
    """Render one page of a PDF to ``<out_dir>/page-NNN.png``.

    An existing PNG with that name is reused without rendering again.

    Args:
        pdf_path: path of the PDF.
        page_no: 1-based page number.
        dpi: render resolution.
        out_dir: directory for the PNG (created when missing).

    Returns:
        The PNG path, or None when the page number is invalid or rendering
        failed (the reason is logged to stderr).
    """
    # Page numbers are 1-based. Without this guard, 0 or a negative number
    # turns into a negative index, which PyMuPDF resolves from the END of the
    # document, so the wrong page would be rendered and cached.
    if page_no < 1:
        eprint(f"[pdf_rag] render failed {pdf_path} p{page_no}: page numbers start at 1")
        return None
    os.makedirs(out_dir, exist_ok=True)
    out = os.path.join(out_dir, f"page-{page_no:03d}.png")
    if os.path.exists(out):
        return out
    doc = None
    try:
        doc = _open_pdf(pdf_path)
        pix = doc[page_no - 1].get_pixmap(dpi=dpi)
        pix.save(out)
        return out
    except Exception as e:
        eprint(f"[pdf_rag] render failed {pdf_path} p{page_no}: {e}")
        # Remove a partially written PNG; otherwise the exists() shortcut
        # above would keep serving the broken file on every later call.
        try:
            os.remove(out)
        except OSError:
            pass
        return None
    finally:
        # Always release the document, including on failure; closing is
        # best-effort so this function keeps its "never raises" behaviour.
        if doc is not None:
            try:
                doc.close()
            except Exception:
                pass
204
+
205
+
206
+ # ── 缓存读写(原子)───────────────────────────────────────────────────────────
207
def subject_paths(subject):
    """Return the cache paths for *subject*.

    Args:
        subject: subject name; used as a relative directory below CACHE_ROOT.

    Returns:
        A dict with keys ``base`` (the subject's cache directory),
        ``manifest``, ``chunks`` and ``vectors`` (files inside it).

    Raises:
        ValueError: if *subject* is empty, absolute, carries a drive prefix,
            contains a ".." component, or resolves to the cache root itself.
    """
    # The base directory is removed recursively by reconcile(), so a subject
    # such as "..", "" or "/abs/path" must never be allowed to point outside
    # (or at) the cache root.
    if not isinstance(subject, str):
        raise ValueError(f"invalid subject name: {subject!r}")
    components = [c for c in subject.replace("\\", "/").split("/")
                  if c not in ("", ".")]
    if (not components or ".." in components
            or os.path.isabs(subject) or os.path.splitdrive(subject)[0]):
        raise ValueError(f"invalid subject name: {subject!r}")
    base = os.path.join(CACHE_ROOT, subject)
    return {
        "base": base,
        "manifest": os.path.join(base, "manifest.json"),
        "chunks": os.path.join(base, "chunks.json"),
        "vectors": os.path.join(base, "vectors.npy"),
    }
215
+
216
+
217
def _atomic_write_json(path, obj):
    """Write *obj* as UTF-8 JSON to *path* atomically.

    The data is written to a temporary file in the destination directory and
    then moved over *path* with os.replace, so readers never see a partial
    file. The destination directory is created when missing.

    Raises:
        TypeError / ValueError: if *obj* is not JSON-serialisable.
        OSError: on filesystem errors.
        In every failure case the temporary file is removed and an existing
        *path* is left untouched.
    """
    target_dir = os.path.dirname(path) or "."  # a bare file name means the current directory
    os.makedirs(target_dir, exist_ok=True)
    fd, tmp = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=1)
        os.replace(tmp, path)
    except BaseException:
        # Do not leave stray *.tmp files in the cache when the dump or the
        # rename fails; then re-raise the original error unchanged.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise
223
+
224
+
225
def _atomic_write_npy(path, arr):
    """Write numpy array *arr* to *path* in .npy format atomically.

    The array is saved to a temporary file in the destination directory and
    then moved over *path* with os.replace. The destination directory is
    created when missing.

    Raises:
        OSError: on filesystem errors; the temporary file is removed and an
            existing *path* is left untouched.
    """
    import numpy as np
    target_dir = os.path.dirname(path) or "."  # a bare file name means the current directory
    os.makedirs(target_dir, exist_ok=True)
    fd, tmp = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
    try:
        # Save through the open file object: when given a file NAME, np.save
        # appends ".npy", which used to write a second file and leave the
        # empty mkstemp file behind on every call.
        with os.fdopen(fd, "wb") as f:
            np.save(f, arr)
        os.replace(tmp, path)
    except BaseException:
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise
233
+
234
+
235
def load_manifest(subject):
    """Load the manifest of *subject*, or return an empty one.

    Returns:
        The parsed manifest dict. When the file is missing, is not valid
        JSON, or does not have the expected shape, an empty manifest
        ``{"fingerprint": None, "files": {}}`` is returned; its None
        fingerprint never matches a real one, so reconcile() rebuilds the
        index instead of failing on a damaged cache.
    """
    p = subject_paths(subject)["manifest"]
    try:
        with open(p, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {"fingerprint": None, "files": {}}
    except ValueError as e:  # JSONDecodeError and UnicodeDecodeError are both ValueErrors
        eprint(f"[pdf_rag] manifest unreadable ({p}): {e}; index will be rebuilt")
        return {"fingerprint": None, "files": {}}
    if not isinstance(data, dict) or not isinstance(data.get("files"), dict):
        eprint(f"[pdf_rag] manifest malformed ({p}); index will be rebuilt")
        return {"fingerprint": None, "files": {}}
    return data
241
+
242
+
243
+ # ── 建档 / reconcile ──────────────────────────────────────────────────────────
244
def slide_dir(subject):
    """Return the directory holding the slide PDFs of *subject*: <COURSE_ROOT>/<subject>/slide."""
    subject_root = os.path.join(COURSE_ROOT, subject)
    return os.path.join(subject_root, "slide")
246
+
247
+
248
def list_pdfs(subject):
    """Return the sorted file names of the PDFs in the subject's slide folder.

    Only regular files (or symlinks resolving to one) whose name ends in
    ".pdf", case-insensitively, are listed. Directories and dangling symlinks
    with such a name are skipped because they cannot be hashed or opened.
    A missing slide folder yields an empty list.
    """
    d = slide_dir(subject)
    if not os.path.isdir(d):
        return []
    return sorted(
        fn for fn in os.listdir(d)
        if fn.lower().endswith(".pdf") and os.path.isfile(os.path.join(d, fn))
    )
253
+
254
+
255
def reconcile(subject, dpi):
    """Bring the cached index of *subject* in line with its slide folder.

    New PDFs are indexed, removed PDFs are dropped from the index (together
    with their cache directory) and changed PDFs are re-indexed. When the
    processing fingerprint differs from the stored one, the subject's whole
    cache is deleted and rebuilt (spec §3.5).

    Args:
        subject: subject name.
        dpi: render resolution; part of the processing fingerprint.

    Returns:
        A summary dict with the file-name lists ``added``, ``removed``,
        ``changed`` and ``unchanged``, and ``mode`` ("semantic" or "keyword").
    """
    ocr_engine = "tesseract" if _ocr_available() else "none"
    fp = processing_fingerprint(dpi, ocr_engine)
    sp = subject_paths(subject)
    base = sp["base"]
    manifest = load_manifest(subject)
    rebuild = False
    # Fingerprint changed -> wipe the subject cache (old PNGs/text included)
    # and rebuild everything.
    if manifest.get("fingerprint") != fp:
        if os.path.isdir(base):
            shutil.rmtree(base, ignore_errors=True)
        manifest = {"fingerprint": fp, "files": {}}
        rebuild = True

    present = set(list_pdfs(subject))
    known = set(manifest["files"])
    added, removed, changed, unchanged = [], [], [], []

    for fn in sorted(known - present):  # deleted: drop the entry and its cache directory
        slug = manifest["files"][fn].get("slug")
        if slug:
            shutil.rmtree(os.path.join(base, slug), ignore_errors=True)
        del manifest["files"][fn]
        removed.append(fn)

    for fn in sorted(present):
        path = os.path.join(slide_dir(subject), fn)
        sig = quick_sig(path)
        ent = manifest["files"].get(fn)
        if ent and ent.get("indexed_complete") and ent.get("quick_sig") == sig:
            unchanged.append(fn)  # same name/size/mtime -> skip without reading the file
            continue
        sha = file_sha256(path)
        if ent and ent.get("indexed_complete") and ent.get("sha256") == sha:
            ent["quick_sig"] = sig  # only the mtime moved; content is identical
            unchanged.append(fn)
            continue
        _index_file(subject, fn, path, sha, sig, dpi, ocr_engine, manifest)
        (changed if ent else added).append(fn)

    if added or removed or changed:
        rebuild = True
    if not rebuild:
        # Nothing changed, so re-embedding every page would be wasted work on
        # each query. Still resolve the embedder when there is something
        # indexed, so the reported mode is correct, and rebuild only if the
        # derived files are missing for the current mode.
        semantic = bool(manifest["files"]) and get_embedder() is not None
        rebuild = (not os.path.exists(sp["chunks"])
                   or (semantic and not os.path.exists(sp["vectors"])))
    if rebuild:
        _rebuild_vectors(subject, manifest)
    _atomic_write_json(sp["manifest"], manifest)
    return {"added": added, "removed": removed, "changed": changed,
            "unchanged": unchanged, "mode": _embed_state["mode"]}
297
+
298
+
299
def _index_file(subject, fn, path, sha, sig, dpi, ocr_engine, manifest):
    """Index one PDF into *manifest* and write its per-page text cache.

    The manifest entry is created with ``indexed_complete`` False and only
    flipped to True after all pages were processed, so an interrupted run is
    never mistaken for a finished one.
    """
    slug = file_slug(fn)
    cache_dir = os.path.join(subject_paths(subject)["base"], slug)
    # Drop earlier renders/text first so changed content can never be served
    # from an old PNG (spec §3.5).
    if os.path.isdir(cache_dir):
        shutil.rmtree(cache_dir, ignore_errors=True)

    entry = {"sha256": sha, "quick_sig": sig,
             "indexed_complete": False, "slug": slug, "pages": []}
    manifest["files"][fn] = entry

    meta_keys = ("page", "visual_flag", "visual_only", "unreadable")
    page_meta = []
    for record in extract_pages(path, cache_dir, dpi, ocr_engine):
        page_meta.append({key: record[key] for key in meta_keys})
        if record["unreadable"]:
            continue
        # Cache the page text so the answer stage can load it without
        # extracting again.
        os.makedirs(cache_dir, exist_ok=True)
        text_file = os.path.join(cache_dir, f"page-{record['page']:03d}.text.txt")
        with open(text_file, "w", encoding="utf-8") as f:
            f.write(record["text"])

    entry["pages"] = page_meta
    entry["indexed_complete"] = True
321
+
322
+
323
def _build_chunks(subject, manifest):
    """Collect one page-level chunk per readable page of every completed file.

    Files whose entry is not ``indexed_complete`` are skipped (spec §3.4), as
    are pages flagged unreadable. Page text is read from the per-page text
    cache; when that text is blank, "<lecture label> <file name>" is embedded
    instead so the page can still be found through its lecture.
    """
    collected = []
    for fn, entry in manifest["files"].items():
        if not entry.get("indexed_complete"):
            continue
        slug = entry["slug"]
        cache_dir = os.path.join(subject_paths(subject)["base"], slug)
        lecture = _lecture_label(fn)
        for meta in entry["pages"]:
            if meta["unreadable"]:
                continue
            text_file = os.path.join(cache_dir, f"page-{meta['page']:03d}.text.txt")
            if os.path.exists(text_file):
                with open(text_file, encoding="utf-8") as f:
                    text = f.read()
            else:
                text = ""
            stripped = text.strip()
            collected.append({
                "file": fn, "source_file_exact": fn, "lecture": lecture,
                "page": meta["page"], "visual_flag": meta["visual_flag"],
                "visual_only": meta["visual_only"], "slug": slug,
                "text": text,
                "embed_text": stripped if stripped else f"{lecture} {fn}",
            })
    return collected
349
+
350
+
351
def _rebuild_vectors(subject, manifest):
    """Rewrite chunks.json and the matching vector file for *subject*.

    With no chunks, an empty (0, EMBED_DIM) float32 matrix is written. In
    keyword mode (no embedder) any existing vector file is deleted. Otherwise
    the chunk embeddings are stacked, L2-normalised row by row and saved.
    """
    import numpy as np
    paths = subject_paths(subject)
    vec_path = paths["vectors"]
    chunks = _build_chunks(subject, manifest)
    _atomic_write_json(paths["chunks"], chunks)

    if not chunks:
        _atomic_write_npy(vec_path, np.zeros((0, EMBED_DIM), dtype="float32"))
        return

    vecs = embed_texts([chunk["embed_text"] for chunk in chunks], "passage")
    if vecs is None:  # keyword mode: no vectors are kept
        if os.path.exists(vec_path):
            os.remove(vec_path)
        return

    stacked = np.vstack(vecs)
    norms = np.linalg.norm(stacked, axis=1, keepdims=True)
    _atomic_write_npy(vec_path, stacked / (norms + 1e-9))
367
+
368
+
369
# "Lecture N" / "Week N" anywhere in the name, or the short form "L<digits>".
# The short form must not directly follow a letter: with IGNORECASE it would
# otherwise match inside ordinary words ("Tutorial3" -> "l3", "HTML5" -> "L5")
# and attach a wrong lecture label to every page of that file.
_LEC_RE = re.compile(r"(Lecture\s*\d+|Week\s*\d+|(?<![A-Za-z])L\d+)", re.IGNORECASE)


def _lecture_label(fn):
    """Return a lecture label for file name *fn*.

    The label is the first "Lecture N", "Week N" or stand-alone "L<digits>"
    found in the name (case-insensitive, returned as written in the name).
    When none is found, the file name without its extension is returned.
    """
    m = _LEC_RE.search(fn)
    return m.group(1) if m else os.path.splitext(fn)[0]
375
+
376
+
377
+ # ── 检索 ──────────────────────────────────────────────────────────────────────
378
def _content_tokens(s):
    """Return the set of lower-cased content tokens of *s*.

    Tokens are the matches of _TOKEN_RE; English and Chinese stopwords are
    removed so that words like "the"/"is" cannot make an unrelated page look
    like a hit.
    """
    kept = set()
    for token in _TOKEN_RE.findall(s.lower()):
        if token in _EN_STOP or token in _ZH_STOP:
            continue
        kept.add(token)
    return kept
382
+
383
+
384
def _keyword_score(query, text):
    """Return the share of the query's content tokens that occur in *text*.

    The result is in [0.0, 1.0]; it is 0.0 when either the query or the text
    has no content tokens, so a query made only of stopwords never counts as
    a hit.
    """
    wanted = _content_tokens(query)
    if not wanted:
        return 0.0
    available = _content_tokens(text)
    if not available:
        return 0.0
    matched = wanted & available
    return len(matched) / len(wanted)
393
+
394
+
395
def query(subject, question, k, dpi):
    """Reconcile the index of *subject*, then retrieve the pages best matching *question*.

    Semantic search (cosine similarity against the stored, normalised
    vectors) is used when the embedder is available and the vector file lines
    up with the chunk list; otherwise the keyword score is used. The top *k*
    candidates are returned; ``miss`` is True when there are none or the best
    score is below the threshold of the mode that was used. Pages flagged
    visual_flag / visual_only are rendered to PNG on demand.

    Returns:
        A JSON-serialisable dict with ``subject``, ``mode``, ``reconcile``,
        ``miss`` and ``results``; plus ``reason`` when there is no usable
        index, or ``threshold`` and ``top_score`` otherwise.
    """
    rec = reconcile(subject, dpi)
    sp = subject_paths(subject)

    def _no_index(reason):
        # Early exit shared by the "no chunks file" and "empty chunks" cases.
        return {"subject": subject, "mode": rec["mode"], "reconcile": rec,
                "miss": True, "reason": reason, "results": []}

    if not os.path.exists(sp["chunks"]):
        return _no_index("no_index")
    with open(sp["chunks"], encoding="utf-8") as f:
        chunks = json.load(f)
    if not chunks:
        return _no_index("empty")

    mode = _embed_state["mode"]
    scored = []
    if mode == "semantic" and os.path.exists(sp["vectors"]):
        import numpy as np
        mat = np.load(sp["vectors"])
        qv = embed_texts([question], "query")
        if qv is None or mat.shape[0] != len(chunks):
            mode = "keyword"
        else:
            unit_q = qv[0] / (np.linalg.norm(qv[0]) + 1e-9)
            sims = mat @ unit_q
            best = np.argsort(-sims)[:k]
            scored = [(float(sims[idx]), chunks[idx]) for idx in best]
    if not scored:  # keyword fallback path
        mode = "keyword"
        candidates = ((_keyword_score(question, chunk["embed_text"]), chunk)
                      for chunk in chunks)
        scored = sorted(candidates, key=lambda pair: -pair[0])[:k]

    if mode == "semantic":
        threshold = SIM_THRESHOLD
    else:
        threshold = KEYWORD_MIN_RATIO
    top = scored[0][0] if scored else 0.0
    miss = (not scored) or (top < threshold)

    results = []
    for score, chunk in scored:
        page_dir = os.path.join(sp["base"], chunk["slug"])
        png = None
        # Only pages carrying images are rendered (top-k only, so this stays cheap).
        if chunk["visual_flag"] or chunk["visual_only"]:
            png = render_page(os.path.join(slide_dir(subject), chunk["file"]),
                              chunk["page"], dpi, page_dir)
        results.append({
            "source_file_exact": chunk["source_file_exact"],
            "lecture": chunk["lecture"],
            "page": chunk["page"],
            "score": round(score, 4),
            "visual_flag": chunk["visual_flag"],
            "visual_only": chunk["visual_only"],
            "png_path": png,
            "text_path": os.path.join(page_dir, f"page-{chunk['page']:03d}.text.txt"),
        })
    return {"subject": subject, "mode": mode, "reconcile": rec,
            "miss": miss, "threshold": threshold, "top_score": round(top, 4),
            "results": results}
447
+
448
+
449
+ # ── CLI ───────────────────────────────────────────────────────────────────────
450
def main(argv=None):
    """Command-line entry point: run `index`, `query` or `render` and print the result as JSON.

    Args:
        argv: argument list; None lets argparse read sys.argv.

    Returns:
        0 (the outcome is reported in the printed JSON).
    """
    parser = argparse.ArgumentParser(description="课件 PDF 检索/建档/渲染引擎")
    commands = parser.add_subparsers(dest="cmd", required=True)

    index_cmd = commands.add_parser("index", help="建/更新某科目课件档")
    index_cmd.add_argument("--subject", required=True)
    index_cmd.add_argument("--dpi", type=int, default=DEFAULT_DPI)

    query_cmd = commands.add_parser("query", help="检索(先 reconcile)")
    query_cmd.add_argument("--subject", required=True)
    query_cmd.add_argument("-q", "--question", required=True)
    query_cmd.add_argument("-k", type=int, default=5)
    query_cmd.add_argument("--dpi", type=int, default=DEFAULT_DPI)

    render_cmd = commands.add_parser("render", help="渲染单页 PNG")
    render_cmd.add_argument("--pdf", required=True, help="PDF 路径")
    render_cmd.add_argument("--page", type=int, required=True)
    render_cmd.add_argument("--dpi", type=int, default=DEFAULT_DPI)
    render_cmd.add_argument("--subject", help="可选,仅用于缓存归类")

    args = parser.parse_args(argv)
    if args.cmd == "index":
        out = reconcile(args.subject, args.dpi)
    elif args.cmd == "query":
        out = query(args.subject, args.question, args.k, args.dpi)
    elif args.cmd == "render":
        # Same collision-safe slug as the index path, so both share one cache directory.
        pdf_slug = file_slug(os.path.basename(args.pdf))
        target_dir = os.path.join(CACHE_ROOT, args.subject or "_adhoc", pdf_slug)
        png_path = render_page(args.pdf, args.page, args.dpi, target_dir)
        out = {"png_path": png_path, "ok": png_path is not None}
    print(json.dumps(out, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,6 @@
1
+ # pdf-vision 依赖(pdf-rag.sh 安装到隔离 venv .harness/.venv-pdf)
2
+ # OCR(Tesseract)是可选系统依赖,不在此处,运行时探测。
3
+ pymupdf==1.24.*
4
+ fastembed==0.7.*
5
+ numpy>=1.26,<3
6
+ pypdf>=4,<6