paperforge 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. paperforge/__init__.py +3 -0
  2. paperforge/__main__.py +8 -0
  3. paperforge/adapters/__init__.py +0 -0
  4. paperforge/adapters/bbt.py +224 -0
  5. paperforge/adapters/collections.py +30 -0
  6. paperforge/adapters/obsidian_frontmatter.py +355 -0
  7. paperforge/adapters/zotero_paths.py +63 -0
  8. paperforge/cli.py +556 -0
  9. paperforge/command_files/pf-deep.md +313 -0
  10. paperforge/command_files/pf-end.md +47 -0
  11. paperforge/command_files/pf-ocr.md +142 -0
  12. paperforge/command_files/pf-paper.md +132 -0
  13. paperforge/command_files/pf-status.md +119 -0
  14. paperforge/command_files/pf-sync.md +121 -0
  15. paperforge/commands/__init__.py +26 -0
  16. paperforge/commands/context.py +170 -0
  17. paperforge/commands/dashboard.py +163 -0
  18. paperforge/commands/deep.py +77 -0
  19. paperforge/commands/ocr.py +180 -0
  20. paperforge/commands/repair.py +98 -0
  21. paperforge/commands/status.py +47 -0
  22. paperforge/commands/sync.py +68 -0
  23. paperforge/config.py +433 -0
  24. paperforge/core/__init__.py +0 -0
  25. paperforge/core/date_utils.py +17 -0
  26. paperforge/core/errors.py +60 -0
  27. paperforge/core/io.py +15 -0
  28. paperforge/core/result.py +79 -0
  29. paperforge/core/state.py +131 -0
  30. paperforge/doctor/__init__.py +0 -0
  31. paperforge/doctor/field_validator.py +171 -0
  32. paperforge/logging_config.py +69 -0
  33. paperforge/ocr_diagnostics.py +270 -0
  34. paperforge/pdf_resolver.py +113 -0
  35. paperforge/plugin/main.js +3500 -0
  36. paperforge/plugin/manifest.json +10 -0
  37. paperforge/plugin/package-lock.json +2375 -0
  38. paperforge/plugin/package.json +16 -0
  39. paperforge/plugin/styles.css +2418 -0
  40. paperforge/plugin/versions.json +8 -0
  41. paperforge/schema/__init__.py +35 -0
  42. paperforge/services/__init__.py +0 -0
  43. paperforge/services/skill_deploy.py +90 -0
  44. paperforge/services/sync_service.py +318 -0
  45. paperforge/setup/__init__.py +25 -0
  46. paperforge/setup/agent.py +85 -0
  47. paperforge/setup/checker.py +68 -0
  48. paperforge/setup/config_writer.py +80 -0
  49. paperforge/setup/plan.py +97 -0
  50. paperforge/setup/runtime.py +103 -0
  51. paperforge/setup/vault.py +148 -0
  52. paperforge/setup_wizard.py +865 -0
  53. paperforge/skills/__init__.py +1 -0
  54. paperforge/skills/literature-qa/scripts/ld_deep.py +1790 -0
  55. paperforge/worker/__init__.py +34 -0
  56. paperforge/worker/_domain.py +103 -0
  57. paperforge/worker/_progress.py +19 -0
  58. paperforge/worker/_retry.py +80 -0
  59. paperforge/worker/_utils.py +294 -0
  60. paperforge/worker/asset_index.py +659 -0
  61. paperforge/worker/asset_state.py +221 -0
  62. paperforge/worker/base_views.py +534 -0
  63. paperforge/worker/deep_reading.py +135 -0
  64. paperforge/worker/discussion.py +409 -0
  65. paperforge/worker/ocr.py +1924 -0
  66. paperforge/worker/paper_meta.py +87 -0
  67. paperforge/worker/paper_resolver.py +384 -0
  68. paperforge/worker/repair.py +415 -0
  69. paperforge/worker/status.py +1150 -0
  70. paperforge/worker/sync.py +1245 -0
  71. paperforge/worker/update.py +345 -0
  72. paperforge-1.5.1.dist-info/METADATA +328 -0
  73. paperforge-1.5.1.dist-info/RECORD +77 -0
  74. paperforge-1.5.1.dist-info/WHEEL +5 -0
  75. paperforge-1.5.1.dist-info/entry_points.txt +2 -0
  76. paperforge-1.5.1.dist-info/licenses/LICENSE +425 -0
  77. paperforge-1.5.1.dist-info/top_level.txt +1 -0
paperforge/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """paperforge — PaperForge package."""
2
+
3
# Version string of the paperforge package.
__version__ = "1.5.1"
paperforge/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """paperforge.__main__ — entry point for `python -m paperforge`."""
2
+
3
+ import sys
4
+
5
+ from paperforge.cli import main
6
+
7
# Only run the CLI when executed as a script/module; the value returned by
# main() is handed to sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
File without changes
@@ -0,0 +1,224 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from paperforge.adapters.collections import build_collection_lookup
7
+ from paperforge.core.date_utils import extract_year
8
+ from paperforge.core.io import read_json
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def normalize_attachment_path(path: str, zotero_dir: Path | None = None) -> tuple[str, str, str]:
    """Normalize a BBT attachment path to a consistent ``storage:`` format.

    Handles three real-world BBT export formats:
        1. Absolute paths: D:\\...\\Zotero\\storage\\8CHARKEY\\filename.pdf
           -> storage:8CHARKEY/filename.pdf
        2. storage: prefix: storage:KEY/filename.pdf -> pass through
        3. Bare relative: KEY/filename.pdf -> storage:KEY/filename.pdf

    Args:
        path: Raw path from BBT JSON attachment.
        zotero_dir: Accepted for interface compatibility; this function does
            not currently read it.

    Returns:
        Tuple of (normalized_path, bbt_path_raw, zotero_storage_key).
        normalized_path uses forward slashes and a ``storage:`` prefix for
        Zotero storage paths, or an ``absolute:`` prefix for absolute paths
        that do not point into a Zotero storage folder. bbt_path_raw is the
        stripped original input. zotero_storage_key is the first path segment
        after ``storage`` (empty for ``absolute:`` results and empty input).
    """
    raw = str(path or "").strip()
    if not raw:
        return ("", "", "")

    # Format 2: already prefixed -- only normalize the separators.
    if raw.startswith("storage:"):
        storage_rel = raw[len("storage:"):].lstrip("/").lstrip("\\").replace("\\", "/")
        return (f"storage:{storage_rel}", raw, storage_rel.split("/")[0])

    # Format 1: absolute path. The drive-letter test keeps Windows paths
    # recognizable even when this runs on a POSIX host.
    looks_absolute = Path(raw).is_absolute() or (len(raw) >= 2 and raw[0].isalpha() and raw[1] == ":")
    if looks_absolute:
        norm_path = raw.replace("\\", "/")
        marker = "/storage/"
        # Check every "/storage/" occurrence, not just the first one: a parent
        # directory may itself be called "storage" (e.g. D:/storage/Zotero/storage/KEY/...).
        idx = norm_path.find(marker)
        while idx != -1:
            key, sep, filename = norm_path[idx + len(marker):].partition("/")
            if sep and len(key) == 8 and key.isalnum():
                return (f"storage:{key}/{filename}", raw, key)
            idx = norm_path.find(marker, idx + 1)
        # Absolute path but not in Zotero storage -- mark as absolute.
        return (f"absolute:{raw}", raw, "")

    # Format 3: bare relative path -- prepend the storage: prefix.
    norm = raw.replace("\\", "/")
    return (f"storage:{norm}", raw, norm.split("/")[0])
68
+
69
+
70
def identify_main_pdf(attachments: list[dict]) -> tuple[dict | None, list[dict]]:
    """Pick the main PDF among attachments and list the remaining PDFs.

    Only dict attachments whose contentType is "application/pdf" are
    considered. Selection order (Decision D-02):
        1. The first PDF whose title is exactly "PDF".
        2. The PDF with a strictly largest positive size.
        3. The PDF with the shortest title (the earliest one wins a tie).

    Args:
        attachments: List of attachment dicts from load_export_rows().

    Returns:
        Tuple of (main_pdf_attachment, supplementary_attachments).
        main_pdf_attachment is None (with an empty list) if no PDFs exist;
        otherwise supplementary_attachments holds every other PDF.
    """
    pdfs = [
        att
        for att in attachments
        if isinstance(att, dict) and att.get("contentType") == "application/pdf"
    ]
    if not pdfs:
        return (None, [])

    def _split(chosen: dict) -> tuple[dict, list[dict]]:
        # Identity comparison: equal-looking dicts are still distinct attachments.
        return (chosen, [att for att in pdfs if att is not chosen])

    titled_pdf = next((att for att in pdfs if att.get("title") == "PDF"), None)
    if titled_pdf is not None:
        return _split(titled_pdf)

    # Size only decides when one attachment is unambiguously the largest.
    sizes = [att.get("size", 0) or 0 for att in pdfs]
    largest = max(sizes)
    if largest > 0 and sizes.count(largest) == 1:
        return _split(pdfs[sizes.index(largest)])

    return _split(min(pdfs, key=lambda att: len(str(att.get("title", "")))))
112
+
113
+
114
def extract_authors(item: dict) -> list[str]:
    """Return display names for the creators of *item* typed "author".

    A name is "firstName lastName" (either part may be missing); when both
    are empty the single-field "name" value is used if present. Creators of
    any other creatorType are skipped.
    """
    names: list[str] = []
    for creator in item.get("creators", []):
        if creator.get("creatorType") != "author":
            continue
        given = creator.get("firstName", "")
        family = creator.get("lastName", "")
        combined = " ".join(filter(None, (given, family))).strip()
        if combined:
            names.append(combined)
        elif creator.get("name"):
            names.append(creator["name"])
    return names
127
+
128
+
129
def collection_fields(collection_paths: list[str]) -> dict[str, str | list[str]]:
    """Derive collection metadata from "/"-separated collection paths.

    Returns a dict with:
        collections: the non-empty input paths, in input order.
        collection_tags: every distinct path segment, in first-seen order.
        collection_group: a one-element list holding the deepest path (ties
            broken by length, then by string order), or [] without paths.
    """
    kept = [entry for entry in collection_paths if entry]
    deepest = max(kept, key=lambda v: (v.count("/"), len(v), v)) if kept else ""

    # A dict doubles as an insertion-ordered set of segment names.
    ordered_tags: dict[str, None] = {}
    for entry in kept:
        for segment in entry.split("/"):
            label = segment.strip()
            if label:
                ordered_tags.setdefault(label, None)

    return {
        "collections": kept,
        "collection_tags": list(ordered_tags),
        "collection_group": [deepest] if deepest else [],
    }
143
+
144
+
145
def resolve_item_collection_paths(item: dict, collection_lookup: dict) -> list[str]:
    """Collect the collection paths an item belongs to.

    Keys listed under item["collections"] are translated through
    collection_lookup["path_by_key"] (an unknown key is kept verbatim), and
    paths registered for item["itemID"] under
    collection_lookup["paths_by_item_id"] are added. The result is
    de-duplicated and sorted deepest-first, then alphabetically.
    """
    path_by_key = collection_lookup.get("path_by_key", {})
    found = [path_by_key.get(key, key) for key in (item.get("collections") or [])]

    item_id = item.get("itemID")
    if item_id is not None:
        found += collection_lookup.get("paths_by_item_id", {}).get(item_id, [])

    unique = {entry for entry in found if entry}
    return sorted(unique, key=lambda entry: (-entry.count("/"), entry))
155
+
156
+
157
def load_export_rows(path: Path) -> list[dict]:
    """Load a BBT JSON export and flatten it into one row dict per item.

    A top-level JSON list is returned untouched. A dict with an "items" list
    is converted: items typed attachment/note/annotation are skipped, each
    attachment path is normalized, and the main PDF is picked with
    identify_main_pdf().

    Args:
        path: Location of the JSON export, read through read_json().

    Returns:
        List of row dicts (or the raw list for list-shaped exports).

    Raises:
        ValueError: If the JSON is neither a list nor a dict with an "items" list.
    """
    data = read_json(path)
    if isinstance(data, list):
        return data
    if not (isinstance(data, dict) and isinstance(data.get("items"), list)):
        raise ValueError(f"Unsupported export format: {path}")

    collection_lookup = build_collection_lookup(data.get("collections", {}))
    skipped_types = {"attachment", "note", "annotation"}
    rows = []
    for item in data["items"]:
        if item.get("itemType") in skipped_types:
            continue

        attachments = []
        for attachment in item.get("attachments", []):
            if not isinstance(attachment, dict):
                continue
            normalized_path, raw_bbt_path, storage_key = normalize_attachment_path(attachment.get("path", ""))
            # Preserve contentType from BBT if present; fall back to the file extension.
            content_type = attachment.get("contentType", "")
            if not content_type and str(normalized_path).lower().endswith(".pdf"):
                content_type = "application/pdf"
            attachments.append(
                {
                    "path": normalized_path,
                    "contentType": content_type,
                    "title": attachment.get("title", ""),
                    "bbt_path_raw": raw_bbt_path,
                    "zotero_storage_key": storage_key,
                    "size": attachment.get("size", 0) or 0,
                }
            )

        main_pdf, supplementary_pdfs = identify_main_pdf(attachments)
        if main_pdf:
            pdf_path = main_pdf["path"]
            bbt_path_raw = main_pdf["bbt_path_raw"]
            zotero_storage_key = main_pdf["zotero_storage_key"]
            path_error = ""
        else:
            pdf_path = bbt_path_raw = zotero_storage_key = ""
            path_error = "not_found"

        rows.append(
            {
                "key": item.get("key") or item.get("itemKey", ""),
                "title": item.get("title", ""),
                "authors": extract_authors(item),
                "creators": item.get("creators", []),
                "abstract": item.get("abstractNote", ""),
                "journal": item.get("publicationTitle", ""),
                "extra": item.get("extra", ""),
                "year": extract_year(item.get("date", "")),
                "date": item.get("date", ""),
                "doi": item.get("DOI", ""),
                "pmid": item.get("PMID", ""),
                "collections": resolve_item_collection_paths(item, collection_lookup),
                "attachments": attachments,
                "pdf_path": pdf_path,
                "supplementary": [extra["path"] for extra in supplementary_pdfs],
                "attachment_count": len(attachments),
                "bbt_path_raw": bbt_path_raw,
                "zotero_storage_key": zotero_storage_key,
                "path_error": path_error,
            }
        )
    return rows
219
+
220
+
221
# ── Backward-compat aliases (v2.1 → 2.2 migration) ──
# Each underscore-prefixed name is bound to the same object as its public
# counterpart, so both spellings resolve to one implementation.
_normalize_attachment_path = normalize_attachment_path
_identify_main_pdf = identify_main_pdf
_extract_year = extract_year
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+
4
def build_collection_lookup(collections: dict) -> dict:
    """Build parent-resolved path cache from a Zotero collection tree.

    Args:
        collections: Mapping of collection key to a node dict that may carry
            "name", "parent" (key of the parent, falsy for a root) and
            "items" (ids of member items). ``None`` is treated as empty.

    Returns a dict with:
        path_by_key: {collection_key: "Parent/Sub/Name", ...}
        paths_by_item_id: {item_id: [collection_path, ...], ...}
    """
    collections = collections or {}
    path_cache: dict[str, str] = {}
    item_paths: dict[str, list[str]] = {}
    # Keys whose path is currently being resolved; used to break parent cycles.
    in_progress: set[str] = set()

    def path_for(key: str) -> str:
        if key in path_cache:
            return path_cache[key]
        node = collections.get(key, {})
        parent = node.get("parent") or ""
        name = node.get("name", "")
        in_progress.add(key)
        # A parent that is already on the resolution stack means the data has
        # a cycle (or a self-parent); treat this node as a root rather than
        # recursing forever.
        parent_path = path_for(parent) if parent and parent not in in_progress else ""
        in_progress.discard(key)
        full_path = f"{parent_path}/{name}" if parent_path else name
        path_cache[key] = full_path
        return full_path

    for key, node in collections.items():
        full_path = path_for(key)
        for item_id in node.get("items") or []:
            item_paths.setdefault(item_id, []).append(full_path)

    return {"path_by_key": path_cache, "paths_by_item_id": item_paths}
@@ -0,0 +1,355 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+
8
+ def _yaml_quote(value: str) -> str:
9
+ if isinstance(value, bool):
10
+ return "true" if value else "false"
11
+ return '"' + str(value or "").replace("\\", "\\\\").replace('"', '\\"') + '"'
12
+
13
+
14
+ def _yaml_block(value: str) -> list[str]:
15
+ value = (value or "").strip()
16
+ if not value:
17
+ return ["abstract: |-", " "]
18
+ lines = ["abstract: |-"]
19
+ for line in value.splitlines():
20
+ lines.append(f" {line}")
21
+ return lines
22
+
23
+
24
def compute_final_collection(row: dict) -> str:
    """Return the collection a candidate row should end up in.

    When the row has a non-blank "user_collection", the (possibly empty)
    "user_collection_resolved" value is returned; otherwise the
    "recommended_collection" value is. All values are stripped strings.
    """

    def _clean(field: str) -> str:
        return str(row.get(field, "") or "").strip()

    if _clean("user_collection"):
        return _clean("user_collection_resolved")
    return _clean("recommended_collection")
31
+
32
+
33
# Level-2 markdown heading text of the deep-reading ("精读") section of a note.
DEEP_READING_HEADER = "## 🔍 精读"
34
+
35
+
36
+ def _read_frontmatter_bool_from_text(text: str, key: str, default: bool = False) -> bool:
37
+ match = re.search(rf"^{re.escape(key)}:\s*(?:[\"'])?(true|false)(?:[\"'])?\s*$", text, re.MULTILINE | re.IGNORECASE)
38
+ if not match:
39
+ return default
40
+ return match.group(1).lower() == "true"
41
+
42
+
43
def read_frontmatter_bool(note_path: Path, key: str, default: bool = False) -> bool:
    """Read a boolean field from a note file's frontmatter (Path-based).

    Returns *default* when the path is falsy, the file does not exist, the
    field is absent, or reading/parsing raises.
    """
    if note_path and note_path.exists():
        try:
            return _read_frontmatter_bool_from_text(note_path.read_text(encoding="utf-8"), key, default)
        except Exception:
            # Best-effort lookup: any failure degrades to the default.
            return default
    return default
52
+
53
+
54
+ def _read_frontmatter_optional_bool_from_text(text: str, key: str) -> Optional[bool]:
55
+ match = re.search(rf"^{re.escape(key)}:\s*(?:[\"'])?(true|false)(?:[\"'])?\s*$", text, re.MULTILINE | re.IGNORECASE)
56
+ if not match:
57
+ return None
58
+ return match.group(1).lower() == "true"
59
+
60
+
61
def read_frontmatter_optional_bool(note_path: Path, key: str) -> Optional[bool]:
    """Read an optional boolean field from a note file's frontmatter (Path-based).

    Returns None when the path is falsy, the file does not exist, the field
    is absent, or reading/parsing raises.
    """
    if note_path and note_path.exists():
        try:
            return _read_frontmatter_optional_bool_from_text(note_path.read_text(encoding="utf-8"), key)
        except Exception:
            # Best-effort lookup: any failure is reported as "not set".
            return None
    return None
70
+
71
+
72
+ def _legacy_control_flags(paths: dict[str, Path], zotero_key: str) -> dict[str, Optional[bool]]:
73
+ records_root = paths.get("library_records")
74
+ if not records_root or not records_root.exists():
75
+ return {"do_ocr": None, "analyze": None}
76
+ for record_path in records_root.rglob(f"{zotero_key}.md"):
77
+ try:
78
+ text = record_path.read_text(encoding="utf-8")
79
+ except Exception:
80
+ continue
81
+ return {
82
+ "do_ocr": _read_frontmatter_optional_bool_from_text(text, "do_ocr"),
83
+ "analyze": _read_frontmatter_optional_bool_from_text(text, "analyze"),
84
+ }
85
+ return {"do_ocr": None, "analyze": None}
86
+
87
+
88
def canonicalize_decision(value: str) -> str:
    """Map a free-form decision label onto one of the three canonical values.

    "排除" and "不纳入" become "不纳入", "纳入" stays "纳入", and everything
    else (including blank and "待查") becomes "待定".
    """
    canonical = {"排除": "不纳入", "不纳入": "不纳入", "纳入": "纳入"}
    return canonical.get(str(value or "").strip(), "待定")
97
+
98
+
99
def candidate_markdown(row: dict) -> str:
    """Render a candidate row as a markdown note with YAML frontmatter.

    Works on a copy of *row*: "final_collection" and a canonical "decision"
    are computed, and "status" defaults to "candidate". Fields are written
    in a fixed order; lists become YAML sequences, empty strings become bare
    keys, multi-line values become block scalars, and everything else is
    quoted. The body is a heading with the candidate id plus a fixed notice.

    Raises:
        KeyError: If the row has no "candidate_id".
    """
    field_order = (
        "candidate_id",
        "domain",
        "title",
        "authors",
        "year",
        "journal",
        "doi",
        "pmid",
        "source",
        "requester_skill",
        "request_context",
        "abstract_short",
        "decision",
        "recommended_collection",
        "recommend_confidence",
        "recommend_reason",
        "user_collection",
        "user_collection_resolved",
        "final_collection",
        "collection_resolution",
        "duplicate_hint",
        "existing_zotero_key",
        "existing_collections",
        "import_status",
        "note",
        "candidate_source_type",
        "source_zotero_key",
        "cited_ref_number",
        "trigger_sentence",
        "source_context",
        "task_relevance_reason",
        "harvest_priority",
        "raw_reference",
        "status",
    )

    record = dict(row)
    record["final_collection"] = compute_final_collection(record)
    record["decision"] = canonicalize_decision(record.get("decision", ""))
    record.setdefault("status", "candidate")

    out = ["---"]
    for key in field_order:
        value = record.get(key, "")
        if isinstance(value, list):
            out.append(f"{key}:")
            out.extend(f" - {_yaml_quote(entry)}" for entry in value)
            continue
        if value == "":
            out.append(f"{key}:")
            continue
        rendered = str(value)
        if "\n" not in rendered:
            out.append(f"{key}: {_yaml_quote(value)}")
        elif key == "abstract":
            out.extend(_yaml_block(rendered))
        else:
            out.append(f"{key}: |-")
            out.extend(f" {line}" for line in rendered.splitlines())

    out += [
        "---",
        "",
        f"# {record['candidate_id']}",
        "",
        "候选文献轻量记录,仅用于 Base 决策和 write-back 触发,不是正式文献卡片。",
        "",
    ]
    return "\n".join(out)
168
+
169
+
170
def generate_review(candidates: list[dict]) -> str:
    """Build the markdown overview of a candidate batch.

    Each candidate is copied with a canonical "decision". The report lists
    the counts, the candidates to include (with their final collection and
    reason), the candidates to exclude (with reason), and fixed next steps.
    Rows in either list must carry "candidate_id" and "title".
    """
    normalized = [{**row, "decision": canonicalize_decision(row.get("decision", ""))} for row in candidates]
    include = [row for row in normalized if row.get("decision") == "纳入"]
    exclude = [row for row in normalized if row.get("decision") == "不纳入"]

    def _entries(rows: list[dict], with_collection: bool) -> list[str]:
        # One "### id" block per row, or a "none" bullet for an empty group.
        if not rows:
            return ["- 暂无", ""]
        block: list[str] = []
        for row in rows:
            block += [f"### {row['candidate_id']}", "", f"- 标题:{row['title']}"]
            if with_collection:
                block.append(f"- 推荐分类:`{compute_final_collection(row)}`")
            block += [f"- 理由:{row.get('recommend_reason', '')}", ""]
        return block

    report = [
        "# 本轮候选总览",
        "",
        "## 检索背景",
        "",
        f"- 候选数量:{len(normalized)}",
        f"- 建议纳入:{len(include)}",
        f"- 不纳入:{len(exclude)}",
        "",
        "## 总体判断",
        "",
        "- 当前候选池已经按决策状态分层,可直接进入 Base 处理。",
        "",
        "## 推荐优先纳入",
        "",
    ]
    report += _entries(include, with_collection=True)
    report += ["## 不纳入", ""]
    report += _entries(exclude, with_collection=False)
    report += ["## 下一步", "", "1. 在 Base 中确认决策。", "2. 对纳入项执行 write-back。", "3. 刷新正式索引。", ""]
    return "\n".join(report)
224
+
225
+
226
def extract_preserved_deep_reading(text: str) -> str:
    """Return the deep-reading section of a note, or "" if it has no content.

    The section starts at the first "## 🔍 精读" / "## 精读" heading line (the
    emoji is optional) and runs to the end of the text. It is dropped when
    nothing but headings or placeholder text follows.
    """
    if not text:
        return ""
    heading_re = re.compile(r"^##\s*🔍?\s*精读\s*$", re.MULTILINE)
    heading = heading_re.search(text)
    if heading is None:
        return ""
    section = text[heading.start():].strip()
    # Judge emptiness on what is left once the heading lines are removed.
    remainder = heading_re.sub("", section).strip()
    if remainder and not _is_placeholder_only(remainder):
        return section
    return ""
240
+
241
+
242
+ _PLACEHOLDER_PATTERN = re.compile(r"(待补充)|\(待补充\)|\(TODO\)|\(TBD\)")
243
+
244
+
245
+ def _is_placeholder_only(body: str) -> bool:
246
+ """Check if the deep reading body is only placeholder text (no real content)."""
247
+ cleaned = _PLACEHOLDER_PATTERN.sub("", body).strip()
248
+ cleaned = re.sub(r"^[-*]\s*$", "", cleaned, flags=re.MULTILINE).strip()
249
+ return not cleaned
250
+
251
+
252
def has_deep_reading_content(text: str) -> bool:
    """Return True when the deep-reading section of a note is filled in.

    Three checks must all pass on the section body:
        * a Clarity bullet with text after its colon;
        * a "Figure 导读" section with at least one "- ...: text" bullet whose
          text is not the placeholder;
        * a "遗留问题" section (bold or "####" heading) with at least one
          substantive line.
    """
    preserved = extract_preserved_deep_reading(text)
    if not preserved:
        return False
    # Strip both "## 🔍 精读" and "## 精读" header variants.
    body = re.sub(r"^##\s*🔍?\s*精读\s*$", "", preserved, flags=re.MULTILINE).strip()
    if not body:
        return False

    # NOTE(review): the parentheses around 清晰度 are unescaped, so they form a
    # regex group rather than matching literal brackets -- confirm intended.
    clarity_filled = re.search(r"-\s*\*\*Clarity\*\*(清晰度):(.+)", body) is not None
    clarity_blank = re.search(r"-\s*\*\*Clarity\*\*(清晰度):\s*$", body, re.MULTILINE) is not None
    clarity_ok = clarity_filled and not clarity_blank

    figure_ok = False
    figure_sec = _extract_section(body, r"\*\*Figure 导读\*\*")
    if figure_sec:
        for raw_line in figure_sec.splitlines():
            entry = raw_line.strip()
            if not (entry.startswith("- ") and ":" in entry):
                continue
            detail = entry.split(":", 1)[1].strip()
            if detail and detail != "(待补充)":
                figure_ok = True
                break

    issue_sec = _extract_section(body, r"\*\*遗留问题\*\*") or _extract_section(body, r"####\s*遗留问题")
    issue_ok = False
    if issue_sec:
        ignorable = ("-", "(待补充)", "", "**遗留问题**")
        issue_ok = any(
            candidate.strip() and candidate.strip() not in ignorable
            for candidate in issue_sec.splitlines()
        )

    return clarity_ok and figure_ok and issue_ok
286
+
287
+
288
+ def _extract_section(body: str, section_header: str) -> str | None:
289
+ m = re.search(
290
+ section_header + r"\n*(.*?)(?=\n(?:#{1,3})\s|\Z)",
291
+ body,
292
+ re.DOTALL,
293
+ )
294
+ if m:
295
+ return m.group(1).strip()
296
+ return None
297
+
298
+
299
+ def _add_missing_frontmatter_fields(existing_content: str, new_fields: dict[str, str]) -> str:
300
+ if not existing_content.startswith("---"):
301
+ return existing_content
302
+ parts = existing_content.split("---", 2)
303
+ if len(parts) < 3:
304
+ return existing_content
305
+ frontmatter = parts[1]
306
+ body = parts[2]
307
+ lines_to_add = []
308
+ for key, value in new_fields.items():
309
+ pattern = "^" + re.escape(key) + "\\s*:"
310
+ if not re.search(pattern, frontmatter, re.MULTILINE):
311
+ lines_to_add.append(f"{key}: {_yaml_quote(value)}")
312
+ if not lines_to_add:
313
+ return existing_content
314
+ new_frontmatter = frontmatter.rstrip("\n") + "\n" + "\n".join(lines_to_add) + "\n"
315
+ return f"---{new_frontmatter}---{body}"
316
+
317
+
318
def update_frontmatter_field(content: str, key: str, value: str) -> str:
    """Set frontmatter field *key* to *value*, adding the field if missing.

    Args:
        content: Full note text. Returned unchanged unless it starts with "---".
        key: Frontmatter key to set.
        value: New value, rendered through _yaml_quote().

    Returns:
        The note text with the first ``key:`` line of the frontmatter
        replaced, or with the field appended to the frontmatter when no such
        line exists.
    """
    if not content.startswith("---"):
        return content
    pattern = "^" + re.escape(key) + "\\s*:.*$"
    replacement = f"{key}: {_yaml_quote(value)}"

    # Restrict the replacement to the frontmatter so an identically named
    # "key:" line in the note body is never overwritten. Without a closing
    # fence the whole text is searched, as before.
    closing = re.search(r"^---[ \t]*$", content[3:], re.MULTILINE)
    split_at = 3 + closing.start() if closing else len(content)
    head, tail = content[:split_at], content[split_at:]

    # A callable replacement is inserted verbatim. A plain string would go
    # through re's template expansion, collapsing the "\\" escapes produced
    # by _yaml_quote and corrupting values that contain backslashes.
    new_head, count = re.subn(pattern, lambda _match: replacement, head, count=1, flags=re.MULTILINE)
    if count == 0:
        return _add_missing_frontmatter_fields(content, {key: value})
    return new_head + tail
327
+
328
+
329
def read_frontmatter_dict(text: str) -> dict:
    """Parse YAML frontmatter from markdown text using yaml.safe_load.

    Falls back to a simple ``key: value`` line parser (all values kept as
    stripped strings) if the YAML parser is unavailable, fails, or does not
    produce a mapping. Returns an empty dict if there is no frontmatter.
    """
    import re

    fm_match = re.match(r"^---\s*\n(.*?)\n---", text, re.DOTALL)
    if not fm_match:
        return {}
    raw = fm_match.group(1)

    try:
        # Imported inside the try so that a missing PyYAML degrades to the
        # line parser below instead of raising ImportError.
        import yaml

        data = yaml.safe_load(raw)
    except Exception:
        # Deliberate best-effort: malformed frontmatter is still readable
        # through the fallback parser.
        data = None
    if isinstance(data, dict):
        return data

    result = {}
    for line in raw.splitlines():
        key, sep, val = line.strip().partition(":")
        if sep:
            result[key.strip()] = val.strip()
    return result
@@ -0,0 +1,63 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+
7
def obsidian_wikilink_for_pdf(pdf_path: str, vault_dir: Path, zotero_dir: Path | None = None) -> str:
    """Build an Obsidian ``[[wikilink]]`` for a PDF path.

    ``storage:`` paths are resolved under ``zotero_dir / "storage"`` when
    zotero_dir is given; every other path goes through
    absolutize_vault_path(). The link is vault-relative when the file lies
    inside the vault. Otherwise, if zotero_dir resolves (via
    resolve_junction) to a different location that contains the file, the
    link is routed through zotero_dir; failing that, the absolute path is
    linked. Returns "" for empty input.
    """
    text = str(pdf_path or "").strip()
    if not text:
        return ""

    if text.startswith("storage:") and zotero_dir is not None:
        storage_rel = text[len("storage:"):].lstrip("/").lstrip("\\")
        absolute_path = zotero_dir / "storage" / storage_rel.replace("/", os.sep)
    else:
        absolute_str = absolutize_vault_path(vault_dir, text, resolve_junction=True)
        if not absolute_str:
            return ""
        absolute_path = Path(absolute_str)

    def _link(target: Path) -> str:
        return f"[[{target.as_posix()}]]"

    try:
        inside_vault = absolute_path.relative_to(vault_dir)
    except ValueError:
        inside_vault = None
    if inside_vault is not None:
        return _link(inside_vault)

    # Path outside vault -- try to route through the Zotero junction inside the vault.
    if zotero_dir is not None and zotero_dir.exists():
        try:
            from paperforge.pdf_resolver import resolve_junction

            real_zotero = resolve_junction(zotero_dir)
            if real_zotero != zotero_dir:
                via_junction = zotero_dir / absolute_path.relative_to(real_zotero)
                return _link(via_junction.relative_to(vault_dir))
        except (ValueError, OSError):
            pass
    return _link(absolute_path)
39
+
40
+
41
def absolutize_vault_path(vault: Path, path: str, resolve_junction: bool = False) -> str:
    """Turn *path* into an absolute path string.

    An absolute input is returned as given; a relative one is joined onto
    *vault* and resolved. With resolve_junction=True the result is passed
    through paperforge.pdf_resolver.resolve_junction. Returns "" for empty
    input.
    """
    text = str(path or "").strip()
    if not text:
        return ""

    candidate = Path(text)
    if candidate.is_absolute():
        result = str(candidate)
    else:
        result = str((vault / text.replace("/", os.sep)).resolve())

    if not resolve_junction:
        return result

    # Aliased so the helper does not shadow the boolean parameter of the same name.
    from paperforge.pdf_resolver import resolve_junction as _resolve

    return str(_resolve(Path(result)))
52
+
53
+
54
def obsidian_wikilink_for_path(vault: Path, path: str) -> str:
    """Build an Obsidian ``[[wikilink]]`` for *path*.

    The path is made absolute with absolutize_vault_path(...,
    resolve_junction=True). The link is vault-relative when the target lies
    inside *vault*, otherwise it holds the absolute path. Returns "" for
    empty input.
    """
    absolute = absolutize_vault_path(vault, path, resolve_junction=True)
    if not absolute:
        return ""
    target = Path(absolute)
    try:
        target = target.relative_to(vault)
    except ValueError:
        # Outside the vault: keep linking to the absolute location.
        pass
    return f"[[{target.as_posix()}]]"
+ return f"[[{relative.as_posix()}]]"