paperforge 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paperforge/__init__.py +3 -0
- paperforge/__main__.py +8 -0
- paperforge/adapters/__init__.py +0 -0
- paperforge/adapters/bbt.py +224 -0
- paperforge/adapters/collections.py +30 -0
- paperforge/adapters/obsidian_frontmatter.py +355 -0
- paperforge/adapters/zotero_paths.py +63 -0
- paperforge/cli.py +556 -0
- paperforge/command_files/pf-deep.md +313 -0
- paperforge/command_files/pf-end.md +47 -0
- paperforge/command_files/pf-ocr.md +142 -0
- paperforge/command_files/pf-paper.md +132 -0
- paperforge/command_files/pf-status.md +119 -0
- paperforge/command_files/pf-sync.md +121 -0
- paperforge/commands/__init__.py +26 -0
- paperforge/commands/context.py +170 -0
- paperforge/commands/dashboard.py +163 -0
- paperforge/commands/deep.py +77 -0
- paperforge/commands/ocr.py +180 -0
- paperforge/commands/repair.py +98 -0
- paperforge/commands/status.py +47 -0
- paperforge/commands/sync.py +68 -0
- paperforge/config.py +433 -0
- paperforge/core/__init__.py +0 -0
- paperforge/core/date_utils.py +17 -0
- paperforge/core/errors.py +60 -0
- paperforge/core/io.py +15 -0
- paperforge/core/result.py +79 -0
- paperforge/core/state.py +131 -0
- paperforge/doctor/__init__.py +0 -0
- paperforge/doctor/field_validator.py +171 -0
- paperforge/logging_config.py +69 -0
- paperforge/ocr_diagnostics.py +270 -0
- paperforge/pdf_resolver.py +113 -0
- paperforge/plugin/main.js +3500 -0
- paperforge/plugin/manifest.json +10 -0
- paperforge/plugin/package-lock.json +2375 -0
- paperforge/plugin/package.json +16 -0
- paperforge/plugin/styles.css +2418 -0
- paperforge/plugin/versions.json +8 -0
- paperforge/schema/__init__.py +35 -0
- paperforge/services/__init__.py +0 -0
- paperforge/services/skill_deploy.py +90 -0
- paperforge/services/sync_service.py +318 -0
- paperforge/setup/__init__.py +25 -0
- paperforge/setup/agent.py +85 -0
- paperforge/setup/checker.py +68 -0
- paperforge/setup/config_writer.py +80 -0
- paperforge/setup/plan.py +97 -0
- paperforge/setup/runtime.py +103 -0
- paperforge/setup/vault.py +148 -0
- paperforge/setup_wizard.py +865 -0
- paperforge/skills/__init__.py +1 -0
- paperforge/skills/literature-qa/scripts/ld_deep.py +1790 -0
- paperforge/worker/__init__.py +34 -0
- paperforge/worker/_domain.py +103 -0
- paperforge/worker/_progress.py +19 -0
- paperforge/worker/_retry.py +80 -0
- paperforge/worker/_utils.py +294 -0
- paperforge/worker/asset_index.py +659 -0
- paperforge/worker/asset_state.py +221 -0
- paperforge/worker/base_views.py +534 -0
- paperforge/worker/deep_reading.py +135 -0
- paperforge/worker/discussion.py +409 -0
- paperforge/worker/ocr.py +1924 -0
- paperforge/worker/paper_meta.py +87 -0
- paperforge/worker/paper_resolver.py +384 -0
- paperforge/worker/repair.py +415 -0
- paperforge/worker/status.py +1150 -0
- paperforge/worker/sync.py +1245 -0
- paperforge/worker/update.py +345 -0
- paperforge-1.5.1.dist-info/METADATA +328 -0
- paperforge-1.5.1.dist-info/RECORD +77 -0
- paperforge-1.5.1.dist-info/WHEEL +5 -0
- paperforge-1.5.1.dist-info/entry_points.txt +2 -0
- paperforge-1.5.1.dist-info/licenses/LICENSE +425 -0
- paperforge-1.5.1.dist-info/top_level.txt +1 -0
paperforge/__init__.py
ADDED
paperforge/__main__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from paperforge.adapters.collections import build_collection_lookup
|
|
7
|
+
from paperforge.core.date_utils import extract_year
|
|
8
|
+
from paperforge.core.io import read_json
|
|
9
|
+
|
|
10
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def normalize_attachment_path(path: str, zotero_dir: Path | None = None) -> tuple[str, str, str]:
|
|
14
|
+
"""Normalize a BBT attachment path to a consistent storage: format.
|
|
15
|
+
|
|
16
|
+
Handles three real-world BBT export formats:
|
|
17
|
+
1. Absolute Windows paths: D:\\...\\Zotero\\storage\\8CHARKEY\\filename.pdf
|
|
18
|
+
-> storage:8CHARKEY/filename.pdf
|
|
19
|
+
2. storage: prefix: storage:KEY/filename.pdf -> pass through
|
|
20
|
+
3. Bare relative: KEY/filename.pdf -> storage:KEY/filename.pdf
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
path: Raw path from BBT JSON attachment.
|
|
24
|
+
zotero_dir: Optional absolute path to Zotero data directory for
|
|
25
|
+
validating absolute paths.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
Tuple of (normalized_path, bbt_path_raw, zotero_storage_key).
|
|
29
|
+
normalized_path uses forward slashes and storage: prefix for
|
|
30
|
+
Zotero storage paths. bbt_path_raw preserves the original input
|
|
31
|
+
for debugging. zotero_storage_key is the 8-character Zotero key.
|
|
32
|
+
"""
|
|
33
|
+
raw = str(path or "").strip()
|
|
34
|
+
if not raw:
|
|
35
|
+
return ("", "", "")
|
|
36
|
+
|
|
37
|
+
bbt_path_raw = raw
|
|
38
|
+
|
|
39
|
+
# Format 2: Already has storage: prefix — pass through with slash normalization
|
|
40
|
+
if raw.startswith("storage:"):
|
|
41
|
+
storage_rel = raw[len("storage:") :].lstrip("/").lstrip("\\")
|
|
42
|
+
storage_rel = storage_rel.replace("\\", "/")
|
|
43
|
+
parts = storage_rel.split("/")
|
|
44
|
+
zotero_storage_key = parts[0] if parts else ""
|
|
45
|
+
return (f"storage:{storage_rel}", bbt_path_raw, zotero_storage_key)
|
|
46
|
+
|
|
47
|
+
# Format 1: Absolute Windows path pointing to Zotero storage
|
|
48
|
+
candidate = Path(raw)
|
|
49
|
+
_looks_absolute = candidate.is_absolute() or (len(raw) >= 2 and raw[0].isalpha() and raw[1] == ":")
|
|
50
|
+
if _looks_absolute:
|
|
51
|
+
norm_path = raw.replace("\\", "/")
|
|
52
|
+
# Detect Zotero storage pattern: .../storage/8CHARKEY/...
|
|
53
|
+
if "/storage/" in norm_path:
|
|
54
|
+
parts_after_storage = norm_path.split("/storage/", 1)[1]
|
|
55
|
+
parts = parts_after_storage.split("/")
|
|
56
|
+
if len(parts) >= 2 and len(parts[0]) == 8 and parts[0].isalnum():
|
|
57
|
+
zotero_storage_key = parts[0]
|
|
58
|
+
filename = "/".join(parts[1:])
|
|
59
|
+
return (f"storage:{zotero_storage_key}/{filename}", bbt_path_raw, zotero_storage_key)
|
|
60
|
+
# Absolute path but not in Zotero storage — mark as absolute
|
|
61
|
+
return (f"absolute:{raw}", bbt_path_raw, "")
|
|
62
|
+
|
|
63
|
+
# Format 3: Bare relative path — prepend storage: prefix
|
|
64
|
+
norm = raw.replace("\\", "/")
|
|
65
|
+
parts = norm.split("/")
|
|
66
|
+
zotero_storage_key = parts[0] if parts else ""
|
|
67
|
+
return (f"storage:{norm}", bbt_path_raw, zotero_storage_key)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def identify_main_pdf(attachments: list[dict]) -> tuple[dict | None, list[dict]]:
|
|
71
|
+
"""Identify the main PDF and supplementary materials from attachments.
|
|
72
|
+
|
|
73
|
+
Uses a hybrid three-priority strategy (Decision D-02):
|
|
74
|
+
1. Primary: attachment.title == "PDF" AND contentType == "application/pdf"
|
|
75
|
+
2. Fallback heuristic: largest file by size (if available), else shortest title
|
|
76
|
+
3. Final fallback: first PDF attachment in the list
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
attachments: List of attachment dicts from load_export_rows().
|
|
80
|
+
|
|
81
|
+
Returns:
|
|
82
|
+
Tuple of (main_pdf_attachment, supplementary_attachments).
|
|
83
|
+
main_pdf_attachment may be None if no PDFs found.
|
|
84
|
+
supplementary_attachments is a list of all other PDF attachments.
|
|
85
|
+
"""
|
|
86
|
+
pdf_attachments = [a for a in attachments if isinstance(a, dict) and a.get("contentType") == "application/pdf"]
|
|
87
|
+
|
|
88
|
+
if not pdf_attachments:
|
|
89
|
+
return (None, [])
|
|
90
|
+
|
|
91
|
+
# Priority 1: Title exactly equals "PDF"
|
|
92
|
+
for att in pdf_attachments:
|
|
93
|
+
if att.get("title") == "PDF":
|
|
94
|
+
main = att
|
|
95
|
+
supplementary = [a for a in pdf_attachments if a is not main]
|
|
96
|
+
return (main, supplementary)
|
|
97
|
+
|
|
98
|
+
# Priority 2: Largest file by size (if size field is available and differentiated)
|
|
99
|
+
sized = [(a, a.get("size", 0) or 0) for a in pdf_attachments]
|
|
100
|
+
sized.sort(key=lambda x: x[1], reverse=True)
|
|
101
|
+
if sized and sized[0][1] > 0 and (len(sized) == 1 or sized[0][1] > sized[1][1]):
|
|
102
|
+
main = sized[0][0]
|
|
103
|
+
supplementary = [a for a in pdf_attachments if a is not main]
|
|
104
|
+
return (main, supplementary)
|
|
105
|
+
|
|
106
|
+
# Priority 2b (sizes equal or unavailable): shortest title
|
|
107
|
+
titled = [(a, len(str(a.get("title", "")))) for a in pdf_attachments]
|
|
108
|
+
titled.sort(key=lambda x: x[1])
|
|
109
|
+
main = titled[0][0]
|
|
110
|
+
supplementary = [a for a in pdf_attachments if a is not main]
|
|
111
|
+
return (main, supplementary)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def extract_authors(item: dict) -> list[str]:
    """Return display names of the creators of *item* whose type is ``author``.

    A creator's name is ``"firstName lastName"`` (whichever parts are
    present); when both are empty the single-field ``name`` is used instead.
    Creators with no usable name are skipped.

    Args:
        item: Export item dict. A missing or ``None`` ``creators`` value is
            treated as an empty list, and non-dict creator entries are
            ignored rather than raising.

    Returns:
        Author names in the order they appear in ``creators``.
    """
    authors: list[str] = []
    # `or []` covers an explicit null in the export, which .get's default does not.
    for creator in item.get("creators") or []:
        if not isinstance(creator, dict) or creator.get("creatorType") != "author":
            continue
        name_parts = [creator.get("firstName", ""), creator.get("lastName", "")]
        full_name = " ".join(part for part in name_parts if part).strip()
        if full_name:
            authors.append(full_name)
        elif creator.get("name"):
            authors.append(creator["name"])
    return authors
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def collection_fields(collection_paths: list[str]) -> dict[str, str | list[str]]:
|
|
130
|
+
paths = [path for path in collection_paths if path]
|
|
131
|
+
primary = paths[0] if paths else ""
|
|
132
|
+
if paths:
|
|
133
|
+
primary = sorted(paths, key=lambda value: (value.count("/"), len(value), value), reverse=True)[0]
|
|
134
|
+
tags = []
|
|
135
|
+
seen = set()
|
|
136
|
+
for path in paths:
|
|
137
|
+
for part in [segment.strip() for segment in path.split("/") if segment.strip()]:
|
|
138
|
+
if part not in seen:
|
|
139
|
+
seen.add(part)
|
|
140
|
+
tags.append(part)
|
|
141
|
+
group = primary
|
|
142
|
+
return {"collections": paths, "collection_tags": tags, "collection_group": [group] if group else []}
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def resolve_item_collection_paths(item: dict, collection_lookup: dict) -> list[str]:
    """Collect every collection path that applies to *item*.

    Paths come from two places: the item's own ``collections`` keys, mapped
    through ``collection_lookup["path_by_key"]`` (an unmapped key is used as
    is), and ``collection_lookup["paths_by_item_id"]`` indexed by the item's
    ``itemID``.

    Args:
        item: Export item dict.
        collection_lookup: Dict holding the ``path_by_key`` and
            ``paths_by_item_id`` mappings; either may be absent.

    Returns:
        Unique non-empty paths, deepest (most ``/``) first, then
        alphabetically.
    """
    gathered = [
        collection_lookup.get("path_by_key", {}).get(collection_key, collection_key)
        for collection_key in (item.get("collections") or [])
    ]

    item_id = item.get("itemID")
    if item_id is not None:
        gathered += collection_lookup.get("paths_by_item_id", {}).get(item_id, [])

    unique_paths = {entry for entry in gathered if entry}
    return sorted(unique_paths, key=lambda entry: (-entry.count("/"), entry))
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def load_export_rows(path: Path) -> list[dict]:
    """Load a BBT JSON export and flatten it into one row dict per item.

    Two layouts are accepted:
      * a top-level list, which is returned unchanged;
      * a dict with an ``items`` list (and optionally ``collections``), in
        which case attachment/note/annotation items are skipped and every
        remaining item is converted into a row.

    Each row carries bibliographic fields (``key``, ``title``, ``authors``,
    ``creators``, ``abstract``, ``journal``, ``extra``, ``year``, ``date``,
    ``doi``, ``pmid``), the resolved ``collections``, the normalized
    ``attachments`` list, and the main-PDF summary (``pdf_path``,
    ``supplementary``, ``attachment_count``, ``bbt_path_raw``,
    ``zotero_storage_key``, ``path_error``). ``path_error`` is
    ``"not_found"`` when no PDF attachment was identified.

    Args:
        path: Location of the JSON export file.

    Returns:
        List of row dicts (or the raw list for the list layout).

    Raises:
        ValueError: If the JSON is neither a list nor a dict with an
            ``items`` list.
    """
    data = read_json(path)
    if isinstance(data, list):
        return data
    if not (isinstance(data, dict) and isinstance(data.get("items"), list)):
        raise ValueError(f"Unsupported export format: {path}")

    # `or {}` / `or []` below guard against explicit nulls in the export,
    # which dict.get's default argument does not cover.
    collection_lookup = build_collection_lookup(data.get("collections") or {})
    rows = []
    for item in data["items"]:
        if not isinstance(item, dict):
            continue
        if item.get("itemType") in {"attachment", "note", "annotation"}:
            continue

        attachments = []
        for attachment in item.get("attachments") or []:
            if not isinstance(attachment, dict):
                continue
            normalized_path, raw_path, storage_key = normalize_attachment_path(attachment.get("path", ""))
            # Preserve contentType from BBT if present; fall back to the file extension.
            content_type = attachment.get("contentType", "")
            if not content_type and str(normalized_path).lower().endswith(".pdf"):
                content_type = "application/pdf"
            attachments.append(
                {
                    "path": normalized_path,
                    "contentType": content_type,
                    "title": attachment.get("title", ""),
                    "bbt_path_raw": raw_path,
                    "zotero_storage_key": storage_key,
                    "size": attachment.get("size", 0) or 0,
                }
            )

        main_pdf, supplementary_pdfs = identify_main_pdf(attachments)
        rows.append(
            {
                "key": item.get("key") or item.get("itemKey", ""),
                "title": item.get("title", ""),
                "authors": extract_authors(item),
                "creators": item.get("creators", []),
                "abstract": item.get("abstractNote", ""),
                "journal": item.get("publicationTitle", ""),
                "extra": item.get("extra", ""),
                "year": extract_year(item.get("date", "")),
                "date": item.get("date", ""),
                "doi": item.get("DOI", ""),
                "pmid": item.get("PMID", ""),
                "collections": resolve_item_collection_paths(item, collection_lookup),
                "attachments": attachments,
                "pdf_path": main_pdf["path"] if main_pdf else "",
                "supplementary": [entry["path"] for entry in supplementary_pdfs],
                "attachment_count": len(attachments),
                "bbt_path_raw": main_pdf["bbt_path_raw"] if main_pdf else "",
                "zotero_storage_key": main_pdf["zotero_storage_key"] if main_pdf else "",
                "path_error": "" if main_pdf else "not_found",
            }
        )
    return rows
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
# ── Backward-compat aliases (v2.1 → 2.2 migration) ──
# Each underscore-prefixed name is bound to the same object as its public
# counterpart; presumably kept so callers using the old private names still work.
_normalize_attachment_path = normalize_attachment_path
_identify_main_pdf = identify_main_pdf
_extract_year = extract_year
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def build_collection_lookup(collections: dict) -> dict:
    """Build parent-resolved path caches from a Zotero collection tree.

    Args:
        collections: Mapping of collection key to a node dict with optional
            ``name``, ``parent`` (key of the parent collection) and
            ``items`` (ids of items in the collection). ``None`` is treated
            as an empty mapping.

    Returns:
        A dict with:
            path_by_key: {collection_key: "Parent/Sub/Name", ...}
            paths_by_item_id: {item_id: [collection_path, ...], ...}

        A ``parent`` reference that would close a cycle (including a
        collection naming itself as parent) is ignored, so the walk always
        terminates instead of recursing without bound.
    """
    collections = collections or {}
    path_cache: dict[str, str] = {}
    item_paths: dict[str, list[str]] = {}
    resolving: set[str] = set()  # keys currently on the recursion stack

    def path_for(key: str) -> str:
        if key in path_cache:
            return path_cache[key]
        node = collections.get(key) or {}
        parent = node.get("parent") or ""
        name = node.get("name", "")
        resolving.add(key)
        try:
            # Skip a parent that is already being resolved: following it
            # would loop forever on malformed (cyclic) trees.
            parent_path = path_for(parent) if parent and parent not in resolving else ""
        finally:
            resolving.discard(key)
        full_path = f"{parent_path}/{name}" if parent_path else name
        path_cache[key] = full_path
        return full_path

    for key, node in collections.items():
        full_path = path_for(key)
        for item_id in (node or {}).get("items") or []:
            item_paths.setdefault(item_id, []).append(full_path)

    return {"path_by_key": path_cache, "paths_by_item_id": item_paths}
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _yaml_quote(value: str) -> str:
|
|
9
|
+
if isinstance(value, bool):
|
|
10
|
+
return "true" if value else "false"
|
|
11
|
+
return '"' + str(value or "").replace("\\", "\\\\").replace('"', '\\"') + '"'
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _yaml_block(value: str) -> list[str]:
|
|
15
|
+
value = (value or "").strip()
|
|
16
|
+
if not value:
|
|
17
|
+
return ["abstract: |-", " "]
|
|
18
|
+
lines = ["abstract: |-"]
|
|
19
|
+
for line in value.splitlines():
|
|
20
|
+
lines.append(f" {line}")
|
|
21
|
+
return lines
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def compute_final_collection(row: dict) -> str:
    """Choose the collection a candidate row finally belongs to.

    When the row has a non-blank ``user_collection`` the (stripped)
    ``user_collection_resolved`` value is returned, even if that is empty;
    otherwise the stripped ``recommended_collection`` is returned.
    """

    def _field(name: str) -> str:
        return str(row.get(name, "") or "").strip()

    if _field("user_collection"):
        return _field("user_collection_resolved")
    return _field("recommended_collection")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Level-2 markdown heading text for the deep-reading ("精读") section of a note.
DEEP_READING_HEADER = "## 🔍 精读"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _read_frontmatter_bool_from_text(text: str, key: str, default: bool = False) -> bool:
|
|
37
|
+
match = re.search(rf"^{re.escape(key)}:\s*(?:[\"'])?(true|false)(?:[\"'])?\s*$", text, re.MULTILINE | re.IGNORECASE)
|
|
38
|
+
if not match:
|
|
39
|
+
return default
|
|
40
|
+
return match.group(1).lower() == "true"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def read_frontmatter_bool(note_path: Path, key: str, default: bool = False) -> bool:
    """Read a boolean field from a formal note's YAML frontmatter (Path-based).

    Args:
        note_path: Note file to read; may be ``None``.
        key: Frontmatter key to look up.
        default: Value returned when the path is missing, the file cannot
            be read or parsed, or the key is absent.

    Returns:
        The parsed boolean, or *default*.
    """
    note_available = bool(note_path) and note_path.exists()
    if note_available:
        try:
            content = note_path.read_text(encoding="utf-8")
            return _read_frontmatter_bool_from_text(content, key, default)
        except Exception:
            # Best effort: any failure while reading falls through to the default.
            pass
    return default
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _read_frontmatter_optional_bool_from_text(text: str, key: str) -> Optional[bool]:
|
|
55
|
+
match = re.search(rf"^{re.escape(key)}:\s*(?:[\"'])?(true|false)(?:[\"'])?\s*$", text, re.MULTILINE | re.IGNORECASE)
|
|
56
|
+
if not match:
|
|
57
|
+
return None
|
|
58
|
+
return match.group(1).lower() == "true"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def read_frontmatter_optional_bool(note_path: Path, key: str) -> Optional[bool]:
    """Read an optional boolean field from a formal note's YAML frontmatter (Path-based).

    Args:
        note_path: Note file to read; may be ``None``.
        key: Frontmatter key to look up.

    Returns:
        The parsed boolean, or ``None`` when the path is missing, the file
        cannot be read or parsed, or the key is absent.
    """
    note_available = bool(note_path) and note_path.exists()
    if note_available:
        try:
            content = note_path.read_text(encoding="utf-8")
            return _read_frontmatter_optional_bool_from_text(content, key)
        except Exception:
            # Best effort: any failure while reading is reported as "unknown".
            pass
    return None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _legacy_control_flags(paths: dict[str, Path], zotero_key: str) -> dict[str, Optional[bool]]:
|
|
73
|
+
records_root = paths.get("library_records")
|
|
74
|
+
if not records_root or not records_root.exists():
|
|
75
|
+
return {"do_ocr": None, "analyze": None}
|
|
76
|
+
for record_path in records_root.rglob(f"{zotero_key}.md"):
|
|
77
|
+
try:
|
|
78
|
+
text = record_path.read_text(encoding="utf-8")
|
|
79
|
+
except Exception:
|
|
80
|
+
continue
|
|
81
|
+
return {
|
|
82
|
+
"do_ocr": _read_frontmatter_optional_bool_from_text(text, "do_ocr"),
|
|
83
|
+
"analyze": _read_frontmatter_optional_bool_from_text(text, "analyze"),
|
|
84
|
+
}
|
|
85
|
+
return {"do_ocr": None, "analyze": None}
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def canonicalize_decision(value: str) -> str:
    """Map a raw decision label onto one of the three canonical labels.

    After stripping whitespace, ``排除`` and ``不纳入`` map to ``不纳入`` and
    ``纳入`` stays ``纳入``. Every other value, including empty input and
    ``待查``, maps to ``待定``.
    """
    canonical = {
        "排除": "不纳入",
        "不纳入": "不纳入",
        "纳入": "纳入",
    }
    return canonical.get(str(value or "").strip(), "待定")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def candidate_markdown(row: dict) -> str:
    """Render a candidate row as a markdown note with YAML frontmatter.

    Works on a copy of *row*: ``final_collection`` is recomputed,
    ``decision`` is canonicalized and ``status`` defaults to ``candidate``.
    A fixed list of keys is then written to the frontmatter in order:
    lists become YAML sequences, empty strings become a bare ``key:``,
    multi-line values become ``|-`` block scalars, and everything else is
    written as a quoted scalar. The body is a heading with the candidate id
    followed by a fixed one-line notice.

    Args:
        row: Candidate record; must contain ``candidate_id``.

    Returns:
        The full markdown text.

    Raises:
        KeyError: If ``candidate_id`` is missing from *row*.
    """
    row = dict(row)
    row["final_collection"] = compute_final_collection(row)
    row["decision"] = canonicalize_decision(row.get("decision", ""))
    row.setdefault("status", "candidate")

    ordered_keys = (
        "candidate_id",
        "domain",
        "title",
        "authors",
        "year",
        "journal",
        "doi",
        "pmid",
        "source",
        "requester_skill",
        "request_context",
        "abstract_short",
        "decision",
        "recommended_collection",
        "recommend_confidence",
        "recommend_reason",
        "user_collection",
        "user_collection_resolved",
        "final_collection",
        "collection_resolution",
        "duplicate_hint",
        "existing_zotero_key",
        "existing_collections",
        "import_status",
        "note",
        "candidate_source_type",
        "source_zotero_key",
        "cited_ref_number",
        "trigger_sentence",
        "source_context",
        "task_relevance_reason",
        "harvest_priority",
        "raw_reference",
        "status",
    )

    lines = ["---"]
    for key in ordered_keys:
        value = row.get(key, "")
        if isinstance(value, list):
            lines.append(f"{key}:")
            lines.extend(f" - {_yaml_quote(item)}" for item in value)
        elif value == "":
            lines.append(f"{key}:")
        elif "\n" in str(value):
            # Multi-line value: literal block scalar. (No key in
            # ordered_keys is "abstract", so one generic form suffices.)
            lines.append(f"{key}: |-")
            lines.extend(f" {line}" for line in str(value).splitlines())
        else:
            lines.append(f"{key}: {_yaml_quote(value)}")

    lines.extend(
        [
            "---",
            "",
            f"# {row['candidate_id']}",
            "",
            "候选文献轻量记录,仅用于 Base 决策和 write-back 触发,不是正式文献卡片。",
            "",
        ]
    )
    return "\n".join(lines)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def generate_review(candidates: list[dict]) -> str:
    """Build the markdown overview of one round of candidates.

    Every candidate is copied and its ``decision`` canonicalized. The report
    lists the totals, then one entry per included candidate (id, title,
    final collection, reason) and one per excluded candidate (id, title,
    reason), and ends with a fixed "next steps" list. An empty group is
    rendered as a single placeholder bullet.

    Args:
        candidates: Candidate rows; listed rows must contain
            ``candidate_id`` and ``title``.

    Returns:
        The report as markdown text.
    """

    def _normalise(row: dict) -> dict:
        clone = dict(row)
        clone["decision"] = canonicalize_decision(clone.get("decision", ""))
        return clone

    def _section(rows: list[dict], show_collection: bool) -> list[str]:
        if not rows:
            return ["- 暂无", ""]
        rendered: list[str] = []
        for row in rows:
            rendered.append(f"### {row['candidate_id']}")
            rendered.append("")
            rendered.append(f"- 标题:{row['title']}")
            if show_collection:
                rendered.append(f"- 推荐分类:`{compute_final_collection(row)}`")
            rendered.append(f"- 理由:{row.get('recommend_reason', '')}")
            rendered.append("")
        return rendered

    normalized = [_normalise(row) for row in candidates]
    include = [entry for entry in normalized if entry.get("decision") == "纳入"]
    exclude = [entry for entry in normalized if entry.get("decision") == "不纳入"]

    report = [
        "# 本轮候选总览",
        "",
        "## 检索背景",
        "",
        f"- 候选数量:{len(normalized)}",
        f"- 建议纳入:{len(include)}",
        f"- 不纳入:{len(exclude)}",
        "",
        "## 总体判断",
        "",
        "- 当前候选池已经按决策状态分层,可直接进入 Base 处理。",
        "",
        "## 推荐优先纳入",
        "",
    ]
    report += _section(include, show_collection=True)
    report += ["## 不纳入", ""]
    report += _section(exclude, show_collection=False)
    report += ["## 下一步", "", "1. 在 Base 中确认决策。", "2. 对纳入项执行 write-back。", "3. 刷新正式索引。", ""]
    return "\n".join(report)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def extract_preserved_deep_reading(text: str) -> str:
    """Return the deep-reading section of *text* if it holds real content.

    The section starts at the first ``## 🔍 精读`` / ``## 精读`` heading
    (the emoji is optional) and runs to the end of the text.

    Returns:
        The stripped section including its heading, or ``""`` when the text
        is empty, has no such heading, or the section body is empty or
        placeholder-only.
    """
    if not text:
        return ""

    heading = re.compile(r"^##\s*🔍?\s*精读\s*$", re.MULTILINE)
    located = heading.search(text)
    if located is None:
        return ""

    preserved = text[located.start():].strip()
    # Drop the heading line(s) to see what the section actually contains.
    remainder = heading.sub("", preserved).strip()
    if not remainder or _is_placeholder_only(remainder):
        return ""
    return preserved
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
_PLACEHOLDER_PATTERN = re.compile(r"(待补充)|\(待补充\)|\(TODO\)|\(TBD\)")
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _is_placeholder_only(body: str) -> bool:
|
|
246
|
+
"""Check if the deep reading body is only placeholder text (no real content)."""
|
|
247
|
+
cleaned = _PLACEHOLDER_PATTERN.sub("", body).strip()
|
|
248
|
+
cleaned = re.sub(r"^[-*]\s*$", "", cleaned, flags=re.MULTILINE).strip()
|
|
249
|
+
return not cleaned
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def has_deep_reading_content(text: str) -> bool:
    """Report whether the deep-reading section of *text* has been filled in.

    Three checks must all pass: a Clarity line with content, a Figure-guide
    section with at least one bullet that has content after its colon, and
    an open-issues section with at least one substantive line.

    Args:
        text: Full markdown text of a note.

    Returns:
        True only when all three checks pass; False when there is no
        preserved deep-reading section or any check fails.
    """
    preserved = extract_preserved_deep_reading(text)
    if not preserved:
        return False
    # Strip both "## 🔍 精读" and "## 精读" header variants
    body = re.sub(r"^##\s*🔍?\s*精读\s*$", "", preserved, flags=re.MULTILINE).strip()
    if not body:
        return False

    # Clarity passes when a Clarity line has text after the colon and no
    # Clarity line is left blank.
    # NOTE(review): the parentheses around 清晰度 are unescaped, i.e. regex
    # group syntax rather than literal characters — confirm the intended
    # (possibly full-width) punctuation.
    clarity_ok = bool(re.search(r"-\s*\*\*Clarity\*\*(清晰度):(.+)", body)) and not re.search(
        r"-\s*\*\*Clarity\*\*(清晰度):\s*$", body, re.MULTILINE
    )

    # Figure guide passes on the first "- label: text" bullet whose text is
    # neither empty nor the placeholder.
    figure_sec = _extract_section(body, r"\*\*Figure 导读\*\*")
    figure_ok = False
    if figure_sec:
        for line in figure_sec.splitlines():
            s = line.strip()
            if s.startswith("- ") and ":" in s:
                _, after = s.split(":", 1)
                if after.strip() and after.strip() != "(待补充)":
                    figure_ok = True
                    break

    # Open issues: accept either the bold label or a level-4 heading.
    issue_sec = _extract_section(body, r"\*\*遗留问题\*\*")
    if not issue_sec:
        issue_sec = _extract_section(body, r"####\s*遗留问题")
    issue_ok = False
    if issue_sec:
        dirty = [l.strip() for l in issue_sec.splitlines() if l.strip()]
        # Ignore bare bullets, the placeholder and a repeated section label.
        substantive = [l for l in dirty if l not in ("-", "(待补充)", "", "**遗留问题**")]
        issue_ok = bool(substantive)

    return clarity_ok and figure_ok and issue_ok
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _extract_section(body: str, section_header: str) -> str | None:
|
|
289
|
+
m = re.search(
|
|
290
|
+
section_header + r"\n*(.*?)(?=\n(?:#{1,3})\s|\Z)",
|
|
291
|
+
body,
|
|
292
|
+
re.DOTALL,
|
|
293
|
+
)
|
|
294
|
+
if m:
|
|
295
|
+
return m.group(1).strip()
|
|
296
|
+
return None
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _add_missing_frontmatter_fields(existing_content: str, new_fields: dict[str, str]) -> str:
|
|
300
|
+
if not existing_content.startswith("---"):
|
|
301
|
+
return existing_content
|
|
302
|
+
parts = existing_content.split("---", 2)
|
|
303
|
+
if len(parts) < 3:
|
|
304
|
+
return existing_content
|
|
305
|
+
frontmatter = parts[1]
|
|
306
|
+
body = parts[2]
|
|
307
|
+
lines_to_add = []
|
|
308
|
+
for key, value in new_fields.items():
|
|
309
|
+
pattern = "^" + re.escape(key) + "\\s*:"
|
|
310
|
+
if not re.search(pattern, frontmatter, re.MULTILINE):
|
|
311
|
+
lines_to_add.append(f"{key}: {_yaml_quote(value)}")
|
|
312
|
+
if not lines_to_add:
|
|
313
|
+
return existing_content
|
|
314
|
+
new_frontmatter = frontmatter.rstrip("\n") + "\n" + "\n".join(lines_to_add) + "\n"
|
|
315
|
+
return f"---{new_frontmatter}---{body}"
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def update_frontmatter_field(content: str, key: str, value: str) -> str:
    """Set a frontmatter field to *value*, adding it when it is missing.

    Args:
        content: Full note text. Returned unchanged when it does not start
            with ``---``.
        key: Frontmatter key to set.
        value: New value, written as a quoted YAML scalar.

    Returns:
        The note text with the first ``key:`` line of the frontmatter
        replaced, or with the field appended to the frontmatter when no
        such line exists.
    """
    if not content.startswith("---"):
        return content

    replacement = f"{key}: {_yaml_quote(value)}"
    field_line = re.compile("^" + re.escape(key) + "\\s*:.*$", re.MULTILINE)

    # Restrict the substitution to the frontmatter so that a "key: ..." line
    # in the note body is never rewritten. Without a closing fence on its
    # own line, fall back to searching the whole text as before.
    closing = re.compile(r"^---[ \t]*\r?$", re.MULTILINE).search(content, 3)
    boundary = closing.start() if closing is not None else len(content)
    head, tail = content[:boundary], content[boundary:]

    # A callable replacement is inserted verbatim; a string replacement
    # would have its backslashes interpreted by `re`, undoing the YAML
    # escaping applied to the value.
    new_head, count = field_line.subn(lambda _match: replacement, head, count=1)
    if count:
        return new_head + tail
    return _add_missing_frontmatter_fields(content, {key: value})
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def read_frontmatter_dict(text: str) -> dict:
    """Parse YAML frontmatter from markdown text using yaml.safe_load.

    Falls back to a simple line-based ``key: value`` parse when the YAML
    parser is unavailable, fails (malformed frontmatter), or does not
    produce a mapping. In the fallback every value is the stripped text
    after the first colon.

    Args:
        text: Markdown text that may start with a ``---`` fenced block.

    Returns:
        The frontmatter as a dict; empty when there is no frontmatter.
    """
    fm_match = re.match(r"^---\s*\n(.*?)\n---", text, re.DOTALL)
    if not fm_match:
        return {}
    raw_frontmatter = fm_match.group(1)

    try:
        # Imported only when frontmatter exists, and inside the try so a
        # missing parser degrades to the line-based fallback like any other
        # parse failure.
        import yaml

        data = yaml.safe_load(raw_frontmatter)
    except Exception:
        # Deliberately broad: best-effort parse, any failure means "use the
        # fallback". NOTE(review): presumably the parser can raise more than
        # its own error type on odd scalars — confirm before narrowing.
        data = None
    if isinstance(data, dict):
        return data

    result = {}
    for line in raw_frontmatter.splitlines():
        key, separator, val = line.strip().partition(":")
        if separator:
            result[key.strip()] = val.strip()
    return result
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def obsidian_wikilink_for_pdf(pdf_path: str, vault_dir: Path, zotero_dir: Path | None = None) -> str:
|
|
8
|
+
text = str(pdf_path or "").strip()
|
|
9
|
+
if not text:
|
|
10
|
+
return ""
|
|
11
|
+
# Handle storage: prefix paths by resolving through zotero_dir
|
|
12
|
+
if text.startswith("storage:") and zotero_dir is not None:
|
|
13
|
+
storage_rel = text[len("storage:") :].lstrip("/").lstrip("\\")
|
|
14
|
+
absolute_pdf_path = zotero_dir / "storage" / storage_rel.replace("/", os.sep)
|
|
15
|
+
absolute_str = str(absolute_pdf_path)
|
|
16
|
+
else:
|
|
17
|
+
absolute_str = absolutize_vault_path(vault_dir, text, resolve_junction=True)
|
|
18
|
+
if not absolute_str:
|
|
19
|
+
return ""
|
|
20
|
+
absolute_path = Path(absolute_str)
|
|
21
|
+
try:
|
|
22
|
+
relative = absolute_path.relative_to(vault_dir)
|
|
23
|
+
except ValueError:
|
|
24
|
+
# Path outside vault — try to route through Zotero junction inside vault
|
|
25
|
+
if zotero_dir is not None and zotero_dir.exists():
|
|
26
|
+
try:
|
|
27
|
+
from paperforge.pdf_resolver import resolve_junction
|
|
28
|
+
|
|
29
|
+
real_zotero = resolve_junction(zotero_dir)
|
|
30
|
+
if real_zotero != zotero_dir:
|
|
31
|
+
rel_to_zotero = absolute_path.relative_to(real_zotero)
|
|
32
|
+
via_junction = zotero_dir / rel_to_zotero
|
|
33
|
+
relative = via_junction.relative_to(vault_dir)
|
|
34
|
+
return f"[[{relative.as_posix()}]]"
|
|
35
|
+
except (ValueError, OSError):
|
|
36
|
+
pass
|
|
37
|
+
return f"[[{absolute_path.as_posix()}]]"
|
|
38
|
+
return f"[[{relative.as_posix()}]]"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def absolutize_vault_path(vault: Path, path: str, resolve_junction: bool = False) -> str:
    """Turn *path* into an absolute path string.

    Args:
        vault: Vault root that relative paths are resolved against.
        path: Absolute or vault-relative path; forward slashes in a
            relative path are converted to the OS separator.
        resolve_junction: When true, the result is additionally passed
            through ``paperforge.pdf_resolver.resolve_junction``.

    Returns:
        The absolute path as a string, or ``""`` for an empty path.
    """
    text = str(path or "").strip()
    if not text:
        return ""

    as_given = Path(text)
    if as_given.is_absolute():
        result = str(as_given)
    else:
        result = str((vault / text.replace("/", os.sep)).resolve())

    if not resolve_junction:
        return result

    # Aliased so the imported helper does not rebind the boolean parameter
    # of the same name.
    from paperforge.pdf_resolver import resolve_junction as _resolve_junction

    return str(_resolve_junction(Path(result)))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def obsidian_wikilink_for_path(vault: Path, path: str) -> str:
    """Build an Obsidian ``[[wikilink]]`` for *path*.

    The path is made absolute (with junction resolution). The link target
    is the vault-relative path when the file lies inside *vault*, otherwise
    the absolute path; both are written with forward slashes.

    Args:
        vault: Root directory of the vault.
        path: Absolute or vault-relative path.

    Returns:
        The wikilink, or ``""`` for an empty path.
    """
    absolute = absolutize_vault_path(vault, path, resolve_junction=True)
    if not absolute:
        return ""

    target = Path(absolute)
    try:
        link_target = target.relative_to(vault)
    except ValueError:
        # Outside the vault: link to the absolute location instead.
        link_target = target
    return f"[[{link_target.as_posix()}]]"
|