@researai/deepscientist 1.5.8 → 1.5.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/LICENSE +186 -21
  2. package/README.md +108 -95
  3. package/assets/branding/connector-qq.png +0 -0
  4. package/assets/branding/connector-rokid.png +0 -0
  5. package/assets/branding/connector-weixin.png +0 -0
  6. package/assets/branding/projects.png +0 -0
  7. package/bin/ds.js +172 -13
  8. package/docs/assets/branding/projects.png +0 -0
  9. package/docs/en/00_QUICK_START.md +308 -70
  10. package/docs/en/01_SETTINGS_REFERENCE.md +3 -0
  11. package/docs/en/02_START_RESEARCH_GUIDE.md +112 -0
  12. package/docs/en/04_LINGZHU_CONNECTOR_GUIDE.md +62 -179
  13. package/docs/en/09_DOCTOR.md +41 -5
  14. package/docs/en/10_WEIXIN_CONNECTOR_GUIDE.md +137 -0
  15. package/docs/en/11_LICENSE_AND_RISK.md +256 -0
  16. package/docs/en/12_GUIDED_WORKFLOW_TOUR.md +427 -0
  17. package/docs/en/13_CORE_ARCHITECTURE_GUIDE.md +297 -0
  18. package/docs/en/14_PROMPT_SKILLS_AND_MCP_GUIDE.md +506 -0
  19. package/docs/en/99_ACKNOWLEDGEMENTS.md +4 -1
  20. package/docs/en/README.md +79 -0
  21. package/docs/images/lingzhu/rokid-agent-platform-create.png +0 -0
  22. package/docs/images/weixin/weixin-plugin-entry.png +0 -0
  23. package/docs/images/weixin/weixin-plugin-entry.svg +33 -0
  24. package/docs/images/weixin/weixin-qr-confirm.svg +30 -0
  25. package/docs/images/weixin/weixin-quest-media-flow.svg +44 -0
  26. package/docs/images/weixin/weixin-settings-bind.svg +57 -0
  27. package/docs/zh/00_QUICK_START.md +315 -74
  28. package/docs/zh/01_SETTINGS_REFERENCE.md +3 -0
  29. package/docs/zh/02_START_RESEARCH_GUIDE.md +112 -0
  30. package/docs/zh/04_LINGZHU_CONNECTOR_GUIDE.md +62 -193
  31. package/docs/zh/09_DOCTOR.md +41 -5
  32. package/docs/zh/10_WEIXIN_CONNECTOR_GUIDE.md +144 -0
  33. package/docs/zh/11_LICENSE_AND_RISK.md +256 -0
  34. package/docs/zh/12_GUIDED_WORKFLOW_TOUR.md +423 -0
  35. package/docs/zh/13_CORE_ARCHITECTURE_GUIDE.md +296 -0
  36. package/docs/zh/14_PROMPT_SKILLS_AND_MCP_GUIDE.md +506 -0
  37. package/docs/zh/99_ACKNOWLEDGEMENTS.md +4 -1
  38. package/docs/zh/README.md +126 -0
  39. package/install.sh +0 -34
  40. package/package.json +3 -3
  41. package/pyproject.toml +2 -2
  42. package/src/deepscientist/__init__.py +1 -1
  43. package/src/deepscientist/annotations.py +343 -0
  44. package/src/deepscientist/artifact/arxiv.py +484 -37
  45. package/src/deepscientist/artifact/metrics.py +1 -3
  46. package/src/deepscientist/artifact/service.py +1347 -111
  47. package/src/deepscientist/arxiv_library.py +275 -0
  48. package/src/deepscientist/bash_exec/service.py +9 -0
  49. package/src/deepscientist/bridges/builtins.py +2 -0
  50. package/src/deepscientist/bridges/connectors.py +447 -0
  51. package/src/deepscientist/channels/__init__.py +2 -0
  52. package/src/deepscientist/channels/builtins.py +3 -1
  53. package/src/deepscientist/channels/qq.py +1 -1
  54. package/src/deepscientist/channels/qq_gateway.py +1 -1
  55. package/src/deepscientist/channels/relay.py +7 -1
  56. package/src/deepscientist/channels/weixin.py +59 -0
  57. package/src/deepscientist/channels/weixin_ilink.py +317 -0
  58. package/src/deepscientist/config/models.py +22 -2
  59. package/src/deepscientist/config/service.py +431 -60
  60. package/src/deepscientist/connector/__init__.py +4 -0
  61. package/src/deepscientist/connector/connector_profiles.py +481 -0
  62. package/src/deepscientist/connector/lingzhu_support.py +668 -0
  63. package/src/deepscientist/connector/qq_profiles.py +206 -0
  64. package/src/deepscientist/connector/weixin_support.py +663 -0
  65. package/src/deepscientist/connector_profiles.py +1 -374
  66. package/src/deepscientist/connector_runtime.py +2 -0
  67. package/src/deepscientist/daemon/api/handlers.py +295 -5
  68. package/src/deepscientist/daemon/api/router.py +16 -1
  69. package/src/deepscientist/daemon/app.py +1130 -61
  70. package/src/deepscientist/doctor.py +5 -2
  71. package/src/deepscientist/gitops/diff.py +120 -29
  72. package/src/deepscientist/lingzhu_support.py +1 -182
  73. package/src/deepscientist/mcp/server.py +14 -5
  74. package/src/deepscientist/prompts/builder.py +29 -1
  75. package/src/deepscientist/qq_profiles.py +1 -196
  76. package/src/deepscientist/quest/node_traces.py +152 -2
  77. package/src/deepscientist/quest/service.py +169 -43
  78. package/src/deepscientist/quest/stage_views.py +172 -9
  79. package/src/deepscientist/registries/baseline.py +56 -4
  80. package/src/deepscientist/runners/codex.py +55 -3
  81. package/src/deepscientist/weixin_support.py +1 -0
  82. package/src/prompts/connectors/lingzhu.md +3 -1
  83. package/src/prompts/connectors/weixin.md +230 -0
  84. package/src/prompts/system.md +9 -0
  85. package/src/skills/idea/SKILL.md +16 -0
  86. package/src/skills/idea/references/literature-survey-template.md +24 -0
  87. package/src/skills/idea/references/related-work-playbook.md +4 -0
  88. package/src/skills/idea/references/selection-gate.md +9 -0
  89. package/src/skills/write/SKILL.md +1 -1
  90. package/src/tui/package.json +1 -1
  91. package/src/ui/dist/assets/{AiManusChatView-m2FNtwbn.js → AiManusChatView-D0mTXG4-.js} +156 -48
  92. package/src/ui/dist/assets/{AnalysisPlugin-BMTF8EGL.js → AnalysisPlugin-Db0cTXxm.js} +1 -1
  93. package/src/ui/dist/assets/{CliPlugin-BEOWgxCI.js → CliPlugin-DrV8je02.js} +164 -9
  94. package/src/ui/dist/assets/{CodeEditorPlugin-BCXvjqmb.js → CodeEditorPlugin-QXMSCH71.js} +8 -8
  95. package/src/ui/dist/assets/{CodeViewerPlugin-DaJcy3nD.js → CodeViewerPlugin-7hhtWj_E.js} +5 -5
  96. package/src/ui/dist/assets/{DocViewerPlugin-ByfeIq4K.js → DocViewerPlugin-BWMSnRJe.js} +3 -3
  97. package/src/ui/dist/assets/{GitDiffViewerPlugin-Cksf3VZ-.js → GitDiffViewerPlugin-7J9h9Vy_.js} +20 -21
  98. package/src/ui/dist/assets/{ImageViewerPlugin-CFz-OsTS.js → ImageViewerPlugin-CHJl_0lr.js} +5 -5
  99. package/src/ui/dist/assets/{LabCopilotPanel-CJ1cJzoX.js → LabCopilotPanel-1qSow1es.js} +11 -11
  100. package/src/ui/dist/assets/{LabPlugin-BF3dVJwa.js → LabPlugin-eQpPPCEp.js} +2 -1
  101. package/src/ui/dist/assets/{LatexPlugin-DDkwZ6Sj.js → LatexPlugin-BwRfi89Z.js} +7 -7
  102. package/src/ui/dist/assets/{MarkdownViewerPlugin-HAuvurcT.js → MarkdownViewerPlugin-836PVQWV.js} +4 -4
  103. package/src/ui/dist/assets/{MarketplacePlugin-BtoTYy2C.js → MarketplacePlugin-C2y_556i.js} +3 -3
  104. package/src/ui/dist/assets/{NotebookEditor-CSJYx7b-.js → NotebookEditor-BRzJbGsn.js} +12 -12
  105. package/src/ui/dist/assets/{NotebookEditor-DQgRezm_.js → NotebookEditor-DIX7Mlzu.js} +1 -1
  106. package/src/ui/dist/assets/{PdfLoader-DPa_-fv6.js → PdfLoader-DzRaTAlq.js} +14 -7
  107. package/src/ui/dist/assets/{PdfMarkdownPlugin-BZpXOEjm.js → PdfMarkdownPlugin-DZUfIUnp.js} +73 -6
  108. package/src/ui/dist/assets/{PdfViewerPlugin-BT8a6wGR.js → PdfViewerPlugin-BwtICzue.js} +103 -34
  109. package/src/ui/dist/assets/PdfViewerPlugin-DQ11QcSf.css +3627 -0
  110. package/src/ui/dist/assets/{SearchPlugin-D_blveZi.js → SearchPlugin-DHeIAMsx.js} +1 -1
  111. package/src/ui/dist/assets/{TextViewerPlugin-Btx0M3hX.js → TextViewerPlugin-C3tCmFox.js} +5 -4
  112. package/src/ui/dist/assets/{VNCViewer-DImJO4rO.js → VNCViewer-CQsKVm3t.js} +10 -10
  113. package/src/ui/dist/assets/bot-BEA2vWuK.js +21 -0
  114. package/src/ui/dist/assets/branding/logo-rokid.png +0 -0
  115. package/src/ui/dist/assets/browser-BAcuE0Xj.js +2895 -0
  116. package/src/ui/dist/assets/{code-BUfXGJSl.js → code-XfbSR8K2.js} +1 -1
  117. package/src/ui/dist/assets/{file-content-VqamwI3X.js → file-content-BjxNaIfy.js} +1 -1
  118. package/src/ui/dist/assets/{file-diff-panel-C_wOoS7a.js → file-diff-panel-D_lLVQk0.js} +1 -1
  119. package/src/ui/dist/assets/{file-socket-D2bTuMVP.js → file-socket-D9x_5vlY.js} +1 -1
  120. package/src/ui/dist/assets/{image-BZkGJ4mM.js → image-BhWT33W1.js} +1 -1
  121. package/src/ui/dist/assets/{index-DdRW6RMJ.js → index--c4iXtuy.js} +12 -12
  122. package/src/ui/dist/assets/{index-CxkvSeKw.js → index-BDxipwrC.js} +2 -2
  123. package/src/ui/dist/assets/{index-DjggJovS.js → index-DZTZ8mWP.js} +14934 -9613
  124. package/src/ui/dist/assets/{index-DXZ1daiJ.css → index-Dqj-Mjb4.css} +2 -13
  125. package/src/ui/dist/assets/index-PJbSbPTy.js +25 -0
  126. package/src/ui/dist/assets/{monaco-DHMc7kKM.js → monaco-K8izTGgo.js} +1 -1
  127. package/src/ui/dist/assets/{pdf-effect-queue-DSw_D3RV.js → pdf-effect-queue-DfBors6y.js} +16 -1
  128. package/src/ui/dist/assets/pdf.worker.min-yatZIOMy.mjs +21 -0
  129. package/src/ui/dist/assets/{popover-B85oCgCS.js → popover-yFK1J4fL.js} +1 -1
  130. package/src/ui/dist/assets/{project-sync-DOMCcPac.js → project-sync-PENr2zcz.js} +1 -74
  131. package/src/ui/dist/assets/select-CAbJDfYv.js +1690 -0
  132. package/src/ui/dist/assets/{sigma-BO2rQrl3.js → sigma-DEuYJqTl.js} +1 -1
  133. package/src/ui/dist/assets/{index-D9QIGcmc.js → square-check-big-omoSUmcd.js} +2 -13
  134. package/src/ui/dist/assets/{trash-BsVEH_dV.js → trash--F119N47.js} +1 -1
  135. package/src/ui/dist/assets/{useCliAccess-b8L6JuZm.js → useCliAccess-D31UR23I.js} +1 -1
  136. package/src/ui/dist/assets/{useFileDiffOverlay-BY7uA9hV.js → useFileDiffOverlay-BH6KcMzq.js} +1 -1
  137. package/src/ui/dist/assets/{wrap-text-BwyVuUIK.js → wrap-text-CZ613PM5.js} +1 -1
  138. package/src/ui/dist/assets/{zoom-out-RDpLugQP.js → zoom-out-BgDLAv3z.js} +1 -1
  139. package/src/ui/dist/index.html +2 -2
  140. package/src/ui/dist/assets/AutoFigurePlugin-BGxN8Umr.css +0 -3056
  141. package/src/ui/dist/assets/AutoFigurePlugin-DxPdMUNb.js +0 -8149
  142. package/src/ui/dist/assets/PdfViewerPlugin-BJXtIwj_.css +0 -260
  143. package/src/ui/dist/assets/Stepper-DH2k75Vo.js +0 -158
  144. package/src/ui/dist/assets/bibtex-B-Hqu0Sg.js +0 -189
  145. package/src/ui/dist/assets/file-utils--zJCPN1i.js +0 -109
  146. package/src/ui/dist/assets/message-square-FUIPIhU2.js +0 -16
  147. package/src/ui/dist/assets/pdfjs-DU1YE8WO.js +0 -3
  148. package/src/ui/dist/assets/tooltip-B1OspAkx.js +0 -108
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
+ import xml.etree.ElementTree as ET
4
5
  from collections.abc import Callable
5
6
  from dataclasses import dataclass
6
7
  from html import unescape
@@ -10,6 +11,11 @@ from urllib.request import Request, urlopen
10
11
 
11
12
  DEFAULT_TIMEOUT_SECONDS = 6
12
13
  USER_AGENT = "DeepScientist/0.1"
14
+ ARXIV_API_URL = "http://export.arxiv.org/api/query?id_list={paper_id}"
15
+ ARXIV_XML_NAMESPACES = {
16
+ "atom": "http://www.w3.org/2005/Atom",
17
+ "arxiv": "http://arxiv.org/schemas/atom",
18
+ }
13
19
 
14
20
 
15
21
  @dataclass(frozen=True)
@@ -115,14 +121,171 @@ def read_arxiv_content(paper_id: str, *, full_text: bool = False) -> dict[str, A
115
121
  "guidance": "Pass an arXiv id like `2010.11929` or `2401.12345v2`.",
116
122
  }
117
123
 
124
+ metadata = fetch_arxiv_metadata(normalized_id)
125
+ attempts: list[dict[str, Any]] = list(metadata.get("attempts") or [])
126
+ if not metadata.get("ok"):
127
+ mode = "full text" if full_text else "overview"
128
+ return {
129
+ "ok": False,
130
+ "paper_id": normalized_id,
131
+ "requested_full_text": full_text,
132
+ "error": f"Unable to fetch arXiv {mode} content for `{normalized_id}`.",
133
+ "attempts": attempts,
134
+ "guidance": "Use web search to confirm the paper id or try again later.",
135
+ }
136
+
137
+ merged: dict[str, Any] = {
138
+ "ok": True,
139
+ "paper_id": metadata.get("paper_id") or normalized_id,
140
+ "requested_full_text": full_text,
141
+ "title": metadata.get("title"),
142
+ "authors": metadata.get("authors") or [],
143
+ "categories": metadata.get("categories") or [],
144
+ "abstract": metadata.get("abstract") or "",
145
+ "published_at": metadata.get("published_at") or "",
146
+ "version": metadata.get("version"),
147
+ "primary_class": metadata.get("primary_class") or "",
148
+ "bibtex": metadata.get("bibtex") or "",
149
+ "metadata_source": metadata.get("metadata_source") or metadata.get("source"),
150
+ "abs_url": metadata.get("abs_url") or f"https://arxiv.org/abs/{normalized_id}",
151
+ "pdf_url": metadata.get("pdf_url") or f"https://arxiv.org/pdf/{normalized_id}.pdf",
152
+ }
153
+
154
+ if full_text:
155
+ for plan in _full_text_plans(normalized_id):
156
+ try:
157
+ payload = _fetch_text(plan.url, timeout=plan.timeout)
158
+ parsed = plan.parser(normalized_id, payload, plan.url)
159
+ content = str(parsed.get("content") or "").strip()
160
+ if not content:
161
+ attempts.append(
162
+ {
163
+ "source": plan.name,
164
+ "url": plan.url,
165
+ "ok": False,
166
+ "error": "Empty response.",
167
+ }
168
+ )
169
+ continue
170
+ attempts.append(
171
+ {
172
+ "source": plan.name,
173
+ "url": plan.url,
174
+ "ok": True,
175
+ "content_mode": plan.content_mode,
176
+ }
177
+ )
178
+ return {
179
+ **merged,
180
+ "content_mode": plan.content_mode,
181
+ "source": plan.name,
182
+ "source_url": plan.url,
183
+ "summary_source": metadata.get("metadata_source") or metadata.get("source"),
184
+ "overview": "",
185
+ "overview_source": None,
186
+ "content": _build_full_text_content(merged, content),
187
+ "attempts": attempts,
188
+ "guidance": "Use web search for discovery. Use `artifact.arxiv(...)` after you already know the arXiv paper id.",
189
+ }
190
+ except Exception as exc: # noqa: BLE001
191
+ attempts.append(
192
+ {
193
+ "source": plan.name,
194
+ "url": plan.url,
195
+ "ok": False,
196
+ "error": _format_error(exc),
197
+ }
198
+ )
199
+
200
+ return {
201
+ **merged,
202
+ "content_mode": "abstract",
203
+ "source": metadata.get("source"),
204
+ "source_url": metadata.get("source_url"),
205
+ "summary_source": metadata.get("metadata_source") or metadata.get("source"),
206
+ "overview": "",
207
+ "overview_source": None,
208
+ "content": _build_overview_content(merged, None),
209
+ "attempts": attempts,
210
+ "guidance": "Use web search for discovery. Use `artifact.arxiv(...)` after you already know the arXiv paper id.",
211
+ }
212
+
213
+ overview_text = ""
214
+ overview_markdown = ""
215
+ overview_source: str | None = None
216
+ overview_url: str | None = None
217
+ for plan in _overview_plans(normalized_id):
218
+ try:
219
+ payload = _fetch_text(plan.url, timeout=plan.timeout)
220
+ parsed = plan.parser(normalized_id, payload, plan.url)
221
+ candidate = str(parsed.get("abstract") or parsed.get("content") or "").strip()
222
+ candidate_markdown = str(parsed.get("overview_markdown") or parsed.get("content") or "").strip()
223
+ if not candidate:
224
+ attempts.append(
225
+ {
226
+ "source": plan.name,
227
+ "url": plan.url,
228
+ "ok": False,
229
+ "error": "Empty response.",
230
+ }
231
+ )
232
+ continue
233
+ attempts.append(
234
+ {
235
+ "source": plan.name,
236
+ "url": plan.url,
237
+ "ok": True,
238
+ "content_mode": "overview",
239
+ }
240
+ )
241
+ overview_text = candidate
242
+ overview_markdown = candidate_markdown
243
+ overview_source = plan.name
244
+ overview_url = plan.url
245
+ break
246
+ except Exception as exc: # noqa: BLE001
247
+ attempts.append(
248
+ {
249
+ "source": plan.name,
250
+ "url": plan.url,
251
+ "ok": False,
252
+ "error": _format_error(exc),
253
+ }
254
+ )
255
+
256
+ return {
257
+ **merged,
258
+ "content_mode": "overview" if overview_text else "abstract",
259
+ "source": overview_source or metadata.get("source"),
260
+ "source_url": overview_url or metadata.get("source_url"),
261
+ "summary_source": overview_source or metadata.get("metadata_source") or metadata.get("source"),
262
+ "overview": overview_text,
263
+ "overview_markdown": overview_markdown,
264
+ "overview_source": overview_source,
265
+ "content": _build_overview_content(merged, overview_text or None),
266
+ "attempts": attempts,
267
+ "guidance": "Use web search for discovery. Use `artifact.arxiv(...)` after you already know the arXiv paper id.",
268
+ }
269
+
270
+
271
+ def fetch_arxiv_metadata(paper_id: str) -> dict[str, Any]:
272
+ normalized_id = normalize_arxiv_id(paper_id)
273
+ if not normalized_id:
274
+ return {
275
+ "ok": False,
276
+ "paper_id": str(paper_id or "").strip(),
277
+ "error": "Invalid arXiv paper id.",
278
+ "attempts": [],
279
+ }
280
+
118
281
  attempts: list[dict[str, Any]] = []
119
- plans = _full_text_plans(normalized_id) if full_text else _overview_plans(normalized_id)
120
- for plan in plans:
282
+ for plan in _metadata_plans(normalized_id):
121
283
  try:
122
284
  payload = _fetch_text(plan.url, timeout=plan.timeout)
123
285
  parsed = plan.parser(normalized_id, payload, plan.url)
124
- content = str(parsed.get("content") or "").strip()
125
- if not content:
286
+ title = str(parsed.get("title") or "").strip()
287
+ abstract = str(parsed.get("abstract") or "").strip()
288
+ if not title and not abstract:
126
289
  attempts.append(
127
290
  {
128
291
  "source": plan.name,
@@ -140,19 +303,30 @@ def read_arxiv_content(paper_id: str, *, full_text: bool = False) -> dict[str, A
140
303
  "content_mode": plan.content_mode,
141
304
  }
142
305
  )
143
- return {
306
+ canonical_id = str(parsed.get("paper_id") or normalized_id).strip() or normalized_id
307
+ primary_class = str(parsed.get("primary_class") or "").strip()
308
+ published_at = str(parsed.get("published_at") or "").strip()
309
+ version = parsed.get("version")
310
+ metadata = {
144
311
  "ok": True,
145
- "paper_id": normalized_id,
146
- "requested_full_text": full_text,
147
- "content_mode": plan.content_mode,
312
+ "paper_id": canonical_id,
148
313
  "source": plan.name,
149
314
  "source_url": plan.url,
150
- "title": parsed.get("title"),
315
+ "metadata_source": plan.name,
316
+ "title": title or canonical_id,
151
317
  "authors": parsed.get("authors") or [],
152
- "content": content,
318
+ "categories": parsed.get("categories") or ([] if not primary_class else [primary_class]),
319
+ "abstract": abstract,
320
+ "published_at": published_at,
321
+ "version": version if isinstance(version, int) else _parse_arxiv_version(canonical_id),
322
+ "primary_class": primary_class or ((parsed.get("categories") or [None])[0] or ""),
323
+ "abs_url": str(parsed.get("abs_url") or f"https://arxiv.org/abs/{canonical_id}"),
324
+ "pdf_url": str(parsed.get("pdf_url") or f"https://arxiv.org/pdf/{canonical_id}.pdf"),
153
325
  "attempts": attempts,
154
- "guidance": "Use web search for discovery. Use `artifact.arxiv(...)` after you already know the arXiv paper id.",
155
326
  }
327
+ metadata["bibtex"] = _build_bibtex(metadata)
328
+ metadata["content"] = _build_metadata_content(metadata)
329
+ return metadata
156
330
  except Exception as exc: # noqa: BLE001
157
331
  attempts.append(
158
332
  {
@@ -163,14 +337,11 @@ def read_arxiv_content(paper_id: str, *, full_text: bool = False) -> dict[str, A
163
337
  }
164
338
  )
165
339
 
166
- mode = "full text" if full_text else "overview"
167
340
  return {
168
341
  "ok": False,
169
342
  "paper_id": normalized_id,
170
- "requested_full_text": full_text,
171
- "error": f"Unable to fetch arXiv {mode} content for `{normalized_id}`.",
343
+ "error": f"Unable to fetch arXiv metadata for `{normalized_id}`.",
172
344
  "attempts": attempts,
173
- "guidance": "Use web search to confirm the paper id or try again later.",
174
345
  }
175
346
 
176
347
 
@@ -201,18 +372,6 @@ def _overview_plans(paper_id: str) -> list[_FetchPlan]:
201
372
  parser=_parse_markdown,
202
373
  timeout=4,
203
374
  ),
204
- _FetchPlan(
205
- name="arxiv_abstract",
206
- url=f"https://arxiv.org/abs/{paper_id}",
207
- content_mode="abstract",
208
- parser=_parse_arxiv_abstract_html,
209
- ),
210
- _FetchPlan(
211
- name="alphaxiv_full_text",
212
- url=f"https://www.alphaxiv.org/abs/{paper_id}.md",
213
- content_mode="full_text",
214
- parser=_parse_markdown,
215
- ),
216
375
  ]
217
376
 
218
377
 
@@ -251,6 +410,24 @@ def _full_text_plans(paper_id: str) -> list[_FetchPlan]:
251
410
  ]
252
411
 
253
412
 
413
+ def _metadata_plans(paper_id: str) -> list[_FetchPlan]:
414
+ return [
415
+ _FetchPlan(
416
+ name="arxiv_api",
417
+ url=ARXIV_API_URL.format(paper_id=paper_id),
418
+ content_mode="abstract",
419
+ parser=_parse_arxiv_atom,
420
+ timeout=8,
421
+ ),
422
+ _FetchPlan(
423
+ name="arxiv_abstract",
424
+ url=f"https://arxiv.org/abs/{paper_id}",
425
+ content_mode="abstract",
426
+ parser=_parse_arxiv_abstract_html,
427
+ ),
428
+ ]
429
+
430
+
254
431
  def _fetch_text(url: str, *, timeout: int) -> str:
255
432
  request = Request(
256
433
  url,
@@ -270,18 +447,89 @@ def _parse_markdown(paper_id: str, payload: str, url: str) -> dict[str, Any]:
270
447
  return {"content": ""}
271
448
  title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
272
449
  title = title_match.group(1).strip() if title_match else _first_nonempty_line(content)
450
+ abstract = _markdown_to_text(content, title=title)
273
451
  return {
274
452
  "title": title,
275
453
  "authors": [],
454
+ "categories": [],
455
+ "abstract": abstract,
456
+ "overview_markdown": content,
276
457
  "content": content,
277
458
  }
278
459
 
279
460
 
461
+ def _parse_arxiv_atom(paper_id: str, payload: str, url: str) -> dict[str, Any]:
462
+ root = ET.fromstring(payload)
463
+ entry = root.find("atom:entry", ARXIV_XML_NAMESPACES)
464
+ if entry is None:
465
+ return {"content": ""}
466
+
467
+ title = _clean_inline_text(entry.findtext("atom:title", default="", namespaces=ARXIV_XML_NAMESPACES))
468
+ abstract = _clean_inline_text(entry.findtext("atom:summary", default="", namespaces=ARXIV_XML_NAMESPACES))
469
+ published_at = _clean_inline_text(
470
+ entry.findtext("atom:published", default="", namespaces=ARXIV_XML_NAMESPACES)
471
+ )
472
+ authors: list[str] = []
473
+ for author in entry.findall("atom:author", ARXIV_XML_NAMESPACES):
474
+ author_name = _clean_inline_text(
475
+ author.findtext("atom:name", default="", namespaces=ARXIV_XML_NAMESPACES)
476
+ )
477
+ if author_name:
478
+ authors.append(author_name)
479
+
480
+ categories: list[str] = []
481
+ primary_class = ""
482
+ primary_node = entry.find("arxiv:primary_category", ARXIV_XML_NAMESPACES)
483
+ if primary_node is not None:
484
+ primary_class = _clean_inline_text(primary_node.attrib.get("term", ""))
485
+ if primary_class:
486
+ categories.append(primary_class)
487
+ for category in entry.findall("atom:category", ARXIV_XML_NAMESPACES):
488
+ term = _clean_inline_text(category.attrib.get("term", ""))
489
+ if term and term not in categories:
490
+ categories.append(term)
491
+
492
+ entry_id = _clean_inline_text(entry.findtext("atom:id", default="", namespaces=ARXIV_XML_NAMESPACES))
493
+ entry_id_normalized = normalize_arxiv_id(entry_id) or paper_id
494
+ canonical_id = normalize_arxiv_id(paper_id) or _strip_arxiv_version(entry_id_normalized) or paper_id
495
+ version = _parse_arxiv_version(entry_id_normalized)
496
+ abs_url = f"https://arxiv.org/abs/{canonical_id}"
497
+ pdf_url = f"https://arxiv.org/pdf/{canonical_id}.pdf"
498
+ return {
499
+ "paper_id": canonical_id,
500
+ "title": title,
501
+ "authors": authors,
502
+ "categories": categories,
503
+ "primary_class": primary_class or (categories[0] if categories else ""),
504
+ "published_at": _normalize_published_at(published_at),
505
+ "version": version,
506
+ "abstract": abstract,
507
+ "abs_url": abs_url,
508
+ "pdf_url": pdf_url,
509
+ "content": _build_metadata_content(
510
+ {
511
+ "paper_id": canonical_id,
512
+ "title": title,
513
+ "authors": authors,
514
+ "categories": categories,
515
+ "primary_class": primary_class or (categories[0] if categories else ""),
516
+ "published_at": _normalize_published_at(published_at),
517
+ "version": version,
518
+ "abstract": abstract,
519
+ "abs_url": abs_url,
520
+ "pdf_url": pdf_url,
521
+ }
522
+ ),
523
+ }
524
+
525
+
280
526
  def _parse_arxiv_abstract_html(paper_id: str, payload: str, url: str) -> dict[str, Any]:
281
527
  title = _match_first(payload, r'<meta name="citation_title" content="([^"]+)"')
282
528
  if not title:
283
529
  title = _match_first(payload, r"<title>(.*?)</title>", flags=re.IGNORECASE | re.DOTALL)
284
530
  authors = re.findall(r'<meta name="citation_author" content="([^"]+)"', payload)
531
+ categories = _parse_arxiv_categories(payload)
532
+ published_at = _normalize_published_at(_match_first(payload, r'<meta name="citation_date" content="([^"]+)"'))
285
533
  abstract = _match_first(
286
534
  payload,
287
535
  r'<span class="descriptor">Abstract:</span>(.*?)</blockquote>',
@@ -290,18 +538,22 @@ def _parse_arxiv_abstract_html(paper_id: str, payload: str, url: str) -> dict[st
290
538
  abstract = _clean_inline_text(abstract)
291
539
  if not abstract:
292
540
  abstract = _clean_inline_text(_extract_text(payload))
293
- lines = []
294
- if title:
295
- lines.extend([f"# {title}", ""])
296
- lines.append(f"- paper_id: {paper_id}")
297
- lines.append("- source: arXiv abstract page")
298
- if authors:
299
- lines.append(f"- authors: {', '.join(_clean_inline_text(author) for author in authors)}")
300
- lines.extend(["", "## Abstract", "", abstract or "Abstract unavailable."])
301
- return {
541
+ primary_class = categories[0] if categories else ""
542
+ metadata = {
543
+ "paper_id": paper_id,
302
544
  "title": _clean_inline_text(title),
303
545
  "authors": [_clean_inline_text(author) for author in authors if _clean_inline_text(author)],
304
- "content": "\n".join(lines).strip(),
546
+ "categories": categories,
547
+ "abstract": abstract,
548
+ "published_at": published_at,
549
+ "version": _parse_arxiv_version(paper_id),
550
+ "primary_class": primary_class,
551
+ "abs_url": f"https://arxiv.org/abs/{paper_id}",
552
+ "pdf_url": f"https://arxiv.org/pdf/{paper_id}.pdf",
553
+ }
554
+ return {
555
+ **metadata,
556
+ "content": _build_metadata_content(metadata),
305
557
  }
306
558
 
307
559
 
@@ -324,6 +576,8 @@ def _parse_article_html(paper_id: str, payload: str, url: str) -> dict[str, Any]
324
576
  return {
325
577
  "title": cleaned_title,
326
578
  "authors": [],
579
+ "categories": [],
580
+ "abstract": _summarize_text(text),
327
581
  "content": "\n".join(lines).strip(),
328
582
  }
329
583
 
@@ -361,6 +615,199 @@ def _first_nonempty_line(text: str) -> str:
361
615
  return ""
362
616
 
363
617
 
618
+ def _markdown_to_text(content: str, *, title: str | None = None) -> str:
619
+ text = re.sub(r"```.*?```", " ", content, flags=re.DOTALL)
620
+ text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
621
+ text = re.sub(r"^\s*[-*+]\s+", "", text, flags=re.MULTILINE)
622
+ text = re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", text)
623
+ cleaned = _clean_inline_text(text)
624
+ if title:
625
+ title_prefix = _clean_inline_text(title)
626
+ if cleaned.lower().startswith(title_prefix.lower()):
627
+ cleaned = cleaned[len(title_prefix) :].strip(" :-")
628
+ return _summarize_text(cleaned)
629
+
630
+
631
+ def _summarize_text(text: str, *, limit: int = 1600) -> str:
632
+ cleaned = _clean_inline_text(text)
633
+ if len(cleaned) <= limit:
634
+ return cleaned
635
+ return f"{cleaned[: max(0, limit - 1)].rstrip()}…"
636
+
637
+
638
+ def _parse_arxiv_categories(payload: str) -> list[str]:
639
+ raw = _match_first(
640
+ payload,
641
+ r'<td class="tablecell subjects">(.*?)</td>',
642
+ flags=re.IGNORECASE | re.DOTALL,
643
+ )
644
+ cleaned = _clean_inline_text(raw)
645
+ if not cleaned:
646
+ return []
647
+ parts = [part.strip() for part in cleaned.split(";") if part.strip()]
648
+ return parts
649
+
650
+
651
+ def _normalize_published_at(value: str) -> str:
652
+ raw = _clean_inline_text(value)
653
+ if not raw:
654
+ return ""
655
+ if "T" in raw:
656
+ return raw.split("T", 1)[0]
657
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", raw):
658
+ return raw
659
+ month_match = re.search(r"([A-Za-z]{3,9})\s+(\d{1,2}),\s*(\d{4})", raw)
660
+ if month_match:
661
+ month_lookup = {
662
+ "jan": "01",
663
+ "feb": "02",
664
+ "mar": "03",
665
+ "apr": "04",
666
+ "may": "05",
667
+ "jun": "06",
668
+ "jul": "07",
669
+ "aug": "08",
670
+ "sep": "09",
671
+ "oct": "10",
672
+ "nov": "11",
673
+ "dec": "12",
674
+ }
675
+ month = month_lookup.get(month_match.group(1)[:3].lower())
676
+ if month:
677
+ return f"{month_match.group(3)}-{month}-{int(month_match.group(2)):02d}"
678
+ year_match = re.search(r"\b(\d{4})\b", raw)
679
+ return year_match.group(1) if year_match else raw
680
+
681
+
682
+ def _parse_arxiv_version(paper_id: str) -> int | None:
683
+ match = re.search(r"v(\d+)$", str(paper_id or "").strip(), re.IGNORECASE)
684
+ if not match:
685
+ return None
686
+ try:
687
+ return int(match.group(1))
688
+ except ValueError:
689
+ return None
690
+
691
+
692
+ def _strip_arxiv_version(paper_id: str) -> str:
693
+ return re.sub(r"v\d+$", "", str(paper_id or "").strip(), flags=re.IGNORECASE)
694
+
695
+
696
+ def _bibtex_year(published_at: str) -> str:
697
+ match = re.search(r"\b(\d{4})\b", str(published_at or "").strip())
698
+ return match.group(1) if match else ""
699
+
700
+
701
+ def _bibtex_key_author(authors: list[str]) -> str:
702
+ if not authors:
703
+ return "unknown"
704
+ parts = re.split(r"[\s,]+", authors[0].strip())
705
+ cleaned = [part for part in parts if part]
706
+ if not cleaned:
707
+ return "unknown"
708
+ return re.sub(r"[^a-z0-9]+", "", cleaned[-1].lower()) or "unknown"
709
+
710
+
711
+ def _citation_key(paper_id: str, authors: list[str], published_at: str) -> str:
712
+ year = _bibtex_year(published_at) or "0000"
713
+ normalized_paper_id = re.sub(r"v\d+$", "", str(paper_id or "").lower())
714
+ base_id = re.sub(r"[^a-z0-9]+", "", normalized_paper_id)
715
+ if not base_id:
716
+ base_id = "arxiv"
717
+ return f"{_bibtex_key_author(authors)}{year}{base_id}"
718
+
719
+
720
+ def _build_bibtex(metadata: dict[str, Any]) -> str:
721
+ paper_id = str(metadata.get("paper_id") or "").strip()
722
+ title = str(metadata.get("title") or "").strip()
723
+ authors = [str(item).strip() for item in (metadata.get("authors") or []) if str(item).strip()]
724
+ published_at = str(metadata.get("published_at") or "").strip()
725
+ primary_class = str(metadata.get("primary_class") or "").strip()
726
+ year = _bibtex_year(published_at) or "0000"
727
+ lines = [
728
+ f"@misc{{{_citation_key(paper_id, authors, published_at)},",
729
+ f" title={{{title}}},",
730
+ f" author={{{' and '.join(authors)}}},",
731
+ f" year={{{year}}},",
732
+ f" eprint={{{paper_id}}},",
733
+ " archivePrefix={arXiv},",
734
+ ]
735
+ if primary_class:
736
+ lines.append(f" primaryClass={{{primary_class}}},")
737
+ lines[-1] = lines[-1].replace(",", "")
738
+ lines.append("}")
739
+ return "\n".join(lines)
740
+
741
+
742
+ def _build_metadata_lines(metadata: dict[str, Any]) -> list[str]:
743
+ paper_id = str(metadata.get("paper_id") or "").strip()
744
+ title = str(metadata.get("title") or "").strip() or paper_id
745
+ authors = [str(item).strip() for item in (metadata.get("authors") or []) if str(item).strip()]
746
+ categories = [str(item).strip() for item in (metadata.get("categories") or []) if str(item).strip()]
747
+ published_at = str(metadata.get("published_at") or "").strip()
748
+ version = metadata.get("version")
749
+ lines = [f"# {title}", "", f"- paper_id: {paper_id}"]
750
+ if authors:
751
+ lines.append(f"- authors: {', '.join(authors)}")
752
+ if categories:
753
+ lines.append(f"- categories: {', '.join(categories)}")
754
+ if published_at:
755
+ lines.append(f"- published_at: {published_at}")
756
+ if isinstance(version, int):
757
+ lines.append(f"- version: v{version}")
758
+ lines.append(f"- abs_url: {str(metadata.get('abs_url') or f'https://arxiv.org/abs/{paper_id}')}")
759
+ return lines
760
+
761
+
762
+ def _build_metadata_content(metadata: dict[str, Any]) -> str:
763
+ lines = _build_metadata_lines(metadata)
764
+ abstract = str(metadata.get("abstract") or "").strip()
765
+ lines.extend(["", "## Abstract", "", abstract or "Abstract unavailable."])
766
+ return "\n".join(lines).strip()
767
+
768
+
769
+ def _build_overview_content(metadata: dict[str, Any], overview_text: str | None) -> str:
770
+ lines = _build_metadata_lines(metadata)
771
+ cleaned_overview = _clean_inline_text(overview_text or "")
772
+ abstract = str(metadata.get("abstract") or "").strip()
773
+ if cleaned_overview:
774
+ lines.extend(["", "## Summary", "", cleaned_overview])
775
+ if abstract and _clean_inline_text(abstract).lower() != cleaned_overview.lower():
776
+ lines.extend(["", "## Abstract", "", abstract])
777
+ else:
778
+ lines.extend(["", "## Abstract", "", abstract or "Abstract unavailable."])
779
+ return "\n".join(lines).strip()
780
+
781
+
782
+ def _strip_duplicate_heading(content: str, title: str) -> str:
783
+ if not content:
784
+ return ""
785
+ lines = content.splitlines()
786
+ cleaned_title = _clean_inline_text(title)
787
+ while lines:
788
+ current = lines[0].strip()
789
+ if not current:
790
+ lines.pop(0)
791
+ continue
792
+ stripped = re.sub(r"^#+\s*", "", current)
793
+ if cleaned_title and _clean_inline_text(stripped).lower() == cleaned_title.lower():
794
+ lines.pop(0)
795
+ continue
796
+ break
797
+ return "\n".join(lines).strip()
798
+
799
+
800
+ def _build_full_text_content(metadata: dict[str, Any], raw_content: str) -> str:
801
+ lines = _build_metadata_lines(metadata)
802
+ abstract = str(metadata.get("abstract") or "").strip()
803
+ if abstract:
804
+ lines.extend(["", "## Abstract", "", abstract])
805
+ body = _strip_duplicate_heading(raw_content, str(metadata.get("title") or ""))
806
+ if body:
807
+ lines.extend(["", "## Full Text", "", body])
808
+ return "\n".join(lines).strip()
809
+
810
+
364
811
  def _format_error(exc: Exception) -> str:
365
812
  message = str(exc).strip()
366
813
  return message or exc.__class__.__name__
@@ -506,9 +506,7 @@ def _normalize_metric_entry(metric: object, *, fallback_id: str | None = None) -
506
506
  metric_id = as_metric_id(
507
507
  metric.get("metric_id") or metric.get("id") or metric.get("name") or fallback_id,
508
508
  )
509
- direction = str(metric.get("direction") or "").strip().lower()
510
- if direction not in {"maximize", "minimize"}:
511
- direction = infer_metric_direction(metric_id)
509
+ direction = normalize_metric_direction(metric.get("direction"), metric_id=metric_id)
512
510
  decimals_raw = metric.get("decimals")
513
511
  decimals = int(decimals_raw) if isinstance(decimals_raw, int) else None
514
512
  chart_group = str(metric.get("chart_group") or "default").strip() or "default"