@zenalexa/unicli 0.225.1 → 0.225.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (271) hide show
  1. package/AGENTS.md +3 -3
  2. package/README.md +3 -3
  3. package/README.zh-CN.md +3 -3
  4. package/dist/adapters/acl-anthology/papers.d.ts +16 -9
  5. package/dist/adapters/acl-anthology/papers.d.ts.map +1 -1
  6. package/dist/adapters/acl-anthology/papers.js +322 -58
  7. package/dist/adapters/acl-anthology/papers.js.map +1 -1
  8. package/dist/adapters/arxiv/papers.d.ts +22 -4
  9. package/dist/adapters/arxiv/papers.d.ts.map +1 -1
  10. package/dist/adapters/arxiv/papers.js +202 -4
  11. package/dist/adapters/arxiv/papers.js.map +1 -1
  12. package/dist/adapters/baidu-scholar/search.d.ts +15 -1
  13. package/dist/adapters/baidu-scholar/search.d.ts.map +1 -1
  14. package/dist/adapters/baidu-scholar/search.js +72 -8
  15. package/dist/adapters/baidu-scholar/search.js.map +1 -1
  16. package/dist/adapters/biorxiv/preprints.d.ts +9 -0
  17. package/dist/adapters/biorxiv/preprints.d.ts.map +1 -0
  18. package/dist/adapters/biorxiv/preprints.js +78 -0
  19. package/dist/adapters/biorxiv/preprints.js.map +1 -0
  20. package/dist/adapters/cnki/search.d.ts +82 -0
  21. package/dist/adapters/cnki/search.d.ts.map +1 -0
  22. package/dist/adapters/cnki/search.js +236 -0
  23. package/dist/adapters/cnki/search.js.map +1 -0
  24. package/dist/adapters/cvf/papers.d.ts +12 -7
  25. package/dist/adapters/cvf/papers.d.ts.map +1 -1
  26. package/dist/adapters/cvf/papers.js +210 -27
  27. package/dist/adapters/cvf/papers.js.map +1 -1
  28. package/dist/adapters/dblp/publications.d.ts +12 -5
  29. package/dist/adapters/dblp/publications.d.ts.map +1 -1
  30. package/dist/adapters/dblp/publications.js +31 -8
  31. package/dist/adapters/dblp/publications.js.map +1 -1
  32. package/dist/adapters/google-scholar/search.d.ts +22 -1
  33. package/dist/adapters/google-scholar/search.d.ts.map +1 -1
  34. package/dist/adapters/google-scholar/search.js +129 -14
  35. package/dist/adapters/google-scholar/search.js.map +1 -1
  36. package/dist/adapters/hf/paper.d.ts +12 -3
  37. package/dist/adapters/hf/paper.d.ts.map +1 -1
  38. package/dist/adapters/hf/paper.js +65 -5
  39. package/dist/adapters/hf/paper.js.map +1 -1
  40. package/dist/adapters/medrxiv/preprints.d.ts +9 -0
  41. package/dist/adapters/medrxiv/preprints.d.ts.map +1 -0
  42. package/dist/adapters/medrxiv/preprints.js +78 -0
  43. package/dist/adapters/medrxiv/preprints.js.map +1 -0
  44. package/dist/adapters/neurips/proceedings.d.ts +8 -7
  45. package/dist/adapters/neurips/proceedings.d.ts.map +1 -1
  46. package/dist/adapters/neurips/proceedings.js +209 -21
  47. package/dist/adapters/neurips/proceedings.js.map +1 -1
  48. package/dist/adapters/openalex/works.d.ts +21 -5
  49. package/dist/adapters/openalex/works.d.ts.map +1 -1
  50. package/dist/adapters/openalex/works.js +108 -8
  51. package/dist/adapters/openalex/works.js.map +1 -1
  52. package/dist/adapters/openreview/papers.d.ts +10 -4
  53. package/dist/adapters/openreview/papers.d.ts.map +1 -1
  54. package/dist/adapters/openreview/papers.js +351 -24
  55. package/dist/adapters/openreview/papers.js.map +1 -1
  56. package/dist/adapters/pmlr/proceedings.d.ts +6 -6
  57. package/dist/adapters/pmlr/proceedings.d.ts.map +1 -1
  58. package/dist/adapters/pmlr/proceedings.js +92 -12
  59. package/dist/adapters/pmlr/proceedings.js.map +1 -1
  60. package/dist/adapters/pubmed/articles.d.ts +8 -4
  61. package/dist/adapters/pubmed/articles.d.ts.map +1 -1
  62. package/dist/adapters/pubmed/articles.js +272 -39
  63. package/dist/adapters/pubmed/articles.js.map +1 -1
  64. package/dist/adapters/rxiv/preprints.d.ts +75 -0
  65. package/dist/adapters/rxiv/preprints.d.ts.map +1 -0
  66. package/dist/adapters/rxiv/preprints.js +651 -0
  67. package/dist/adapters/rxiv/preprints.js.map +1 -0
  68. package/dist/adapters/scholar-artifacts/pdf-read.d.ts +49 -0
  69. package/dist/adapters/scholar-artifacts/pdf-read.d.ts.map +1 -0
  70. package/dist/adapters/scholar-artifacts/pdf-read.js +204 -0
  71. package/dist/adapters/scholar-artifacts/pdf-read.js.map +1 -0
  72. package/dist/adapters/scholar-artifacts/pdf.d.ts +16 -0
  73. package/dist/adapters/scholar-artifacts/pdf.d.ts.map +1 -0
  74. package/dist/adapters/scholar-artifacts/pdf.js +122 -0
  75. package/dist/adapters/scholar-artifacts/pdf.js.map +1 -0
  76. package/dist/adapters/semantic-scholar/papers.d.ts +6 -6
  77. package/dist/adapters/semantic-scholar/papers.d.ts.map +1 -1
  78. package/dist/adapters/semantic-scholar/papers.js +80 -6
  79. package/dist/adapters/semantic-scholar/papers.js.map +1 -1
  80. package/dist/adapters/unpaywall/works.d.ts +7 -7
  81. package/dist/adapters/unpaywall/works.d.ts.map +1 -1
  82. package/dist/adapters/unpaywall/works.js +104 -12
  83. package/dist/adapters/unpaywall/works.js.map +1 -1
  84. package/dist/adapters/wanfang/search.d.ts +14 -0
  85. package/dist/adapters/wanfang/search.d.ts.map +1 -1
  86. package/dist/adapters/wanfang/search.js +56 -7
  87. package/dist/adapters/wanfang/search.js.map +1 -1
  88. package/dist/browser/page.d.ts +2 -0
  89. package/dist/browser/page.d.ts.map +1 -1
  90. package/dist/browser/page.js +12 -0
  91. package/dist/browser/page.js.map +1 -1
  92. package/dist/browser/protocol.d.ts +6 -1
  93. package/dist/browser/protocol.d.ts.map +1 -1
  94. package/dist/browser/protocol.js.map +1 -1
  95. package/dist/commands/browser/actions.d.ts.map +1 -1
  96. package/dist/commands/browser/actions.js +487 -8
  97. package/dist/commands/browser/actions.js.map +1 -1
  98. package/dist/commands/compute.js +12 -1
  99. package/dist/commands/compute.js.map +1 -1
  100. package/dist/commands/schema.d.ts.map +1 -1
  101. package/dist/commands/schema.js +22 -0
  102. package/dist/commands/schema.js.map +1 -1
  103. package/dist/commands/scholar.d.ts +77 -5
  104. package/dist/commands/scholar.d.ts.map +1 -1
  105. package/dist/commands/scholar.js +2945 -83
  106. package/dist/commands/scholar.js.map +1 -1
  107. package/dist/commands/search.d.ts.map +1 -1
  108. package/dist/commands/search.js +14 -3
  109. package/dist/commands/search.js.map +1 -1
  110. package/dist/compute/contracts.d.ts +55 -0
  111. package/dist/compute/contracts.d.ts.map +1 -0
  112. package/dist/compute/contracts.js +487 -0
  113. package/dist/compute/contracts.js.map +1 -0
  114. package/dist/core/command-contract.d.ts.map +1 -1
  115. package/dist/core/command-contract.js +5 -0
  116. package/dist/core/command-contract.js.map +1 -1
  117. package/dist/core/schema-v2.d.ts +1 -0
  118. package/dist/core/schema-v2.d.ts.map +1 -1
  119. package/dist/core/schema-v2.js +1 -0
  120. package/dist/core/schema-v2.js.map +1 -1
  121. package/dist/discovery/aliases.d.ts +8 -1
  122. package/dist/discovery/aliases.d.ts.map +1 -1
  123. package/dist/discovery/aliases.js +333 -20
  124. package/dist/discovery/aliases.js.map +1 -1
  125. package/dist/discovery/core-catalog.d.ts +2 -0
  126. package/dist/discovery/core-catalog.d.ts.map +1 -1
  127. package/dist/discovery/core-catalog.js +525 -66
  128. package/dist/discovery/core-catalog.js.map +1 -1
  129. package/dist/discovery/intents.d.ts +1 -0
  130. package/dist/discovery/intents.d.ts.map +1 -1
  131. package/dist/discovery/intents.js +299 -2
  132. package/dist/discovery/intents.js.map +1 -1
  133. package/dist/discovery/loader.d.ts.map +1 -1
  134. package/dist/discovery/loader.js +3 -0
  135. package/dist/discovery/loader.js.map +1 -1
  136. package/dist/discovery/macos-dynamic.d.ts +1 -0
  137. package/dist/discovery/macos-dynamic.d.ts.map +1 -1
  138. package/dist/discovery/macos-dynamic.js +20 -1
  139. package/dist/discovery/macos-dynamic.js.map +1 -1
  140. package/dist/discovery/search.d.ts.map +1 -1
  141. package/dist/discovery/search.js +12 -5
  142. package/dist/discovery/search.js.map +1 -1
  143. package/dist/engine/browser/evidence.d.ts +34 -1
  144. package/dist/engine/browser/evidence.d.ts.map +1 -1
  145. package/dist/engine/browser/evidence.js +141 -6
  146. package/dist/engine/browser/evidence.js.map +1 -1
  147. package/dist/engine/capability-policy.d.ts.map +1 -1
  148. package/dist/engine/capability-policy.js +30 -4
  149. package/dist/engine/capability-policy.js.map +1 -1
  150. package/dist/engine/kernel/stages.d.ts.map +1 -1
  151. package/dist/engine/kernel/stages.js +3 -0
  152. package/dist/engine/kernel/stages.js.map +1 -1
  153. package/dist/engine/operation-policy.d.ts +4 -1
  154. package/dist/engine/operation-policy.d.ts.map +1 -1
  155. package/dist/engine/operation-policy.js +23 -0
  156. package/dist/engine/operation-policy.js.map +1 -1
  157. package/dist/engine/steps/fetch-text.d.ts.map +1 -1
  158. package/dist/engine/steps/fetch-text.js +2 -2
  159. package/dist/engine/steps/fetch-text.js.map +1 -1
  160. package/dist/engine/steps/fetch.d.ts +1 -0
  161. package/dist/engine/steps/fetch.d.ts.map +1 -1
  162. package/dist/engine/steps/fetch.js +24 -4
  163. package/dist/engine/steps/fetch.js.map +1 -1
  164. package/dist/fast-path/handlers/discovery.d.ts +5 -5
  165. package/dist/fast-path/handlers/discovery.d.ts.map +1 -1
  166. package/dist/fast-path/handlers/discovery.js +61 -8
  167. package/dist/fast-path/handlers/discovery.js.map +1 -1
  168. package/dist/fast-path/manifest.d.ts +3 -0
  169. package/dist/fast-path/manifest.d.ts.map +1 -1
  170. package/dist/fast-path/manifest.js.map +1 -1
  171. package/dist/fast-path/policy.d.ts.map +1 -1
  172. package/dist/fast-path/policy.js +3 -0
  173. package/dist/fast-path/policy.js.map +1 -1
  174. package/dist/fast-path/render.d.ts +2 -0
  175. package/dist/fast-path/render.d.ts.map +1 -1
  176. package/dist/fast-path/render.js +9 -0
  177. package/dist/fast-path/render.js.map +1 -1
  178. package/dist/manifest-compact.txt +2 -2
  179. package/dist/manifest.json +6977 -1002
  180. package/dist/mcp/handler.d.ts +2 -16
  181. package/dist/mcp/handler.d.ts.map +1 -1
  182. package/dist/mcp/handler.js.map +1 -1
  183. package/dist/mcp/http-transport.d.ts +7 -1
  184. package/dist/mcp/http-transport.d.ts.map +1 -1
  185. package/dist/mcp/http-transport.js +20 -1
  186. package/dist/mcp/http-transport.js.map +1 -1
  187. package/dist/mcp/jsonrpc.d.ts +27 -0
  188. package/dist/mcp/jsonrpc.d.ts.map +1 -0
  189. package/dist/mcp/jsonrpc.js +12 -0
  190. package/dist/mcp/jsonrpc.js.map +1 -0
  191. package/dist/mcp/origin-guard.d.ts +26 -0
  192. package/dist/mcp/origin-guard.d.ts.map +1 -0
  193. package/dist/mcp/origin-guard.js +42 -0
  194. package/dist/mcp/origin-guard.js.map +1 -0
  195. package/dist/mcp/profiles/computer-use.d.ts.map +1 -1
  196. package/dist/mcp/profiles/computer-use.js +30 -270
  197. package/dist/mcp/profiles/computer-use.js.map +1 -1
  198. package/dist/mcp/streamable-http/session.d.ts +4 -22
  199. package/dist/mcp/streamable-http/session.d.ts.map +1 -1
  200. package/dist/mcp/streamable-http/session.js +4 -24
  201. package/dist/mcp/streamable-http/session.js.map +1 -1
  202. package/dist/mcp/tools.d.ts.map +1 -1
  203. package/dist/mcp/tools.js +74 -54
  204. package/dist/mcp/tools.js.map +1 -1
  205. package/dist/output/envelope.d.ts +2 -0
  206. package/dist/output/envelope.d.ts.map +1 -1
  207. package/dist/output/envelope.js.map +1 -1
  208. package/dist/output/error-map.d.ts +14 -0
  209. package/dist/output/error-map.d.ts.map +1 -1
  210. package/dist/output/error-map.js +20 -0
  211. package/dist/output/error-map.js.map +1 -1
  212. package/dist/registry.d.ts +2 -0
  213. package/dist/registry.d.ts.map +1 -1
  214. package/dist/registry.js +1 -0
  215. package/dist/registry.js.map +1 -1
  216. package/dist/transport/cascade.d.ts.map +1 -1
  217. package/dist/transport/cascade.js +77 -5
  218. package/dist/transport/cascade.js.map +1 -1
  219. package/dist/transport/refs.d.ts +33 -1
  220. package/dist/transport/refs.d.ts.map +1 -1
  221. package/dist/transport/refs.js +40 -1
  222. package/dist/transport/refs.js.map +1 -1
  223. package/dist/types/scholarly.d.ts +19 -4
  224. package/dist/types/scholarly.d.ts.map +1 -1
  225. package/dist/types/scholarly.js +4 -4
  226. package/dist/types.d.ts +8 -0
  227. package/dist/types.d.ts.map +1 -1
  228. package/dist/types.js.map +1 -1
  229. package/package.json +1 -1
  230. package/server.json +2 -2
  231. package/skills/unicli/SKILL.md +1 -1
  232. package/skills/unicli-claude-code/SKILL.md +1 -1
  233. package/skills/unicli-hermes/SKILL.md +1 -1
  234. package/src/adapters/acl-anthology/papers.test.ts +111 -0
  235. package/src/adapters/acl-anthology/papers.ts +379 -71
  236. package/src/adapters/arxiv/papers.test.ts +46 -0
  237. package/src/adapters/arxiv/papers.ts +251 -4
  238. package/src/adapters/baidu-scholar/search.ts +74 -11
  239. package/src/adapters/biorxiv/preprints.ts +112 -0
  240. package/src/adapters/cnki/search.ts +357 -0
  241. package/src/adapters/cvf/papers.ts +260 -27
  242. package/src/adapters/dblp/publications.test.ts +9 -0
  243. package/src/adapters/dblp/publications.ts +31 -8
  244. package/src/adapters/defuddle/read.yaml +30 -0
  245. package/src/adapters/google-scholar/search.ts +165 -17
  246. package/src/adapters/hf/paper.test.ts +23 -0
  247. package/src/adapters/hf/paper.ts +89 -5
  248. package/src/adapters/hf/top.yaml +34 -2
  249. package/src/adapters/huggingface-papers/daily.yaml +37 -3
  250. package/src/adapters/huggingface-papers/search.yaml +43 -9
  251. package/src/adapters/jina/read.yaml +30 -0
  252. package/src/adapters/markdown-new/read.yaml +50 -0
  253. package/src/adapters/medrxiv/preprints.ts +112 -0
  254. package/src/adapters/neurips/proceedings.ts +266 -22
  255. package/src/adapters/ollama-cloud/fetch.yaml +39 -0
  256. package/src/adapters/ollama-cloud/search.yaml +43 -0
  257. package/src/adapters/openalex/works.test.ts +15 -4
  258. package/src/adapters/openalex/works.ts +136 -8
  259. package/src/adapters/openreview/papers.test.ts +31 -0
  260. package/src/adapters/openreview/papers.ts +407 -29
  261. package/src/adapters/pmlr/proceedings.ts +102 -12
  262. package/src/adapters/pubmed/articles.test.ts +88 -1
  263. package/src/adapters/pubmed/articles.ts +343 -44
  264. package/src/adapters/rxiv/preprints.test.ts +233 -0
  265. package/src/adapters/rxiv/preprints.ts +849 -0
  266. package/src/adapters/scholar-artifacts/pdf-read.ts +277 -0
  267. package/src/adapters/scholar-artifacts/pdf.ts +133 -0
  268. package/src/adapters/semantic-scholar/papers.ts +98 -6
  269. package/src/adapters/unpaywall/works.ts +141 -12
  270. package/src/adapters/wanfang/search.ts +57 -7
  271. package/src/adapters/cnki/search.yaml +0 -49
@@ -0,0 +1,111 @@
1
+ import { describe, expect, it } from "vitest";
2
+
3
+ import { resolveCommand } from "../../registry.js";
4
+ import {
5
+ aclAnthologyPdfUrl,
6
+ aclArtifactFilename,
7
+ cleanAclHtml,
8
+ normalizeAclAnthologyId,
9
+ parseAclBibEntries,
10
+ requireAclReadPageArgs,
11
+ searchAclBibRows,
12
+ } from "./papers.js";
13
+
14
// Unit coverage for the ACL Anthology adapter helpers and the registered `read` command.
describe("acl-anthology scholarly commands", () => {
  it("normalizes source ids and URLs without accepting traversal", () => {
    // A bare id, a PDF URL, and a landing-page URL must all collapse to one id.
    const canonicalId = "2020.acl-main.447";
    const equivalentInputs = [
      "2020.acl-main.447",
      "https://aclanthology.org/2020.acl-main.447.pdf",
      "https://aclanthology.org/2020.acl-main.447/",
    ];
    for (const input of equivalentInputs) {
      expect(normalizeAclAnthologyId(input)).toBe(canonicalId);
    }

    const traverse = () => normalizeAclAnthologyId("../2020.acl-main.447");
    expect(traverse).toThrow("ACL Anthology id");
  });

  it("builds official ACL PDF URLs and stable artifact filenames", () => {
    const pdfUrl = aclAnthologyPdfUrl("2020.acl-main.447");
    expect(pdfUrl).toBe("https://aclanthology.org/2020.acl-main.447.pdf");

    const filename = aclArtifactFilename({
      id: "2020.acl-main.447",
      title: "S2ORC: The Semantic Scholar Open Research Corpus",
      source_adapter: "acl-anthology",
      retrieved_at: "2026-06-27T00:00:00Z",
    });
    expect(filename).toBe(
      "2020.acl-main.447-S2ORC__The_Semantic_Scholar_Open_Research_Corpus.pdf",
    );
  });

  it("maps scholar read hyphenated arguments to the PDF reader contract", () => {
    const hyphenated = {
      "first-page": "2",
      "last-page": "4",
      "max-chars": "1000",
    };
    const mapped = requireAclReadPageArgs(hyphenated);
    expect(mapped).toEqual({
      first_page: "2",
      last_page: "4",
      max_chars: "1000",
    });
  });

  it("registers ACL read as source-level fulltext with pdftotext governance", () => {
    const resolved = resolveCommand("acl-anthology", "read");
    const command = resolved?.command;
    const expectedCapabilities = [
      "http.fetch",
      "http.download",
      "subprocess.exec",
      "scholar.fulltext",
      "scholar.pdf",
    ];
    expect(command?.capabilities).toEqual(expectedCapabilities);
    expect(command?.executables).toEqual(["pdftotext"]);
    expect(command?.minimum_capability).toBe("subprocess.exec");
  });

  it("parses official ACL BibTeX export rows for source-backed search", () => {
    // Two entries: a full record and a minimal one without DOI/booktitle.
    const bibExport = `
@inproceedings{lo-etal-2020-s2orc,
    title = "{S}2{ORC}: The Semantic Scholar Open Research Corpus",
    author = "Lo, Kyle and
      Wang, Lucy Lu and
      Neumann, Mark",
    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
    year = "2020",
    url = "https://aclanthology.org/2020.acl-main.447/",
    doi = "10.18653/v1/2020.acl-main.447",
    pages = "4969--4983"
}
@inproceedings{sharma-etal-2026-council,
    title = "Council of {LLM}s",
    author = "Sharma, Vivek",
    year = "2026",
    url = "https://aclanthology.org/2026.wassa-1.1/"
}
`;
    const rows = parseAclBibEntries(bibExport);
    const [first] = rows;

    expect(first).toMatchObject({
      id: "2020.acl-main.447",
      title: "S2ORC: The Semantic Scholar Open Research Corpus",
      authors: ["Kyle Lo", "Lucy Lu Wang", "Mark Neumann"],
      year: 2020,
      doi: "10.18653/v1/2020.acl-main.447",
      pdf_url: "https://aclanthology.org/2020.acl-main.447.pdf",
      source_url: "https://aclanthology.org/2020.acl-main.447/",
    });

    const hits = searchAclBibRows(rows, "Semantic Scholar Corpus", 1);
    expect(hits).toEqual([first]);
  });

  it("cleans ACL Anthology title HTML", () => {
    const markup =
      '<a href="https://aclanthology.org/demo.pdf"><span class="acl-fixed-case">S</span>2ORC</a>';
    expect(cleanAclHtml(markup)).toBe("S2ORC");
  });
});
@@ -1,22 +1,31 @@
1
1
  /**
2
2
  * @owner src::adapters::acl-anthology::papers
3
- * @does Registers ACL Anthology paper search and event proceedings listing from official Anthology pages.
4
- * @needs aclanthology.org static search/event HTML, src/registry.ts
5
- * @feeds src/commands/scholar.ts via scholar.search, scholar.get, scholar.pdf, and scholar.venue
6
- * @breaks ACL Anthology markup drift surfaces as empty parse output; no browser workaround is used.
7
- * @invariants Paper URLs/PDF URLs are absolutized against aclanthology.org; event keys are explicit.
8
- * @side-effects HTTPS egress to aclanthology.org only
9
- * @perf O(N) over one HTML response
3
+ * @does Registers ACL Anthology paper search, metadata lookup, PDF download, and PDF text-read commands from official Anthology pages.
4
+ * @needs aclanthology.org anthology.bib.gz export and static paper HTML, scholar-artifacts PDF reader, src/registry.ts
5
+ * @feeds src/commands/scholar.ts via scholar.search, scholar.get, scholar.pdf, scholar.fulltext, and scholar.venue
6
+ * @breaks ACL Anthology markup drift, denied PDF downloads, missing pdftotext, or empty PDF text surfaces as source read failure.
7
+ * @invariants Paper URLs/PDF URLs are absolutized against aclanthology.org; read output labels `text_source=pdf`.
8
+ * @side-effects HTTPS egress to aclanthology.org; read writes PDFs under the requested output directory and executes pdftotext.
9
+ * @perf O(N) over the full anthology.bib export for search; O(PDF bytes + extracted pages) for read.
10
10
  * @concurrency safe
11
- * @test covered by scholar command discovery and parser style tests for sibling proceedings sources
11
+ * @test src/adapters/acl-anthology/papers.test.ts, tests/unit/adapters/scholar-sources.test.ts
12
12
  * @stability experimental
13
13
  * @since 2026-05-19
14
14
  */
15
15
 
16
+ import { execFile } from "node:child_process";
17
+ import { join, resolve } from "node:path";
18
+ import { promisify } from "node:util";
19
+ import { gunzipSync } from "node:zlib";
20
+
16
21
  import { cli, Strategy } from "../../registry.js";
22
+ import { httpDownload, sanitizeFilename } from "../../engine/download.js";
17
23
  import type { ScholarlyWorkRecord } from "../../types/scholarly.js";
18
24
 
19
25
// Site root; the paper, PDF, and BibTeX-export URLs in this module are derived from it.
const ORIGIN = "https://aclanthology.org";
// Gzipped BibTeX export that the search command downloads and scans.
const ANTHOLOGY_BIB_URL = ORIGIN + "/anthology.bib.gz";
// Promise-returning wrapper around execFile, used to run pdftotext.
const execFileAsync = promisify(execFile);
// Memoized BibTeX download for this process; undefined until the first search.
let anthologyBibCache: Promise<string> | undefined;
20
29
 
21
30
  function decode(value: string): string {
22
31
  return value
@@ -29,54 +38,206 @@ function decode(value: string): string {
29
38
  .trim();
30
39
  }
31
40
 
32
- function absolute(path: string): string {
33
- return /^https?:\/\//i.test(path)
34
- ? path
35
- : `${ORIGIN}${path.startsWith("/") ? "" : "/"}${path}`;
41
/**
 * Reduces an ACL Anthology HTML fragment to plain text.
 *
 * Every `<...>` tag is removed first; the remaining text is then passed
 * through this module's `decode` helper.
 *
 * @param value HTML fragment, e.g. the inner markup of a title heading.
 * @returns The tag-free text as returned by `decode`.
 */
export function cleanAclHtml(value: string): string {
  const withoutTags = value.replace(/<[^>]+>/g, "");
  return decode(withoutTags);
}
44
+
45
/**
 * Folds text into a lowercase, space-separated token string for matching.
 *
 * Every run of characters outside `a-z0-9` becomes a single space, and the
 * result is trimmed.
 *
 * @param value Arbitrary text (query, title, author list, ...).
 * @returns The normalized text; empty when no ASCII letter or digit is present.
 */
function normalizeSearchText(value: string): string {
  const lowered = value.toLowerCase();
  const separated = lowered.replace(/[^a-z0-9]+/g, " ");
  const collapsed = separated.replace(/\s+/g, " ");
  return collapsed.trim();
}
37
52
 
38
- function normalizeId(value: unknown): string {
39
- const raw = String(value ?? "").trim();
53
/**
 * Normalizes a user-supplied ACL Anthology reference to a bare paper id.
 *
 * Accepts a bare id (`2020.acl-main.447`, `P19-1001`), a landing-page URL, or
 * a PDF URL on aclanthology.org. The site prefix, one trailing slash, a
 * trailing `.pdf`, and one trailing dot are stripped.
 *
 * The id must start with a letter or digit and may only contain letters,
 * digits, `-`, and single dots. This rejects path separators as well as
 * dot-only or `..`-containing values, so the result is always safe to
 * interpolate into an aclanthology.org path.
 *
 * @param value Raw id or URL; `null`/`undefined` are treated as empty.
 * @returns The normalized id.
 * @throws Error when the value is not a well-formed ACL Anthology id.
 */
export function normalizeAclAnthologyId(value: unknown): string {
  const input = String(value ?? "");
  const raw = input
    .trim()
    .replace(/^https?:\/\/(?:www\.)?aclanthology\.org\//i, "")
    // Drop the slash first so ".../<id>.pdf/" also loses its ".pdf" suffix.
    .replace(/\/$/, "")
    .replace(/\.pdf$/i, "");
  // Leading alphanumeric + no ".." keeps ".", "..", and ".hidden" out.
  if (!/^[A-Za-z0-9](?:[A-Za-z0-9-]|\.(?!\.))*$/.test(raw)) {
    throw new Error(`ACL Anthology id "${input}" is not valid.`);
  }
  return raw.replace(/\.$/, "");
}
45
64
 
46
- function parseRows(
47
- html: string,
48
- source = "acl-anthology",
49
- ): ScholarlyWorkRecord[] {
50
- const out: ScholarlyWorkRecord[] = [];
65
/**
 * Builds the PDF URL for a paper id by appending `.pdf` to the id under ORIGIN.
 *
 * @param id Paper id; it is interpolated as-is, without validation.
 * @returns `<ORIGIN>/<id>.pdf`.
 */
export function aclAnthologyPdfUrl(id: string): string {
  const pdfPath = `${id}.pdf`;
  return `${ORIGIN}/${pdfPath}`;
}
68
+
69
/**
 * Converts a raw BibTeX field value into plain text.
 *
 * The rewrites run in a fixed order: trailing comma, outer quote/brace
 * delimiters, escaped quotes and ampersands, `\url{...}` and other
 * single-argument commands (keeping the argument), any remaining braces, and
 * finally whitespace runs. The result is passed through `decode`.
 *
 * @param value Raw field value as captured from the entry.
 * @returns The cleaned value.
 */
function cleanBibValue(value: string): string {
  // Order matters: delimiters must go before the generic brace removal.
  const rewrites: ReadonlyArray<readonly [RegExp, string]> = [
    [/,$/, ""],
    [/^["{]|["}]$/g, ""],
    [/\\"/g, '"'],
    [/\\&/g, "&"],
    [/\\url\{([^}]+)\}/g, "$1"],
    [/\\[a-zA-Z]+\{([^}]*)\}/g, "$1"],
    [/[{}]/g, ""],
    [/\s+/g, " "],
  ];
  let cleaned = value.trim();
  for (const [pattern, replacement] of rewrites) {
    cleaned = cleaned.replace(pattern, replacement);
  }
  return decode(cleaned);
}
83
+
84
/**
 * Turns a BibTeX `author` value into a list of display names.
 *
 * Names are split on the ` and ` separator. A name with exactly one comma
 * ("Last, First") is flipped to "First Last"; any other shape is kept as
 * written. Blank names are dropped and at most 20 names are returned.
 *
 * @param value Raw `author` field value.
 * @returns Up to 20 author names, or `undefined` when none remain.
 */
function normalizeBibAuthors(value: string): string[] | undefined {
  const names: string[] = [];
  for (const rawName of cleanBibValue(value).split(/\s+and\s+/)) {
    const pieces = rawName.split(/\s*,\s*/);
    const display =
      pieces.length === 2 ? `${pieces[1]} ${pieces[0]}` : rawName;
    const trimmed = display.trim();
    if (trimmed) names.push(trimmed);
  }
  if (names.length === 0) return undefined;
  return names.slice(0, 20);
}
95
+
96
/**
 * Extracts the `name = value` fields of one BibTeX entry.
 *
 * A field starts at the beginning of a line and its value runs (possibly over
 * several lines) until the next `,\n<name> =` or the entry's closing `}` line.
 * Values are returned raw, still carrying their quote/brace delimiters.
 *
 * @param entry Text of a single `@type{key, ...}` entry.
 * @returns Map from lowercased field name to raw value; later duplicates win.
 */
function parseBibFields(entry: string): Record<string, string> {
  const fieldPattern =
    /^\s*([A-Za-z][A-Za-z0-9_-]*)\s*=\s*([\s\S]*?)(?=,\n\s*[A-Za-z][A-Za-z0-9_-]*\s*=|\n}\s*$)/gm;
  const fields: Record<string, string> = {};
  for (const [, name, rawValue] of entry.matchAll(fieldPattern)) {
    fields[name.toLowerCase()] = rawValue;
  }
  return fields;
}
106
+
107
/**
 * Parses an ACL Anthology BibTeX export into scholarly work records.
 *
 * Entries are split at each line that begins with `@type{`. For every entry
 * the id is taken from the `url` field when present, otherwise from the
 * citation key. Entries without a title, or whose url/key cannot be
 * normalized to an Anthology id, are skipped rather than aborting the parse:
 * the export is large, and one malformed entry must not fail every search.
 *
 * @param bib Full text of the BibTeX export.
 * @returns One record per usable entry, in file order. All records share a
 *   single `retrieved_at` timestamp taken when parsing starts.
 */
export function parseAclBibEntries(bib: string): ScholarlyWorkRecord[] {
  const rows: ScholarlyWorkRecord[] = [];
  // One timestamp per parse: every row comes from the same retrieval.
  const retrievedAt = new Date().toISOString();
  for (const entry of bib.split(/\n(?=@[A-Za-z]+\{)/)) {
    const header = entry.match(/^@([A-Za-z]+)\{([^,]+),/);
    if (!header) continue;
    const fields = parseBibFields(entry);
    const title = fields.title ? cleanBibValue(fields.title) : "";
    if (!title) continue;
    const sourceUrl = fields.url ? cleanBibValue(fields.url) : "";
    let id: string;
    try {
      id = normalizeAclAnthologyId(sourceUrl || header[2]);
    } catch {
      // Deliberate best-effort: an entry with a foreign URL or an unusual
      // citation key is dropped instead of failing the whole export.
      continue;
    }
    if (!id) continue;
    const year = fields.year ? Number(cleanBibValue(fields.year)) : undefined;
    const doi = fields.doi ? cleanBibValue(fields.doi) : undefined;
    rows.push({
      id,
      title,
      authors: fields.author ? normalizeBibAuthors(fields.author) : undefined,
      year: Number.isInteger(year) ? year : undefined,
      venue: fields.booktitle
        ? cleanBibValue(fields.booktitle)
        : "ACL Anthology",
      doi,
      pdf_url: aclAnthologyPdfUrl(id),
      source_adapter: "acl-anthology",
      source_url: `${ORIGIN}/${id}/`,
      retrieved_at: retrievedAt,
    });
  }
  return rows;
}
138
+
139
/**
 * Scores how well one record matches a free-text query.
 *
 * A record only scores when every normalized query term occurs somewhere in
 * its id, title, venue, year, DOI, or author list. Matching records start at
 * 10 and earn bonuses for an exact id match (+100), an exact title match
 * (+80), the query appearing inside the title (+40), and the query appearing
 * inside the author list (+20). A fractional `year / 10000` term (capped at 1)
 * breaks ties in favor of newer papers.
 *
 * The id is compared in normalized form: the query has its punctuation folded
 * to spaces, so comparing against the raw id would never match ids such as
 * `2020.acl-main.447`.
 *
 * @param row Candidate record.
 * @param query Raw user query.
 * @returns 0 when the record does not match, otherwise a positive score.
 */
function scoreAclBibRow(row: ScholarlyWorkRecord, query: string): number {
  const normalizedQuery = normalizeSearchText(query);
  if (!normalizedQuery) return 0;
  const terms = normalizedQuery.split(" ").filter(Boolean);
  const authorText = (row.authors ?? []).join(" ");
  const haystack = normalizeSearchText(
    [row.id, row.title, row.venue, row.year, row.doi, authorText]
      .filter(Boolean)
      .join(" "),
  );
  if (!terms.every((term) => haystack.includes(term))) {
    return 0;
  }
  const id = normalizeSearchText(row.id);
  const title = normalizeSearchText(row.title);
  const authors = normalizeSearchText(authorText);
  let score = 10;
  if (id === normalizedQuery) score += 100;
  if (title === normalizedQuery) score += 80;
  if (title.includes(normalizedQuery)) score += 40;
  if (authors.includes(normalizedQuery)) score += 20;
  return score + Math.min(Number(row.year ?? 0) / 10_000, 1);
}
166
+
167
/**
 * Ranks records against a query and returns the best matches.
 *
 * Records scoring 0 in `scoreAclBibRow` are dropped. The rest are ordered by
 * descending score, with the original input position breaking ties.
 *
 * @param rows Candidate records; the array is not modified.
 * @param query Raw user query.
 * @param limit Maximum number of records to return.
 * @returns The top `limit` matching records.
 */
export function searchAclBibRows(
  rows: readonly ScholarlyWorkRecord[],
  query: string,
  limit: number,
): ScholarlyWorkRecord[] {
  const ranked: Array<{
    row: ScholarlyWorkRecord;
    index: number;
    score: number;
  }> = [];
  rows.forEach((row, index) => {
    const score = scoreAclBibRow(row, query);
    if (score > 0) ranked.push({ row, index, score });
  });
  ranked.sort((left, right) =>
    right.score !== left.score
      ? right.score - left.score
      : left.index - right.index,
  );
  return ranked.slice(0, limit).map(({ row }) => row);
}
183
+
184
/**
 * Maps the read command's page arguments onto snake_case keys.
 *
 * Each value is read from its hyphenated key first (`first-page`), falling
 * back to the camelCase key (`firstPage`) when the hyphenated one is
 * `null`/`undefined`. Values are passed through without conversion.
 *
 * @param kwargs Raw command arguments.
 * @returns An object with `first_page`, `last_page`, and `max_chars`.
 */
export function requireAclReadPageArgs(
  kwargs: Record<string, unknown>,
): Record<string, unknown> {
  const firstPage = kwargs["first-page"] ?? kwargs.firstPage;
  const lastPage = kwargs["last-page"] ?? kwargs.lastPage;
  const maxChars = kwargs["max-chars"] ?? kwargs.maxChars;
  return { first_page: firstPage, last_page: lastPage, max_chars: maxChars };
}
193
+
194
/**
 * Builds the on-disk PDF filename for a record: `<id>[-<title>].pdf`.
 *
 * Both parts go through `sanitizeFilename`. In the title, whitespace runs
 * become underscores and the result is capped at 96 characters; the title
 * part is omitted entirely when it ends up empty.
 *
 * @param record Record supplying `id` and, optionally, `title`.
 * @returns The filename, without any directory component.
 */
export function aclArtifactFilename(record: ScholarlyWorkRecord): string {
  const rawTitle = String(record.title ?? "");
  const safeTitle = sanitizeFilename(rawTitle).replace(/\s+/g, "_").slice(0, 96);
  const safeId = sanitizeFilename(record.id);
  const titleSuffix = safeTitle ? `-${safeTitle}` : "";
  return `${safeId}${titleSuffix}.pdf`;
}
200
+
201
/**
 * Validates the requested PDF page range.
 *
 * `undefined`, `null`, and the empty string all count as "not supplied",
 * matching how `max-chars` is treated. The first page defaults to 1. The last
 * page defaults to 20, or to the first page itself when the first page lies
 * beyond 20, so that supplying only `first-page` never fails on a last page
 * the caller did not give.
 *
 * @param firstPage Raw first page (number or numeric string).
 * @param lastPage Raw last page (number or numeric string).
 * @returns The validated 1-based, inclusive range.
 * @throws Error when first-page is not an integer >= 1, or last-page is not
 *   an integer >= first-page.
 */
function requireAclPageRange(
  firstPage: unknown,
  lastPage: unknown,
): { firstPage: number; lastPage: number } {
  const isUnset = (value: unknown): boolean =>
    value === undefined || value === null || value === "";
  const first = isUnset(firstPage) ? 1 : Number(firstPage);
  if (!Number.isInteger(first) || first < 1) {
    throw new Error("acl-anthology first-page must be an integer >= 1.");
  }
  // Default window ends at page 20 but never before the requested start.
  const last = isUnset(lastPage) ? Math.max(20, first) : Number(lastPage);
  if (!Number.isInteger(last) || last < first) {
    throw new Error(
      "acl-anthology last-page must be an integer >= first-page.",
    );
  }
  return { firstPage: first, lastPage: last };
}
217
+
218
/**
 * Validates the `max-chars` limit for extracted text.
 *
 * @param value Raw limit; `undefined`, `null`, and `""` select the fallback.
 * @param fallback Limit used when no value is supplied (40,000 by default).
 * @returns The limit as an integer.
 * @throws Error when a supplied value is not an integer in [1000, 1000000].
 */
function requireAclMaxChars(value: unknown, fallback = 40_000): number {
  const isUnset = value === undefined || value === null || value === "";
  if (isUnset) return fallback;
  const parsed = Number(value);
  const inRange =
    Number.isInteger(parsed) && parsed >= 1_000 && parsed <= 1_000_000;
  if (inRange) return parsed;
  throw new Error(
    `acl-anthology max-chars must be an integer in [1000, 1000000]. Got: ${String(value)}`,
  );
}
228
+
229
/**
 * Caps extracted text at `maxChars` UTF-16 code units.
 *
 * Text within the limit is returned unchanged. Longer text is cut, stripped
 * of trailing whitespace, and suffixed with a `[truncated at N characters]`
 * marker (the marker itself is not counted against the limit). The cut never
 * lands between the two halves of a surrogate pair: PDF text often contains
 * astral-plane math symbols, and a lone high surrogate would be emitted as a
 * replacement character downstream.
 *
 * @param text Extracted text.
 * @param maxChars Maximum number of code units to keep.
 * @returns The possibly shortened text, a truncation flag, and the length of
 *   the original text.
 */
function truncateAclText(
  text: string,
  maxChars: number,
): { text: string; truncated: boolean; originalChars: number } {
  if (text.length <= maxChars) {
    return { text, truncated: false, originalChars: text.length };
  }
  let cut = maxChars;
  const lastUnit = text.charCodeAt(cut - 1);
  // High surrogate (U+D800..U+DBFF) at the boundary: drop it with its pair.
  if (cut > 0 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    cut -= 1;
  }
  return {
    text: `${text.slice(0, cut).trimEnd()}\n\n[truncated at ${maxChars} characters]`,
    truncated: true,
    originalChars: text.length,
  };
}
81
242
 
82
243
  async function fetchHtml(url: string, label: string): Promise<string> {
@@ -93,6 +254,104 @@ async function fetchHtml(url: string, label: string): Promise<string> {
93
254
  return response.text();
94
255
  }
95
256
 
257
/**
 * Downloads and decompresses the ACL Anthology BibTeX export.
 *
 * The in-flight or completed download is memoized in `anthologyBibCache`, so
 * concurrent and repeated callers share one request. A failed download is
 * evicted from the cache before the error propagates; otherwise a single
 * transient network or HTTP error would make every later call in the same
 * process fail without retrying.
 *
 * @returns The export as UTF-8 text.
 * @throws Error when the server responds with a non-2xx status; fetch and
 *   gunzip errors propagate unchanged.
 */
async function fetchAnthologyBib(): Promise<string> {
  anthologyBibCache ??= (async () => {
    const response = await fetch(ANTHOLOGY_BIB_URL, {
      headers: {
        Accept: "application/x-gzip, application/gzip, */*",
        "User-Agent":
          "unicli-acl-anthology/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
      },
    });
    if (!response.ok) {
      throw new Error(`ACL Anthology BibTeX returned HTTP ${response.status}.`);
    }
    const compressed = Buffer.from(await response.arrayBuffer());
    return gunzipSync(compressed).toString("utf8");
  })().catch((error: unknown) => {
    // Forget the failed attempt so the next caller starts a fresh download.
    anthologyBibCache = undefined;
    throw error;
  });
  return anthologyBibCache;
}
274
+
275
/**
 * Fetches a paper's landing page and builds its record.
 *
 * The title is read from the `<h2 ... id=title ...>` heading and cleaned with
 * `cleanAclHtml`. The year is taken from the first four characters of the id
 * when they form a non-zero number, and is otherwise left undefined.
 *
 * @param id Normalized paper id.
 * @returns The record, with PDF and source URLs derived from the id.
 * @throws Error when the page has no title heading; errors from `fetchHtml`
 *   propagate unchanged.
 */
async function fetchAclPaperRecord(id: string): Promise<ScholarlyWorkRecord> {
  const html = await fetchHtml(`${ORIGIN}/${id}/`, `acl-anthology paper ${id}`);
  const heading = html.match(/<h2[^>]*id=title[^>]*>([\s\S]*?)<\/h2>/);
  const title = cleanAclHtml(heading?.[1] ?? "");
  if (!title) {
    throw new Error(`ACL Anthology paper ${id} did not expose a title.`);
  }
  const yearPrefix = Number(id.slice(0, 4));
  const record: ScholarlyWorkRecord = {
    id,
    title,
    year: yearPrefix || undefined,
    venue: "ACL Anthology",
    pdf_url: aclAnthologyPdfUrl(id),
    source_adapter: "acl-anthology",
    source_url: `${ORIGIN}/${id}/`,
    retrieved_at: new Date().toISOString(),
  };
  return record;
}
293
+
294
/**
 * Downloads a paper's PDF and extracts a page range as text with pdftotext.
 *
 * Page and length arguments are validated before any network or disk work, so
 * a bad argument fails fast instead of after a completed download. The PDF is
 * written under `kwargs.output` (default `./acl-anthology-downloads`).
 *
 * @param record Record whose `pdf_url` is downloaded.
 * @param kwargs Command arguments: `output`, plus the page and max-chars
 *   options understood by `requireAclReadPageArgs`.
 * @returns The record extended with `path`, `text`, `text_chars` (length
 *   before truncation), `text_truncated`, and `text_source: "pdf"`.
 * @throws Error when the record has no PDF URL, an argument is invalid, the
 *   download fails, pdftotext is not installed, or no text is extracted.
 *   Other pdftotext failures propagate unchanged.
 */
async function readAclPaperPdf(
  record: ScholarlyWorkRecord,
  kwargs: Record<string, unknown>,
): Promise<Record<string, unknown>> {
  if (!record.pdf_url) {
    throw new Error(`ACL Anthology paper ${record.id} did not expose a PDF.`);
  }

  // Validate first: none of these checks need the downloaded file.
  const pageArgs = requireAclReadPageArgs(kwargs);
  const { firstPage, lastPage } = requireAclPageRange(
    pageArgs.first_page,
    pageArgs.last_page,
  );
  const maxChars = requireAclMaxChars(pageArgs.max_chars);

  const outputDir = resolve(
    String(kwargs.output ?? "./acl-anthology-downloads"),
  );
  const path = join(outputDir, aclArtifactFilename(record));
  const download = await httpDownload(record.pdf_url, path, {
    Accept: "application/pdf,*/*",
    Referer: record.source_url ?? `${ORIGIN}/${record.id}/`,
    "User-Agent":
      "unicli-acl-anthology/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
  });
  if (download.status === "failed" || !download.path) {
    throw new Error(
      `ACL Anthology PDF download failed for ${record.id}: ${download.error ?? "no path"}.`,
    );
  }

  let stdout: string;
  try {
    ({ stdout } = await execFileAsync(
      "pdftotext",
      [
        "-layout",
        "-enc",
        "UTF-8",
        "-f",
        String(firstPage),
        "-l",
        String(lastPage),
        download.path,
        "-", // write the extracted text to stdout
      ],
      { timeout: 60_000, maxBuffer: 10 * 1024 * 1024 },
    ));
  } catch (error) {
    // A missing binary otherwise surfaces as an opaque "spawn pdftotext ENOENT".
    if ((error as NodeJS.ErrnoException).code === "ENOENT") {
      throw new Error(
        `pdftotext was not found on PATH; it is required to read ACL Anthology ${record.id}.`,
      );
    }
    throw error;
  }

  const extracted = stdout.trim();
  if (!extracted) {
    throw new Error(
      `pdftotext returned no text for ACL Anthology ${record.id} pages ${firstPage}-${lastPage}.`,
    );
  }
  const truncated = truncateAclText(extracted, maxChars);
  return {
    ...record,
    path: download.path,
    text: truncated.text,
    text_chars: truncated.originalChars,
    text_truncated: truncated.truncated,
    text_source: "pdf",
  };
}
354
+
96
355
  cli({
97
356
  site: "acl-anthology",
98
357
  name: "search",
@@ -109,12 +368,11 @@ cli({
109
368
  const query = String(kwargs.query ?? "").trim();
110
369
  if (!query) throw new Error("acl-anthology search query cannot be empty.");
111
370
  const limit = Math.min(Math.max(Number(kwargs.limit ?? 20), 1), 100);
112
- const rows = parseRows(
113
- await fetchHtml(
114
- `${ORIGIN}/search/?q=${encodeURIComponent(query)}`,
115
- "acl-anthology search",
116
- ),
117
- ).slice(0, limit);
371
+ const rows = searchAclBibRows(
372
+ parseAclBibEntries(await fetchAnthologyBib()),
373
+ query,
374
+ limit,
375
+ );
118
376
  if (rows.length === 0)
119
377
  throw new Error(`No ACL Anthology papers matched "${query}".`);
120
378
  return rows;
@@ -131,27 +389,77 @@ cli({
131
389
  columns: ["id", "title", "authors", "year", "venue", "pdf_url", "source_url"],
132
390
  capabilities: ["http.fetch", "scholar.get", "scholar.pdf"],
133
391
  func: async (_page, kwargs) => {
134
- const id = normalizeId(kwargs.id ?? kwargs.ref);
135
- const html = await fetchHtml(
136
- `${ORIGIN}/${id}/`,
137
- `acl-anthology paper ${id}`,
138
- );
139
- const title = decode(
140
- html.match(/<h2[^>]*id=title[^>]*>([\s\S]*?)<\/h2>/)?.[1] ?? "",
141
- );
142
- if (!title)
143
- throw new Error(`ACL Anthology paper ${id} did not expose a title.`);
144
- return [
145
- {
146
- id,
147
- title,
148
- year: Number(id.slice(0, 4)) || undefined,
149
- venue: "ACL Anthology",
150
- pdf_url: `${ORIGIN}/${id}.pdf`,
151
- source_adapter: "acl-anthology",
152
- source_url: `${ORIGIN}/${id}/`,
153
- retrieved_at: new Date().toISOString(),
154
- } satisfies ScholarlyWorkRecord,
155
- ];
392
+ const id = normalizeAclAnthologyId(kwargs.id ?? kwargs.ref);
393
+ return [await fetchAclPaperRecord(id)];
394
+ },
395
+ });
396
+
397
// Registers `acl-anthology read`: look up a paper by id, download its PDF,
// and return the record together with text extracted from the PDF.
cli({
  site: "acl-anthology",
  name: "read",
  description: "Download an ACL Anthology paper PDF by id and extract text",
  domain: "aclanthology.org",
  strategy: Strategy.PUBLIC,
  args: [
    // Paper identifier (positional, required).
    {
      name: "id",
      type: "str",
      required: true,
      positional: true,
      description: "ACL Anthology paper id (e.g. 2020.acl-main.447)",
      "x-unicli-kind": "id",
      "x-unicli-accepts": ["url"],
    },
    // Directory that receives the downloaded PDF.
    {
      name: "output",
      type: "str",
      default: "./acl-anthology-downloads",
      description: "Output directory",
      "x-unicli-kind": "path",
    },
    // Page window handed to the text extraction step.
    {
      name: "first-page",
      type: "int",
      default: 1,
      description: "First PDF page to extract",
    },
    {
      name: "last-page",
      type: "int",
      default: 20,
      description: "Last PDF page to extract",
    },
    // Cap on the amount of extracted text returned.
    {
      name: "max-chars",
      type: "int",
      default: 40000,
      description: "Maximum extracted text characters",
    },
  ],
  columns: [
    "id",
    "title",
    "source_adapter",
    "source_url",
    "pdf_url",
    "path",
    "text_source",
    "text",
    "text_chars",
    "text_truncated",
  ],
  capabilities: [
    "http.fetch",
    "http.download",
    "subprocess.exec",
    "scholar.fulltext",
    "scholar.pdf",
  ],
  executables: ["pdftotext"],
  minimum_capability: "subprocess.exec",
  func: async (_page, kwargs) => {
    // The identifier is taken from `id`, falling back to `ref`.
    const paperId = normalizeAclAnthologyId(kwargs.id ?? kwargs.ref);
    const paper = await fetchAclPaperRecord(paperId);
    const paperWithText = await readAclPaperPdf(paper, kwargs);
    return [paperWithText];
  },
});
@@ -1,9 +1,14 @@
1
1
  import { describe, expect, it } from "vitest";
2
+ import { resolveCommand } from "../../registry.js";
2
3
  import {
4
+ arxivArtifactFilename,
3
5
  decodeArxivEntities,
6
+ normalizeArxivId,
4
7
  parseArxivEntries,
5
8
  requireArxivAuthor,
6
9
  requireArxivCategory,
10
+ requireArxivMaxChars,
11
+ requireArxivPageRange,
7
12
  requireArxivLimit,
8
13
  } from "./papers.js";
9
14
 
@@ -19,6 +24,47 @@ describe("arxiv agent-facing author and recent commands", () => {
19
24
  expect(() => requireArxivLimit("51", 20)).toThrow("arxiv limit");
20
25
  });
21
26
 
27
it("validates read IDs, page ranges, max text bounds, and filenames", () => {
  // Accepted identifier spellings paired with the id each one normalizes to.
  const acceptedIds = [
    ["arxiv:1706.03762v7", "1706.03762v7"],
    ["https://arxiv.org/abs/1706.03762", "1706.03762"],
    ["https://arxiv.org/pdf/hep-th/9901001.pdf", "hep-th/9901001"],
  ] as const;
  for (const [rawId, normalizedId] of acceptedIds) {
    expect(normalizeArxivId(rawId)).toBe(normalizedId);
  }
  // A path-traversal-looking value must be rejected.
  expect(() => normalizeArxivId("../1706.03762")).toThrow("Invalid arXiv");

  // Page range: string inputs become numbers; bad bounds name the argument.
  const parsedRange = requireArxivPageRange("2", "4");
  expect(parsedRange).toEqual({ firstPage: 2, lastPage: 4 });
  expect(() => requireArxivPageRange("0", "4")).toThrow("first-page");
  expect(() => requireArxivPageRange("4", "3")).toThrow("last-page");

  // Character cap: default when omitted, rejection for "999".
  expect(requireArxivMaxChars(undefined)).toBe(40000);
  expect(() => requireArxivMaxChars("999")).toThrow("max-chars");

  // Filename: punctuation in the title collapses to single dashes.
  const filename = arxivArtifactFilename({
    id: "1706.03762v7",
    title: "Attention / Is: All? You Need",
  });
  expect(filename).toBe("1706.03762v7-Attention-Is-All-You-Need.pdf");
});
51
+
52
it("registers arxiv read as a source-level fulltext command", () => {
  // Resolve the registration once and inspect its declared metadata.
  const readCommand = resolveCommand("arxiv", "read")?.command;

  const expectedCapabilities = [
    "http.fetch",
    "http.download",
    "subprocess.exec",
    "scholar.fulltext",
    "scholar.pdf",
  ];
  expect(readCommand?.capabilities).toEqual(expectedCapabilities);
  expect(readCommand?.minimum_capability).toBe("subprocess.exec");
  expect(readCommand?.executables).toEqual(["pdftotext"]);
});
67
+
22
68
  it("decodes entities and parses Atom entries", () => {
23
69
  expect(decodeArxivEntities("A &amp; B &lt; C")).toBe("A & B < C");
24
70
  expect(