@zenalexa/unicli 0.225.1 → 0.225.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +3 -3
- package/README.md +3 -3
- package/README.zh-CN.md +3 -3
- package/dist/adapters/acl-anthology/papers.d.ts +16 -9
- package/dist/adapters/acl-anthology/papers.d.ts.map +1 -1
- package/dist/adapters/acl-anthology/papers.js +322 -58
- package/dist/adapters/acl-anthology/papers.js.map +1 -1
- package/dist/adapters/arxiv/papers.d.ts +22 -4
- package/dist/adapters/arxiv/papers.d.ts.map +1 -1
- package/dist/adapters/arxiv/papers.js +202 -4
- package/dist/adapters/arxiv/papers.js.map +1 -1
- package/dist/adapters/baidu-scholar/search.d.ts +15 -1
- package/dist/adapters/baidu-scholar/search.d.ts.map +1 -1
- package/dist/adapters/baidu-scholar/search.js +72 -8
- package/dist/adapters/baidu-scholar/search.js.map +1 -1
- package/dist/adapters/biorxiv/preprints.d.ts +9 -0
- package/dist/adapters/biorxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/biorxiv/preprints.js +78 -0
- package/dist/adapters/biorxiv/preprints.js.map +1 -0
- package/dist/adapters/cnki/search.d.ts +82 -0
- package/dist/adapters/cnki/search.d.ts.map +1 -0
- package/dist/adapters/cnki/search.js +236 -0
- package/dist/adapters/cnki/search.js.map +1 -0
- package/dist/adapters/cvf/papers.d.ts +12 -7
- package/dist/adapters/cvf/papers.d.ts.map +1 -1
- package/dist/adapters/cvf/papers.js +210 -27
- package/dist/adapters/cvf/papers.js.map +1 -1
- package/dist/adapters/dblp/publications.d.ts +12 -5
- package/dist/adapters/dblp/publications.d.ts.map +1 -1
- package/dist/adapters/dblp/publications.js +31 -8
- package/dist/adapters/dblp/publications.js.map +1 -1
- package/dist/adapters/google-scholar/search.d.ts +22 -1
- package/dist/adapters/google-scholar/search.d.ts.map +1 -1
- package/dist/adapters/google-scholar/search.js +129 -14
- package/dist/adapters/google-scholar/search.js.map +1 -1
- package/dist/adapters/hf/paper.d.ts +12 -3
- package/dist/adapters/hf/paper.d.ts.map +1 -1
- package/dist/adapters/hf/paper.js +65 -5
- package/dist/adapters/hf/paper.js.map +1 -1
- package/dist/adapters/medrxiv/preprints.d.ts +9 -0
- package/dist/adapters/medrxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/medrxiv/preprints.js +78 -0
- package/dist/adapters/medrxiv/preprints.js.map +1 -0
- package/dist/adapters/neurips/proceedings.d.ts +8 -7
- package/dist/adapters/neurips/proceedings.d.ts.map +1 -1
- package/dist/adapters/neurips/proceedings.js +209 -21
- package/dist/adapters/neurips/proceedings.js.map +1 -1
- package/dist/adapters/openalex/works.d.ts +21 -5
- package/dist/adapters/openalex/works.d.ts.map +1 -1
- package/dist/adapters/openalex/works.js +108 -8
- package/dist/adapters/openalex/works.js.map +1 -1
- package/dist/adapters/openreview/papers.d.ts +10 -4
- package/dist/adapters/openreview/papers.d.ts.map +1 -1
- package/dist/adapters/openreview/papers.js +351 -24
- package/dist/adapters/openreview/papers.js.map +1 -1
- package/dist/adapters/pmlr/proceedings.d.ts +6 -6
- package/dist/adapters/pmlr/proceedings.d.ts.map +1 -1
- package/dist/adapters/pmlr/proceedings.js +92 -12
- package/dist/adapters/pmlr/proceedings.js.map +1 -1
- package/dist/adapters/pubmed/articles.d.ts +8 -4
- package/dist/adapters/pubmed/articles.d.ts.map +1 -1
- package/dist/adapters/pubmed/articles.js +272 -39
- package/dist/adapters/pubmed/articles.js.map +1 -1
- package/dist/adapters/rxiv/preprints.d.ts +75 -0
- package/dist/adapters/rxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/rxiv/preprints.js +651 -0
- package/dist/adapters/rxiv/preprints.js.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf-read.d.ts +49 -0
- package/dist/adapters/scholar-artifacts/pdf-read.d.ts.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf-read.js +204 -0
- package/dist/adapters/scholar-artifacts/pdf-read.js.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf.d.ts +16 -0
- package/dist/adapters/scholar-artifacts/pdf.d.ts.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf.js +122 -0
- package/dist/adapters/scholar-artifacts/pdf.js.map +1 -0
- package/dist/adapters/semantic-scholar/papers.d.ts +6 -6
- package/dist/adapters/semantic-scholar/papers.d.ts.map +1 -1
- package/dist/adapters/semantic-scholar/papers.js +80 -6
- package/dist/adapters/semantic-scholar/papers.js.map +1 -1
- package/dist/adapters/unpaywall/works.d.ts +7 -7
- package/dist/adapters/unpaywall/works.d.ts.map +1 -1
- package/dist/adapters/unpaywall/works.js +104 -12
- package/dist/adapters/unpaywall/works.js.map +1 -1
- package/dist/adapters/wanfang/search.d.ts +14 -0
- package/dist/adapters/wanfang/search.d.ts.map +1 -1
- package/dist/adapters/wanfang/search.js +56 -7
- package/dist/adapters/wanfang/search.js.map +1 -1
- package/dist/browser/page.d.ts +2 -0
- package/dist/browser/page.d.ts.map +1 -1
- package/dist/browser/page.js +12 -0
- package/dist/browser/page.js.map +1 -1
- package/dist/browser/protocol.d.ts +6 -1
- package/dist/browser/protocol.d.ts.map +1 -1
- package/dist/browser/protocol.js.map +1 -1
- package/dist/commands/browser/actions.d.ts.map +1 -1
- package/dist/commands/browser/actions.js +487 -8
- package/dist/commands/browser/actions.js.map +1 -1
- package/dist/commands/compute.js +12 -1
- package/dist/commands/compute.js.map +1 -1
- package/dist/commands/schema.d.ts.map +1 -1
- package/dist/commands/schema.js +22 -0
- package/dist/commands/schema.js.map +1 -1
- package/dist/commands/scholar.d.ts +77 -5
- package/dist/commands/scholar.d.ts.map +1 -1
- package/dist/commands/scholar.js +2945 -83
- package/dist/commands/scholar.js.map +1 -1
- package/dist/commands/search.d.ts.map +1 -1
- package/dist/commands/search.js +14 -3
- package/dist/commands/search.js.map +1 -1
- package/dist/compute/contracts.d.ts +55 -0
- package/dist/compute/contracts.d.ts.map +1 -0
- package/dist/compute/contracts.js +487 -0
- package/dist/compute/contracts.js.map +1 -0
- package/dist/core/command-contract.d.ts.map +1 -1
- package/dist/core/command-contract.js +5 -0
- package/dist/core/command-contract.js.map +1 -1
- package/dist/core/schema-v2.d.ts +1 -0
- package/dist/core/schema-v2.d.ts.map +1 -1
- package/dist/core/schema-v2.js +1 -0
- package/dist/core/schema-v2.js.map +1 -1
- package/dist/discovery/aliases.d.ts +8 -1
- package/dist/discovery/aliases.d.ts.map +1 -1
- package/dist/discovery/aliases.js +333 -20
- package/dist/discovery/aliases.js.map +1 -1
- package/dist/discovery/core-catalog.d.ts +2 -0
- package/dist/discovery/core-catalog.d.ts.map +1 -1
- package/dist/discovery/core-catalog.js +525 -66
- package/dist/discovery/core-catalog.js.map +1 -1
- package/dist/discovery/intents.d.ts +1 -0
- package/dist/discovery/intents.d.ts.map +1 -1
- package/dist/discovery/intents.js +299 -2
- package/dist/discovery/intents.js.map +1 -1
- package/dist/discovery/loader.d.ts.map +1 -1
- package/dist/discovery/loader.js +3 -0
- package/dist/discovery/loader.js.map +1 -1
- package/dist/discovery/macos-dynamic.d.ts +1 -0
- package/dist/discovery/macos-dynamic.d.ts.map +1 -1
- package/dist/discovery/macos-dynamic.js +20 -1
- package/dist/discovery/macos-dynamic.js.map +1 -1
- package/dist/discovery/search.d.ts.map +1 -1
- package/dist/discovery/search.js +12 -5
- package/dist/discovery/search.js.map +1 -1
- package/dist/engine/browser/evidence.d.ts +34 -1
- package/dist/engine/browser/evidence.d.ts.map +1 -1
- package/dist/engine/browser/evidence.js +141 -6
- package/dist/engine/browser/evidence.js.map +1 -1
- package/dist/engine/capability-policy.d.ts.map +1 -1
- package/dist/engine/capability-policy.js +30 -4
- package/dist/engine/capability-policy.js.map +1 -1
- package/dist/engine/kernel/stages.d.ts.map +1 -1
- package/dist/engine/kernel/stages.js +3 -0
- package/dist/engine/kernel/stages.js.map +1 -1
- package/dist/engine/operation-policy.d.ts +4 -1
- package/dist/engine/operation-policy.d.ts.map +1 -1
- package/dist/engine/operation-policy.js +23 -0
- package/dist/engine/operation-policy.js.map +1 -1
- package/dist/engine/steps/fetch-text.d.ts.map +1 -1
- package/dist/engine/steps/fetch-text.js +2 -2
- package/dist/engine/steps/fetch-text.js.map +1 -1
- package/dist/engine/steps/fetch.d.ts +1 -0
- package/dist/engine/steps/fetch.d.ts.map +1 -1
- package/dist/engine/steps/fetch.js +24 -4
- package/dist/engine/steps/fetch.js.map +1 -1
- package/dist/fast-path/handlers/discovery.d.ts +5 -5
- package/dist/fast-path/handlers/discovery.d.ts.map +1 -1
- package/dist/fast-path/handlers/discovery.js +61 -8
- package/dist/fast-path/handlers/discovery.js.map +1 -1
- package/dist/fast-path/manifest.d.ts +3 -0
- package/dist/fast-path/manifest.d.ts.map +1 -1
- package/dist/fast-path/manifest.js.map +1 -1
- package/dist/fast-path/policy.d.ts.map +1 -1
- package/dist/fast-path/policy.js +3 -0
- package/dist/fast-path/policy.js.map +1 -1
- package/dist/fast-path/render.d.ts +2 -0
- package/dist/fast-path/render.d.ts.map +1 -1
- package/dist/fast-path/render.js +9 -0
- package/dist/fast-path/render.js.map +1 -1
- package/dist/manifest-compact.txt +2 -2
- package/dist/manifest.json +6977 -1002
- package/dist/mcp/handler.d.ts +2 -16
- package/dist/mcp/handler.d.ts.map +1 -1
- package/dist/mcp/handler.js.map +1 -1
- package/dist/mcp/http-transport.d.ts +7 -1
- package/dist/mcp/http-transport.d.ts.map +1 -1
- package/dist/mcp/http-transport.js +20 -1
- package/dist/mcp/http-transport.js.map +1 -1
- package/dist/mcp/jsonrpc.d.ts +27 -0
- package/dist/mcp/jsonrpc.d.ts.map +1 -0
- package/dist/mcp/jsonrpc.js +12 -0
- package/dist/mcp/jsonrpc.js.map +1 -0
- package/dist/mcp/origin-guard.d.ts +26 -0
- package/dist/mcp/origin-guard.d.ts.map +1 -0
- package/dist/mcp/origin-guard.js +42 -0
- package/dist/mcp/origin-guard.js.map +1 -0
- package/dist/mcp/profiles/computer-use.d.ts.map +1 -1
- package/dist/mcp/profiles/computer-use.js +30 -270
- package/dist/mcp/profiles/computer-use.js.map +1 -1
- package/dist/mcp/streamable-http/session.d.ts +4 -22
- package/dist/mcp/streamable-http/session.d.ts.map +1 -1
- package/dist/mcp/streamable-http/session.js +4 -24
- package/dist/mcp/streamable-http/session.js.map +1 -1
- package/dist/mcp/tools.d.ts.map +1 -1
- package/dist/mcp/tools.js +74 -54
- package/dist/mcp/tools.js.map +1 -1
- package/dist/output/envelope.d.ts +2 -0
- package/dist/output/envelope.d.ts.map +1 -1
- package/dist/output/envelope.js.map +1 -1
- package/dist/output/error-map.d.ts +14 -0
- package/dist/output/error-map.d.ts.map +1 -1
- package/dist/output/error-map.js +20 -0
- package/dist/output/error-map.js.map +1 -1
- package/dist/registry.d.ts +2 -0
- package/dist/registry.d.ts.map +1 -1
- package/dist/registry.js +1 -0
- package/dist/registry.js.map +1 -1
- package/dist/transport/cascade.d.ts.map +1 -1
- package/dist/transport/cascade.js +77 -5
- package/dist/transport/cascade.js.map +1 -1
- package/dist/transport/refs.d.ts +33 -1
- package/dist/transport/refs.d.ts.map +1 -1
- package/dist/transport/refs.js +40 -1
- package/dist/transport/refs.js.map +1 -1
- package/dist/types/scholarly.d.ts +19 -4
- package/dist/types/scholarly.d.ts.map +1 -1
- package/dist/types/scholarly.js +4 -4
- package/dist/types.d.ts +8 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
- package/server.json +2 -2
- package/skills/unicli/SKILL.md +1 -1
- package/skills/unicli-claude-code/SKILL.md +1 -1
- package/skills/unicli-hermes/SKILL.md +1 -1
- package/src/adapters/acl-anthology/papers.test.ts +111 -0
- package/src/adapters/acl-anthology/papers.ts +379 -71
- package/src/adapters/arxiv/papers.test.ts +46 -0
- package/src/adapters/arxiv/papers.ts +251 -4
- package/src/adapters/baidu-scholar/search.ts +74 -11
- package/src/adapters/biorxiv/preprints.ts +112 -0
- package/src/adapters/cnki/search.ts +357 -0
- package/src/adapters/cvf/papers.ts +260 -27
- package/src/adapters/dblp/publications.test.ts +9 -0
- package/src/adapters/dblp/publications.ts +31 -8
- package/src/adapters/defuddle/read.yaml +30 -0
- package/src/adapters/google-scholar/search.ts +165 -17
- package/src/adapters/hf/paper.test.ts +23 -0
- package/src/adapters/hf/paper.ts +89 -5
- package/src/adapters/hf/top.yaml +34 -2
- package/src/adapters/huggingface-papers/daily.yaml +37 -3
- package/src/adapters/huggingface-papers/search.yaml +43 -9
- package/src/adapters/jina/read.yaml +30 -0
- package/src/adapters/markdown-new/read.yaml +50 -0
- package/src/adapters/medrxiv/preprints.ts +112 -0
- package/src/adapters/neurips/proceedings.ts +266 -22
- package/src/adapters/ollama-cloud/fetch.yaml +39 -0
- package/src/adapters/ollama-cloud/search.yaml +43 -0
- package/src/adapters/openalex/works.test.ts +15 -4
- package/src/adapters/openalex/works.ts +136 -8
- package/src/adapters/openreview/papers.test.ts +31 -0
- package/src/adapters/openreview/papers.ts +407 -29
- package/src/adapters/pmlr/proceedings.ts +102 -12
- package/src/adapters/pubmed/articles.test.ts +88 -1
- package/src/adapters/pubmed/articles.ts +343 -44
- package/src/adapters/rxiv/preprints.test.ts +233 -0
- package/src/adapters/rxiv/preprints.ts +849 -0
- package/src/adapters/scholar-artifacts/pdf-read.ts +277 -0
- package/src/adapters/scholar-artifacts/pdf.ts +133 -0
- package/src/adapters/semantic-scholar/papers.ts +98 -6
- package/src/adapters/unpaywall/works.ts +141 -12
- package/src/adapters/wanfang/search.ts +57 -7
- package/src/adapters/cnki/search.yaml +0 -49
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
|
|
3
|
+
import { resolveCommand } from "../../registry.js";
|
|
4
|
+
import {
|
|
5
|
+
aclAnthologyPdfUrl,
|
|
6
|
+
aclArtifactFilename,
|
|
7
|
+
cleanAclHtml,
|
|
8
|
+
normalizeAclAnthologyId,
|
|
9
|
+
parseAclBibEntries,
|
|
10
|
+
requireAclReadPageArgs,
|
|
11
|
+
searchAclBibRows,
|
|
12
|
+
} from "./papers.js";
|
|
13
|
+
|
|
14
|
+
// Unit tests for the helpers exported by ./papers.js and for the
// `acl-anthology read` command as resolved through the registry.
describe("acl-anthology scholarly commands", () => {
  // A bare id, a PDF URL, and a landing-page URL must all reduce to the same
  // id; input starting with "../" must throw.
  it("normalizes source ids and URLs without accepting traversal", () => {
    expect(normalizeAclAnthologyId("2020.acl-main.447")).toBe(
      "2020.acl-main.447",
    );
    expect(
      normalizeAclAnthologyId("https://aclanthology.org/2020.acl-main.447.pdf"),
    ).toBe("2020.acl-main.447");
    expect(
      normalizeAclAnthologyId("https://aclanthology.org/2020.acl-main.447/"),
    ).toBe("2020.acl-main.447");
    expect(() => normalizeAclAnthologyId("../2020.acl-main.447")).toThrow(
      "ACL Anthology id",
    );
  });

  // The PDF URL is "<origin>/<id>.pdf"; the artifact filename is the id, a
  // dash, and the title with ":" and spaces turned into underscores.
  it("builds official ACL PDF URLs and stable artifact filenames", () => {
    expect(aclAnthologyPdfUrl("2020.acl-main.447")).toBe(
      "https://aclanthology.org/2020.acl-main.447.pdf",
    );
    expect(
      aclArtifactFilename({
        id: "2020.acl-main.447",
        title: "S2ORC: The Semantic Scholar Open Research Corpus",
        source_adapter: "acl-anthology",
        retrieved_at: "2026-06-27T00:00:00Z",
      }),
    ).toBe(
      "2020.acl-main.447-S2ORC__The_Semantic_Scholar_Open_Research_Corpus.pdf",
    );
  });

  // Hyphenated CLI option names are re-keyed to snake_case; values pass
  // through unchanged (still strings here).
  it("maps scholar read hyphenated arguments to the PDF reader contract", () => {
    expect(
      requireAclReadPageArgs({
        "first-page": "2",
        "last-page": "4",
        "max-chars": "1000",
      }),
    ).toEqual({ first_page: "2", last_page: "4", max_chars: "1000" });
  });

  // Pins the governance metadata of the registered `read` command: its
  // capability list, required executable, and minimum capability.
  it("registers ACL read as source-level fulltext with pdftotext governance", () => {
    const command = resolveCommand("acl-anthology", "read")?.command;
    expect(command?.capabilities).toEqual([
      "http.fetch",
      "http.download",
      "subprocess.exec",
      "scholar.fulltext",
      "scholar.pdf",
    ]);
    expect(command?.executables).toEqual(["pdftotext"]);
    expect(command?.minimum_capability).toBe("subprocess.exec");
  });

  // Two BibTeX entries: the first has a multi-line author list, DOI, and
  // brace-protected capitals; the second carries only title/author/year/url.
  it("parses official ACL BibTeX export rows for source-backed search", () => {
    const rows = parseAclBibEntries(`
@inproceedings{lo-etal-2020-s2orc,
title = "{S}2{ORC}: The Semantic Scholar Open Research Corpus",
author = "Lo, Kyle and
Wang, Lucy Lu and
Neumann, Mark",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
year = "2020",
url = "https://aclanthology.org/2020.acl-main.447/",
doi = "10.18653/v1/2020.acl-main.447",
pages = "4969--4983"
}
@inproceedings{sharma-etal-2026-council,
title = "Council of {LLM}s",
author = "Sharma, Vivek",
year = "2026",
url = "https://aclanthology.org/2026.wassa-1.1/"
}
`);

    // The id comes from the url field, braces are stripped from the title,
    // and "Last, First" authors are flipped to "First Last".
    expect(rows[0]).toMatchObject({
      id: "2020.acl-main.447",
      title: "S2ORC: The Semantic Scholar Open Research Corpus",
      authors: ["Kyle Lo", "Lucy Lu Wang", "Mark Neumann"],
      year: 2020,
      doi: "10.18653/v1/2020.acl-main.447",
      pdf_url: "https://aclanthology.org/2020.acl-main.447.pdf",
      source_url: "https://aclanthology.org/2020.acl-main.447/",
    });
    // Every query term must occur in the row, so only the first entry matches.
    expect(searchAclBibRows(rows, "Semantic Scholar Corpus", 1)).toEqual([
      rows[0],
    ]);
  });

  // Nested tags are removed and only their text content is kept.
  it("cleans ACL Anthology title HTML", () => {
    expect(
      cleanAclHtml(
        '<a href="https://aclanthology.org/demo.pdf"><span class="acl-fixed-case">S</span>2ORC</a>',
      ),
    ).toBe("S2ORC");
  });
});
|
|
@@ -1,22 +1,31 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @owner src::adapters::acl-anthology::papers
|
|
3
|
-
* @does Registers ACL Anthology paper search and
|
|
4
|
-
* @needs aclanthology.org static search/
|
|
5
|
-
* @feeds src/commands/scholar.ts via scholar.search, scholar.get, scholar.pdf, and scholar.venue
|
|
6
|
-
* @breaks ACL Anthology markup drift
|
|
7
|
-
* @invariants Paper URLs/PDF URLs are absolutized against aclanthology.org;
|
|
8
|
-
* @side-effects HTTPS egress to aclanthology.org
|
|
9
|
-
* @perf O(N) over one HTML response
|
|
3
|
+
* @does Registers ACL Anthology paper search, metadata lookup, PDF download, and PDF text-read commands from official Anthology pages.
|
|
4
|
+
* @needs aclanthology.org static search/paper HTML, scholar-artifacts PDF reader, src/registry.ts
|
|
5
|
+
* @feeds src/commands/scholar.ts via scholar.search, scholar.get, scholar.pdf, scholar.fulltext, and scholar.venue
|
|
6
|
+
* @breaks ACL Anthology markup drift, denied PDF downloads, missing pdftotext, or empty PDF text surfaces as source read failure.
|
|
7
|
+
* @invariants Paper URLs/PDF URLs are absolutized against aclanthology.org; read output labels `text_source=pdf`.
|
|
8
|
+
* @side-effects HTTPS egress to aclanthology.org; read writes PDFs under the requested output directory and executes pdftotext.
|
|
9
|
+
* @perf O(N) over one HTML response for search; O(PDF bytes + extracted pages) for read.
|
|
10
10
|
* @concurrency safe
|
|
11
|
-
* @test
|
|
11
|
+
* @test src/adapters/acl-anthology/papers.test.ts, tests/unit/adapters/scholar-sources.test.ts
|
|
12
12
|
* @stability experimental
|
|
13
13
|
* @since 2026-05-19
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
|
+
import { execFile } from "node:child_process";
|
|
17
|
+
import { join, resolve } from "node:path";
|
|
18
|
+
import { promisify } from "node:util";
|
|
19
|
+
import { gunzipSync } from "node:zlib";
|
|
20
|
+
|
|
16
21
|
import { cli, Strategy } from "../../registry.js";
|
|
22
|
+
import { httpDownload, sanitizeFilename } from "../../engine/download.js";
|
|
17
23
|
import type { ScholarlyWorkRecord } from "../../types/scholarly.js";
|
|
18
24
|
|
|
19
25
|
const ORIGIN = "https://aclanthology.org";
|
|
26
|
+
const ANTHOLOGY_BIB_URL = `${ORIGIN}/anthology.bib.gz`;
|
|
27
|
+
const execFileAsync = promisify(execFile);
|
|
28
|
+
let anthologyBibCache: Promise<string> | undefined;
|
|
20
29
|
|
|
21
30
|
function decode(value: string): string {
|
|
22
31
|
return value
|
|
@@ -29,54 +38,206 @@ function decode(value: string): string {
|
|
|
29
38
|
.trim();
|
|
30
39
|
}
|
|
31
40
|
|
|
32
|
-
function
|
|
33
|
-
return
|
|
34
|
-
|
|
35
|
-
|
|
41
|
+
/**
 * Removes every `<...>` tag from an HTML fragment and hands the remaining
 * text to `decode`.
 *
 * @param value - HTML fragment, e.g. the inner markup of a title heading.
 * @returns whatever `decode` returns for the tag-free text.
 */
export function cleanAclHtml(value: string): string {
  const withoutTags = value.replace(/<[^>]+>/g, "");
  return decode(withoutTags);
}
|
|
44
|
+
|
|
45
|
+
/**
 * Reduces text to lowercase ASCII alphanumeric tokens joined by single
 * spaces, so queries and record fields compare on words only.
 *
 * @param value - arbitrary text (title, author list, query, ...).
 * @returns the space-joined tokens, or "" when no alphanumeric remains.
 */
function normalizeSearchText(value: string): string {
  // Any run of characters outside [a-z0-9] (whitespace included) is a token
  // boundary; empty tokens at either end are discarded.
  const tokens = value
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter(Boolean);
  return tokens.join(" ");
}
|
|
37
52
|
|
|
38
|
-
function
|
|
39
|
-
const raw = String(value ?? "")
|
|
53
|
+
/**
 * Normalizes a bare ACL Anthology id, landing-page URL, or PDF URL to the
 * bare id.
 *
 * @param value - id or aclanthology.org URL; non-strings are coerced with
 *   `String()`, and `null`/`undefined` are treated as "".
 * @returns the id with the origin prefix, a `.pdf` suffix, one trailing slash,
 *   and one trailing dot removed.
 * @throws Error when the remainder contains characters outside
 *   `[A-Za-z0-9.-]` or contains no alphanumeric character at all.
 */
export function normalizeAclAnthologyId(value: unknown): string {
  const input = String(value ?? "");
  const raw = input
    .trim()
    .replace(/^https?:\/\/(?:www\.)?aclanthology\.org\//i, "")
    .replace(/\.pdf$/i, "")
    .replace(/\/$/i, "");
  const id = /^[A-Za-z0-9.-]+$/.test(raw) ? raw.replace(/\.$/, "") : "";
  // Punctuation-only input such as "." or ".." passes the character check but
  // would produce an empty or dot-only id, so require an alphanumeric.
  if (!/[A-Za-z0-9]/.test(id)) {
    throw new Error(`ACL Anthology id "${input}" is not valid.`);
  }
  return id;
}
|
|
45
64
|
|
|
46
|
-
function
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
65
|
+
/**
 * Builds the PDF URL for an Anthology id: `<ORIGIN>/<id>.pdf`.
 *
 * @param id - bare Anthology id; it is interpolated as-is, without validation.
 */
export function aclAnthologyPdfUrl(id: string): string {
  const pdfPath = `/${id}.pdf`;
  return `${ORIGIN}${pdfPath}`;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Converts a raw BibTeX field value into plain text.
 *
 * @param value - the right-hand side of a `name = value` BibTeX field.
 * @returns the cleaned text after a final pass through `decode`.
 */
function cleanBibValue(value: string): string {
  // Drop the field's trailing comma, then one outer quote/brace at each end.
  const unwrapped = value
    .trim()
    .replace(/,$/, "")
    .replace(/^["{]|["}]$/g, "");
  // Unescape \" and \&, then replace \url{...} and any other
  // single-argument \command{...} with its argument.
  const unescaped = unwrapped
    .replace(/\\"/g, '"')
    .replace(/\\&/g, "&")
    .replace(/\\url\{([^}]+)\}/g, "$1")
    .replace(/\\[a-zA-Z]+\{([^}]*)\}/g, "$1");
  // Remove the remaining case-protection braces and collapse whitespace
  // (multi-line values become one line).
  const flattened = unescaped.replace(/[{}]/g, "").replace(/\s+/g, " ");
  return decode(flattened);
}
|
|
83
|
+
|
|
84
|
+
/**
 * Splits a BibTeX `author` field into display names.
 *
 * Names are separated on " and "; a name with exactly one comma
 * ("Last, First") is flipped to "First Last", anything else is kept as is.
 *
 * @param value - raw BibTeX author field.
 * @returns at most the first 20 non-empty names, or `undefined` when none.
 */
function normalizeBibAuthors(value: string): string[] | undefined {
  const names: string[] = [];
  for (const entry of cleanBibValue(value).split(/\s+and\s+/)) {
    const pieces = entry.split(/\s*,\s*/);
    const flipped = pieces.length === 2 ? `${pieces[1]} ${pieces[0]}` : entry;
    const name = flipped.trim();
    if (name) names.push(name);
  }
  if (names.length === 0) return undefined;
  return names.slice(0, 20);
}
|
|
95
|
+
|
|
96
|
+
/**
 * Extracts the `name = value` fields of a single BibTeX entry.
 *
 * A value runs until the next `,\n<name> =` or until the entry's closing
 * `}` on its own line. Values are returned raw (quotes/braces included);
 * field names are lowercased, and a repeated name keeps its last value.
 *
 * @param entry - text of one `@type{key, ...}` entry.
 * @returns map from lowercased field name to raw value.
 */
function parseBibFields(entry: string): Record<string, string> {
  const fieldPattern =
    /^\s*([A-Za-z][A-Za-z0-9_-]*)\s*=\s*([\s\S]*?)(?=,\n\s*[A-Za-z][A-Za-z0-9_-]*\s*=|\n}\s*$)/gm;
  const fields: Record<string, string> = {};
  for (const [, name, rawValue] of entry.matchAll(fieldPattern)) {
    fields[name.toLowerCase()] = rawValue;
  }
  return fields;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Parses a BibTeX export into scholarly work records.
 *
 * Entries are split wherever `@type{` starts a line. An entry is skipped when
 * it has no `@type{key,` header, no title, or no usable Anthology id. The id
 * is taken from the `url` field when present, otherwise from the citation key.
 *
 * @param bib - BibTeX text containing zero or more entries.
 * @returns one record per usable entry, in input order; `retrieved_at` is the
 *   time of parsing.
 */
export function parseAclBibEntries(bib: string): ScholarlyWorkRecord[] {
  const rows: ScholarlyWorkRecord[] = [];
  for (const entry of bib.split(/\n(?=@[A-Za-z]+\{)/)) {
    const header = entry.match(/^@([A-Za-z]+)\{([^,]+),/);
    if (!header) continue;
    const fields = parseBibFields(entry);
    const title = fields.title ? cleanBibValue(fields.title) : "";
    if (!title) continue;
    const sourceUrl = fields.url ? cleanBibValue(fields.url) : "";
    let id: string;
    try {
      id = normalizeAclAnthologyId(sourceUrl || header[2]);
    } catch {
      // A single entry with a non-Anthology URL or an unusual key must not
      // abort parsing of the whole export; drop just that entry.
      continue;
    }
    if (!id) continue;
    const year = fields.year ? Number(cleanBibValue(fields.year)) : undefined;
    const doi = fields.doi ? cleanBibValue(fields.doi) : undefined;
    rows.push({
      id,
      title,
      authors: fields.author ? normalizeBibAuthors(fields.author) : undefined,
      // Non-integer years (NaN from free text, ranges, ...) are dropped.
      year: Number.isInteger(year) ? year : undefined,
      venue: fields.booktitle
        ? cleanBibValue(fields.booktitle)
        : "ACL Anthology",
      doi,
      pdf_url: aclAnthologyPdfUrl(id),
      source_adapter: "acl-anthology",
      source_url: `${ORIGIN}/${id}/`,
      retrieved_at: new Date().toISOString(),
    });
  }
  return rows;
}
|
|
138
|
+
|
|
139
|
+
/**
 * Scores how well a parsed Anthology row matches a free-text query.
 *
 * Returns 0 when the query normalizes to nothing or when any query term is
 * absent from the row's combined id/title/venue/year/doi/authors text.
 * Otherwise the score starts at 10 and adds 100 for an exact id match, 80 for
 * an exact title match, 40 when the title contains the whole query, 20 when
 * the author list contains it, and `year / 10000` (capped at 1) as a small
 * bonus that grows with the year.
 *
 * @param row - candidate record.
 * @param query - user query; compared after `normalizeSearchText`.
 * @returns 0 for "no match", otherwise a positive relevance score.
 */
function scoreAclBibRow(row: ScholarlyWorkRecord, query: string): number {
  const normalizedQuery = normalizeSearchText(query);
  const terms = normalizedQuery.split(" ").filter(Boolean);
  const title = normalizeSearchText(row.title);
  const authors = normalizeSearchText((row.authors ?? []).join(" "));
  const haystack = normalizeSearchText(
    [
      row.id,
      row.title,
      row.venue,
      row.year,
      row.doi,
      (row.authors ?? []).join(" "),
    ]
      .filter(Boolean)
      .join(" "),
  );
  if (!normalizedQuery || !terms.every((term) => haystack.includes(term))) {
    return 0;
  }
  let score = 10;
  // Compare ids in normalized form: the query has already had "." and "-"
  // replaced by spaces, so it could never equal a raw id such as
  // "2020.acl-main.447".
  if (normalizeSearchText(row.id) === normalizedQuery) score += 100;
  if (title === normalizedQuery) score += 80;
  if (title.includes(normalizedQuery)) score += 40;
  if (authors.includes(normalizedQuery)) score += 20;
  return score + Math.min(Number(row.year ?? 0) / 10_000, 1);
}
|
|
166
|
+
|
|
167
|
+
/**
 * Ranks rows against a query and returns the best matches.
 *
 * Rows scoring 0 in `scoreAclBibRow` are excluded. The rest are ordered by
 * descending score, with ties broken by original position.
 *
 * @param rows - candidate records; not modified.
 * @param query - free-text query.
 * @param limit - maximum number of rows to return.
 * @returns up to `limit` of the original row objects, best first.
 */
export function searchAclBibRows(
  rows: readonly ScholarlyWorkRecord[],
  query: string,
  limit: number,
): ScholarlyWorkRecord[] {
  const ranked: Array<{
    row: ScholarlyWorkRecord;
    index: number;
    score: number;
  }> = [];
  rows.forEach((row, index) => {
    const score = scoreAclBibRow(row, query);
    if (score > 0) ranked.push({ row, index, score });
  });
  ranked.sort(
    (left, right) => right.score - left.score || left.index - right.index,
  );
  return ranked.slice(0, limit).map(({ row }) => row);
}
|
|
183
|
+
|
|
184
|
+
/**
 * Re-keys the page-related `read` options to snake_case.
 *
 * Each option is looked up under its hyphenated name first and its camelCase
 * name second; values are passed through untouched (no parsing or
 * validation), and a missing option yields `undefined`.
 *
 * @param kwargs - raw command arguments.
 * @returns `{ first_page, last_page, max_chars }`.
 */
export function requireAclReadPageArgs(
  kwargs: Record<string, unknown>,
): Record<string, unknown> {
  const pick = (hyphenated: string, camel: string): unknown =>
    kwargs[hyphenated] ?? kwargs[camel];
  const first_page = pick("first-page", "firstPage");
  const last_page = pick("last-page", "lastPage");
  const max_chars = pick("max-chars", "maxChars");
  return { first_page, last_page, max_chars };
}
|
|
193
|
+
|
|
194
|
+
/**
 * Builds the on-disk PDF filename for a record: `<id>-<title>.pdf`, or
 * `<id>.pdf` when the sanitized title is empty.
 *
 * The title goes through `sanitizeFilename`, has whitespace runs replaced by
 * "_", and is cut to 96 characters; the id goes through `sanitizeFilename`.
 *
 * @param record - record supplying `id` and (optionally) `title`.
 */
export function aclArtifactFilename(record: ScholarlyWorkRecord): string {
  const rawTitle = String(record.title ?? "");
  const title = sanitizeFilename(rawTitle).replace(/\s+/g, "_").slice(0, 96);
  const stem = sanitizeFilename(record.id);
  const suffix = title ? `-${title}` : "";
  return `${stem}${suffix}.pdf`;
}
|
|
200
|
+
|
|
201
|
+
/**
 * Validates the page window passed to the PDF text extractor.
 *
 * @param firstPage - first page; `null`/`undefined` default to 1.
 * @param lastPage - last page; `null`/`undefined` default to 20.
 * @returns both bounds as numbers.
 * @throws Error when `firstPage` is not an integer >= 1, or when `lastPage`
 *   is not an integer >= `firstPage`.
 */
function requireAclPageRange(
  firstPage: unknown,
  lastPage: unknown,
): { firstPage: number; lastPage: number } {
  const range = {
    firstPage: Number(firstPage ?? 1),
    lastPage: Number(lastPage ?? 20),
  };
  const firstOk = Number.isInteger(range.firstPage) && range.firstPage >= 1;
  if (!firstOk) {
    throw new Error("acl-anthology first-page must be an integer >= 1.");
  }
  const lastOk =
    Number.isInteger(range.lastPage) && range.lastPage >= range.firstPage;
  if (!lastOk) {
    throw new Error(
      "acl-anthology last-page must be an integer >= first-page.",
    );
  }
  return range;
}
|
|
217
|
+
|
|
218
|
+
/**
 * Validates the `max-chars` option.
 *
 * @param value - raw option; `undefined`, `null`, and "" mean "not given".
 * @param fallback - value returned when the option is not given.
 * @returns the option as an integer within [1000, 1000000], or `fallback`.
 * @throws Error when a given value is not an integer in that range.
 */
function requireAclMaxChars(value: unknown, fallback = 40_000): number {
  const unset = value === undefined || value === null || value === "";
  if (unset) return fallback;
  const parsed = Number(value);
  const inRange =
    Number.isInteger(parsed) && parsed >= 1_000 && parsed <= 1_000_000;
  if (inRange) return parsed;
  throw new Error(
    `acl-anthology max-chars must be an integer in [1000, 1000000]. Got: ${String(value)}`,
  );
}
|
|
228
|
+
|
|
229
|
+
/**
 * Caps extracted text at `maxChars` characters.
 *
 * When the text is longer, the first `maxChars` characters are kept, trailing
 * whitespace is trimmed, and a "[truncated at N characters]" marker is
 * appended after a blank line (so the returned text exceeds `maxChars`).
 *
 * @param text - extracted text.
 * @param maxChars - maximum number of characters kept from `text`.
 * @returns the (possibly shortened) text, whether it was cut, and the length
 *   of the original text.
 */
function truncateAclText(
  text: string,
  maxChars: number,
): { text: string; truncated: boolean; originalChars: number } {
  const originalChars = text.length;
  const fits = originalChars <= maxChars;
  if (fits) {
    return { text, truncated: false, originalChars };
  }
  const head = text.slice(0, maxChars).trimEnd();
  const marker = `\n\n[truncated at ${maxChars} characters]`;
  return { text: `${head}${marker}`, truncated: true, originalChars };
}
|
|
81
242
|
|
|
82
243
|
async function fetchHtml(url: string, label: string): Promise<string> {
|
|
@@ -93,6 +254,104 @@ async function fetchHtml(url: string, label: string): Promise<string> {
|
|
|
93
254
|
return response.text();
|
|
94
255
|
}
|
|
95
256
|
|
|
257
|
+
/**
 * Downloads and gunzips the full Anthology BibTeX export, memoized per
 * process.
 *
 * The in-flight promise is stored in `anthologyBibCache`, so concurrent
 * callers share one download. A failed attempt clears the cache, so the next
 * call retries rather than replaying the same rejection.
 *
 * @returns the decompressed BibTeX text as UTF-8.
 * @throws Error when the server responds with a non-2xx status; network and
 *   decompression errors propagate unchanged.
 */
async function fetchAnthologyBib(): Promise<string> {
  anthologyBibCache ??= (async () => {
    const response = await fetch(ANTHOLOGY_BIB_URL, {
      headers: {
        Accept: "application/x-gzip, application/gzip, */*",
        "User-Agent":
          "unicli-acl-anthology/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
      },
    });
    if (!response.ok) {
      throw new Error(`ACL Anthology BibTeX returned HTTP ${response.status}.`);
    }
    return gunzipSync(Buffer.from(await response.arrayBuffer())).toString(
      "utf8",
    );
  })().catch((error: unknown) => {
    // Do not pin a transient failure for the lifetime of the process.
    anthologyBibCache = undefined;
    throw error;
  });
  return anthologyBibCache;
}
|
|
274
|
+
|
|
275
|
+
/**
 * Fetches a paper's landing page and builds a minimal record from it.
 *
 * The title is read from the `<h2 ... id=title ...>` element. The year is the
 * numeric value of the id's first four characters, or `undefined` when those
 * are not a non-zero number.
 *
 * @param id - bare Anthology id, used verbatim in the URLs.
 * @throws Error when the page has no title; `fetchHtml` failures propagate.
 */
async function fetchAclPaperRecord(id: string): Promise<ScholarlyWorkRecord> {
  const sourceUrl = `${ORIGIN}/${id}/`;
  const html = await fetchHtml(sourceUrl, `acl-anthology paper ${id}`);
  const heading = html.match(/<h2[^>]*id=title[^>]*>([\s\S]*?)<\/h2>/);
  const title = cleanAclHtml(heading?.[1] ?? "");
  if (!title) {
    throw new Error(`ACL Anthology paper ${id} did not expose a title.`);
  }
  const leadingYear = Number(id.slice(0, 4));
  return {
    id,
    title,
    year: leadingYear || undefined,
    venue: "ACL Anthology",
    pdf_url: aclAnthologyPdfUrl(id),
    source_adapter: "acl-anthology",
    source_url: sourceUrl,
    retrieved_at: new Date().toISOString(),
  };
}
|
|
293
|
+
|
|
294
|
+
/**
 * Downloads a paper's PDF and extracts text from a page range with
 * `pdftotext`.
 *
 * Page-range and max-chars options are validated before any network or disk
 * activity, so invalid arguments fail without leaving a downloaded file.
 *
 * @param record - paper record; must carry `pdf_url`.
 * @param kwargs - command arguments: `output` (target directory, default
 *   "./acl-anthology-downloads") plus the first-page / last-page / max-chars
 *   options in hyphenated or camelCase form.
 * @returns the record's fields plus `path`, `text`, `text_chars` (length
 *   before truncation), `text_truncated`, and `text_source: "pdf"`.
 * @throws Error when the record has no PDF URL, an option is invalid, the
 *   download fails, or no text is extracted; `pdftotext` execution errors
 *   (missing binary, timeout, oversized output) propagate from `execFile`.
 */
async function readAclPaperPdf(
  record: ScholarlyWorkRecord,
  kwargs: Record<string, unknown>,
): Promise<Record<string, unknown>> {
  if (!record.pdf_url) {
    throw new Error(`ACL Anthology paper ${record.id} did not expose a PDF.`);
  }

  // Validate cheap, local inputs first.
  const pageArgs = requireAclReadPageArgs(kwargs);
  const { firstPage, lastPage } = requireAclPageRange(
    pageArgs.first_page,
    pageArgs.last_page,
  );
  const maxChars = requireAclMaxChars(pageArgs.max_chars);

  const outputDir = resolve(
    String(kwargs.output ?? "./acl-anthology-downloads"),
  );
  const path = join(outputDir, aclArtifactFilename(record));
  const download = await httpDownload(record.pdf_url, path, {
    Accept: "application/pdf,*/*",
    Referer: record.source_url ?? `${ORIGIN}/${record.id}/`,
    "User-Agent":
      "unicli-acl-anthology/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
  });
  if (download.status === "failed" || !download.path) {
    throw new Error(
      `ACL Anthology PDF download failed for ${record.id}: ${download.error ?? "no path"}.`,
    );
  }

  // execFile passes arguments as an array (no shell); the final "-" sends the
  // extracted text to stdout.
  const { stdout } = await execFileAsync(
    "pdftotext",
    [
      "-layout",
      "-enc",
      "UTF-8",
      "-f",
      String(firstPage),
      "-l",
      String(lastPage),
      download.path,
      "-",
    ],
    { timeout: 60_000, maxBuffer: 10 * 1024 * 1024 },
  );
  const extracted = stdout.trim();
  if (!extracted) {
    throw new Error(
      `pdftotext returned no text for ACL Anthology ${record.id} pages ${firstPage}-${lastPage}.`,
    );
  }
  const truncated = truncateAclText(extracted, maxChars);
  return {
    ...record,
    path: download.path,
    text: truncated.text,
    text_chars: truncated.originalChars,
    text_truncated: truncated.truncated,
    text_source: "pdf",
  };
}
|
|
354
|
+
|
|
96
355
|
cli({
|
|
97
356
|
site: "acl-anthology",
|
|
98
357
|
name: "search",
|
|
@@ -109,12 +368,11 @@ cli({
|
|
|
109
368
|
const query = String(kwargs.query ?? "").trim();
|
|
110
369
|
if (!query) throw new Error("acl-anthology search query cannot be empty.");
|
|
111
370
|
const limit = Math.min(Math.max(Number(kwargs.limit ?? 20), 1), 100);
|
|
112
|
-
const rows =
|
|
113
|
-
await
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
).slice(0, limit);
|
|
371
|
+
const rows = searchAclBibRows(
|
|
372
|
+
parseAclBibEntries(await fetchAnthologyBib()),
|
|
373
|
+
query,
|
|
374
|
+
limit,
|
|
375
|
+
);
|
|
118
376
|
if (rows.length === 0)
|
|
119
377
|
throw new Error(`No ACL Anthology papers matched "${query}".`);
|
|
120
378
|
return rows;
|
|
@@ -131,27 +389,77 @@ cli({
|
|
|
131
389
|
columns: ["id", "title", "authors", "year", "venue", "pdf_url", "source_url"],
|
|
132
390
|
capabilities: ["http.fetch", "scholar.get", "scholar.pdf"],
|
|
133
391
|
func: async (_page, kwargs) => {
|
|
134
|
-
const id =
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
392
|
+
const id = normalizeAclAnthologyId(kwargs.id ?? kwargs.ref);
|
|
393
|
+
return [await fetchAclPaperRecord(id)];
|
|
394
|
+
},
|
|
395
|
+
});
|
|
396
|
+
|
|
397
|
+
// Registers the `acl-anthology read` command: resolve a paper id, fetch its
// record, download the PDF and return the extracted text as a single row.
cli({
  site: "acl-anthology",
  name: "read",
  description: "Download an ACL Anthology paper PDF by id and extract text",
  domain: "aclanthology.org",
  strategy: Strategy.PUBLIC,
  args: [
    // Positional paper identifier; the metadata also advertises URL input.
    {
      name: "id",
      type: "str",
      required: true,
      positional: true,
      description: "ACL Anthology paper id (e.g. 2020.acl-main.447)",
      "x-unicli-kind": "id",
      "x-unicli-accepts": ["url"],
    },
    // Directory the PDF is written to.
    {
      name: "output",
      type: "str",
      default: "./acl-anthology-downloads",
      description: "Output directory",
      "x-unicli-kind": "path",
    },
    // Page window handed to the text extractor.
    {
      name: "first-page",
      type: "int",
      default: 1,
      description: "First PDF page to extract",
    },
    {
      name: "last-page",
      type: "int",
      default: 20,
      description: "Last PDF page to extract",
    },
    // Upper bound on the returned text length.
    {
      name: "max-chars",
      type: "int",
      default: 40000,
      description: "Maximum extracted text characters",
    },
  ],
  columns: [
    "id",
    "title",
    "source_adapter",
    "source_url",
    "pdf_url",
    "path",
    "text_source",
    "text",
    "text_chars",
    "text_truncated",
  ],
  capabilities: [
    "http.fetch",
    "http.download",
    "subprocess.exec",
    "scholar.fulltext",
    "scholar.pdf",
  ],
  executables: ["pdftotext"],
  minimum_capability: "subprocess.exec",
  func: async (_page, kwargs) => {
    // `ref` is accepted as a fallback when `id` is absent.
    const paperId = normalizeAclAnthologyId(kwargs.id ?? kwargs.ref);
    const paper = await fetchAclPaperRecord(paperId);
    const row = await readAclPaperPdf(paper, kwargs);
    return [row];
  },
});
|
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
import { describe, expect, it } from "vitest";
|
|
2
|
+
import { resolveCommand } from "../../registry.js";
|
|
2
3
|
import {
|
|
4
|
+
arxivArtifactFilename,
|
|
3
5
|
decodeArxivEntities,
|
|
6
|
+
normalizeArxivId,
|
|
4
7
|
parseArxivEntries,
|
|
5
8
|
requireArxivAuthor,
|
|
6
9
|
requireArxivCategory,
|
|
10
|
+
requireArxivMaxChars,
|
|
11
|
+
requireArxivPageRange,
|
|
7
12
|
requireArxivLimit,
|
|
8
13
|
} from "./papers.js";
|
|
9
14
|
|
|
@@ -19,6 +24,47 @@ describe("arxiv agent-facing author and recent commands", () => {
|
|
|
19
24
|
expect(() => requireArxivLimit("51", 20)).toThrow("arxiv limit");
|
|
20
25
|
});
|
|
21
26
|
|
|
27
|
+
it("validates read IDs, page ranges, max text bounds, and filenames", () => {
  // Accepted identifier spellings and the id each one normalises to.
  const idCases: Array<[string, string]> = [
    ["arxiv:1706.03762v7", "1706.03762v7"],
    ["https://arxiv.org/abs/1706.03762", "1706.03762"],
    ["https://arxiv.org/pdf/hep-th/9901001.pdf", "hep-th/9901001"],
  ];
  for (const [input, expected] of idCases) {
    expect(normalizeArxivId(input)).toBe(expected);
  }
  // A path-traversal-looking id must be rejected.
  expect(() => normalizeArxivId("../1706.03762")).toThrow("Invalid arXiv");

  // Page ranges: string inputs become numbers; a zero first page and an
  // inverted range are both rejected.
  const range = requireArxivPageRange("2", "4");
  expect(range).toEqual({ firstPage: 2, lastPage: 4 });
  expect(() => requireArxivPageRange("0", "4")).toThrow("first-page");
  expect(() => requireArxivPageRange("4", "3")).toThrow("last-page");

  // max-chars: default when omitted, and "999" is refused.
  expect(requireArxivMaxChars(undefined)).toBe(40000);
  expect(() => requireArxivMaxChars("999")).toThrow("max-chars");

  // Filenames combine the id with a sanitised title.
  const filename = arxivArtifactFilename({
    id: "1706.03762v7",
    title: "Attention / Is: All? You Need",
  });
  expect(filename).toBe("1706.03762v7-Attention-Is-All-You-Need.pdf");
});
|
|
51
|
+
|
|
52
|
+
it("registers arxiv read as a source-level fulltext command", () => {
  // Look the command up once; each assertion below inspects one field of it.
  const command = resolveCommand("arxiv", "read")?.command;

  expect(command?.capabilities).toEqual([
    "http.fetch",
    "http.download",
    "subprocess.exec",
    "scholar.fulltext",
    "scholar.pdf",
  ]);
  expect(command?.minimum_capability).toBe("subprocess.exec");
  expect(command?.executables).toEqual(["pdftotext"]);
});
|
|
67
|
+
|
|
22
68
|
it("decodes entities and parses Atom entries", () => {
|
|
23
69
|
expect(decodeArxivEntities("A & B < C")).toBe("A & B < C");
|
|
24
70
|
expect(
|