@zenalexa/unicli 0.225.2 → 0.225.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/README.md +3 -3
- package/README.zh-CN.md +3 -3
- package/dist/adapters/acl-anthology/papers.d.ts +16 -9
- package/dist/adapters/acl-anthology/papers.d.ts.map +1 -1
- package/dist/adapters/acl-anthology/papers.js +322 -58
- package/dist/adapters/acl-anthology/papers.js.map +1 -1
- package/dist/adapters/arxiv/papers.d.ts +22 -4
- package/dist/adapters/arxiv/papers.d.ts.map +1 -1
- package/dist/adapters/arxiv/papers.js +202 -4
- package/dist/adapters/arxiv/papers.js.map +1 -1
- package/dist/adapters/baidu-scholar/search.d.ts +15 -1
- package/dist/adapters/baidu-scholar/search.d.ts.map +1 -1
- package/dist/adapters/baidu-scholar/search.js +72 -8
- package/dist/adapters/baidu-scholar/search.js.map +1 -1
- package/dist/adapters/biorxiv/preprints.d.ts +9 -0
- package/dist/adapters/biorxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/biorxiv/preprints.js +78 -0
- package/dist/adapters/biorxiv/preprints.js.map +1 -0
- package/dist/adapters/cnki/search.d.ts +82 -0
- package/dist/adapters/cnki/search.d.ts.map +1 -0
- package/dist/adapters/cnki/search.js +236 -0
- package/dist/adapters/cnki/search.js.map +1 -0
- package/dist/adapters/cvf/papers.d.ts +12 -7
- package/dist/adapters/cvf/papers.d.ts.map +1 -1
- package/dist/adapters/cvf/papers.js +210 -27
- package/dist/adapters/cvf/papers.js.map +1 -1
- package/dist/adapters/dblp/publications.d.ts +12 -5
- package/dist/adapters/dblp/publications.d.ts.map +1 -1
- package/dist/adapters/dblp/publications.js +31 -8
- package/dist/adapters/dblp/publications.js.map +1 -1
- package/dist/adapters/google-scholar/search.d.ts +22 -1
- package/dist/adapters/google-scholar/search.d.ts.map +1 -1
- package/dist/adapters/google-scholar/search.js +129 -14
- package/dist/adapters/google-scholar/search.js.map +1 -1
- package/dist/adapters/hf/paper.d.ts +12 -3
- package/dist/adapters/hf/paper.d.ts.map +1 -1
- package/dist/adapters/hf/paper.js +65 -5
- package/dist/adapters/hf/paper.js.map +1 -1
- package/dist/adapters/medrxiv/preprints.d.ts +9 -0
- package/dist/adapters/medrxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/medrxiv/preprints.js +78 -0
- package/dist/adapters/medrxiv/preprints.js.map +1 -0
- package/dist/adapters/neurips/proceedings.d.ts +8 -7
- package/dist/adapters/neurips/proceedings.d.ts.map +1 -1
- package/dist/adapters/neurips/proceedings.js +209 -21
- package/dist/adapters/neurips/proceedings.js.map +1 -1
- package/dist/adapters/openalex/works.d.ts +21 -5
- package/dist/adapters/openalex/works.d.ts.map +1 -1
- package/dist/adapters/openalex/works.js +108 -8
- package/dist/adapters/openalex/works.js.map +1 -1
- package/dist/adapters/openreview/papers.d.ts +10 -4
- package/dist/adapters/openreview/papers.d.ts.map +1 -1
- package/dist/adapters/openreview/papers.js +351 -24
- package/dist/adapters/openreview/papers.js.map +1 -1
- package/dist/adapters/pmlr/proceedings.d.ts +6 -6
- package/dist/adapters/pmlr/proceedings.d.ts.map +1 -1
- package/dist/adapters/pmlr/proceedings.js +92 -12
- package/dist/adapters/pmlr/proceedings.js.map +1 -1
- package/dist/adapters/pubmed/articles.d.ts +8 -4
- package/dist/adapters/pubmed/articles.d.ts.map +1 -1
- package/dist/adapters/pubmed/articles.js +272 -39
- package/dist/adapters/pubmed/articles.js.map +1 -1
- package/dist/adapters/rxiv/preprints.d.ts +75 -0
- package/dist/adapters/rxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/rxiv/preprints.js +651 -0
- package/dist/adapters/rxiv/preprints.js.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf-read.d.ts +49 -0
- package/dist/adapters/scholar-artifacts/pdf-read.d.ts.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf-read.js +204 -0
- package/dist/adapters/scholar-artifacts/pdf-read.js.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf.d.ts +16 -0
- package/dist/adapters/scholar-artifacts/pdf.d.ts.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf.js +122 -0
- package/dist/adapters/scholar-artifacts/pdf.js.map +1 -0
- package/dist/adapters/semantic-scholar/papers.d.ts +6 -6
- package/dist/adapters/semantic-scholar/papers.d.ts.map +1 -1
- package/dist/adapters/semantic-scholar/papers.js +80 -6
- package/dist/adapters/semantic-scholar/papers.js.map +1 -1
- package/dist/adapters/unpaywall/works.d.ts +7 -7
- package/dist/adapters/unpaywall/works.d.ts.map +1 -1
- package/dist/adapters/unpaywall/works.js +104 -12
- package/dist/adapters/unpaywall/works.js.map +1 -1
- package/dist/adapters/wanfang/search.d.ts +14 -0
- package/dist/adapters/wanfang/search.d.ts.map +1 -1
- package/dist/adapters/wanfang/search.js +56 -7
- package/dist/adapters/wanfang/search.js.map +1 -1
- package/dist/browser/page.d.ts +2 -0
- package/dist/browser/page.d.ts.map +1 -1
- package/dist/browser/page.js +12 -0
- package/dist/browser/page.js.map +1 -1
- package/dist/commands/browser/actions.d.ts.map +1 -1
- package/dist/commands/browser/actions.js +59 -3
- package/dist/commands/browser/actions.js.map +1 -1
- package/dist/commands/scholar.d.ts +77 -5
- package/dist/commands/scholar.d.ts.map +1 -1
- package/dist/commands/scholar.js +2945 -83
- package/dist/commands/scholar.js.map +1 -1
- package/dist/core/command-contract.d.ts.map +1 -1
- package/dist/core/command-contract.js +5 -0
- package/dist/core/command-contract.js.map +1 -1
- package/dist/core/schema-v2.d.ts +1 -0
- package/dist/core/schema-v2.d.ts.map +1 -1
- package/dist/core/schema-v2.js +1 -0
- package/dist/core/schema-v2.js.map +1 -1
- package/dist/discovery/aliases.d.ts.map +1 -1
- package/dist/discovery/aliases.js +208 -0
- package/dist/discovery/aliases.js.map +1 -1
- package/dist/discovery/core-catalog.d.ts +2 -0
- package/dist/discovery/core-catalog.d.ts.map +1 -1
- package/dist/discovery/core-catalog.js +487 -0
- package/dist/discovery/core-catalog.js.map +1 -1
- package/dist/discovery/intents.d.ts.map +1 -1
- package/dist/discovery/intents.js +273 -2
- package/dist/discovery/intents.js.map +1 -1
- package/dist/discovery/loader.d.ts.map +1 -1
- package/dist/discovery/loader.js +3 -0
- package/dist/discovery/loader.js.map +1 -1
- package/dist/engine/capability-policy.d.ts.map +1 -1
- package/dist/engine/capability-policy.js +30 -4
- package/dist/engine/capability-policy.js.map +1 -1
- package/dist/engine/kernel/stages.d.ts.map +1 -1
- package/dist/engine/kernel/stages.js +3 -0
- package/dist/engine/kernel/stages.js.map +1 -1
- package/dist/engine/operation-policy.d.ts +4 -1
- package/dist/engine/operation-policy.d.ts.map +1 -1
- package/dist/engine/operation-policy.js +23 -0
- package/dist/engine/operation-policy.js.map +1 -1
- package/dist/fast-path/manifest.d.ts +3 -0
- package/dist/fast-path/manifest.d.ts.map +1 -1
- package/dist/fast-path/manifest.js.map +1 -1
- package/dist/fast-path/policy.d.ts.map +1 -1
- package/dist/fast-path/policy.js +3 -0
- package/dist/fast-path/policy.js.map +1 -1
- package/dist/manifest-compact.txt +1 -1
- package/dist/manifest.json +6804 -1002
- package/dist/registry.d.ts +2 -0
- package/dist/registry.d.ts.map +1 -1
- package/dist/registry.js +1 -0
- package/dist/registry.js.map +1 -1
- package/dist/types/scholarly.d.ts +19 -4
- package/dist/types/scholarly.d.ts.map +1 -1
- package/dist/types/scholarly.js +4 -4
- package/dist/types.d.ts +8 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
- package/server.json +2 -2
- package/skills/unicli/SKILL.md +1 -1
- package/skills/unicli-claude-code/SKILL.md +1 -1
- package/skills/unicli-hermes/SKILL.md +1 -1
- package/src/adapters/acl-anthology/papers.test.ts +111 -0
- package/src/adapters/acl-anthology/papers.ts +379 -71
- package/src/adapters/arxiv/papers.test.ts +46 -0
- package/src/adapters/arxiv/papers.ts +251 -4
- package/src/adapters/baidu-scholar/search.ts +74 -11
- package/src/adapters/biorxiv/preprints.ts +112 -0
- package/src/adapters/cnki/search.ts +357 -0
- package/src/adapters/cvf/papers.ts +260 -27
- package/src/adapters/dblp/publications.test.ts +9 -0
- package/src/adapters/dblp/publications.ts +31 -8
- package/src/adapters/google-scholar/search.ts +165 -17
- package/src/adapters/hf/paper.test.ts +23 -0
- package/src/adapters/hf/paper.ts +89 -5
- package/src/adapters/hf/top.yaml +34 -2
- package/src/adapters/huggingface-papers/daily.yaml +37 -3
- package/src/adapters/huggingface-papers/search.yaml +43 -9
- package/src/adapters/medrxiv/preprints.ts +112 -0
- package/src/adapters/neurips/proceedings.ts +266 -22
- package/src/adapters/openalex/works.test.ts +15 -4
- package/src/adapters/openalex/works.ts +136 -8
- package/src/adapters/openreview/papers.test.ts +31 -0
- package/src/adapters/openreview/papers.ts +407 -29
- package/src/adapters/pmlr/proceedings.ts +102 -12
- package/src/adapters/pubmed/articles.test.ts +88 -1
- package/src/adapters/pubmed/articles.ts +343 -44
- package/src/adapters/rxiv/preprints.test.ts +233 -0
- package/src/adapters/rxiv/preprints.ts +849 -0
- package/src/adapters/scholar-artifacts/pdf-read.ts +277 -0
- package/src/adapters/scholar-artifacts/pdf.ts +133 -0
- package/src/adapters/semantic-scholar/papers.ts +98 -6
- package/src/adapters/unpaywall/works.ts +141 -12
- package/src/adapters/wanfang/search.ts +57 -7
- package/src/adapters/cnki/search.yaml +0 -49
|
@@ -17,24 +17,58 @@ args:
|
|
|
17
17
|
|
|
18
18
|
pipeline:
|
|
19
19
|
- fetch:
|
|
20
|
-
url: https://huggingface.co/api/papers
|
|
20
|
+
url: https://huggingface.co/api/papers/search
|
|
21
21
|
params:
|
|
22
|
-
|
|
22
|
+
q: "${{ args.query }}"
|
|
23
23
|
limit: "${{ args.limit }}"
|
|
24
24
|
|
|
25
25
|
- map:
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
26
|
+
id: "${{ (item.paper || item).id || '' }}"
|
|
27
|
+
title: "${{ (item.paper || item).title || item.title || '' }}"
|
|
28
|
+
authors: "${{ (item.paper || item).authors ? (item.paper || item).authors.slice(0,3).map(a => a.name || a).join(', ') : '' }}"
|
|
29
|
+
published: "${{ (item.paper || item).publishedAt || item.publishedAt || '' }}"
|
|
30
|
+
upvotes: "${{ (item.paper || item).upvotes || item.upvotes || 0 }}"
|
|
31
|
+
url: "${{ 'https://huggingface.co/papers/' + ((item.paper || item).id || '') }}"
|
|
32
|
+
source_url: "${{ (item.paper || item).id ? 'https://huggingface.co/papers/' + (item.paper || item).id : '' }}"
|
|
33
|
+
pdf_url: "${{ (item.paper || item).id ? 'https://arxiv.org/pdf/' + (item.paper || item).id : '' }}"
|
|
34
|
+
code_url: "${{ (item.paper || item).githubRepo || '' }}"
|
|
35
|
+
github_stars: "${{ (item.paper || item).githubStars || '' }}"
|
|
36
|
+
project_url: "${{ (item.paper || item).projectPage || '' }}"
|
|
37
|
+
dataset_url: "${{ (item.paper || item).linkedDatasets && (item.paper || item).linkedDatasets[0] ? 'https://huggingface.co/datasets/' + (item.paper || item).linkedDatasets[0].id : '' }}"
|
|
38
|
+
model_urls: "${{ (item.paper || item).linkedModels ? (item.paper || item).linkedModels.map(x => 'https://huggingface.co/' + x.id).join(', ') : '' }}"
|
|
39
|
+
dataset_urls: "${{ (item.paper || item).linkedDatasets ? (item.paper || item).linkedDatasets.map(x => 'https://huggingface.co/datasets/' + x.id).join(', ') : '' }}"
|
|
40
|
+
space_urls: "${{ (item.paper || item).linkedSpaces ? (item.paper || item).linkedSpaces.map(x => 'https://huggingface.co/spaces/' + x.id).join(', ') : '' }}"
|
|
41
|
+
num_models: "${{ (item.paper || item).numTotalModels !== null && (item.paper || item).numTotalModels !== undefined ? (item.paper || item).numTotalModels : '' }}"
|
|
42
|
+
num_datasets: "${{ (item.paper || item).numTotalDatasets !== null && (item.paper || item).numTotalDatasets !== undefined ? (item.paper || item).numTotalDatasets : '' }}"
|
|
43
|
+
num_spaces: "${{ (item.paper || item).numTotalSpaces !== null && (item.paper || item).numTotalSpaces !== undefined ? (item.paper || item).numTotalSpaces : '' }}"
|
|
31
44
|
|
|
32
45
|
- limit: ${{ args.limit }}
|
|
33
46
|
|
|
34
|
-
columns:
|
|
47
|
+
columns:
|
|
48
|
+
[
|
|
49
|
+
id,
|
|
50
|
+
title,
|
|
51
|
+
authors,
|
|
52
|
+
upvotes,
|
|
53
|
+
published,
|
|
54
|
+
url,
|
|
55
|
+
source_url,
|
|
56
|
+
pdf_url,
|
|
57
|
+
code_url,
|
|
58
|
+
github_stars,
|
|
59
|
+
project_url,
|
|
60
|
+
dataset_url,
|
|
61
|
+
model_urls,
|
|
62
|
+
dataset_urls,
|
|
63
|
+
space_urls,
|
|
64
|
+
num_models,
|
|
65
|
+
num_datasets,
|
|
66
|
+
num_spaces,
|
|
67
|
+
]
|
|
35
68
|
|
|
36
69
|
# schema-v2 metadata — injected by `unicli migrate schema-v2`
|
|
37
|
-
capabilities:
|
|
70
|
+
capabilities:
|
|
71
|
+
["http.fetch", "scholar.search", "scholar.code", "scholar.datasets"]
|
|
38
72
|
minimum_capability: http.fetch
|
|
39
73
|
trust: public
|
|
40
74
|
confidentiality: public
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @owner src::adapters::medrxiv::preprints
|
|
3
|
+
* @does Registers medRxiv recent/search, DOI metadata, PDF download, and read commands backed by the official xRxiv API helpers.
|
|
4
|
+
* @needs src/adapters/rxiv/preprints.ts, api.biorxiv.org medRxiv endpoints, medRxiv PDF/JATS asset URLs.
|
|
5
|
+
* @feeds surface coverage ledger, clinical preprint discovery/search, scholar DOI read/download routing.
|
|
6
|
+
* @breaks medRxiv API drift, date-window search exhaustion, source-asset denial, or missing pdftotext stops read/download rather than fabricating text.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { cli, Strategy } from "../../registry.js";
|
|
10
|
+
import {
|
|
11
|
+
downloadRxivPdf,
|
|
12
|
+
fetchPaperRow,
|
|
13
|
+
fetchRecentRows,
|
|
14
|
+
fetchSearchRows,
|
|
15
|
+
readRxivPaper,
|
|
16
|
+
RXIV_DOWNLOAD_ARGS,
|
|
17
|
+
RXIV_DOWNLOAD_CAPABILITIES,
|
|
18
|
+
RXIV_DOWNLOAD_COLUMNS,
|
|
19
|
+
RXIV_PAPER_ARGS,
|
|
20
|
+
RXIV_PAPER_CAPABILITIES,
|
|
21
|
+
RXIV_PAPER_COLUMNS,
|
|
22
|
+
RXIV_READ_ARGS,
|
|
23
|
+
RXIV_READ_CAPABILITIES,
|
|
24
|
+
RXIV_READ_COLUMNS,
|
|
25
|
+
RXIV_RECENT_ARGS,
|
|
26
|
+
RXIV_RECENT_CAPABILITIES,
|
|
27
|
+
RXIV_RECENT_COLUMNS,
|
|
28
|
+
RXIV_SEARCH_ARGS,
|
|
29
|
+
RXIV_SEARCH_CAPABILITIES,
|
|
30
|
+
RXIV_SEARCH_COLUMNS,
|
|
31
|
+
type RxivConfig,
|
|
32
|
+
} from "../rxiv/preprints.js";
|
|
33
|
+
|
|
34
|
+
const CONFIG: RxivConfig = {
|
|
35
|
+
site: "medrxiv",
|
|
36
|
+
label: "medRxiv",
|
|
37
|
+
apiServer: "medrxiv",
|
|
38
|
+
webOrigin: "https://www.medrxiv.org",
|
|
39
|
+
};
|
|
40
|
+
const DOMAIN = "api.biorxiv.org";
|
|
41
|
+
|
|
42
|
+
// medrxiv recent — forwards the raw kwargs to the shared xRxiv "recent" helper.
cli({
  site: "medrxiv",
  name: "recent",
  description: "List recent medRxiv preprints from the official API",
  domain: DOMAIN,
  strategy: Strategy.PUBLIC,
  args: RXIV_RECENT_ARGS,
  columns: RXIV_RECENT_COLUMNS,
  capabilities: RXIV_RECENT_CAPABILITIES,
  func: async (_page, kwargs) => {
    const recentRows = await fetchRecentRows(CONFIG, kwargs);
    return recentRows;
  },
});

// medrxiv search — forwards the raw kwargs to the shared xRxiv "search" helper.
cli({
  site: "medrxiv",
  name: "search",
  description:
    "Search medRxiv official API metadata within a bounded date window",
  domain: DOMAIN,
  strategy: Strategy.PUBLIC,
  args: RXIV_SEARCH_ARGS,
  columns: RXIV_SEARCH_COLUMNS,
  capabilities: RXIV_SEARCH_CAPABILITIES,
  func: async (_page, kwargs) => {
    const matchedRows = await fetchSearchRows(CONFIG, kwargs);
    return matchedRows;
  },
});

// medrxiv paper — the reference may arrive as `doi`, `id`, or `ref`
// (first non-nullish wins); the single fetched row is wrapped in an array.
cli({
  site: "medrxiv",
  name: "paper",
  description: "Fetch medRxiv preprint metadata by DOI",
  domain: DOMAIN,
  strategy: Strategy.PUBLIC,
  args: RXIV_PAPER_ARGS,
  columns: RXIV_PAPER_COLUMNS,
  capabilities: RXIV_PAPER_CAPABILITIES,
  func: async (_page, kwargs) => {
    const reference = kwargs.doi ?? kwargs.id ?? kwargs.ref;
    const paperRow = await fetchPaperRow(CONFIG, reference);
    return [paperRow];
  },
});

// medrxiv download — resolves the paper row first, then hands it to the
// download helper together with the requested output location.
cli({
  site: "medrxiv",
  name: "download",
  description: "Download a medRxiv preprint PDF by DOI",
  domain: DOMAIN,
  strategy: Strategy.PUBLIC,
  args: RXIV_DOWNLOAD_ARGS,
  columns: RXIV_DOWNLOAD_COLUMNS,
  capabilities: RXIV_DOWNLOAD_CAPABILITIES,
  minimum_capability: "http.download",
  func: async (_page, kwargs) => {
    const reference = kwargs.doi ?? kwargs.id ?? kwargs.ref;
    const paperRow = await fetchPaperRow(CONFIG, reference);
    const downloaded = await downloadRxivPdf(CONFIG, paperRow, kwargs.output);
    return [downloaded];
  },
});

// medrxiv read — delegates entirely to the shared reader; the result is
// returned as a one-element row list.
cli({
  site: "medrxiv",
  name: "read",
  description:
    "Read medRxiv preprint text by DOI, preferring JATS XML before PDF extraction",
  domain: DOMAIN,
  strategy: Strategy.PUBLIC,
  args: RXIV_READ_ARGS,
  columns: RXIV_READ_COLUMNS,
  capabilities: RXIV_READ_CAPABILITIES,
  minimum_capability: "subprocess.exec",
  func: async (_page, kwargs) => {
    const readResult = await readRxivPaper(CONFIG, kwargs);
    return [readResult];
  },
});
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @owner src::adapters::neurips::proceedings
|
|
3
|
-
* @does Registers NeurIPS proceedings search over
|
|
4
|
-
* @needs proceedings.neurips.cc static HTML, src/registry.ts
|
|
5
|
-
* @feeds src/commands/scholar.ts via scholar.search, scholar.pdf, and scholar.venue
|
|
6
|
-
* @breaks NeurIPS markup drift
|
|
7
|
-
* @invariants Year is explicit; paper
|
|
8
|
-
* @side-effects HTTPS egress to proceedings.neurips.cc
|
|
9
|
-
* @perf O(N) over one proceedings HTML page
|
|
3
|
+
* @does Registers NeurIPS proceedings search, paper detail retrieval, and PDF text reading over official paper pages.
|
|
4
|
+
* @needs proceedings.neurips.cc static HTML/PDFs, src/adapters/scholar-artifacts/pdf-read.ts, src/registry.ts
|
|
5
|
+
* @feeds src/commands/scholar.ts via scholar.search, scholar.get, scholar.pdf, scholar.fulltext, and scholar.venue
|
|
6
|
+
* @breaks NeurIPS markup/PDF drift, denied downloads, or missing pdftotext surface as explicit adapter errors; no unrelated source fallback is used.
|
|
7
|
+
* @invariants Year is explicit; paper detail prefers citation_* metadata and official /file/ PDF URLs.
|
|
8
|
+
* @side-effects HTTPS egress to proceedings.neurips.cc; read writes one PDF artifact and executes pdftotext.
|
|
9
|
+
* @perf O(N) over one proceedings HTML page; read is O(PDF bytes + selected pages)
|
|
10
10
|
* @concurrency safe
|
|
11
11
|
* @test tests/unit/adapters/scholar-sources.test.ts
|
|
12
12
|
* @stability experimental
|
|
@@ -15,8 +15,18 @@
|
|
|
15
15
|
|
|
16
16
|
import { cli, Strategy } from "../../registry.js";
|
|
17
17
|
import type { ScholarlyWorkRecord } from "../../types/scholarly.js";
|
|
18
|
+
import { readScholarPdf } from "../scholar-artifacts/pdf-read.js";
|
|
18
19
|
|
|
19
20
|
const ORIGIN = "https://proceedings.neurips.cc";
|
|
21
|
+
const NEURIPS_USER_AGENT =
|
|
22
|
+
"unicli-neurips/1.0 (https://github.com/olo-dot-io/Uni-CLI)";
|
|
23
|
+
|
|
24
|
+
/**
 * An `Error` that may additionally carry machine-readable recovery hints.
 * Every extra field is optional, so a plain `Error` is assignable to it.
 */
type NeuripsActionableError = Error & {
  /** Short error category identifier (e.g. "upstream_error"). */
  code?: string;
  /** Human-readable advice on what to try next. */
  suggestion?: string;
  /** Whether repeating the same request might succeed. */
  retryable?: boolean;
  /** Alternative actions offered to the caller; may be empty. */
  alternatives?: string[];
};
|
|
20
30
|
|
|
21
31
|
function decode(value: string): string {
|
|
22
32
|
return value
|
|
@@ -35,6 +45,21 @@ function absolute(path: string): string {
|
|
|
35
45
|
: `${ORIGIN}${path.startsWith("/") ? "" : "/"}${path}`;
|
|
36
46
|
}
|
|
37
47
|
|
|
48
|
+
/**
 * Collects the decoded `content` value of every `<meta name="…" content="…">`
 * tag in `html` whose `name` attribute equals `name` (case-insensitive).
 *
 * Only tags written with `name` immediately followed by `content` are
 * recognised; other attribute orders are not matched.
 *
 * @param html Raw HTML of the page to scan.
 * @param name Meta tag name to look for, matched literally.
 * @returns Decoded content values in document order; empty when none match.
 */
function metaContents(html: string, name: string): string[] {
  // Escape regex metacharacters so the name is always matched literally.
  const literalName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Capture the opening quote and require the SAME quote to close the value.
  // Otherwise an apostrophe inside a double-quoted value (content="What's New")
  // would end the capture early and truncate the text.
  const re = new RegExp(
    `<meta\\s+name=["']${literalName}["']\\s+content=(["'])([\\s\\S]*?)\\1[^>]*>`,
    "gi",
  );
  const values: string[] = [];
  let match: RegExpExecArray | null;
  while ((match = re.exec(html)) !== null) values.push(decode(match[2]));
  return values;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Returns the first `content` value found for the named meta tag,
 * or an empty string when the page has no such tag.
 */
function firstMetaContent(html: string, name: string): string {
  const found = metaContents(html, name);
  const leading = found[0];
  return leading ?? "";
}
|
|
62
|
+
|
|
38
63
|
function requireYear(value: unknown): string {
|
|
39
64
|
const year = String(value ?? "").trim();
|
|
40
65
|
if (!/^\d{4}$/.test(year))
|
|
@@ -42,6 +67,90 @@ function requireYear(value: unknown): string {
|
|
|
42
67
|
return year;
|
|
43
68
|
}
|
|
44
69
|
|
|
70
|
+
/**
 * Normalises a user-supplied NeurIPS paper reference into a bare page id.
 *
 * Accepted inputs:
 *  - a bare id, with or without a trailing `.html`;
 *  - a URL containing `/hash/<id>.html` or `/file/<id>.html`;
 *  - a URL containing `/file/<name>.pdf` (or `/hash/<name>.pdf`); the first
 *    `-Paper-` marker is mapped back to `-Abstract-`, the inverse of the
 *    rewrite this adapter applies when deriving PDF URLs.
 *
 * @param value Raw reference (any value; coerced with `String`).
 * @returns The id, guaranteed to match `[A-Za-z0-9_.-]+`.
 * @throws Error when the resulting id contains any other character or is empty.
 */
function requireNeuripsPaperId(value: unknown): string {
  const raw = String(value ?? "").trim();
  // Last path segment of a hash/file URL, if the input is such a URL.
  const segment = raw.match(
    /\/(?:hash|file)\/([^/?#]+\.(?:html|pdf))/,
  )?.[1];
  let id: string;
  if (segment === undefined) {
    id = raw.replace(/\.html$/, "");
  } else if (segment.endsWith(".pdf")) {
    // PDF names use "-Paper-" where the abstract page id uses "-Abstract-".
    id = segment.replace(/\.pdf$/, "").replace("-Paper-", "-Abstract-");
  } else {
    id = segment.replace(/\.html$/, "");
  }
  if (!/^[A-Za-z0-9_.-]+$/.test(id)) {
    throw new Error(`NeurIPS paper id "${raw}" is not valid.`);
  }
  return id;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Builds the abstract ("hash") page URL for a paper id within a given
 * proceedings year.
 */
function abstractUrl(id: string, year: string): string {
  const segments = [ORIGIN, "paper_files", "paper", year, "hash", `${id}.html`];
  return segments.join("/");
}
|
|
85
|
+
|
|
86
|
+
/**
 * Derives the PDF URL from an abstract page URL by three textual rewrites:
 * first "/hash/" becomes "/file/", first "-Abstract-" becomes "-Paper-",
 * and a trailing ".html" becomes ".pdf". Parts that do not occur are left as is.
 */
function pdfUrlFromAbstractUrl(sourceUrl: string): string {
  const inFileDirectory = sourceUrl.replace("/hash/", "/file/");
  const namedAsPaper = inFileDirectory.replace("-Abstract-", "-Paper-");
  return namedAsPaper.replace(/\.html$/, ".pdf");
}
|
|
92
|
+
|
|
93
|
+
/**
 * Creates an "upstream_error" describing a failed NeurIPS request.
 *
 * @param label  What was being fetched; becomes the start of the message.
 * @param detail Failure detail (an exception message or "HTTP <status>").
 * @returns An Error with `code`, `suggestion`, `retryable` and an empty
 *          `alternatives` list attached. `retryable` is true only when the
 *          detail looks transient (network failure, timeout, HTTP 429/5xx).
 */
function neuripsUpstreamError(
  label: string,
  detail: string,
): NeuripsActionableError {
  const transientPattern =
    /fetch failed|timeout|ECONNRESET|ETIMEDOUT|HTTP (429|5\d\d)/i;
  const failure: NeuripsActionableError = new Error(
    `${label} failed: ${detail}.`,
  );
  return Object.assign(failure, {
    code: "upstream_error",
    suggestion:
      "NeurIPS proceedings did not return the expected public paper page on this network path; retry later or verify the official proceedings.neurips.cc page manually.",
    retryable: transientPattern.test(detail),
    alternatives: [],
  });
}
|
|
108
|
+
|
|
109
|
+
/**
 * Parses one NeurIPS paper (abstract) page into a scholarly work record.
 *
 * Field sources, in order of preference:
 *  - title: `citation_title` meta tag, else the `<h1 class="paper-title">` text;
 *  - year: first four digits of `citation_publication_date`, else the
 *    `/paper/<yyyy>/` segment of `sourceUrl`;
 *  - id: last path segment of `sourceUrl` without `.html`, else the title;
 *  - pdf_url: `citation_pdf_url`, else the first `href` containing `-Paper-`
 *    and ending in `.pdf`.
 *
 * @param html      Raw HTML of the paper page.
 * @param sourceUrl URL the HTML was fetched from; stored as `source_url`.
 * @returns The populated record; optional fields are `undefined` when absent.
 * @throws Error when neither title source yields any text.
 */
export function parseNeuripsPaperPage(
  html: string,
  sourceUrl: string,
): ScholarlyWorkRecord {
  const headingText =
    html
      .match(/<h1 class="paper-title">([\s\S]*?)<\/h1>/i)?.[1]
      ?.replace(/<[^>]+>/g, " ") ?? "";
  const title = firstMetaContent(html, "citation_title") || decode(headingText);
  if (!title) throw new Error("NeurIPS paper page did not expose a title.");

  const year =
    firstMetaContent(html, "citation_publication_date").match(/\d{4}/)?.[0] ??
    sourceUrl.match(/\/paper\/(\d{4})\//)?.[1];

  // `split` always yields at least one element, so `pop()` is never undefined;
  // the title fallback therefore has to trigger on an EMPTY segment (e.g. a
  // URL ending in "/"), which requires `||` rather than `??`.
  const lastSegment = sourceUrl.split("/").pop() ?? "";
  const id = lastSegment.replace(/\.html$/, "") || title;

  const pdfUrl =
    firstMetaContent(html, "citation_pdf_url") ||
    html.match(/href=["']([^"']+-Paper-[^"']+\.pdf)["']/i)?.[1] ||
    "";

  // The pattern expects the abstract paragraph to be closed by two `</p>` tags.
  const abstractMarkup =
    html
      .match(/<p class="paper-abstract">([\s\S]*?)<\/p>\s*<\/p>/i)?.[1]
      ?.replace(/<[^>]+>/g, " ") ?? "";

  return {
    id,
    title,
    authors: metaContents(html, "citation_author"),
    year: year ? Number(year) : undefined,
    venue: "NeurIPS",
    type: firstMetaContent(html, "citation_journal_title") || undefined,
    doi: firstMetaContent(html, "citation_doi") || undefined,
    abstract: decode(abstractMarkup) || undefined,
    pdf_url: pdfUrl ? absolute(pdfUrl) : undefined,
    source_adapter: "neurips",
    source_url: sourceUrl,
    retrieved_at: new Date().toISOString(),
  };
}
|
|
153
|
+
|
|
45
154
|
export function parseNeuripsRows(
|
|
46
155
|
html: string,
|
|
47
156
|
year = "2024",
|
|
@@ -65,9 +174,7 @@ export function parseNeuripsRows(
|
|
|
65
174
|
.filter(Boolean),
|
|
66
175
|
year: Number(year),
|
|
67
176
|
venue: "NeurIPS",
|
|
68
|
-
pdf_url: sourceUrl
|
|
69
|
-
.replace("-Abstract-", "-Paper-")
|
|
70
|
-
.replace(/\.html$/, ".pdf"),
|
|
177
|
+
pdf_url: pdfUrlFromAbstractUrl(sourceUrl),
|
|
71
178
|
source_adapter: "neurips",
|
|
72
179
|
source_url: sourceUrl,
|
|
73
180
|
retrieved_at: new Date().toISOString(),
|
|
@@ -76,6 +183,54 @@ export function parseNeuripsRows(
|
|
|
76
183
|
return out;
|
|
77
184
|
}
|
|
78
185
|
|
|
186
|
+
/**
 * Downloads an HTML page from the NeurIPS proceedings site.
 *
 * @param url   Page to request.
 * @param label Human-readable name of the page, used in error messages.
 * @returns The response body as text.
 * @throws A plain Error for HTTP 404; an upstream error (see
 *         `neuripsUpstreamError`) when the request itself fails or any other
 *         non-success status is returned.
 */
async function fetchNeuripsHtml(url: string, label: string): Promise<string> {
  const requestHeaders = {
    Accept: "text/html",
    "User-Agent": NEURIPS_USER_AGENT,
  };
  let response: Response;
  try {
    response = await fetch(url, { headers: requestHeaders });
  } catch (cause) {
    const detail = cause instanceof Error ? cause.message : String(cause);
    throw neuripsUpstreamError(label, detail);
  }
  if (response.ok) return response.text();
  // A missing page is reported separately from other upstream failures.
  if (response.status === 404) throw new Error(`${label} returned no page.`);
  throw neuripsUpstreamError(label, `HTTP ${response.status}`);
}
|
|
206
|
+
|
|
207
|
+
/**
 * Hands a parsed NeurIPS paper row to the shared PDF reader.
 *
 * Page and length options are accepted in both kebab-case and camelCase
 * spelling; the kebab-case key wins when both are present.
 *
 * @param row    Parsed paper record; must carry a `pdf_url`.
 * @param kwargs Command arguments (output, filename, page range, max chars).
 * @returns Whatever `readScholarPdf` resolves to.
 * @throws Error when the row has no PDF URL.
 */
async function readNeuripsPaperPdf(
  row: ScholarlyWorkRecord,
  kwargs: Record<string, unknown>,
): Promise<Record<string, unknown>> {
  if (!row.pdf_url) throw new Error(`NeurIPS paper ${row.id} has no PDF URL.`);

  const request = {
    id: row.id,
    title: row.title,
    source_adapter: "neurips",
    source_url: row.source_url,
    pdf_url: row.pdf_url,
    output: kwargs.output,
    filename: kwargs.filename,
    "first-page": kwargs["first-page"] ?? kwargs.firstPage,
    "last-page": kwargs["last-page"] ?? kwargs.lastPage,
    "max-chars": kwargs["max-chars"] ?? kwargs.maxChars,
  };
  const readerOptions = {
    site: "neurips",
    command: "read",
    defaultOutput: "./neurips-downloads",
    userAgent: NEURIPS_USER_AGENT,
  };
  return readScholarPdf(request, readerOptions);
}
|
|
233
|
+
|
|
79
234
|
cli({
|
|
80
235
|
site: "neurips",
|
|
81
236
|
name: "search",
|
|
@@ -100,19 +255,14 @@ cli({
|
|
|
100
255
|
.toLowerCase();
|
|
101
256
|
if (!query) throw new Error("neurips search query cannot be empty.");
|
|
102
257
|
const year = requireYear(kwargs.year);
|
|
103
|
-
const response = await fetch(`${ORIGIN}/paper_files/paper/${year}`, {
|
|
104
|
-
headers: {
|
|
105
|
-
Accept: "text/html",
|
|
106
|
-
"User-Agent":
|
|
107
|
-
"unicli-neurips/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
|
|
108
|
-
},
|
|
109
|
-
});
|
|
110
|
-
if (response.status === 404)
|
|
111
|
-
throw new Error(`NeurIPS ${year} returned no proceedings page.`);
|
|
112
|
-
if (!response.ok)
|
|
113
|
-
throw new Error(`NeurIPS ${year} returned HTTP ${response.status}.`);
|
|
114
258
|
const limit = Math.min(Math.max(Number(kwargs.limit ?? 20), 1), 200);
|
|
115
|
-
const rows = parseNeuripsRows(
|
|
259
|
+
const rows = parseNeuripsRows(
|
|
260
|
+
await fetchNeuripsHtml(
|
|
261
|
+
`${ORIGIN}/paper_files/paper/${year}`,
|
|
262
|
+
`NeurIPS ${year}`,
|
|
263
|
+
),
|
|
264
|
+
year,
|
|
265
|
+
)
|
|
116
266
|
.filter((row) =>
|
|
117
267
|
`${row.title} ${row.authors?.join(" ") ?? ""}`
|
|
118
268
|
.toLowerCase()
|
|
@@ -124,3 +274,97 @@ cli({
|
|
|
124
274
|
return rows;
|
|
125
275
|
},
|
|
126
276
|
});
|
|
277
|
+
|
|
278
|
+
// neurips paper — fetches one abstract page and returns its parsed metadata
// as a single-row result. The year is validated before the id.
cli({
  site: "neurips",
  name: "paper",
  description: "Fetch NeurIPS proceedings paper metadata by page id",
  domain: "proceedings.neurips.cc",
  strategy: Strategy.PUBLIC,
  args: [
    { name: "id", type: "str", required: true, positional: true },
    { name: "year", type: "str", default: "2024" },
  ],
  columns: [
    "id",
    "title",
    "authors",
    "year",
    "venue",
    "doi",
    "pdf_url",
    "source_url",
  ],
  capabilities: ["http.fetch", "scholar.get", "scholar.pdf"],
  func: async (_page, kwargs) => {
    const proceedingsYear = requireYear(kwargs.year);
    const paperId = requireNeuripsPaperId(kwargs.id ?? kwargs.ref);
    const pageUrl = abstractUrl(paperId, proceedingsYear);
    const pageHtml = await fetchNeuripsHtml(
      pageUrl,
      `NeurIPS paper ${paperId}`,
    );
    return [parseNeuripsPaperPage(pageHtml, pageUrl)];
  },
});
|
|
311
|
+
|
|
312
|
+
// neurips read — resolves the paper page, then passes the parsed row to the
// PDF reader; declares `pdftotext` as a required executable.
cli({
  site: "neurips",
  name: "read",
  description:
    "Download a NeurIPS proceedings paper PDF by page id and extract text",
  domain: "proceedings.neurips.cc",
  strategy: Strategy.PUBLIC,
  args: [
    { name: "id", type: "str", required: true, positional: true },
    { name: "year", type: "str", default: "2024" },
    {
      name: "output",
      type: "str",
      default: "./neurips-downloads",
      description: "Output directory for the downloaded PDF",
      "x-unicli-kind": "path",
    },
    { name: "filename", type: "str", description: "Output PDF filename" },
    { name: "first-page", type: "int", default: 1, description: "First page" },
    { name: "last-page", type: "int", default: 20, description: "Last page" },
    {
      name: "max-chars",
      type: "int",
      default: 40000,
      description: "Maximum extracted text characters",
    },
  ],
  columns: [
    "id",
    "title",
    "source_adapter",
    "source_url",
    "pdf_url",
    "path",
    "text_source",
    "text",
    "text_chars",
    "text_truncated",
  ],
  capabilities: [
    "http.fetch",
    "http.download",
    "subprocess.exec",
    "scholar.fulltext",
    "scholar.pdf",
  ],
  executables: ["pdftotext"],
  minimum_capability: "subprocess.exec",
  func: async (_page, kwargs) => {
    const proceedingsYear = requireYear(kwargs.year);
    const paperId = requireNeuripsPaperId(kwargs.id ?? kwargs.ref);
    const pageUrl = abstractUrl(paperId, proceedingsYear);
    const pageHtml = await fetchNeuripsHtml(
      pageUrl,
      `NeurIPS paper ${paperId}`,
    );
    const paperRow = parseNeuripsPaperPage(pageHtml, pageUrl);
    const extracted = await readNeuripsPaperPdf(paperRow, kwargs);
    return [extracted];
  },
});
|
|
@@ -40,8 +40,12 @@ describe("openalex agent-facing commands", () => {
|
|
|
40
40
|
publication_year: 2026,
|
|
41
41
|
cited_by_count: 5,
|
|
42
42
|
authorships: [{ author: { display_name: "Ada" } }],
|
|
43
|
-
primary_location: {
|
|
44
|
-
|
|
43
|
+
primary_location: {
|
|
44
|
+
landing_page_url: "https://publisher.test/paper",
|
|
45
|
+
pdf_url: "https://publisher.test/paper.pdf",
|
|
46
|
+
source: { display_name: "Journal" },
|
|
47
|
+
},
|
|
48
|
+
open_access: { is_oa: true, oa_url: "https://publisher.test/paper" },
|
|
45
49
|
type: "article",
|
|
46
50
|
},
|
|
47
51
|
],
|
|
@@ -62,7 +66,8 @@ describe("openalex agent-facing commands", () => {
|
|
|
62
66
|
is_open_access: true,
|
|
63
67
|
type: "article",
|
|
64
68
|
doi: "10.1/example",
|
|
65
|
-
pdf_url: "",
|
|
69
|
+
pdf_url: "https://publisher.test/paper.pdf",
|
|
70
|
+
landing_url: "https://publisher.test/paper",
|
|
66
71
|
openalex_id: "W1234",
|
|
67
72
|
source_adapter: "openalex",
|
|
68
73
|
source_url: "https://openalex.org/W1234",
|
|
@@ -86,7 +91,11 @@ describe("openalex agent-facing commands", () => {
|
|
|
86
91
|
{ author: { display_name: "Ada" } },
|
|
87
92
|
{ author: { display_name: "Grace" } },
|
|
88
93
|
],
|
|
89
|
-
primary_location: {
|
|
94
|
+
primary_location: {
|
|
95
|
+
landing_page_url: "https://publisher.test/paper",
|
|
96
|
+
source: { display_name: "Journal" },
|
|
97
|
+
},
|
|
98
|
+
best_oa_location: { pdf_url: "https://repository.test/paper.pdf" },
|
|
90
99
|
cited_by_count: 5,
|
|
91
100
|
open_access: { is_oa: true, oa_url: "https://example.test/pdf" },
|
|
92
101
|
referenced_works: ["W1", "W2"],
|
|
@@ -98,6 +107,8 @@ describe("openalex agent-facing commands", () => {
|
|
|
98
107
|
authors: "Ada, Grace",
|
|
99
108
|
referencedCount: 2,
|
|
100
109
|
doi: "10.1/example",
|
|
110
|
+
pdf_url: "https://repository.test/paper.pdf",
|
|
111
|
+
landing_url: "https://publisher.test/paper",
|
|
101
112
|
abstract: "hello world",
|
|
102
113
|
});
|
|
103
114
|
expect(() => mapOpenAlexWorkRow({})).toThrow(
|