@zenalexa/unicli 0.225.2 → 0.225.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/README.md +3 -3
- package/README.zh-CN.md +3 -3
- package/dist/adapters/acl-anthology/papers.d.ts +16 -9
- package/dist/adapters/acl-anthology/papers.d.ts.map +1 -1
- package/dist/adapters/acl-anthology/papers.js +322 -58
- package/dist/adapters/acl-anthology/papers.js.map +1 -1
- package/dist/adapters/arxiv/papers.d.ts +22 -4
- package/dist/adapters/arxiv/papers.d.ts.map +1 -1
- package/dist/adapters/arxiv/papers.js +202 -4
- package/dist/adapters/arxiv/papers.js.map +1 -1
- package/dist/adapters/baidu-scholar/search.d.ts +15 -1
- package/dist/adapters/baidu-scholar/search.d.ts.map +1 -1
- package/dist/adapters/baidu-scholar/search.js +72 -8
- package/dist/adapters/baidu-scholar/search.js.map +1 -1
- package/dist/adapters/biorxiv/preprints.d.ts +9 -0
- package/dist/adapters/biorxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/biorxiv/preprints.js +78 -0
- package/dist/adapters/biorxiv/preprints.js.map +1 -0
- package/dist/adapters/cnki/search.d.ts +82 -0
- package/dist/adapters/cnki/search.d.ts.map +1 -0
- package/dist/adapters/cnki/search.js +236 -0
- package/dist/adapters/cnki/search.js.map +1 -0
- package/dist/adapters/cvf/papers.d.ts +12 -7
- package/dist/adapters/cvf/papers.d.ts.map +1 -1
- package/dist/adapters/cvf/papers.js +210 -27
- package/dist/adapters/cvf/papers.js.map +1 -1
- package/dist/adapters/dblp/publications.d.ts +12 -5
- package/dist/adapters/dblp/publications.d.ts.map +1 -1
- package/dist/adapters/dblp/publications.js +31 -8
- package/dist/adapters/dblp/publications.js.map +1 -1
- package/dist/adapters/google-scholar/search.d.ts +22 -1
- package/dist/adapters/google-scholar/search.d.ts.map +1 -1
- package/dist/adapters/google-scholar/search.js +129 -14
- package/dist/adapters/google-scholar/search.js.map +1 -1
- package/dist/adapters/hf/paper.d.ts +12 -3
- package/dist/adapters/hf/paper.d.ts.map +1 -1
- package/dist/adapters/hf/paper.js +65 -5
- package/dist/adapters/hf/paper.js.map +1 -1
- package/dist/adapters/medrxiv/preprints.d.ts +9 -0
- package/dist/adapters/medrxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/medrxiv/preprints.js +78 -0
- package/dist/adapters/medrxiv/preprints.js.map +1 -0
- package/dist/adapters/neurips/proceedings.d.ts +8 -7
- package/dist/adapters/neurips/proceedings.d.ts.map +1 -1
- package/dist/adapters/neurips/proceedings.js +209 -21
- package/dist/adapters/neurips/proceedings.js.map +1 -1
- package/dist/adapters/openalex/works.d.ts +21 -5
- package/dist/adapters/openalex/works.d.ts.map +1 -1
- package/dist/adapters/openalex/works.js +108 -8
- package/dist/adapters/openalex/works.js.map +1 -1
- package/dist/adapters/openreview/papers.d.ts +10 -4
- package/dist/adapters/openreview/papers.d.ts.map +1 -1
- package/dist/adapters/openreview/papers.js +351 -24
- package/dist/adapters/openreview/papers.js.map +1 -1
- package/dist/adapters/pmlr/proceedings.d.ts +6 -6
- package/dist/adapters/pmlr/proceedings.d.ts.map +1 -1
- package/dist/adapters/pmlr/proceedings.js +92 -12
- package/dist/adapters/pmlr/proceedings.js.map +1 -1
- package/dist/adapters/pubmed/articles.d.ts +8 -4
- package/dist/adapters/pubmed/articles.d.ts.map +1 -1
- package/dist/adapters/pubmed/articles.js +272 -39
- package/dist/adapters/pubmed/articles.js.map +1 -1
- package/dist/adapters/rxiv/preprints.d.ts +75 -0
- package/dist/adapters/rxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/rxiv/preprints.js +651 -0
- package/dist/adapters/rxiv/preprints.js.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf-read.d.ts +49 -0
- package/dist/adapters/scholar-artifacts/pdf-read.d.ts.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf-read.js +204 -0
- package/dist/adapters/scholar-artifacts/pdf-read.js.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf.d.ts +16 -0
- package/dist/adapters/scholar-artifacts/pdf.d.ts.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf.js +122 -0
- package/dist/adapters/scholar-artifacts/pdf.js.map +1 -0
- package/dist/adapters/semantic-scholar/papers.d.ts +6 -6
- package/dist/adapters/semantic-scholar/papers.d.ts.map +1 -1
- package/dist/adapters/semantic-scholar/papers.js +80 -6
- package/dist/adapters/semantic-scholar/papers.js.map +1 -1
- package/dist/adapters/unpaywall/works.d.ts +7 -7
- package/dist/adapters/unpaywall/works.d.ts.map +1 -1
- package/dist/adapters/unpaywall/works.js +104 -12
- package/dist/adapters/unpaywall/works.js.map +1 -1
- package/dist/adapters/wanfang/search.d.ts +14 -0
- package/dist/adapters/wanfang/search.d.ts.map +1 -1
- package/dist/adapters/wanfang/search.js +56 -7
- package/dist/adapters/wanfang/search.js.map +1 -1
- package/dist/browser/page.d.ts +2 -0
- package/dist/browser/page.d.ts.map +1 -1
- package/dist/browser/page.js +12 -0
- package/dist/browser/page.js.map +1 -1
- package/dist/commands/browser/actions.d.ts.map +1 -1
- package/dist/commands/browser/actions.js +59 -3
- package/dist/commands/browser/actions.js.map +1 -1
- package/dist/commands/scholar.d.ts +77 -5
- package/dist/commands/scholar.d.ts.map +1 -1
- package/dist/commands/scholar.js +2945 -83
- package/dist/commands/scholar.js.map +1 -1
- package/dist/core/command-contract.d.ts.map +1 -1
- package/dist/core/command-contract.js +5 -0
- package/dist/core/command-contract.js.map +1 -1
- package/dist/core/schema-v2.d.ts +1 -0
- package/dist/core/schema-v2.d.ts.map +1 -1
- package/dist/core/schema-v2.js +1 -0
- package/dist/core/schema-v2.js.map +1 -1
- package/dist/discovery/aliases.d.ts.map +1 -1
- package/dist/discovery/aliases.js +208 -0
- package/dist/discovery/aliases.js.map +1 -1
- package/dist/discovery/core-catalog.d.ts +2 -0
- package/dist/discovery/core-catalog.d.ts.map +1 -1
- package/dist/discovery/core-catalog.js +487 -0
- package/dist/discovery/core-catalog.js.map +1 -1
- package/dist/discovery/intents.d.ts.map +1 -1
- package/dist/discovery/intents.js +273 -2
- package/dist/discovery/intents.js.map +1 -1
- package/dist/discovery/loader.d.ts.map +1 -1
- package/dist/discovery/loader.js +3 -0
- package/dist/discovery/loader.js.map +1 -1
- package/dist/engine/capability-policy.d.ts.map +1 -1
- package/dist/engine/capability-policy.js +30 -4
- package/dist/engine/capability-policy.js.map +1 -1
- package/dist/engine/kernel/stages.d.ts.map +1 -1
- package/dist/engine/kernel/stages.js +3 -0
- package/dist/engine/kernel/stages.js.map +1 -1
- package/dist/engine/operation-policy.d.ts +4 -1
- package/dist/engine/operation-policy.d.ts.map +1 -1
- package/dist/engine/operation-policy.js +23 -0
- package/dist/engine/operation-policy.js.map +1 -1
- package/dist/fast-path/manifest.d.ts +3 -0
- package/dist/fast-path/manifest.d.ts.map +1 -1
- package/dist/fast-path/manifest.js.map +1 -1
- package/dist/fast-path/policy.d.ts.map +1 -1
- package/dist/fast-path/policy.js +3 -0
- package/dist/fast-path/policy.js.map +1 -1
- package/dist/manifest-compact.txt +1 -1
- package/dist/manifest.json +6804 -1002
- package/dist/registry.d.ts +2 -0
- package/dist/registry.d.ts.map +1 -1
- package/dist/registry.js +1 -0
- package/dist/registry.js.map +1 -1
- package/dist/types/scholarly.d.ts +19 -4
- package/dist/types/scholarly.d.ts.map +1 -1
- package/dist/types/scholarly.js +4 -4
- package/dist/types.d.ts +8 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
- package/server.json +2 -2
- package/skills/unicli/SKILL.md +1 -1
- package/skills/unicli-claude-code/SKILL.md +1 -1
- package/skills/unicli-hermes/SKILL.md +1 -1
- package/src/adapters/acl-anthology/papers.test.ts +111 -0
- package/src/adapters/acl-anthology/papers.ts +379 -71
- package/src/adapters/arxiv/papers.test.ts +46 -0
- package/src/adapters/arxiv/papers.ts +251 -4
- package/src/adapters/baidu-scholar/search.ts +74 -11
- package/src/adapters/biorxiv/preprints.ts +112 -0
- package/src/adapters/cnki/search.ts +357 -0
- package/src/adapters/cvf/papers.ts +260 -27
- package/src/adapters/dblp/publications.test.ts +9 -0
- package/src/adapters/dblp/publications.ts +31 -8
- package/src/adapters/google-scholar/search.ts +165 -17
- package/src/adapters/hf/paper.test.ts +23 -0
- package/src/adapters/hf/paper.ts +89 -5
- package/src/adapters/hf/top.yaml +34 -2
- package/src/adapters/huggingface-papers/daily.yaml +37 -3
- package/src/adapters/huggingface-papers/search.yaml +43 -9
- package/src/adapters/medrxiv/preprints.ts +112 -0
- package/src/adapters/neurips/proceedings.ts +266 -22
- package/src/adapters/openalex/works.test.ts +15 -4
- package/src/adapters/openalex/works.ts +136 -8
- package/src/adapters/openreview/papers.test.ts +31 -0
- package/src/adapters/openreview/papers.ts +407 -29
- package/src/adapters/pmlr/proceedings.ts +102 -12
- package/src/adapters/pubmed/articles.test.ts +88 -1
- package/src/adapters/pubmed/articles.ts +343 -44
- package/src/adapters/rxiv/preprints.test.ts +233 -0
- package/src/adapters/rxiv/preprints.ts +849 -0
- package/src/adapters/scholar-artifacts/pdf-read.ts +277 -0
- package/src/adapters/scholar-artifacts/pdf.ts +133 -0
- package/src/adapters/semantic-scholar/papers.ts +98 -6
- package/src/adapters/unpaywall/works.ts +141 -12
- package/src/adapters/wanfang/search.ts +57 -7
- package/src/adapters/cnki/search.yaml +0 -49
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @owner src::adapters::cnki::search
|
|
3
|
+
* @does Registers CNKI Scholar title search against the current public KNS criteria endpoint.
|
|
4
|
+
* @needs scholar.cnki.net KNS criteria API, node:crypto, src/registry.ts, src/types/scholarly.ts
|
|
5
|
+
* @feeds `unicli cnki search`, src/commands/scholar.ts via scholar.search, scholar doctor live probes
|
|
6
|
+
* @breaks CNKI token/request-shape drift surfaces as upstream_error; access-controlled PDF/order URLs are exposed as relation hints, not as a scholar.pdf guarantee.
|
|
7
|
+
* @invariants Search uses CNKI's current all-database class id; rows must carry source-local id/title/source_adapter/retrieved_at; no stale API fallback is kept.
|
|
8
|
+
* @side-effects HTTPS egress to scholar.cnki.net only
|
|
9
|
+
* @perf O(limit) JSON mapping after one POST request.
|
|
10
|
+
* @concurrency safe
|
|
11
|
+
* @test tests/unit/adapters/scholar-sources.test.ts; live smoke via `unicli cnki search <query>` and `unicli scholar doctor --sources cnki --live`
|
|
12
|
+
* @stability experimental
|
|
13
|
+
* @since 2026-06-27
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { createCipheriv } from "node:crypto";
|
|
17
|
+
|
|
18
|
+
import { cli, Strategy } from "../../registry.js";
|
|
19
|
+
import type { ScholarlyWorkRecord } from "../../types/scholarly.js";
|
|
20
|
+
|
|
21
|
+
// Endpoint that fetchCnkiSearch POSTs the criteria payload to.
const CNKI_QUERY_API = "https://scholar.cnki.net/restapi/kns8s-api/v2/criteria/query";
// Key material used by createCnkiVvToken to encrypt the `vv` query parameter.
const CNKI_TOKEN_SECRET = "cf4e8f25360248f89248af06a55d21ea";
// Sent verbatim as the `clientId` query parameter.
const CNKI_CLIENT_ID = "c5fd4ef0-d314-4888-b0a7-f6190eaefaf0";
// Value placed in the payload's `Classid` field (the all-database class id per the file header).
const CNKI_ALL_DATABASE_CLASS_ID = "WD0FTY92";
// Sent as the Referer header on every search request.
const CNKI_REFERER = "https://scholar.cnki.net/";
// Upper bound applied to the user-supplied `limit` before it becomes `pageSize`.
const MAX_LIMIT = 50;
|
|
28
|
+
|
|
29
|
+
/** One name/value pair from a result row's `metadata` list; both sides are untrusted. */
interface CnkiMetadataEntry {
  /** Field code, compared case-insensitively by metadataValue (e.g. "TI", "DOI"). */
  name?: unknown;
  /** Raw field value; may contain HTML markup that stripHtml removes. */
  value?: unknown;
}
|
|
33
|
+
|
|
34
|
+
/** A link attached to a row or its source, keyed by a scope label. */
interface CnkiRelation {
  /** Scope label, compared case-insensitively by relationUrl (e.g. "PDF", "ABSTRACT"). */
  scope?: unknown;
  /** Target URL for that scope. */
  url?: unknown;
}
|
|
38
|
+
|
|
39
|
+
/** Author entry of a result row; only `title` is read (as the display name). */
interface CnkiAuthor {
  title?: unknown;
}
|
|
42
|
+
|
|
43
|
+
/** Publication source block of a result row; used as a fallback for venue, type, year and links. */
interface CnkiSource {
  title?: unknown;
  type?: unknown;
  year?: unknown;
  relations?: Array<CnkiRelation>;
}
|
|
49
|
+
|
|
50
|
+
/** Shape of one raw search hit as consumed by mapCnkiSearchRow; every part is optional. */
export interface CnkiSearchRow {
  metadata?: Array<CnkiMetadataEntry>;
  relations?: Array<CnkiRelation>;
  authors?: Array<CnkiAuthor>;
  source?: CnkiSource;
}
|
|
56
|
+
|
|
57
|
+
/** Envelope returned by the criteria endpoint; fetchCnkiSearch accepts it only when `code` is 0. */
interface CnkiSearchResponse {
  code?: unknown;
  message?: unknown;
  data?: {
    total?: unknown;
    /** Result rows; the command handler treats a non-array as "no rows". */
    data?: Array<CnkiSearchRow>;
  };
}
|
|
65
|
+
|
|
66
|
+
/**
 * JSON body POSTed to the criteria endpoint. Property names and casing are
 * serialized as-is, so they are part of the wire format; buildCnkiSearchPayload
 * populates every field.
 */
interface CnkiSearchPayload {
  Resource: string;
  Classid: string;
  Products: string;
  KuaKuCode: string;
  QNode: {
    QGroup: {
      Key: string;
      Title: string;
      Logic: number;
      Items: unknown[];
      ChildItems: {
        Key: string;
        Title: string;
        Logic: number;
        // Leaf conditions: which field to match, how, and against what value.
        Items: {
          Key: string;
          Title: string;
          Logic: number;
          Field: string;
          Operator: string;
          Value: string;
        }[];
        ChildItems: unknown[];
      }[];
    }[];
  };
  ExScope: string;
  SearchType: number;
  SearchFrom: number;
  Rlang: string;
  sort: string;
  sortType: string;
  pageNum: number;
  pageSize: number;
}
|
|
102
|
+
|
|
103
|
+
/** Error carrying the optional machine-readable fields this adapter attaches before throwing. */
type ActionableError = Error & {
  /** Error category, e.g. "upstream_error" or "invalid_input". */
  code?: string;
  /** Human-readable next step for the caller. */
  suggestion?: string;
  /** Whether retrying the same call may succeed. */
  retryable?: boolean;
};
|
|
108
|
+
|
|
109
|
+
/** Returns `value` trimmed when it is a string; any other type yields "". */
function str(value: unknown): string {
  if (typeof value !== "string") {
    return "";
  }
  return value.trim();
}
|
|
112
|
+
|
|
113
|
+
/**
 * Converts an untrusted CNKI field into plain text.
 *
 * Non-string input yields "". Tags are removed first, then the common HTML
 * entities are decoded, and finally all whitespace runs collapse to a single
 * space with the ends trimmed.
 *
 * @param value Raw field value (may contain highlight markup and entities).
 * @returns Plain, single-spaced text.
 */
function stripHtml(value: unknown): string {
  const raw = typeof value === "string" ? value : "";
  return (
    raw
      .replace(/<[^>]*>/g, "")
      .replace(/&nbsp;/g, " ")
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      // `&amp;` goes last so `&amp;lt;` decodes to the literal text `&lt;`, not `<`.
      .replace(/&amp;/g, "&")
      .replace(/\s+/g, " ")
      .trim()
  );
}
|
|
125
|
+
|
|
126
|
+
/**
 * Looks up the first metadata entry whose name equals `name` (entry names are
 * upper-cased before comparison, so `name` must be passed in upper case) and
 * returns its value as plain text, or "" when there is no such entry.
 */
function metadataValue(row: CnkiSearchRow, name: string): string {
  const matchesName = (candidate: CnkiMetadataEntry): boolean =>
    str(candidate.name).toUpperCase() === name;
  const hit = row.metadata?.find(matchesName);
  const rawValue = hit?.value;
  return stripHtml(rawValue);
}
|
|
132
|
+
|
|
133
|
+
/**
 * Returns the URL of the first relation whose scope equals `scope`
 * (relation scopes are upper-cased before comparison), or "" when the list is
 * missing or has no match.
 */
function relationUrl(
  relations: CnkiRelation[] | undefined,
  scope: string,
): string {
  if (relations === undefined) {
    return "";
  }
  const hit = relations.find(
    (candidate) => str(candidate.scope).toUpperCase() === scope,
  );
  return str(hit?.url);
}
|
|
142
|
+
|
|
143
|
+
/**
 * Walks `scopes` in priority order and returns the first non-empty URL found,
 * checking the row's own relations before its source's relations for each
 * scope. Returns "" when nothing matches.
 */
function firstRelationUrl(row: CnkiSearchRow, scopes: string[]): string {
  for (const scope of scopes) {
    const hit =
      relationUrl(row.relations, scope) ||
      relationUrl(row.source?.relations, scope);
    if (hit) {
      return hit;
    }
  }
  return "";
}
|
|
152
|
+
|
|
153
|
+
/** Extracts the first 19xx/20xx four-digit run from `value`, or undefined when there is none. */
function parseYear(value: string): number | undefined {
  const found = /(?:19|20)\d{2}/.exec(value);
  if (found === null) {
    return undefined;
  }
  return Number(found[0]);
}
|
|
157
|
+
|
|
158
|
+
/**
 * Splits a flat author string into individual names.
 *
 * Both the ASCII semicolon and the full-width semicolon (U+FF1B, written as an
 * escape so the pattern survives re-encoding) act as separators. Names are
 * trimmed and empty segments are dropped.
 *
 * @param value Author list as a single string.
 * @returns Author names in their original order; [] for empty input.
 */
function splitMetadataAuthors(value: string): string[] {
  return value
    .split(/[;\uFF1B]/)
    .map((author) => author.trim())
    .filter(Boolean);
}
|
|
164
|
+
|
|
165
|
+
/**
 * Builds (does not throw) a non-retryable ActionableError for CNKI failures.
 *
 * @param message Error message.
 * @param code Error category; defaults to "upstream_error".
 */
function cnkiError(message: string, code = "upstream_error"): ActionableError {
  return Object.assign(new Error(message), {
    code,
    suggestion:
      "CNKI changed or rejected the public KNS criteria query. Run `unicli describe cnki search`, then inspect scholar.cnki.net's current search request before changing the adapter.",
    retryable: false,
  }) as ActionableError;
}
|
|
173
|
+
|
|
174
|
+
/**
 * Produces the `vv` request token: the JSON text `{"timestamp":<timestamp>}`
 * encrypted with AES-256-ECB (auto padding on) under CNKI_TOKEN_SECRET and
 * rendered as lowercase hex.
 *
 * @param timestamp Value embedded in the token; defaults to the current time.
 */
export function createCnkiVvToken(timestamp = Date.now()): string {
  const key = Buffer.from(CNKI_TOKEN_SECRET, "utf8");
  // ECB takes no IV, hence the explicit null.
  const cipher = createCipheriv("aes-256-ecb", key, null);
  cipher.setAutoPadding(true);
  const plaintext = JSON.stringify({ timestamp });
  const head = cipher.update(plaintext, "utf8", "hex");
  const tail = cipher.final("hex");
  return head + tail;
}
|
|
186
|
+
|
|
187
|
+
/**
 * Builds the request body for a title search.
 *
 * The only variable parts are `query` (the value of the single "TI" fuzzy
 * condition) and `limit` (the page size); the first page is always requested,
 * sorted by "PT" descending. Property order is kept stable because the object
 * is serialized directly.
 *
 * @param query Title keyword.
 * @param limit Number of rows to request.
 */
export function buildCnkiSearchPayload(
  query: string,
  limit: number,
): CnkiSearchPayload {
  const titleCondition = {
    Key: "",
    Title: "题名",
    Logic: 0,
    Field: "TI",
    Operator: "FUZZY",
    Value: query,
  };
  const subjectGroup = {
    Key: "subject",
    Title: "",
    Logic: 0,
    Items: [titleCondition],
    ChildItems: [],
  };
  const rootGroup = {
    Key: "",
    Title: "",
    Logic: 0,
    Items: [],
    ChildItems: [subjectGroup],
  };
  return {
    Resource: "",
    Classid: CNKI_ALL_DATABASE_CLASS_ID,
    Products: "",
    KuaKuCode: "",
    QNode: { QGroup: [rootGroup] },
    ExScope: "1",
    SearchType: 2,
    SearchFrom: 1,
    Rlang: "",
    sort: "PT",
    sortType: "DESC",
    pageNum: 1,
    pageSize: limit,
  };
}
|
|
234
|
+
|
|
235
|
+
/**
 * Maps one raw CNKI hit to a scholarly work record.
 *
 * Field sources: title from "TI" then "ENTI"; authors from the structured
 * `authors` list, falling back to the "AU" metadata string; venue from "LY"
 * then the source title; year from the "PT" date, then the source year. The id
 * is the first non-empty of "ID", "FN", DOI, landing URL, title.
 *
 * A missing or non-numeric "CF" value yields `cited_by_count: undefined`
 * (an absent count is not the same as zero citations).
 *
 * @param row Raw search row.
 * @param rank 1-based position of the row in the result list.
 * @returns The mapped record, including `rank` and an optional `pdf_url`.
 * @throws ActionableError (code "upstream_error") when the row has no title.
 */
export function mapCnkiSearchRow(
  row: CnkiSearchRow,
  rank: number,
): ScholarlyWorkRecord & { rank: number; pdf_url?: string } {
  const title = metadataValue(row, "TI") || metadataValue(row, "ENTI");
  // Also swallow whitespace after a "doi:" prefix so "doi: 10.x" has no leading space.
  const doi = metadataValue(row, "DOI").replace(/^doi:\s*/i, "");
  const date = metadataValue(row, "PT");
  const source = metadataValue(row, "LY") || stripHtml(row.source?.title);
  const authorNames =
    row.authors?.map((author) => stripHtml(author.title)).filter(Boolean) ?? [];
  const authors = authorNames.length
    ? authorNames
    : splitMetadataAuthors(metadataValue(row, "AU"));
  const sourceUrl = firstRelationUrl(row, ["ABSTRACT", "PUBLICATION"]);
  const pdfUrl = firstRelationUrl(row, ["PDF"]);
  const id =
    metadataValue(row, "ID") ||
    metadataValue(row, "FN") ||
    doi ||
    sourceUrl ||
    title;
  if (!id || !title) {
    throw cnkiError("CNKI returned a row without a stable id or title.");
  }
  // Number("") is 0, so an absent count must be detected before converting.
  const citedByRaw = metadataValue(row, "CF");
  const citedByCount = citedByRaw === "" ? Number.NaN : Number(citedByRaw);
  return {
    id,
    rank,
    title,
    authors,
    year: parseYear(date) ?? parseYear(str(row.source?.year)),
    date: date || undefined,
    venue: source || undefined,
    type: metadataValue(row, "DB") || stripHtml(row.source?.type) || undefined,
    abstract: metadataValue(row, "AB") || undefined,
    doi: doi || undefined,
    cited_by_count: Number.isFinite(citedByCount) ? citedByCount : undefined,
    pdf_url: pdfUrl || undefined,
    landing_url: sourceUrl || undefined,
    source_adapter: "cnki",
    source_url: sourceUrl || undefined,
    retrieved_at: new Date().toISOString(),
  };
}
|
|
279
|
+
|
|
280
|
+
/**
 * POSTs `payload` to the CNKI criteria endpoint and returns the parsed envelope.
 *
 * A fresh `vv` token and the fixed client id are sent as query parameters.
 * Every failure mode (network rejection, non-2xx status, unparseable or
 * non-object body, non-zero `code`) is reported as an ActionableError with
 * code "upstream_error", matching the contract stated in the file header.
 *
 * @param payload Request body built by buildCnkiSearchPayload.
 * @returns The response envelope, guaranteed to have `code` equal to 0.
 * @throws ActionableError on any transport, HTTP, parsing, or API-level failure.
 */
async function fetchCnkiSearch(
  payload: CnkiSearchPayload,
): Promise<CnkiSearchResponse> {
  const url = new URL(CNKI_QUERY_API);
  url.searchParams.set("vv", createCnkiVvToken());
  url.searchParams.set("clientId", CNKI_CLIENT_ID);

  let response: Awaited<ReturnType<typeof fetch>>;
  try {
    response = await fetch(url, {
      method: "POST",
      headers: {
        Accept: "application/json, text/plain, */*",
        "Content-Type": "application/json;charset=UTF-8",
        Origin: "https://scholar.cnki.net",
        Referer: CNKI_REFERER,
        "User-Agent":
          "Mozilla/5.0 (compatible; Uni-CLI/1.0; +https://github.com/olo-dot-io/Uni-CLI)",
        Version: "",
      },
      body: JSON.stringify(payload),
    });
  } catch (error) {
    // fetch rejects on DNS/TLS/connection failures; keep the original as `cause`.
    const detail = error instanceof Error ? error.message : String(error);
    throw Object.assign(cnkiError(`CNKI search request failed: ${detail}`), {
      cause: error,
    });
  }
  if (!response.ok) {
    throw cnkiError(`CNKI search returned HTTP ${response.status}.`);
  }

  let json: CnkiSearchResponse | null;
  try {
    json = (await response.json()) as CnkiSearchResponse | null;
  } catch (error) {
    // A block page or HTML error body is upstream drift, not a local bug.
    throw Object.assign(cnkiError("CNKI search returned a non-JSON response."), {
      cause: error,
    });
  }
  if (json === null || typeof json !== "object") {
    throw cnkiError("CNKI search returned an unexpected response body.");
  }

  const code = Number(json.code);
  if (code !== 0) {
    throw cnkiError(
      `CNKI search returned code ${String(json.code)}: ${str(json.message)}`,
    );
  }
  return json;
}
|
|
311
|
+
|
|
312
|
+
// Registers `unicli cnki search <query> [--limit N]`.
cli({
  site: "cnki",
  name: "search",
  description: "Search CNKI academic papers by title",
  domain: "scholar.cnki.net",
  strategy: Strategy.PUBLIC,
  adapter_path: "src/adapters/cnki/search.ts",
  args: [
    { name: "query", type: "str", required: true, positional: true },
    { name: "limit", type: "int", default: 20 },
  ],
  columns: [
    "rank",
    "id",
    "title",
    "authors",
    "venue",
    "year",
    "doi",
    "cited_by_count",
    "pdf_url",
    "source_url",
  ],
  capabilities: ["http.fetch", "scholar.search"],
  func: async (_page, kwargs) => {
    const query = str(kwargs.query);
    if (!query) {
      // Input errors carry their own suggestion, so cnkiError is not used here.
      throw Object.assign(new Error("cnki search query cannot be empty."), {
        code: "invalid_input",
        suggestion:
          "Pass a CNKI title keyword, for example `unicli cnki search 人工智能`.",
        retryable: false,
      });
    }

    // Non-numeric limits fall back to 20; numeric ones are truncated and clamped to [1, MAX_LIMIT].
    const requestedLimit = Number(kwargs.limit ?? 20);
    let limit = 20;
    if (Number.isFinite(requestedLimit)) {
      const whole = Math.trunc(requestedLimit);
      limit = Math.min(Math.max(whole, 1), MAX_LIMIT);
    }

    const response = await fetchCnkiSearch(buildCnkiSearchPayload(query, limit));
    const hits = response.data?.data;
    const rows = Array.isArray(hits) ? hits : [];
    // Ranks are 1-based positions within the returned page.
    return rows.map((row, index) => mapCnkiSearchRow(row, index + 1));
  },
});
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @owner src::adapters::cvf::papers
|
|
3
|
-
* @does Registers CVF OpenAccess conference paper search for CVPR/ICCV/ECCV-style proceedings pages.
|
|
4
|
-
* @needs openaccess.thecvf.com static proceedings HTML, src/registry.ts
|
|
5
|
-
* @feeds src/commands/scholar.ts via scholar.search, scholar.pdf, and scholar.venue
|
|
6
|
-
* @breaks CVF markup drift
|
|
7
|
-
* @invariants Venue/year map to explicit CVF event pages;
|
|
8
|
-
* @side-effects HTTPS egress to openaccess.thecvf.com
|
|
9
|
-
* @perf O(N) over one proceedings HTML page
|
|
3
|
+
* @does Registers CVF OpenAccess conference paper search, detail retrieval, and PDF text reading for CVPR/ICCV/ECCV-style proceedings pages.
|
|
4
|
+
* @needs openaccess.thecvf.com static proceedings HTML/PDFs, src/adapters/scholar-artifacts/pdf-read.ts, src/registry.ts
|
|
5
|
+
* @feeds src/commands/scholar.ts via scholar.search, scholar.get, scholar.pdf, scholar.fulltext, and scholar.venue
|
|
6
|
+
* @breaks CVF markup/PDF drift, denied downloads, or missing pdftotext surface as explicit adapter errors rather than non-CVF fallbacks.
|
|
7
|
+
* @invariants Venue/year map to explicit CVF event pages; paper detail prefers citation_* metadata over scraped display blocks.
|
|
8
|
+
* @side-effects HTTPS egress to openaccess.thecvf.com; read writes one PDF artifact and executes pdftotext.
|
|
9
|
+
* @perf O(N) over one proceedings HTML page; read is O(PDF bytes + selected pages)
|
|
10
10
|
* @concurrency safe
|
|
11
11
|
* @test tests/unit/adapters/scholar-sources.test.ts
|
|
12
12
|
* @stability experimental
|
|
@@ -15,8 +15,23 @@
|
|
|
15
15
|
|
|
16
16
|
import { cli, Strategy } from "../../registry.js";
|
|
17
17
|
import type { ScholarlyWorkRecord } from "../../types/scholarly.js";
|
|
18
|
+
import { readScholarPdf } from "../scholar-artifacts/pdf-read.js";
|
|
19
|
+
import { Agent, request } from "undici";
|
|
18
20
|
|
|
19
21
|
const ORIGIN = "https://openaccess.thecvf.com";
|
|
22
|
+
// Identifies this adapter to CVF; reused for HTML fetches and PDF reads.
const CVF_USER_AGENT = "unicli-cvf/1.0 (https://github.com/olo-dot-io/Uni-CLI)";
// Dedicated undici dispatcher with a 30_000 connect timeout.
const CVF_HTTP_AGENT = new Agent({ connect: { timeout: 30_000 } });
// Request headers sent by fetchCvfHtml; exported for reuse outside this module.
export const CVF_HTTP_HEADERS = {
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "User-Agent": CVF_USER_AGENT,
} as const;
|
|
28
|
+
|
|
29
|
+
/** Error carrying the optional machine-readable fields set by cvfUpstreamError. */
type CvfActionableError = Error & {
  /** Error category, e.g. "upstream_error". */
  code?: string;
  /** Human-readable next step for the caller. */
  suggestion?: string;
  /** Whether retrying the same call may succeed. */
  retryable?: boolean;
  /** Alternative commands to try; cvfUpstreamError always sets an empty list. */
  alternatives?: string[];
};
|
|
20
35
|
|
|
21
36
|
function decode(value: string): string {
|
|
22
37
|
return value
|
|
@@ -35,6 +50,26 @@ function absolute(path: string): string {
|
|
|
35
50
|
: `${ORIGIN}${path.startsWith("/") ? "" : "/"}${path}`;
|
|
36
51
|
}
|
|
37
52
|
|
|
53
|
+
/**
 * Collects the `content` values of every `<meta name="NAME" content="...">`
 * tag in `html`, in document order, passing each through `decode`.
 *
 * Only tags with `name` immediately followed by `content` are matched. The
 * closing quote must be the same character as the opening one, so a
 * double-quoted value may contain apostrophes (and vice versa) without being
 * cut short.
 *
 * @param html Page markup to scan.
 * @param name Meta tag name, matched case-insensitively and literally.
 * @returns Decoded content values; [] when no tag matches.
 */
function metaContents(html: string, name: string): string[] {
  // Escape regex metacharacters so `name` is always treated as literal text.
  const literalName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(
    `<meta\\s+name=["']${literalName}["']\\s+content=(["'])([\\s\\S]*?)\\1[^>]*>`,
    "gi",
  );
  const values: string[] = [];
  let match: RegExpExecArray | null;
  // Group 1 is the quote character, group 2 the attribute value.
  while ((match = re.exec(html)) !== null) values.push(decode(match[2]));
  return values;
}
|
|
63
|
+
|
|
64
|
+
/** Returns the first value metaContents finds for `name`, or "" when there is none. */
function firstMetaContent(html: string, name: string): string {
  const [first] = metaContents(html, name);
  return first ?? "";
}
|
|
67
|
+
|
|
68
|
+
/** Returns the first run of four digits in `value` as a number, or undefined when there is none. */
function parseYear(value: string): number | undefined {
  const found = /\d{4}/.exec(value);
  return found === null ? undefined : Number(found[0]);
}
|
|
72
|
+
|
|
38
73
|
function eventId(venue: unknown, year: unknown): string {
|
|
39
74
|
const v = String(venue ?? "CVPR")
|
|
40
75
|
.trim()
|
|
@@ -46,6 +81,89 @@ function eventId(venue: unknown, year: unknown): string {
|
|
|
46
81
|
return `${v}${y}`;
|
|
47
82
|
}
|
|
48
83
|
|
|
84
|
+
/**
 * Normalizes user input into a CVF paper id.
 *
 * Accepts either a paper page URL (the file name after `/html/` is used) or a
 * bare id, with or without a trailing `.html`. The result may contain only
 * letters, digits, `_`, `.` and `-`.
 *
 * @throws Error when the normalized id is empty or has other characters.
 */
function requireCvfPaperId(value: unknown): string {
  const raw = String(value ?? "").trim();
  const fromUrl = /\/html\/([^/?#]+\.html)/.exec(raw);
  const candidate = fromUrl === null ? raw : fromUrl[1];
  const id = candidate.replace(/\.html$/, "");
  if (/^[A-Za-z0-9_.-]+$/.test(id)) {
    return id;
  }
  throw new Error(`CVF paper id "${raw}" is not valid.`);
}
|
|
94
|
+
|
|
95
|
+
/** Builds the paper page URL `<ORIGIN>/content/<event>/html/<id>.html`. */
function paperUrl(id: string, event: string): string {
  const segments = [ORIGIN, "content", event, "html", `${id}.html`];
  return segments.join("/");
}
|
|
98
|
+
|
|
99
|
+
/**
 * Builds (does not throw) an "upstream_error" for a failed CVF request.
 *
 * The error is marked retryable when `detail` looks transient: a fetch or
 * timeout failure, a connection reset, or an HTTP 429/5xx status.
 *
 * @param label What was being fetched; prefixes the message.
 * @param detail Low-level failure text, e.g. "HTTP 503".
 */
function cvfUpstreamError(label: string, detail: string): CvfActionableError {
  const looksTransient =
    /fetch failed|timeout|ECONNRESET|ETIMEDOUT|HTTP (429|5\d\d)/i.test(detail);
  return Object.assign(new Error(`${label} failed: ${detail}.`), {
    code: "upstream_error",
    suggestion:
      "CVF OpenAccess did not return the expected public proceedings page on this network path; retry later or verify the official openaccess.thecvf.com page manually.",
    retryable: looksTransient,
    alternatives: [],
  }) as CvfActionableError;
}
|
|
109
|
+
|
|
110
|
+
/**
 * Extracts author names from one proceedings-list entry.
 *
 * Only the markup before the first `[<a href=` link is considered; within it,
 * the first `<dd>` body is taken, anything from a `bibref` div onward is
 * dropped, tags are removed, and a trailing `; Proceedings...` suffix is cut.
 * The remainder is split on commas.
 *
 * @returns Trimmed author names, or undefined when none are found.
 */
function parseListAuthors(block: string): string[] | undefined {
  const [beforeLinks = block] = block.split(/\[<a\s+href=/i);
  const ddMatch = /<dd>([\s\S]*?)(?:<\/dd>|$)/i.exec(beforeLinks);
  const ddBody = ddMatch?.[1] ?? "";
  const withoutBibref = ddBody.replace(/<div class="bibref[\s\S]*$/i, " ");
  const text = decode(withoutBibref.replace(/<[^>]+>/g, " "));
  const authorList = text.replace(/;\s*Proceedings[\s\S]*$/i, "");

  const authors: string[] = [];
  for (const piece of authorList.split(",")) {
    const name = piece.trim();
    if (name) {
      authors.push(name);
    }
  }
  return authors.length > 0 ? authors : undefined;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Parses a CVF paper detail page into a scholarly work record.
 *
 * `citation_*` meta tags are preferred; scraped display blocks are fallbacks
 * for the title and PDF link. The event (e.g. "CVPR2024") and the id are taken
 * from `sourceUrl`; the event supplies year and venue when the meta tags do not.
 *
 * For both title and abstract, tags are stripped BEFORE entities are decoded,
 * so text that only becomes `<...>` after decoding is not mistaken for markup.
 *
 * @param html Markup of the paper page.
 * @param sourceUrl URL the page was fetched from.
 * @returns The parsed record with `source_adapter` set to "cvf".
 * @throws Error when no title can be found.
 */
export function parseCvfPaperPage(
  html: string,
  sourceUrl: string,
): ScholarlyWorkRecord {
  const title =
    firstMetaContent(html, "citation_title") ||
    decode(
      html
        .match(/<div id="papertitle">([\s\S]*?)<dd>/i)?.[1]
        ?.replace(/<[^>]+>/g, " ") ?? "",
    );
  if (!title) throw new Error("CVF paper page did not expose a title.");
  const event = sourceUrl.match(/\/content\/([A-Z]+\d{4})\//)?.[1] ?? undefined;
  const id =
    sourceUrl
      .split("/")
      .pop()
      ?.replace(/\.html$/, "") ?? title;
  const pdfUrl =
    firstMetaContent(html, "citation_pdf_url") ||
    html.match(/<a href="([^"]+\.pdf)">pdf<\/a>/i)?.[1] ||
    "";
  // Strip markup first, then decode: the reverse order would delete decoded
  // "<" ... ">" text (e.g. "a &lt; b and c &gt; d") as if it were a tag.
  const abstractMarkup =
    html.match(/<div id="abstract">([\s\S]*?)<\/div>/i)?.[1] ?? "";
  const abstract = decode(abstractMarkup.replace(/<[^>]+>/g, " "));
  return {
    id,
    title,
    authors: metaContents(html, "citation_author"),
    year:
      parseYear(firstMetaContent(html, "citation_publication_date")) ??
      (event ? Number(event.slice(-4)) : undefined),
    venue:
      firstMetaContent(html, "citation_conference_title") ||
      event?.replace(/\d{4}$/, ""),
    abstract: abstract || undefined,
    pdf_url: pdfUrl ? absolute(pdfUrl) : undefined,
    source_adapter: "cvf",
    source_url: sourceUrl,
    retrieved_at: new Date().toISOString(),
  };
}
|
|
166
|
+
|
|
49
167
|
export function parseCvfRows(
|
|
50
168
|
html: string,
|
|
51
169
|
event = "CVPR2024",
|
|
@@ -59,14 +177,6 @@ export function parseCvfRows(
|
|
|
59
177
|
const title = decode(match[2].replace(/<[^>]+>/g, " "));
|
|
60
178
|
const block = match[3];
|
|
61
179
|
const pdf = block.match(/<a href="([^"]+\.pdf)">pdf<\/a>/i)?.[1] ?? "";
|
|
62
|
-
const authorText = block
|
|
63
|
-
.replace(/\[[\s\S]*?\]/g, " ")
|
|
64
|
-
.replace(/<form[\s\S]*?<\/form>/g, " ")
|
|
65
|
-
.replace(/<[^>]+>/g, " ");
|
|
66
|
-
const authors = decode(authorText)
|
|
67
|
-
.split(",")
|
|
68
|
-
.map((author) => author.trim())
|
|
69
|
-
.filter(Boolean);
|
|
70
180
|
out.push({
|
|
71
181
|
id:
|
|
72
182
|
sourceUrl
|
|
@@ -74,7 +184,7 @@ export function parseCvfRows(
|
|
|
74
184
|
.pop()
|
|
75
185
|
?.replace(/\.html$/, "") ?? title,
|
|
76
186
|
title,
|
|
77
|
-
authors:
|
|
187
|
+
authors: parseListAuthors(block),
|
|
78
188
|
year: Number(event.slice(-4)),
|
|
79
189
|
venue: event.replace(/\d{4}$/, ""),
|
|
80
190
|
pdf_url: pdf ? absolute(pdf) : undefined,
|
|
@@ -86,6 +196,54 @@ export function parseCvfRows(
|
|
|
86
196
|
return out;
|
|
87
197
|
}
|
|
88
198
|
|
|
199
|
+
/**
 * Fetches `url` through the shared CVF dispatcher and returns the body text.
 *
 * @param url Page to fetch.
 * @param label Human-readable name of the page, used in error messages.
 * @returns The response body as text for any 2xx status.
 * @throws Error `"<label> returned no page."` on HTTP 404.
 * @throws CvfActionableError (code "upstream_error") when the request fails,
 *   the status is any other non-2xx value, or the body cannot be read.
 */
async function fetchCvfHtml(url: string, label: string): Promise<string> {
  const describe = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  let response: Awaited<ReturnType<typeof request>>;
  try {
    response = await request(url, {
      dispatcher: CVF_HTTP_AGENT,
      headers: CVF_HTTP_HEADERS,
    });
  } catch (error) {
    throw cvfUpstreamError(label, describe(error));
  }

  const { statusCode } = response;
  if (statusCode < 200 || statusCode >= 300) {
    // undici keeps the pooled connection busy until the body is consumed, so
    // drain it before reporting the failure. Draining is best-effort: the
    // status error below is the one worth surfacing.
    await response.body.dump().catch(() => undefined);
    if (statusCode === 404) {
      throw new Error(`${label} returned no page.`);
    }
    throw cvfUpstreamError(label, `HTTP ${statusCode}`);
  }

  try {
    return await response.body.text();
  } catch (error) {
    // A connection dropped mid-body is an upstream failure like any other.
    throw cvfUpstreamError(label, describe(error));
  }
}
|
|
219
|
+
|
|
220
|
+
/**
 * Hand a parsed CVF paper record to `readScholarPdf`.
 *
 * @param row - Paper record; its `pdf_url` must be set.
 * @param kwargs - Command arguments. Page-range and size options are read
 *   under their dashed names first, then their camelCase names.
 * @returns Whatever `readScholarPdf` resolves to.
 * @throws Error when the record has no PDF URL.
 */
async function readCvfPaperPdf(
  row: ScholarlyWorkRecord,
  kwargs: Record<string, unknown>,
): Promise<Record<string, unknown>> {
  const { id, title, source_url, pdf_url } = row;
  if (!pdf_url) {
    throw new Error(`CVF paper ${id} has no PDF URL.`);
  }

  // Paper identity plus the caller's output and extraction settings.
  const paper = {
    id,
    title,
    source_adapter: "cvf",
    source_url,
    pdf_url,
    output: kwargs.output,
    filename: kwargs.filename,
    "first-page": kwargs["first-page"] ?? kwargs.firstPage,
    "last-page": kwargs["last-page"] ?? kwargs.lastPage,
    "max-chars": kwargs["max-chars"] ?? kwargs.maxChars,
  };

  // Adapter-level settings for this command.
  const context = {
    site: "cvf",
    command: "read",
    defaultOutput: "./cvf-downloads",
    userAgent: CVF_USER_AGENT,
  };

  return readScholarPdf(paper, context);
}
|
|
246
|
+
|
|
89
247
|
cli({
|
|
90
248
|
site: "cvf",
|
|
91
249
|
name: "search",
|
|
@@ -111,18 +269,11 @@ cli({
|
|
|
111
269
|
.toLowerCase();
|
|
112
270
|
if (!query) throw new Error("cvf search query cannot be empty.");
|
|
113
271
|
const event = eventId(kwargs.venue, kwargs.year);
|
|
114
|
-
const response = await fetch(`${ORIGIN}/${event}?day=all`, {
|
|
115
|
-
headers: {
|
|
116
|
-
Accept: "*/*",
|
|
117
|
-
"User-Agent": "unicli-cvf/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
|
|
118
|
-
},
|
|
119
|
-
});
|
|
120
|
-
if (response.status === 404)
|
|
121
|
-
throw new Error(`CVF ${event} returned no proceedings page.`);
|
|
122
|
-
if (!response.ok)
|
|
123
|
-
throw new Error(`CVF ${event} returned HTTP ${response.status}.`);
|
|
124
272
|
const limit = Math.min(Math.max(Number(kwargs.limit ?? 20), 1), 200);
|
|
125
|
-
const rows = parseCvfRows(
|
|
273
|
+
const rows = parseCvfRows(
|
|
274
|
+
await fetchCvfHtml(`${ORIGIN}/${event}?day=all`, `CVF ${event}`),
|
|
275
|
+
event,
|
|
276
|
+
)
|
|
126
277
|
.filter((row) =>
|
|
127
278
|
`${row.title} ${row.authors?.join(" ") ?? ""}`
|
|
128
279
|
.toLowerCase()
|
|
@@ -134,3 +285,85 @@ cli({
|
|
|
134
285
|
return rows;
|
|
135
286
|
},
|
|
136
287
|
});
|
|
288
|
+
|
|
289
|
+
// `cvf paper <id>`: fetch one paper page for the given venue/year and return
// its parsed metadata as a single-row result.
cli({
  site: "cvf",
  name: "paper",
  description: "Fetch CVF OpenAccess paper metadata by page id",
  domain: "openaccess.thecvf.com",
  strategy: Strategy.PUBLIC,
  args: [
    {
      name: "id",
      type: "str",
      required: true,
      positional: true,
    },
    {
      name: "venue",
      type: "str",
      default: "CVPR",
    },
    {
      name: "year",
      type: "str",
      default: "2024",
    },
  ],
  columns: [
    "id",
    "title",
    "authors",
    "year",
    "venue",
    "pdf_url",
    "source_url",
  ],
  capabilities: [
    "http.fetch",
    "scholar.get",
    "scholar.pdf",
  ],
  func: async (_page, kwargs) => {
    // Resolve the event before validating the id, so an invalid venue/year is
    // reported first.
    const proceedings = eventId(kwargs.venue, kwargs.year);
    const paperId = requireCvfPaperId(kwargs.id ?? kwargs.ref);
    const pageUrl = paperUrl(paperId, proceedings);
    const html = await fetchCvfHtml(pageUrl, `CVF paper ${paperId}`);
    const record = parseCvfPaperPage(html, pageUrl);
    return [record];
  },
});
|
|
309
|
+
|
|
310
|
+
// `cvf read <id>`: fetch the paper page, then pass the parsed record to
// readCvfPaperPdf and return its result as a single row.
cli({
  site: "cvf",
  name: "read",
  description: "Download a CVF OpenAccess paper PDF by page id and extract text",
  domain: "openaccess.thecvf.com",
  strategy: Strategy.PUBLIC,
  args: [
    {
      name: "id",
      type: "str",
      required: true,
      positional: true,
    },
    {
      name: "venue",
      type: "str",
      default: "CVPR",
    },
    {
      name: "year",
      type: "str",
      default: "2024",
    },
    {
      name: "output",
      type: "str",
      default: "./cvf-downloads",
      description: "Output directory for the downloaded PDF",
      "x-unicli-kind": "path",
    },
    {
      name: "filename",
      type: "str",
      description: "Output PDF filename",
    },
    {
      name: "first-page",
      type: "int",
      default: 1,
      description: "First page",
    },
    {
      name: "last-page",
      type: "int",
      default: 20,
      description: "Last page",
    },
    {
      name: "max-chars",
      type: "int",
      default: 40000,
      description: "Maximum extracted text characters",
    },
  ],
  columns: [
    "id",
    "title",
    "source_adapter",
    "source_url",
    "pdf_url",
    "path",
    "text_source",
    "text",
    "text_chars",
    "text_truncated",
  ],
  capabilities: [
    "http.fetch",
    "http.download",
    "subprocess.exec",
    "scholar.fulltext",
    "scholar.pdf",
  ],
  executables: ["pdftotext"],
  minimum_capability: "subprocess.exec",
  func: async (_page, kwargs) => {
    // Resolve the event before validating the id, so an invalid venue/year is
    // reported first.
    const proceedings = eventId(kwargs.venue, kwargs.year);
    const paperId = requireCvfPaperId(kwargs.id ?? kwargs.ref);
    const pageUrl = paperUrl(paperId, proceedings);
    const html = await fetchCvfHtml(pageUrl, `CVF paper ${paperId}`);
    const record = parseCvfPaperPage(html, pageUrl);
    const result = await readCvfPaperPdf(record, kwargs);
    return [result];
  },
});
|