@zenalexa/unicli 0.225.2 → 0.225.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184) hide show
  1. package/AGENTS.md +2 -2
  2. package/README.md +3 -3
  3. package/README.zh-CN.md +3 -3
  4. package/dist/adapters/acl-anthology/papers.d.ts +16 -9
  5. package/dist/adapters/acl-anthology/papers.d.ts.map +1 -1
  6. package/dist/adapters/acl-anthology/papers.js +322 -58
  7. package/dist/adapters/acl-anthology/papers.js.map +1 -1
  8. package/dist/adapters/arxiv/papers.d.ts +22 -4
  9. package/dist/adapters/arxiv/papers.d.ts.map +1 -1
  10. package/dist/adapters/arxiv/papers.js +202 -4
  11. package/dist/adapters/arxiv/papers.js.map +1 -1
  12. package/dist/adapters/baidu-scholar/search.d.ts +15 -1
  13. package/dist/adapters/baidu-scholar/search.d.ts.map +1 -1
  14. package/dist/adapters/baidu-scholar/search.js +72 -8
  15. package/dist/adapters/baidu-scholar/search.js.map +1 -1
  16. package/dist/adapters/biorxiv/preprints.d.ts +9 -0
  17. package/dist/adapters/biorxiv/preprints.d.ts.map +1 -0
  18. package/dist/adapters/biorxiv/preprints.js +78 -0
  19. package/dist/adapters/biorxiv/preprints.js.map +1 -0
  20. package/dist/adapters/cnki/search.d.ts +82 -0
  21. package/dist/adapters/cnki/search.d.ts.map +1 -0
  22. package/dist/adapters/cnki/search.js +236 -0
  23. package/dist/adapters/cnki/search.js.map +1 -0
  24. package/dist/adapters/cvf/papers.d.ts +12 -7
  25. package/dist/adapters/cvf/papers.d.ts.map +1 -1
  26. package/dist/adapters/cvf/papers.js +210 -27
  27. package/dist/adapters/cvf/papers.js.map +1 -1
  28. package/dist/adapters/dblp/publications.d.ts +12 -5
  29. package/dist/adapters/dblp/publications.d.ts.map +1 -1
  30. package/dist/adapters/dblp/publications.js +31 -8
  31. package/dist/adapters/dblp/publications.js.map +1 -1
  32. package/dist/adapters/google-scholar/search.d.ts +22 -1
  33. package/dist/adapters/google-scholar/search.d.ts.map +1 -1
  34. package/dist/adapters/google-scholar/search.js +129 -14
  35. package/dist/adapters/google-scholar/search.js.map +1 -1
  36. package/dist/adapters/hf/paper.d.ts +12 -3
  37. package/dist/adapters/hf/paper.d.ts.map +1 -1
  38. package/dist/adapters/hf/paper.js +65 -5
  39. package/dist/adapters/hf/paper.js.map +1 -1
  40. package/dist/adapters/medrxiv/preprints.d.ts +9 -0
  41. package/dist/adapters/medrxiv/preprints.d.ts.map +1 -0
  42. package/dist/adapters/medrxiv/preprints.js +78 -0
  43. package/dist/adapters/medrxiv/preprints.js.map +1 -0
  44. package/dist/adapters/neurips/proceedings.d.ts +8 -7
  45. package/dist/adapters/neurips/proceedings.d.ts.map +1 -1
  46. package/dist/adapters/neurips/proceedings.js +209 -21
  47. package/dist/adapters/neurips/proceedings.js.map +1 -1
  48. package/dist/adapters/openalex/works.d.ts +21 -5
  49. package/dist/adapters/openalex/works.d.ts.map +1 -1
  50. package/dist/adapters/openalex/works.js +108 -8
  51. package/dist/adapters/openalex/works.js.map +1 -1
  52. package/dist/adapters/openreview/papers.d.ts +10 -4
  53. package/dist/adapters/openreview/papers.d.ts.map +1 -1
  54. package/dist/adapters/openreview/papers.js +351 -24
  55. package/dist/adapters/openreview/papers.js.map +1 -1
  56. package/dist/adapters/pmlr/proceedings.d.ts +6 -6
  57. package/dist/adapters/pmlr/proceedings.d.ts.map +1 -1
  58. package/dist/adapters/pmlr/proceedings.js +92 -12
  59. package/dist/adapters/pmlr/proceedings.js.map +1 -1
  60. package/dist/adapters/pubmed/articles.d.ts +8 -4
  61. package/dist/adapters/pubmed/articles.d.ts.map +1 -1
  62. package/dist/adapters/pubmed/articles.js +272 -39
  63. package/dist/adapters/pubmed/articles.js.map +1 -1
  64. package/dist/adapters/rxiv/preprints.d.ts +75 -0
  65. package/dist/adapters/rxiv/preprints.d.ts.map +1 -0
  66. package/dist/adapters/rxiv/preprints.js +651 -0
  67. package/dist/adapters/rxiv/preprints.js.map +1 -0
  68. package/dist/adapters/scholar-artifacts/pdf-read.d.ts +49 -0
  69. package/dist/adapters/scholar-artifacts/pdf-read.d.ts.map +1 -0
  70. package/dist/adapters/scholar-artifacts/pdf-read.js +204 -0
  71. package/dist/adapters/scholar-artifacts/pdf-read.js.map +1 -0
  72. package/dist/adapters/scholar-artifacts/pdf.d.ts +16 -0
  73. package/dist/adapters/scholar-artifacts/pdf.d.ts.map +1 -0
  74. package/dist/adapters/scholar-artifacts/pdf.js +122 -0
  75. package/dist/adapters/scholar-artifacts/pdf.js.map +1 -0
  76. package/dist/adapters/semantic-scholar/papers.d.ts +6 -6
  77. package/dist/adapters/semantic-scholar/papers.d.ts.map +1 -1
  78. package/dist/adapters/semantic-scholar/papers.js +80 -6
  79. package/dist/adapters/semantic-scholar/papers.js.map +1 -1
  80. package/dist/adapters/unpaywall/works.d.ts +7 -7
  81. package/dist/adapters/unpaywall/works.d.ts.map +1 -1
  82. package/dist/adapters/unpaywall/works.js +104 -12
  83. package/dist/adapters/unpaywall/works.js.map +1 -1
  84. package/dist/adapters/wanfang/search.d.ts +14 -0
  85. package/dist/adapters/wanfang/search.d.ts.map +1 -1
  86. package/dist/adapters/wanfang/search.js +56 -7
  87. package/dist/adapters/wanfang/search.js.map +1 -1
  88. package/dist/browser/page.d.ts +2 -0
  89. package/dist/browser/page.d.ts.map +1 -1
  90. package/dist/browser/page.js +12 -0
  91. package/dist/browser/page.js.map +1 -1
  92. package/dist/commands/browser/actions.d.ts.map +1 -1
  93. package/dist/commands/browser/actions.js +59 -3
  94. package/dist/commands/browser/actions.js.map +1 -1
  95. package/dist/commands/scholar.d.ts +77 -5
  96. package/dist/commands/scholar.d.ts.map +1 -1
  97. package/dist/commands/scholar.js +2945 -83
  98. package/dist/commands/scholar.js.map +1 -1
  99. package/dist/core/command-contract.d.ts.map +1 -1
  100. package/dist/core/command-contract.js +5 -0
  101. package/dist/core/command-contract.js.map +1 -1
  102. package/dist/core/schema-v2.d.ts +1 -0
  103. package/dist/core/schema-v2.d.ts.map +1 -1
  104. package/dist/core/schema-v2.js +1 -0
  105. package/dist/core/schema-v2.js.map +1 -1
  106. package/dist/discovery/aliases.d.ts.map +1 -1
  107. package/dist/discovery/aliases.js +208 -0
  108. package/dist/discovery/aliases.js.map +1 -1
  109. package/dist/discovery/core-catalog.d.ts +2 -0
  110. package/dist/discovery/core-catalog.d.ts.map +1 -1
  111. package/dist/discovery/core-catalog.js +487 -0
  112. package/dist/discovery/core-catalog.js.map +1 -1
  113. package/dist/discovery/intents.d.ts.map +1 -1
  114. package/dist/discovery/intents.js +273 -2
  115. package/dist/discovery/intents.js.map +1 -1
  116. package/dist/discovery/loader.d.ts.map +1 -1
  117. package/dist/discovery/loader.js +3 -0
  118. package/dist/discovery/loader.js.map +1 -1
  119. package/dist/engine/capability-policy.d.ts.map +1 -1
  120. package/dist/engine/capability-policy.js +30 -4
  121. package/dist/engine/capability-policy.js.map +1 -1
  122. package/dist/engine/kernel/stages.d.ts.map +1 -1
  123. package/dist/engine/kernel/stages.js +3 -0
  124. package/dist/engine/kernel/stages.js.map +1 -1
  125. package/dist/engine/operation-policy.d.ts +4 -1
  126. package/dist/engine/operation-policy.d.ts.map +1 -1
  127. package/dist/engine/operation-policy.js +23 -0
  128. package/dist/engine/operation-policy.js.map +1 -1
  129. package/dist/fast-path/manifest.d.ts +3 -0
  130. package/dist/fast-path/manifest.d.ts.map +1 -1
  131. package/dist/fast-path/manifest.js.map +1 -1
  132. package/dist/fast-path/policy.d.ts.map +1 -1
  133. package/dist/fast-path/policy.js +3 -0
  134. package/dist/fast-path/policy.js.map +1 -1
  135. package/dist/manifest-compact.txt +1 -1
  136. package/dist/manifest.json +6804 -1002
  137. package/dist/registry.d.ts +2 -0
  138. package/dist/registry.d.ts.map +1 -1
  139. package/dist/registry.js +1 -0
  140. package/dist/registry.js.map +1 -1
  141. package/dist/types/scholarly.d.ts +19 -4
  142. package/dist/types/scholarly.d.ts.map +1 -1
  143. package/dist/types/scholarly.js +4 -4
  144. package/dist/types.d.ts +8 -0
  145. package/dist/types.d.ts.map +1 -1
  146. package/dist/types.js.map +1 -1
  147. package/package.json +1 -1
  148. package/server.json +2 -2
  149. package/skills/unicli/SKILL.md +1 -1
  150. package/skills/unicli-claude-code/SKILL.md +1 -1
  151. package/skills/unicli-hermes/SKILL.md +1 -1
  152. package/src/adapters/acl-anthology/papers.test.ts +111 -0
  153. package/src/adapters/acl-anthology/papers.ts +379 -71
  154. package/src/adapters/arxiv/papers.test.ts +46 -0
  155. package/src/adapters/arxiv/papers.ts +251 -4
  156. package/src/adapters/baidu-scholar/search.ts +74 -11
  157. package/src/adapters/biorxiv/preprints.ts +112 -0
  158. package/src/adapters/cnki/search.ts +357 -0
  159. package/src/adapters/cvf/papers.ts +260 -27
  160. package/src/adapters/dblp/publications.test.ts +9 -0
  161. package/src/adapters/dblp/publications.ts +31 -8
  162. package/src/adapters/google-scholar/search.ts +165 -17
  163. package/src/adapters/hf/paper.test.ts +23 -0
  164. package/src/adapters/hf/paper.ts +89 -5
  165. package/src/adapters/hf/top.yaml +34 -2
  166. package/src/adapters/huggingface-papers/daily.yaml +37 -3
  167. package/src/adapters/huggingface-papers/search.yaml +43 -9
  168. package/src/adapters/medrxiv/preprints.ts +112 -0
  169. package/src/adapters/neurips/proceedings.ts +266 -22
  170. package/src/adapters/openalex/works.test.ts +15 -4
  171. package/src/adapters/openalex/works.ts +136 -8
  172. package/src/adapters/openreview/papers.test.ts +31 -0
  173. package/src/adapters/openreview/papers.ts +407 -29
  174. package/src/adapters/pmlr/proceedings.ts +102 -12
  175. package/src/adapters/pubmed/articles.test.ts +88 -1
  176. package/src/adapters/pubmed/articles.ts +343 -44
  177. package/src/adapters/rxiv/preprints.test.ts +233 -0
  178. package/src/adapters/rxiv/preprints.ts +849 -0
  179. package/src/adapters/scholar-artifacts/pdf-read.ts +277 -0
  180. package/src/adapters/scholar-artifacts/pdf.ts +133 -0
  181. package/src/adapters/semantic-scholar/papers.ts +98 -6
  182. package/src/adapters/unpaywall/works.ts +141 -12
  183. package/src/adapters/wanfang/search.ts +57 -7
  184. package/src/adapters/cnki/search.yaml +0 -49
@@ -0,0 +1,357 @@
1
+ /**
2
+ * @owner src::adapters::cnki::search
3
+ * @does Registers CNKI Scholar title search against the current public KNS criteria endpoint.
4
+ * @needs scholar.cnki.net KNS criteria API, node:crypto, src/registry.ts, src/types/scholarly.ts
5
+ * @feeds `unicli cnki search`, src/commands/scholar.ts via scholar.search, scholar doctor live probes
6
+ * @breaks CNKI token/request-shape drift surfaces as upstream_error; access-controlled PDF/order URLs are exposed as relation hints, not as a scholar.pdf guarantee.
7
+ * @invariants Search uses CNKI's current all-database class id; rows must carry source-local id/title/source_adapter/retrieved_at; no stale API fallback is kept.
8
+ * @side-effects HTTPS egress to scholar.cnki.net only
9
+ * @perf O(limit) JSON mapping after one POST request.
10
+ * @concurrency safe
11
+ * @test tests/unit/adapters/scholar-sources.test.ts; live smoke via `unicli cnki search <query>` and `unicli scholar doctor --sources cnki --live`
12
+ * @stability experimental
13
+ * @since 2026-06-27
14
+ */
15
+
16
+ import { createCipheriv } from "node:crypto";
17
+
18
+ import { cli, Strategy } from "../../registry.js";
19
+ import type { ScholarlyWorkRecord } from "../../types/scholarly.js";
20
+
21
+ const CNKI_QUERY_API =
22
+ "https://scholar.cnki.net/restapi/kns8s-api/v2/criteria/query";
23
+ const CNKI_TOKEN_SECRET = "cf4e8f25360248f89248af06a55d21ea";
24
+ const CNKI_CLIENT_ID = "c5fd4ef0-d314-4888-b0a7-f6190eaefaf0";
25
+ const CNKI_ALL_DATABASE_CLASS_ID = "WD0FTY92";
26
+ const CNKI_REFERER = "https://scholar.cnki.net/";
27
+ const MAX_LIMIT = 50;
28
+
29
+ interface CnkiMetadataEntry {
30
+ name?: unknown;
31
+ value?: unknown;
32
+ }
33
+
34
+ interface CnkiRelation {
35
+ scope?: unknown;
36
+ url?: unknown;
37
+ }
38
+
39
+ interface CnkiAuthor {
40
+ title?: unknown;
41
+ }
42
+
43
+ interface CnkiSource {
44
+ title?: unknown;
45
+ type?: unknown;
46
+ year?: unknown;
47
+ relations?: CnkiRelation[];
48
+ }
49
+
50
+ export interface CnkiSearchRow {
51
+ metadata?: CnkiMetadataEntry[];
52
+ relations?: CnkiRelation[];
53
+ authors?: CnkiAuthor[];
54
+ source?: CnkiSource;
55
+ }
56
+
57
+ interface CnkiSearchResponse {
58
+ code?: unknown;
59
+ message?: unknown;
60
+ data?: {
61
+ total?: unknown;
62
+ data?: CnkiSearchRow[];
63
+ };
64
+ }
65
+
66
+ interface CnkiSearchPayload {
67
+ Resource: string;
68
+ Classid: string;
69
+ Products: string;
70
+ KuaKuCode: string;
71
+ QNode: {
72
+ QGroup: Array<{
73
+ Key: string;
74
+ Title: string;
75
+ Logic: number;
76
+ Items: unknown[];
77
+ ChildItems: Array<{
78
+ Key: string;
79
+ Title: string;
80
+ Logic: number;
81
+ Items: Array<{
82
+ Key: string;
83
+ Title: string;
84
+ Logic: number;
85
+ Field: string;
86
+ Operator: string;
87
+ Value: string;
88
+ }>;
89
+ ChildItems: unknown[];
90
+ }>;
91
+ }>;
92
+ };
93
+ ExScope: string;
94
+ SearchType: number;
95
+ SearchFrom: number;
96
+ Rlang: string;
97
+ sort: string;
98
+ sortType: string;
99
+ pageNum: number;
100
+ pageSize: number;
101
+ }
102
+
103
+ type ActionableError = Error & {
104
+ code?: string;
105
+ suggestion?: string;
106
+ retryable?: boolean;
107
+ };
108
+
109
/**
 * Normalizes an arbitrary upstream value to text.
 *
 * @param value Anything found in the CNKI response.
 * @returns The trimmed string, or "" when `value` is not a string.
 */
function str(value: unknown): string {
  if (typeof value !== "string") {
    return "";
  }
  return value.trim();
}
112
+
113
/**
 * Removes markup from a CNKI text field and decodes the small set of HTML
 * entities handled by this adapter, then collapses whitespace.
 *
 * Entities are decoded in a single pass so that already-escaped text such as
 * "&amp;lt;" becomes "&lt;" rather than being decoded twice into "<".
 *
 * @param value Raw field value; non-strings are treated as "".
 * @returns Plain text with tags removed, entities decoded, whitespace collapsed.
 */
function stripHtml(value: unknown): string {
  const entityText: Record<string, string> = {
    "&nbsp;": " ",
    "&amp;": "&",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": '"',
    "&#39;": "'",
  };
  const withoutTags = str(value).replace(/<[^>]*>/g, "");
  const decoded = withoutTags.replace(
    /&(?:nbsp|amp|lt|gt|quot|#39);/g,
    (entity) => entityText[entity] ?? entity,
  );
  return decoded.replace(/\s+/g, " ").trim();
}
125
+
126
/**
 * Looks up one metadata field on a CNKI row.
 *
 * @param row CNKI result row.
 * @param name Upper-case field name to match (entry names are upper-cased
 *   before comparison).
 * @returns The first matching entry's value as plain text, or "" when absent.
 */
function metadataValue(row: CnkiSearchRow, name: string): string {
  for (const entry of row.metadata ?? []) {
    if (str(entry.name).toUpperCase() === name) {
      return stripHtml(entry.value);
    }
  }
  // No entry matched: same result as stripping an undefined value.
  return stripHtml(undefined);
}
132
+
133
/**
 * Finds the URL of the first relation with the given scope.
 *
 * @param relations Relation list from a row or its source; may be undefined.
 * @param scope Upper-case scope to match (relation scopes are upper-cased
 *   before comparison).
 * @returns The trimmed URL of the first match, or "" when none is found or
 *   the URL is not a string.
 */
function relationUrl(
  relations: CnkiRelation[] | undefined,
  scope: string,
): string {
  if (relations == null) {
    return "";
  }
  for (const relation of relations) {
    if (str(relation.scope).toUpperCase() === scope) {
      return str(relation.url);
    }
  }
  return "";
}
142
+
143
/**
 * Resolves the first available relation URL across a list of scopes.
 *
 * Scopes are tried in order; for each scope the row's own relations win over
 * the relations attached to the row's source.
 *
 * @param row CNKI result row.
 * @param scopes Upper-case scopes in priority order.
 * @returns The first non-empty URL, or "" when no scope resolves.
 */
function firstRelationUrl(row: CnkiSearchRow, scopes: string[]): string {
  for (const scope of scopes) {
    const hit =
      relationUrl(row.relations, scope) ||
      relationUrl(row.source?.relations, scope);
    if (hit) {
      return hit;
    }
  }
  return "";
}
152
+
153
/**
 * Extracts the first 19xx/20xx year found anywhere in `value`.
 *
 * @param value Free-form date text.
 * @returns The year as an integer, or undefined when no such year appears.
 */
function parseYear(value: string): number | undefined {
  const found = /(?:19|20)\d{2}/.exec(value);
  if (found === null) {
    return undefined;
  }
  const year = Number(found[0]);
  return Number.isInteger(year) ? year : undefined;
}
157
+
158
/**
 * Splits a CNKI "AU" metadata string into individual author names.
 *
 * Authors are separated by either an ASCII semicolon or the full-width
 * semicolon (U+FF1B) used in Chinese-language records. The full-width form is
 * written as an escape so it survives tooling that normalizes punctuation.
 *
 * @param value Author list text.
 * @returns Trimmed, non-empty author names in their original order.
 */
function splitMetadataAuthors(value: string): string[] {
  const names: string[] = [];
  for (const piece of value.split(/[;\uFF1B]/)) {
    const name = piece.trim();
    if (name) {
      names.push(name);
    }
  }
  return names;
}
164
+
165
/**
 * Creates the actionable error used for CNKI upstream failures.
 *
 * @param message Human-readable failure description.
 * @param code Machine-readable error code; defaults to "upstream_error".
 * @returns An Error carrying `code`, a fixed `suggestion`, and
 *   `retryable: false`.
 */
function cnkiError(message: string, code = "upstream_error"): ActionableError {
  return Object.assign(new Error(message), {
    code,
    suggestion:
      "CNKI changed or rejected the public KNS criteria query. Run `unicli describe cnki search`, then inspect scholar.cnki.net's current search request before changing the adapter.",
    retryable: false,
  });
}
173
+
174
/**
 * Produces the `vv` query token sent with CNKI search requests.
 *
 * The token is the hex encoding of `{"timestamp": <timestamp>}` encrypted with
 * AES-256-ECB (default block padding) under the module's token secret.
 *
 * @param timestamp Value embedded in the token; defaults to the current time
 *   in milliseconds.
 * @returns Lower-case hex ciphertext.
 */
export function createCnkiVvToken(timestamp = Date.now()): string {
  const key = Buffer.from(CNKI_TOKEN_SECRET, "utf8");
  const plaintext = JSON.stringify({ timestamp });
  // ECB takes no IV, hence the explicit null.
  const cipher = createCipheriv("aes-256-ecb", key, null);
  cipher.setAutoPadding(true);
  const head = cipher.update(plaintext, "utf8", "hex");
  const tail = cipher.final("hex");
  return head + tail;
}
186
+
187
/**
 * Builds the JSON body for a CNKI criteria query that fuzzy-matches `query`
 * against the title field ("TI") in the all-database class, requesting the
 * first page sorted by "PT" descending.
 *
 * @param query Title keyword(s) to search for.
 * @param limit Page size to request.
 * @returns The request payload, ready for JSON serialization.
 */
export function buildCnkiSearchPayload(
  query: string,
  limit: number,
): CnkiSearchPayload {
  // Innermost node: the single title condition.
  const titleCondition = {
    Key: "",
    Title: "题名",
    Logic: 0,
    Field: "TI",
    Operator: "FUZZY",
    Value: query,
  };
  const subjectGroup = {
    Key: "subject",
    Title: "",
    Logic: 0,
    Items: [titleCondition],
    ChildItems: [],
  };
  const rootGroup = {
    Key: "",
    Title: "",
    Logic: 0,
    Items: [],
    ChildItems: [subjectGroup],
  };
  return {
    Resource: "",
    Classid: CNKI_ALL_DATABASE_CLASS_ID,
    Products: "",
    KuaKuCode: "",
    QNode: { QGroup: [rootGroup] },
    ExScope: "1",
    SearchType: 2,
    SearchFrom: 1,
    Rlang: "",
    sort: "PT",
    sortType: "DESC",
    pageNum: 1,
    pageSize: limit,
  };
}
234
+
235
/**
 * Converts one CNKI result row into a scholarly work record.
 *
 * Field sources: title from "TI" then "ENTI"; authors from the structured
 * `authors` list, falling back to the "AU" metadata string; venue from "LY"
 * then the source title; id from "ID", "FN", DOI, landing URL, then title.
 *
 * @param row CNKI result row.
 * @param rank 1-based position of the row in the result list.
 * @returns The mapped record, including `rank` and an optional `pdf_url`.
 * @throws ActionableError (code "upstream_error") when the row has no usable
 *   id or title.
 */
export function mapCnkiSearchRow(
  row: CnkiSearchRow,
  rank: number,
): ScholarlyWorkRecord & { rank: number; pdf_url?: string } {
  const title = metadataValue(row, "TI") || metadataValue(row, "ENTI");
  // Drop an optional "doi:" label together with any whitespace following it.
  const doi = metadataValue(row, "DOI").replace(/^doi:\s*/i, "");
  const date = metadataValue(row, "PT");
  const source = metadataValue(row, "LY") || stripHtml(row.source?.title);
  const authorNames =
    row.authors?.map((author) => stripHtml(author.title)).filter(Boolean) ?? [];
  const authors = authorNames.length
    ? authorNames
    : splitMetadataAuthors(metadataValue(row, "AU"));
  const sourceUrl = firstRelationUrl(row, ["ABSTRACT", "PUBLICATION"]);
  const pdfUrl = firstRelationUrl(row, ["PDF"]);
  const id =
    metadataValue(row, "ID") ||
    metadataValue(row, "FN") ||
    doi ||
    sourceUrl ||
    title;
  if (!id || !title) {
    throw cnkiError("CNKI returned a row without a stable id or title.");
  }
  // Number("") is 0, so an absent "CF" field must be handled before parsing;
  // otherwise a missing citation count would be reported as zero citations.
  const citedByText = metadataValue(row, "CF");
  const citedByCount = citedByText ? Number(citedByText) : Number.NaN;
  return {
    id,
    rank,
    title,
    authors,
    year: parseYear(date) ?? parseYear(str(row.source?.year)),
    date: date || undefined,
    venue: source || undefined,
    type: metadataValue(row, "DB") || stripHtml(row.source?.type) || undefined,
    abstract: metadataValue(row, "AB") || undefined,
    doi: doi || undefined,
    cited_by_count: Number.isFinite(citedByCount) ? citedByCount : undefined,
    pdf_url: pdfUrl || undefined,
    landing_url: sourceUrl || undefined,
    source_adapter: "cnki",
    source_url: sourceUrl || undefined,
    retrieved_at: new Date().toISOString(),
  };
}
279
+
280
/**
 * POSTs a criteria query to the CNKI search endpoint and returns the parsed
 * response envelope.
 *
 * Every failure mode is reported as an actionable CNKI error so callers see a
 * consistent `upstream_error` code: transport failures, non-2xx statuses,
 * bodies that are not JSON objects, and envelopes whose `code` is not 0.
 *
 * @param payload Request body built by `buildCnkiSearchPayload`.
 * @returns The response envelope when CNKI reports code 0.
 * @throws ActionableError with code "upstream_error" on any failure above.
 */
async function fetchCnkiSearch(
  payload: CnkiSearchPayload,
): Promise<CnkiSearchResponse> {
  const url = new URL(CNKI_QUERY_API);
  url.searchParams.set("vv", createCnkiVvToken());
  url.searchParams.set("clientId", CNKI_CLIENT_ID);

  let response: Awaited<ReturnType<typeof fetch>>;
  try {
    response = await fetch(url, {
      method: "POST",
      headers: {
        Accept: "application/json, text/plain, */*",
        "Content-Type": "application/json;charset=UTF-8",
        Origin: "https://scholar.cnki.net",
        Referer: CNKI_REFERER,
        "User-Agent":
          "Mozilla/5.0 (compatible; Uni-CLI/1.0; +https://github.com/olo-dot-io/Uni-CLI)",
        Version: "",
      },
      body: JSON.stringify(payload),
    });
  } catch (cause) {
    // fetch rejects on DNS/TLS/connection problems; that is a network issue
    // rather than request-shape drift, so mark it retryable.
    const detail = cause instanceof Error ? cause.message : String(cause);
    const error = cnkiError(`CNKI search request failed: ${detail}`);
    error.suggestion =
      "Check network access to scholar.cnki.net and retry the search.";
    error.retryable = true;
    throw error;
  }
  if (!response.ok) {
    throw cnkiError(`CNKI search returned HTTP ${response.status}.`);
  }

  let json: CnkiSearchResponse | null;
  try {
    json = (await response.json()) as CnkiSearchResponse | null;
  } catch {
    // e.g. an HTML challenge or error page served with a 200 status.
    throw cnkiError("CNKI search returned a response that is not valid JSON.");
  }
  if (json === null || typeof json !== "object") {
    throw cnkiError("CNKI search returned an unexpected JSON payload.");
  }
  const code = Number(json.code);
  if (code !== 0) {
    throw cnkiError(
      `CNKI search returned code ${String(json.code)}: ${str(json.message)}`,
    );
  }
  return json;
}
311
+
312
/** Builds the invalid_input error raised when the search query is blank. */
function emptyCnkiQueryError(): ActionableError {
  return Object.assign(new Error("cnki search query cannot be empty."), {
    code: "invalid_input",
    suggestion:
      "Pass a CNKI title keyword, for example `unicli cnki search 人工智能`.",
    retryable: false,
  });
}

/**
 * Resolves the requested page size: non-numeric input falls back to 20,
 * otherwise the value is truncated and clamped to 1..MAX_LIMIT.
 */
function resolveCnkiLimit(rawLimit: unknown): number {
  const requested = Number(rawLimit ?? 20);
  if (!Number.isFinite(requested)) {
    return 20;
  }
  const whole = Math.trunc(requested);
  return Math.min(Math.max(whole, 1), MAX_LIMIT);
}

// Registers `unicli cnki search <query> [--limit N]`.
cli({
  site: "cnki",
  name: "search",
  description: "Search CNKI academic papers by title",
  domain: "scholar.cnki.net",
  strategy: Strategy.PUBLIC,
  adapter_path: "src/adapters/cnki/search.ts",
  args: [
    { name: "query", type: "str", required: true, positional: true },
    { name: "limit", type: "int", default: 20 },
  ],
  columns: [
    "rank",
    "id",
    "title",
    "authors",
    "venue",
    "year",
    "doi",
    "cited_by_count",
    "pdf_url",
    "source_url",
  ],
  capabilities: ["http.fetch", "scholar.search"],
  func: async (_page, kwargs) => {
    const query = str(kwargs.query);
    if (!query) {
      throw emptyCnkiQueryError();
    }
    const payload = buildCnkiSearchPayload(
      query,
      resolveCnkiLimit(kwargs.limit),
    );
    const response = await fetchCnkiSearch(payload);
    const hits = response.data?.data;
    if (!Array.isArray(hits)) {
      return [];
    }
    // Ranks are 1-based positions in the returned page.
    return hits.map((row, index) => mapCnkiSearchRow(row, index + 1));
  },
});
@@ -1,12 +1,12 @@
1
1
  /**
2
2
  * @owner src::adapters::cvf::papers
3
- * @does Registers CVF OpenAccess conference paper search for CVPR/ICCV/ECCV-style proceedings pages.
4
- * @needs openaccess.thecvf.com static proceedings HTML, src/registry.ts
5
- * @feeds src/commands/scholar.ts via scholar.search, scholar.pdf, and scholar.venue
6
- * @breaks CVF markup drift surfaces as empty/parse errors rather than non-CVF fallbacks.
7
- * @invariants Venue/year map to explicit CVF event pages; PDF URLs are absolutized against openaccess.thecvf.com.
8
- * @side-effects HTTPS egress to openaccess.thecvf.com only
9
- * @perf O(N) over one proceedings HTML page
3
+ * @does Registers CVF OpenAccess conference paper search, detail retrieval, and PDF text reading for CVPR/ICCV/ECCV-style proceedings pages.
4
+ * @needs openaccess.thecvf.com static proceedings HTML/PDFs, src/adapters/scholar-artifacts/pdf-read.ts, src/registry.ts
5
+ * @feeds src/commands/scholar.ts via scholar.search, scholar.get, scholar.pdf, scholar.fulltext, and scholar.venue
6
+ * @breaks CVF markup/PDF drift, denied downloads, or missing pdftotext surface as explicit adapter errors rather than non-CVF fallbacks.
7
+ * @invariants Venue/year map to explicit CVF event pages; paper detail prefers citation_* metadata over scraped display blocks.
8
+ * @side-effects HTTPS egress to openaccess.thecvf.com; read writes one PDF artifact and executes pdftotext.
9
+ * @perf O(N) over one proceedings HTML page; read is O(PDF bytes + selected pages)
10
10
  * @concurrency safe
11
11
  * @test tests/unit/adapters/scholar-sources.test.ts
12
12
  * @stability experimental
@@ -15,8 +15,23 @@
15
15
 
16
16
  import { cli, Strategy } from "../../registry.js";
17
17
  import type { ScholarlyWorkRecord } from "../../types/scholarly.js";
18
+ import { readScholarPdf } from "../scholar-artifacts/pdf-read.js";
19
+ import { Agent, request } from "undici";
18
20
 
19
21
  const ORIGIN = "https://openaccess.thecvf.com";
22
+ const CVF_USER_AGENT = "unicli-cvf/1.0 (https://github.com/olo-dot-io/Uni-CLI)";
23
+ const CVF_HTTP_AGENT = new Agent({ connect: { timeout: 30_000 } });
24
+ export const CVF_HTTP_HEADERS = {
25
+ Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
26
+ "User-Agent": CVF_USER_AGENT,
27
+ } as const;
28
+
29
+ type CvfActionableError = Error & {
30
+ code?: string;
31
+ suggestion?: string;
32
+ retryable?: boolean;
33
+ alternatives?: string[];
34
+ };
20
35
 
21
36
  function decode(value: string): string {
22
37
  return value
@@ -35,6 +50,26 @@ function absolute(path: string): string {
35
50
  : `${ORIGIN}${path.startsWith("/") ? "" : "/"}${path}`;
36
51
  }
37
52
 
53
/**
 * Collects the `content` values of every `<meta name="…" content="…">` tag
 * with the given name, in document order.
 *
 * The content value ends at the same quote character that opened it, so a
 * double-quoted value may contain apostrophes (common in paper titles) and
 * vice versa. `name` is escaped before being placed in the pattern.
 *
 * Only tags written as `name` immediately followed by `content` are matched.
 *
 * @param html Page markup to scan.
 * @param name Meta tag name, matched case-insensitively.
 * @returns Decoded content values; empty when no tag matches.
 */
function metaContents(html: string, name: string): string[] {
  const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(
    `<meta\\s+name=["']${escapedName}["']\\s+content=(["'])([\\s\\S]*?)\\1[^>]*>`,
    "gi",
  );
  return Array.from(html.matchAll(re), (match) => decode(match[2]));
}
63
+
64
/**
 * Returns the first `content` value of the named meta tag.
 *
 * @param html Page markup to scan.
 * @param name Meta tag name.
 * @returns The first matching value, or "" when the tag is absent.
 */
function firstMetaContent(html: string, name: string): string {
  const [first] = metaContents(html, name);
  return first ?? "";
}
67
+
68
/**
 * Reads the first run of four digits in `value` as a year.
 *
 * @param value Free-form date text.
 * @returns The four-digit number, or undefined when none is present.
 */
function parseYear(value: string): number | undefined {
  const found = /\d{4}/.exec(value);
  return found === null ? undefined : Number(found[0]);
}
72
+
38
73
  function eventId(venue: unknown, year: unknown): string {
39
74
  const v = String(venue ?? "CVPR")
40
75
  .trim()
@@ -46,6 +81,89 @@ function eventId(venue: unknown, year: unknown): string {
46
81
  return `${v}${y}`;
47
82
  }
48
83
 
84
/**
 * Validates and normalizes a CVF paper page id.
 *
 * Accepts either a bare id (with or without a trailing ".html") or any text
 * containing ".../html/<id>.html", from which the id is extracted.
 *
 * @param value User-supplied id or page URL.
 * @returns The id without the ".html" suffix.
 * @throws Error when the id contains characters other than letters, digits,
 *   "_", "." and "-", or is empty.
 */
function requireCvfPaperId(value: unknown): string {
  const raw = String(value ?? "").trim();
  const pageFile = /\/html\/([^/?#]+\.html)/.exec(raw)?.[1];
  const candidate = (pageFile ?? raw).replace(/\.html$/, "");
  const isValid = /^[A-Za-z0-9_.-]+$/.test(candidate);
  if (!isValid) {
    throw new Error(`CVF paper id "${raw}" is not valid.`);
  }
  return candidate;
}
94
+
95
/**
 * Builds the paper detail page URL for an id within a CVF event.
 *
 * @param id Paper page id without the ".html" suffix.
 * @param event Event identifier used as the content directory name.
 * @returns `<ORIGIN>/content/<event>/html/<id>.html`.
 */
function paperUrl(id: string, event: string): string {
  const segments = [ORIGIN, "content", event, "html", `${id}.html`];
  return segments.join("/");
}
98
+
99
/**
 * Creates the actionable error used when a CVF request fails.
 *
 * `retryable` is true only when `detail` looks transient: a fetch/timeout or
 * connection-reset message, or an HTTP 429/5xx status.
 *
 * @param label What was being fetched; used as the message prefix.
 * @param detail Failure detail, e.g. an error message or "HTTP 503".
 * @returns An Error with code "upstream_error", a fixed suggestion, the
 *   computed `retryable` flag, and an empty `alternatives` list.
 */
function cvfUpstreamError(label: string, detail: string): CvfActionableError {
  const transientFailure =
    /fetch failed|timeout|ECONNRESET|ETIMEDOUT|HTTP (429|5\d\d)/i;
  return Object.assign(new Error(`${label} failed: ${detail}.`), {
    code: "upstream_error",
    suggestion:
      "CVF OpenAccess did not return the expected public proceedings page on this network path; retry later or verify the official openaccess.thecvf.com page manually.",
    retryable: transientFailure.test(detail),
    alternatives: [] as string[],
  });
}
109
+
110
/**
 * Extracts author names from one entry of a CVF proceedings listing.
 *
 * Only the markup before the first "[<a href=" link group is considered. The
 * first `<dd>` in that span supplies the names; any trailing bibref `<div>`
 * and a "; Proceedings…" suffix are discarded before splitting on commas.
 *
 * @param block Markup following a paper title in the listing.
 * @returns Trimmed author names, or undefined when none are found.
 */
function parseListAuthors(block: string): string[] | undefined {
  const [linkFree = block] = block.split(/\[<a\s+href=/i);
  const ddHtml = /<dd>([\s\S]*?)(?:<\/dd>|$)/i.exec(linkFree)?.[1] ?? "";
  const withoutBibref = ddHtml.replace(/<div class="bibref[\s\S]*$/i, " ");
  const plain = decode(withoutBibref.replace(/<[^>]+>/g, " "));
  const names: string[] = [];
  for (const piece of plain.replace(/;\s*Proceedings[\s\S]*$/i, "").split(",")) {
    const name = piece.trim();
    if (name) {
      names.push(name);
    }
  }
  return names.length === 0 ? undefined : names;
}
123
+
124
/**
 * Parses a CVF OpenAccess paper detail page into a scholarly work record.
 *
 * `citation_*` meta tags are preferred; the displayed title block, the
 * "pdf" link, and the event segment of `sourceUrl` serve as fallbacks.
 *
 * @param html Markup of the paper page.
 * @param sourceUrl URL the page was fetched from; also supplies the id (last
 *   path segment without ".html") and the event (e.g. "CVPR2024").
 * @returns The parsed record with `source_adapter` set to "cvf".
 * @throws Error when no title can be found.
 */
export function parseCvfPaperPage(
  html: string,
  sourceUrl: string,
): ScholarlyWorkRecord {
  const title =
    firstMetaContent(html, "citation_title") ||
    decode(
      html
        .match(/<div id="papertitle">([\s\S]*?)<dd>/i)?.[1]
        ?.replace(/<[^>]+>/g, " ") ?? "",
    );
  if (!title) throw new Error("CVF paper page did not expose a title.");
  const event = sourceUrl.match(/\/content\/([A-Z]+\d{4})\//)?.[1] ?? undefined;
  const id =
    sourceUrl
      .split("/")
      .pop()
      ?.replace(/\.html$/, "") ?? title;
  const pdfUrl =
    firstMetaContent(html, "citation_pdf_url") ||
    html.match(/<a href="([^"]+\.pdf)">pdf<\/a>/i)?.[1] ||
    "";
  // Strip tags BEFORE decoding, matching the title path above. Decoding first
  // lets entity-encoded text such as "&lt;x&gt;" be mistaken for markup and
  // removed, and leaves the tag-replacement spaces in the final value.
  const abstractHtml =
    html.match(/<div id="abstract">([\s\S]*?)<\/div>/i)?.[1] ?? "";
  const abstract = decode(abstractHtml.replace(/<[^>]+>/g, " "));
  return {
    id,
    title,
    authors: metaContents(html, "citation_author"),
    year:
      parseYear(firstMetaContent(html, "citation_publication_date")) ??
      (event ? Number(event.slice(-4)) : undefined),
    venue:
      firstMetaContent(html, "citation_conference_title") ||
      event?.replace(/\d{4}$/, ""),
    abstract: abstract || undefined,
    pdf_url: pdfUrl ? absolute(pdfUrl) : undefined,
    source_adapter: "cvf",
    source_url: sourceUrl,
    retrieved_at: new Date().toISOString(),
  };
}
166
+
49
167
  export function parseCvfRows(
50
168
  html: string,
51
169
  event = "CVPR2024",
@@ -59,14 +177,6 @@ export function parseCvfRows(
59
177
  const title = decode(match[2].replace(/<[^>]+>/g, " "));
60
178
  const block = match[3];
61
179
  const pdf = block.match(/<a href="([^"]+\.pdf)">pdf<\/a>/i)?.[1] ?? "";
62
- const authorText = block
63
- .replace(/\[[\s\S]*?\]/g, " ")
64
- .replace(/<form[\s\S]*?<\/form>/g, " ")
65
- .replace(/<[^>]+>/g, " ");
66
- const authors = decode(authorText)
67
- .split(",")
68
- .map((author) => author.trim())
69
- .filter(Boolean);
70
180
  out.push({
71
181
  id:
72
182
  sourceUrl
@@ -74,7 +184,7 @@ export function parseCvfRows(
74
184
  .pop()
75
185
  ?.replace(/\.html$/, "") ?? title,
76
186
  title,
77
- authors: authors.length > 0 ? authors : undefined,
187
+ authors: parseListAuthors(block),
78
188
  year: Number(event.slice(-4)),
79
189
  venue: event.replace(/\d{4}$/, ""),
80
190
  pdf_url: pdf ? absolute(pdf) : undefined,
@@ -86,6 +196,54 @@ export function parseCvfRows(
86
196
  return out;
87
197
  }
88
198
 
199
/**
 * Fetches a CVF OpenAccess page and returns its body as text.
 *
 * @param url Page URL to request.
 * @param label Human-readable name of the resource, used in error messages.
 * @returns The response body for a 2xx status.
 * @throws Error `${label} returned no page.` on HTTP 404.
 * @throws CvfActionableError (code "upstream_error") when the request fails,
 *   the status is any other non-2xx value, or the body cannot be read.
 */
async function fetchCvfHtml(url: string, label: string): Promise<string> {
  let response: Awaited<ReturnType<typeof request>>;
  try {
    response = await request(url, {
      dispatcher: CVF_HTTP_AGENT,
      headers: CVF_HTTP_HEADERS,
    });
  } catch (error) {
    throw cvfUpstreamError(
      label,
      error instanceof Error ? error.message : String(error),
    );
  }
  const { statusCode, body } = response;
  if (statusCode < 200 || statusCode >= 300) {
    // undici does not release the connection until the body is consumed, so
    // discard it before reporting the failure. A failure while discarding is
    // deliberately ignored: the status error below is the one worth raising.
    await body.dump().catch(() => undefined);
    if (statusCode === 404) {
      throw new Error(`${label} returned no page.`);
    }
    throw cvfUpstreamError(label, `HTTP ${statusCode}`);
  }
  try {
    return await body.text();
  } catch (error) {
    // A connection dropped mid-body is still an upstream failure.
    throw cvfUpstreamError(
      label,
      error instanceof Error ? error.message : String(error),
    );
  }
}
219
+
220
/**
 * Downloads a parsed CVF paper's PDF and extracts its text by delegating to
 * `readScholarPdf`.
 *
 * Paging and size options are read from `kwargs` under their dashed names
 * first ("first-page", "last-page", "max-chars"), then their camelCase forms.
 *
 * @param row Parsed paper record; must carry `pdf_url`.
 * @param kwargs Command arguments (output, filename, paging, max chars).
 * @returns Whatever `readScholarPdf` resolves to.
 * @throws Error when the record has no PDF URL.
 */
async function readCvfPaperPdf(
  row: ScholarlyWorkRecord,
  kwargs: Record<string, unknown>,
): Promise<Record<string, unknown>> {
  const { id, title, source_url, pdf_url } = row;
  if (!pdf_url) {
    throw new Error(`CVF paper ${id} has no PDF URL.`);
  }
  // Dashed option name wins; the camelCase spelling is the fallback.
  const option = (dashed: string, camel: string): unknown =>
    kwargs[dashed] ?? kwargs[camel];
  return readScholarPdf(
    {
      id,
      title,
      source_adapter: "cvf",
      source_url,
      pdf_url,
      output: kwargs.output,
      filename: kwargs.filename,
      "first-page": option("first-page", "firstPage"),
      "last-page": option("last-page", "lastPage"),
      "max-chars": option("max-chars", "maxChars"),
    },
    {
      site: "cvf",
      command: "read",
      defaultOutput: "./cvf-downloads",
      userAgent: CVF_USER_AGENT,
    },
  );
}
246
+
89
247
  cli({
90
248
  site: "cvf",
91
249
  name: "search",
@@ -111,18 +269,11 @@ cli({
111
269
  .toLowerCase();
112
270
  if (!query) throw new Error("cvf search query cannot be empty.");
113
271
  const event = eventId(kwargs.venue, kwargs.year);
114
- const response = await fetch(`${ORIGIN}/${event}?day=all`, {
115
- headers: {
116
- Accept: "*/*",
117
- "User-Agent": "unicli-cvf/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
118
- },
119
- });
120
- if (response.status === 404)
121
- throw new Error(`CVF ${event} returned no proceedings page.`);
122
- if (!response.ok)
123
- throw new Error(`CVF ${event} returned HTTP ${response.status}.`);
124
272
  const limit = Math.min(Math.max(Number(kwargs.limit ?? 20), 1), 200);
125
- const rows = parseCvfRows(await response.text(), event)
273
+ const rows = parseCvfRows(
274
+ await fetchCvfHtml(`${ORIGIN}/${event}?day=all`, `CVF ${event}`),
275
+ event,
276
+ )
126
277
  .filter((row) =>
127
278
  `${row.title} ${row.authors?.join(" ") ?? ""}`
128
279
  .toLowerCase()
@@ -134,3 +285,85 @@ cli({
134
285
  return rows;
135
286
  },
136
287
  });
288
+
289
// Registers `unicli cvf paper <id> [--venue V] [--year Y]`: fetches one paper
// detail page and returns its parsed metadata as a single-row result.
cli({
  site: "cvf",
  name: "paper",
  description: "Fetch CVF OpenAccess paper metadata by page id",
  domain: "openaccess.thecvf.com",
  strategy: Strategy.PUBLIC,
  args: [
    { name: "id", type: "str", required: true, positional: true },
    { name: "venue", type: "str", default: "CVPR" },
    { name: "year", type: "str", default: "2024" },
  ],
  columns: ["id", "title", "authors", "year", "venue", "pdf_url", "source_url"],
  capabilities: ["http.fetch", "scholar.get", "scholar.pdf"],
  func: async (_page, kwargs) => {
    const event = eventId(kwargs.venue, kwargs.year);
    // `ref` is accepted as an alternative spelling of the id argument.
    const paperId = requireCvfPaperId(kwargs.id ?? kwargs.ref);
    const pageUrl = paperUrl(paperId, event);
    const html = await fetchCvfHtml(pageUrl, `CVF paper ${paperId}`);
    const record = parseCvfPaperPage(html, pageUrl);
    return [record];
  },
});
309
+
310
// Registers `unicli cvf read <id>`: resolves the paper page, downloads its
// PDF into the output directory, and returns the extracted text as one row.
// Text extraction runs the external `pdftotext` executable.
cli({
  site: "cvf",
  name: "read",
  description:
    "Download a CVF OpenAccess paper PDF by page id and extract text",
  domain: "openaccess.thecvf.com",
  strategy: Strategy.PUBLIC,
  args: [
    { name: "id", type: "str", required: true, positional: true },
    { name: "venue", type: "str", default: "CVPR" },
    { name: "year", type: "str", default: "2024" },
    {
      name: "output",
      type: "str",
      default: "./cvf-downloads",
      description: "Output directory for the downloaded PDF",
      "x-unicli-kind": "path",
    },
    { name: "filename", type: "str", description: "Output PDF filename" },
    { name: "first-page", type: "int", default: 1, description: "First page" },
    { name: "last-page", type: "int", default: 20, description: "Last page" },
    {
      name: "max-chars",
      type: "int",
      default: 40000,
      description: "Maximum extracted text characters",
    },
  ],
  columns: [
    "id",
    "title",
    "source_adapter",
    "source_url",
    "pdf_url",
    "path",
    "text_source",
    "text",
    "text_chars",
    "text_truncated",
  ],
  capabilities: [
    "http.fetch",
    "http.download",
    "subprocess.exec",
    "scholar.fulltext",
    "scholar.pdf",
  ],
  executables: ["pdftotext"],
  minimum_capability: "subprocess.exec",
  func: async (_page, kwargs) => {
    const event = eventId(kwargs.venue, kwargs.year);
    // `ref` is accepted as an alternative spelling of the id argument.
    const paperId = requireCvfPaperId(kwargs.id ?? kwargs.ref);
    const pageUrl = paperUrl(paperId, event);
    const html = await fetchCvfHtml(pageUrl, `CVF paper ${paperId}`);
    const record = parseCvfPaperPage(html, pageUrl);
    const extracted = await readCvfPaperPdf(record, kwargs);
    return [extracted];
  },
});