@zenalexa/unicli 0.225.2 → 0.225.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184) hide show
  1. package/AGENTS.md +2 -2
  2. package/README.md +3 -3
  3. package/README.zh-CN.md +3 -3
  4. package/dist/adapters/acl-anthology/papers.d.ts +16 -9
  5. package/dist/adapters/acl-anthology/papers.d.ts.map +1 -1
  6. package/dist/adapters/acl-anthology/papers.js +322 -58
  7. package/dist/adapters/acl-anthology/papers.js.map +1 -1
  8. package/dist/adapters/arxiv/papers.d.ts +22 -4
  9. package/dist/adapters/arxiv/papers.d.ts.map +1 -1
  10. package/dist/adapters/arxiv/papers.js +202 -4
  11. package/dist/adapters/arxiv/papers.js.map +1 -1
  12. package/dist/adapters/baidu-scholar/search.d.ts +15 -1
  13. package/dist/adapters/baidu-scholar/search.d.ts.map +1 -1
  14. package/dist/adapters/baidu-scholar/search.js +72 -8
  15. package/dist/adapters/baidu-scholar/search.js.map +1 -1
  16. package/dist/adapters/biorxiv/preprints.d.ts +9 -0
  17. package/dist/adapters/biorxiv/preprints.d.ts.map +1 -0
  18. package/dist/adapters/biorxiv/preprints.js +78 -0
  19. package/dist/adapters/biorxiv/preprints.js.map +1 -0
  20. package/dist/adapters/cnki/search.d.ts +82 -0
  21. package/dist/adapters/cnki/search.d.ts.map +1 -0
  22. package/dist/adapters/cnki/search.js +236 -0
  23. package/dist/adapters/cnki/search.js.map +1 -0
  24. package/dist/adapters/cvf/papers.d.ts +12 -7
  25. package/dist/adapters/cvf/papers.d.ts.map +1 -1
  26. package/dist/adapters/cvf/papers.js +210 -27
  27. package/dist/adapters/cvf/papers.js.map +1 -1
  28. package/dist/adapters/dblp/publications.d.ts +12 -5
  29. package/dist/adapters/dblp/publications.d.ts.map +1 -1
  30. package/dist/adapters/dblp/publications.js +31 -8
  31. package/dist/adapters/dblp/publications.js.map +1 -1
  32. package/dist/adapters/google-scholar/search.d.ts +22 -1
  33. package/dist/adapters/google-scholar/search.d.ts.map +1 -1
  34. package/dist/adapters/google-scholar/search.js +129 -14
  35. package/dist/adapters/google-scholar/search.js.map +1 -1
  36. package/dist/adapters/hf/paper.d.ts +12 -3
  37. package/dist/adapters/hf/paper.d.ts.map +1 -1
  38. package/dist/adapters/hf/paper.js +65 -5
  39. package/dist/adapters/hf/paper.js.map +1 -1
  40. package/dist/adapters/medrxiv/preprints.d.ts +9 -0
  41. package/dist/adapters/medrxiv/preprints.d.ts.map +1 -0
  42. package/dist/adapters/medrxiv/preprints.js +78 -0
  43. package/dist/adapters/medrxiv/preprints.js.map +1 -0
  44. package/dist/adapters/neurips/proceedings.d.ts +8 -7
  45. package/dist/adapters/neurips/proceedings.d.ts.map +1 -1
  46. package/dist/adapters/neurips/proceedings.js +209 -21
  47. package/dist/adapters/neurips/proceedings.js.map +1 -1
  48. package/dist/adapters/openalex/works.d.ts +21 -5
  49. package/dist/adapters/openalex/works.d.ts.map +1 -1
  50. package/dist/adapters/openalex/works.js +108 -8
  51. package/dist/adapters/openalex/works.js.map +1 -1
  52. package/dist/adapters/openreview/papers.d.ts +10 -4
  53. package/dist/adapters/openreview/papers.d.ts.map +1 -1
  54. package/dist/adapters/openreview/papers.js +351 -24
  55. package/dist/adapters/openreview/papers.js.map +1 -1
  56. package/dist/adapters/pmlr/proceedings.d.ts +6 -6
  57. package/dist/adapters/pmlr/proceedings.d.ts.map +1 -1
  58. package/dist/adapters/pmlr/proceedings.js +92 -12
  59. package/dist/adapters/pmlr/proceedings.js.map +1 -1
  60. package/dist/adapters/pubmed/articles.d.ts +8 -4
  61. package/dist/adapters/pubmed/articles.d.ts.map +1 -1
  62. package/dist/adapters/pubmed/articles.js +272 -39
  63. package/dist/adapters/pubmed/articles.js.map +1 -1
  64. package/dist/adapters/rxiv/preprints.d.ts +75 -0
  65. package/dist/adapters/rxiv/preprints.d.ts.map +1 -0
  66. package/dist/adapters/rxiv/preprints.js +651 -0
  67. package/dist/adapters/rxiv/preprints.js.map +1 -0
  68. package/dist/adapters/scholar-artifacts/pdf-read.d.ts +49 -0
  69. package/dist/adapters/scholar-artifacts/pdf-read.d.ts.map +1 -0
  70. package/dist/adapters/scholar-artifacts/pdf-read.js +204 -0
  71. package/dist/adapters/scholar-artifacts/pdf-read.js.map +1 -0
  72. package/dist/adapters/scholar-artifacts/pdf.d.ts +16 -0
  73. package/dist/adapters/scholar-artifacts/pdf.d.ts.map +1 -0
  74. package/dist/adapters/scholar-artifacts/pdf.js +122 -0
  75. package/dist/adapters/scholar-artifacts/pdf.js.map +1 -0
  76. package/dist/adapters/semantic-scholar/papers.d.ts +6 -6
  77. package/dist/adapters/semantic-scholar/papers.d.ts.map +1 -1
  78. package/dist/adapters/semantic-scholar/papers.js +80 -6
  79. package/dist/adapters/semantic-scholar/papers.js.map +1 -1
  80. package/dist/adapters/unpaywall/works.d.ts +7 -7
  81. package/dist/adapters/unpaywall/works.d.ts.map +1 -1
  82. package/dist/adapters/unpaywall/works.js +104 -12
  83. package/dist/adapters/unpaywall/works.js.map +1 -1
  84. package/dist/adapters/wanfang/search.d.ts +14 -0
  85. package/dist/adapters/wanfang/search.d.ts.map +1 -1
  86. package/dist/adapters/wanfang/search.js +56 -7
  87. package/dist/adapters/wanfang/search.js.map +1 -1
  88. package/dist/browser/page.d.ts +2 -0
  89. package/dist/browser/page.d.ts.map +1 -1
  90. package/dist/browser/page.js +12 -0
  91. package/dist/browser/page.js.map +1 -1
  92. package/dist/commands/browser/actions.d.ts.map +1 -1
  93. package/dist/commands/browser/actions.js +59 -3
  94. package/dist/commands/browser/actions.js.map +1 -1
  95. package/dist/commands/scholar.d.ts +77 -5
  96. package/dist/commands/scholar.d.ts.map +1 -1
  97. package/dist/commands/scholar.js +2945 -83
  98. package/dist/commands/scholar.js.map +1 -1
  99. package/dist/core/command-contract.d.ts.map +1 -1
  100. package/dist/core/command-contract.js +5 -0
  101. package/dist/core/command-contract.js.map +1 -1
  102. package/dist/core/schema-v2.d.ts +1 -0
  103. package/dist/core/schema-v2.d.ts.map +1 -1
  104. package/dist/core/schema-v2.js +1 -0
  105. package/dist/core/schema-v2.js.map +1 -1
  106. package/dist/discovery/aliases.d.ts.map +1 -1
  107. package/dist/discovery/aliases.js +208 -0
  108. package/dist/discovery/aliases.js.map +1 -1
  109. package/dist/discovery/core-catalog.d.ts +2 -0
  110. package/dist/discovery/core-catalog.d.ts.map +1 -1
  111. package/dist/discovery/core-catalog.js +487 -0
  112. package/dist/discovery/core-catalog.js.map +1 -1
  113. package/dist/discovery/intents.d.ts.map +1 -1
  114. package/dist/discovery/intents.js +273 -2
  115. package/dist/discovery/intents.js.map +1 -1
  116. package/dist/discovery/loader.d.ts.map +1 -1
  117. package/dist/discovery/loader.js +3 -0
  118. package/dist/discovery/loader.js.map +1 -1
  119. package/dist/engine/capability-policy.d.ts.map +1 -1
  120. package/dist/engine/capability-policy.js +30 -4
  121. package/dist/engine/capability-policy.js.map +1 -1
  122. package/dist/engine/kernel/stages.d.ts.map +1 -1
  123. package/dist/engine/kernel/stages.js +3 -0
  124. package/dist/engine/kernel/stages.js.map +1 -1
  125. package/dist/engine/operation-policy.d.ts +4 -1
  126. package/dist/engine/operation-policy.d.ts.map +1 -1
  127. package/dist/engine/operation-policy.js +23 -0
  128. package/dist/engine/operation-policy.js.map +1 -1
  129. package/dist/fast-path/manifest.d.ts +3 -0
  130. package/dist/fast-path/manifest.d.ts.map +1 -1
  131. package/dist/fast-path/manifest.js.map +1 -1
  132. package/dist/fast-path/policy.d.ts.map +1 -1
  133. package/dist/fast-path/policy.js +3 -0
  134. package/dist/fast-path/policy.js.map +1 -1
  135. package/dist/manifest-compact.txt +1 -1
  136. package/dist/manifest.json +6804 -1002
  137. package/dist/registry.d.ts +2 -0
  138. package/dist/registry.d.ts.map +1 -1
  139. package/dist/registry.js +1 -0
  140. package/dist/registry.js.map +1 -1
  141. package/dist/types/scholarly.d.ts +19 -4
  142. package/dist/types/scholarly.d.ts.map +1 -1
  143. package/dist/types/scholarly.js +4 -4
  144. package/dist/types.d.ts +8 -0
  145. package/dist/types.d.ts.map +1 -1
  146. package/dist/types.js.map +1 -1
  147. package/package.json +1 -1
  148. package/server.json +2 -2
  149. package/skills/unicli/SKILL.md +1 -1
  150. package/skills/unicli-claude-code/SKILL.md +1 -1
  151. package/skills/unicli-hermes/SKILL.md +1 -1
  152. package/src/adapters/acl-anthology/papers.test.ts +111 -0
  153. package/src/adapters/acl-anthology/papers.ts +379 -71
  154. package/src/adapters/arxiv/papers.test.ts +46 -0
  155. package/src/adapters/arxiv/papers.ts +251 -4
  156. package/src/adapters/baidu-scholar/search.ts +74 -11
  157. package/src/adapters/biorxiv/preprints.ts +112 -0
  158. package/src/adapters/cnki/search.ts +357 -0
  159. package/src/adapters/cvf/papers.ts +260 -27
  160. package/src/adapters/dblp/publications.test.ts +9 -0
  161. package/src/adapters/dblp/publications.ts +31 -8
  162. package/src/adapters/google-scholar/search.ts +165 -17
  163. package/src/adapters/hf/paper.test.ts +23 -0
  164. package/src/adapters/hf/paper.ts +89 -5
  165. package/src/adapters/hf/top.yaml +34 -2
  166. package/src/adapters/huggingface-papers/daily.yaml +37 -3
  167. package/src/adapters/huggingface-papers/search.yaml +43 -9
  168. package/src/adapters/medrxiv/preprints.ts +112 -0
  169. package/src/adapters/neurips/proceedings.ts +266 -22
  170. package/src/adapters/openalex/works.test.ts +15 -4
  171. package/src/adapters/openalex/works.ts +136 -8
  172. package/src/adapters/openreview/papers.test.ts +31 -0
  173. package/src/adapters/openreview/papers.ts +407 -29
  174. package/src/adapters/pmlr/proceedings.ts +102 -12
  175. package/src/adapters/pubmed/articles.test.ts +88 -1
  176. package/src/adapters/pubmed/articles.ts +343 -44
  177. package/src/adapters/rxiv/preprints.test.ts +233 -0
  178. package/src/adapters/rxiv/preprints.ts +849 -0
  179. package/src/adapters/scholar-artifacts/pdf-read.ts +277 -0
  180. package/src/adapters/scholar-artifacts/pdf.ts +133 -0
  181. package/src/adapters/semantic-scholar/papers.ts +98 -6
  182. package/src/adapters/unpaywall/works.ts +141 -12
  183. package/src/adapters/wanfang/search.ts +57 -7
  184. package/src/adapters/cnki/search.yaml +0 -49
@@ -1,15 +1,28 @@
1
1
  /**
2
2
  * @owner src/adapters/arxiv/papers.ts
3
- * @does Register agent-facing arXiv author and recent category commands.
4
- * @needs export.arxiv.org Atom API, category validation, conservative XML parsing.
5
- * @feeds surface coverage ledger, scholarly search workflow, arXiv category monitoring.
6
- * @breaks arXiv Atom shape drift, weak category parsing, or silent empty feeds hide paper discovery failures.
3
+ * @does Register agent-facing arXiv author, recent category, and PDF text-read commands.
4
+ * @needs export.arxiv.org Atom API, arxiv.org PDF URLs, category/id validation, conservative XML parsing, pdftotext.
5
+ * @feeds surface coverage ledger, scholarly search/read workflow, arXiv category monitoring.
6
+ * @breaks arXiv Atom/PDF shape drift, weak category/id parsing, denied PDF downloads, missing pdftotext, or silent empty feeds hide paper discovery/read failures.
7
+ * @invariants arXiv ids are normalized before URL construction; read returns PDF-derived text only and labels `text_source=pdf`.
8
+ * @side-effects HTTPS egress to export.arxiv.org and arxiv.org; read writes PDFs under the requested output directory and executes pdftotext.
9
+ * @perf O(limit) for Atom discovery; O(PDF bytes + extracted pages) for read.
10
+ * @concurrency safe - per-command local state only
11
+ * @test src/adapters/arxiv/papers.test.ts, tests/unit/commands/scholar.test.ts
12
+ * @stability experimental
13
+ * @since 0.225.2
7
14
  */
8
15
 
16
+ import { execFile } from "node:child_process";
17
+ import { join, resolve } from "node:path";
18
+ import { promisify } from "node:util";
19
+
9
20
  import { cli, Strategy } from "../../registry.js";
21
+ import { httpDownload, sanitizeFilename } from "../../engine/download.js";
10
22
 
11
23
  const ARXIV_BASE = "https://export.arxiv.org/api/query";
12
24
  const CATEGORY_RE = /^[a-z]+(?:-[a-z]+)*(?:\.[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*)?$/;
25
+ const execFileAsync = promisify(execFile);
13
26
 
14
27
  interface ArxivEntry {
15
28
  id: string;
@@ -44,6 +57,20 @@ export function requireArxivAuthor(value: unknown): string {
44
57
  return author;
45
58
  }
46
59
 
60
+ export function normalizeArxivId(value: unknown): string {
61
+ const id = String(value ?? "")
62
+ .trim()
63
+ .replace(/^arxiv:/i, "")
64
+ .replace(/^https?:\/\/(?:www\.)?arxiv\.org\/(?:abs|pdf)\//i, "")
65
+ .replace(/\.pdf$/i, "");
66
+ if (
67
+ !/^(?:\d{4}\.\d{4,5}|[a-z-]+(?:\.[A-Z]{2})?\/\d{7})(?:v\d+)?$/i.test(id)
68
+ ) {
69
+ throw new Error(`Invalid arXiv id "${String(value ?? "")}".`);
70
+ }
71
+ return id;
72
+ }
73
+
47
74
  export function requireArxivCategory(value: unknown): string {
48
75
  const category = String(value ?? "").trim();
49
76
  if (!CATEGORY_RE.test(category)) {
@@ -157,6 +184,160 @@ function compactRows(entries: ArxivEntry[]): Array<Record<string, unknown>> {
157
184
  }));
158
185
  }
159
186
 
187
+ function arxivPdfUrl(id: string): string {
188
+ return `https://arxiv.org/pdf/${id}`;
189
+ }
190
+
191
+ function arxivAbsUrl(id: string): string {
192
+ return `https://arxiv.org/abs/${id.replace(/v\d+$/i, "")}`;
193
+ }
194
+
195
+ export function arxivArtifactFilename(input: {
196
+ id: string;
197
+ title?: unknown;
198
+ }): string {
199
+ const title = String(input.title ?? "")
200
+ .replace(/\s+/g, " ")
201
+ .trim()
202
+ .replace(/[^A-Za-z0-9._-]+/g, "-")
203
+ .replace(/^-+|-+$/g, "")
204
+ .slice(0, 96);
205
+ return sanitizeFilename(`${input.id}${title ? `-${title}` : ""}.pdf`);
206
+ }
207
+
208
+ export function requireArxivPageRange(
209
+ firstPage: unknown,
210
+ lastPage: unknown,
211
+ ): { firstPage: number; lastPage: number } {
212
+ const first = Number(firstPage ?? 1);
213
+ const last = Number(lastPage ?? 20);
214
+ if (!Number.isInteger(first) || first < 1) {
215
+ throw new Error("arxiv first-page must be an integer >= 1.");
216
+ }
217
+ if (!Number.isInteger(last) || last < first) {
218
+ throw new Error("arxiv last-page must be an integer >= first-page.");
219
+ }
220
+ return { firstPage: first, lastPage: last };
221
+ }
222
+
223
+ export function requireArxivMaxChars(
224
+ value: unknown,
225
+ fallback = 40_000,
226
+ ): number {
227
+ if (value === undefined || value === null || value === "") return fallback;
228
+ const n = Number(value);
229
+ if (!Number.isInteger(n) || n < 1_000 || n > 1_000_000) {
230
+ throw new Error(
231
+ `arxiv max-chars must be an integer in [1000, 1000000]. Got: ${String(value)}`,
232
+ );
233
+ }
234
+ return n;
235
+ }
236
+
237
+ function truncateText(
238
+ text: string,
239
+ maxChars: number,
240
+ ): {
241
+ text: string;
242
+ truncated: boolean;
243
+ originalChars: number;
244
+ } {
245
+ if (text.length <= maxChars) {
246
+ return { text, truncated: false, originalChars: text.length };
247
+ }
248
+ return {
249
+ text: `${text.slice(0, maxChars).trimEnd()}\n\n[truncated at ${maxChars} characters]`,
250
+ truncated: true,
251
+ originalChars: text.length,
252
+ };
253
+ }
254
+
255
+ async function fetchArxivEntryById(id: string): Promise<ArxivEntry> {
256
+ const params = new URLSearchParams({ id_list: id });
257
+ const rows = parseArxivEntries(await fetchArxiv(params));
258
+ const row = rows[0];
259
+ if (!row) throw new Error(`No arXiv paper found for ${id}.`);
260
+ return row;
261
+ }
262
+
263
+ export async function readArxivPaper(
264
+ kwargs: Record<string, unknown>,
265
+ ): Promise<Record<string, unknown>> {
266
+ const id = normalizeArxivId(kwargs.id ?? kwargs.arxiv_id ?? kwargs.ref);
267
+ const entry = await fetchArxivEntryById(id);
268
+ const canonicalId = entry.id || id.replace(/v\d+$/i, "");
269
+ const pdfUrl = arxivPdfUrl(id);
270
+ const outputDir = resolve(String(kwargs.output ?? "./arxiv-downloads"));
271
+ const path = join(
272
+ outputDir,
273
+ arxivArtifactFilename({ id, title: entry.title }),
274
+ );
275
+ const download = await httpDownload(pdfUrl, path, {
276
+ Accept: "application/pdf,*/*",
277
+ Referer: arxivAbsUrl(canonicalId),
278
+ "User-Agent": "unicli-arxiv/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
279
+ });
280
+ if (download.status === "failed" || !download.path) {
281
+ throw new Error(
282
+ `arXiv PDF download failed for ${id}: ${download.error ?? "no path"}.`,
283
+ );
284
+ }
285
+
286
+ const { firstPage, lastPage } = requireArxivPageRange(
287
+ kwargs["first-page"] ?? kwargs.firstPage,
288
+ kwargs["last-page"] ?? kwargs.lastPage,
289
+ );
290
+ const maxChars = requireArxivMaxChars(
291
+ kwargs["max-chars"] ?? kwargs.maxChars,
292
+ 40_000,
293
+ );
294
+ const { stdout } = await execFileAsync(
295
+ "pdftotext",
296
+ [
297
+ "-layout",
298
+ "-enc",
299
+ "UTF-8",
300
+ "-f",
301
+ String(firstPage),
302
+ "-l",
303
+ String(lastPage),
304
+ download.path,
305
+ "-",
306
+ ],
307
+ { timeout: 60_000, maxBuffer: 10 * 1024 * 1024 },
308
+ );
309
+ const text = stdout.trim();
310
+ if (!text) {
311
+ throw new Error(
312
+ `pdftotext returned no text for arXiv ${id} pages ${firstPage}-${lastPage}.`,
313
+ );
314
+ }
315
+ const truncated = truncateText(text, maxChars);
316
+ return {
317
+ id: canonicalId,
318
+ title: entry.title,
319
+ authors: entry.authors
320
+ .split(/\s*,\s*/)
321
+ .map((author) => author.trim())
322
+ .filter(Boolean),
323
+ year: Number(entry.published.slice(0, 4)) || undefined,
324
+ date: entry.published,
325
+ venue: "arXiv",
326
+ type: "preprint",
327
+ abstract: entry.abstract,
328
+ arxiv_id: canonicalId,
329
+ source_adapter: "arxiv",
330
+ source_url: arxivAbsUrl(canonicalId),
331
+ pdf_url: pdfUrl,
332
+ path: download.path,
333
+ text: truncated.text,
334
+ text_chars: truncated.originalChars,
335
+ text_truncated: truncated.truncated,
336
+ text_source: "pdf",
337
+ retrieved_at: new Date().toISOString(),
338
+ };
339
+ }
340
+
160
341
  cli({
161
342
  site: "arxiv",
162
343
  name: "author",
@@ -192,6 +373,72 @@ cli({
192
373
  },
193
374
  });
194
375
 
376
+ cli({
377
+ site: "arxiv",
378
+ name: "read",
379
+ description: "Download an arXiv PDF by ID and extract text with pdftotext",
380
+ domain: "arxiv.org",
381
+ strategy: Strategy.PUBLIC,
382
+ args: [
383
+ {
384
+ name: "id",
385
+ type: "str",
386
+ required: true,
387
+ positional: true,
388
+ description: "arXiv paper ID (e.g. 1706.03762)",
389
+ "x-unicli-kind": "id",
390
+ "x-unicli-accepts": ["url"],
391
+ },
392
+ {
393
+ name: "output",
394
+ type: "str",
395
+ default: "./arxiv-downloads",
396
+ description: "Output directory",
397
+ "x-unicli-kind": "path",
398
+ },
399
+ {
400
+ name: "first-page",
401
+ type: "int",
402
+ default: 1,
403
+ description: "First PDF page to extract",
404
+ },
405
+ {
406
+ name: "last-page",
407
+ type: "int",
408
+ default: 20,
409
+ description: "Last PDF page to extract",
410
+ },
411
+ {
412
+ name: "max-chars",
413
+ type: "int",
414
+ default: 40000,
415
+ description: "Maximum extracted text characters",
416
+ },
417
+ ],
418
+ columns: [
419
+ "id",
420
+ "title",
421
+ "source_adapter",
422
+ "source_url",
423
+ "pdf_url",
424
+ "path",
425
+ "text_source",
426
+ "text",
427
+ "text_chars",
428
+ "text_truncated",
429
+ ],
430
+ capabilities: [
431
+ "http.fetch",
432
+ "http.download",
433
+ "subprocess.exec",
434
+ "scholar.fulltext",
435
+ "scholar.pdf",
436
+ ],
437
+ executables: ["pdftotext"],
438
+ minimum_capability: "subprocess.exec",
439
+ func: async (_page, kwargs) => [await readArxivPaper(kwargs)],
440
+ });
441
+
195
442
  cli({
196
443
  site: "arxiv",
197
444
  name: "recent",
@@ -1,7 +1,26 @@
1
+ /**
2
+ * @owner src::adapters::baidu-scholar::search
3
+ * @does Registers Baidu Scholar public browser search as a discovery-only scholarly source.
4
+ * @needs xueshu.baidu.com current `/ndscholar/browse/search` result DOM, src/registry.ts, src/types.ts, browser tools
5
+ * @feeds src/commands/scholar.ts capability discovery, `unicli baidu-scholar search`, `unicli scholar coverage/doctor`
6
+ * @breaks Baidu Scholar route or result-card DOM drift can return empty rows or navigation errors.
7
+ * @invariants Search is discovery-only; source/provider links are hints, not PDF/full-text proof.
8
+ * @side-effects Navigates a Uni-CLI managed browser page to Baidu Scholar public search.
9
+ * @perf O(limit) DOM extraction after one page navigation.
10
+ * @concurrency safe — command state is page-local
11
+ * @test live smoke via `unicli baidu-scholar search <query>`; URL contract in tests/unit/adapters/scholar-sources.test.ts
12
+ * @stability experimental
13
+ * @since 2026-06-27
14
+ */
15
+
1
16
  import { cli, Strategy } from "../../registry.js";
2
17
  import type { IPage } from "../../types.js";
3
18
  import { intArg, js, str } from "../_shared/browser-tools.js";
4
19
 
20
+ export function buildBaiduScholarSearchUrl(query: string): string {
21
+ return `https://xueshu.baidu.com/ndscholar/browse/search?wd=${encodeURIComponent(query)}`;
22
+ }
23
+
5
24
  cli({
6
25
  site: "baidu-scholar",
7
26
  name: "search",
@@ -13,7 +32,15 @@ cli({
13
32
  { name: "query", type: "str", required: true, positional: true },
14
33
  { name: "limit", type: "int", default: 10 },
15
34
  ],
16
- columns: ["title", "authors", "source", "url"],
35
+ columns: [
36
+ "id",
37
+ "title",
38
+ "authors",
39
+ "source",
40
+ "year",
41
+ "cited_by_count",
42
+ "source_url",
43
+ ],
17
44
  capabilities: [
18
45
  "mcp-browser.navigate",
19
46
  "mcp-browser.evaluate",
@@ -22,19 +49,55 @@ cli({
22
49
  func: async (page, kwargs) => {
23
50
  const p = page as IPage;
24
51
  const limit = intArg(kwargs.limit, 10, 50);
25
- await p.goto(
26
- `https://xueshu.baidu.com/s?wd=${encodeURIComponent(str(kwargs.query))}`,
27
- { settleMs: 2500 },
28
- );
52
+ await p.goto(buildBaiduScholarSearchUrl(str(kwargs.query)), {
53
+ settleMs: 3000,
54
+ });
29
55
  const rows = await p.evaluate(`(() => {
30
- const cards = [...document.querySelectorAll('.result, .sc_content, .result-item')];
56
+ const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim();
57
+ const cleanAuthor = (value) => normalize(value).replace(/[,,]+$/g, '');
58
+ const paperId = (url) => {
59
+ try {
60
+ return new URL(url, location.href).searchParams.get('paperid') || '';
61
+ } catch {
62
+ return '';
63
+ }
64
+ };
65
+ const cards = [...document.querySelectorAll('.paper-wrap.result, .result, .sc_content, .result-item')];
31
66
  return cards.map((card) => {
32
- const link = card.querySelector('h3 a, .t a, a[href]');
67
+ const link = card.querySelector('.paper-title a[href], h3 a[href], .t a[href], a[href]');
68
+ const url = link ? new URL(link.getAttribute('href') || '', location.href).href : '';
69
+ const info = card.querySelector('.paper-info');
70
+ const infoText = normalize(info?.textContent);
71
+ const authors = [...(info?.querySelectorAll('a[href*="author"]') || [])]
72
+ .map((node) => cleanAuthor(node.textContent))
73
+ .filter(Boolean);
74
+ const source = normalize(
75
+ [...(info?.querySelectorAll('a[href]') || [])]
76
+ .find((node) => {
77
+ const href = node.getAttribute('href') || '';
78
+ return !href.includes('author%3A') && !href.includes('refpaperuri');
79
+ })?.textContent
80
+ ).replace(/^《|》$/g, '');
81
+ const sourceLinks = [...card.querySelectorAll('.paper-source a[href]')]
82
+ .map((node) => ({
83
+ label: normalize(node.textContent),
84
+ url: new URL(node.getAttribute('href') || '', location.href).href
85
+ }))
86
+ .filter((item) => item.label && item.url && !item.url.startsWith('javascript:'));
87
+ const citedText = normalize(card.querySelector('.paper-info a[href*="refpaperuri"]')?.textContent);
33
88
  return {
34
- title: (link?.textContent || '').replace(/\\s+/g, ' ').trim(),
35
- authors: (card.querySelector('.author_text, .sc_info, .c_font')?.textContent || '').replace(/\\s+/g, ' ').trim(),
36
- source: (card.querySelector('.journal_title, .sc_info')?.textContent || '').replace(/\\s+/g, ' ').trim(),
37
- url: link ? new URL(link.getAttribute('href') || '', location.href).href : ''
89
+ id: paperId(url) || url || normalize(link?.textContent),
90
+ title: normalize(link?.textContent),
91
+ authors: authors.join(', '),
92
+ source,
93
+ venue: source,
94
+ type: normalize(card.querySelector('.paper-type')?.textContent),
95
+ year: (infoText.match(/(19|20)\\d{2}/) || [])[0] || '',
96
+ abstract: normalize(card.querySelector('.paper-abstract')?.textContent).replace(/\\s*查看全部>>$/, ''),
97
+ cited_by_count: citedText.match(/\\d+/)?.[0] || '',
98
+ source_url: url,
99
+ url,
100
+ source_links: sourceLinks
38
101
  };
39
102
  }).filter((row) => row.title).slice(0, ${js(limit)});
40
103
  })()`);
@@ -0,0 +1,112 @@
1
+ /**
2
+ * @owner src::adapters::biorxiv::preprints
3
+ * @does Registers bioRxiv recent/search, DOI metadata, PDF download, and read commands backed by the official xRxiv API helpers.
4
+ * @needs src/adapters/rxiv/preprints.ts, api.biorxiv.org, bioRxiv PDF/JATS asset URLs.
5
+ * @feeds surface coverage ledger, scholarly preprint discovery/search, scholar DOI read/download routing.
6
+ * @breaks bioRxiv API drift, date-window search exhaustion, Cloudflare denial on source assets, or missing pdftotext stops read/download rather than fabricating text.
7
+ */
8
+
9
+ import { cli, Strategy } from "../../registry.js";
10
+ import {
11
+ downloadRxivPdf,
12
+ fetchPaperRow,
13
+ fetchRecentRows,
14
+ fetchSearchRows,
15
+ readRxivPaper,
16
+ RXIV_DOWNLOAD_ARGS,
17
+ RXIV_DOWNLOAD_CAPABILITIES,
18
+ RXIV_DOWNLOAD_COLUMNS,
19
+ RXIV_PAPER_ARGS,
20
+ RXIV_PAPER_CAPABILITIES,
21
+ RXIV_PAPER_COLUMNS,
22
+ RXIV_READ_ARGS,
23
+ RXIV_READ_CAPABILITIES,
24
+ RXIV_READ_COLUMNS,
25
+ RXIV_RECENT_ARGS,
26
+ RXIV_RECENT_CAPABILITIES,
27
+ RXIV_RECENT_COLUMNS,
28
+ RXIV_SEARCH_ARGS,
29
+ RXIV_SEARCH_CAPABILITIES,
30
+ RXIV_SEARCH_COLUMNS,
31
+ type RxivConfig,
32
+ } from "../rxiv/preprints.js";
33
+
34
+ const CONFIG: RxivConfig = {
35
+ site: "biorxiv",
36
+ label: "bioRxiv",
37
+ apiServer: "biorxiv",
38
+ webOrigin: "https://www.biorxiv.org",
39
+ };
40
+ const DOMAIN = "api.biorxiv.org";
41
+
42
+ cli({
43
+ site: "biorxiv",
44
+ name: "recent",
45
+ description: "List recent bioRxiv preprints from the official API",
46
+ domain: DOMAIN,
47
+ strategy: Strategy.PUBLIC,
48
+ args: RXIV_RECENT_ARGS,
49
+ columns: RXIV_RECENT_COLUMNS,
50
+ capabilities: RXIV_RECENT_CAPABILITIES,
51
+ func: async (_page, kwargs) => fetchRecentRows(CONFIG, kwargs),
52
+ });
53
+
54
+ cli({
55
+ site: "biorxiv",
56
+ name: "search",
57
+ description:
58
+ "Search bioRxiv official API metadata within a bounded date window",
59
+ domain: DOMAIN,
60
+ strategy: Strategy.PUBLIC,
61
+ args: RXIV_SEARCH_ARGS,
62
+ columns: RXIV_SEARCH_COLUMNS,
63
+ capabilities: RXIV_SEARCH_CAPABILITIES,
64
+ func: async (_page, kwargs) => fetchSearchRows(CONFIG, kwargs),
65
+ });
66
+
67
+ cli({
68
+ site: "biorxiv",
69
+ name: "paper",
70
+ description: "Fetch bioRxiv preprint metadata by DOI",
71
+ domain: DOMAIN,
72
+ strategy: Strategy.PUBLIC,
73
+ args: RXIV_PAPER_ARGS,
74
+ columns: RXIV_PAPER_COLUMNS,
75
+ capabilities: RXIV_PAPER_CAPABILITIES,
76
+ func: async (_page, kwargs) => [
77
+ await fetchPaperRow(CONFIG, kwargs.doi ?? kwargs.id ?? kwargs.ref),
78
+ ],
79
+ });
80
+
81
+ cli({
82
+ site: "biorxiv",
83
+ name: "download",
84
+ description: "Download a bioRxiv preprint PDF by DOI",
85
+ domain: DOMAIN,
86
+ strategy: Strategy.PUBLIC,
87
+ args: RXIV_DOWNLOAD_ARGS,
88
+ columns: RXIV_DOWNLOAD_COLUMNS,
89
+ capabilities: RXIV_DOWNLOAD_CAPABILITIES,
90
+ minimum_capability: "http.download",
91
+ func: async (_page, kwargs) => [
92
+ await downloadRxivPdf(
93
+ CONFIG,
94
+ await fetchPaperRow(CONFIG, kwargs.doi ?? kwargs.id ?? kwargs.ref),
95
+ kwargs.output,
96
+ ),
97
+ ],
98
+ });
99
+
100
+ cli({
101
+ site: "biorxiv",
102
+ name: "read",
103
+ description:
104
+ "Read bioRxiv preprint text by DOI, preferring JATS XML before PDF extraction",
105
+ domain: DOMAIN,
106
+ strategy: Strategy.PUBLIC,
107
+ args: RXIV_READ_ARGS,
108
+ columns: RXIV_READ_COLUMNS,
109
+ capabilities: RXIV_READ_CAPABILITIES,
110
+ minimum_capability: "subprocess.exec",
111
+ func: async (_page, kwargs) => [await readRxivPaper(CONFIG, kwargs)],
112
+ });