lifeos 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/README.md +15 -9
  2. package/README.zh.md +15 -9
  3. package/assets/lifeos-rules.en.md +1 -1
  4. package/assets/lifeos-rules.zh.md +1 -1
  5. package/assets/lifeos.yaml +1 -0
  6. package/assets/skills/archive/SKILL.en.md +1 -1
  7. package/assets/skills/archive/SKILL.zh.md +1 -1
  8. package/assets/skills/ask/SKILL.en.md +1 -1
  9. package/assets/skills/ask/SKILL.zh.md +1 -1
  10. package/assets/skills/brainstorm/SKILL.en.md +1 -1
  11. package/assets/skills/brainstorm/SKILL.zh.md +1 -1
  12. package/assets/skills/digest/SKILL.en.md +212 -0
  13. package/assets/skills/digest/SKILL.zh.md +207 -0
  14. package/assets/skills/digest/references/__pycache__/rss-arxiv-script.cpython-312.pyc +0 -0
  15. package/assets/skills/digest/references/config-parser.en.md +179 -0
  16. package/assets/skills/digest/references/config-parser.zh.md +177 -0
  17. package/assets/skills/digest/references/rss-arxiv-script.py +1549 -0
  18. package/assets/skills/digest/references/run-pipeline.en.md +236 -0
  19. package/assets/skills/digest/references/run-pipeline.zh.md +235 -0
  20. package/assets/skills/digest/references/setup-guide.en.md +192 -0
  21. package/assets/skills/digest/references/setup-guide.zh.md +188 -0
  22. package/assets/skills/knowledge/SKILL.en.md +1 -1
  23. package/assets/skills/knowledge/SKILL.zh.md +1 -1
  24. package/assets/skills/project/SKILL.en.md +1 -1
  25. package/assets/skills/project/SKILL.zh.md +1 -1
  26. package/assets/skills/read-pdf/SKILL.en.md +1 -1
  27. package/assets/skills/read-pdf/SKILL.zh.md +1 -1
  28. package/assets/skills/research/SKILL.en.md +1 -1
  29. package/assets/skills/research/SKILL.zh.md +1 -1
  30. package/assets/skills/revise/SKILL.en.md +1 -1
  31. package/assets/skills/revise/SKILL.zh.md +1 -1
  32. package/assets/skills/today/SKILL.en.md +1 -1
  33. package/assets/skills/today/SKILL.zh.md +1 -1
  34. package/dist/cli/commands/doctor.js +9 -9
  35. package/dist/cli/commands/doctor.js.map +1 -1
  36. package/dist/cli/commands/upgrade.js +20 -2
  37. package/dist/cli/commands/upgrade.js.map +1 -1
  38. package/dist/cli/utils/install-assets.js +6 -2
  39. package/dist/cli/utils/install-assets.js.map +1 -1
  40. package/dist/config.d.ts +1 -0
  41. package/dist/config.js +2 -0
  42. package/dist/config.js.map +1 -1
  43. package/dist/index.d.ts +1 -1
  44. package/dist/index.js +1 -1
  45. package/dist/server.js +1 -1
  46. package/package.json +1 -1
@@ -0,0 +1,1549 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ /digest RSS + arXiv fetch helper.
4
+
5
+ Input: JSON config from stdin.
6
+ Output: JSON result on stdout.
7
+
8
+ Example:
9
+ echo '{"language":"en","rss":{"enabled":false},"arxiv":{"enabled":false},"days":7}' | python3 rss-arxiv-script.py
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import re
16
+ import subprocess
17
+ import sys
18
+ import urllib.parse
19
+ import urllib.request
20
+ import xml.etree.ElementTree as ET
21
+ from datetime import datetime, timedelta, timezone
22
+
23
+
24
# Base endpoints of the two remote APIs queried by the fetch helpers below.
ARXIV_API_URL = "http://export.arxiv.org/api/query"
OPENALEX_API_URL = "https://api.openalex.org/works"
# Headers attached to the urllib requests built for the arXiv and OpenAlex APIs.
REQUEST_HEADERS = {"User-Agent": "LifeOS digest/1.0"}
# NOTE(review): presumably the pause between consecutive arXiv API calls;
# the code that reads this value is not visible in this part of the file.
ARXIV_REQUEST_INTERVAL_SECONDS = 3
# OpenAlex source identifiers. The ChemRxiv one is matched against the tail of
# a location's source id in is_openalex_chemrxiv_work(); the SocArXiv and SSRN
# ones are presumably used as repository filters elsewhere - TODO confirm.
CHEMRXIV_OPENALEX_REPOSITORY_ID = "S4393918830"
SOCARXIV_OPENALEX_REPOSITORY_ID = "S4306401238"
SSRN_OPENALEX_REPOSITORY_ID = "S4210172589"
# Matches arxiv.org abs/pdf URLs; group 1 captures the identifier without the
# optional version suffix or ".pdf" extension.
ARXIV_LINK_RE = re.compile(
    r"arxiv\.org/(?:abs|pdf)/((?:[a-z\-]+(?:\.[a-z\-]+)?/\d{7})|(?:\d{4}\.\d{4,5}))(?:v\d+)?(?:\.pdf)?",
    re.IGNORECASE,
)
# DOI patterns used to recognise ChemRxiv and SSRN works among OpenAlex fields.
CHEMRXIV_DOI_RE = re.compile(r"10\.26434/chemrxiv[0-9A-Za-z./_-]*", re.IGNORECASE)
SSRN_DOI_RE = re.compile(r"10\.2139/ssrn[0-9A-Za-z./_-]*", re.IGNORECASE)
# Character range used by keyword_contains_non_english() to detect CJK text.
CJK_RE = re.compile(r"[\u3400-\u9fff]")
# One or more whitespace characters; collapsed to a single space by normalize_whitespace().
WHITESPACE_RE = re.compile(r"\s+")
# Double-quoted phrase inside a configured keyword; group 1 is the phrase text.
QUOTE_RE = re.compile(r'"([^"]+)"')
# Relative preference per source key: higher values sort first in rank_papers()
# and win ties between duplicates in deduplicate_papers().
SOURCE_PRIORITY = {
    "arxiv": 4,
    "biorxiv": 3,
    "medrxiv": 3,
    "chemrxiv": 3,
    "socarxiv": 3,
    "ssrn": 3,
    "openalex": 1,
}

# Display label emitted as "source_type" for each canonical source key.
SOURCE_DISPLAY_NAMES = {
    "arxiv": "arXiv",
    "biorxiv": "bioRxiv",
    "medrxiv": "medRxiv",
    "chemrxiv": "ChemRxiv",
    "socarxiv": "SocArXiv",
    "ssrn": "SSRN",
    "openalex": "openalex",
}

# NOTE(review): presumably the source keys accepted from configuration; the
# consumer is not visible in this part of the file.
SUPPORTED_PAPER_SOURCE_KEYS = {"arxiv", "biorxiv", "medrxiv", "chemrxiv", "socarxiv", "ssrn"}


# Localized strings, keyed by the language codes returned by normalize_language().
MESSAGES = {
    "zh": {
        "untitled": "无标题",
        "fetch_failed": "抓取失败",
        "arxiv_batch_failed": "arXiv 批次 {index} 抓取失败",
        "author_suffix": " 等",
    },
    "en": {
        "untitled": "Untitled",
        "fetch_failed": "Fetch failed",
        "arxiv_batch_failed": "arXiv batch {index} failed",
        "author_suffix": " et al.",
    },
}
77
+
78
+
79
def normalize_language(language: str | None) -> str:
    """Map a requested language onto one of the supported bundle keys.

    Only the exact string ``"en"`` selects English; every other value,
    including ``None``, resolves to ``"zh"``.
    """
    if language == "en":
        return "en"
    return "zh"
82
+
83
+
84
def normalize_source_type(source_type: str | None) -> str:
    """Reduce a paper source label to its canonical lowercase key.

    Whitespace is removed and the text lowercased. Falsy input yields an
    empty string; unknown labels are returned in their normalized form.
    """
    if not source_type:
        return ""

    key = normalize_whitespace(source_type).lower().replace(" ", "")
    # Alias table: every recognised spelling currently maps onto itself.
    canonical_keys = {
        name: name
        for name in (
            "arxiv",
            "biorxiv",
            "medrxiv",
            "chemrxiv",
            "socarxiv",
            "ssrn",
            "openalex",
        )
    }
    return canonical_keys.get(key, key)
100
+
101
+
102
def display_source_type(source_type: str | None) -> str:
    """Translate a paper source label into its display name.

    Returns an empty string for empty input, and the normalized key itself
    when no display name is registered for it.
    """
    key = normalize_source_type(source_type)
    return SOURCE_DISPLAY_NAMES.get(key, key) if key else ""
108
+
109
+
110
def normalize_string_list(value: object) -> list[str]:
    """Coerce a config or record field into a flat list of clean strings.

    Lists are cleaned item by item (``None`` items are skipped), strings are
    split on commas, and any other value is stringified into a single entry.
    Entries that are empty after whitespace normalization are dropped.
    """
    if value is None:
        return []

    if isinstance(value, list):
        candidates = [normalize_whitespace(str(entry)) for entry in value if entry is not None]
    elif isinstance(value, str):
        candidates = [normalize_whitespace(piece) for piece in value.split(",")]
    else:
        candidates = [normalize_whitespace(str(value))]

    return [text for text in candidates if text]
134
+
135
+
136
def normalize_published_date(value: object) -> str:
    """Render a date-like value as ``YYYY-MM-DD``, or ``""`` when impossible.

    The value is stringified and parsed as an ISO timestamp (a ``Z`` suffix is
    accepted). If that fails, the first ``YYYY-MM-DD`` substring is used.
    """
    text = "" if value is None else normalize_whitespace(str(value))
    if not text:
        return ""

    iso_text = text.replace("Z", "+00:00")
    try:
        return datetime.fromisoformat(iso_text).strftime("%Y-%m-%d")
    except Exception:
        # Not a parseable ISO timestamp: salvage an embedded calendar date.
        found = re.search(r"\d{4}-\d{2}-\d{2}", text)
        if found:
            return found.group(0)
        return ""
151
+
152
+
153
def normalize_paper_title(value: object, language: str | None) -> str:
    """Produce a non-empty title for a paper record.

    The value is stringified and whitespace-normalized; when nothing remains
    (or the value is ``None``) the localized "untitled" label is returned.
    """
    if value is not None:
        cleaned = normalize_whitespace(str(value))
        if cleaned:
            return cleaned
    return get_messages(language)["untitled"]
157
+
158
+
159
def normalize_paper_authors(authors: object, language: str | None) -> str:
    """Normalize paper authors from list or string input.

    Args:
        authors: A list of author names, a pre-formatted author string, any
            other value (stringified), or ``None``.
        language: Language used for the overflow suffix when a list holds
            more than three names.

    Returns:
        A single display string; ``""`` when no author information exists.
    """
    if authors is None:
        return ""

    if isinstance(authors, list):
        # normalize_string_list() cleans each name once and skips None
        # entries, so a missing author no longer shows up as the text "None".
        return format_authors(normalize_string_list(authors), language)

    # Strings and any other scalar are treated as an already-formatted value.
    return normalize_whitespace(str(authors))
172
+
173
+
174
def normalize_paper_record(
    source_type: str,
    record: dict,
    scope: str,
    language: str | None,
) -> dict[str, str]:
    """Convert a source-specific record into the unified paper schema.

    Each output field is taken from the first truthy value among several
    alternative record keys. Links of arXiv and OpenAlex records are
    rewritten to the canonical arXiv abs URL when they can be recognised as
    arXiv links. Summaries are capped at 300 characters.
    """

    def pick(*keys: str) -> object:
        """Mimic ``record.get(a) or record.get(b) or ...`` for the given keys."""
        value = None
        for key in keys:
            value = record.get(key)
            if value:
                break
        return value

    source_key = normalize_source_type(source_type)

    raw_link = pick("link", "url", "doi")
    link = "" if raw_link is None else normalize_whitespace(str(raw_link))
    if link and source_key in ("arxiv", "openalex"):
        canonical = normalize_arxiv_link(link)
        if canonical is not None:
            link = canonical

    categories = normalize_whitespace(
        str(pick("categories", "category", "preprint_category", "scope") or scope or "")
    )
    published = normalize_published_date(
        pick("published", "published_date", "preprint_date", "date", "publication_date")
    )
    summary = normalize_whitespace(
        str(pick("summary", "abstract", "preprint_abstract", "description") or "")
    )[:300]
    title = normalize_paper_title(
        pick("title", "display_name", "preprint_title", "name"),
        language,
    )
    authors = normalize_paper_authors(
        pick("authors", "preprint_authors", "author_names"),
        language,
    )
    # A scope on the record wins over the caller's scope, then the categories.
    normalized_scope = normalize_whitespace(str(record.get("scope") or scope or categories))

    return {
        "title": title,
        "link": link,
        "published": published,
        "summary": summary,
        "categories": categories,
        "authors": authors,
        "source": source_key,
        "source_type": display_source_type(source_key),
        "scope": normalized_scope,
    }
241
+
242
+
243
def get_messages(language: str | None) -> dict[str, str]:
    """Look up the message bundle for the normalized language."""
    bundle_key = normalize_language(language)
    return MESSAGES[bundle_key]
246
+
247
+
248
def format_authors(authors: list[str], language: str | None) -> str:
    """Join up to three author names, adding a localized suffix on overflow.

    Returns an empty string when no authors are given.
    """
    if not authors:
        return ""

    shown = ", ".join(authors[:3])
    if len(authors) <= 3:
        return shown
    return shown + get_messages(language)["author_suffix"]
257
+
258
+
259
def build_failure_title(label: str, error: Exception) -> str:
    """Wrap a failure label and its error text in square brackets.

    The bracketed shape is part of the JSON contract emitted for failed fetches.
    """
    return "[{}: {}]".format(label, error)
262
+
263
+
264
def normalize_whitespace(value: str | None) -> str:
    """Replace each whitespace run with one space and strip both ends.

    ``None`` and empty input both produce an empty string.
    """
    text = value if value else ""
    collapsed = WHITESPACE_RE.sub(" ", text)
    return collapsed.strip()
267
+
268
+
269
def normalize_title_key(value: str) -> str:
    """Build a loose title key for deduplication.

    The title is whitespace-normalized and lowercased, then every character
    that is not alphanumeric is removed. Unicode letters and digits are kept,
    so titles written in non-Latin scripts (for example Chinese) still get a
    usable key instead of collapsing to an empty string. ASCII titles produce
    the same key as before.
    """
    lowered = normalize_whitespace(value).lower()
    # \W covers punctuation, symbols and whitespace; "_" counts as a word
    # character in regexes, so it is listed explicitly.
    return re.sub(r"[\W_]+", "", lowered)
273
+
274
+
275
def strip_arxiv_version(identifier: str) -> str:
    """Remove a trailing ``v<digits>`` version marker from an arXiv identifier."""
    suffix = re.search(r"v\d+$", identifier)
    if suffix is None:
        return identifier
    # Splice the match out rather than truncating, so anything the pattern
    # leaves after the match is kept.
    return identifier[: suffix.start()] + identifier[suffix.end():]
278
+
279
+
280
def normalize_arxiv_link(link: str | None) -> str | None:
    """Normalize a raw arXiv URL or id to the canonical abs URL.

    Args:
        link: An arxiv.org abs/pdf URL, a bare identifier such as
            ``2401.12345v2`` or ``hep-th/9901001``, optionally prefixed with
            ``arXiv:`` in any letter case, or ``None``.

    Returns:
        ``https://arxiv.org/abs/<id>`` without the version suffix, or ``None``
        when the input is not recognisable as an arXiv reference.
    """
    if not link:
        return None

    raw = normalize_whitespace(link)
    if not raw:
        return None

    # The identifier patterns are case-insensitive, so the prefix is matched
    # the same way; a space after the colon ("arXiv: 2401.12345") is tolerated.
    if raw[:6].lower() == "arxiv:":
        raw = raw[6:].strip()

    bare_match = re.fullmatch(
        r"((?:[a-z\-]+(?:\.[a-z\-]+)?/\d{7})|(?:\d{4}\.\d{4,5}))(?:v\d+)?",
        raw,
        re.IGNORECASE,
    )
    if bare_match:
        return f"https://arxiv.org/abs/{strip_arxiv_version(bare_match.group(1))}"

    # Otherwise look for an arxiv.org abs/pdf URL anywhere in the text.
    matched = ARXIV_LINK_RE.search(raw)
    if not matched:
        return None

    return f"https://arxiv.org/abs/{strip_arxiv_version(matched.group(1))}"
305
+
306
+
307
def build_error(module: str, source: str, message: str) -> dict[str, str]:
    """Assemble a structured error record with a whitespace-normalized message."""
    record = {"module": module, "source": source}
    record["message"] = normalize_whitespace(message)
    return record
314
+
315
+
316
def ensure_dependencies() -> None:
    """Install feedparser and requests on demand.

    Does nothing when both packages can be imported. Otherwise pip is run with
    the current interpreter, first with ``--break-system-packages`` and, if
    that run fails, once more without the flag.

    Raises:
        subprocess.CalledProcessError: If the installation fails both ways.
    """
    try:
        import feedparser  # noqa: F401
        import requests  # noqa: F401
    except ImportError:
        pass
    else:
        return

    install_command = [sys.executable, "-m", "pip", "install", "feedparser", "requests"]
    try:
        subprocess.run([*install_command, "--break-system-packages", "-q"], check=True)
    except subprocess.CalledProcessError:
        # pip releases that predate --break-system-packages reject the option
        # itself; retry without it before giving up.
        subprocess.run([*install_command, "-q"], check=True)
335
+
336
+
337
def _parse_feed_timestamp(raw_value: object) -> datetime | None:
    """Parse one feed timestamp string, returning None when it is unreadable."""
    from email.utils import parsedate_to_datetime

    if not isinstance(raw_value, str):
        return None
    text = raw_value.strip()
    if not text:
        return None

    # RSS feeds normally carry RFC 2822 dates.
    try:
        return parsedate_to_datetime(text)
    except (TypeError, ValueError, IndexError, OverflowError):
        pass

    # Atom feeds carry ISO 8601 timestamps, which the RFC 2822 parser rejects.
    try:
        return datetime.fromisoformat(text.replace("Z", "+00:00"))
    except ValueError:
        return None


def fetch_rss(feeds: list[dict[str, str]], cutoff: datetime, language: str) -> list[dict[str, str]]:
    """Fetch RSS articles published after the cutoff.

    Args:
        feeds: Feed descriptors; each needs a ``url`` and may carry a ``name``.
        cutoff: Timezone-aware lower bound; older entries are skipped.
        language: Language for the localized fallback title and failure label.

    Returns:
        One record per recent entry with ``source``, ``title``, ``link``,
        ``published`` and ``summary``. A feed that fails to download or parse
        contributes a single record whose title is the bracketed failure
        message.
    """
    import feedparser
    import requests

    messages = get_messages(language)
    articles: list[dict[str, str]] = []

    for feed in feeds:
        url = feed["url"]
        if not url.startswith("http"):
            url = "https://" + url

        try:
            response = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
            parsed = feedparser.parse(response.content)
            for entry in parsed.entries:
                published_at = None
                for attr in ("published", "updated"):
                    published_at = _parse_feed_timestamp(getattr(entry, attr, None))
                    if published_at is not None:
                        break
                if published_at is None:
                    # Entries without a readable date are treated as new.
                    published_at = datetime.now(timezone.utc)
                if published_at.tzinfo is None:
                    # Naive timestamps are assumed to be UTC so they can be
                    # compared with the aware cutoff.
                    published_at = published_at.replace(tzinfo=timezone.utc)
                if published_at >= cutoff:
                    # Strip HTML tags, then cap the summary at 300 characters.
                    summary = re.sub(r"<[^>]+>", "", getattr(entry, "summary", "") or "")[:300]
                    articles.append(
                        {
                            "source": feed.get("name", ""),
                            "title": entry.get("title", messages["untitled"]),
                            "link": entry.get("link", ""),
                            "published": published_at.strftime("%Y-%m-%d"),
                            "summary": summary.strip(),
                        }
                    )
        except Exception as error:
            # Best effort per feed: report the failure in-band and carry on
            # with the remaining feeds.
            articles.append(
                {
                    "source": feed.get("name", ""),
                    "title": build_failure_title(messages["fetch_failed"], error),
                    "link": "",
                    "published": "",
                    "summary": "",
                }
            )

    return articles
390
+
391
+
392
def parse_arxiv_feed(xml_data: bytes, cutoff: datetime, language: str) -> list[dict[str, str]]:
    """Turn an arXiv Atom feed into normalized paper records.

    Entries lacking an id, publication date, title or summary, entries whose
    id is not a recognisable arXiv link, and entries published before the
    cutoff are skipped.
    """
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    feed_root = ET.fromstring(xml_data)
    collected: list[dict[str, str]] = []

    for entry in feed_root.findall("atom:entry", ns):
        required = [entry.find(f"atom:{tag}", ns) for tag in ("id", "published", "title", "summary")]
        if any(node is None or node.text is None for node in required):
            continue
        id_node, published_node, title_node, summary_node = required

        abs_link = normalize_arxiv_link(id_node.text)
        if abs_link is None:
            continue

        published_at = datetime.fromisoformat(published_node.text.replace("Z", "+00:00"))
        if published_at < cutoff:
            continue

        terms = [node.get("term") or "" for node in entry.findall("atom:category", ns)]
        # Only the first five category terms are reported.
        category_label = ", ".join(terms[:5])

        author_names = []
        for author_node in entry.findall("atom:author", ns):
            name_node = author_node.find("atom:name", ns)
            if name_node is not None and name_node.text is not None:
                author_names.append(name_node.text)

        raw_record = {
            "title": normalize_whitespace(title_node.text),
            "link": abs_link,
            "published": published_at.strftime("%Y-%m-%d"),
            "summary": normalize_whitespace(summary_node.text)[:300],
            "categories": category_label,
            "authors": author_names,
        }
        collected.append(normalize_paper_record("arXiv", raw_record, category_label, language))

    return collected
447
+
448
+
449
def fetch_recent_arxiv_category(category: str, max_results: int) -> bytes:
    """Download the newest submissions of one arXiv category as raw Atom XML.

    Results are requested sorted by submission date, newest first, starting
    at offset 0.
    """
    query = {
        "search_query": f"cat:{category}",
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    url = f"{ARXIV_API_URL}?{urllib.parse.urlencode(query)}"
    request = urllib.request.Request(url, headers=REQUEST_HEADERS)
    with urllib.request.urlopen(request, timeout=60) as response:
        payload = response.read()
    return payload
467
+
468
+
469
def extract_openalex_abstract(work: dict) -> str:
    """Recover a readable abstract (at most 300 characters) from an OpenAlex work.

    The ``abstract_inverted_index`` mapping of word to positions is preferred
    and is rebuilt into running text; a plain ``abstract`` string is the
    fallback. Returns ``""`` when neither is usable.
    """
    index = work.get("abstract_inverted_index")
    if isinstance(index, dict):
        placed = sorted(
            (position, word)
            for word, positions in index.items()
            if isinstance(word, str) and isinstance(positions, list)
            for position in positions
            if isinstance(position, int)
        )
        if placed:
            text = " ".join(word for _, word in placed)
            return normalize_whitespace(text)[:300]

    plain = work.get("abstract")
    if not isinstance(plain, str):
        return ""
    return normalize_whitespace(plain)[:300]
488
+
489
+
490
def extract_openalex_author_names(work: dict) -> list[str]:
    """List the author display names found in an OpenAlex work's authorships.

    Malformed authorship entries and non-string names are ignored.
    """
    authorships = work.get("authorships")
    if not isinstance(authorships, list):
        return []

    names: list[str] = []
    for entry in authorships:
        author = entry.get("author") if isinstance(entry, dict) else None
        name = author.get("display_name") if isinstance(author, dict) else None
        if isinstance(name, str):
            names.append(name)
    return names
508
+
509
+
510
def extract_openalex_category(work: dict) -> str:
    """Pick a topic label from an OpenAlex work's primary topic.

    The subfield is tried first, then the field, then the domain; the first
    non-empty display name wins. Returns ``""`` when none is available.
    """
    topic = work.get("primary_topic")
    if isinstance(topic, dict):
        for level in ("subfield", "field", "domain"):
            node = topic.get(level)
            label = node.get("display_name") if isinstance(node, dict) else None
            if isinstance(label, str) and label:
                return label
    return ""
525
+
526
+
527
def normalize_openalex_arxiv_link(work: dict) -> str | None:
    """Find a canonical arXiv abs URL for an OpenAlex work, if it has one.

    The ``ids`` entries are tried first, then the landing-page and PDF URLs
    of the primary, best open-access and listed locations, in that order.
    """
    raw_candidates: list[object] = []

    ids = work.get("ids")
    if isinstance(ids, dict):
        raw_candidates.extend(ids.get(key) for key in ("arxiv", "openalex"))

    locations: list[object] = [work.get("primary_location"), work.get("best_oa_location")]
    listed = work.get("locations")
    if isinstance(listed, list):
        locations.extend(listed)
    for location in locations:
        if isinstance(location, dict):
            raw_candidates.extend(location.get(field) for field in ("landing_page_url", "pdf_url"))

    for candidate in raw_candidates:
        if not isinstance(candidate, str):
            continue
        canonical = normalize_arxiv_link(candidate)
        if canonical is not None:
            return canonical
    return None
556
+
557
+
558
def normalize_openalex_chemrxiv_link(work: dict) -> str | None:
    """Find a ChemRxiv DOI in an OpenAlex work and return it as a doi.org URL.

    The work's DOI fields are checked before the landing-page and PDF URLs of
    its locations. Returns ``None`` when no ChemRxiv DOI is present.
    """
    doi_candidates: list[object] = [work.get("doi")]
    ids = work.get("ids")
    if isinstance(ids, dict):
        doi_candidates.append(ids.get("doi"))

    locations: list[object] = [work.get(field) for field in ("primary_location", "best_oa_location")]
    listed = work.get("locations")
    if isinstance(listed, list):
        locations.extend(listed)
    url_candidates = [
        location.get(field)
        for location in locations
        if isinstance(location, dict)
        for field in ("landing_page_url", "pdf_url")
    ]

    for candidate in [*doi_candidates, *url_candidates]:
        if isinstance(candidate, str):
            hit = CHEMRXIV_DOI_RE.search(normalize_whitespace(candidate))
            if hit:
                return f"https://doi.org/{hit.group(0)}"
    return None
586
+
587
+
588
def collect_openalex_location_urls(work: dict) -> list[str]:
    """Gather the landing-page and PDF URLs of an OpenAlex work.

    Locations are visited in the order primary, best open-access, then the
    ``locations`` list; blank or non-string URLs are ignored.
    """
    locations: list[object] = [work.get("primary_location"), work.get("best_oa_location")]
    listed = work.get("locations")
    if isinstance(listed, list):
        locations.extend(listed)

    collected: list[str] = []
    for location in locations:
        if not isinstance(location, dict):
            continue
        for field in ("landing_page_url", "pdf_url"):
            raw_url = location.get(field)
            cleaned = normalize_whitespace(raw_url) if isinstance(raw_url, str) else ""
            if cleaned:
                collected.append(cleaned)
    return collected
609
+
610
+
611
def normalize_openalex_socarxiv_link(work: dict) -> str | None:
    """Choose a SocArXiv link among an OpenAlex work's location URLs.

    A URL containing ``socarxiv.com`` is preferred over one containing
    ``osf.io``; ``None`` is returned when neither occurs.
    """
    urls = collect_openalex_location_urls(work)
    for marker in ("socarxiv.com", "osf.io"):
        hit = next((url for url in urls if marker in url), None)
        if hit is not None:
            return hit
    return None
621
+
622
+
623
def normalize_openalex_ssrn_link(work: dict) -> str | None:
    """Choose an SSRN link for an OpenAlex work.

    Location URLs on ``papers.ssrn.com`` are preferred, then any URL
    containing ``ssrn.com``; failing that, an SSRN DOI from the work's DOI
    fields is returned as a doi.org URL. Returns ``None`` otherwise.
    """
    urls = collect_openalex_location_urls(work)
    for marker in ("papers.ssrn.com", "ssrn.com"):
        hit = next((url for url in urls if marker in url), None)
        if hit is not None:
            return hit

    doi_candidates: list[object] = [work.get("doi")]
    ids = work.get("ids")
    if isinstance(ids, dict):
        doi_candidates.append(ids.get("doi"))

    for candidate in doi_candidates:
        if isinstance(candidate, str):
            found = SSRN_DOI_RE.search(normalize_whitespace(candidate))
            if found:
                return f"https://doi.org/{found.group(0)}"
    return None
646
+
647
+
648
def is_openalex_chemrxiv_work(work: dict) -> bool:
    """Decide whether an OpenAlex work belongs to ChemRxiv.

    True when a ChemRxiv DOI can be extracted, or when any location's source
    has the ChemRxiv repository id or the display name ``ChemRxiv``.
    """
    if normalize_openalex_chemrxiv_link(work) is not None:
        return True

    locations: list[object] = [work.get("primary_location"), work.get("best_oa_location")]
    listed = work.get("locations")
    if isinstance(listed, list):
        locations.extend(listed)

    repository_suffix = f"/{CHEMRXIV_OPENALEX_REPOSITORY_ID}"
    for location in locations:
        source = location.get("source") if isinstance(location, dict) else None
        if not isinstance(source, dict):
            continue
        source_id = normalize_whitespace(str(source.get("id") or ""))
        display_name = normalize_whitespace(str(source.get("display_name") or ""))
        if display_name == "ChemRxiv" or source_id.endswith(repository_suffix):
            return True
    return False
673
+
674
+
675
def parse_openalex_repository_results(
    payload: dict,
    cutoff: datetime,
    language: str,
    source_type: str,
    scope: str,
    link_normalizer,
) -> list[dict[str, str]]:
    """Convert repository-filtered OpenAlex results into normalized paper records.

    A work is kept only when it has a publication date on or after the
    cutoff, ``link_normalizer(work)`` yields a link, and - if a scope is
    given - at least one scope term occurs in the work's category.
    """
    results = payload.get("results")
    if not isinstance(results, list):
        return []

    scope_terms = normalize_string_list(scope) if scope else []
    collected: list[dict[str, str]] = []

    for work in results:
        if not isinstance(work, dict):
            continue

        published = normalize_published_date(work.get("publication_date"))
        if not published:
            continue
        # The day is anchored at midnight UTC for comparison with the cutoff.
        if datetime.fromisoformat(f"{published}T00:00:00+00:00") < cutoff:
            continue

        link = link_normalizer(work)
        if not link:
            continue

        # Without an OpenAlex topic the configured scope doubles as category.
        category = normalize_whitespace(str(extract_openalex_category(work) or scope or ""))
        if scope_terms:
            category_lower = category.lower()
            if not any(term.lower() in category_lower for term in scope_terms):
                continue

        raw_record = {
            "title": work.get("display_name") or work.get("title"),
            "link": link,
            "published": published,
            "summary": extract_openalex_abstract(work),
            "authors": extract_openalex_author_names(work),
            "categories": category,
        }
        collected.append(normalize_paper_record(source_type, raw_record, scope or category, language))

    return collected
728
+
729
+
730
def parse_openalex_results(
    payload: dict,
    cutoff: datetime,
    language: str,
    require_arxiv_link: bool = True,
) -> list[dict[str, str]]:
    """Convert OpenAlex search results into normalized paper records.

    Works without a string publication date or published before the cutoff
    are skipped. With ``require_arxiv_link`` set, works that cannot be tied
    to an arXiv abs URL are skipped as well; otherwise their link is empty.
    """
    messages = get_messages(language)
    results = payload.get("results")
    if not isinstance(results, list):
        return []

    collected: list[dict[str, str]] = []
    for work in results:
        if not isinstance(work, dict):
            continue

        raw_date = work.get("publication_date")
        if not isinstance(raw_date, str):
            continue

        published_at = datetime.fromisoformat(raw_date)
        if published_at.tzinfo is None:
            # Naive dates are taken as UTC so they compare with the cutoff.
            published_at = published_at.replace(tzinfo=timezone.utc)
        if published_at < cutoff:
            continue

        arxiv_link = normalize_openalex_arxiv_link(work)
        if arxiv_link is None and require_arxiv_link:
            continue

        title = work.get("display_name") or work.get("title") or messages["untitled"]
        topic = extract_openalex_category(work)
        raw_record = {
            "title": normalize_whitespace(str(title)),
            "link": arxiv_link or "",
            "published": published_at.strftime("%Y-%m-%d"),
            "summary": extract_openalex_abstract(work),
            "categories": topic,
            "authors": extract_openalex_author_names(work),
        }
        collected.append(normalize_paper_record("openalex", raw_record, topic, language))

    return collected
782
+
783
+
784
def keyword_contains_non_english(keyword: str) -> bool:
    """Report whether the keyword holds at least one character matched by CJK_RE."""
    return bool(CJK_RE.search(keyword))
787
+
788
+
789
def compile_keyword_expressions(keywords: list[str]) -> list[list[str]]:
    """Break each configured keyword into lowercase match clauses.

    Double-quoted segments become exact phrases; the remaining text is split
    on whitespace into single terms. Each keyword yields one clause list
    (phrases first), and keywords that produce no clauses are omitted.
    """
    compiled: list[list[str]] = []
    for raw_keyword in keywords:
        text = normalize_whitespace(raw_keyword)
        if not text:
            continue

        quoted = [normalize_whitespace(phrase).lower() for phrase in QUOTE_RE.findall(text)]
        unquoted_text = QUOTE_RE.sub(" ", text)
        bare = [word.lower() for word in unquoted_text.split() if word]

        clauses = [clause for clause in quoted + bare if clause]
        if clauses:
            compiled.append(clauses)
    return compiled
804
+
805
+
806
def score_paper(paper: dict[str, str], expressions: list[list[str]]) -> int:
    """Compute the best keyword-expression score for one paper.

    An expression matches only if every clause occurs in the lowercased title
    (4 points) or summary (2 points). A matching expression scores its clause
    points plus its clause count; the highest such score is returned, or 0
    when nothing matches.
    """
    title_text = paper.get("title", "").lower()
    summary_text = paper.get("summary", "").lower()

    def clause_points(clause: str) -> int:
        """Points for one clause: title hit, summary hit, or miss."""
        if clause in title_text:
            return 4
        if clause in summary_text:
            return 2
        return 0

    top = 0
    for expression in expressions:
        points = [clause_points(clause) for clause in expression]
        # A zero marks a missing clause, which disqualifies the expression.
        if all(points):
            top = max(top, sum(points) + len(expression))
    return top
827
+
828
+
829
def rank_papers(papers: list[dict[str, str]], expressions: list[list[str]]) -> list[dict[str, str]]:
    """Keep the papers that match a keyword expression, best first.

    Each kept paper is copied with a ``score`` entry. Ordering is by score,
    then publication date, source priority and summary length, all
    descending. With no expressions the result is empty.
    """
    if not expressions:
        return []

    scored = ((paper, score_paper(paper, expressions)) for paper in papers)
    matches = [{**paper, "score": score} for paper, score in scored if score > 0]

    def sort_key(paper: dict) -> tuple:
        """Ranking tuple; larger values sort earlier."""
        return (
            int(paper.get("score", 0)),
            paper.get("published", ""),
            SOURCE_PRIORITY.get(paper.get("source", ""), 0),
            len(paper.get("summary", "")),
        )

    return sorted(matches, key=sort_key, reverse=True)
851
+
852
+
853
def strip_internal_fields(paper: dict[str, str]) -> dict[str, str]:
    """Project a paper onto the public output fields.

    Helper-only keys are left out. A missing ``source_type`` is derived from
    the source key and a missing ``scope`` falls back to the categories.
    """
    source = paper.get("source", "")
    categories = paper.get("categories", "")

    public = {
        key: paper.get(key, "")
        for key in ("title", "link", "published", "summary", "categories", "authors")
    }
    public["source"] = source
    public["source_type"] = paper.get("source_type", "") or display_source_type(source)
    public["scope"] = paper.get("scope", "") or categories
    return public
867
+
868
+
869
def deduplicate_papers(papers: list[dict[str, str]]) -> list[dict[str, str]]:
    """Merge duplicate papers, preferring official arXiv records.

    Papers are grouped by canonical arXiv URL when they have one, otherwise
    by their loose title key, otherwise by their own link. Within a group the
    paper with the highest (source priority, score, publication date, summary
    length) survives. A paper with no arXiv link, no title key and no link
    cannot be identified and is dropped.

    Returns:
        The surviving papers reduced to the public fields, ordered by
        publication date, source priority and title key, all descending.
    """

    def rank_of(paper: dict) -> tuple:
        """Preference tuple used to pick the winner among duplicates."""
        return (
            SOURCE_PRIORITY.get(paper.get("source", ""), 0),
            int(paper.get("score", 0)),
            paper.get("published", ""),
            len(paper.get("summary", "")),
        )

    chosen: dict[str, dict[str, str]] = {}

    for paper in papers:
        link_key = normalize_arxiv_link(paper.get("link"))
        title_key = normalize_title_key(paper.get("title", ""))
        # A title can reduce to an empty key (for example when it has no
        # ASCII letters or digits); fall back to the paper's own link so such
        # papers are kept instead of being discarded.
        fallback_key = normalize_whitespace(str(paper.get("link") or "")).lower()
        dedupe_key = link_key or title_key or fallback_key
        if not dedupe_key:
            continue

        existing = chosen.get(dedupe_key)
        if existing is None or rank_of(paper) > rank_of(existing):
            chosen[dedupe_key] = paper

    normalized = [strip_internal_fields(paper) for paper in chosen.values()]
    normalized.sort(
        key=lambda paper: (
            paper.get("published", ""),
            SOURCE_PRIORITY.get(paper.get("source", ""), 0),
            normalize_title_key(paper.get("title", "")),
        ),
        reverse=True,
    )
    return normalized
910
+
911
+
912
def build_openalex_query(keywords: list[str]) -> str:
    """Flatten the configured keywords into one OpenAlex search string.

    Double quotes are replaced by spaces, each keyword is
    whitespace-normalized, and empty results are left out of the join.
    """
    cleaned = (normalize_whitespace(keyword.replace('"', " ")) for keyword in keywords)
    return " ".join(filter(None, cleaned))
916
+
917
+
918
def fetch_openalex_works(
    query: str,
    cutoff: datetime,
    max_results: int,
    extra_filters: list[str] | None = None,
) -> dict:
    """Query the OpenAlex works endpoint and return the decoded JSON body.

    The request always filters on ``from_publication_date`` set to the cutoff's
    calendar date; any truthy entries of ``extra_filters`` are appended to that
    filter list.  Results are requested newest first with a page size of
    ``min(max_results, 100)``.  Network and decoding errors propagate to the
    caller.
    """
    filter_clauses = [f"from_publication_date:{cutoff.date().isoformat()}"]
    if extra_filters:
        filter_clauses += [clause for clause in extra_filters if clause]

    query_fields = {
        "search": query,
        "per-page": min(max_results, 100),
        "sort": "publication_date:desc",
        "filter": ",".join(filter_clauses),
    }
    url = f"{OPENALEX_API_URL}?{urllib.parse.urlencode(query_fields)}"
    request = urllib.request.Request(url, headers=REQUEST_HEADERS)

    with urllib.request.urlopen(request, timeout=60) as response:
        raw_body = response.read()
        return json.loads(raw_body.decode("utf-8"))
943
+
944
+
945
def normalize_paper_sources(config: dict) -> list[dict[str, object]]:
    """Normalize legacy and phase-1 paper source config into runtime entries.

    Two config shapes are read, and both may contribute entries:

    * ``config["paper_sources"]`` -- a list of row dicts.  Rows that are not
      dicts, or whose ``source_type`` does not normalize to a known key, are
      skipped.  ``enabled`` defaults to ``True`` when absent or null.
    * ``config["arxiv"]`` -- the legacy arXiv section, used only when it is a
      dict with a truthy ``enabled`` flag.

    Returns:
        A list of dicts with the keys ``enabled``, ``source_type``,
        ``queries``, ``scope``, ``notes``, ``max_results``,
        ``fallback_enabled`` and ``require_arxiv_link``.

    Raises:
        ValueError, TypeError: if a non-null ``max_results`` cannot be
            converted by ``int()``.
    """

    def coerce_max_results(value: object) -> int:
        # A missing key and an explicit null both mean "use the default".
        # The legacy branch used to call int(None) and crash on a null value.
        return 200 if value is None else int(value)

    sources: list[dict[str, object]] = []

    paper_sources = config.get("paper_sources")
    if isinstance(paper_sources, list):
        for row in paper_sources:
            if not isinstance(row, dict):
                continue

            source_key = normalize_source_type(row.get("source_type"))
            if not source_key:
                continue

            enabled_value = row.get("enabled")
            sources.append(
                {
                    "enabled": bool(enabled_value) if enabled_value is not None else True,
                    "source_type": display_source_type(source_key),
                    # Accept the alternative spellings "query" and "keywords".
                    "queries": normalize_string_list(row.get("queries") or row.get("query") or row.get("keywords")),
                    "scope": normalize_whitespace(str(row.get("scope") or row.get("categories") or "")),
                    "notes": normalize_whitespace(str(row.get("notes") or "")),
                    "max_results": coerce_max_results(row.get("max_results")),
                    "fallback_enabled": bool(row.get("fallback_enabled", True)),
                    "require_arxiv_link": bool(row.get("require_arxiv_link", True)),
                }
            )

    arxiv_config = config.get("arxiv")
    if isinstance(arxiv_config, dict) and arxiv_config.get("enabled", False):
        sources.append(
            {
                "enabled": True,
                "source_type": "arXiv",
                "queries": normalize_string_list(arxiv_config.get("keywords")),
                "scope": ", ".join(normalize_string_list(arxiv_config.get("categories"))),
                "notes": "",
                "max_results": coerce_max_results(arxiv_config.get("max_results")),
                "fallback_enabled": bool(arxiv_config.get("fallback_enabled", True)),
                "require_arxiv_link": bool(arxiv_config.get("require_arxiv_link", True)),
            }
        )

    return sources
989
+
990
+
991
def parse_biorxiv_results(
    payload: dict,
    cutoff: datetime,
    language: str,
    source_type: str,
    scope: str,
) -> list[dict[str, str]]:
    """Convert a bioRxiv/medRxiv API payload into unified paper records.

    Only dict items of ``payload["collection"]`` are considered.  An item is
    dropped when it has no usable publication date, when that date (taken as
    midnight UTC) is earlier than ``cutoff``, or when ``scope`` is given and
    none of its terms occurs, case-insensitively, in the item's category.
    ``preprint_*`` fields take precedence over their unprefixed counterparts.
    """
    entries = payload.get("collection")
    if not isinstance(entries, list):
        return []

    parsed: list[dict[str, str]] = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue

        raw_date = entry.get("preprint_date") or entry.get("published_date") or entry.get("date")
        published = normalize_published_date(raw_date)
        if not published:
            continue
        # The normalized date has no time part; compare it as midnight UTC.
        if datetime.fromisoformat(f"{published}T00:00:00+00:00") < cutoff:
            continue

        raw_category = entry.get("preprint_category") or entry.get("category") or ""
        category = normalize_whitespace(str(raw_category))
        if scope:
            wanted_terms = normalize_string_list(scope)
            category_lower = category.lower()
            if wanted_terms and all(term.lower() not in category_lower for term in wanted_terms):
                continue

        raw_doi = entry.get("biorxiv_doi") or entry.get("doi")
        if isinstance(raw_doi, str) and raw_doi:
            link = f"https://doi.org/{normalize_whitespace(raw_doi)}"
        else:
            link = ""

        fields = {
            "title": entry.get("preprint_title") or entry.get("title"),
            "link": link,
            "published": published,
            "summary": entry.get("preprint_abstract") or entry.get("abstract"),
            "authors": entry.get("preprint_authors") or entry.get("authors"),
            "categories": category,
        }
        parsed.append(normalize_paper_record(source_type, fields, scope or category, language))

    return parsed
1044
+
1045
+
1046
def parse_chemrxiv_results(
    payload: object,
    cutoff: datetime,
    language: str,
    scope: str,
) -> list[dict[str, str]]:
    """Convert a ChemRxiv payload into unified paper records.

    ``payload`` may be a list of records, or a dict holding the list under
    ``results``, ``collection`` or ``items``.  A record is treated as an
    OpenAlex work when it carries any of ``publication_date``,
    ``authorships``, ``primary_location`` or ``best_oa_location``; such
    records must pass ``is_openalex_chemrxiv_work`` and fall back to the
    OpenAlex extractors for a missing link, abstract or author list.  Records
    with no usable date, dated before ``cutoff`` (midnight UTC), or not
    matching any ``scope`` term are dropped.
    """
    if isinstance(payload, dict):
        entries = payload.get("results") or payload.get("collection") or payload.get("items")
    else:
        entries = payload
    if not isinstance(entries, list):
        return []

    openalex_markers = ("publication_date", "authorships", "primary_location", "best_oa_location")
    parsed: list[dict[str, str]] = []

    for entry in entries:
        if not isinstance(entry, dict):
            continue

        from_openalex = any(marker in entry for marker in openalex_markers)
        if from_openalex and not is_openalex_chemrxiv_work(entry):
            continue

        raw_date = (
            entry.get("published")
            or entry.get("published_date")
            or entry.get("date")
            or entry.get("publication_date")
        )
        published = normalize_published_date(raw_date)
        if not published:
            continue
        if datetime.fromisoformat(f"{published}T00:00:00+00:00") < cutoff:
            continue

        # With no category on the record, the requested scope stands in for it.
        raw_category = (
            entry.get("categories")
            or entry.get("category")
            or extract_openalex_category(entry)
            or scope
            or ""
        )
        category = normalize_whitespace(str(raw_category))
        if scope:
            wanted_terms = normalize_string_list(scope)
            category_lower = category.lower()
            if wanted_terms and all(term.lower() not in category_lower for term in wanted_terms):
                continue

        link = entry.get("link") or entry.get("url") or entry.get("doi")
        if from_openalex and not link:
            link = normalize_openalex_chemrxiv_link(entry) or ""

        summary = entry.get("summary") or entry.get("abstract") or entry.get("description")
        if from_openalex and not summary:
            summary = extract_openalex_abstract(entry)

        authors = entry.get("authors") or entry.get("author_names")
        if from_openalex and not authors:
            authors = extract_openalex_author_names(entry)

        fields = {
            "title": entry.get("title") or entry.get("display_name"),
            "link": link,
            "published": published,
            "summary": summary,
            "authors": authors,
            "categories": category,
        }
        parsed.append(normalize_paper_record("ChemRxiv", fields, scope or category, language))

    return parsed
1118
+
1119
+
1120
def fetch_biorxiv_pubs(server: str, cutoff: datetime, max_pages: int = 10) -> dict:
    """Fetch bioRxiv/medRxiv preprints for a date window, following pagination.

    The window runs from the cutoff's calendar date to today's UTC date.  The
    previous implementation requested only cursor ``0`` and therefore returned
    just the first API page; this version keeps requesting further pages,
    using the number of rows already collected as the next cursor, until the
    total reported in the first page's ``messages`` entry is reached, an empty
    page comes back, or ``max_pages`` pages have been fetched.

    Args:
        server: Server path segment of the API URL (e.g. ``"biorxiv"``).
        cutoff: Start of the window; only its date part is used.
        max_pages: Upper bound on the number of page requests.

    Returns:
        The first page's JSON with ``collection`` replaced by the rows merged
        from every fetched page.  If the first page has no list under
        ``collection`` it is returned unchanged.

    Raises:
        Exception: whatever the last of three attempts at the first page
            raised.  A failure on a later page is not raised; the rows
            collected so far are returned instead, so the result is never
            smaller than a single-page fetch.
    """
    interval = f"{cutoff.date().isoformat()}/{datetime.now(timezone.utc).date().isoformat()}"

    def fetch_page(cursor: int) -> object:
        # Each page gets up to three immediate attempts, like the original call.
        request = urllib.request.Request(
            f"https://api.biorxiv.org/details/{server}/{interval}/{cursor}/json",
            headers=REQUEST_HEADERS,
        )
        attempts = 3
        for attempt in range(1, attempts + 1):
            try:
                with urllib.request.urlopen(request, timeout=60) as response:
                    return json.loads(response.read().decode("utf-8"))
            except Exception:
                if attempt == attempts:
                    raise
        return None

    def reported_total(page: dict) -> int | None:
        # NOTE(review): assumes the API reports the window size as
        # messages[0]["total"] (int or numeric string) -- confirm against the
        # bioRxiv API docs.  Without a usable total no further pages are read.
        messages = page.get("messages")
        if isinstance(messages, list) and messages and isinstance(messages[0], dict):
            try:
                return int(messages[0].get("total"))
            except (TypeError, ValueError):
                return None
        return None

    first_page = fetch_page(0)
    first_rows = first_page.get("collection") if isinstance(first_page, dict) else None
    if not isinstance(first_rows, list):
        return first_page

    merged = list(first_rows)
    total = reported_total(first_page)
    pages_fetched = 1

    while merged and total is not None and len(merged) < total and pages_fetched < max_pages:
        try:
            page = fetch_page(len(merged))
        except Exception:
            # Best effort: keep the pages already downloaded.
            break
        rows = page.get("collection") if isinstance(page, dict) else None
        if not isinstance(rows, list) or not rows:
            break
        merged.extend(rows)
        pages_fetched += 1

    return {**first_page, "collection": merged}
1139
+
1140
+
1141
def fetch_chemrxiv_results(source: dict, cutoff: datetime) -> object:
    """Search OpenAlex for works filtered to the ChemRxiv repository.

    The search string is built from ``source["queries"]`` and the result limit
    comes from ``source["max_results"]`` (200 when absent).
    """
    search_text = build_openalex_query(normalize_string_list(source.get("queries")))
    limit = int(source.get("max_results", 200))
    repository_filter = f"repository:{CHEMRXIV_OPENALEX_REPOSITORY_ID}"
    return fetch_openalex_works(search_text, cutoff, limit, extra_filters=[repository_filter])
1151
+
1152
+
1153
def fetch_socarxiv_results(source: dict, cutoff: datetime) -> object:
    """Search OpenAlex for works filtered to the SocArXiv repository.

    The search string is built from ``source["queries"]`` and the result limit
    comes from ``source["max_results"]`` (200 when absent).
    """
    search_text = build_openalex_query(normalize_string_list(source.get("queries")))
    limit = int(source.get("max_results", 200))
    repository_filter = f"repository:{SOCARXIV_OPENALEX_REPOSITORY_ID}"
    return fetch_openalex_works(search_text, cutoff, limit, extra_filters=[repository_filter])
1163
+
1164
+
1165
def fetch_ssrn_results(source: dict, cutoff: datetime) -> object:
    """Search OpenAlex for works filtered to the SSRN repository.

    The search string is built from ``source["queries"]`` and the result limit
    comes from ``source["max_results"]`` (200 when absent).
    """
    search_text = build_openalex_query(normalize_string_list(source.get("queries")))
    limit = int(source.get("max_results", 200))
    repository_filter = f"repository:{SSRN_OPENALEX_REPOSITORY_ID}"
    return fetch_openalex_works(search_text, cutoff, limit, extra_filters=[repository_filter])
1175
+
1176
+
1177
def collect_biorxiv_like_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
    source_key: str,
    server: str,
) -> dict[str, list[dict[str, str]]]:
    """Collect papers for one bioRxiv-style source and report errors as data.

    Returns a dict with ``papers`` and ``errors``.  Non-English keywords give
    a single ``config`` error; keywords that compile to no expressions give an
    empty result.  Otherwise the server is fetched, parsed, ranked,
    deduplicated and cut to ``source["max_results"]`` (200 when absent); any
    exception in that pipeline becomes a single ``adapter`` error.
    """
    keywords = [item for item in normalize_string_list(source.get("queries")) if item]

    for keyword in keywords:
        if keyword_contains_non_english(keyword):
            message = f"{display_source_type(source_key)} keywords must be English"
            return {"papers": [], "errors": [build_error(source_key, "config", message)]}

    expressions = compile_keyword_expressions(keywords)
    if not expressions:
        return {"papers": [], "errors": []}

    try:
        payload = fetch_biorxiv_pubs(server, cutoff)
        scope_text = str(source.get("scope", ""))
        candidates = parse_biorxiv_results(payload, cutoff, language, source_key, scope_text)
        ranked = rank_papers(candidates, expressions)
        limit = int(source.get("max_results", 200))
        selected = deduplicate_papers(ranked)[:limit]
    except Exception as error:
        return {"papers": [], "errors": [build_error(source_key, "adapter", str(error))]}

    return {"papers": selected, "errors": []}
1213
+
1214
+
1215
def collect_arxiv_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Collect arXiv papers from category feeds, falling back to OpenAlex.

    Each distinct category in ``source["scope"]`` is fetched in turn, pausing
    ``ARXIV_REQUEST_INTERVAL_SECONDS`` between requests; a failing category
    adds an ``arxiv-api`` error and the loop continues.  If the ranked,
    deduplicated feed results are non-empty they are returned.  Otherwise,
    when ``fallback_enabled`` is truthy, an OpenAlex search built from the
    keywords is used instead, and a failure there adds an ``openalex`` error.

    Returns a dict with ``papers`` (at most ``max_results`` entries) and
    ``errors``.
    """
    import time

    keywords = [item for item in normalize_string_list(source.get("queries")) if item]
    categories = [item for item in normalize_string_list(source.get("scope")) if item]
    limit = int(source.get("max_results", 200))
    use_fallback = bool(source.get("fallback_enabled", True))
    arxiv_links_only = bool(source.get("require_arxiv_link", True))

    for keyword in keywords:
        if keyword_contains_non_english(keyword):
            return {
                "papers": [],
                "errors": [build_error("arxiv", "config", "arXiv keywords must be English")],
            }

    expressions = compile_keyword_expressions(keywords)
    if not expressions:
        return {"papers": [], "errors": []}

    errors: list[dict[str, str]] = []
    feed_candidates: list[dict[str, str]] = []
    # Per-category page size is the overall limit clamped into [50, 100].
    page_size = min(max(limit, 50), 100)

    is_first_request = True
    for category in dict.fromkeys(categories):
        if not is_first_request:
            time.sleep(ARXIV_REQUEST_INTERVAL_SECONDS)
        is_first_request = False
        try:
            feed_xml = fetch_recent_arxiv_category(category, page_size)
            feed_candidates.extend(parse_arxiv_feed(feed_xml, cutoff, language))
        except Exception as error:
            errors.append(build_error("arxiv", "arxiv-api", str(error)))

    feed_papers = deduplicate_papers(rank_papers(feed_candidates, expressions))[:limit]
    if feed_papers:
        return {"papers": feed_papers, "errors": errors}
    if not use_fallback:
        return {"papers": [], "errors": errors}

    try:
        search_text = build_openalex_query(keywords)
        openalex_payload = fetch_openalex_works(search_text, cutoff, limit)
        fallback_candidates = parse_openalex_results(
            openalex_payload,
            cutoff,
            language,
            require_arxiv_link=arxiv_links_only,
        )
        fallback_papers = deduplicate_papers(rank_papers(fallback_candidates, expressions))[:limit]
        return {"papers": fallback_papers, "errors": errors}
    except Exception as error:
        errors.append(build_error("arxiv", "openalex", str(error)))
        return {"papers": [], "errors": errors}
1289
+
1290
+
1291
def collect_biorxiv_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Run the shared bioRxiv-style collector against the bioRxiv server."""
    return collect_biorxiv_like_source(
        source,
        cutoff,
        language,
        source_key="biorxiv",
        server="biorxiv",
    )
1298
+
1299
+
1300
def collect_medrxiv_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Run the shared bioRxiv-style collector against the medRxiv server."""
    return collect_biorxiv_like_source(
        source,
        cutoff,
        language,
        source_key="medrxiv",
        server="medrxiv",
    )
1307
+
1308
+
1309
def collect_chemrxiv_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Collect ChemRxiv papers and report errors as data.

    Returns a dict with ``papers`` and ``errors``.  Non-English keywords give
    a single ``config`` error; keywords that compile to no expressions give an
    empty result.  Otherwise results are fetched, parsed, ranked, deduplicated
    and cut to ``source["max_results"]`` (200 when absent); any exception in
    that pipeline becomes a single ``adapter`` error.
    """
    keywords = [item for item in normalize_string_list(source.get("queries")) if item]

    for keyword in keywords:
        if keyword_contains_non_english(keyword):
            return {
                "papers": [],
                "errors": [build_error("chemrxiv", "config", "ChemRxiv keywords must be English")],
            }

    expressions = compile_keyword_expressions(keywords)
    if not expressions:
        return {"papers": [], "errors": []}

    try:
        payload = fetch_chemrxiv_results(source, cutoff)
        scope_text = str(source.get("scope", ""))
        candidates = parse_chemrxiv_results(payload, cutoff, language, scope_text)
        ranked = rank_papers(candidates, expressions)
        limit = int(source.get("max_results", 200))
        selected = deduplicate_papers(ranked)[:limit]
    except Exception as error:
        return {"papers": [], "errors": [build_error("chemrxiv", "adapter", str(error))]}

    return {"papers": selected, "errors": []}
1343
+
1344
+
1345
def collect_openalex_repository_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
    source_key: str,
    fetcher,
    link_normalizer,
) -> dict[str, list[dict[str, str]]]:
    """Collect papers for a source served through a repository-filtered search.

    ``fetcher(source, cutoff)`` supplies the raw payload and
    ``link_normalizer`` is handed on to ``parse_openalex_repository_results``.
    Returns a dict with ``papers`` and ``errors``.  Non-English keywords give
    a single ``config`` error; keywords that compile to no expressions give an
    empty result; any exception while fetching, parsing, ranking or
    deduplicating becomes a single ``adapter`` error.
    """
    keywords = [item for item in normalize_string_list(source.get("queries")) if item]

    for keyword in keywords:
        if keyword_contains_non_english(keyword):
            message = f"{display_source_type(source_key)} keywords must be English"
            return {"papers": [], "errors": [build_error(source_key, "config", message)]}

    expressions = compile_keyword_expressions(keywords)
    if not expressions:
        return {"papers": [], "errors": []}

    try:
        payload = fetcher(source, cutoff)
        display_name = display_source_type(source_key)
        scope_text = str(source.get("scope", ""))
        candidates = parse_openalex_repository_results(
            payload,
            cutoff,
            language,
            display_name,
            scope_text,
            link_normalizer,
        )
        ranked = rank_papers(candidates, expressions)
        limit = int(source.get("max_results", 200))
        selected = deduplicate_papers(ranked)[:limit]
    except Exception as error:
        return {"papers": [], "errors": [build_error(source_key, "adapter", str(error))]}

    return {"papers": selected, "errors": []}
1389
+
1390
+
1391
def collect_socarxiv_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Run the repository collector with the SocArXiv fetcher and link normalizer."""
    return collect_openalex_repository_source(
        source,
        cutoff,
        language,
        source_key="socarxiv",
        fetcher=fetch_socarxiv_results,
        link_normalizer=normalize_openalex_socarxiv_link,
    )
1405
+
1406
+
1407
def collect_ssrn_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Run the repository collector with the SSRN fetcher and link normalizer."""
    return collect_openalex_repository_source(
        source,
        cutoff,
        language,
        source_key="ssrn",
        fetcher=fetch_ssrn_results,
        link_normalizer=normalize_openalex_ssrn_link,
    )
1421
+
1422
+
1423
def collect_papers(
    paper_sources: list[dict[str, object]],
    cutoff: datetime,
    language: str,
    max_results: int = 200,
) -> dict[str, list[dict[str, str]]]:
    """Run every enabled source through its collector and merge the results.

    Entries that are not dicts or whose ``enabled`` flag is falsy are skipped.
    A source type with no collector adds a ``config`` error; an exception
    raised by a collector adds an ``adapter`` error.  The combined papers are
    deduplicated across sources and cut to ``max_results``.
    """
    collectors = {
        "arxiv": collect_arxiv_source,
        "biorxiv": collect_biorxiv_source,
        "medrxiv": collect_medrxiv_source,
        "chemrxiv": collect_chemrxiv_source,
        "socarxiv": collect_socarxiv_source,
        "ssrn": collect_ssrn_source,
    }

    gathered: list[dict[str, str]] = []
    errors: list[dict[str, str]] = []

    for entry in paper_sources:
        if not isinstance(entry, dict) or not entry.get("enabled", True):
            continue

        source_key = normalize_source_type(str(entry.get("source_type") or ""))
        error_origin = source_key or "papers"
        try:
            collector = collectors.get(source_key)
            if collector is None:
                errors.append(
                    build_error(
                        error_origin,
                        "config",
                        f"Unsupported paper source type: {entry.get('source_type')}",
                    )
                )
                continue
            outcome = collector(entry, cutoff, language)
        except Exception as error:
            errors.append(build_error(error_origin, "adapter", str(error)))
            continue

        gathered.extend(outcome["papers"])
        errors.extend(outcome["errors"])

    return {"papers": deduplicate_papers(gathered)[:max_results], "errors": errors}
1473
+
1474
+
1475
def fetch_arxiv(
    keywords: list[str], categories: list[str], max_results: int, cutoff: datetime, language: str
) -> list[dict[str, str]]:
    """Return only the paper rows from ``collect_arxiv_papers``; errors are discarded."""
    outcome = collect_arxiv_papers(
        keywords=keywords,
        categories=categories,
        max_results=max_results,
        cutoff=cutoff,
        language=language,
    )
    return outcome["papers"]
1486
+
1487
+
1488
def collect_arxiv_papers(
    keywords: list[str],
    categories: list[str],
    max_results: int,
    cutoff: datetime,
    language: str,
    fallback_enabled: bool = True,
    require_arxiv_link: bool = True,
) -> dict[str, list[dict[str, str]]]:
    """Collect arXiv papers through the generic collector from legacy arguments.

    The arguments are packed into a legacy-style ``arxiv`` config section,
    normalized by ``normalize_paper_sources`` and handed to ``collect_papers``.
    """
    legacy_section = {
        "enabled": True,
        "keywords": keywords,
        "categories": categories,
        "max_results": max_results,
        "fallback_enabled": fallback_enabled,
        "require_arxiv_link": require_arxiv_link,
    }
    runtime_sources = normalize_paper_sources({"arxiv": legacy_section})
    return collect_papers(runtime_sources, cutoff, language, max_results)
1511
+
1512
+
1513
def main() -> None:
    """Read a JSON config from stdin, run the enabled fetchers, and print JSON.

    Config keys read here: ``language``, ``days`` (look-back window, default
    7), ``rss`` (``enabled`` and ``feeds``), plus whatever
    ``normalize_paper_sources`` reads.  The printed object has the keys
    ``rss_articles``, ``arxiv_papers``, ``stats`` and ``errors``.

    A null ``days`` value and a null or non-dict ``rss`` section are treated
    like missing keys instead of crashing the run.

    Raises:
        json.JSONDecodeError: if stdin does not contain valid JSON.
    """
    config = json.loads(sys.stdin.read())
    language = normalize_language(config.get("language"))

    # ``"days": null`` used to reach timedelta() as None and raise TypeError.
    days = config.get("days")
    if days is None:
        days = 7
    cutoff = datetime.now(timezone.utc) - timedelta(days=days)

    rss_articles: list[dict[str, str]] = []
    arxiv_papers: list[dict[str, str]] = []
    errors: list[dict[str, str]] = []

    # ``"rss": null`` used to raise AttributeError on ``.get``.
    rss_config = config.get("rss")
    if not isinstance(rss_config, dict):
        rss_config = {}
    if rss_config.get("enabled", False):
        # Dependencies are only checked when RSS fetching is actually requested.
        ensure_dependencies()
        feeds = rss_config.get("feeds", [])
        rss_articles = fetch_rss(feeds, cutoff, language)

    paper_sources = normalize_paper_sources(config)
    if paper_sources:
        paper_result = collect_papers(paper_sources, cutoff, language, 200)
        arxiv_papers = paper_result["papers"]
        errors.extend(paper_result["errors"])

    result = {
        "rss_articles": rss_articles,
        "arxiv_papers": arxiv_papers,
        "stats": {
            # Articles whose title starts with "[" are excluded from the count.
            "rss_count": sum(1 for article in rss_articles if not article["title"].startswith("[")),
            "arxiv_count": len(arxiv_papers),
        },
        "errors": errors,
    }
    print(json.dumps(result, ensure_ascii=False))


if __name__ == "__main__":
    main()