lifeos 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -9
- package/README.zh.md +15 -9
- package/assets/lifeos-rules.en.md +1 -1
- package/assets/lifeos-rules.zh.md +1 -1
- package/assets/lifeos.yaml +1 -0
- package/assets/skills/archive/SKILL.en.md +1 -1
- package/assets/skills/archive/SKILL.zh.md +1 -1
- package/assets/skills/ask/SKILL.en.md +1 -1
- package/assets/skills/ask/SKILL.zh.md +1 -1
- package/assets/skills/brainstorm/SKILL.en.md +1 -1
- package/assets/skills/brainstorm/SKILL.zh.md +1 -1
- package/assets/skills/digest/SKILL.en.md +212 -0
- package/assets/skills/digest/SKILL.zh.md +207 -0
- package/assets/skills/digest/references/__pycache__/rss-arxiv-script.cpython-312.pyc +0 -0
- package/assets/skills/digest/references/config-parser.en.md +179 -0
- package/assets/skills/digest/references/config-parser.zh.md +177 -0
- package/assets/skills/digest/references/rss-arxiv-script.py +1549 -0
- package/assets/skills/digest/references/run-pipeline.en.md +236 -0
- package/assets/skills/digest/references/run-pipeline.zh.md +235 -0
- package/assets/skills/digest/references/setup-guide.en.md +192 -0
- package/assets/skills/digest/references/setup-guide.zh.md +188 -0
- package/assets/skills/knowledge/SKILL.en.md +1 -1
- package/assets/skills/knowledge/SKILL.zh.md +1 -1
- package/assets/skills/project/SKILL.en.md +1 -1
- package/assets/skills/project/SKILL.zh.md +1 -1
- package/assets/skills/read-pdf/SKILL.en.md +1 -1
- package/assets/skills/read-pdf/SKILL.zh.md +1 -1
- package/assets/skills/research/SKILL.en.md +1 -1
- package/assets/skills/research/SKILL.zh.md +1 -1
- package/assets/skills/revise/SKILL.en.md +1 -1
- package/assets/skills/revise/SKILL.zh.md +1 -1
- package/assets/skills/today/SKILL.en.md +1 -1
- package/assets/skills/today/SKILL.zh.md +1 -1
- package/dist/cli/commands/doctor.js +9 -9
- package/dist/cli/commands/doctor.js.map +1 -1
- package/dist/cli/commands/upgrade.js +20 -2
- package/dist/cli/commands/upgrade.js.map +1 -1
- package/dist/cli/utils/install-assets.js +6 -2
- package/dist/cli/utils/install-assets.js.map +1 -1
- package/dist/config.d.ts +1 -0
- package/dist/config.js +2 -0
- package/dist/config.js.map +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/server.js +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,1549 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
/digest RSS + arXiv fetch helper.
|
|
4
|
+
|
|
5
|
+
Input: JSON config from stdin.
|
|
6
|
+
Output: JSON result on stdout.
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
echo '{"language":"en","rss":{"enabled":false},"arxiv":{"enabled":false},"days":7}' | python3 rss-arxiv-script.py
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import re
|
|
16
|
+
import subprocess
|
|
17
|
+
import sys
|
|
18
|
+
import urllib.parse
|
|
19
|
+
import urllib.request
|
|
20
|
+
import xml.etree.ElementTree as ET
|
|
21
|
+
from datetime import datetime, timedelta, timezone
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Base URL used when building arXiv Atom queries.
ARXIV_API_URL = "http://export.arxiv.org/api/query"
# Base URL for OpenAlex work queries.
OPENALEX_API_URL = "https://api.openalex.org/works"
# Headers attached to the urllib requests built in this module.
REQUEST_HEADERS = {"User-Agent": "LifeOS digest/1.0"}
# NOTE(review): presumably the pause between consecutive arXiv API calls;
# the code that consumes it is not in this part of the file.
ARXIV_REQUEST_INTERVAL_SECONDS = 3
# OpenAlex source identifiers; matched as the trailing path segment of a
# location's source id when attributing a work to a repository.
CHEMRXIV_OPENALEX_REPOSITORY_ID = "S4393918830"
SOCARXIV_OPENALEX_REPOSITORY_ID = "S4306401238"
SSRN_OPENALEX_REPOSITORY_ID = "S4210172589"
# Matches arxiv.org abs/pdf URLs; group 1 is the identifier (old-style
# "archive/1234567" or new-style "1234.12345") without version or ".pdf".
ARXIV_LINK_RE = re.compile(
    r"arxiv\.org/(?:abs|pdf)/((?:[a-z\-]+(?:\.[a-z\-]+)?/\d{7})|(?:\d{4}\.\d{4,5}))(?:v\d+)?(?:\.pdf)?",
    re.IGNORECASE,
)
# DOI patterns for the ChemRxiv (10.26434) and SSRN (10.2139) prefixes.
CHEMRXIV_DOI_RE = re.compile(r"10\.26434/chemrxiv[0-9A-Za-z./_-]*", re.IGNORECASE)
SSRN_DOI_RE = re.compile(r"10\.2139/ssrn[0-9A-Za-z./_-]*", re.IGNORECASE)
# Any single character in the U+3400..U+9FFF range.
CJK_RE = re.compile(r"[\u3400-\u9fff]")
# One or more whitespace characters (used to collapse runs to one space).
WHITESPACE_RE = re.compile(r"\s+")
# A double-quoted phrase; group 1 is the text between the quotes.
QUOTE_RE = re.compile(r'"([^"]+)"')
# Per-source weights. NOTE(review): presumably used to prefer one record over
# another during deduplication; the consumer is not visible here.
SOURCE_PRIORITY = {
    "arxiv": 4,
    "biorxiv": 3,
    "medrxiv": 3,
    "chemrxiv": 3,
    "socarxiv": 3,
    "ssrn": 3,
    "openalex": 1,
}

# Canonical source key -> human-facing spelling of the source name.
SOURCE_DISPLAY_NAMES = {
    "arxiv": "arXiv",
    "biorxiv": "bioRxiv",
    "medrxiv": "medRxiv",
    "chemrxiv": "ChemRxiv",
    "socarxiv": "SocArXiv",
    "ssrn": "SSRN",
    "openalex": "openalex",
}

# Source keys accepted as paper sources ("openalex" is intentionally absent
# from this set in the original data).
SUPPORTED_PAPER_SOURCE_KEYS = {"arxiv", "biorxiv", "medrxiv", "chemrxiv", "socarxiv", "ssrn"}


# Localized message bundles, keyed by the language key returned from
# normalize_language(). "arxiv_batch_failed" is a str.format template that
# expects an ``index`` field.
MESSAGES = {
    "zh": {
        "untitled": "无标题",
        "fetch_failed": "抓取失败",
        "arxiv_batch_failed": "arXiv 批次 {index} 抓取失败",
        "author_suffix": " 等",
    },
    "en": {
        "untitled": "Untitled",
        "fetch_failed": "Fetch failed",
        "arxiv_batch_failed": "arXiv batch {index} failed",
        "author_suffix": " et al.",
    },
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def normalize_language(language: str | None) -> str:
    """Map an arbitrary language value onto a supported bundle key.

    Only the exact string ``"en"`` selects English. Every other value,
    including ``None`` and differently-cased spellings, yields ``"zh"``.
    """
    if language == "en":
        return "en"
    return "zh"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def normalize_source_type(source_type: str | None) -> str:
    """Reduce a source label to its canonical lowercase key.

    Whitespace is collapsed and then removed and the text is lowercased.
    Unknown labels pass through in that compact form; falsy input yields "".
    """
    if not source_type:
        return ""

    # Identity table: the single place to register spelling variants later.
    known_keys = ("arxiv", "biorxiv", "medrxiv", "chemrxiv", "socarxiv", "ssrn", "openalex")
    aliases = {key: key for key in known_keys}

    compact = normalize_whitespace(source_type).lower().replace(" ", "")
    return aliases.get(compact, compact)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def display_source_type(source_type: str | None) -> str:
    """Return the display spelling for a paper source.

    The label is first canonicalized with normalize_source_type(). Keys with
    no entry in SOURCE_DISPLAY_NAMES are returned as the canonical key, and
    an empty key yields "".
    """
    key = normalize_source_type(source_type)
    return SOURCE_DISPLAY_NAMES.get(key, key) if key else ""
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def normalize_string_list(value: object) -> list[str]:
    """Coerce a config or record field into a flat list of clean strings.

    * ``None`` -> empty list.
    * list -> each non-None item is stringified; items are NOT comma-split.
    * str -> split on commas.
    * anything else -> its ``str()`` form as a single candidate.

    Every candidate is whitespace-normalized and empty results are dropped.
    """
    if value is None:
        return []

    if isinstance(value, list):
        raw_parts = [
            entry if isinstance(entry, str) else str(entry)
            for entry in value
            if entry is not None
        ]
    elif isinstance(value, str):
        raw_parts = value.split(",")
    else:
        raw_parts = [str(value)]

    cleaned_parts = (normalize_whitespace(part) for part in raw_parts)
    return [part for part in cleaned_parts if part]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def normalize_published_date(value: object) -> str:
    """Return ``YYYY-MM-DD`` for a date-like value, or "" when none is found.

    The value is stringified and parsed as ISO 8601 (a "Z" is rewritten to
    "+00:00"). If that fails, the first ``YYYY-MM-DD`` substring is returned.
    """
    text = "" if value is None else normalize_whitespace(str(value))
    if not text:
        return ""

    try:
        return datetime.fromisoformat(text.replace("Z", "+00:00")).strftime("%Y-%m-%d")
    except Exception:
        # Best effort: salvage a date embedded in otherwise unparseable text.
        found = re.search(r"\d{4}-\d{2}-\d{2}", text)
        if found is None:
            return ""
        return found.group(0)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def normalize_paper_title(value: object, language: str | None) -> str:
    """Return a whitespace-normalized title, or the localized "untitled" text.

    The fallback is used when ``value`` is None or normalizes to "".
    """
    if value is not None:
        cleaned = normalize_whitespace(str(value))
        if cleaned:
            return cleaned
    return get_messages(language)["untitled"]
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def normalize_paper_authors(authors: object, language: str | None) -> str:
    """Render an authors field as a single display string.

    A list is stringified item by item, stripped of empty names and passed
    to format_authors(). Any other non-None value is whitespace-normalized
    as text. ``None`` yields "".
    """
    if authors is None:
        return ""

    if isinstance(authors, list):
        cleaned = (normalize_whitespace(str(entry)) for entry in authors)
        return format_authors([name for name in cleaned if name], language)

    text = authors if isinstance(authors, str) else str(authors)
    return normalize_whitespace(text)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def normalize_paper_record(
    source_type: str,
    record: dict,
    scope: str,
    language: str | None,
) -> dict[str, str]:
    """Convert a source-specific record into the unified paper schema.

    Each output field is taken from the first truthy key in a fixed list of
    alternative record keys. For the "arxiv" and "openalex" sources the link
    is rewritten to the canonical arXiv abs URL when it can be recognized as
    an arXiv link. The summary is truncated to 300 characters.

    Returns a dict with the keys ``title``, ``link``, ``published``,
    ``summary``, ``categories``, ``authors``, ``source``, ``source_type``
    and ``scope``.
    """

    def first_truthy(*keys: str) -> object:
        # Mirrors an ``a or b or c`` chain: yields the first truthy lookup,
        # or the last looked-up value when none is truthy.
        value = None
        for key in keys:
            value = record.get(key)
            if value:
                return value
        return value

    source_key = normalize_source_type(source_type)

    raw_link = first_truthy("link", "url", "doi")
    link = "" if raw_link is None else normalize_whitespace(str(raw_link))
    if link and source_key in ("arxiv", "openalex"):
        canonical = normalize_arxiv_link(link)
        if canonical is not None:
            link = canonical

    categories = normalize_whitespace(
        str(first_truthy("categories", "category", "preprint_category", "scope") or scope or "")
    )
    published = normalize_published_date(
        first_truthy("published", "published_date", "preprint_date", "date", "publication_date")
    )
    summary_source = first_truthy("summary", "abstract", "preprint_abstract", "description") or ""
    summary = normalize_whitespace(str(summary_source))[:300]
    title = normalize_paper_title(
        first_truthy("title", "display_name", "preprint_title", "name"),
        language,
    )
    authors = normalize_paper_authors(
        first_truthy("authors", "preprint_authors", "author_names"),
        language,
    )
    record_scope = normalize_whitespace(str(record.get("scope") or scope or categories))

    return {
        "title": title,
        "link": link,
        "published": published,
        "summary": summary,
        "categories": categories,
        "authors": authors,
        "source": source_key,
        "source_type": display_source_type(source_key),
        "scope": record_scope,
    }
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def get_messages(language: str | None) -> dict[str, str]:
    """Look up the message bundle for ``language`` after normalizing it."""
    bundle_key = normalize_language(language)
    return MESSAGES[bundle_key]
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def format_authors(authors: list[str], language: str | None) -> str:
    """Join up to three author names, adding a localized suffix on overflow.

    An empty list yields "". The suffix is appended only when more than
    three names are supplied.
    """
    if not authors:
        return ""

    head = ", ".join(authors[:3])
    if len(authors) <= 3:
        return head
    return head + get_messages(language)["author_suffix"]
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def build_failure_title(label: str, error: Exception) -> str:
    """Return ``[label: error]``, the bracketed title used for failure records."""
    return "[{}: {}]".format(label, error)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def normalize_whitespace(value: str | None) -> str:
    """Replace each whitespace run with one space and strip both ends.

    Falsy input (``None`` or "") yields "".
    """
    text = value if value else ""
    collapsed = WHITESPACE_RE.sub(" ", text)
    return collapsed.strip()
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def normalize_title_key(value: str) -> str:
    """Build a loose comparison key from a title.

    The title is lowercased and reduced to its ASCII letters and digits, so
    punctuation, spacing and non-ASCII characters do not affect the key.
    """
    folded = normalize_whitespace(value).lower()
    return "".join(re.findall(r"[a-z0-9]+", folded))
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def strip_arxiv_version(identifier: str) -> str:
    """Remove a trailing ``v<digits>`` version marker from an identifier."""
    version_suffix = re.compile(r"v\d+$")
    return version_suffix.sub("", identifier)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def normalize_arxiv_link(link: str | None) -> str | None:
    """Normalize a raw arXiv URL or identifier to the canonical abs URL.

    Accepted forms are a bare identifier (``2101.00001``, ``hep-th/9901001``),
    the same with an ``arXiv:`` prefix, or any text containing an
    ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...`` URL. Version suffixes and
    a trailing ``.pdf`` are dropped.

    Returns ``https://arxiv.org/abs/<id>``, or ``None`` when the input is
    empty or does not look like an arXiv reference.
    """
    if not link:
        return None

    raw = normalize_whitespace(link)
    if not raw:
        return None

    # The identifier patterns below are case-insensitive, so the prefix check
    # is too; a space after the colon ("arXiv: 2101.00001") is also tolerated.
    if raw[:6].lower() == "arxiv:":
        raw = raw[6:].strip()

    bare_match = re.fullmatch(
        r"((?:[a-z\-]+(?:\.[a-z\-]+)?/\d{7})|(?:\d{4}\.\d{4,5}))(?:v\d+)?",
        raw,
        re.IGNORECASE,
    )
    if bare_match:
        return f"https://arxiv.org/abs/{strip_arxiv_version(bare_match.group(1))}"

    matched = ARXIV_LINK_RE.search(raw)
    if not matched:
        return None

    return f"https://arxiv.org/abs/{strip_arxiv_version(matched.group(1))}"
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def build_error(module: str, source: str, message: str) -> dict[str, str]:
    """Assemble a structured error record.

    ``module`` and ``source`` are stored unchanged; ``message`` is
    whitespace-normalized.
    """
    cleaned_message = normalize_whitespace(message)
    return dict(module=module, source=source, message=cleaned_message)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def ensure_dependencies() -> None:
    """Install feedparser and requests on demand.

    Does nothing when both packages import. Otherwise runs
    ``python -m pip install`` with the current interpreter.

    Raises:
        subprocess.CalledProcessError: if installation fails.
    """
    try:
        import feedparser  # noqa: F401
        import requests  # noqa: F401
        return
    except ImportError:
        pass

    install_command = [
        sys.executable,
        "-m",
        "pip",
        "install",
        "feedparser",
        "requests",
        "-q",
    ]
    try:
        # Needed on externally-managed (PEP 668) interpreters.
        subprocess.run([*install_command, "--break-system-packages"], check=True)
    except subprocess.CalledProcessError:
        # pip releases that predate the flag reject it as an unknown option,
        # so retry with a plain install before giving up.
        subprocess.run(install_command, check=True)
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def _parse_feed_timestamp(value: object) -> datetime | None:
    """Parse an RSS (RFC 2822) or Atom (ISO 8601) timestamp, else return None."""
    from email.utils import parsedate_to_datetime

    if not isinstance(value, str):
        return None
    text = value.strip()
    if not text:
        return None

    try:
        return parsedate_to_datetime(text)
    except Exception:
        # Not RFC 2822 (the failure type varies by Python version); try ISO.
        pass

    try:
        return datetime.fromisoformat(text.replace("Z", "+00:00"))
    except ValueError:
        return None


def fetch_rss(feeds: list[dict[str, str]], cutoff: datetime, language: str) -> list[dict[str, str]]:
    """Fetch RSS/Atom articles published at or after the cutoff.

    Args:
        feeds: Feed descriptors with a ``url`` and an optional ``name``.
            URLs that do not start with "http" get an "https://" prefix.
        cutoff: Timezone-aware lower bound for the publication time.
        language: Language used for the localized fallback strings.

    Returns:
        One dict per article with ``source``, ``title``, ``link``,
        ``published`` (YYYY-MM-DD) and ``summary`` (tags stripped, at most
        300 characters before trimming). A feed that fails contributes one
        record whose title is a bracketed failure message and whose other
        fields are empty.
    """
    import feedparser
    import requests

    messages = get_messages(language)
    articles: list[dict[str, str]] = []

    for feed in feeds:
        try:
            # Read inside the try so a descriptor without "url" is reported
            # as a failed feed instead of aborting the whole run.
            url = feed["url"]
            if not url.startswith("http"):
                url = "https://" + url

            response = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
            # Report HTTP errors instead of parsing an error page as an empty feed.
            response.raise_for_status()
            parsed = feedparser.parse(response.content)

            for entry in parsed.entries:
                published_at = None
                for attr in ("published", "updated"):
                    published_at = _parse_feed_timestamp(getattr(entry, attr, None))
                    if published_at is not None:
                        break

                if published_at is None:
                    # Undated entries are kept by treating them as current.
                    published_at = datetime.now(timezone.utc)
                elif published_at.tzinfo is None:
                    # Naive timestamps are assumed to be UTC so they can be
                    # compared with the aware cutoff.
                    published_at = published_at.replace(tzinfo=timezone.utc)

                if published_at < cutoff:
                    continue

                summary = re.sub(r"<[^>]+>", "", getattr(entry, "summary", "") or "")[:300]
                articles.append(
                    {
                        "source": feed.get("name", ""),
                        "title": entry.get("title", messages["untitled"]),
                        "link": entry.get("link", ""),
                        "published": published_at.strftime("%Y-%m-%d"),
                        "summary": summary.strip(),
                    }
                )
        except Exception as error:
            # Best effort per feed: record the failure and continue.
            articles.append(
                {
                    "source": feed.get("name", ""),
                    "title": build_failure_title(messages["fetch_failed"], error),
                    "link": "",
                    "published": "",
                    "summary": "",
                }
            )

    return articles
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def parse_arxiv_feed(xml_data: bytes, cutoff: datetime, language: str) -> list[dict[str, str]]:
    """Turn an arXiv Atom response into normalized paper records.

    Entries are skipped when id, published, title or summary is missing or
    empty of text, when the id is not a recognizable arXiv reference, or
    when the publication time is before ``cutoff``. Up to five category
    terms are joined to form both the categories and the scope.
    """
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    collected: list[dict[str, str]] = []

    for entry in ET.fromstring(xml_data).findall("atom:entry", ns):
        texts = []
        for tag in ("id", "published", "title", "summary"):
            node = entry.find(f"atom:{tag}", ns)
            texts.append(None if node is None else node.text)
        if any(text is None for text in texts):
            continue
        raw_id, raw_published, raw_title, raw_summary = texts

        abs_url = normalize_arxiv_link(raw_id)
        if abs_url is None:
            continue

        published_at = datetime.fromisoformat(raw_published.replace("Z", "+00:00"))
        if published_at < cutoff:
            continue

        terms = [node.get("term") or "" for node in entry.findall("atom:category", ns)]
        top_categories = ", ".join(terms[:5])

        author_names = []
        for author in entry.findall("atom:author", ns):
            name_node = author.find("atom:name", ns)
            if name_node is not None and name_node.text is not None:
                author_names.append(name_node.text)

        record = {
            "title": normalize_whitespace(raw_title),
            "link": abs_url,
            "published": published_at.strftime("%Y-%m-%d"),
            "summary": normalize_whitespace(raw_summary)[:300],
            "categories": top_categories,
            "authors": author_names,
        }
        collected.append(normalize_paper_record("arXiv", record, top_categories, language))

    return collected
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def fetch_recent_arxiv_category(category: str, max_results: int) -> bytes:
    """Download the newest submissions of one arXiv category.

    Queries ``cat:<category>`` sorted by submission date, newest first,
    starting at offset 0, and returns the raw response body. The request
    uses a 60-second timeout.
    """
    query = {
        "search_query": f"cat:{category}",
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    url = f"{ARXIV_API_URL}?{urllib.parse.urlencode(query)}"
    request = urllib.request.Request(url, headers=REQUEST_HEADERS)

    with urllib.request.urlopen(request, timeout=60) as response:
        payload = response.read()
    return payload
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def extract_openalex_abstract(work: dict) -> str:
    """Return up to 300 characters of abstract text for an OpenAlex work.

    ``abstract_inverted_index`` (word -> list of positions) is preferred and
    is rebuilt into running text by position. If it yields no tokens, a
    plain string ``abstract`` field is used. Otherwise the result is "".
    """
    inverted_index = work.get("abstract_inverted_index")
    if isinstance(inverted_index, dict):
        placed = [
            (position, word)
            for word, positions in inverted_index.items()
            if isinstance(word, str) and isinstance(positions, list)
            for position in positions
            if isinstance(position, int)
        ]
        if placed:
            placed.sort()
            return normalize_whitespace(" ".join(word for _, word in placed))[:300]

    abstract = work.get("abstract")
    return normalize_whitespace(abstract)[:300] if isinstance(abstract, str) else ""
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def extract_openalex_author_names(work: dict) -> list[str]:
    """List the ``author.display_name`` strings found under ``authorships``.

    Malformed authorship entries and non-string names are ignored. A missing
    or non-list ``authorships`` field yields an empty list.
    """
    authorships = work.get("authorships")
    if not isinstance(authorships, list):
        return []

    names: list[str] = []
    for authorship in authorships:
        author = authorship.get("author") if isinstance(authorship, dict) else None
        name = author.get("display_name") if isinstance(author, dict) else None
        if isinstance(name, str):
            names.append(name)
    return names
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def extract_openalex_category(work: dict) -> str:
    """Return the first non-empty topic label under ``primary_topic``.

    Levels are tried in the order subfield, field, domain. The result is ""
    when ``primary_topic`` is missing or none of the levels has a label.
    """
    topic = work.get("primary_topic")
    if isinstance(topic, dict):
        for level in ("subfield", "field", "domain"):
            node = topic.get(level)
            label = node.get("display_name") if isinstance(node, dict) else None
            if isinstance(label, str) and label:
                return label
    return ""
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def normalize_openalex_arxiv_link(work: dict) -> str | None:
    """Find a canonical arXiv abs URL for an OpenAlex work, if one exists.

    Candidates are checked in order: ``ids["arxiv"]``, ``ids["openalex"]``,
    then the landing-page and PDF URLs of the work's locations. The first
    candidate that normalizes to an arXiv link wins; otherwise ``None``.
    """
    ids = work.get("ids")
    id_values = [ids.get(key) for key in ("arxiv", "openalex")] if isinstance(ids, dict) else []

    candidates = [value for value in id_values if isinstance(value, str)]
    candidates.extend(collect_openalex_location_urls(work))

    for candidate in candidates:
        canonical = normalize_arxiv_link(candidate)
        if canonical is not None:
            return canonical
    return None
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def normalize_openalex_chemrxiv_link(work: dict) -> str | None:
    """Find a ChemRxiv DOI for an OpenAlex work and return it as a doi.org URL.

    The work's ``doi``, then ``ids["doi"]``, then its location landing-page
    and PDF URLs are searched for a ChemRxiv DOI. Returns ``None`` when no
    candidate contains one.
    """
    candidates: list[object] = [work.get("doi")]
    ids = work.get("ids")
    if isinstance(ids, dict):
        candidates.append(ids.get("doi"))
    candidates.extend(collect_openalex_location_urls(work))

    for candidate in candidates:
        if isinstance(candidate, str):
            found = CHEMRXIV_DOI_RE.search(normalize_whitespace(candidate))
            if found:
                return f"https://doi.org/{found.group(0)}"
    return None
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
def collect_openalex_location_urls(work: dict) -> list[str]:
    """Gather the non-empty landing-page and PDF URLs of a work's locations.

    Locations are visited in the order ``primary_location``,
    ``best_oa_location``, then each item of ``locations``. Within a location
    the landing page comes before the PDF. Duplicates are not removed.
    """
    locations: list[object] = [work.get("primary_location"), work.get("best_oa_location")]
    extra = work.get("locations")
    if isinstance(extra, list):
        locations += extra

    urls: list[str] = []
    for location in locations:
        if not isinstance(location, dict):
            continue
        for key in ("landing_page_url", "pdf_url"):
            raw = location.get(key)
            cleaned = normalize_whitespace(raw) if isinstance(raw, str) else ""
            if cleaned:
                urls.append(cleaned)
    return urls
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def normalize_openalex_socarxiv_link(work: dict) -> str | None:
    """Pick the best SocArXiv URL among a work's location URLs.

    Any URL containing "socarxiv.com" is preferred over any URL containing
    "osf.io". Returns ``None`` when neither appears.
    """
    urls = collect_openalex_location_urls(work)
    for host in ("socarxiv.com", "osf.io"):
        for candidate in urls:
            if host in candidate:
                return candidate
    return None
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def normalize_openalex_ssrn_link(work: dict) -> str | None:
    """Pick the best SSRN link for an OpenAlex work.

    Preference order: a location URL containing "papers.ssrn.com", then one
    containing "ssrn.com", then an SSRN DOI found in ``doi`` or
    ``ids["doi"]`` (returned as a doi.org URL). Returns ``None`` otherwise.
    """
    urls = collect_openalex_location_urls(work)
    for host in ("papers.ssrn.com", "ssrn.com"):
        for url in urls:
            if host in url:
                return url

    doi_candidates: list[object] = [work.get("doi")]
    ids = work.get("ids")
    if isinstance(ids, dict):
        doi_candidates.append(ids.get("doi"))

    for candidate in doi_candidates:
        if isinstance(candidate, str):
            found = SSRN_DOI_RE.search(normalize_whitespace(candidate))
            if found:
                return f"https://doi.org/{found.group(0)}"
    return None
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def is_openalex_chemrxiv_work(work: dict) -> bool:
    """Tell whether an OpenAlex work can be attributed to ChemRxiv.

    True when a ChemRxiv DOI can be extracted from the work, or when any of
    its locations has a source whose id ends with the ChemRxiv repository id
    or whose display name is exactly "ChemRxiv".
    """
    if normalize_openalex_chemrxiv_link(work) is not None:
        return True

    locations: list[object] = [work.get("primary_location"), work.get("best_oa_location")]
    extra = work.get("locations")
    if isinstance(extra, list):
        locations += extra

    repository_suffix = f"/{CHEMRXIV_OPENALEX_REPOSITORY_ID}"
    for location in locations:
        source = location.get("source") if isinstance(location, dict) else None
        if not isinstance(source, dict):
            continue
        source_id = normalize_whitespace(str(source.get("id") or ""))
        display_name = normalize_whitespace(str(source.get("display_name") or ""))
        if display_name == "ChemRxiv" or source_id.endswith(repository_suffix):
            return True
    return False
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def parse_openalex_repository_results(
    payload: dict,
    cutoff: datetime,
    language: str,
    source_type: str,
    scope: str,
    link_normalizer,
) -> list[dict[str, str]]:
    """Convert repository-filtered OpenAlex results into paper records.

    A work is dropped when it has no usable publication date, is older than
    ``cutoff``, ``link_normalizer(work)`` returns nothing, or, when ``scope``
    is given, none of the scope terms occurs (case-insensitively) in the
    work's category. The category falls back to ``scope`` itself when the
    work carries no topic label.
    """
    results = payload.get("results")
    if not isinstance(results, list):
        return []

    # The scope terms do not depend on the work, so derive them once.
    wanted_terms = [term.lower() for term in normalize_string_list(scope)] if scope else []
    collected: list[dict[str, str]] = []

    for work in results:
        if not isinstance(work, dict):
            continue

        published = normalize_published_date(work.get("publication_date"))
        if not published:
            continue
        # Dates carry no time of day; compare at midnight UTC.
        if datetime.fromisoformat(f"{published}T00:00:00+00:00") < cutoff:
            continue

        link = link_normalizer(work)
        if not link:
            continue

        category = normalize_whitespace(str(extract_openalex_category(work) or scope or ""))
        if wanted_terms:
            haystack = category.lower()
            if not any(term in haystack for term in wanted_terms):
                continue

        record = {
            "title": work.get("display_name") or work.get("title"),
            "link": link,
            "published": published,
            "summary": extract_openalex_abstract(work),
            "authors": extract_openalex_author_names(work),
            "categories": category,
        }
        collected.append(normalize_paper_record(source_type, record, scope or category, language))

    return collected
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
def parse_openalex_results(
    payload: dict,
    cutoff: datetime,
    language: str,
    require_arxiv_link: bool = True,
) -> list[dict[str, str]]:
    """Parse OpenAlex results into normalized paper records.

    Args:
        payload: Decoded OpenAlex response; works are read from ``results``.
        cutoff: Timezone-aware lower bound for the publication date.
        language: Language used for the localized "untitled" fallback.
        require_arxiv_link: When true, works without a recognizable arXiv
            link are dropped; when false they are kept with an empty link.

    Returns:
        One record per retained work, tagged with the "openalex" source.
        Works with a missing, non-string or unparseable ``publication_date``
        are skipped.
    """
    messages = get_messages(language)
    papers: list[dict[str, str]] = []
    results = payload.get("results")
    if not isinstance(results, list):
        return papers

    for work in results:
        if not isinstance(work, dict):
            continue

        published_text = work.get("publication_date")
        if not isinstance(published_text, str):
            continue

        try:
            published_at = datetime.fromisoformat(published_text)
        except ValueError:
            # One malformed date must not abort the whole batch; skip the
            # work, as the repository-filtered parser does.
            continue
        if published_at.tzinfo is None:
            # Date-only values are naive; treat them as UTC so they can be
            # compared with the aware cutoff.
            published_at = published_at.replace(tzinfo=timezone.utc)
        if published_at < cutoff:
            continue

        normalized_link = normalize_openalex_arxiv_link(work)
        if require_arxiv_link and normalized_link is None:
            continue

        title = work.get("display_name") or work.get("title") or messages["untitled"]
        author_names = extract_openalex_author_names(work)
        category = extract_openalex_category(work)

        papers.append(
            normalize_paper_record(
                "openalex",
                {
                    "title": normalize_whitespace(str(title)),
                    "link": normalized_link or "",
                    "published": published_at.strftime("%Y-%m-%d"),
                    "summary": extract_openalex_abstract(work),
                    "categories": category,
                    "authors": author_names,
                },
                category,
                language,
            )
        )

    return papers
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
def keyword_contains_non_english(keyword: str) -> bool:
    """Return True when ``CJK_RE`` finds a match anywhere in ``keyword``."""
    found = CJK_RE.search(keyword)
    return found is not None
|
|
787
|
+
|
|
788
|
+
|
|
789
|
+
def compile_keyword_expressions(keywords: list[str]) -> list[list[str]]:
    """Turn configured keywords into lists of lowercase match clauses.

    Each keyword becomes one expression. Spans matched by ``QUOTE_RE`` are
    kept as whole phrases; what remains is split on whitespace into single
    terms. Keywords that produce no clause are dropped.
    """
    compiled: list[list[str]] = []
    for raw_keyword in keywords:
        text = normalize_whitespace(raw_keyword)
        if not text:
            continue

        quoted = [normalize_whitespace(match).lower() for match in QUOTE_RE.findall(text)]
        # Blank out the quoted spans so their words are not counted twice.
        unquoted = QUOTE_RE.sub(" ", text)
        loose = [token.lower() for token in unquoted.split()]

        clauses = [clause for clause in quoted + loose if clause]
        if clauses:
            compiled.append(clauses)
    return compiled
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
def score_paper(paper: dict[str, str], expressions: list[list[str]]) -> int:
    """Return the best keyword-match score of a paper, or 0 when nothing matches.

    An expression matches only if every one of its clauses is a substring of
    the lowercased title or summary. A title hit is worth 4 points and a
    summary-only hit 2; a matching expression also earns one bonus point per
    clause. The highest score over all expressions is returned.
    """
    title_text = paper.get("title", "").lower()
    summary_text = paper.get("summary", "").lower()

    def clause_points(clause: str) -> int:
        # The title takes precedence, so a clause found in both counts once as 4.
        if clause in title_text:
            return 4
        if clause in summary_text:
            return 2
        return 0

    top_score = 0
    for clauses in expressions:
        points = [clause_points(clause) for clause in clauses]
        # A zero means that clause was found nowhere, which voids the expression.
        if all(points):
            top_score = max(top_score, sum(points) + len(clauses))
    return top_score
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def rank_papers(papers: list[dict[str, str]], expressions: list[list[str]]) -> list[dict[str, str]]:
    """Keep only papers that match a keyword expression, best match first.

    Each surviving paper is returned as a copy carrying an extra ``score``
    key. Ordering is descending by score, then published date, then
    ``SOURCE_PRIORITY`` of the source, then summary length. With no
    expressions nothing can match, so the result is empty.
    """
    if not expressions:
        return []

    scored = ((paper, score_paper(paper, expressions)) for paper in papers)
    matches: list[dict[str, str]] = [
        {**paper, "score": score} for paper, score in scored if score > 0
    ]

    def ordering(entry: dict[str, str]) -> tuple:
        return (
            int(entry.get("score", 0)),
            entry.get("published", ""),
            SOURCE_PRIORITY.get(entry.get("source", ""), 0),
            len(entry.get("summary", "")),
        )

    matches.sort(key=ordering, reverse=True)
    return matches
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
def strip_internal_fields(paper: dict[str, str]) -> dict[str, str]:
    """Project a paper onto the fields emitted in the JSON output.

    Keys outside the public set (for example the ranking ``score``) are
    dropped. A missing or empty ``source_type`` falls back to
    ``display_source_type(source)``, and a missing or empty ``scope`` falls
    back to the paper's ``categories``.
    """
    origin = paper.get("source", "")
    copied_keys = ("title", "link", "published", "summary", "categories", "authors")

    public: dict[str, str] = {key: paper.get(key, "") for key in copied_keys}
    public["source"] = origin
    public["source_type"] = paper.get("source_type", "") or display_source_type(origin)
    public["scope"] = paper.get("scope", "") or paper.get("categories", "")
    return public
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
def deduplicate_papers(papers: list[dict[str, str]]) -> list[dict[str, str]]:
    """Collapse duplicate papers and return the survivors, newest first.

    Papers are keyed by their normalized arXiv link, or by normalized title
    when no such link is available; papers with neither key are dropped.
    Among duplicates the one with the greater (source priority, score,
    published date, summary length) tuple wins. Survivors are passed through
    ``strip_internal_fields`` and sorted descending by published date, source
    priority and normalized title.
    """

    def preference(candidate: dict[str, str]) -> tuple:
        return (
            SOURCE_PRIORITY.get(candidate.get("source", ""), 0),
            int(candidate.get("score", 0)),
            candidate.get("published", ""),
            len(candidate.get("summary", "")),
        )

    winners: dict[str, dict[str, str]] = {}
    for candidate in papers:
        by_link = normalize_arxiv_link(candidate.get("link"))
        by_title = normalize_title_key(candidate.get("title", ""))
        identity = by_link if by_link else by_title
        if not identity:
            continue

        candidate_rank = preference(candidate)
        incumbent = winners.get(identity)
        # Strict comparison: on a tie the paper seen first keeps its slot.
        if incumbent is None or candidate_rank > preference(incumbent):
            winners[identity] = candidate

    cleaned = [strip_internal_fields(entry) for entry in winners.values()]
    return sorted(
        cleaned,
        key=lambda entry: (
            entry.get("published", ""),
            SOURCE_PRIORITY.get(entry.get("source", ""), 0),
            normalize_title_key(entry.get("title", "")),
        ),
        reverse=True,
    )
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
def build_openalex_query(keywords: list[str]) -> str:
    """Flatten the configured keywords into one OpenAlex search string.

    Double quotes are replaced by spaces, each keyword is whitespace-normalized,
    and the non-empty results are joined with single spaces.
    """
    # NOTE(review): every keyword ends up in one space-joined query; confirm
    # how the OpenAlex `search` parameter combines terms if keywords are meant
    # to be alternatives.
    cleaned = (normalize_whitespace(keyword.replace('"', " ")) for keyword in keywords)
    return " ".join(filter(None, cleaned))
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
def fetch_openalex_works(
    query: str,
    cutoff: datetime,
    max_results: int,
    extra_filters: list[str] | None = None,
) -> dict:
    """Run an OpenAlex work search constrained by the digest date window.

    Args:
        query: Free-text value for the ``search`` parameter.
        cutoff: Only works published on or after this date are requested.
        max_results: Desired number of results; the page size sent to the
            API is clamped to the range 1..100.
        extra_filters: Additional OpenAlex filter clauses; empty entries are
            ignored.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    filters = [f"from_publication_date:{cutoff.date().isoformat()}"]
    if extra_filters:
        filters.extend(filter(None, extra_filters))

    # A zero or negative max_results would otherwise be sent as an invalid
    # page size; OpenAlex documents `per-page` as a positive value.
    page_size = max(1, min(max_results, 100))

    params = urllib.parse.urlencode(
        {
            "search": query,
            "per-page": page_size,
            "sort": "publication_date:desc",
            "filter": ",".join(filters),
        }
    )
    request = urllib.request.Request(
        f"{OPENALEX_API_URL}?{params}",
        headers=REQUEST_HEADERS,
    )
    with urllib.request.urlopen(request, timeout=60) as response:
        return json.loads(response.read().decode("utf-8"))
|
|
943
|
+
|
|
944
|
+
|
|
945
|
+
def _coerce_max_results(value: object, default: int = 200) -> int:
    """Return ``value`` as an int, or ``default`` when it is missing or not numeric."""
    if value is None:
        return default
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _scope_text(value: object) -> str:
    """Render a scope given as a string or a list of strings as a single string."""
    if isinstance(value, list):
        # Join list values the same way the legacy ``arxiv`` branch does,
        # rather than stringifying the Python list itself.
        return ", ".join(normalize_string_list(value))
    return normalize_whitespace(str(value or ""))


def normalize_paper_sources(config: dict) -> list[dict[str, object]]:
    """Normalize legacy and phase-1 paper source config into runtime entries.

    Two config shapes are read and merged into one list:

    * ``paper_sources``: a list of rows, each with a ``source_type``. Rows
      that are not dicts, or whose source type normalizes to an empty value,
      are skipped. A row is enabled unless it says otherwise.
    * ``arxiv``: the legacy single-source block, included only when its
      ``enabled`` flag is truthy.

    Args:
        config: Decoded digest configuration.

    Returns:
        Runtime entries with the keys ``enabled``, ``source_type``,
        ``queries``, ``scope``, ``notes``, ``max_results``,
        ``fallback_enabled`` and ``require_arxiv_link``.
    """
    sources: list[dict[str, object]] = []

    paper_sources = config.get("paper_sources")
    if isinstance(paper_sources, list):
        for row in paper_sources:
            if not isinstance(row, dict):
                continue

            source_key = normalize_source_type(row.get("source_type"))
            if not source_key:
                continue

            enabled_value = row.get("enabled")
            sources.append(
                {
                    "enabled": bool(enabled_value) if enabled_value is not None else True,
                    "source_type": display_source_type(source_key),
                    "queries": normalize_string_list(row.get("queries") or row.get("query") or row.get("keywords")),
                    "scope": _scope_text(row.get("scope") or row.get("categories")),
                    "notes": normalize_whitespace(str(row.get("notes") or "")),
                    "max_results": _coerce_max_results(row.get("max_results")),
                    "fallback_enabled": bool(row.get("fallback_enabled", True)),
                    "require_arxiv_link": bool(row.get("require_arxiv_link", True)),
                }
            )

    arxiv_config = config.get("arxiv")
    if isinstance(arxiv_config, dict) and arxiv_config.get("enabled", False):
        sources.append(
            {
                "enabled": True,
                "source_type": "arXiv",
                "queries": normalize_string_list(arxiv_config.get("keywords")),
                "scope": ", ".join(normalize_string_list(arxiv_config.get("categories"))),
                "notes": "",
                # An explicit null or a non-numeric value falls back to the
                # default here too, matching the ``paper_sources`` rows.
                "max_results": _coerce_max_results(arxiv_config.get("max_results")),
                "fallback_enabled": bool(arxiv_config.get("fallback_enabled", True)),
                "require_arxiv_link": bool(arxiv_config.get("require_arxiv_link", True)),
            }
        )

    return sources
|
|
989
|
+
|
|
990
|
+
|
|
991
|
+
def parse_biorxiv_results(
    payload: dict,
    cutoff: datetime,
    language: str,
    source_type: str,
    scope: str,
) -> list[dict[str, str]]:
    """Parse bioRxiv/medRxiv API payloads into unified paper records.

    Args:
        payload: Decoded API response; items are read from its ``collection`` list.
        cutoff: Timezone-aware lower bound; older items are dropped.
        language: Language code forwarded to ``normalize_paper_record``.
        source_type: Source identifier forwarded to ``normalize_paper_record``.
        scope: Optional scope terms. When it yields any term, an item is kept
            only if at least one term occurs (case-insensitively) in its category.

    Returns:
        One normalized record per accepted item.
    """
    papers: list[dict[str, str]] = []
    collection = payload.get("collection")
    if not isinstance(collection, list):
        return papers

    # The scope does not change between items, so split and lowercase it once.
    scope_terms = [term.lower() for term in normalize_string_list(scope)] if scope else []

    for item in collection:
        if not isinstance(item, dict):
            continue

        published = normalize_published_date(item.get("preprint_date") or item.get("published_date") or item.get("date"))
        if not published:
            continue

        # Skip an item whose date still cannot be parsed rather than failing
        # the whole batch.
        try:
            published_at = datetime.fromisoformat(f"{published}T00:00:00+00:00")
        except ValueError:
            continue
        if published_at < cutoff:
            continue

        category = normalize_whitespace(str(item.get("preprint_category") or item.get("category") or ""))
        if scope_terms:
            category_lower = category.lower()
            if not any(term in category_lower for term in scope_terms):
                continue

        doi = item.get("biorxiv_doi") or item.get("doi")
        link = ""
        if isinstance(doi, str) and doi:
            link = f"https://doi.org/{normalize_whitespace(doi)}"

        papers.append(
            normalize_paper_record(
                source_type,
                {
                    "title": item.get("preprint_title") or item.get("title"),
                    "link": link,
                    "published": published,
                    "summary": item.get("preprint_abstract") or item.get("abstract"),
                    "authors": item.get("preprint_authors") or item.get("authors"),
                    "categories": category,
                },
                scope or category,
                language,
            )
        )

    return papers
|
|
1044
|
+
|
|
1045
|
+
|
|
1046
|
+
def parse_chemrxiv_results(
    payload: object,
    cutoff: datetime,
    language: str,
    scope: str,
) -> list[dict[str, str]]:
    """Parse ChemRxiv adapter payloads into unified paper records.

    Args:
        payload: Either a list of records, or a dict holding the records
            under ``results``, ``collection`` or ``items``.
        cutoff: Timezone-aware lower bound; older records are dropped.
        language: Language code forwarded to ``normalize_paper_record``.
        scope: Optional scope terms. When it yields any term, a record is kept
            only if at least one term occurs (case-insensitively) in its category.

    Returns:
        One normalized record per accepted item.
    """
    papers: list[dict[str, str]] = []
    if isinstance(payload, dict):
        records = payload.get("results") or payload.get("collection") or payload.get("items")
    else:
        records = payload

    if not isinstance(records, list):
        return papers

    # The scope does not change between records, so split and lowercase it once.
    scope_terms = [term.lower() for term in normalize_string_list(scope)] if scope else []
    openalex_markers = ("publication_date", "authorships", "primary_location", "best_oa_location")

    for item in records:
        if not isinstance(item, dict):
            continue

        # Records carrying any OpenAlex-specific key are treated as OpenAlex
        # works and must pass the ChemRxiv check.
        is_openalex_record = any(key in item for key in openalex_markers)
        if is_openalex_record and not is_openalex_chemrxiv_work(item):
            continue

        published = normalize_published_date(
            item.get("published") or item.get("published_date") or item.get("date") or item.get("publication_date")
        )
        if not published:
            continue

        # Skip a record whose date still cannot be parsed rather than failing
        # the whole batch.
        try:
            published_at = datetime.fromisoformat(f"{published}T00:00:00+00:00")
        except ValueError:
            continue
        if published_at < cutoff:
            continue

        category_value = item.get("categories") or item.get("category") or extract_openalex_category(item) or scope or ""
        category = normalize_whitespace(str(category_value))
        if scope_terms:
            category_lower = category.lower()
            if not any(term in category_lower for term in scope_terms):
                continue

        link = item.get("link") or item.get("url") or item.get("doi")
        if not link and is_openalex_record:
            link = normalize_openalex_chemrxiv_link(item) or ""

        summary = item.get("summary") or item.get("abstract") or item.get("description")
        if not summary and is_openalex_record:
            summary = extract_openalex_abstract(item)

        authors = item.get("authors") or item.get("author_names")
        if not authors and is_openalex_record:
            authors = extract_openalex_author_names(item)

        papers.append(
            normalize_paper_record(
                "ChemRxiv",
                {
                    "title": item.get("title") or item.get("display_name"),
                    "link": link,
                    "published": published,
                    "summary": summary,
                    "authors": authors,
                    "categories": category,
                },
                scope or category,
                language,
            )
        )

    return papers
|
|
1118
|
+
|
|
1119
|
+
|
|
1120
|
+
def fetch_biorxiv_pubs(server: str, cutoff: datetime) -> dict:
    """Download preprint details for ``server`` from the cutoff date to today (UTC).

    The request is attempted up to three times. If every attempt fails, the
    error from the last attempt is re-raised.
    """
    # NOTE(review): only cursor 0 is requested; confirm whether the API
    # paginates and later pages are needed for busy date windows.
    start_day = cutoff.date().isoformat()
    end_day = datetime.now(timezone.utc).date().isoformat()
    url = f"https://api.biorxiv.org/details/{server}/{start_day}/{end_day}/0/json"
    request = urllib.request.Request(url, headers=REQUEST_HEADERS)

    failure: Exception | None = None
    for _ in range(3):
        try:
            with urllib.request.urlopen(request, timeout=60) as response:
                body = response.read()
            return json.loads(body.decode("utf-8"))
        except Exception as error:
            failure = error

    if failure is not None:
        raise failure

    return {"collection": []}
|
|
1139
|
+
|
|
1140
|
+
|
|
1141
|
+
def fetch_chemrxiv_results(source: dict, cutoff: datetime) -> object:
    """Search OpenAlex for the source's queries, restricted to the ChemRxiv repository."""
    keywords = normalize_string_list(source.get("queries"))
    search_text = build_openalex_query(keywords)
    limit = int(source.get("max_results", 200))
    repository_filter = f"repository:{CHEMRXIV_OPENALEX_REPOSITORY_ID}"
    return fetch_openalex_works(search_text, cutoff, limit, extra_filters=[repository_filter])
|
|
1151
|
+
|
|
1152
|
+
|
|
1153
|
+
def fetch_socarxiv_results(source: dict, cutoff: datetime) -> object:
    """Search OpenAlex for the source's queries, restricted to the SocArXiv repository."""
    keywords = normalize_string_list(source.get("queries"))
    search_text = build_openalex_query(keywords)
    limit = int(source.get("max_results", 200))
    repository_filter = f"repository:{SOCARXIV_OPENALEX_REPOSITORY_ID}"
    return fetch_openalex_works(search_text, cutoff, limit, extra_filters=[repository_filter])
|
|
1163
|
+
|
|
1164
|
+
|
|
1165
|
+
def fetch_ssrn_results(source: dict, cutoff: datetime) -> object:
    """Search OpenAlex for the source's queries, restricted to the SSRN repository."""
    keywords = normalize_string_list(source.get("queries"))
    search_text = build_openalex_query(keywords)
    limit = int(source.get("max_results", 200))
    repository_filter = f"repository:{SSRN_OPENALEX_REPOSITORY_ID}"
    return fetch_openalex_works(search_text, cutoff, limit, extra_filters=[repository_filter])
|
|
1175
|
+
|
|
1176
|
+
|
|
1177
|
+
def collect_biorxiv_like_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
    source_key: str,
    server: str,
) -> dict[str, list[dict[str, str]]]:
    """Fetch, filter and rank preprints from a bioRxiv-style API server.

    Returns a dict with ``papers`` and ``errors``. A query rejected by
    ``keyword_contains_non_english`` yields a ``config`` error; any exception
    raised while fetching or parsing is reported as an ``adapter`` error
    instead of propagating. With no usable keyword both lists are empty.
    """
    queries = list(filter(None, normalize_string_list(source.get("queries"))))

    for query in queries:
        if keyword_contains_non_english(query):
            message = f"{display_source_type(source_key)} keywords must be English"
            return {"papers": [], "errors": [build_error(source_key, "config", message)]}

    expressions = compile_keyword_expressions(queries)
    if not expressions:
        return {"papers": [], "errors": []}

    try:
        payload = fetch_biorxiv_pubs(server, cutoff)
        scope = str(source.get("scope", ""))
        candidates = parse_biorxiv_results(payload, cutoff, language, source_key, scope)
        ranked = rank_papers(candidates, expressions)
        limit = int(source.get("max_results", 200))
        papers = deduplicate_papers(ranked)[:limit]
    except Exception as error:
        return {"papers": [], "errors": [build_error(source_key, "adapter", str(error))]}

    return {"papers": papers, "errors": []}
|
|
1213
|
+
|
|
1214
|
+
|
|
1215
|
+
def collect_arxiv_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Collect arXiv papers from category feeds, falling back to OpenAlex.

    Each distinct category in the source's scope is fetched in turn, pausing
    ``ARXIV_REQUEST_INTERVAL_SECONDS`` between requests. If the ranked feed
    results are non-empty they are returned. Otherwise, when the fallback is
    enabled, an OpenAlex search built from the queries is tried. Fetch
    failures are collected in ``errors`` rather than raised.
    """
    import time

    queries = list(filter(None, normalize_string_list(source.get("queries"))))
    categories = list(filter(None, normalize_string_list(source.get("scope"))))
    max_results = int(source.get("max_results", 200))
    fallback_enabled = bool(source.get("fallback_enabled", True))
    require_arxiv_link = bool(source.get("require_arxiv_link", True))

    for keyword in queries:
        if keyword_contains_non_english(keyword):
            return {
                "papers": [],
                "errors": [build_error("arxiv", "config", "arXiv keywords must be English")],
            }

    expressions = compile_keyword_expressions(queries)
    if not expressions:
        return {"papers": [], "errors": []}

    errors: list[dict[str, str]] = []
    feed_candidates: list[dict[str, str]] = []
    # Per-category page size is kept within 50..100 whatever the overall limit is.
    page_size = min(max(max_results, 50), 100)

    if categories:
        unique_categories = list(dict.fromkeys(categories))
        for position, category in enumerate(unique_categories):
            if position > 0:
                time.sleep(ARXIV_REQUEST_INTERVAL_SECONDS)
            try:
                xml_data = fetch_recent_arxiv_category(category, page_size)
                feed_candidates.extend(parse_arxiv_feed(xml_data, cutoff, language))
            except Exception as error:
                # One failing category must not stop the remaining ones.
                errors.append(build_error("arxiv", "arxiv-api", str(error)))

    feed_papers = deduplicate_papers(rank_papers(feed_candidates, expressions))[:max_results]
    if feed_papers:
        return {"papers": feed_papers, "errors": errors}

    if not fallback_enabled:
        return {"papers": [], "errors": errors}

    try:
        openalex_payload = fetch_openalex_works(build_openalex_query(queries), cutoff, max_results)
        fallback_candidates = parse_openalex_results(
            openalex_payload,
            cutoff,
            language,
            require_arxiv_link=require_arxiv_link,
        )
        fallback_papers = deduplicate_papers(rank_papers(fallback_candidates, expressions))[:max_results]
    except Exception as error:
        errors.append(build_error("arxiv", "openalex", str(error)))
        return {"papers": [], "errors": errors}

    return {"papers": fallback_papers, "errors": errors}
|
|
1289
|
+
|
|
1290
|
+
|
|
1291
|
+
def collect_biorxiv_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Collect bioRxiv papers via the shared bioRxiv-style collector."""
    return collect_biorxiv_like_source(
        source,
        cutoff,
        language,
        source_key="biorxiv",
        server="biorxiv",
    )
|
|
1298
|
+
|
|
1299
|
+
|
|
1300
|
+
def collect_medrxiv_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Collect medRxiv papers via the shared bioRxiv-style collector."""
    return collect_biorxiv_like_source(
        source,
        cutoff,
        language,
        source_key="medrxiv",
        server="medrxiv",
    )
|
|
1307
|
+
|
|
1308
|
+
|
|
1309
|
+
def collect_chemrxiv_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Fetch, filter and rank ChemRxiv papers for one configured source.

    Returns a dict with ``papers`` and ``errors``. A query rejected by
    ``keyword_contains_non_english`` yields a ``config`` error; any exception
    raised while fetching or parsing is reported as an ``adapter`` error.
    With no usable keyword both lists are empty.
    """
    queries = list(filter(None, normalize_string_list(source.get("queries"))))

    for query in queries:
        if keyword_contains_non_english(query):
            return {
                "papers": [],
                "errors": [build_error("chemrxiv", "config", "ChemRxiv keywords must be English")],
            }

    expressions = compile_keyword_expressions(queries)
    if not expressions:
        return {"papers": [], "errors": []}

    try:
        payload = fetch_chemrxiv_results(source, cutoff)
        scope = str(source.get("scope", ""))
        candidates = parse_chemrxiv_results(payload, cutoff, language, scope)
        ranked = rank_papers(candidates, expressions)
        limit = int(source.get("max_results", 200))
        papers = deduplicate_papers(ranked)[:limit]
    except Exception as error:
        return {"papers": [], "errors": [build_error("chemrxiv", "adapter", str(error))]}

    return {"papers": papers, "errors": []}
|
|
1343
|
+
|
|
1344
|
+
|
|
1345
|
+
def collect_openalex_repository_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
    source_key: str,
    fetcher,
    link_normalizer,
) -> dict[str, list[dict[str, str]]]:
    """Fetch, filter and rank papers from an OpenAlex-backed repository.

    ``fetcher`` is called as ``fetcher(source, cutoff)`` to obtain the raw
    payload, and ``link_normalizer`` is handed on to
    ``parse_openalex_repository_results``. Returns a dict with ``papers`` and
    ``errors``; exceptions raised while fetching or parsing are reported as
    ``adapter`` errors.
    """
    queries = list(filter(None, normalize_string_list(source.get("queries"))))

    for query in queries:
        if keyword_contains_non_english(query):
            message = f"{display_source_type(source_key)} keywords must be English"
            return {"papers": [], "errors": [build_error(source_key, "config", message)]}

    expressions = compile_keyword_expressions(queries)
    if not expressions:
        return {"papers": [], "errors": []}

    try:
        payload = fetcher(source, cutoff)
        candidates = parse_openalex_repository_results(
            payload,
            cutoff,
            language,
            display_source_type(source_key),
            str(source.get("scope", "")),
            link_normalizer,
        )
        ranked = rank_papers(candidates, expressions)
        limit = int(source.get("max_results", 200))
        papers = deduplicate_papers(ranked)[:limit]
    except Exception as error:
        return {"papers": [], "errors": [build_error(source_key, "adapter", str(error))]}

    return {"papers": papers, "errors": []}
|
|
1389
|
+
|
|
1390
|
+
|
|
1391
|
+
def collect_socarxiv_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Collect SocArXiv papers via the OpenAlex repository collector."""
    return collect_openalex_repository_source(
        source,
        cutoff,
        language,
        source_key="socarxiv",
        fetcher=fetch_socarxiv_results,
        link_normalizer=normalize_openalex_socarxiv_link,
    )
|
|
1405
|
+
|
|
1406
|
+
|
|
1407
|
+
def collect_ssrn_source(
    source: dict[str, object],
    cutoff: datetime,
    language: str,
) -> dict[str, list[dict[str, str]]]:
    """Collect SSRN papers via the OpenAlex repository collector."""
    return collect_openalex_repository_source(
        source,
        cutoff,
        language,
        source_key="ssrn",
        fetcher=fetch_ssrn_results,
        link_normalizer=normalize_openalex_ssrn_link,
    )
|
|
1421
|
+
|
|
1422
|
+
|
|
1423
|
+
def collect_papers(
    paper_sources: list[dict[str, object]],
    cutoff: datetime,
    language: str,
    max_results: int = 200,
) -> dict[str, list[dict[str, str]]]:
    """Run every enabled paper source and merge the results.

    Entries that are not dicts or are disabled are ignored. A source type
    without a collector is reported as a ``config`` error, and an exception
    escaping a collector as an ``adapter`` error. The merged papers are
    de-duplicated and truncated to ``max_results``.
    """
    collectors = {
        "arxiv": collect_arxiv_source,
        "biorxiv": collect_biorxiv_source,
        "medrxiv": collect_medrxiv_source,
        "chemrxiv": collect_chemrxiv_source,
        "socarxiv": collect_socarxiv_source,
        "ssrn": collect_ssrn_source,
    }

    papers: list[dict[str, str]] = []
    errors: list[dict[str, str]] = []

    for source in paper_sources:
        if not isinstance(source, dict) or not source.get("enabled", True):
            continue

        source_key = normalize_source_type(str(source.get("source_type") or ""))
        collector = collectors.get(source_key)
        if collector is None:
            errors.append(
                build_error(
                    source_key or "papers",
                    "config",
                    f"Unsupported paper source type: {source.get('source_type')}",
                )
            )
            continue

        try:
            outcome = collector(source, cutoff, language)
        except Exception as error:
            errors.append(build_error(source_key or "papers", "adapter", str(error)))
            continue

        papers.extend(outcome["papers"])
        errors.extend(outcome["errors"])

    return {
        "papers": deduplicate_papers(papers)[:max_results],
        "errors": errors,
    }
|
|
1473
|
+
|
|
1474
|
+
|
|
1475
|
+
def fetch_arxiv(
    keywords: list[str], categories: list[str], max_results: int, cutoff: datetime, language: str
) -> list[dict[str, str]]:
    """Legacy entry point: return just the paper rows from ``collect_arxiv_papers``."""
    outcome = collect_arxiv_papers(
        keywords=keywords,
        categories=categories,
        max_results=max_results,
        cutoff=cutoff,
        language=language,
    )
    return outcome["papers"]
|
|
1486
|
+
|
|
1487
|
+
|
|
1488
|
+
def collect_arxiv_papers(
    keywords: list[str],
    categories: list[str],
    max_results: int,
    cutoff: datetime,
    language: str,
    fallback_enabled: bool = True,
    require_arxiv_link: bool = True,
) -> dict[str, list[dict[str, str]]]:
    """Legacy entry point that routes an arXiv-only request through ``collect_papers``.

    The arguments are packed into a legacy-style ``arxiv`` config block,
    normalized with ``normalize_paper_sources``, and then collected.
    """
    legacy_config = {
        "arxiv": {
            "enabled": True,
            "keywords": keywords,
            "categories": categories,
            "max_results": max_results,
            "fallback_enabled": fallback_enabled,
            "require_arxiv_link": require_arxiv_link,
        }
    }
    sources = normalize_paper_sources(legacy_config)
    return collect_papers(sources, cutoff, language, max_results)
|
|
1511
|
+
|
|
1512
|
+
|
|
1513
|
+
def main() -> None:
    """Read config from stdin, run enabled fetchers, and print JSON.

    The config is a JSON object read from stdin. The output, written to
    stdout as one JSON document, has the keys ``rss_articles``,
    ``arxiv_papers``, ``stats`` and ``errors``.

    Raises:
        json.JSONDecodeError: If stdin does not contain valid JSON.
    """
    config = json.loads(sys.stdin.read())
    language = normalize_language(config.get("language"))

    # A null or non-numeric "days" would make timedelta() raise; fall back to
    # the default one-week window instead.
    try:
        days = float(config.get("days", 7))
    except (TypeError, ValueError):
        days = 7
    cutoff = datetime.now(timezone.utc) - timedelta(days=days)

    rss_articles: list[dict[str, str]] = []
    arxiv_papers: list[dict[str, str]] = []
    errors: list[dict[str, str]] = []

    # "rss" may be absent or explicitly null; only a dict can enable RSS.
    rss_config = config.get("rss")
    if isinstance(rss_config, dict) and rss_config.get("enabled", False):
        ensure_dependencies()
        feeds = rss_config.get("feeds") or []
        rss_articles = fetch_rss(feeds, cutoff, language)

    paper_sources = normalize_paper_sources(config)
    if paper_sources:
        paper_result = collect_papers(paper_sources, cutoff, language, 200)
        arxiv_papers = paper_result["papers"]
        errors.extend(paper_result["errors"])

    # Articles whose title starts with "[" are left out of the count; a
    # missing title is treated as empty rather than raising KeyError.
    rss_count = sum(
        1 for article in rss_articles if not str(article.get("title", "")).startswith("[")
    )

    result = {
        "rss_articles": rss_articles,
        "arxiv_papers": arxiv_papers,
        "stats": {
            "rss_count": rss_count,
            "arxiv_count": len(arxiv_papers),
        },
        "errors": errors,
    }
    print(json.dumps(result, ensure_ascii=False))
|
|
1546
|
+
|
|
1547
|
+
|
|
1548
|
+
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|