@yeyuan98/opencode-bioresearcher-plugin 1.3.1 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -0
- package/dist/index.js +4 -1
- package/dist/misc-tools/index.d.ts +3 -0
- package/dist/misc-tools/index.js +3 -0
- package/dist/misc-tools/json-extract.d.ts +13 -0
- package/dist/misc-tools/json-extract.js +394 -0
- package/dist/misc-tools/json-infer.d.ts +13 -0
- package/dist/misc-tools/json-infer.js +199 -0
- package/dist/misc-tools/json-tools.d.ts +33 -0
- package/dist/misc-tools/json-tools.js +187 -0
- package/dist/misc-tools/json-validate.d.ts +13 -0
- package/dist/misc-tools/json-validate.js +228 -0
- package/dist/skills/bioresearcher-core/README.md +210 -0
- package/dist/skills/bioresearcher-core/SKILL.md +128 -0
- package/dist/skills/bioresearcher-core/examples/contexts.json +29 -0
- package/dist/skills/bioresearcher-core/examples/data-exchange-example.md +303 -0
- package/dist/skills/bioresearcher-core/examples/template.md +49 -0
- package/dist/skills/bioresearcher-core/patterns/calculator.md +215 -0
- package/dist/skills/bioresearcher-core/patterns/data-exchange.md +406 -0
- package/dist/skills/bioresearcher-core/patterns/json-tools.md +263 -0
- package/dist/skills/bioresearcher-core/patterns/progress.md +127 -0
- package/dist/skills/bioresearcher-core/patterns/retry.md +110 -0
- package/dist/skills/bioresearcher-core/patterns/shell-commands.md +79 -0
- package/dist/skills/bioresearcher-core/patterns/subagent-waves.md +186 -0
- package/dist/skills/bioresearcher-core/patterns/table-tools.md +260 -0
- package/dist/skills/bioresearcher-core/patterns/user-confirmation.md +187 -0
- package/dist/skills/bioresearcher-core/python/template.md +273 -0
- package/dist/skills/bioresearcher-core/python/template.py +323 -0
- package/dist/skills/long-table-summary/SKILL.md +374 -0
- package/dist/skills/long-table-summary/__init__.py +3 -0
- package/dist/skills/long-table-summary/combine_outputs.py +345 -0
- package/dist/skills/long-table-summary/pyproject.toml +11 -0
- package/dist/skills/pubmed-weekly/SKILL.md +329 -329
- package/dist/skills/pubmed-weekly/pubmed_weekly.py +411 -411
- package/dist/skills/pubmed-weekly/pyproject.toml +8 -8
- package/package.json +7 -2
package/dist/skills/pubmed-weekly/pubmed_weekly.py
@@ -1,411 +1,411 @@
This hunk removes and re-adds the entire file. As rendered in this listing, the removed (-) and re-added (+) sides are line-for-line identical, so the content appears once below.
#!/usr/bin/env python3
"""
PubMed Weekly Daily Updates Downloader

This script handles:
- Calculating the past week's date range (Monday-Sunday)
- Fetching the FTP file list from NCBI
- Filtering files for the specific week
- Downloading files with retry logic
- Combining Excel files into combined.xlsx
"""

import os
import sys
import re
import time
import json
import glob
import urllib.request
import argparse
from datetime import datetime, timedelta
from typing import List, Dict, Any


def calculate_week() -> str:
    """Calculate the past week's date range (Monday-Sunday).

    Returns:
        Week folder name in format 'YYYYMMDD-YYYYMMDD' for the PREVIOUS week
    """
    today = datetime.now()

    # Find the most recent Monday of the current week
    days_since_monday = today.weekday()  # Monday = 0, Sunday = 6
    current_monday = today - timedelta(days=days_since_monday)

    # Go back one week to get the previous week's Monday
    previous_week_monday = current_monday - timedelta(days=7)

    # Calculate the previous week's Sunday (6 days after Monday)
    previous_week_sunday = previous_week_monday + timedelta(days=6)

    week_start = previous_week_monday.strftime("%Y%m%d")
    week_end = previous_week_sunday.strftime("%Y%m%d")

    return f"{week_start}-{week_end}"


def parse_date_from_filename(filename: str) -> datetime | None:
    """Extract date from a PubMed filename.

    PubMed update filenames have the form pubmedYYnNNNN.xml.gz, where the
    number following 'n' is a sequential index within the year, not a date.

    Args:
        filename: PubMed filename (e.g., pubmed24n1234.xml.gz)

    Returns:
        datetime object or None if the date cannot be determined
    """
    # Pattern to extract the numeric part after 'n'
    match = re.match(r"pubmed\d+n(\d+)\.xml\.gz", filename)
    if not match:
        return None

    # The sequential index does not encode a calendar date, so the actual
    # date must be looked up from the FTP directory listing, which includes
    # each file's modification time (see filter_files_by_date below).
    return None


def fetch_ftp_file_list() -> List[str]:
    """Fetch list of xml.gz files from the NCBI FTP server.

    Returns:
        List of xml.gz filenames from the FTP server
    """
    url = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"

    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read().decode("utf-8")

        # Scan the directory listing line by line for xml.gz filenames
        filenames = []
        for line in html_content.split("\n"):
            match = re.search(r"pubmed\d+n\d+\.xml\.gz", line)
            if match:
                filename = match.group(0)
                if filename not in filenames:
                    filenames.append(filename)

        return sorted(filenames)

    except Exception as e:
        print(f"Error fetching FTP file list: {e}", file=sys.stderr)
        sys.exit(1)


def filter_files_by_date(week_name: str, file_list: List[str]) -> List[str]:
    """Filter files to include only those from the past week.

    Since PubMed files don't encode the date directly in the filename,
    we download the directory listing with timestamps and filter based
    on modification date.

    Args:
        week_name: Week folder name (YYYYMMDD-YYYYMMDD)
        file_list: List of all xml.gz filenames

    Returns:
        List of filenames that fall within the date range
    """
    # Parse week dates
    start_date_str, end_date_str = week_name.split("-")
    start_date = datetime.strptime(start_date_str, "%Y%m%d")
    end_date = datetime.strptime(end_date_str, "%Y%m%d").replace(
        hour=23, minute=59, second=59
    )

    # Fetch directory listing with timestamps
    url = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"

    try:
        with urllib.request.urlopen(url) as response:
            content = response.read().decode("utf-8", errors="ignore")

        # Parse file listings with dates
        # NCBI FTP listings use an ISO-style timestamp, e.g. "2026-01-30 14:02"
        file_dates = {}

        for filename in file_list:
            # Find the line containing this file
            # Pattern: filename followed by non-digits, then date YYYY-MM-DD HH:MM
            pattern = re.escape(filename)
            match = re.search(
                rf"({pattern})[^0-9]*(\d{{4}}-\d{{2}}-\d{{2}})\s+(\d{{2}}:\d{{2}})",
                content,
            )

            if match:
                date_str = match.group(2)
                time_str = match.group(3)
                try:
                    file_date = datetime.strptime(
                        f"{date_str} {time_str}", "%Y-%m-%d %H:%M"
                    )
                    file_dates[filename] = file_date
                except ValueError:
                    continue

        # Keep only files whose timestamps fall within the date range
        filtered_files = [
            filename
            for filename, file_date in file_dates.items()
            if start_date <= file_date <= end_date
        ]

        return sorted(filtered_files)

    except Exception as e:
        print(f"Error filtering files by date: {e}", file=sys.stderr)
        sys.exit(1)


def download_file(week_name: str, filename: str, max_retries: int = 3) -> int:
    """Download a single file from the NCBI FTP server with retry logic.

    Args:
        week_name: Week folder name
        filename: xml.gz filename to download
        max_retries: Maximum number of retry attempts

    Returns:
        0 on success, 1 on failure (after all retries)
    """
    base_url = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"
    url = f"{base_url}{filename}"

    # Create download directory in current working directory
    base_dir = os.getcwd()
    download_dir = os.path.join(base_dir, ".download", "pubmed-daily", week_name)
    os.makedirs(download_dir, exist_ok=True)

    filepath = os.path.join(download_dir, filename)

    for attempt in range(max_retries):
        try:
            print(f"Downloading {filename} (attempt {attempt + 1}/{max_retries})...")

            urllib.request.urlretrieve(url, filepath)

            # Verify the file was downloaded and has content
            if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
                print(f"Successfully downloaded {filename}")
                return 0
            else:
                raise Exception("Downloaded file is empty or missing")

        except Exception as e:
            print(f"Error downloading {filename}: {e}", file=sys.stderr)

            if attempt < max_retries - 1:
                print("Retrying in 2 seconds...")
                time.sleep(2)
            else:
                print(f"Failed to download {filename} after {max_retries} attempts")
                return 1

    return 1


def combine_excel(week_name: str) -> Dict[str, Any]:
    """Combine all Excel files in the week folder into combined.xlsx.

    Args:
        week_name: Week folder name (e.g., '20250217-20250223')

    Returns:
        Dict with success, total_rows, source_files, output_file
    """
    try:
        from openpyxl import load_workbook, Workbook
    except ImportError:
        print("Error: openpyxl package not installed.", file=sys.stderr)
        print("Please install with: uv add openpyxl", file=sys.stderr)
        return {
            "success": False,
            "error": "openpyxl not installed",
            "total_rows": 0,
            "source_files": [],
            "output_file": None,
        }

    # Use current working directory
    base_dir = os.getcwd()
    week_dir = os.path.join(base_dir, ".download", "pubmed-daily", week_name)

    if not os.path.exists(week_dir):
        return {
            "success": False,
            "error": f"Directory not found: {week_dir}",
            "total_rows": 0,
            "source_files": [],
            "output_file": None,
        }

    xlsx_pattern = os.path.join(week_dir, "*.xlsx")
    all_xlsx_files = glob.glob(xlsx_pattern)

    source_files = [
        os.path.basename(f) for f in all_xlsx_files if not f.endswith("combined.xlsx")
    ]
    source_files.sort()

    if not source_files:
        return {
            "success": False,
            "error": "No Excel files found to combine",
            "total_rows": 0,
            "source_files": [],
            "output_file": None,
        }

    combined_wb = Workbook()
    combined_ws = combined_wb.active
    if combined_ws is None:
        combined_ws = combined_wb.create_sheet("PubMed Articles")
    else:
        combined_ws.title = "PubMed Articles"

    header_written = False
    total_rows = 0
    processed_files = []

    for filename in source_files:
        filepath = os.path.join(week_dir, filename)

        try:
            wb = load_workbook(filepath, read_only=True, data_only=True)
            ws = wb.active
            if ws is None:
                print(f"Warning: {filename} has no active sheet, skipping")
                wb.close()
                continue

            rows = list(ws.rows)
            if not rows:
                print(f"Warning: {filename} is empty, skipping")
                wb.close()
                continue

            # Write the header row once, from the first non-empty file;
            # subsequent files are appended from their first row, i.e. they
            # are assumed to contain data rows only
            if not header_written:
                headers = [cell.value for cell in rows[0]]
                combined_ws.append(headers)
                header_written = True
                data_start = 1
            else:
                data_start = 0

            for row in rows[data_start:]:
                row_values = [cell.value for cell in row]
                if any(v is not None for v in row_values):
                    combined_ws.append(row_values)
                    total_rows += 1

            processed_files.append(filename)
            wb.close()
            print(f"Processed: {filename}")

        except Exception as e:
            print(f"Warning: Error processing {filename}: {e}", file=sys.stderr)
            continue

    output_path = os.path.join(week_dir, "combined.xlsx")
    combined_wb.save(output_path)

    print(f"\nCombined {total_rows} rows from {len(processed_files)} files")
    print(f"Output: {output_path}")

    return {
        "success": True,
        "total_rows": total_rows,
        "source_files": processed_files,
        "output_file": "combined.xlsx",
    }


def main():
    """Main entry point for command-line usage."""
    parser = argparse.ArgumentParser(
        description="PubMed Weekly Daily Updates Downloader"
    )
    parser.add_argument("command", type=str, help="Command to execute")
    parser.add_argument("args", nargs="*", help="Command arguments")

    parsed = parser.parse_args()

    command = parsed.command
    args = parsed.args

    if command == "calculate_week":
        week = calculate_week()
        print(week)

    elif command == "fetch_files":
        files = fetch_ftp_file_list()
        print(" ".join(files))

    elif command == "filter_files":
        if len(args) < 2:
            print("Usage: python pubmed_weekly.py filter_files <week_name> <file_list>")
            sys.exit(1)

        week_name = args[0]
        file_list = args[1].split()
        filtered = filter_files_by_date(week_name, file_list)
        print(" ".join(filtered))

    elif command == "download_file":
        if len(args) < 2:
            print("Usage: python pubmed_weekly.py download_file <week_name> <filename>")
            sys.exit(1)

        week_name = args[0]
        filename = args[1]
        sys.exit(download_file(week_name, filename))

    elif command == "combine_excel":
        if len(args) < 1:
            print("Usage: python pubmed_weekly.py combine_excel <week_name>")
            sys.exit(1)

        week_name = args[0]
        result = combine_excel(week_name)
        print(json.dumps(result, indent=2))

        if not result.get("success"):
            sys.exit(1)

    else:
        print(f"Unknown command: {command}")
        sys.exit(1)


if __name__ == "__main__":
    main()
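The script is driven entirely through the subcommands dispatched in main(). Below is a minimal sketch of how its functions chain into the weekly workflow, assuming pubmed_weekly.py is importable on sys.path, the NCBI FTP server is reachable, and openpyxl is installed for the combine step; it is an illustration, not part of the package.

# Minimal workflow sketch (assumptions: pubmed_weekly importable,
# NCBI FTP reachable, openpyxl installed).
import json
import sys

import pubmed_weekly

# 1. Compute the previous Monday-Sunday window; run on Wednesday
#    2025-02-26 this yields "20250217-20250223".
week = pubmed_weekly.calculate_week()

# 2. List all update files on the FTP server, then keep those whose
#    listing timestamps fall inside the week.
all_files = pubmed_weekly.fetch_ftp_file_list()
week_files = pubmed_weekly.filter_files_by_date(week, all_files)

# 3. Download each file into .download/pubmed-daily/<week>/ with retries
#    (download_file returns 0 on success, 1 on failure).
failures = []
for f in week_files:
    if pubmed_weekly.download_file(week, f) != 0:
        failures.append(f)
if failures:
    print(f"Failed downloads: {failures}", file=sys.stderr)

# 4. Combine any .xlsx files in the week folder into combined.xlsx.
#    The downloads themselves are .xml.gz; the .xlsx inputs are presumably
#    produced by a later processing step outside this script.
result = pubmed_weekly.combine_excel(week)
print(json.dumps(result, indent=2))

The same steps are exposed on the command line (e.g. python pubmed_weekly.py calculate_week), which is presumably how the pubmed-weekly skill invokes them step by step.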