@yeyuan98/opencode-bioresearcher-plugin 1.3.1 → 1.4.1

This diff shows the publicly released contents of the two package versions as they appear in their respective registries, and is provided for informational purposes only.
Files changed (36)
  1. package/README.md +14 -0
  2. package/dist/index.js +4 -1
  3. package/dist/misc-tools/index.d.ts +3 -0
  4. package/dist/misc-tools/index.js +3 -0
  5. package/dist/misc-tools/json-extract.d.ts +13 -0
  6. package/dist/misc-tools/json-extract.js +394 -0
  7. package/dist/misc-tools/json-infer.d.ts +13 -0
  8. package/dist/misc-tools/json-infer.js +199 -0
  9. package/dist/misc-tools/json-tools.d.ts +33 -0
  10. package/dist/misc-tools/json-tools.js +187 -0
  11. package/dist/misc-tools/json-validate.d.ts +13 -0
  12. package/dist/misc-tools/json-validate.js +228 -0
  13. package/dist/skills/bioresearcher-core/README.md +210 -0
  14. package/dist/skills/bioresearcher-core/SKILL.md +128 -0
  15. package/dist/skills/bioresearcher-core/examples/contexts.json +29 -0
  16. package/dist/skills/bioresearcher-core/examples/data-exchange-example.md +303 -0
  17. package/dist/skills/bioresearcher-core/examples/template.md +49 -0
  18. package/dist/skills/bioresearcher-core/patterns/calculator.md +215 -0
  19. package/dist/skills/bioresearcher-core/patterns/data-exchange.md +406 -0
  20. package/dist/skills/bioresearcher-core/patterns/json-tools.md +263 -0
  21. package/dist/skills/bioresearcher-core/patterns/progress.md +127 -0
  22. package/dist/skills/bioresearcher-core/patterns/retry.md +110 -0
  23. package/dist/skills/bioresearcher-core/patterns/shell-commands.md +79 -0
  24. package/dist/skills/bioresearcher-core/patterns/subagent-waves.md +186 -0
  25. package/dist/skills/bioresearcher-core/patterns/table-tools.md +260 -0
  26. package/dist/skills/bioresearcher-core/patterns/user-confirmation.md +187 -0
  27. package/dist/skills/bioresearcher-core/python/template.md +273 -0
  28. package/dist/skills/bioresearcher-core/python/template.py +323 -0
  29. package/dist/skills/long-table-summary/SKILL.md +374 -0
  30. package/dist/skills/long-table-summary/__init__.py +3 -0
  31. package/dist/skills/long-table-summary/combine_outputs.py +345 -0
  32. package/dist/skills/long-table-summary/pyproject.toml +11 -0
  33. package/dist/skills/pubmed-weekly/SKILL.md +329 -329
  34. package/dist/skills/pubmed-weekly/pubmed_weekly.py +411 -411
  35. package/dist/skills/pubmed-weekly/pyproject.toml +8 -8
  36. package/package.json +7 -2
package/dist/skills/pubmed-weekly/pubmed_weekly.py
@@ -1,411 +1,411 @@
- #!/usr/bin/env python3
- """
- PubMed Weekly Daily Updates Downloader
-
- This script handles:
- - Calculating the past week's date range (Monday-Sunday)
- - Fetching FTP file list from NCBI
- - Filtering files for the specific week
- - Downloading files with retry logic
- - Combining Excel files into combined.xlsx
- """
-
- import os
- import sys
- import re
- import time
- import json
- import glob
- import urllib.request
- import argparse
- from datetime import datetime, timedelta
- from typing import List, Dict, Any
-
-
- def calculate_week() -> str:
-     """Calculate the past week's date range (Monday-Sunday).
-
-     Returns:
-         Week folder name in format 'YYYYMMDD-YYYYMMDD' for the PREVIOUS week
-     """
-     today = datetime.now()
-
-     # Find the most recent Monday of the current week
-     days_since_monday = today.weekday()  # Monday = 0, Sunday = 6
-     current_monday = today - timedelta(days=days_since_monday)
-
-     # Go back one week to get the previous week's Monday
-     previous_week_monday = current_monday - timedelta(days=7)
-
-     # Calculate the previous week's Sunday (6 days after Monday)
-     previous_week_sunday = previous_week_monday + timedelta(days=6)
-
-     week_start = previous_week_monday.strftime("%Y%m%d")
-     week_end = previous_week_sunday.strftime("%Y%m%d")
-
-     return f"{week_start}-{week_end}"
-
-
- def parse_date_from_filename(filename: str) -> datetime | None:
-     """Extract date from PubMed filename.
-
-     PubMed filenames are in format: pubmed24nYYYYMMDD.xml.gz
-     The date is embedded in the number following 'n'
-
-     Args:
-         filename: PubMed filename (e.g., pubmed24n1234.xml.gz)
-
-     Returns:
-         datetime object or None if date cannot be parsed
-     """
-     # Pattern to extract the numeric part after 'n'
-     match = re.match(r"pubmed\d+n(\d+)\.xml\.gz", filename)
-     if not match:
-         return None
-
-     number = match.group(1)
-
-     # PubMed daily update files use a specific numbering scheme
-     # The first 4 digits represent the year (e.g., 2024)
-     # The next 4 digits represent a sequential number within the year
-     # We need to convert this to a date
-
-     # For daily updates, NCBI uses a sequential number that increments daily
-     # We need to look up the actual date from the FTP directory listing
-     # which includes modification time
-
-     return None
-
-     number = match.group(1)
-
-     # PubMed daily update files use a specific numbering scheme
-     # The first 4 digits represent the year (e.g., 2024)
-     # The next 4 digits represent a sequential number within the year
-     # We need to convert this to a date
-
-     # For daily updates, NCBI uses a sequential number that increments daily
-     # We need to look up the actual date from the FTP directory listing
-     # which includes modification time
-
-     return None
-
-
- def fetch_ftp_file_list() -> List[str]:
-     """Fetch list of xml.gz files from NCBI FTP server.
-
-     Returns:
-         List of xml.gz filenames from the FTP server
-     """
-     url = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"
-
-     try:
-         with urllib.request.urlopen(url) as response:
-             html_content = response.read().decode("utf-8")
-
-         # Parse HTML to extract filenames
-         # FTP directory listing returns HTML with links
-         filenames = []
-         for line in html_content.split("\n"):
-             match = re.search(r"pubmed\d+n\d+\.xml\.gz", line)
-             if match:
-                 filename = match.group(0)
-                 if filename not in filenames:
-                     filenames.append(filename)
-
-         return sorted(filenames)
-
-     except Exception as e:
-         print(f"Error fetching FTP file list: {e}", file=sys.stderr)
-         sys.exit(1)
-
-
- def filter_files_by_date(week_name: str, file_list: List[str]) -> List[str]:
-     """Filter files to include only those from the past week.
-
-     Since PubMed files don't encode the date directly in the filename,
-     we need to download the directory listing with timestamps and filter
-     based on modification date.
-
-     Args:
-         week_name: Week folder name (YYYYMMDD-YYYYMMDD)
-         file_list: List of all xml.gz filenames
-
-     Returns:
-         List of filenames that fall within the date range
-     """
-     # Parse week dates
-     start_date_str, end_date_str = week_name.split("-")
-     start_date = datetime.strptime(start_date_str, "%Y%m%d")
-     end_date = datetime.strptime(end_date_str, "%Y%m%d").replace(
-         hour=23, minute=59, second=59
-     )
-
-     # Fetch directory listing with timestamps
-     url = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"
-
-     try:
-         with urllib.request.urlopen(url) as response:
-             content = response.read().decode("utf-8", errors="ignore")
-
-         # Parse file listings with dates
-         # NCBI FTP format uses ISO date: "2026-01-30 14:02"
-         file_dates = {}
-
-         for filename in file_list:
-             # Find the line containing this file
-             # Pattern: filename followed by non-digits, then date YYYY-MM-DD HH:MM
-             pattern = re.escape(filename)
-             match = re.search(
-                 rf"({pattern})[^0-9]*(\d{{4}}-\d{{2}}-\d{{2}})\s+(\d{{2}}:\d{{2}})",
-                 content,
-             )
-
-             if match:
-                 date_str = match.group(2)
-                 time_str = match.group(3)
-                 try:
-                     # Parse date in ISO format: "2026-01-30 14:02"
-                     file_date = datetime.strptime(
-                         f"{date_str} {time_str}", "%Y-%m-%d %H:%M"
-                     )
-
-                     file_dates[filename] = file_date
-                 except ValueError:
-                     continue
-
-         # Filter files within date range
-         filtered_files = [
-             filename
-             for filename, file_date in file_dates.items()
-             if start_date <= file_date <= end_date
-         ]
-
-         return sorted(filtered_files)
-
-     except Exception as e:
-         print(f"Error filtering files by date: {e}", file=sys.stderr)
-         sys.exit(1)
-
-
- def download_file(week_name: str, filename: str, max_retries: int = 3) -> int:
-     """Download a single file from NCBI FTP server with retry logic.
-
-     Args:
-         week_name: Week folder name
-         filename: XML.gz filename to download
-         max_retries: Maximum number of retry attempts
-
-     Returns:
-         0 on success, 1 on failure (after all retries)
-     """
-     base_url = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"
-     url = f"{base_url}{filename}"
-
-     # Create download directory in current working directory
-     base_dir = os.getcwd()
-     download_dir = os.path.join(base_dir, ".download", "pubmed-daily", week_name)
-     os.makedirs(download_dir, exist_ok=True)
-
-     filepath = os.path.join(download_dir, filename)
-
-     for attempt in range(max_retries):
-         try:
-             print(f"Downloading {filename} (attempt {attempt + 1}/{max_retries})...")
-
-             urllib.request.urlretrieve(url, filepath)
-
-             # Verify file was downloaded and has content
-             if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
-                 print(f"Successfully downloaded {filename}")
-                 return 0
-             else:
-                 raise Exception("Downloaded file is empty or missing")
-
-         except Exception as e:
-             print(f"Error downloading {filename}: {e}", file=sys.stderr)
-
-             if attempt < max_retries - 1:
-                 print(f"Retrying in 2 seconds...")
-                 time.sleep(2)
-             else:
-                 print(f"Failed to download {filename} after {max_retries} attempts")
-                 return 1
-
-     return 1
-
-
- def combine_excel(week_name: str) -> Dict[str, Any]:
-     """Combine all Excel files in week folder into combined.xlsx.
-
-     Args:
-         week_name: Week folder name (e.g., '20250217-20250223')
-
-     Returns:
-         Dict with success, total_rows, source_files, output_file
-     """
-     try:
-         from openpyxl import load_workbook, Workbook
-     except ImportError:
-         print("Error: openpyxl package not installed.", file=sys.stderr)
-         print("Please install with: uv add openpyxl", file=sys.stderr)
-         return {
-             "success": False,
-             "error": "openpyxl not installed",
-             "total_rows": 0,
-             "source_files": [],
-             "output_file": None,
-         }
-
-     # Use current working directory
-     base_dir = os.getcwd()
-     week_dir = os.path.join(base_dir, ".download", "pubmed-daily", week_name)
-
-     if not os.path.exists(week_dir):
-         return {
-             "success": False,
-             "error": f"Directory not found: {week_dir}",
-             "total_rows": 0,
-             "source_files": [],
-             "output_file": None,
-         }
-
-     xlsx_pattern = os.path.join(week_dir, "*.xlsx")
-     all_xlsx_files = glob.glob(xlsx_pattern)
-
-     source_files = [
-         os.path.basename(f) for f in all_xlsx_files if not f.endswith("combined.xlsx")
-     ]
-     source_files.sort()
-
-     if not source_files:
-         return {
-             "success": False,
-             "error": "No Excel files found to combine",
-             "total_rows": 0,
-             "source_files": [],
-             "output_file": None,
-         }
-
-     combined_wb = Workbook()
-     combined_ws = combined_wb.active
-     if combined_ws is None:
-         combined_ws = combined_wb.create_sheet("PubMed Articles")
-     else:
-         combined_ws.title = "PubMed Articles"
-
-     header_written = False
-     total_rows = 0
-     processed_files = []
-
-     for filename in source_files:
-         filepath = os.path.join(week_dir, filename)
-
-         try:
-             wb = load_workbook(filepath, read_only=True, data_only=True)
-             ws = wb.active
-             if ws is None:
-                 print(f"Warning: {filename} has no active sheet, skipping")
-                 wb.close()
-                 continue
-
-             rows = list(ws.rows)
-             if not rows:
-                 print(f"Warning: {filename} is empty, skipping")
-                 wb.close()
-                 continue
-
-             if not header_written:
-                 headers = [cell.value for cell in rows[0]]
-                 combined_ws.append(headers)
-                 header_written = True
-                 data_start = 1
-             else:
-                 data_start = 0
-
-             for row in rows[data_start:]:
-                 row_values = [cell.value for cell in row]
-                 if any(v is not None for v in row_values):
-                     combined_ws.append(row_values)
-                     total_rows += 1
-
-             processed_files.append(filename)
-             wb.close()
-             print(f"Processed: {filename}")
-
-         except Exception as e:
-             print(f"Warning: Error processing {filename}: {e}", file=sys.stderr)
-             continue
-
-     output_path = os.path.join(week_dir, "combined.xlsx")
-     combined_wb.save(output_path)
-
-     print(f"\nCombined {total_rows} rows from {len(processed_files)} files")
-     print(f"Output: {output_path}")
-
-     return {
-         "success": True,
-         "total_rows": total_rows,
-         "source_files": processed_files,
-         "output_file": "combined.xlsx",
-     }
-
-
- def main():
-     """Main entry point for command-line usage."""
-     parser = argparse.ArgumentParser(
-         description="PubMed Weekly Daily Updates Downloader"
-     )
-     parser.add_argument("command", type=str, help="Command to execute")
-     parser.add_argument("args", nargs="*", help="Command arguments")
-
-     parsed = parser.parse_args()
-
-     command = parsed.command
-     args = parsed.args
-
-     if command == "calculate_week":
-         week = calculate_week()
-         print(week)
-
-     elif command == "fetch_files":
-         files = fetch_ftp_file_list()
-         print(" ".join(files))
-
-     elif command == "filter_files":
-         if len(args) < 2:
-             print("Usage: python pubmed_weekly.py filter_files <week_name> <file_list>")
-             sys.exit(1)
-
-         week_name = args[0]
-         file_list = args[1].split()
-         filtered = filter_files_by_date(week_name, file_list)
-         print(" ".join(filtered))
-
-     elif command == "download_file":
-         if len(args) < 2:
-             print("Usage: python pubmed_weekly.py download_file <week_name> <filename>")
-             sys.exit(1)
-
-         week_name = args[0]
-         filename = args[1]
-         sys.exit(download_file(week_name, filename))
-
-     elif command == "combine_excel":
-         if len(args) < 1:
-             print("Usage: python pubmed_weekly.py combine_excel <week_name>")
-             sys.exit(1)
-
-         week_name = args[0]
-         result = combine_excel(week_name)
-         print(json.dumps(result, indent=2))
-
-         if not result.get("success"):
-             sys.exit(1)
-
-     else:
-         print(f"Unknown command: {command}")
-         sys.exit(1)
-
-
- if __name__ == "__main__":
-     main()
+ #!/usr/bin/env python3
+ """
+ PubMed Weekly Daily Updates Downloader
+
+ This script handles:
+ - Calculating the past week's date range (Monday-Sunday)
+ - Fetching FTP file list from NCBI
+ - Filtering files for the specific week
+ - Downloading files with retry logic
+ - Combining Excel files into combined.xlsx
+ """
+
+ import os
+ import sys
+ import re
+ import time
+ import json
+ import glob
+ import urllib.request
+ import argparse
+ from datetime import datetime, timedelta
+ from typing import List, Dict, Any
+
+
+ def calculate_week() -> str:
+     """Calculate the past week's date range (Monday-Sunday).
+
+     Returns:
+         Week folder name in format 'YYYYMMDD-YYYYMMDD' for the PREVIOUS week
+     """
+     today = datetime.now()
+
+     # Find the most recent Monday of the current week
+     days_since_monday = today.weekday()  # Monday = 0, Sunday = 6
+     current_monday = today - timedelta(days=days_since_monday)
+
+     # Go back one week to get the previous week's Monday
+     previous_week_monday = current_monday - timedelta(days=7)
+
+     # Calculate the previous week's Sunday (6 days after Monday)
+     previous_week_sunday = previous_week_monday + timedelta(days=6)
+
+     week_start = previous_week_monday.strftime("%Y%m%d")
+     week_end = previous_week_sunday.strftime("%Y%m%d")
+
+     return f"{week_start}-{week_end}"
+
+
+ def parse_date_from_filename(filename: str) -> datetime | None:
+     """Extract date from PubMed filename.
+
+     PubMed filenames are in format: pubmed24nYYYYMMDD.xml.gz
+     The date is embedded in the number following 'n'
+
+     Args:
+         filename: PubMed filename (e.g., pubmed24n1234.xml.gz)
+
+     Returns:
+         datetime object or None if date cannot be parsed
+     """
+     # Pattern to extract the numeric part after 'n'
+     match = re.match(r"pubmed\d+n(\d+)\.xml\.gz", filename)
+     if not match:
+         return None
+
+     number = match.group(1)
+
+     # PubMed daily update files use a specific numbering scheme
+     # The first 4 digits represent the year (e.g., 2024)
+     # The next 4 digits represent a sequential number within the year
+     # We need to convert this to a date
+
+     # For daily updates, NCBI uses a sequential number that increments daily
+     # We need to look up the actual date from the FTP directory listing
+     # which includes modification time
+
+     return None
+
+     number = match.group(1)
+
+     # PubMed daily update files use a specific numbering scheme
+     # The first 4 digits represent the year (e.g., 2024)
+     # The next 4 digits represent a sequential number within the year
+     # We need to convert this to a date
+
+     # For daily updates, NCBI uses a sequential number that increments daily
+     # We need to look up the actual date from the FTP directory listing
+     # which includes modification time
+
+     return None
+
+
+ def fetch_ftp_file_list() -> List[str]:
+     """Fetch list of xml.gz files from NCBI FTP server.
+
+     Returns:
+         List of xml.gz filenames from the FTP server
+     """
+     url = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"
+
+     try:
+         with urllib.request.urlopen(url) as response:
+             html_content = response.read().decode("utf-8")
+
+         # Parse HTML to extract filenames
+         # FTP directory listing returns HTML with links
+         filenames = []
+         for line in html_content.split("\n"):
+             match = re.search(r"pubmed\d+n\d+\.xml\.gz", line)
+             if match:
+                 filename = match.group(0)
+                 if filename not in filenames:
+                     filenames.append(filename)
+
+         return sorted(filenames)
+
+     except Exception as e:
+         print(f"Error fetching FTP file list: {e}", file=sys.stderr)
+         sys.exit(1)
+
+
+ def filter_files_by_date(week_name: str, file_list: List[str]) -> List[str]:
+     """Filter files to include only those from the past week.
+
+     Since PubMed files don't encode the date directly in the filename,
+     we need to download the directory listing with timestamps and filter
+     based on modification date.
+
+     Args:
+         week_name: Week folder name (YYYYMMDD-YYYYMMDD)
+         file_list: List of all xml.gz filenames
+
+     Returns:
+         List of filenames that fall within the date range
+     """
+     # Parse week dates
+     start_date_str, end_date_str = week_name.split("-")
+     start_date = datetime.strptime(start_date_str, "%Y%m%d")
+     end_date = datetime.strptime(end_date_str, "%Y%m%d").replace(
+         hour=23, minute=59, second=59
+     )
+
+     # Fetch directory listing with timestamps
+     url = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"
+
+     try:
+         with urllib.request.urlopen(url) as response:
+             content = response.read().decode("utf-8", errors="ignore")
+
+         # Parse file listings with dates
+         # NCBI FTP format uses ISO date: "2026-01-30 14:02"
+         file_dates = {}
+
+         for filename in file_list:
+             # Find the line containing this file
+             # Pattern: filename followed by non-digits, then date YYYY-MM-DD HH:MM
+             pattern = re.escape(filename)
+             match = re.search(
+                 rf"({pattern})[^0-9]*(\d{{4}}-\d{{2}}-\d{{2}})\s+(\d{{2}}:\d{{2}})",
+                 content,
+             )
+
+             if match:
+                 date_str = match.group(2)
+                 time_str = match.group(3)
+                 try:
+                     # Parse date in ISO format: "2026-01-30 14:02"
+                     file_date = datetime.strptime(
+                         f"{date_str} {time_str}", "%Y-%m-%d %H:%M"
+                     )
+
+                     file_dates[filename] = file_date
+                 except ValueError:
+                     continue
+
+         # Filter files within date range
+         filtered_files = [
+             filename
+             for filename, file_date in file_dates.items()
+             if start_date <= file_date <= end_date
+         ]
+
+         return sorted(filtered_files)
+
+     except Exception as e:
+         print(f"Error filtering files by date: {e}", file=sys.stderr)
+         sys.exit(1)
+
+
+ def download_file(week_name: str, filename: str, max_retries: int = 3) -> int:
+     """Download a single file from NCBI FTP server with retry logic.
+
+     Args:
+         week_name: Week folder name
+         filename: XML.gz filename to download
+         max_retries: Maximum number of retry attempts
+
+     Returns:
+         0 on success, 1 on failure (after all retries)
+     """
+     base_url = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"
+     url = f"{base_url}{filename}"
+
+     # Create download directory in current working directory
+     base_dir = os.getcwd()
+     download_dir = os.path.join(base_dir, ".download", "pubmed-daily", week_name)
+     os.makedirs(download_dir, exist_ok=True)
+
+     filepath = os.path.join(download_dir, filename)
+
+     for attempt in range(max_retries):
+         try:
+             print(f"Downloading {filename} (attempt {attempt + 1}/{max_retries})...")
+
+             urllib.request.urlretrieve(url, filepath)
+
+             # Verify file was downloaded and has content
+             if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
+                 print(f"Successfully downloaded {filename}")
+                 return 0
+             else:
+                 raise Exception("Downloaded file is empty or missing")
+
+         except Exception as e:
+             print(f"Error downloading {filename}: {e}", file=sys.stderr)
+
+             if attempt < max_retries - 1:
+                 print(f"Retrying in 2 seconds...")
+                 time.sleep(2)
+             else:
+                 print(f"Failed to download {filename} after {max_retries} attempts")
+                 return 1
+
+     return 1
+
+
+ def combine_excel(week_name: str) -> Dict[str, Any]:
+     """Combine all Excel files in week folder into combined.xlsx.
+
+     Args:
+         week_name: Week folder name (e.g., '20250217-20250223')
+
+     Returns:
+         Dict with success, total_rows, source_files, output_file
+     """
+     try:
+         from openpyxl import load_workbook, Workbook
+     except ImportError:
+         print("Error: openpyxl package not installed.", file=sys.stderr)
+         print("Please install with: uv add openpyxl", file=sys.stderr)
+         return {
+             "success": False,
+             "error": "openpyxl not installed",
+             "total_rows": 0,
+             "source_files": [],
+             "output_file": None,
+         }
+
+     # Use current working directory
+     base_dir = os.getcwd()
+     week_dir = os.path.join(base_dir, ".download", "pubmed-daily", week_name)
+
+     if not os.path.exists(week_dir):
+         return {
+             "success": False,
+             "error": f"Directory not found: {week_dir}",
+             "total_rows": 0,
+             "source_files": [],
+             "output_file": None,
+         }
+
+     xlsx_pattern = os.path.join(week_dir, "*.xlsx")
+     all_xlsx_files = glob.glob(xlsx_pattern)
+
+     source_files = [
+         os.path.basename(f) for f in all_xlsx_files if not f.endswith("combined.xlsx")
+     ]
+     source_files.sort()
+
+     if not source_files:
+         return {
+             "success": False,
+             "error": "No Excel files found to combine",
+             "total_rows": 0,
+             "source_files": [],
+             "output_file": None,
+         }
+
+     combined_wb = Workbook()
+     combined_ws = combined_wb.active
+     if combined_ws is None:
+         combined_ws = combined_wb.create_sheet("PubMed Articles")
+     else:
+         combined_ws.title = "PubMed Articles"
+
+     header_written = False
+     total_rows = 0
+     processed_files = []
+
+     for filename in source_files:
+         filepath = os.path.join(week_dir, filename)
+
+         try:
+             wb = load_workbook(filepath, read_only=True, data_only=True)
+             ws = wb.active
+             if ws is None:
+                 print(f"Warning: {filename} has no active sheet, skipping")
+                 wb.close()
+                 continue
+
+             rows = list(ws.rows)
+             if not rows:
+                 print(f"Warning: {filename} is empty, skipping")
+                 wb.close()
+                 continue
+
+             if not header_written:
+                 headers = [cell.value for cell in rows[0]]
+                 combined_ws.append(headers)
+                 header_written = True
+                 data_start = 1
+             else:
+                 data_start = 0
+
+             for row in rows[data_start:]:
+                 row_values = [cell.value for cell in row]
+                 if any(v is not None for v in row_values):
+                     combined_ws.append(row_values)
+                     total_rows += 1
+
+             processed_files.append(filename)
+             wb.close()
+             print(f"Processed: {filename}")
+
+         except Exception as e:
+             print(f"Warning: Error processing {filename}: {e}", file=sys.stderr)
+             continue
+
+     output_path = os.path.join(week_dir, "combined.xlsx")
+     combined_wb.save(output_path)
+
+     print(f"\nCombined {total_rows} rows from {len(processed_files)} files")
+     print(f"Output: {output_path}")
+
+     return {
+         "success": True,
+         "total_rows": total_rows,
+         "source_files": processed_files,
+         "output_file": "combined.xlsx",
+     }
+
+
+ def main():
+     """Main entry point for command-line usage."""
+     parser = argparse.ArgumentParser(
+         description="PubMed Weekly Daily Updates Downloader"
+     )
+     parser.add_argument("command", type=str, help="Command to execute")
+     parser.add_argument("args", nargs="*", help="Command arguments")
+
+     parsed = parser.parse_args()
+
+     command = parsed.command
+     args = parsed.args
+
+     if command == "calculate_week":
+         week = calculate_week()
+         print(week)
+
+     elif command == "fetch_files":
+         files = fetch_ftp_file_list()
+         print(" ".join(files))
+
+     elif command == "filter_files":
+         if len(args) < 2:
+             print("Usage: python pubmed_weekly.py filter_files <week_name> <file_list>")
+             sys.exit(1)
+
+         week_name = args[0]
+         file_list = args[1].split()
+         filtered = filter_files_by_date(week_name, file_list)
+         print(" ".join(filtered))
+
+     elif command == "download_file":
+         if len(args) < 2:
+             print("Usage: python pubmed_weekly.py download_file <week_name> <filename>")
+             sys.exit(1)
+
+         week_name = args[0]
+         filename = args[1]
+         sys.exit(download_file(week_name, filename))
+
+     elif command == "combine_excel":
+         if len(args) < 1:
+             print("Usage: python pubmed_weekly.py combine_excel <week_name>")
+             sys.exit(1)
+
+         week_name = args[0]
+         result = combine_excel(week_name)
+         print(json.dumps(result, indent=2))
+
+         if not result.get("success"):
+             sys.exit(1)
+
+     else:
+         print(f"Unknown command: {command}")
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
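
For orientation, main() in pubmed_weekly.py exposes five subcommands (calculate_week, fetch_files, filter_files, download_file, combine_excel). The sketch below chains the same functions directly from Python instead of via the CLI. It is a minimal illustration, not part of the package: it assumes pubmed_weekly.py is importable from the working directory, and that some separate step has already produced per-day .xlsx files in the week folder, since the script itself downloads .xml.gz updates while combine_excel merges .xlsx files.

# Minimal driver sketch (hypothetical usage; assumes pubmed_weekly.py is
# importable and that .xlsx files already exist in the week folder).
from pubmed_weekly import (
    calculate_week,
    fetch_ftp_file_list,
    filter_files_by_date,
    download_file,
    combine_excel,
)

week = calculate_week()  # previous Mon-Sun, e.g. "20250217-20250223"
candidates = fetch_ftp_file_list()  # all pubmed*n*.xml.gz names on the FTP server
weekly = filter_files_by_date(week, candidates)  # keep files modified in that week

for name in weekly:
    # download_file retries up to 3 times; returns 0 on success, 1 on failure
    if download_file(week, name) != 0:
        print(f"giving up on {name}")

result = combine_excel(week)  # merges *.xlsx in the week folder into combined.xlsx
if result["success"]:
    print(f"combined {result['total_rows']} rows into {result['output_file']}")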