maestro-fetch 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. maestro_fetch-0.2.0/.claude/skills/fetch/SKILL.md +281 -0
  2. maestro_fetch-0.2.0/.claude-plugin/marketplace.json +28 -0
  3. maestro_fetch-0.2.0/.claude-plugin/plugin.json +14 -0
  4. maestro_fetch-0.2.0/.gitignore +18 -0
  5. maestro_fetch-0.2.0/LICENSE +21 -0
  6. maestro_fetch-0.2.0/PKG-INFO +323 -0
  7. maestro_fetch-0.2.0/README.md +259 -0
  8. maestro_fetch-0.2.0/benchmarks/bench_cdp_social.py +322 -0
  9. maestro_fetch-0.2.0/benchmarks/results/cdp_social.json +46 -0
  10. maestro_fetch-0.2.0/benchmarks/results/cdp_social_final.json +178 -0
  11. maestro_fetch-0.2.0/benchmarks/results/cdp_social_loggedin.json +46 -0
  12. maestro_fetch-0.2.0/benchmarks/results/cdp_social_others.json +134 -0
  13. maestro_fetch-0.2.0/benchmarks/results/cdp_social_v2.json +46 -0
  14. maestro_fetch-0.2.0/examples/baidu_get_bduss.py +158 -0
  15. maestro_fetch-0.2.0/examples/china_weather.py +247 -0
  16. maestro_fetch-0.2.0/examples/china_weather_historical.py +330 -0
  17. maestro_fetch-0.2.0/examples/global_weather.py +415 -0
  18. maestro_fetch-0.2.0/examples/macrodatas_county_grain.py +100 -0
  19. maestro_fetch-0.2.0/llms.txt +128 -0
  20. maestro_fetch-0.2.0/pyproject.toml +72 -0
  21. maestro_fetch-0.2.0/pyrightconfig.json +8 -0
  22. maestro_fetch-0.2.0/skill/SKILL.md +284 -0
  23. maestro_fetch-0.2.0/src/maestro_fetch/__init__.py +4 -0
  24. maestro_fetch-0.2.0/src/maestro_fetch/__main__.py +4 -0
  25. maestro_fetch-0.2.0/src/maestro_fetch/adapters/__init__.py +7 -0
  26. maestro_fetch-0.2.0/src/maestro_fetch/adapters/baidu_pan.py +440 -0
  27. maestro_fetch-0.2.0/src/maestro_fetch/adapters/base.py +25 -0
  28. maestro_fetch-0.2.0/src/maestro_fetch/adapters/binary.py +248 -0
  29. maestro_fetch-0.2.0/src/maestro_fetch/adapters/browser.py +60 -0
  30. maestro_fetch-0.2.0/src/maestro_fetch/adapters/cloud.py +142 -0
  31. maestro_fetch-0.2.0/src/maestro_fetch/adapters/doc.py +142 -0
  32. maestro_fetch-0.2.0/src/maestro_fetch/adapters/media.py +102 -0
  33. maestro_fetch-0.2.0/src/maestro_fetch/adapters/web.py +312 -0
  34. maestro_fetch-0.2.0/src/maestro_fetch/backends/__init__.py +84 -0
  35. maestro_fetch-0.2.0/src/maestro_fetch/backends/base.py +41 -0
  36. maestro_fetch-0.2.0/src/maestro_fetch/backends/bb_browser.py +88 -0
  37. maestro_fetch-0.2.0/src/maestro_fetch/backends/browser_use.py +109 -0
  38. maestro_fetch-0.2.0/src/maestro_fetch/backends/cdp.py +389 -0
  39. maestro_fetch-0.2.0/src/maestro_fetch/backends/cdp_actions.py +554 -0
  40. maestro_fetch-0.2.0/src/maestro_fetch/backends/cloudflare.py +93 -0
  41. maestro_fetch-0.2.0/src/maestro_fetch/backends/opencli.py +104 -0
  42. maestro_fetch-0.2.0/src/maestro_fetch/backends/playwright.py +111 -0
  43. maestro_fetch-0.2.0/src/maestro_fetch/cli/__init__.py +33 -0
  44. maestro_fetch-0.2.0/src/maestro_fetch/cli/cache_cmd.py +66 -0
  45. maestro_fetch-0.2.0/src/maestro_fetch/cli/config_cmd.py +30 -0
  46. maestro_fetch-0.2.0/src/maestro_fetch/cli/discover_cmd.py +56 -0
  47. maestro_fetch-0.2.0/src/maestro_fetch/cli/do_cmd.py +55 -0
  48. maestro_fetch-0.2.0/src/maestro_fetch/cli/fetch.py +164 -0
  49. maestro_fetch-0.2.0/src/maestro_fetch/cli/session.py +292 -0
  50. maestro_fetch-0.2.0/src/maestro_fetch/cli/source.py +177 -0
  51. maestro_fetch-0.2.0/src/maestro_fetch/core/__init__.py +0 -0
  52. maestro_fetch-0.2.0/src/maestro_fetch/core/action_router.py +195 -0
  53. maestro_fetch-0.2.0/src/maestro_fetch/core/cache.py +229 -0
  54. maestro_fetch-0.2.0/src/maestro_fetch/core/config.py +167 -0
  55. maestro_fetch-0.2.0/src/maestro_fetch/core/errors.py +26 -0
  56. maestro_fetch-0.2.0/src/maestro_fetch/core/fetcher.py +57 -0
  57. maestro_fetch-0.2.0/src/maestro_fetch/core/platform_registry.py +130 -0
  58. maestro_fetch-0.2.0/src/maestro_fetch/core/result.py +23 -0
  59. maestro_fetch-0.2.0/src/maestro_fetch/core/router.py +75 -0
  60. maestro_fetch-0.2.0/src/maestro_fetch/core/session.py +315 -0
  61. maestro_fetch-0.2.0/src/maestro_fetch/interfaces/__init__.py +1 -0
  62. maestro_fetch-0.2.0/src/maestro_fetch/interfaces/cli.py +138 -0
  63. maestro_fetch-0.2.0/src/maestro_fetch/interfaces/sdk.py +56 -0
  64. maestro_fetch-0.2.0/src/maestro_fetch/providers/__init__.py +1 -0
  65. maestro_fetch-0.2.0/src/maestro_fetch/providers/anthropic.py +39 -0
  66. maestro_fetch-0.2.0/src/maestro_fetch/providers/base.py +13 -0
  67. maestro_fetch-0.2.0/src/maestro_fetch/providers/openai.py +36 -0
  68. maestro_fetch-0.2.0/src/maestro_fetch/providers/registry.py +19 -0
  69. maestro_fetch-0.2.0/src/maestro_fetch/skill/maestro-data-fetch.md +88 -0
  70. maestro_fetch-0.2.0/src/maestro_fetch/sources/__init__.py +25 -0
  71. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/academic/arxiv/search.py +38 -0
  72. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/climate/openmeteo/archive.py +52 -0
  73. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/economics/worldbank/gdp.py +36 -0
  74. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/social/github/trending.py +82 -0
  75. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/social/hackernews/front.py +49 -0
  76. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/social/reddit/frontpage.py +38 -0
  77. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/social/reddit/hot.py +38 -0
  78. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/social/reddit/search.py +44 -0
  79. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/social/reddit/subreddit.py +44 -0
  80. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/social/twitter/_utils.py +39 -0
  81. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/social/twitter/search.py +35 -0
  82. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/social/twitter/timeline.py +35 -0
  83. maestro_fetch-0.2.0/src/maestro_fetch/sources/community/social/twitter/trending.py +28 -0
  84. maestro_fetch-0.2.0/src/maestro_fetch/sources/loader.py +288 -0
  85. maestro_fetch-0.2.0/tests/__init__.py +0 -0
  86. maestro_fetch-0.2.0/tests/integration/__init__.py +0 -0
  87. maestro_fetch-0.2.0/tests/integration/test_maff_rice_report.py +198 -0
  88. maestro_fetch-0.2.0/tests/integration/test_real_csv.py +226 -0
  89. maestro_fetch-0.2.0/tests/integration/test_real_excel.py +206 -0
  90. maestro_fetch-0.2.0/tests/integration/test_real_fetch.py +60 -0
  91. maestro_fetch-0.2.0/tests/integration/test_real_parsing.py +144 -0
  92. maestro_fetch-0.2.0/tests/integration/test_real_web.py +166 -0
  93. maestro_fetch-0.2.0/tests/test_backends.py +105 -0
  94. maestro_fetch-0.2.0/tests/test_browser_adapter.py +43 -0
  95. maestro_fetch-0.2.0/tests/test_cache.py +208 -0
  96. maestro_fetch-0.2.0/tests/test_cdp_actions.py +165 -0
  97. maestro_fetch-0.2.0/tests/test_cli_v2.py +41 -0
  98. maestro_fetch-0.2.0/tests/test_config.py +82 -0
  99. maestro_fetch-0.2.0/tests/test_router_v2.py +36 -0
  100. maestro_fetch-0.2.0/tests/test_sources.py +103 -0
  101. maestro_fetch-0.2.0/tests/unit/__init__.py +0 -0
  102. maestro_fetch-0.2.0/tests/unit/fixtures/multi_sheet.xlsx +0 -0
  103. maestro_fetch-0.2.0/tests/unit/fixtures/sample.csv +5 -0
  104. maestro_fetch-0.2.0/tests/unit/fixtures/sample.xlsx +0 -0
  105. maestro_fetch-0.2.0/tests/unit/test_adapters/__init__.py +0 -0
  106. maestro_fetch-0.2.0/tests/unit/test_adapters/test_base.py +32 -0
  107. maestro_fetch-0.2.0/tests/unit/test_adapters/test_cloud.py +60 -0
  108. maestro_fetch-0.2.0/tests/unit/test_adapters/test_doc.py +43 -0
  109. maestro_fetch-0.2.0/tests/unit/test_adapters/test_media.py +35 -0
  110. maestro_fetch-0.2.0/tests/unit/test_adapters/test_web.py +40 -0
  111. maestro_fetch-0.2.0/tests/unit/test_cli.py +26 -0
  112. maestro_fetch-0.2.0/tests/unit/test_fetcher.py +33 -0
  113. maestro_fetch-0.2.0/tests/unit/test_providers.py +42 -0
  114. maestro_fetch-0.2.0/tests/unit/test_result.py +22 -0
  115. maestro_fetch-0.2.0/tests/unit/test_router.py +29 -0
@@ -0,0 +1,281 @@
1
+ ---
2
+ name: maestro-fetch
3
+ description: "Universal file acquisition. Use when you need to download any file or data from a URL to local disk: web pages, PDFs, Excel/CSV, Dropbox, Google Drive, Baidu Pan share links, APIs, YouTube. maestro-fetch's job ends at raw_path — downstream skills (analyst, QA, OCR) handle processing. Triggers: fetch, download, scrape, crawl, get data, acquire data, Baidu Pan, pan.baidu.com, macrodatas, dropbox, drive.google.com, PDF download, Excel download, API data."
4
+ ---
5
+
6
+ # maestro-fetch — Universal File Acquisition
7
+
8
+ ## Role in the Pipeline
9
+
10
+ maestro-fetch does ONE thing: **get the file to disk**.
11
+
12
+ ```
13
+ URL → maestro-fetch → ~/.maestro/cache/<filename>
14
+
15
+ maestro-data-analyst
16
+ maestro-data-qa
17
+ maestro-data-ocr
18
+ (any downstream skill)
19
+ ```
20
+
21
+ It does NOT clean, transform, or analyze. Once `raw_path` exists, hand off.
22
+
23
+ ## Recommended Usage Pattern
24
+
25
+ ### Step 1 — Acquire (maestro-fetch)
26
+
27
+ ```python
28
+ from maestro_fetch import fetch
29
+ import asyncio
30
+
31
+ result = asyncio.run(fetch("https://any-url.com/data.xlsx"))
32
+ print(result.raw_path) # ~/.maestro/cache/data.xlsx ← hand this off
33
+ ```
34
+
35
+ Or via CLI:
36
+ ```bash
37
+ maestro-fetch "https://example.com/data.xlsx"
38
+ # raw file saved to ~/.maestro/cache/data.xlsx
39
+ # prints markdown preview to stdout
40
+ ```
41
+
42
+ ### Step 2 — Process (downstream skill)
43
+
44
+ ```python
45
+ import pandas as pd
46
+
47
+ # Downstream script receives raw_path and works from there
48
+ df = pd.read_excel(result.raw_path) # or pd.read_csv(...)
49
+ # ... cleaning, analysis, QA
50
+ ```
51
+
52
+ **The handoff contract:** `result.raw_path` is the only thing maestro-fetch guarantees downstream. `result.tables` and `result.content` are convenience previews, not the authoritative output.
53
+
54
+ ---
55
+
56
+ ## URL Auto-Detection
57
+
58
+ | URL Pattern | Adapter | What Gets Downloaded |
59
+ |-------------|---------|----------------------|
60
+ | `pan.baidu.com/s/*` | BaiduPan | xlsx/csv from share (playwright + PCS OAuth) |
61
+ | `dropbox.com/*` | Cloud | resolved file |
62
+ | `drive.google.com/*` | Cloud | resolved file |
63
+ | `docs.google.com/document/d/*` | Cloud | exported as .txt |
64
+ | `docs.google.com/spreadsheets/d/*` | Cloud | exported as .csv |
65
+ | `docs.google.com/presentation/d/*` | Cloud | exported as .pdf |
66
+ | `youtube.com/watch*` | Media | audio transcript |
67
+ | `*.zip` `*.gz` `*.tar` `*.bz2` `*.7z` | Binary | streamed to disk, progress bar, cache-hit skip |
68
+ | `*.shp` `*.nc` `*.geotiff` `*.tif` | Binary | geospatial binary — streamed |
69
+ | `*.parquet` `*.feather` `*.h5` `*.nc` | Binary | data science binary — streamed |
70
+ | `*.dta` `*.sas7bdat` `*.rds` | Binary | stats software binary — streamed |
71
+ | `*.pdf` | Doc | PDF file |
72
+ | `*.xlsx`, `*.csv` | Doc | spreadsheet file |
73
+ | Everything else | Web | HTML → markdown (crawl4ai; httpx fallback) |
74
+
75
+ Adapter priority: **BaiduPan** > Cloud > **Binary** > Doc > Web
76
+
77
+ **Binary adapter features:**
78
+ - HEAD request → Content-Length → **cache hit detection** (skip re-download if size matches)
79
+ - **Streaming** (1 MB chunks) → safe for files >500 MB (no OOM)
80
+ - **Range-resume with auto-retry** (up to 5×): sends `Range: bytes=N-` on reconnect; appends if server returns 206, restarts if 200 (no Range support)
81
+ - ASCII progress bar with size/percentage
82
+ - 1-hour read timeout for slow servers
83
+ - Covers: `.zip .gz .tar .bz2 .7z .rar .shp .nc .tiff .parquet .feather .h5 .dta .sas7bdat .rds .npy` and more
84
+
85
+ ---
86
+
87
+ ## CLI Reference
88
+
89
+ ```bash
90
+ # Download any URL — raw file to ~/.maestro/cache/
91
+ maestro-fetch "https://example.com/data.xlsx"
92
+
93
+ # Save parsed CSV to a specific directory
94
+ maestro-fetch "https://example.com/data.xlsx" --output csv --output-dir ./out/
95
+
96
+ # Batch (one URL per line)
97
+ maestro-fetch dummy --batch urls.txt --output-dir ./data/
98
+
99
+ # Custom cache location
100
+ maestro-fetch "https://..." --cache-dir /tmp/myproject/
101
+ ```
102
+
103
+ Default cache: `~/.maestro/cache/` (global, shared across projects)
104
+
105
+ ---
106
+
107
+ ## Python SDK Reference
108
+
109
+ ```python
110
+ from maestro_fetch import fetch, batch_fetch
111
+
112
+ # Single URL
113
+ result = await fetch("https://any-url.com/data")
114
+ result.raw_path # Path — the downloaded file (authoritative output)
115
+ result.tables # list[pd.DataFrame] — convenience parse (may be empty)
116
+ result.content # str — markdown preview
117
+ result.source_type # "web" | "doc" | "cloud" | "media" | "baidu_pan"
118
+ result.metadata # dict — adapter-specific info
119
+
120
+ # Batch with concurrency
121
+ results = await batch_fetch(urls, concurrency=10)
122
+
123
+ # Custom timeout / headers
124
+ result = await fetch(url, timeout=120, headers={"User-Agent": "my-app"})
125
+ ```
126
+
127
+ ---
128
+
129
+ ## Baidu Pan (百度网盘) Share Links
130
+
131
+ **One-time setup** — triggers automatically on first use:
132
+ ```bash
133
+ maestro-fetch "https://pan.baidu.com/s/1xxxxx?pwd=abcd"
134
+ # Browser opens → log in → OAuth token saved to ~/.bypy/bypy.json
135
+ ```
136
+
137
+ **Usage:**
138
+ ```bash
139
+ maestro-fetch "https://pan.baidu.com/s/1xxxxx?pwd=abcd"
140
+ # Downloads primary data file (xlsx > xls > csv > json > pdf) from share
141
+ # Saved to ~/.maestro/cache/<filename>
142
+ ```
143
+
144
+ **How it works:**
145
+ 1. Opens share URL in playwright persistent browser (`~/.maestro_fetch/playwright_profile`)
146
+ 2. Registers a `share/list` XHR interceptor (before `goto`) to capture the filename
147
+ 3. Clicks 保存到网盘 (silently skips if already saved)
148
+ 4. PCS API + OAuth token → dlink → download bytes
149
+ 5. If the share is a directory → recurse and pick the primary data file
150
+
151
+ **Pitfalls (hard-won):**
152
+ - `wait_until="networkidle"` times out — Baidu SPA polls continuously, use `domcontentloaded`
153
+ - bdstoken is not in the static HTML (SPA) — playwright handles CSRF automatically; never try to extract bdstoken via httpx
154
+ - dlink download requires `?access_token=` suffix AND `User-Agent: pan.baidu.com`
155
+ - Share always saves to pan root `/` (not configurable); `_resolve_dlink` searches root then recurses dirs
156
+
157
+ ---
158
+
159
+ ## Installation
160
+
161
+ ```bash
162
+ pip install -e . # editable install from repo root
163
+ maestro-fetch --help # CLI available after install
164
+ ```
165
+
166
+ ```python
167
+ from maestro_fetch import fetch # Python SDK
168
+ ```
169
+
170
+ ### MCP Server (Claude Code)
171
+
172
+ Configured in `~/.claude/settings.json`:
173
+ ```json
174
+ {
175
+ "mcpServers": {
176
+ "maestro-fetch": {
177
+ "command": "$VENV/bin/python",
178
+ "args": ["-m", "maestro_fetch.interfaces.mcp_server"]
179
+ }
180
+ }
181
+ }
182
+ ```
183
+
184
+ MCP tools: `fetch_url`, `batch_fetch_urls`, `detect_url_type`
185
+
186
+ **IMPORTANT**: curl and wget are DENIED. Always use maestro-fetch for any URL download task.
187
+
188
+ ---
189
+
190
+ ## Key Rules
191
+
192
+ 1. maestro-fetch acquires; downstream skills process — do not mix
193
+ 2. `raw_path` is the handoff contract; `tables`/`content` are previews only
194
+ 3. No API key needed for core fetch; LLM key only for `schema=` extraction (rarely needed)
195
+ 4. Default cache `~/.maestro/cache/` is global — do not use per-project relative paths
196
+ 5. For Open-Meteo long date ranges: chunk into ≤365-day segments
197
+
198
+ ---
199
+
200
+ ## Built-in Public Sources — Additional Notes
201
+
202
+ ### data.gov.sg (Singapore Open Data Portal)
203
+ - Base URL: `https://data.gov.sg/api/action/`
204
+ - Search datasets: `package_search?q=<keyword>`
205
+ - Download: `datastore_search?resource_id=<id>&limit=50000&offset=<N>`
206
+ - Rate limiting: Aggressive 429 errors — use 5s delays, exponential backoff
207
+ - Pagination: Use the `offset` param; repeat until the number of returned `records` is less than `limit`
208
+ - GeoJSON boundaries: Direct URL download (no pagination needed)
209
+ - Auth: None required
210
+ - License: Singapore Open Data Licence v1.0 (free for any use with attribution)
211
+
212
+ ### Zenodo / Figshare Large File Downloads
213
+ - **Zenodo CDN does NOT support HTTP Range requests** — responds HTTP 200 (full file) instead of 206 (partial). Never attempt resume/append: always overwrite from scratch or you get a corrupted file (double-size, binary garbage appended).
214
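+ - DocAdapter streams all binary files from scratch (no Range header), and the Binary adapter restarts from scratch when a Range request is answered with HTTP 200 instead of 206, so this does not affect maestro-fetch internal behavior.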
215
+ - Zenodo API for file list: `curl https://zenodo.org/api/records/{id}` → `files[].links.self` for content URLs
216
+ - Figshare API: `curl https://api.figshare.com/v2/articles/{id}` → `files[].download_url`
217
+ - Both services redirect to CDN URLs; Content-Disposition header on redirect contains the real filename.
218
+
219
+ ### NASA Earthdata (SEDAC, LP DAAC, etc.)
220
+ - Auth via `~/.netrc`: `machine urs.earthdata.nasa.gov login <user> password <pass>`
221
+ - curl usage: `curl -L --netrc-file ~/.netrc -c /tmp/ed_cookies.txt -b /tmp/ed_cookies.txt -o out.zip <URL>`
222
+ - Token API: `curl -u user:pass https://urs.earthdata.nasa.gov/api/users/tokens` → `[{"access_token": "..."}]`
223
+ - **SEDAC China datasets: PERMANENTLY UNAVAILABLE (2026).** CIESIN contract ended 2025-04-30; S3 buckets empty; all 3 China datasets (population census, agricultural stats, county socioeconomic) return HTTP 404. Do NOT attempt download — use USDA FAS PSD or SciDB as alternatives.
224
+ - Other SEDAC datasets (non-China) may still work via Earthdata Cloud; check `search.earthdata.nasa.gov` first.
225
+ - Download status page: `https://search.earthdata.nasa.gov/downloads/{order_id}` — get order via Earthdata Search UI.
226
+
227
+ ### SciDB (scidb.cn / sciencedb.cn) — Chinese National Science Data
228
+ - **Vue.js SPA**: file list uses v-lazy lazy loading — files NOT visible in initial HTML, only rendered after scroll/click.
229
+ - **Download mechanism** (from JS bundle reverse engineering):
230
+ - `fileDown(item)` → `genDownloadUrl(item.id)` → `window.open(url)`
231
+ - Download URL = `https://china.scidb.cn/download?fileId=<item.id>` (note: `china.scidb.cn`, NOT `www.scidb.cn`)
232
+ - `window.open()` opens a NEW TAB — `page.on("response")` does NOT capture this; use `context.on("page")` to intercept the popup
233
+ - OR extract `item.id` directly from Vue component tree via `__vue__.$children` walk (no click needed)
234
+ - **API endpoints that work**:
235
+ - File listing: `POST https://www.scidb.cn/api/gin-sdb-filetree/public/file/childrenFileListByPath` body `{dataSetId, version, path, lastIndex, pageSize}` — returns empty if version unknown
236
+ - Dataset info: `GET /api/sdb-dataset-service/dataset/details/<id>` requires auth (PERMISSION NO ACCESS without login)
237
+ - **API endpoints broken**: `/api/dataset/v2/en/detail` returns 404; `/api/sdb-dataset-service/public/dataset/details/<id>` returns 404.
238
+ - **Required approach**: Use playwright + DOM manipulation:
239
+ 1. Navigate to `https://www.scidb.cn/en/detail?dataSetId=<id>` with `wait_until="domcontentloaded"` (NOT `networkidle` — the Nuxt.js app polls continuously)
240
+ 2. Expand file tree: click `.v-treeview-node__toggle`; wait 2s
241
+ 3. Read file list from `.fileTree innerText` (filename on one line, `MD5:<hash> (<size> KB)` on next)
242
+ 4. For each file: walk `__vue__.$children` to find node with matching `item.label`, get `item.id`; construct download URL
243
+ 5. Download via `GET https://china.scidb.cn/download?fileId=<id>` with httpx (redirects to CDN)
244
+ - **Reusable script template**: `scripts/download_scidb.py` in RAD-20260211-0001 project — copy-adapt for new SciDB datasets. Supports `--doi`, `--dataset-id`, `--pattern`, `--list` flags.
245
+ - **Auth**: CC BY 4.0 public datasets download without login.
246
+ - **Dataset ID format**: `DS_<hex32>` or via DOI redirect: `doi.org/10.57760/sciencedb.<id>` → final URL contains `dataSetId=` query param.
247
+ - **File naming**: datasets contain 数据实体.xlsx, 数据文档.docx, 数据样例.xlsx, 缩略图.jpg; download only 数据实体.xlsx.
248
+
249
+ ### FAOSTAT (FAO Agricultural Statistics)
250
+ - **WARNING**: FAOSTAT backend is frequently down (HTTP 521 "Web server is down")
251
+ - Direct bulk download URL returns 403 (hotlink blocked): `fenixservices.fao.org/faostat/static/bulkdownloads/*.zip`
252
+ - **Fallback 1 (best)**: USDA FAS PSD — same data, always available:
253
+ - Grains only: `https://apps.fas.usda.gov/psdonline/downloads/psd_grains_pulses_csv.zip` (2.8MB)
254
+ - All commodities: `https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip` (10MB)
255
+ - Columns: `Commodity_Description, Country_Name, Market_Year, Attribute_Description, Value, Unit_Description`
256
+ - Yield unit: MT/HA (multiply ×1000 for kg/ha)
257
+ - Filter: `Attribute_Description == 'Yield'` + country + crop
258
+ - **Fallback 2**: World Bank API (national cereal yield only):
259
+ - `https://api.worldbank.org/v2/country/CN/indicator/AG.YLD.CREL.KG?format=json&per_page=100&mrv=40`
260
+ - **Limitation**: both fallbacks are national-level only. Province-level yield requires NBS yearbooks (Chinese, manual extraction).
261
+
262
+ ---
263
+
264
+ ## Gotchas (Hard-Won)
265
+
266
+ ### File extension cannot be trusted
267
+ Downloaded files may have wrong extensions. SciDB `.xls` files have been observed to contain OOXML `.docx` content (Word document with metadata/logos, not spreadsheet data). Always verify:
268
+ ```python
269
+ import magic # python-magic
270
+ mime = magic.from_file(str(raw_path), mime=True)
271
+ # Expected: 'application/vnd.ms-excel' or 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
272
+ # Got: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' → wrong!
273
+ ```
274
+ Fallback without python-magic: try `pd.read_excel()` and catch exceptions; or check first 4 bytes (`PK\x03\x04` = zip-based Office, then inspect `[Content_Types].xml` inside).
275
+
276
+ ### Downloaded archive may contain applications, not data
277
+ Example: `nongzuowu.zip` (105MB) from a Chinese agricultural statistics site contained a complete Java Spring Boot web application (RuoYi framework) with embedded CSV files, not a standalone dataset. The CSV files inside were national+provincial only (no county-level data), making the entire download useless for the intended purpose.
278
+ **Rule**: After downloading any archive, immediately inspect its contents (`zipfile.ZipFile(path).namelist()` or `tar tf <archive>`) before assuming it contains the expected data format. Check file sizes, directory structure, and sample content.
279
+
280
+ ### Zenodo CDN ignores Range headers (reminder)
281
+ Zenodo responds HTTP 200 (full file) to Range requests instead of 206 (partial). Never attempt resume/append — you get a corrupted double-size file. Always download from scratch or verify file size matches expected before use.
@@ -0,0 +1,28 @@
1
+ {
2
+ "$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
3
+ "name": "maestro-fetch",
4
+ "metadata": {
5
+ "description": "Universal data acquisition for AI agents with smart routing.",
6
+ "version": "1.0.0"
7
+ },
8
+ "owner": {
9
+ "name": "Maestro AI",
10
+ "email": "hello@maestro.onl"
11
+ },
12
+ "plugins": [
13
+ {
14
+ "name": "maestro-fetch",
15
+ "description": "Fetch everything, for agents. Smart routing across web, PDF, Excel, cloud storage, video, authenticated pages, and 50+ public data sources.",
16
+ "version": "0.2.0",
17
+ "author": {
18
+ "name": "Maestro AI"
19
+ },
20
+ "source": "./",
21
+ "category": "data",
22
+ "homepage": "https://github.com/maestro-ai-stack/maestro-fetch",
23
+ "repository": "https://github.com/maestro-ai-stack/maestro-fetch",
24
+ "license": "MIT",
25
+ "keywords": ["fetch", "scraping", "data", "download", "agent", "browser", "pdf", "excel"]
26
+ }
27
+ ]
28
+ }
@@ -0,0 +1,14 @@
1
+ {
2
+ "name": "maestro-fetch",
3
+ "description": "Fetch everything, for agents. Universal data acquisition with smart routing across 15+ source types.",
4
+ "version": "0.2.0",
5
+ "author": {
6
+ "name": "Maestro AI",
7
+ "email": "hello@maestro.onl"
8
+ },
9
+ "homepage": "https://github.com/maestro-ai-stack/maestro-fetch",
10
+ "repository": "https://github.com/maestro-ai-stack/maestro-fetch",
11
+ "license": "MIT",
12
+ "keywords": ["fetch", "scraping", "data-acquisition", "agent", "cli"],
13
+ "skills": "./.claude/skills"
14
+ }
@@ -0,0 +1,18 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .venv/
4
+ dist/
5
+ *.egg-info/
6
+ src/maestro_fetch.egg-info/
7
+ .pytest_cache/
8
+ .ruff_cache/
9
+ .worktrees/
10
+ .maestro_cache/
11
+
12
+ # Internal dev files (not for open source release)
13
+ CLAUDE.md
14
+ docs/plans/
15
+ docs/test-urls.md
16
+ cert_key*
17
+ .env*
18
+ docs/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Maestro AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.