cnhkmcp 1.2.9__tar.gz → 1.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cnhkmcp-1.2.9/cnhkmcp.egg-info → cnhkmcp-1.3.1}/PKG-INFO +1 -1
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp/__init__.py +1 -1
- cnhkmcp-1.3.1/cnhkmcp/untracked/BRAIN_Alpha_Test_Requirements_and_Tips.md +202 -0
- cnhkmcp-1.3.1/cnhkmcp/untracked/arXiv_API_Tool_Manual.md +490 -0
- cnhkmcp-1.3.1/cnhkmcp/untracked/arxiv_api.py +229 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp/untracked/platform_functions.py +43 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1/cnhkmcp.egg-info}/PKG-INFO +1 -1
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp.egg-info/SOURCES.txt +3 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/setup.py +1 -1
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/LICENSE +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/MANIFEST.in +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/README.md +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp/untracked/BRAIN_6_Tips_Datafield_Exploration_Guide.md +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp/untracked/Dataset_Exploration_Expert_Manual.md +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp/untracked/daily_report_workflow.md +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp/untracked/forum_functions.py +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp/untracked/sample_mcp_config.json +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp/untracked/user_config.json +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp.egg-info/dependency_links.txt +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp.egg-info/entry_points.txt +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp.egg-info/not-zip-safe +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp.egg-info/requires.txt +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/cnhkmcp.egg-info/top_level.txt +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/requirements.txt +0 -0
- {cnhkmcp-1.2.9 → cnhkmcp-1.3.1}/setup.cfg +0 -0

cnhkmcp-1.3.1/cnhkmcp/untracked/BRAIN_Alpha_Test_Requirements_and_Tips.md
@@ -0,0 +1,202 @@

# BRAIN Alpha Submission Tests: Requirements and Improvement Tips

This document compiles the key requirements for passing alpha submission tests on the WorldQuant BRAIN platform, based on official documentation and community experiences from the forum. I've focused on the main tests (Fitness, Sharpe, Turnover, Weight, Sub-universe, and Self-Correlation). For each, I'll outline the thresholds, explanations, and strategies to improve or pass them, drawing from doc pages like "Clear these tests before submitting an Alpha" and forum searches on specific topics.

## Overview

### What is an Alpha?

An alpha is a mathematical model or signal designed to predict the future movements of financial instruments (e.g., stocks). On BRAIN, alphas are expressed using the platform's FASTEXPR language and simulated against historical data to evaluate performance. Successful alphas can earn payments and contribute to production strategies.

### What Are Alpha Tests?

Alphas must pass a series of pre-submission checks (e.g., via the `get_submission_check` tool) to ensure they meet quality thresholds. Key tests include:

- **Fitness and Sharpe Ratio**: Measures risk-adjusted returns. Must be above cutoffs (e.g., IS Sharpe > 1.25 for some universes).
- **Correlation Checks**: Against self-alphas and production alphas (threshold ~0.7) to avoid redundancy.
- **Turnover and Drawdown**: Ensures stability (e.g., low turnover < 250%).
- **Regional/Universe-Specific**: Vary by settings like USA TOP3000 (D1) or GLB TOP3000.
- **Other Metrics**: PnL, yearly stats, and risk-neutralized metrics (e.g., RAM, Crowding Risk-Neutralized).

Failing tests results in errors like "Sub-universe Sharpe NaN is not above cutoff" or low fitness.

## General Guidance on Passing Tests

- **Start Simple**: Use basic operators like `ts_rank`, `ts_corr`, or `neutralize` on price-volume data.
- **Optimize Settings**: Choose universes like TOP3000 (USA, D1) for easier testing. Neutralize against MARKET or SUBINDUSTRY to reduce correlation.
- **Improve Metrics**: Apply `ts_decay_linear` for stability, `scale` for normalization, and check with `check_correlation`.
- **Common Pitfalls**: Avoid high correlation (use `check_correlation`), ensure non-NaN data (e.g., via `ts_backfill`), and target high IR/Fitness.
- **Resources**: Review operators (e.g., 102 available like `ts_zscore`), documentation (e.g., the "Interpret Results" section), and forum posts.

Alphas must pass these in-sample (IS) performance tests to be submitted for out-of-sample (OS) testing. Only submitted alphas contribute to scoring and payments. Tests are run in sequence, and failure messages guide improvements (e.g., "Improve fitness" or "Reduce max correlation").

## Generating and Improving Alpha Ideas: The Conceptual Foundation

Before diving into metrics and optimizations, strong alphas start with solid ideas rooted in financial theory, market behaviors, or data insights. Improving from an "idea angle" means iterating on the core concept rather than just tweaking parameters—this often leads to more robust alphas that pass tests naturally. Use resources like BRAIN's "Alpha Examples for Beginners" (from the Discover BRAIN category) or forum-shared ideas.

### Key Principles

- **Idea Sources**: Draw from academic papers, economic indicators, or datasets (e.g., sentiment, earnings surprises). Validate ideas with backtests to ensure they generalize.
- **Iteration**: Start simple, then refine: add neutralization for correlation, decay for stability, or grouping for diversification.
- **Avoid Overfitting**: Test ideas across universes/regions; use train/test splits.
- **Tools**: Explore datasets via Data Explorer; use operators like `ts_rank` for signals.

### Using arXiv for Idea Discovery

A powerful way to source fresh ideas is through academic papers on arXiv. Use the provided `arxiv_api.py` script (detailed in `arXiv_API_Tool_Manual.md`) to search and download relevant research.

- **Search Example**: Run `python arxiv_api.py "quantitative finance momentum strategies"` to find papers on momentum ideas. Download top results for detailed study.
- **Integration Tip**: Extract concepts like "earnings surprises" from abstracts, then implement them in BRAIN (e.g., using sentiment datasets). This helps generate diverse alphas that pass correlation tests.
- **Why It Helps**: Papers often provide theoretical backing, reducing overfitting risks when adapting them to BRAIN simulations.

Refer to the manual for interactive mode and advanced queries to streamline your research workflow.

### Avoid Mixing Datasets: The ATOM Principle

When improving an alpha, prioritize modifications that stay within the same dataset as the original. ATOM (Atomic) alphas are those built from a single dataset (excluding permitted grouping fields like country, sector, etc.), which qualify for relaxed submission criteria—focusing on last-2Y Sharpe instead of the full IS Ladder tests.

**Why It's Important**:

- **Robustness**: Mixing datasets can introduce conflicting signals, leading to overfitting and poor out-of-sample performance (forum insights on ATOM alphas).
- **Submission Benefits**: Single-dataset alphas have easier thresholds (e.g., Delay-1: > 1 for last-2Y Sharpe in USA) and may align with themes offering multipliers (up to x1.1 for low-utilization pyramids).
- **Correlation Control**: ATOM alphas often have lower self-correlation, helping pass tests and diversify your portfolio.

**How to Apply**:

- Check the alpha's data fields via simulation results or code.
- Search for improvements in the same dataset first (use Data Explorer).
- If mixing is needed, verify it doesn't disqualify ATOM status and retest thoroughly.

This principle, highlighted in BRAIN docs and forums, ensures alphas remain "atomic" and competitive.

### Understanding Datafields Before Improvements

Before optimizing alphas, thoroughly evaluate the datafields involved to address potential issues like unit mismatches or update frequencies. This prevents common pitfalls in tests (e.g., NaN errors, poor sub-universe performance) and ensures appropriate operators are used. Use these 6 methods from the BRAIN exploration guide (adapted for quick simulation with neutralization "None", decay 0, and test_period P0Y0M):

1. **Basic Coverage**: For example, simulate `datafield` (or `vec_op(datafield)` for vectors). Insight: % coverage = (Long + Short Count) / Universe Size.
2. **Non-Zero Coverage**: For example, simulate `datafield != 0 ? 1 : 0`. Insight: actual meaningful data points.
3. **Update Frequency**: For example, simulate `ts_std_dev(datafield, N) != 0 ? 1 : 0` (vary N = 5, 22, 66). Insight: daily/weekly/monthly/quarterly updates.
4. **Data Bounds**: For example, simulate `abs(datafield) > X` (vary X). Insight: value ranges and normalization.
5. **Central Tendency**: For example, simulate `ts_median(datafield, 1000) > X` (vary X). Insight: typical values over time.
6. **Distribution**: Simulate `X < scale_down(datafield) && scale_down(datafield) < Y` (vary X/Y between 0 and 1). Insight: data spread patterns.

Apply these insights to choose operators (e.g., `ts_backfill` for sparse data, `scale` for unit issues) and fix problems before improvements.
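
Since the six probes are just FASTEXPR strings with a field name spliced in, they can be generated mechanically. Below is a minimal Python sketch (a hypothetical helper, not part of this package; the field name and the X cutoffs are illustrative) that emits the probe expressions for one datafield:

```python
# Hypothetical helper: build the six datafield-exploration probe expressions
# from the guide above. Each string is meant to be simulated on BRAIN with
# neutralization "None", decay 0, and test_period P0Y0M.

def exploration_probes(field: str) -> dict:
    return {
        "basic_coverage": field,                                  # % coverage = (Long + Short Count) / Universe Size
        "nonzero_coverage": f"{field} != 0 ? 1 : 0",              # meaningful (non-zero) data points
        "update_frequency": [f"ts_std_dev({field}, {n}) != 0 ? 1 : 0" for n in (5, 22, 66)],
        "data_bounds": [f"abs({field}) > {x}" for x in (1, 10, 100)],        # vary X for value ranges
        "central_tendency": [f"ts_median({field}, 1000) > {x}" for x in (0, 1, 10)],
        "distribution": [f"{lo} < scale_down({field}) && scale_down({field}) < {hi}"
                         for lo, hi in ((0.0, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0))],
    }

for name, expr in exploration_probes("some_datafield").items():  # "some_datafield" is a placeholder
    print(name, expr)
```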

### Examples from Community and Docs (From Alpha Template Sharing Post)

These examples are sourced from the forum post on sharing unique alpha ideas and implementations, emphasizing templates that generate robust signals for passing submission tests.

- **Multi-Smoothing Ranking Signal** (User: JB71859): For earnings data, apply double smoothing with ranking and statistical ops. Example: `ts_mean(ts_rank(earnings_field, decay1), decay2)`. The first `ts_rank` normalizes values over time (pre-processing), then `ts_mean` smooths for stable signals (main signal). Helps improve fitness and reduce turnover by lowering noise; produced 3 ATOM alphas after 2000 simulations.
- **Momentum Divergence Factor** (User: YK49234): Capture divergence between short- and long-term momentum on the same field. Example: `ts_delta(ts_zscore(field, short_window), short_window) - ts_delta(ts_zscore(field, long_window), long_window)`. Processes data with z-scoring for normalization, then delta/mean for change detection (main signal). Boosts Sharpe by highlighting momentum shifts; yielded 4 submittable alphas from 20k tests with a ~5% signal rate.
- **Network Factor Difference Momentum** (User: JR23144): Compute differences in oth455 PCA factors for 'imbalance' signals, then apply time-series ops. Example: `ts_sum(oth455_fact2 - oth455_fact1, 240)`. The math op creates the difference (pre-processing), the ts op captures persistence (main signal). Enhances correlation passing via unique network insights; effective in EUR for low-fitness but high-margin alphas.

These community-shared templates promote diverse, ATOM-friendly ideas that align with test requirements like low correlation and high robustness.

### Official BRAIN Examples

Draw from BRAIN's structured tutorials for foundational ideas:

- **Beginner Level** ([19 Alpha Examples](https://platform.worldquantbrain.com/learn/documentation/create-alphas/19-alpha-examples)): Start with simple price-based signals. Example: `ts_rank(close, 20)` – ranks closing prices over 20 days to capture momentum. Improve by adding neutralization: `neutralize(ts_rank(close, 20), "MARKET")` to reduce market bias and pass correlation tests.
- **Bronze Level** ([Sample Alpha Concepts](https://platform.worldquantbrain.com/learn/documentation/create-alphas/sample-alpha-concepts)): Incorporate multiple data fields. Example: `ts_corr(close, volume, 10)` – correlation between price and volume over 10 days. Enhance fitness by decaying: `ts_decay_linear(ts_corr(close, volume, 10), 5)` for smoother signals.
- **Silver Level** ([Example Expression Alphas](https://platform.worldquantbrain.com/learn/documentation/create-alphas/example-expression-alphas)): Advanced combinations. Example: `scale(ts_rank(ts_delay(vwap, 1) / vwap, 252))` – normalized 1-year price change. Iterate by adding groups: `group_zscore(scale(ts_rank(ts_delay(vwap, 1) / vwap, 252)), "INDUSTRY")` to improve sub-universe robustness.

These examples show how starting with a core idea (e.g., momentum) and layering improvements (e.g., neutralization, decay) can help pass tests like fitness and sub-universe.

## 1. Fitness

### Requirements

- At least "Average": greater than 1.3 for Delay-0 or greater than 1 for Delay-1.
- Fitness = Sharpe * sqrt(abs(Returns) / max(Turnover, 0.125)).
- Ratings: Spectacular (>2.5 Delay-1 or >3.25 Delay-0), Excellent (>2 or >2.6), etc.

### Explanation

Fitness balances Sharpe, Returns, and Turnover in a single number; high fitness indicates a robust alpha and is a key metric of alpha quality.
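
To make the formula concrete, here is a quick arithmetic check (a sketch with illustrative numbers, not platform output):

```python
from math import sqrt

# Illustrative values only: Sharpe 1.5, annualized returns 12%, turnover 20%.
# The max(..., 0.125) floor keeps very low turnover from inflating fitness.
sharpe, returns, turnover = 1.5, 0.12, 0.20
fitness = sharpe * sqrt(abs(returns) / max(turnover, 0.125))
print(round(fitness, 2))  # ~1.16: above the Delay-1 bar (1) but below the Delay-0 bar (1.3)
```

At these numbers, cutting turnover or lifting returns is the direct way to push fitness over the higher bar.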

### Tips to Improve

- **From Docs**: Increase Sharpe/Returns and reduce Turnover. Optimize by balancing these—improving one may hurt another. Aim for upward PnL trends with minimal drawdown.
- **Forum Experiences** (from searches on "increase fitness alpha"):
  - Use group operators (e.g., with pv13) to boost fitness without overcomplicating expressions.
  - Screen alphas with author_fitness >= 2 or similar in competitions like Super Alpha.
  - Manage alphas via databases or tags; query for high-fitness ones (e.g., via the API with fitness filters).
  - In hand-crafting alphas, iteratively add operators like `left_tail` and `group` to push fitness over thresholds, but watch for overfitting.
  - Community shares: high-fitness alphas (e.g., >2) often come from multi-factor fusions or careful data-field selection.

## 2. Sharpe Ratio

### Requirements

- Greater than 2 for Delay-0 or greater than 1.25 for Delay-1.
- Sharpe = sqrt(252) * IR, where IR = mean(PnL) / stdev(PnL).

### Explanation

Measures risk-adjusted returns; higher Sharpe means more consistent performance. For GLB alphas, additional sub-geography Sharpe cutoffs apply (>= 1 for AMER, APAC, EMEA).
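
A quick check of the definition on synthetic daily PnL (a sketch, not platform data):

```python
import numpy as np

rng = np.random.default_rng(0)
pnl = rng.normal(loc=0.8, scale=10.0, size=252)  # one year of synthetic daily PnL

ir = pnl.mean() / pnl.std()   # IR = mean(PnL) / stdev(PnL)
sharpe = np.sqrt(252) * ir    # annualized, matching the formula above
print(round(sharpe, 2))       # compare against 1.25 (Delay-1) or 2 (Delay-0)
```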

### Tips to Improve

- **From Docs**: Focus on consistent PnL with low volatility. Use visualization to ensure upward trends. For sub-geography, incorporate region-specific signals (e.g., earnings for AMER, microstructure for APAC).
- **Forum Experiences** (from searches on "improve Sharpe ratio alpha"):
  - Decay signals separately for liquid/non-liquid stocks (e.g., `ts_decay_linear` with `rank(volume*close)`).
  - Avoid size-related multipliers (e.g., `rank(-assets)`) that shift weights to illiquid stocks.
  - Check yearly Sharpe data via the API and store it in databases for analysis.
  - In templates like CCI-based ones, combine with z-score and delay to stabilize Sharpe.
  - Community tip: prune low-Sharpe alphas in pools using weighted methods to retain high-Sharpe ones.
- **Flipping Negative Sharpe**: For non-CHN regions, if an alpha shows negative Sharpe (e.g., -1 to -2), add a minus sign to the expression (e.g., `-original_expression`) to flip it positive. This preserves the signal while improving metrics; verify it doesn't introduce correlation issues.

## 3. Turnover

### Requirements

- 1% < Turnover < 70%.
- Turnover = Dollar trading volume / Book size.

### Explanation

Indicates trading frequency. Low turnover reduces costs; extremes in either direction fail submission.

### Tips to Improve

- **From Docs**: Aim for balanced trading—too low means inactive, too high means over-trading.
- **Forum Experiences** (note: specific turnover threads were scarce; most advice ties back to fitness/Sharpe improvements):
  - Use decay functions to smooth signals, reducing unnecessary trades; see the sketch below.
  - In multi-alpha simulations, filter by turnover thresholds in code to pre-select candidates.
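
As a picture of why decay helps, here is a small pandas sketch (synthetic data; `ts_decay_linear` is re-implemented locally under the assumption that it is a linearly weighted moving average) showing how smoothing shrinks day-over-day signal changes, the driver of turnover:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
signal = pd.Series(rng.normal(size=500))  # noisy daily signal for one instrument

def ts_decay_linear(s: pd.Series, n: int) -> pd.Series:
    # Linearly weighted moving average: today gets weight n, yesterday n-1, ...
    w = np.arange(1, n + 1, dtype=float)
    return s.rolling(n).apply(lambda x: np.dot(x, w) / w.sum(), raw=True)

raw_change = signal.diff().abs().mean()
smooth_change = ts_decay_linear(signal, 10).diff().abs().mean()
print(round(smooth_change / raw_change, 2))  # well below 1: smaller daily position changes
```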

## 4. Weight Test

### Requirements

- Max weight in any stock < 10%.
- Sufficient instruments assigned weight (varies by universe, e.g., TOP3000).

### Explanation

Ensures diversification; the test fails if weights are concentrated or too few stocks are weighted.

### Tips to Improve

- **From Docs**: Avoid expressions that overly concentrate weights. Assign weights broadly after simulation start.
- **Forum Experiences** (limited direct posts; inferred from general submission tips):
  - Use neutralization (e.g., market) to distribute weights evenly; see the sketch below.
  - Check via simulation stats; adjust with `rank` or `scale` operators.
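
As a rough picture of how market neutralization spreads weights, here is a numpy sketch (a simplification; it assumes `neutralize` against MARKET behaves like demeaning, followed by scaling to unit book size):

```python
import numpy as np

rng = np.random.default_rng(2)
raw = rng.exponential(size=200)   # a concentrated, long-only raw signal

w = raw - raw.mean()              # demean: long/short weights now sum to ~0
w = w / np.abs(w).sum()           # scale so |longs| + |shorts| = 1 (unit book)

print(round(np.abs(w).max() * 100, 2), "% max single-stock weight (test wants < 10%)")
print(int((w != 0).sum()), "instruments assigned weight")
```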

## 5. Sub-universe Test

### Requirements

- Sub-universe Sharpe >= 0.75 * sqrt(subuniverse_size / alpha_universe_size) * alpha_sharpe.
- Ensures robustness in more liquid sub-universes (e.g., TOP1000 for TOP3000).

### Explanation

Tests whether the alpha still performs in liquid stocks, avoiding over-reliance on illiquid ones.
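
Plugging illustrative numbers into the threshold formula (a sketch, not platform output):

```python
from math import sqrt

# Illustrative: a TOP3000 alpha with IS Sharpe 1.6, checked on TOP1000
alpha_sharpe, universe_size, sub_size = 1.6, 3000, 1000
cutoff = 0.75 * sqrt(sub_size / universe_size) * alpha_sharpe
print(round(cutoff, 2))  # ~0.69: the TOP1000 Sharpe must clear this value
```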

### Tips to Improve

- **From Docs**: Avoid size-related multipliers. Decay liquid/non-liquid parts separately (e.g., `ts_decay_linear(signal,5)*rank(volume*close) + ts_decay_linear(signal,10)*(1-rank(volume*close))`). This example shows how a signal can be weighted differently across different liquidity segments of a datafield.
- Step-by-step improvements; discard non-robust signals.
- **Forum Experiences** (from "how to pass submission tests"):
  - Improve overall Sharpe first, as it scales the threshold.
  - Use `pasteurize` to handle NaNs and ensure even distribution.

## 6. Self-Correlation

### Requirements

- < 0.7 PnL correlation with your own submitted alphas.
- Or Sharpe at least 10% greater than the correlated alphas.

### Explanation

Promotes diversity; based on a 4-year PnL window. Allows improvements if the new alpha is significantly better.

### Tips to Improve

- **From Docs**: Submit diverse ideas. Use the correlation table in results to identify issues.
- **Forum Experiences** (from searches on "reduce correlation self alphas"):
  - Local computation of self-correlation (e.g., via PnL matrices) to pre-filter before submission; see the sketch below.
  - Code optimizations: prune high-correlation alphas, use clustering or weighted pruning (e.g., Sharpe-weighted) to retain diverse sets.
  - Handle negatives: transform negatively correlated alphas (e.g., in the China market) by inversion or adjustments.
  - Scripts for batch checking: use machine_lib modifications to print correlations and pyramid info.
  - Community shares: differences between local and platform calculations (e.g., due to NaN handling); align by using full PnL data.
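
For the local pre-filtering mentioned above, pairwise PnL correlation is a one-liner once the series are aligned. A minimal sketch (synthetic stand-ins; assumes you have already pulled each alpha's daily PnL over the 4-year window):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
dates = pd.date_range("2021-01-01", periods=4 * 252, freq="B")
pnl = pd.DataFrame(rng.normal(size=(len(dates), 4)),
                   index=dates, columns=["candidate", "a1", "a2", "a3"])
pnl["a1"] += 0.6 * pnl["candidate"]  # make one existing alpha deliberately similar

corr = pnl.corr()["candidate"].drop("candidate")
print(corr.round(2))
print("max self-correlation:", round(corr.max(), 2))  # needs < 0.7, or a 10% Sharpe edge
```

As the forum caveat above notes, local numbers can differ from the platform's (e.g., NaN handling), so treat this as a pre-filter, not the final check.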

### Evaluating Whole Alpha Quality

Before final submission, perform these checks on simulation results:

- **Yearly Stats Quality Check**: Review the yearly statistics. If records are missing for more than 5 years, it indicates low data quality (e.g., sparse coverage). Fix with `ts_backfill`, data selection, or alternative fields to ensure robust performance across tests.

This complements per-test improvements by validating overall alpha reliability.

## General Advice

- Start with broad simulations, narrow based on stats.
- Use tools like the check_submission API for pre-checks.
- Forum consensus: automate with Python scripts for efficiency (e.g., threading for simulations, databases for alpha management).
- Risks: overfitting in manual tweaks; validate with train/test splits.

This guide is based on tool-gathered data. For updates, check the BRAIN docs or forum.

cnhkmcp-1.3.1/cnhkmcp/untracked/arXiv_API_Tool_Manual.md
@@ -0,0 +1,490 @@

# 🔍 arXiv Paper Search & Download Tool

A comprehensive Python tool for searching, analyzing, and downloading research papers from arXiv using their public API. Perfect for researchers, students, and anyone interested in academic papers.

## 📋 Table of Contents

- [Features](#-features)
- [Installation](#-installation)
- [Quick Start](#-quick-start)
- [Usage Modes](#-usage-modes)
- [API Functions](#-api-functions)
- [Examples](#-examples)
- [Advanced Usage](#-advanced-usage)
- [Troubleshooting](#-troubleshooting)

## ✨ Features

- **🔍 Smart Search**: Search arXiv papers by title, author, abstract, or any keyword
- **📥 Smart Download**: Download PDFs with automatic filename renaming to paper titles
- **📊 Result Parsing**: Automatically extract structured information (title, authors, abstract, ID)
- **🖥️ Interactive Mode**: Command-line interface for easy searching and downloading
- **⚡ Batch Operations**: Search multiple papers and download in sequence
- **📈 Academic Research**: Perfect for literature reviews and research discovery
- **🔄 Auto-Rename**: Downloaded files are automatically named using paper titles instead of cryptic IDs

## 🚀 Installation

### Prerequisites

- Python 3.6 or higher
- Internet connection for API access

### Install Dependencies

```bash
pip install requests
```

### Download the Script

```bash
# Clone or download arxiv_api.py to your working directory
```

## 🎯 Quick Start

### Basic Search

```bash
python arxiv_api.py "machine learning"
```

### Search with Custom Results

```bash
python arxiv_api.py "quantum computing" -n 10
```

### Search and Download First Result

```bash
python arxiv_api.py "deep learning" -d
```

### Interactive Mode

```bash
python arxiv_api.py -i
```

### Download Paper by ID (with auto-rename)

```bash
# In interactive mode:
# 📚 arxiv> download 2502.05218v1
# This will automatically rename the file to the paper's title
```

## 🎮 Usage Modes

### 1. Command Line Mode

Direct search queries from the command line.

**Syntax:**

```bash
python arxiv_api.py [query] [options]
```

**Options:**

- `-n, --max_results`: Maximum number of results (default: 5)
- `-d, --download`: Download the first result automatically
- `-i, --interactive`: Start interactive mode
- `-h, --help`: Show help message

### 2. Interactive Mode

Interactive command-line interface for multiple operations.

**Commands:**

- `search <query> [max_results]`: Search for papers
- `download <paper_id>`: Download a specific paper (with auto-rename)
- `help`: Show available commands
- `quit`/`exit`: Exit the program

## 🔧 API Functions

### Core Functions

#### `search_arxiv(query, max_results=10)`

Searches arXiv for papers using the public API.

**Parameters:**

- `query` (str): Search query string
- `max_results` (int): Maximum number of results (default: 10)

**Returns:**

- `str`: XML response from the arXiv API

**Example:**

```python
from arxiv_api import search_arxiv

results = search_arxiv("artificial intelligence", max_results=5)
```

#### `get_paper_metadata(paper_id)`

Fetches paper metadata directly from the arXiv API using a paper ID.

**Parameters:**

- `paper_id` (str): arXiv paper ID (e.g., "2502.05218v1")

**Returns:**

- `dict`: Paper information dictionary, or `None` if not found

**Example:**

```python
from arxiv_api import get_paper_metadata

paper_info = get_paper_metadata("2502.05218v1")
if paper_info:
    print(f"Title: {paper_info['title']}")
    print(f"Authors: {', '.join(paper_info['authors'])}")
```

#### `download_paper(paper_id, output_dir=".", paper_title=None)`

Downloads a specific paper by its arXiv ID and automatically renames it to the paper title.

**Parameters:**

- `paper_id` (str): arXiv paper ID (e.g., "2502.05218v1")
- `output_dir` (str): Output directory (default: current directory)
- `paper_title` (str): Paper title for the filename (optional; fetched automatically if not provided)

**Returns:**

- `str`: File path of the downloaded PDF, or `None` if the download failed

**Features:**

- **Auto-rename**: Automatically renames downloaded files to paper titles
- **Smart cleaning**: Removes special characters and limits filename length
- **Fallback**: Uses the paper ID if the title is unavailable

**Example:**

```python
from arxiv_api import download_paper

# Download with automatic title fetching and renaming
filepath = download_paper("2502.05218v1")

# Download with a custom title
filepath = download_paper("2502.05218v1", paper_title="My Custom Title")
```

#### `parse_search_results(xml_content)`

Parses XML search results and extracts structured paper information.

**Parameters:**

- `xml_content` (str): XML response from the arXiv API

**Returns:**

- `list`: List of dictionaries containing paper information

**Paper Information Structure:**

```python
{
    'title': 'Paper Title',
    'authors': ['Author 1', 'Author 2'],
    'abstract': 'Paper abstract...',
    'paper_id': '2502.05218v1',
    'published': '2025-02-05T12:37:15Z'
}
```

#### `search_and_download(query, max_results=5, download_first=False)`

Combined function that searches for papers and optionally downloads the first result.

**Parameters:**

- `query` (str): Search query string
- `max_results` (int): Maximum number of results (default: 5)
- `download_first` (bool): Whether to download the first result (default: False)

**Example:**

```python
from arxiv_api import search_and_download

# Search and display results only
search_and_download("machine learning", max_results=3)

# Search and download the first result (with auto-rename)
search_and_download("deep learning", max_results=5, download_first=True)
```

### Interactive Mode Functions

#### `interactive_mode()`

Starts the interactive command-line interface.

**Features:**

- Command history
- Error handling
- User-friendly prompts
- Multiple search sessions
- **Smart download with auto-rename**

## 📚 Examples

### Example 1: Basic Paper Search

```bash
# Search for machine learning papers
python arxiv_api.py "machine learning"

# Output:
# Searching arXiv for: 'machine learning'
# --------------------------------------------------
# Found 5 papers:
#
# 1. Title: Introduction to Machine Learning
#    Authors: John Doe, Jane Smith
#    Paper ID: 2103.12345
#    Published: 2021-03-15T10:30:00Z
#    Abstract: This paper introduces...
```

### Example 2: Search with Custom Results

```bash
# Get 10 results for quantum computing
python arxiv_api.py "quantum computing" -n 10
```

### Example 3: Search and Download (with auto-rename)

```bash
# Search for papers and download the first one
python arxiv_api.py "artificial intelligence" -d
# The downloaded file will be automatically renamed to the paper title
```

### Example 4: Interactive Mode with Smart Download

```bash
python arxiv_api.py -i

# 📚 arxiv> search blockchain finance 5
# 📚 arxiv> download 2502.05218v1
# Fetching paper information for 2502.05218v1...
# Found paper: FactorGCL: A Hypergraph-Based Factor Model...
# Downloaded: .\FactorGCL_A_Hypergraph-Based_Factor_Model...pdf
# 📚 arxiv> help
# 📚 arxiv> quit
```

### Example 5: Python Script Integration

```python
from arxiv_api import search_and_download, download_paper, get_paper_metadata

# Search for papers on a specific topic
search_and_download("quantitative finance China", max_results=3)

# Download a specific paper with auto-rename
download_paper("2502.05218v1")

# Get paper metadata
paper_info = get_paper_metadata("2502.05218v1")
if paper_info:
    print(f"Title: {paper_info['title']}")
```

## 🔍 Advanced Usage

### Smart Download Features

#### Automatic Filename Generation

```python
from arxiv_api import download_paper

# The tool automatically:
# 1. Fetches paper metadata
# 2. Extracts the title
# 3. Cleans the title for filename use
# 4. Downloads and renames the file

# Example output filename:
# "FactorGCL_A_Hypergraph-Based_Factor_Model_with_Temporal_Residual_Contrastive_Learning_for_Stock_Returns_Prediction.pdf"
```

#### Custom Search Queries

##### Field-Specific Searches

```bash
# Search by author
python arxiv_api.py "au:Yann LeCun"

# Search by title
python arxiv_api.py "ti:deep learning"

# Search by abstract
python arxiv_api.py "abs:neural networks"

# Search by category
python arxiv_api.py "cat:cs.AI"
```

##### Complex Queries

```bash
# Multiple terms
python arxiv_api.py "machine learning AND neural networks"

# Exclude terms
python arxiv_api.py "deep learning NOT reinforcement"

# Date range
python arxiv_api.py "machine learning AND submittedDate:[20230101 TO 20231231]"
```

### Batch Operations

#### Download Multiple Papers with Auto-Rename

```python
from arxiv_api import search_arxiv, parse_search_results, download_paper

# Search for papers
query = "quantum computing"
results = search_arxiv(query, max_results=10)
papers = parse_search_results(results)

# Download all papers (each will be automatically renamed)
for paper in papers:
    paper_id = paper.get('paper_id')
    if paper_id:
        download_paper(paper_id, output_dir="./quantum_papers")
```

#### Custom Output Formatting

```python
from arxiv_api import search_arxiv, parse_search_results

# Custom display function
def custom_display(papers):
    for i, paper in enumerate(papers, 1):
        print(f"📄 Paper {i}: {paper['title']}")
        print(f"👥 Authors: {', '.join(paper['authors'])}")
        print(f"🆔 ID: {paper['paper_id']}")
        print(f"📅 Date: {paper['published']}")
        print(f"📝 Abstract: {paper['abstract'][:150]}...")
        print("-" * 80)

# Fetch and parse results, then display them with the custom formatter
papers = parse_search_results(search_arxiv("blockchain", max_results=3))
custom_display(papers)
```

## 🛠️ Troubleshooting

### Common Issues

#### 1. No Results Found

**Problem:** Search returns no papers

**Solution:**
- Check spelling and use broader terms
- Try different keyword combinations
- Verify internet connection

#### 2. Download Failed

**Problem:** Paper download fails

**Solution:**
- Verify the paper ID is correct
- Check if the paper exists on arXiv
- Ensure write permissions in the output directory

#### 3. API Rate Limiting

**Problem:** Too many requests

**Solution:**
- Wait between requests (see the sketch below)
- Reduce batch size
- Use interactive mode for multiple searches
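
For batch scripts, the simplest rate-limit fix is a fixed pause between calls, for example (the 3-second delay is a conservative choice, not an official arXiv figure):

```python
import time
from arxiv_api import download_paper

paper_ids = ["2502.05218v1"]  # extend with your own IDs

for pid in paper_ids:
    download_paper(pid)
    time.sleep(3)  # polite delay between requests to avoid throttling
```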

#### 4. XML Parsing Errors

**Problem:** Error parsing search results

**Solution:**
- Check internet connection
- Verify the API response format
- Update the script if needed

#### 5. Filename Too Long

**Problem:** Generated filename exceeds system limits

**Solution:**
- The tool automatically limits filenames to 100 characters
- Special characters are automatically cleaned
- Falls back to the paper ID if the title is unavailable

### Error Messages

```
Failed to download paper 2502.05218v1
```
- Paper ID may not exist
- Network connection issue
- arXiv server problem

```
Error parsing XML: ...
```
- Malformed API response
- Network interruption
- API format change

```
Could not find paper information for 2502.05218v1
```
- Paper ID may be invalid
- arXiv API issue
- Network connectivity problem

## 📖 API Reference

### arXiv API Endpoints

- **Search API**: `http://export.arxiv.org/api/query`
- **Metadata API**: `http://export.arxiv.org/api/query?id_list={paper_id}`
- **Documentation**: https://arxiv.org/help/api
- **Rate Limits**: Be respectful; avoid excessive requests

### Data Fields Available

- **Title**: Paper title
- **Authors**: List of author names
- **Abstract**: Paper abstract
- **Paper ID**: Unique arXiv identifier
- **Published Date**: Publication timestamp
- **Categories**: arXiv subject categories

### Paper ID Format

- **Format**: `YYMM.NNNNNvN`
- **Example**: `2502.05218v1`
- **Download URL**: `https://arxiv.org/pdf/{paper_id}.pdf`

### Smart Download Features

- **Automatic Metadata Fetching**: Gets paper information before download
- **Intelligent Filename Generation**: Converts paper titles to valid filenames
- **Character Cleaning**: Removes special characters and spaces
- **Length Limiting**: Ensures filenames don't exceed system limits
- **Fallback Naming**: Uses the paper ID if the title is unavailable

## 🤝 Contributing

### Adding New Features

1. Fork the repository
2. Create a feature branch
3. Implement your changes
4. Add tests and documentation
5. Submit a pull request

### Reporting Issues

- Check existing issues first
- Provide detailed error messages
- Include system information
- Describe steps to reproduce

## 📄 License

This project is open source and available under the MIT License.

## 🙏 Acknowledgments

- **arXiv**: For providing the public API
- **Python Community**: For excellent libraries and tools
- **Researchers**: For contributing to open science

## 📞 Support

### Getting Help

- Check this documentation first
- Review the examples section
- Search existing issues
- Create a new issue for bugs

### Useful Links

- [arXiv Official Site](https://arxiv.org/)
- [arXiv API Documentation](https://arxiv.org/help/api)
- [Python Requests Library](https://requests.readthedocs.io/)

---

**Happy Researching! 🎓📚**

*This tool makes academic research more accessible and efficient. Use it responsibly and respect arXiv's terms of service.*

cnhkmcp-1.3.1/cnhkmcp/untracked/arxiv_api.py
@@ -0,0 +1,229 @@

import requests
import xml.etree.ElementTree as ET
import os
import sys
import argparse

def search_arxiv(query, max_results=10):
    """Search arXiv for papers"""
    base_url = "http://export.arxiv.org/api/query"
    params = {
        'search_query': query,
        'start': 0,
        'max_results': max_results
    }

    response = requests.get(base_url, params=params, timeout=30)
    return response.text

def get_paper_metadata(paper_id):
    """Get paper metadata directly from the arXiv API"""
    try:
        # Use the arXiv API to get paper metadata
        metadata_url = f"http://export.arxiv.org/api/query?id_list={paper_id}"
        response = requests.get(metadata_url, timeout=30)

        if response.status_code == 200:
            papers = parse_search_results(response.text)
            if papers and len(papers) > 0:
                return papers[0]
        return None
    except Exception as e:
        print(f"Error fetching paper metadata: {e}")
        return None

def download_paper(paper_id, output_dir=".", paper_title=None):
    """Download a paper by its ID and rename it to the paper title"""
    pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    response = requests.get(pdf_url, timeout=60)

    if response.status_code == 200:
        # Create filename from paper title if available, otherwise use paper ID
        if paper_title:
            # Clean the title for filename (remove special characters, limit length)
            clean_title = "".join(c for c in paper_title if c.isalnum() or c in (' ', '-', '_')).rstrip()
            clean_title = clean_title.replace(' ', '_')[:100]  # Limit length to 100 chars
            filename = f"{clean_title}.pdf"
        else:
            filename = f"{paper_id}.pdf"

        os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists
        filepath = os.path.join(output_dir, filename)

        with open(filepath, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filepath}")
        return filepath
    else:
        print(f"Failed to download paper {paper_id}")
        return None

def parse_search_results(xml_content):
    """Parse XML search results and extract paper information"""
    try:
        root = ET.fromstring(xml_content)
        papers = []

        # Find all entry elements (each Atom entry is one paper)
        for entry in root.findall('.//{http://www.w3.org/2005/Atom}entry'):
            paper_info = {}

            # Extract title
            title_elem = entry.find('.//{http://www.w3.org/2005/Atom}title')
            if title_elem is not None:
                paper_info['title'] = title_elem.text.strip()

            # Extract authors
            authors = []
            for author in entry.findall('.//{http://www.w3.org/2005/Atom}author'):
                name_elem = author.find('.//{http://www.w3.org/2005/Atom}name')
                if name_elem is not None:
                    authors.append(name_elem.text.strip())
            paper_info['authors'] = authors

            # Extract abstract
            summary_elem = entry.find('.//{http://www.w3.org/2005/Atom}summary')
            if summary_elem is not None:
                paper_info['abstract'] = summary_elem.text.strip()

            # Extract paper ID from the id field
            id_elem = entry.find('.//{http://www.w3.org/2005/Atom}id')
            if id_elem is not None:
                # Extract ID from URL like "http://arxiv.org/abs/2103.12345"
                paper_id = id_elem.text.split('/')[-1]
                paper_info['paper_id'] = paper_id

            # Extract published date
            published_elem = entry.find('.//{http://www.w3.org/2005/Atom}published')
            if published_elem is not None:
                paper_info['published'] = published_elem.text.strip()

            papers.append(paper_info)

        return papers
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return []

def search_and_download(query, max_results=5, download_first=False):
    """Search for papers and optionally download the first result"""
    print(f"Searching arXiv for: '{query}'")
    print("-" * 50)

    # Search for papers
    results = search_arxiv(query, max_results)
    papers = parse_search_results(results)

    if not papers:
        print("No papers found.")
        return

    # Display search results
    print(f"Found {len(papers)} papers:\n")
    for i, paper in enumerate(papers, 1):
        print(f"{i}. Title: {paper.get('title', 'N/A')}")
        print(f"   Authors: {', '.join(paper.get('authors', ['N/A']))}")
        print(f"   Paper ID: {paper.get('paper_id', 'N/A')}")
        print(f"   Published: {paper.get('published', 'N/A')}")
        print(f"   Abstract: {paper.get('abstract', 'N/A')[:200]}...")
        print()

    # Optionally download the first paper
    if download_first and papers:
        first_paper = papers[0]
        paper_id = first_paper.get('paper_id')
        paper_title = first_paper.get('title')
        if paper_id:
            print(f"Downloading first paper: {paper_id}")
            download_paper(paper_id, paper_title=paper_title)
        else:
            print("Could not extract paper ID for download")

def interactive_mode():
    """Interactive mode for searching arXiv"""
    print("🔍 arXiv Paper Search Tool")
    print("=" * 40)
    print("Commands:")
    print("  search <query> [max_results] - Search for papers")
    print("  download <paper_id> - Download a specific paper")
    print("  help - Show this help message")
    print("  quit/exit - Exit the program")
    print()

    while True:
        try:
            command = input("📚 arxiv> ").strip()

            if not command:
                continue

            parts = command.split()
            cmd = parts[0].lower()

            if cmd in ['quit', 'exit', 'q']:
                print("Goodbye! 👋")
                break

            elif cmd == 'help':
                print("Commands:")
                print("  search <query> [max_results] - Search for papers")
                print("  download <paper_id> - Download a specific paper")
                print("  help - Show this help message")
                print("  quit/exit - Exit the program")
                print()

            elif cmd == 'search':
                if len(parts) < 2:
                    print("Usage: search <query> [max_results]")
                    continue

                # A trailing integer is max_results; otherwise the whole
                # remainder is the query, so multi-word queries keep every word
                if len(parts) > 2 and parts[-1].isdigit():
                    query = ' '.join(parts[1:-1])
                    max_results = int(parts[-1])
                else:
                    query = ' '.join(parts[1:])
                    max_results = 5

                search_and_download(query, max_results, download_first=False)

            elif cmd == 'download':
                if len(parts) < 2:
                    print("Usage: download <paper_id>")
                    continue

                paper_id = parts[1]
                # Get paper metadata first
                print(f"Fetching paper information for {paper_id}...")
                paper_info = get_paper_metadata(paper_id)

                if paper_info and paper_info.get('title'):
                    paper_title = paper_info['title']
                    print(f"Found paper: {paper_title}")
                    download_paper(paper_id, paper_title=paper_title)
                else:
                    print(f"Could not find paper information for {paper_id}")
                    print("Downloading with paper ID as filename...")
                    download_paper(paper_id)

            else:
                print(f"Unknown command: {cmd}")
                print("Type 'help' for available commands")

        except KeyboardInterrupt:
            print("\nGoodbye! 👋")
            break
        except Exception as e:
            print(f"Error: {e}")

# Example usage
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Search and download papers from arXiv')
    parser.add_argument('query', nargs='?', help='Search query')
    parser.add_argument('-n', '--max_results', type=int, default=5, help='Maximum number of results (default: 5)')
    parser.add_argument('-d', '--download', action='store_true', help='Download the first result')
    parser.add_argument('-i', '--interactive', action='store_true', help='Start interactive mode')

    args = parser.parse_args()

    if args.interactive:
        interactive_mode()
    elif args.query:
        search_and_download(args.query, args.max_results, args.download)
    else:
        # Default behavior - start interactive mode
        interactive_mode()

cnhkmcp-1.3.1/cnhkmcp/untracked/platform_functions.py
@@ -2502,6 +2502,49 @@ async def get_daily_and_quarterly_payment(email: str = "", password: str = "") -
     except Exception as e:
         return {"error": f"Error retrieving payment information: {str(e)}"}
 
+
+
+# New MCP tool: get_error_message_fromAlphaLocation
+from typing import Sequence
+@mcp.tool()
+async def get_error_message_fromAlphaLocation(locations: Sequence[str]) -> dict:
+    """
+    Fetch and parse error/status from multiple simulation locations (URLs).
+    Args:
+        locations: List of simulation result URLs (e.g., /simulations/{id})
+    Returns:
+        Dict with a "results" list of {location, error, raw} entries
+    """
+    results = []
+    for loc in locations:
+        try:
+            resp = brain_client.session.get(loc)
+            if resp.status_code != 200:
+                results.append({
+                    "location": loc,
+                    "error": f"HTTP {resp.status_code}",
+                    "raw": resp.text
+                })
+                continue
+            data = resp.json() if resp.text else {}
+            # Try to extract an error message or status
+            error_msg = data.get("error") or data.get("message")
+            # If the alpha ID is missing, report that explicitly
+            if not data.get("alpha"):
+                error_msg = error_msg or "Simulation did not complete; if you are running a multi-simulation, check the other child locations in your request"
+            results.append({
+                "location": loc,
+                "error": error_msg,
+                "raw": data
+            })
+        except Exception as e:
+            results.append({
+                "location": loc,
+                "error": str(e),
+                "raw": None
+            })
+    return {"results": results}
+
 if __name__ == "__main__":
     print("🧠 WorldQuant BRAIN MCP Server Starting...", file=sys.stderr)
     mcp.run()

cnhkmcp-1.3.1/cnhkmcp.egg-info/SOURCES.txt
@@ -12,7 +12,10 @@ cnhkmcp.egg-info/not-zip-safe
 cnhkmcp.egg-info/requires.txt
 cnhkmcp.egg-info/top_level.txt
 cnhkmcp/untracked/BRAIN_6_Tips_Datafield_Exploration_Guide.md
+cnhkmcp/untracked/BRAIN_Alpha_Test_Requirements_and_Tips.md
 cnhkmcp/untracked/Dataset_Exploration_Expert_Manual.md
+cnhkmcp/untracked/arXiv_API_Tool_Manual.md
+cnhkmcp/untracked/arxiv_api.py
 cnhkmcp/untracked/daily_report_workflow.md
 cnhkmcp/untracked/forum_functions.py
 cnhkmcp/untracked/platform_functions.py

cnhkmcp-1.3.1/setup.py
@@ -13,7 +13,7 @@ def read_requirements():
 
 setup(
     name="cnhkmcp",
-    version="1.2.9",
+    version="1.3.1",
     author="CNHK",
     author_email="cnhk@example.com",
     description="A comprehensive Model Context Protocol (MCP) server for quantitative trading platform integration",