opencite 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opencite-0.1.0/.claude-plugin/plugin.json +12 -0
- opencite-0.1.0/.context/research.md +197 -0
- opencite-0.1.0/.github/workflows/publish.yml +109 -0
- opencite-0.1.0/.github/workflows/tests.yml +43 -0
- opencite-0.1.0/.gitignore +44 -0
- opencite-0.1.0/.serena/.gitignore +1 -0
- opencite-0.1.0/.serena/project.yml +84 -0
- opencite-0.1.0/CLAUDE.md +75 -0
- opencite-0.1.0/LICENSE +21 -0
- opencite-0.1.0/PKG-INFO +80 -0
- opencite-0.1.0/README.md +47 -0
- opencite-0.1.0/commands/opencite.md +76 -0
- opencite-0.1.0/pyproject.toml +85 -0
- opencite-0.1.0/scripts/lit_search.py +816 -0
- opencite-0.1.0/skills/opencite-cli/SKILL.md +174 -0
- opencite-0.1.0/src/opencite/__init__.py +25 -0
- opencite-0.1.0/src/opencite/__main__.py +7 -0
- opencite-0.1.0/src/opencite/bibtex.py +102 -0
- opencite-0.1.0/src/opencite/citations.py +226 -0
- opencite-0.1.0/src/opencite/cli.py +414 -0
- opencite-0.1.0/src/opencite/clients/__init__.py +1 -0
- opencite-0.1.0/src/opencite/clients/base.py +155 -0
- opencite-0.1.0/src/opencite/clients/id_converter.py +95 -0
- opencite-0.1.0/src/opencite/clients/openalex.py +395 -0
- opencite-0.1.0/src/opencite/clients/pubmed.py +399 -0
- opencite-0.1.0/src/opencite/clients/semantic_scholar.py +256 -0
- opencite-0.1.0/src/opencite/config.py +94 -0
- opencite-0.1.0/src/opencite/convert.py +82 -0
- opencite-0.1.0/src/opencite/dedup.py +168 -0
- opencite-0.1.0/src/opencite/exceptions.py +86 -0
- opencite-0.1.0/src/opencite/formatters/__init__.py +30 -0
- opencite-0.1.0/src/opencite/formatters/base.py +25 -0
- opencite-0.1.0/src/opencite/formatters/bibtex_fmt.py +36 -0
- opencite-0.1.0/src/opencite/formatters/csv_fmt.py +65 -0
- opencite-0.1.0/src/opencite/formatters/json_fmt.py +81 -0
- opencite-0.1.0/src/opencite/formatters/text.py +84 -0
- opencite-0.1.0/src/opencite/models.py +275 -0
- opencite-0.1.0/src/opencite/pdf.py +167 -0
- opencite-0.1.0/src/opencite/search.py +259 -0
- opencite-0.1.0/src/opencite/utils.py +67 -0
- opencite-0.1.0/tests/__init__.py +0 -0
- opencite-0.1.0/tests/conftest.py +72 -0
- opencite-0.1.0/tests/test_bibtex.py +100 -0
- opencite-0.1.0/tests/test_cli.py +175 -0
- opencite-0.1.0/tests/test_clients/__init__.py +0 -0
- opencite-0.1.0/tests/test_clients/test_openalex.py +134 -0
- opencite-0.1.0/tests/test_clients/test_pubmed.py +249 -0
- opencite-0.1.0/tests/test_clients/test_semantic_scholar.py +113 -0
- opencite-0.1.0/tests/test_config.py +54 -0
- opencite-0.1.0/tests/test_dedup.py +195 -0
- opencite-0.1.0/tests/test_formatters.py +143 -0
- opencite-0.1.0/tests/test_models.py +216 -0
- opencite-0.1.0/tests/test_pdf.py +108 -0
- opencite-0.1.0/tests/test_search.py +128 -0
- opencite-0.1.0/tests/test_utils.py +90 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "opencite",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Academic literature search, citation management, and PDF retrieval using the opencite CLI",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "Seyed Yahya Shirazi",
|
|
7
|
+
"email": "shirazi@ieee.org"
|
|
8
|
+
},
|
|
9
|
+
"repository": "https://github.com/neuromechanist/opencite",
|
|
10
|
+
"license": "MIT",
|
|
11
|
+
"keywords": ["academic", "literature", "citations", "bibtex", "pdf", "pubmed", "openalex", "semantic-scholar"]
|
|
12
|
+
}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# OpenCite API Research
|
|
2
|
+
|
|
3
|
+
Comprehensive analysis of the three academic APIs for building the OpenCite CLI.
|
|
4
|
+
|
|
5
|
+
## Capability Matrix
|
|
6
|
+
|
|
7
|
+
| Capability | OpenAlex | Semantic Scholar | PubMed/PMC |
|
|
8
|
+
|---|---|---|---|
|
|
9
|
+
| **Keyword search** | Yes (title, abstract, fulltext) | Yes (relevance + bulk/boolean) | Yes (field tags, MeSH) |
|
|
10
|
+
| **DOI lookup** | `/works/doi:X` | `/paper/DOI:X` | `X[lid]` or `X[aid]` search |
|
|
11
|
+
| **PMID lookup** | `/works/pmid:X` | `/paper/PMID:X` | Native (efetch by ID) |
|
|
12
|
+
| **PMCID lookup** | `/works/pmcid:X` | `/paper/PMCID:X` | Native (efetch by ID) |
|
|
13
|
+
| **ArXiv lookup** | No direct | `/paper/ARXIV:X` | No |
|
|
14
|
+
| **Batch lookup** | 50 IDs via filter pipe | POST 500 IDs `/paper/batch` | POST thousands via epost |
|
|
15
|
+
| **Citing papers** | `filter=cites:W123` | `/paper/{id}/citations` | elink `pubmed_pubmed_citedin` |
|
|
16
|
+
| **References** | `referenced_works` field or `filter=cited_by:W123` | `/paper/{id}/references` | elink `pubmed_pubmed_refs` |
|
|
17
|
+
| **Related papers** | `related_works` field | Recommendations API | elink `pubmed_pubmed` (similarity) |
|
|
18
|
+
| **PDF URLs** | `best_oa_location.pdf_url`, `locations[].pdf_url` | `openAccessPdf.url` | PMC OA Service |
|
|
19
|
+
| **Full-text download** | `content_url` (100 credits) | No hosted content | PMC efetch XML, BioC API, FTP |
|
|
20
|
+
| **BibTeX** | No native | `citationStyles` field | No native (parse XML) |
|
|
21
|
+
| **TLDR summaries** | No | `tldr` field (~60M papers) | No |
|
|
22
|
+
| **Paper embeddings** | No | SPECTER v1/v2 | No |
|
|
23
|
+
| **Author search** | `/authors?search=X` | `/author/search?query=X` | `smith j[au]` |
|
|
24
|
+
| **Author profiles** | h-index, i10-index, works_count | h-index, paperCount, citationCount | No profiles |
|
|
25
|
+
| **ORCID support** | `author.orcid` | `externalIds.ORCID` | No |
|
|
26
|
+
| **MeSH terms** | `mesh` field (PubMed-indexed) | No | Native, hierarchical |
|
|
27
|
+
| **Topics/concepts** | Topics (domain>field>subfield) | `s2FieldsOfStudy` | MeSH hierarchy |
|
|
28
|
+
| **Publication types** | `type` field | `publicationTypes` | 70+ pub types via `[pt]` |
|
|
29
|
+
| **Retraction status** | `is_retracted` (Retraction Watch) | No | `"retracted publication"[pt]` |
|
|
30
|
+
| **Funder/grants** | `grants.funder`, `grants.award_id` | No | `[gr]` field tag |
|
|
31
|
+
| **ID conversion** | Implicit (accepts pmid, pmcid, doi) | Implicit (accepts all) | Dedicated ID Converter API |
|
|
32
|
+
| **Autocomplete** | `/autocomplete/{entity}` | `/paper/autocomplete` | No |
|
|
33
|
+
| **Aggregations** | `group_by` parameter | No | No |
|
|
34
|
+
| **Rate limit (no key)** | Not allowed (key required as of Feb 2026) | Shared pool, unreliable | 3 req/sec |
|
|
35
|
+
| **Rate limit (with key)** | 100 req/sec, 100K credits/day | 1 req/sec | 10 req/sec |
|
|
36
|
+
| **Pagination max** | Unlimited (cursor) | 1K (relevance), 10M (bulk) | 10K per search |
|
|
37
|
+
|
|
38
|
+
## PDF Retrieval Strategy
|
|
39
|
+
|
|
40
|
+
### Tier 1: Direct PDF URLs (fastest)
|
|
41
|
+
|
|
42
|
+
**OpenAlex** is the richest source for PDF URLs:
|
|
43
|
+
- `best_oa_location.pdf_url` -- algorithmically chosen best OA copy
|
|
44
|
+
- `locations[].pdf_url` -- all known PDF locations
|
|
45
|
+
- Priority: published version > accepted > submitted
|
|
46
|
+
- `content_url` field for OpenAlex-hosted PDFs (costs 100 credits)
|
|
47
|
+
- Filter: `has_content.pdf:true` or `is_oa:true`
|
|
48
|
+
|
|
49
|
+
**Semantic Scholar**:
|
|
50
|
+
- `openAccessPdf.url` -- direct PDF link when available
|
|
51
|
+
- `isOpenAccess` -- boolean flag
|
|
52
|
+
- Can filter search results: `openAccessPdf` parameter
|
|
53
|
+
|
|
54
|
+
### Tier 2: PMC Full Text
|
|
55
|
+
|
|
56
|
+
**PMC OA Web Service** (`https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi`):
|
|
57
|
+
- Query by PMCID: `?id=PMC5334499`
|
|
58
|
+
- Returns PDF and tgz (XML + images) download links
|
|
59
|
+
- Only works for PMC Open Access Subset (~3M articles)
|
|
60
|
+
- Filter by format: `&format=pdf`
|
|
61
|
+
|
|
62
|
+
**PMC BioC API** (structured full text):
|
|
63
|
+
- `https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{PMID_or_PMCID}/unicode`
|
|
64
|
+
- Returns structured JSON with sections, paragraphs, annotations
|
|
65
|
+
- Good for text extraction without PDF parsing
|
|
66
|
+
|
|
67
|
+
**PMC efetch** (full text XML):
|
|
68
|
+
- `efetch.fcgi?db=pmc&id=PMC1234567&retmode=xml`
|
|
69
|
+
- Returns JATS XML for OA articles
|
|
70
|
+
|
|
71
|
+
### Tier 3: DOI Content Negotiation (fallback)
|
|
72
|
+
|
|
73
|
+
Already implemented in current code:
|
|
74
|
+
- `GET https://doi.org/{doi}` with `Accept: application/x-bibtex` for BibTeX
|
|
75
|
+
- Can also request `Accept: application/pdf` for PDFs (publisher-dependent)
|
|
76
|
+
|
|
77
|
+
### Recommended PDF Retrieval Pipeline
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
1. Check OpenAlex best_oa_location.pdf_url
|
|
81
|
+
2. Check Semantic Scholar openAccessPdf.url
|
|
82
|
+
3. If PMCID available, try PMC OA Service for PDF
|
|
83
|
+
4. If PMCID available, try BioC API for structured text
|
|
84
|
+
5. Fall back to DOI content negotiation
|
|
85
|
+
6. If all fail, return landing page URL for manual download
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### ID Conversion for Cross-API Lookup
|
|
89
|
+
|
|
90
|
+
To maximize PDF retrieval, convert between ID types:
|
|
91
|
+
- **PMC ID Converter**: `https://pmc.ncbi.nlm.nih.gov/tools/id-converter-api/?tool=opencite&email=X&ids=PMID1,PMID2&format=json`
|
|
92
|
+
- Up to 200 IDs per request
|
|
93
|
+
- Converts PMID <-> PMCID <-> DOI <-> Manuscript ID
|
|
94
|
+
- OpenAlex also accepts `/works/pmid:X` and `/works/pmcid:X` directly
|
|
95
|
+
|
|
96
|
+
## Search Strategy by Use Case
|
|
97
|
+
|
|
98
|
+
### Keyword Search
|
|
99
|
+
Best approach: query all three in parallel (current design is good)
|
|
100
|
+
- **OpenAlex**: Broadest coverage (~250M works), fulltext search available
|
|
101
|
+
- **Semantic Scholar**: Good relevance ranking, TLDR summaries, boolean bulk search
|
|
102
|
+
- **PubMed**: Best for biomedical, MeSH term precision, structured field tags
|
|
103
|
+
|
|
104
|
+
### DOI Lookup
|
|
105
|
+
- **Semantic Scholar** `/paper/DOI:X` is fastest for single lookups
|
|
106
|
+
- **OpenAlex** `/works/doi:X` provides richest metadata (locations, topics, grants)
|
|
107
|
+
- Use both; merge results
|
|
108
|
+
|
|
109
|
+
### Citation Graph Traversal
|
|
110
|
+
- **OpenAlex**: `filter=cites:W123` returns citing works with full filtering/sorting
|
|
111
|
+
- **Semantic Scholar**: `/paper/{id}/citations` with up to 1K results, includes `influentialCitationCount`
|
|
112
|
+
- **PubMed**: elink `pubmed_pubmed_citedin` for biomedical citation chains
|
|
113
|
+
|
|
114
|
+
### Batch DOI Lookup
|
|
115
|
+
- **Semantic Scholar** POST `/paper/batch` with up to 500 IDs is most efficient
|
|
116
|
+
- **OpenAlex** filter pipe: up to 50 DOIs per request
|
|
117
|
+
- **PubMed** epost + efetch: thousands of PMIDs
|
|
118
|
+
|
|
119
|
+
## BibTeX Generation Strategy
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
1. Semantic Scholar citationStyles field (if available)
|
|
123
|
+
2. DOI content negotiation: GET doi.org/{doi} Accept: application/x-bibtex
|
|
124
|
+
3. Generate from metadata (current fallback, already implemented)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## PDF-to-Markdown Conversion
|
|
128
|
+
|
|
129
|
+
Two backends available:
|
|
130
|
+
|
|
131
|
+
### markit-mistral (Mistral AI OCR)
|
|
132
|
+
- Location: `../markit-mistral`
|
|
133
|
+
- CLI: `markit-mistral document.pdf -o output.md`
|
|
134
|
+
- Python: `MarkItMistral(api_key=KEY).convert_file("doc.pdf")`
|
|
135
|
+
- Strengths: math/LaTeX preservation, complex layouts, tables
|
|
136
|
+
- Requires: `MISTRAL_API_KEY`
|
|
137
|
+
- Cost: API usage fees
|
|
138
|
+
|
|
139
|
+
### markitdown (Microsoft, open source)
|
|
140
|
+
- Repo: https://github.com/microsoft/markitdown
|
|
141
|
+
- CLI: `markitdown document.pdf -o output.md`
|
|
142
|
+
- Strengths: free, no API key, good for simple documents
|
|
143
|
+
- Weaknesses: less accurate on math, complex layouts
|
|
144
|
+
|
|
145
|
+
### Recommended Strategy
|
|
146
|
+
```
|
|
147
|
+
1. Try markitdown first (free, no API costs)
|
|
148
|
+
2. If document has math/complex layout, use markit-mistral
|
|
149
|
+
3. Let user choose via CLI flag: --converter markitdown|mistral
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Rate Limit Management
|
|
153
|
+
|
|
154
|
+
| API | Strategy |
|
|
155
|
+
|---|---|
|
|
156
|
+
| OpenAlex | 100 req/sec is generous; batch via filter pipes; use `select` to reduce payload |
|
|
157
|
+
| Semantic Scholar | 1 req/sec is the bottleneck; use batch endpoint for multi-ID lookups; queue requests |
|
|
158
|
+
| PubMed | 10 req/sec is reasonable; use History Server for large jobs; batch efetch up to 500 records |
|
|
159
|
+
|
|
160
|
+
### Recommended: Async with per-API rate limiters
|
|
161
|
+
```python
|
|
162
|
+
# Per-API rate limiters (max requests per 1-second window)
|
|
163
|
+
openalex_limiter = AsyncLimiter(100, 1) # 100/sec
|
|
164
|
+
s2_limiter = AsyncLimiter(1, 1) # 1/sec
|
|
165
|
+
pubmed_limiter = AsyncLimiter(10, 1) # 10/sec
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## Unique Strengths Per API
|
|
169
|
+
|
|
170
|
+
### OpenAlex -- Use for:
|
|
171
|
+
- Broadest coverage (250M+ works)
|
|
172
|
+
- PDF URL discovery (best_oa_location)
|
|
173
|
+
- Rich filtering (90+ filters)
|
|
174
|
+
- Funder/grant information
|
|
175
|
+
- Aggregations/analytics (group_by)
|
|
176
|
+
- Institution and country-level analysis
|
|
177
|
+
- Retraction status (Retraction Watch)
|
|
178
|
+
|
|
179
|
+
### Semantic Scholar -- Use for:
|
|
180
|
+
- TLDR auto-summaries
|
|
181
|
+
- Paper embeddings (SPECTER v2) for similarity
|
|
182
|
+
- Recommendations API
|
|
183
|
+
- Influential citation count
|
|
184
|
+
- Boolean bulk search (10M results)
|
|
185
|
+
- Batch lookup (500 IDs at once)
|
|
186
|
+
- ArXiv paper lookup
|
|
187
|
+
- BibTeX via citationStyles field
|
|
188
|
+
|
|
189
|
+
### PubMed/PMC -- Use for:
|
|
190
|
+
- Biomedical literature (gold standard)
|
|
191
|
+
- MeSH term searching (hierarchical, exploded)
|
|
192
|
+
- PMC full-text access (XML, PDF, BioC)
|
|
193
|
+
- Publication type precision (70+ types)
|
|
194
|
+
- Proximity searching ("term1 term2"[tiab:~3])
|
|
195
|
+
- ID conversion service (PMID/PMCID/DOI)
|
|
196
|
+
- Free full text filter
|
|
197
|
+
- Clinical trial and systematic review filters
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
push:
|
|
7
|
+
tags:
|
|
8
|
+
- 'v*'
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
build:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Install uv
|
|
17
|
+
uses: astral-sh/setup-uv@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
run: uv python install 3.12
|
|
21
|
+
|
|
22
|
+
- name: Create venv and install build dependencies
|
|
23
|
+
run: |
|
|
24
|
+
uv venv --python 3.12
|
|
25
|
+
uv pip install build twine
|
|
26
|
+
|
|
27
|
+
- name: Build package
|
|
28
|
+
run: uv run python -m build
|
|
29
|
+
|
|
30
|
+
- name: Check package
|
|
31
|
+
run: uv run twine check dist/*
|
|
32
|
+
|
|
33
|
+
- name: Upload build artifacts
|
|
34
|
+
uses: actions/upload-artifact@v4
|
|
35
|
+
with:
|
|
36
|
+
name: dist
|
|
37
|
+
path: dist/
|
|
38
|
+
|
|
39
|
+
test-install:
|
|
40
|
+
needs: build
|
|
41
|
+
runs-on: ubuntu-latest
|
|
42
|
+
strategy:
|
|
43
|
+
matrix:
|
|
44
|
+
python-version: ['3.11', '3.12', '3.13']
|
|
45
|
+
steps:
|
|
46
|
+
- uses: actions/checkout@v4
|
|
47
|
+
|
|
48
|
+
- name: Install uv
|
|
49
|
+
uses: astral-sh/setup-uv@v4
|
|
50
|
+
|
|
51
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
52
|
+
run: uv python install ${{ matrix.python-version }}
|
|
53
|
+
|
|
54
|
+
- name: Download build artifacts
|
|
55
|
+
uses: actions/download-artifact@v4
|
|
56
|
+
with:
|
|
57
|
+
name: dist
|
|
58
|
+
path: dist/
|
|
59
|
+
|
|
60
|
+
- name: Create venv and install from wheel
|
|
61
|
+
run: |
|
|
62
|
+
uv venv --python ${{ matrix.python-version }}
|
|
63
|
+
uv pip install dist/*.whl
|
|
64
|
+
|
|
65
|
+
- name: Test import
|
|
66
|
+
run: |
|
|
67
|
+
uv run python -c "import opencite; print(f'opencite version: {opencite.__version__}')"
|
|
68
|
+
uv run python -c "from opencite import Paper, Config; print('Core imports successful')"
|
|
69
|
+
|
|
70
|
+
publish-testpypi:
|
|
71
|
+
needs: [build, test-install]
|
|
72
|
+
runs-on: ubuntu-latest
|
|
73
|
+
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
|
|
74
|
+
environment:
|
|
75
|
+
name: testpypi
|
|
76
|
+
url: https://test.pypi.org/p/opencite
|
|
77
|
+
permissions:
|
|
78
|
+
id-token: write
|
|
79
|
+
steps:
|
|
80
|
+
- name: Download build artifacts
|
|
81
|
+
uses: actions/download-artifact@v4
|
|
82
|
+
with:
|
|
83
|
+
name: dist
|
|
84
|
+
path: dist/
|
|
85
|
+
|
|
86
|
+
- name: Publish to TestPyPI
|
|
87
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
88
|
+
with:
|
|
89
|
+
repository-url: https://test.pypi.org/legacy/
|
|
90
|
+
skip-existing: true
|
|
91
|
+
|
|
92
|
+
publish-pypi:
|
|
93
|
+
needs: [build, test-install]
|
|
94
|
+
runs-on: ubuntu-latest
|
|
95
|
+
if: github.event_name == 'release' && github.event.action == 'published'
|
|
96
|
+
environment:
|
|
97
|
+
name: pypi
|
|
98
|
+
url: https://pypi.org/p/opencite
|
|
99
|
+
permissions:
|
|
100
|
+
id-token: write
|
|
101
|
+
steps:
|
|
102
|
+
- name: Download build artifacts
|
|
103
|
+
uses: actions/download-artifact@v4
|
|
104
|
+
with:
|
|
105
|
+
name: dist
|
|
106
|
+
path: dist/
|
|
107
|
+
|
|
108
|
+
- name: Publish to PyPI
|
|
109
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [ main ]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [ main ]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Install uv
|
|
20
|
+
uses: astral-sh/setup-uv@v4
|
|
21
|
+
|
|
22
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
23
|
+
run: uv python install ${{ matrix.python-version }}
|
|
24
|
+
|
|
25
|
+
- name: Create venv and install dependencies
|
|
26
|
+
run: |
|
|
27
|
+
uv venv --python ${{ matrix.python-version }}
|
|
28
|
+
uv pip install -e ".[dev]"
|
|
29
|
+
|
|
30
|
+
- name: Lint with ruff
|
|
31
|
+
run: |
|
|
32
|
+
uv run ruff check src/ tests/
|
|
33
|
+
|
|
34
|
+
- name: Run tests with coverage
|
|
35
|
+
run: |
|
|
36
|
+
uv run pytest --ignore=tests/test_clients --cov=opencite --cov-report=xml
|
|
37
|
+
|
|
38
|
+
- name: Upload coverage to Codecov
|
|
39
|
+
uses: codecov/codecov-action@v5
|
|
40
|
+
with:
|
|
41
|
+
file: ./coverage.xml
|
|
42
|
+
fail_ci_if_error: false
|
|
43
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
.eggs/
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
|
|
15
|
+
# IDE
|
|
16
|
+
.vscode/
|
|
17
|
+
.idea/
|
|
18
|
+
.cursor/
|
|
19
|
+
*.swp
|
|
20
|
+
*.swo
|
|
21
|
+
|
|
22
|
+
# Testing
|
|
23
|
+
.pytest_cache/
|
|
24
|
+
.coverage
|
|
25
|
+
htmlcov/
|
|
26
|
+
coverage.xml
|
|
27
|
+
|
|
28
|
+
# Linting/type checking
|
|
29
|
+
.ruff_cache/
|
|
30
|
+
.mypy_cache/
|
|
31
|
+
|
|
32
|
+
# Environment
|
|
33
|
+
.env
|
|
34
|
+
.env.local
|
|
35
|
+
|
|
36
|
+
# OS
|
|
37
|
+
.DS_Store
|
|
38
|
+
Thumbs.db
|
|
39
|
+
|
|
40
|
+
# Plan files
|
|
41
|
+
.plan
|
|
42
|
+
|
|
43
|
+
# UV
|
|
44
|
+
uv.lock
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/cache
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# list of languages for which language servers are started; choose from:
|
|
2
|
+
# al bash clojure cpp csharp csharp_omnisharp
|
|
3
|
+
# dart elixir elm erlang fortran go
|
|
4
|
+
# haskell java julia kotlin lua markdown
|
|
5
|
+
# nix perl php python python_jedi r
|
|
6
|
+
# rego ruby ruby_solargraph rust scala swift
|
|
7
|
+
# terraform typescript typescript_vts yaml zig
|
|
8
|
+
# Note:
|
|
9
|
+
# - For C, use cpp
|
|
10
|
+
# - For JavaScript, use typescript
|
|
11
|
+
# Special requirements:
|
|
12
|
+
# - csharp: Requires the presence of a .sln file in the project folder.
|
|
13
|
+
# When using multiple languages, the first language server that supports a given file will be used for that file.
|
|
14
|
+
# The first language is the default language and the respective language server will be used as a fallback.
|
|
15
|
+
# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
|
|
16
|
+
languages:
|
|
17
|
+
- python
|
|
18
|
+
|
|
19
|
+
# the encoding used by text files in the project
|
|
20
|
+
# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
|
|
21
|
+
encoding: "utf-8"
|
|
22
|
+
|
|
23
|
+
# whether to use the project's gitignore file to ignore files
|
|
24
|
+
# Added on 2025-04-07
|
|
25
|
+
ignore_all_files_in_gitignore: true
|
|
26
|
+
|
|
27
|
+
# list of additional paths to ignore
|
|
28
|
+
# same syntax as gitignore, so you can use * and **
|
|
29
|
+
# Was previously called `ignored_dirs`, please update your config if you are using that.
|
|
30
|
+
# Added (renamed) on 2025-04-07
|
|
31
|
+
ignored_paths: []
|
|
32
|
+
|
|
33
|
+
# whether the project is in read-only mode
|
|
34
|
+
# If set to true, all editing tools will be disabled and attempts to use them will result in an error
|
|
35
|
+
# Added on 2025-04-18
|
|
36
|
+
read_only: false
|
|
37
|
+
|
|
38
|
+
# list of tool names to exclude. We recommend not excluding any tools, see the readme for more details.
|
|
39
|
+
# Below is the complete list of tools for convenience.
|
|
40
|
+
# To make sure you have the latest list of tools, and to view their descriptions,
|
|
41
|
+
# execute `uv run scripts/print_tool_overview.py`.
|
|
42
|
+
#
|
|
43
|
+
# * `activate_project`: Activates a project by name.
|
|
44
|
+
# * `check_onboarding_performed`: Checks whether project onboarding was already performed.
|
|
45
|
+
# * `create_text_file`: Creates/overwrites a file in the project directory.
|
|
46
|
+
# * `delete_lines`: Deletes a range of lines within a file.
|
|
47
|
+
# * `delete_memory`: Deletes a memory from Serena's project-specific memory store.
|
|
48
|
+
# * `execute_shell_command`: Executes a shell command.
|
|
49
|
+
# * `find_referencing_code_snippets`: Finds code snippets in which the symbol at the given location is referenced.
|
|
50
|
+
# * `find_referencing_symbols`: Finds symbols that reference the symbol at the given location (optionally filtered by type).
|
|
51
|
+
# * `find_symbol`: Performs a global (or local) search for symbols with/containing a given name/substring (optionally filtered by type).
|
|
52
|
+
# * `get_current_config`: Prints the current configuration of the agent, including the active and available projects, tools, contexts, and modes.
|
|
53
|
+
# * `get_symbols_overview`: Gets an overview of the top-level symbols defined in a given file.
|
|
54
|
+
# * `initial_instructions`: Gets the initial instructions for the current project.
|
|
55
|
+
# Should only be used in settings where the system prompt cannot be set,
|
|
56
|
+
# e.g. in clients you have no control over, like Claude Desktop.
|
|
57
|
+
# * `insert_after_symbol`: Inserts content after the end of the definition of a given symbol.
|
|
58
|
+
# * `insert_at_line`: Inserts content at a given line in a file.
|
|
59
|
+
# * `insert_before_symbol`: Inserts content before the beginning of the definition of a given symbol.
|
|
60
|
+
# * `list_dir`: Lists files and directories in the given directory (optionally with recursion).
|
|
61
|
+
# * `list_memories`: Lists memories in Serena's project-specific memory store.
|
|
62
|
+
# * `onboarding`: Performs onboarding (identifying the project structure and essential tasks, e.g. for testing or building).
|
|
63
|
+
# * `prepare_for_new_conversation`: Provides instructions for preparing for a new conversation (in order to continue with the necessary context).
|
|
64
|
+
# * `read_file`: Reads a file within the project directory.
|
|
65
|
+
# * `read_memory`: Reads the memory with the given name from Serena's project-specific memory store.
|
|
66
|
+
# * `remove_project`: Removes a project from the Serena configuration.
|
|
67
|
+
# * `replace_lines`: Replaces a range of lines within a file with new content.
|
|
68
|
+
# * `replace_symbol_body`: Replaces the full definition of a symbol.
|
|
69
|
+
# * `restart_language_server`: Restarts the language server, may be necessary when edits not through Serena happen.
|
|
70
|
+
# * `search_for_pattern`: Performs a search for a pattern in the project.
|
|
71
|
+
# * `summarize_changes`: Provides instructions for summarizing the changes made to the codebase.
|
|
72
|
+
# * `switch_modes`: Activates modes by providing a list of their names
|
|
73
|
+
# * `think_about_collected_information`: Thinking tool for pondering the completeness of collected information.
|
|
74
|
+
# * `think_about_task_adherence`: Thinking tool for determining whether the agent is still on track with the current task.
|
|
75
|
+
# * `think_about_whether_you_are_done`: Thinking tool for determining whether the task is truly completed.
|
|
76
|
+
# * `write_memory`: Writes a named memory (for future reference) to Serena's project-specific memory store.
|
|
77
|
+
excluded_tools: []
|
|
78
|
+
|
|
79
|
+
# initial prompt for the project. It will always be given to the LLM upon activating the project
|
|
80
|
+
# (contrary to the memories, which are loaded on demand).
|
|
81
|
+
initial_prompt: ""
|
|
82
|
+
|
|
83
|
+
project_name: "opencite"
|
|
84
|
+
included_optional_tools: []
|
opencite-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
OpenCite is a Python CLI tool and library for academic literature search and citation management. It aggregates results from three academic APIs (Semantic Scholar, OpenAlex, PubMed), deduplicates them, and outputs results as formatted text, JSON, BibTeX, or CSV. It also supports PDF retrieval and PDF-to-markdown conversion.
|
|
8
|
+
|
|
9
|
+
## Build and Run
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
uv sync --extra dev # install package + dev deps
|
|
13
|
+
uv run opencite --version # verify CLI works
|
|
14
|
+
uv run opencite search "query" # search for papers
|
|
15
|
+
uv run opencite lookup DOI # look up a specific paper
|
|
16
|
+
uv run opencite cite DOI # citation graph
|
|
17
|
+
uv run opencite canonical "topic" # most-cited papers in a field
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Testing and Linting
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
uv run pytest # run all tests
|
|
24
|
+
uv run pytest tests/test_models.py -v # single test file
|
|
25
|
+
uv run pytest -k "test_doi" # single test by name
|
|
26
|
+
uv run pytest -m integration # API integration tests only
|
|
27
|
+
uv run ruff check src/ tests/ # lint
|
|
28
|
+
uv run ruff check --fix src/ tests/ # auto-fix lint
|
|
29
|
+
uv run ruff format src/ tests/ # format
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Architecture
|
|
33
|
+
|
|
34
|
+
### Package Layout (`src/opencite/`)
|
|
35
|
+
- `models.py` -- central data models: `Paper`, `Author`, `IDSet` (frozen), `Source`, `PDFLocation`, `SearchResult`, `CitationResult`, `parse_identifier()`
|
|
36
|
+
- `config.py` -- `Config` dataclass with `from_env()`, manual `.env` loading
|
|
37
|
+
- `exceptions.py` -- `OpenCiteError` hierarchy: `APIError`, `RateLimitError`, `APIKeyError`, etc.
|
|
38
|
+
- `cli.py` -- argparse with subcommands: search, lookup, cite, canonical, pdf, convert, ids
|
|
39
|
+
- `utils.py` -- title normalization, fuzzy matching, author name parsing, abstract reconstruction
|
|
40
|
+
- `clients/` -- per-API async clients with rate limiting (base.py, openalex.py, semantic_scholar.py, pubmed.py, id_converter.py)
|
|
41
|
+
- `search.py` -- `SearchOrchestrator` for parallel multi-source search with dedup/merge
|
|
42
|
+
- `citations.py` -- `CitationExplorer` for citation graph traversal and canonical paper discovery
|
|
43
|
+
- `dedup.py` -- DOI + fuzzy title deduplication with paper merging
|
|
44
|
+
- `bibtex.py` -- BibTeX fetch (S2 citationStyles, DOI negotiation) and generation
|
|
45
|
+
- `pdf.py` -- multi-source PDF retrieval pipeline
|
|
46
|
+
- `convert.py` -- PDF-to-markdown (markitdown, markit-mistral)
|
|
47
|
+
- `formatters/` -- output formatters (text, json, bibtex, csv)
|
|
48
|
+
|
|
49
|
+
### Key Design Patterns
|
|
50
|
+
- **IDSet** is frozen (immutable) and centralizes all identifier types (DOI, PMID, PMCID, OpenAlex, S2, ArXiv) for cross-API lookup
|
|
51
|
+
- **Paper.data_sources** tracks which APIs contributed data (provenance)
|
|
52
|
+
- **BaseClient** ABC provides rate limiting (token bucket), retry with backoff, httpx session management
|
|
53
|
+
- Rate limits: OpenAlex 100 req/sec, PubMed 10 req/sec, Semantic Scholar 1 req/sec
|
|
54
|
+
- PDF retrieval tries sources in priority: OpenAlex -> S2 -> PMC OA -> DOI negotiation
|
|
55
|
+
- PDF-to-markdown `auto` mode: if `MISTRAL_API_KEY` is set, use markit-mistral (better for math/complex layouts); otherwise fall back to markitdown (free)
|
|
56
|
+
- `scripts/lit_search.py` is the original prototype, kept as reference
|
|
57
|
+
|
|
58
|
+
### API Integrations
|
|
59
|
+
- **OpenAlex** -- `pyalex` library; broadest coverage (250M+ works); best for PDF URLs, filtering, citation counts
|
|
60
|
+
- **Semantic Scholar** -- `httpx` REST; TLDR summaries, SPECTER embeddings, batch 500 IDs, `citationStyles` for BibTeX
|
|
61
|
+
- **PubMed/PMC** -- NCBI eutils XML; MeSH terms, PMC full text, ID Converter API
|
|
62
|
+
|
|
63
|
+
## Environment Variables
|
|
64
|
+
|
|
65
|
+
Required in `.env` (gitignored):
|
|
66
|
+
- `SEMANTIC_SCHOLAR_API_KEY` -- Semantic Scholar API
|
|
67
|
+
- `PUBMED_API_KEY` -- NCBI/PubMed API
|
|
68
|
+
- `OPENALEX_API_KEY` -- OpenAlex API (required since Feb 2026)
|
|
69
|
+
- `MISTRAL_API_KEY` -- Mistral AI for PDF-to-markdown conversion (optional; when unset, `auto` mode falls back to markitdown)
|
|
70
|
+
|
|
71
|
+
## Dependencies
|
|
72
|
+
|
|
73
|
+
Core: `httpx`, `pyalex`
|
|
74
|
+
Optional [convert]: `markitdown`, `markit-mistral` (local, not on PyPI)
|
|
75
|
+
Dev: `pytest`, `pytest-asyncio`, `pytest-cov`, `ruff`, `pre-commit`
|
opencite-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Seyed Yahya Shirazi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
opencite-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: opencite
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Academic literature search, citation management, and PDF retrieval CLI
|
|
5
|
+
Project-URL: Repository, https://github.com/neuromechanist/opencite
|
|
6
|
+
Project-URL: Issues, https://github.com/neuromechanist/opencite/issues
|
|
7
|
+
Author-email: Seyed Yahya Shirazi <shirazi@ieee.org>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: academic,bibtex,citations,literature,openalex,pdf,pubmed,search,semantic-scholar
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Requires-Dist: httpx>=0.27.0
|
|
22
|
+
Requires-Dist: pyalex>=0.15
|
|
23
|
+
Provides-Extra: convert
|
|
24
|
+
Requires-Dist: markit-mistral; extra == 'convert'
|
|
25
|
+
Requires-Dist: markitdown>=0.1.0; extra == 'convert'
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pre-commit>=4.0.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff>=0.8.0; extra == 'dev'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# OpenCite
|
|
35
|
+
|
|
36
|
+
Academic literature search, citation management, and PDF retrieval CLI.
|
|
37
|
+
|
|
38
|
+
Searches Semantic Scholar, OpenAlex, and PubMed in parallel, deduplicates results, and supports BibTeX output, citation graph traversal, PDF retrieval, and PDF-to-markdown conversion.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
uv pip install -e .
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
With PDF conversion support:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
uv pip install -e ".[convert]"
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# Search for papers
|
|
56
|
+
opencite search "transformer attention mechanism"
|
|
57
|
+
|
|
58
|
+
# Look up a paper by DOI
|
|
59
|
+
opencite lookup 10.1038/nature12345
|
|
60
|
+
|
|
61
|
+
# Find most-cited papers in a field
|
|
62
|
+
opencite canonical "deep learning for neuroscience" --min-citations 500
|
|
63
|
+
|
|
64
|
+
# Get papers citing a specific work
|
|
65
|
+
opencite cite 10.1038/nature12345
|
|
66
|
+
|
|
67
|
+
# Download a PDF
|
|
68
|
+
opencite pdf 10.1038/nature12345 -o papers/
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Configuration
|
|
72
|
+
|
|
73
|
+
Set API keys in a `.env` file or as environment variables:
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
SEMANTIC_SCHOLAR_API_KEY=your_key
|
|
77
|
+
PUBMED_API_KEY=your_key
|
|
78
|
+
OPENALEX_API_KEY=your_key
|
|
79
|
+
MISTRAL_API_KEY=your_key # for PDF-to-markdown via Mistral OCR
|
|
80
|
+
```
|