citesentry 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- citesentry-0.1.1/.claude/settings.local.json +18 -0
- citesentry-0.1.1/.github/workflows/publish.yml +26 -0
- citesentry-0.1.1/CLAUDE.md +36 -0
- citesentry-0.1.1/PKG-INFO +201 -0
- citesentry-0.1.1/README.md +173 -0
- citesentry-0.1.1/citesentry/__init__.py +3 -0
- citesentry-0.1.1/citesentry/cache.py +70 -0
- citesentry-0.1.1/citesentry/checks/__init__.py +0 -0
- citesentry-0.1.1/citesentry/checks/existence.py +219 -0
- citesentry-0.1.1/citesentry/checks/relevance.py +176 -0
- citesentry-0.1.1/citesentry/checks/url_liveness.py +143 -0
- citesentry-0.1.1/citesentry/cli.py +193 -0
- citesentry-0.1.1/citesentry/config.py +46 -0
- citesentry-0.1.1/citesentry/core/__init__.py +4 -0
- citesentry-0.1.1/citesentry/core/cascade.py +17 -0
- citesentry-0.1.1/citesentry/core/engine.py +94 -0
- citesentry-0.1.1/citesentry/core/verdict.py +56 -0
- citesentry-0.1.1/citesentry/llm/__init__.py +3 -0
- citesentry-0.1.1/citesentry/llm/base.py +13 -0
- citesentry-0.1.1/citesentry/llm/deepseek.py +41 -0
- citesentry-0.1.1/citesentry/llm/mcp_sampling.py +17 -0
- citesentry-0.1.1/citesentry/mcp_server.py +156 -0
- citesentry-0.1.1/citesentry/models.py +81 -0
- citesentry-0.1.1/citesentry/parse/__init__.py +3 -0
- citesentry-0.1.1/citesentry/parse/bibtex.py +87 -0
- citesentry-0.1.1/citesentry/parse/csl_json.py +87 -0
- citesentry-0.1.1/citesentry/parse/detect.py +100 -0
- citesentry-0.1.1/citesentry/parse/doi_list.py +20 -0
- citesentry-0.1.1/citesentry/parse/nbib.py +87 -0
- citesentry-0.1.1/citesentry/parse/pdf_refs.py +47 -0
- citesentry-0.1.1/citesentry/parse/plaintext.py +329 -0
- citesentry-0.1.1/citesentry/parse/ris.py +75 -0
- citesentry-0.1.1/citesentry/sources/__init__.py +3 -0
- citesentry-0.1.1/citesentry/sources/arxiv.py +111 -0
- citesentry-0.1.1/citesentry/sources/base.py +19 -0
- citesentry-0.1.1/citesentry/sources/crossref.py +97 -0
- citesentry-0.1.1/citesentry/sources/domain/__init__.py +0 -0
- citesentry-0.1.1/citesentry/sources/domain/dblp.py +86 -0
- citesentry-0.1.1/citesentry/sources/domain/pubmed.py +153 -0
- citesentry-0.1.1/citesentry/sources/openalex.py +99 -0
- citesentry-0.1.1/citesentry/sources/semantic_scholar.py +73 -0
- citesentry-0.1.1/citesentry/sources/unpaywall.py +46 -0
- citesentry-0.1.1/pyproject.toml +49 -0
- citesentry-0.1.1/refsift_build_plan.md +407 -0
- citesentry-0.1.1/tests/__init__.py +0 -0
- citesentry-0.1.1/tests/fixtures/apa_style.txt +9 -0
- citesentry-0.1.1/tests/fixtures/fabricated.bib +37 -0
- citesentry-0.1.1/tests/fixtures/ieee_style.txt +9 -0
- citesentry-0.1.1/tests/fixtures/known_real.bib +39 -0
- citesentry-0.1.1/tests/fixtures/metadata_mismatch.bib +41 -0
- citesentry-0.1.1/tests/fixtures/pdf_copypaste.txt +12 -0
- citesentry-0.1.1/tests/fixtures/sample.json +24 -0
- citesentry-0.1.1/tests/fixtures/sample.nbib +26 -0
- citesentry-0.1.1/tests/fixtures/sample.ris +20 -0
- citesentry-0.1.1/tests/fixtures/urls.bib +36 -0
- citesentry-0.1.1/tests/test_checks.py +127 -0
- citesentry-0.1.1/tests/test_engine.py +129 -0
- citesentry-0.1.1/tests/test_parse.py +176 -0
- citesentry-0.1.1/tests/test_sources.py +148 -0
- citesentry-0.1.1/uv.lock +1879 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(pip install *)",
|
|
5
|
+
"Bash(pip index *)",
|
|
6
|
+
"Bash(python *)",
|
|
7
|
+
"Bash(refsift check-one *)",
|
|
8
|
+
"Bash(python3 -c \"import json,sys; d=json.load\\(sys.stdin\\); print\\(d['overall_verdict'], d['reference']['title']\\)\")",
|
|
9
|
+
"Bash(refsift check *)",
|
|
10
|
+
"Bash(python3 *)",
|
|
11
|
+
"Bash(xargs sed -i '' 's/from refsift\\\\./from citesentry./g; s/from refsift import/from citesentry import/g; s/import refsift\\\\./import citesentry./g')"
|
|
12
|
+
]
|
|
13
|
+
},
|
|
14
|
+
"enableAllProjectMcpServers": true,
|
|
15
|
+
"enabledMcpjsonServers": [
|
|
16
|
+
"semantic-scholar-mcp"
|
|
17
|
+
]
|
|
18
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build-and-publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment: pypi
|
|
12
|
+
permissions:
|
|
13
|
+
id-token: write # required for trusted publishing
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- uses: astral-sh/setup-uv@v4
|
|
19
|
+
with:
|
|
20
|
+
version: "latest"
|
|
21
|
+
|
|
22
|
+
- name: Build package
|
|
23
|
+
run: uv build
|
|
24
|
+
|
|
25
|
+
- name: Publish to PyPI
|
|
26
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# citesentry — Claude Code session notes
|
|
2
|
+
|
|
3
|
+
## Guardrails (non-negotiable)
|
|
4
|
+
|
|
5
|
+
- Never label a reference "fake" or "fraudulent" — only "could not verify / needs review."
|
|
6
|
+
- Never bypass CAPTCHA or bot-protection; classify as SKIPPED.
|
|
7
|
+
- Never hardcode API keys; read from env; degrade gracefully when absent.
|
|
8
|
+
- Core (`citesentry/core/`, `citesentry/checks/`, `citesentry/sources/`, `citesentry/parse/`) must never import Typer, Rich, or MCP.
|
|
9
|
+
- MCP server stdout must stay clean (JSON-RPC stream). Log to stderr only.
|
|
10
|
+
- Always send `mailto` to OpenAlex/Crossref; respect rate limits; cache aggressively.
|
|
11
|
+
- Report all counts honestly: checked, skipped, errored — never silently drop.
|
|
12
|
+
|
|
13
|
+
## Architecture
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
┌──────────────────────────────┐
|
|
17
|
+
bib/pdf/txt → │ citesentry.core (library) │ → VerificationReport (pydantic)
|
|
18
|
+
└──────────────────────────────┘
|
|
19
|
+
▲ ▲
|
|
20
|
+
│ │
|
|
21
|
+
citesentry.cli citesentry.mcp_server
|
|
22
|
+
(Typer + Rich) (FastMCP / stdio)
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
If verification logic ever appears inside `cli.py` or `mcp_server.py`, that is a bug — move it to core.
|
|
26
|
+
|
|
27
|
+
## LLM strategy
|
|
28
|
+
|
|
29
|
+
- MCP server: uses MCP sampling (`ctx.sample()`) — no API key needed.
|
|
30
|
+
- CLI: uses DeepSeek via OpenAI-compatible endpoint; requires `DEEPSEEK_API_KEY`.
|
|
31
|
+
- `--no-llm` skips relevance checks entirely; tool remains fully usable.
|
|
32
|
+
|
|
33
|
+
## Verdict wording
|
|
34
|
+
|
|
35
|
+
`NOT_FOUND` → "could not verify — likely fabricated, needs manual review"
|
|
36
|
+
Never use the word "fake."
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: citesentry
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Citation verification tool: existence, URL liveness, and content relevance checks
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: bibtexparser>=1.4
|
|
8
|
+
Requires-Dist: httpx>=0.27
|
|
9
|
+
Requires-Dist: mcp[cli]>=1.0
|
|
10
|
+
Requires-Dist: pdfminer-six>=20221105
|
|
11
|
+
Requires-Dist: platformdirs>=4
|
|
12
|
+
Requires-Dist: pydantic>=2
|
|
13
|
+
Requires-Dist: rapidfuzz>=3
|
|
14
|
+
Requires-Dist: rich>=13
|
|
15
|
+
Requires-Dist: rispy>=0.9
|
|
16
|
+
Requires-Dist: typer>=0.12
|
|
17
|
+
Provides-Extra: cli-llm
|
|
18
|
+
Requires-Dist: openai>=1.0; extra == 'cli-llm'
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
21
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
22
|
+
Requires-Dist: respx>=0.21; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
24
|
+
Provides-Extra: domain
|
|
25
|
+
Provides-Extra: pdf
|
|
26
|
+
Requires-Dist: refextract; extra == 'pdf'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# CiteSentry
|
|
30
|
+
|
|
31
|
+
[PyPI version](https://pypi.org/project/citesentry/)
|
|
32
|
+
[Supported Python versions](https://pypi.org/project/citesentry/)
|
|
33
|
+
[Publish workflow status](https://github.com/mkassaf/CiteSentry/actions/workflows/publish.yml)
|
|
34
|
+
|
|
35
|
+
Citation verification tool: check whether references actually exist, whether their URLs are live, and whether the content is relevant to the citation.
|
|
36
|
+
|
|
37
|
+
## What it does
|
|
38
|
+
|
|
39
|
+
Three checks per reference:
|
|
40
|
+
|
|
41
|
+
1. **Existence** — resolves against OpenAlex, Crossref, Semantic Scholar, arXiv, and domain-specific databases (PubMed for biomedical, DBLP for CS)
|
|
42
|
+
2. **URL liveness** — HTTP HEAD/GET check; classifies 2xx/4xx/timeout/bot-protection
|
|
43
|
+
3. **Content relevance** — LLM-backed check comparing fetched content to the cited title/topic (requires `DEEPSEEK_API_KEY` for CLI use)
|
|
44
|
+
|
|
45
|
+
Verdicts: `VERIFIED`, `METADATA_MISMATCH`, `DEAD_URL`, `CONTENT_DRIFT`, `NOT_FOUND`, `UNRESOLVABLE`.
|
|
46
|
+
|
|
47
|
+
`NOT_FOUND` means "could not verify — likely fabricated, needs manual review." Never "fake."
|
|
48
|
+
|
|
49
|
+
## Install
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install citesentry # basic install
|
|
53
|
+
pip install "citesentry[cli-llm]" # + DeepSeek for relevance checks
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
For development:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
git clone https://github.com/mkassaf/CiteSentry
|
|
60
|
+
cd CiteSentry
|
|
61
|
+
pip install -e ".[dev]"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## CLI usage
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Check a BibTeX file
|
|
68
|
+
citesentry check refs.bib
|
|
69
|
+
|
|
70
|
+
# Check a RIS/CSL-JSON/NBIB/plaintext file
|
|
71
|
+
citesentry check refs.ris
|
|
72
|
+
citesentry check refs.json
|
|
73
|
+
|
|
74
|
+
# Read from stdin
|
|
75
|
+
cat refs.txt | citesentry check -
|
|
76
|
+
|
|
77
|
+
# Single ad-hoc reference
|
|
78
|
+
citesentry check-one "Vaswani et al. (2017). Attention is all you need. NeurIPS."
|
|
79
|
+
|
|
80
|
+
# Output formats: table (default), json, md
|
|
81
|
+
citesentry check refs.bib --format json
|
|
82
|
+
citesentry check refs.bib --format md > report.md
|
|
83
|
+
|
|
84
|
+
# Skip checks
|
|
85
|
+
citesentry check refs.bib --no-llm # skip relevance (no API key needed)
|
|
86
|
+
citesentry check refs.bib --no-url # skip URL liveness
|
|
87
|
+
|
|
88
|
+
# Domain adapters (auto by default)
|
|
89
|
+
citesentry check refs.bib --domain pubmed # force PubMed only
|
|
90
|
+
citesentry check refs.bib --domain none # disable domain adapters
|
|
91
|
+
|
|
92
|
+
# Override plaintext style detection
|
|
93
|
+
citesentry check refs.txt --style ieee
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Exit code is non-zero if any reference is `NOT_FOUND` or `DEAD_URL` (useful in CI).
|
|
97
|
+
|
|
98
|
+
## MCP server (Claude Desktop / Claude Code)
|
|
99
|
+
|
|
100
|
+
Add to your `claude_desktop_config.json`:
|
|
101
|
+
|
|
102
|
+
```json
|
|
103
|
+
{
|
|
104
|
+
"mcpServers": {
|
|
105
|
+
"citesentry": {
|
|
106
|
+
"command": "citesentry-mcp",
|
|
107
|
+
"env": {
|
|
108
|
+
"CITESENTRY_MAILTO": "you@example.com",
|
|
109
|
+
"DEEPSEEK_API_KEY": "sk-..."
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Or with `uvx` (no prior install needed):
|
|
117
|
+
|
|
118
|
+
```json
|
|
119
|
+
{
|
|
120
|
+
"mcpServers": {
|
|
121
|
+
"citesentry": {
|
|
122
|
+
"command": "uvx",
|
|
123
|
+
"args": ["--from", "citesentry", "citesentry-mcp"],
|
|
124
|
+
"env": { "CITESENTRY_MAILTO": "you@example.com" }
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
MCP tools exposed:
|
|
131
|
+
- `verify_reference(reference, check_url, check_relevance)` — single reference
|
|
132
|
+
- `verify_reference_list(references, format, check_url, check_relevance)` — batch
|
|
133
|
+
- `check_url_alive(url)` — standalone URL check
|
|
134
|
+
|
|
135
|
+
### Claude Code (CLI)
|
|
136
|
+
|
|
137
|
+
Register the server once:
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
claude mcp add citesentry \
|
|
141
|
+
-e CITESENTRY_MAILTO=you@example.com \
|
|
142
|
+
-- uvx --from citesentry citesentry-mcp
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Then in any Claude Code session, ask naturally:
|
|
146
|
+
|
|
147
|
+
> "Use citesentry to verify this reference: Vaswani et al. (2017). Attention is all you need. NeurIPS."
|
|
148
|
+
|
|
149
|
+
> "Check whether all the references in refs.bib are real."
|
|
150
|
+
|
|
151
|
+
> "Is https://arxiv.org/abs/1706.03762 still live?"
|
|
152
|
+
|
|
153
|
+
### Any MCP-compatible agent (Python example)
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
import asyncio
|
|
157
|
+
from mcp import ClientSession, StdioServerParameters
|
|
158
|
+
from mcp.client.stdio import stdio_client
|
|
159
|
+
|
|
160
|
+
server = StdioServerParameters(
|
|
161
|
+
command="uvx",
|
|
162
|
+
args=["--from", "citesentry", "citesentry-mcp"],
|
|
163
|
+
env={"CITESENTRY_MAILTO": "you@example.com"},
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
async def main():
|
|
167
|
+
async with stdio_client(server) as (read, write):
|
|
168
|
+
async with ClientSession(read, write) as session:
|
|
169
|
+
await session.initialize()
|
|
170
|
+
|
|
171
|
+
result = await session.call_tool(
|
|
172
|
+
"verify_reference",
|
|
173
|
+
{"reference": "Vaswani et al. (2017). Attention is all you need. NeurIPS."},
|
|
174
|
+
)
|
|
175
|
+
print(result.content[0].text)
|
|
176
|
+
|
|
177
|
+
asyncio.run(main())
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Environment variables
|
|
181
|
+
|
|
182
|
+
| Variable | Default | Description |
|
|
183
|
+
|---|---|---|
|
|
184
|
+
| `CITESENTRY_MAILTO` | `citesentry@example.com` | Polite email for OpenAlex/Crossref API |
|
|
185
|
+
| `DEEPSEEK_API_KEY` | — | Required for relevance checks in CLI |
|
|
186
|
+
| `DEEPSEEK_BASE_URL` | `https://api.deepseek.com/v1` | OpenAI-compatible endpoint |
|
|
187
|
+
| `DEEPSEEK_MODEL` | `deepseek-chat` | Model for relevance judgments |
|
|
188
|
+
|
|
189
|
+
## Supported input formats
|
|
190
|
+
|
|
191
|
+
- BibTeX (`.bib`) — via bibtexparser
|
|
192
|
+
- RIS (`.ris`) — via rispy; covers Zotero, Mendeley, EndNote, Web of Science
|
|
193
|
+
- CSL JSON (`.json`) — Zotero exports
|
|
194
|
+
- PubMed NBIB (`.nbib`)
|
|
195
|
+
- DOI list (`.txt` with one DOI per line)
|
|
196
|
+
- Plaintext reference sections — IEEE, APA, Vancouver, MLA, Chicago; auto-detected
|
|
197
|
+
- PDF (`.pdf`) — extracts reference section text via pdfminer.six
|
|
198
|
+
|
|
199
|
+
## Caching
|
|
200
|
+
|
|
201
|
+
Results are cached in a SQLite database (`~/.cache/citesentry/cache.db`). Pass `--no-cache` to bypass.
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# CiteSentry
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/citesentry/)
|
|
4
|
+
[Supported Python versions](https://pypi.org/project/citesentry/)
|
|
5
|
+
[Publish workflow status](https://github.com/mkassaf/CiteSentry/actions/workflows/publish.yml)
|
|
6
|
+
|
|
7
|
+
Citation verification tool: check whether references actually exist, whether their URLs are live, and whether the content is relevant to the citation.
|
|
8
|
+
|
|
9
|
+
## What it does
|
|
10
|
+
|
|
11
|
+
Three checks per reference:
|
|
12
|
+
|
|
13
|
+
1. **Existence** — resolves against OpenAlex, Crossref, Semantic Scholar, arXiv, and domain-specific databases (PubMed for biomedical, DBLP for CS)
|
|
14
|
+
2. **URL liveness** — HTTP HEAD/GET check; classifies 2xx/4xx/timeout/bot-protection
|
|
15
|
+
3. **Content relevance** — LLM-backed check comparing fetched content to the cited title/topic (requires `DEEPSEEK_API_KEY` for CLI use)
|
|
16
|
+
|
|
17
|
+
Verdicts: `VERIFIED`, `METADATA_MISMATCH`, `DEAD_URL`, `CONTENT_DRIFT`, `NOT_FOUND`, `UNRESOLVABLE`.
|
|
18
|
+
|
|
19
|
+
`NOT_FOUND` means "could not verify — likely fabricated, needs manual review." Never "fake."
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install citesentry # basic install
|
|
25
|
+
pip install "citesentry[cli-llm]" # + DeepSeek for relevance checks
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
For development:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
git clone https://github.com/mkassaf/CiteSentry
|
|
32
|
+
cd CiteSentry
|
|
33
|
+
pip install -e ".[dev]"
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## CLI usage
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Check a BibTeX file
|
|
40
|
+
citesentry check refs.bib
|
|
41
|
+
|
|
42
|
+
# Check a RIS/CSL-JSON/NBIB/plaintext file
|
|
43
|
+
citesentry check refs.ris
|
|
44
|
+
citesentry check refs.json
|
|
45
|
+
|
|
46
|
+
# Read from stdin
|
|
47
|
+
cat refs.txt | citesentry check -
|
|
48
|
+
|
|
49
|
+
# Single ad-hoc reference
|
|
50
|
+
citesentry check-one "Vaswani et al. (2017). Attention is all you need. NeurIPS."
|
|
51
|
+
|
|
52
|
+
# Output formats: table (default), json, md
|
|
53
|
+
citesentry check refs.bib --format json
|
|
54
|
+
citesentry check refs.bib --format md > report.md
|
|
55
|
+
|
|
56
|
+
# Skip checks
|
|
57
|
+
citesentry check refs.bib --no-llm # skip relevance (no API key needed)
|
|
58
|
+
citesentry check refs.bib --no-url # skip URL liveness
|
|
59
|
+
|
|
60
|
+
# Domain adapters (auto by default)
|
|
61
|
+
citesentry check refs.bib --domain pubmed # force PubMed only
|
|
62
|
+
citesentry check refs.bib --domain none # disable domain adapters
|
|
63
|
+
|
|
64
|
+
# Override plaintext style detection
|
|
65
|
+
citesentry check refs.txt --style ieee
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Exit code is non-zero if any reference is `NOT_FOUND` or `DEAD_URL` (useful in CI).
|
|
69
|
+
|
|
70
|
+
## MCP server (Claude Desktop / Claude Code)
|
|
71
|
+
|
|
72
|
+
Add to your `claude_desktop_config.json`:
|
|
73
|
+
|
|
74
|
+
```json
|
|
75
|
+
{
|
|
76
|
+
"mcpServers": {
|
|
77
|
+
"citesentry": {
|
|
78
|
+
"command": "citesentry-mcp",
|
|
79
|
+
"env": {
|
|
80
|
+
"CITESENTRY_MAILTO": "you@example.com",
|
|
81
|
+
"DEEPSEEK_API_KEY": "sk-..."
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Or with `uvx` (no prior install needed):
|
|
89
|
+
|
|
90
|
+
```json
|
|
91
|
+
{
|
|
92
|
+
"mcpServers": {
|
|
93
|
+
"citesentry": {
|
|
94
|
+
"command": "uvx",
|
|
95
|
+
"args": ["--from", "citesentry", "citesentry-mcp"],
|
|
96
|
+
"env": { "CITESENTRY_MAILTO": "you@example.com" }
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
MCP tools exposed:
|
|
103
|
+
- `verify_reference(reference, check_url, check_relevance)` — single reference
|
|
104
|
+
- `verify_reference_list(references, format, check_url, check_relevance)` — batch
|
|
105
|
+
- `check_url_alive(url)` — standalone URL check
|
|
106
|
+
|
|
107
|
+
### Claude Code (CLI)
|
|
108
|
+
|
|
109
|
+
Register the server once:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
claude mcp add citesentry \
|
|
113
|
+
-e CITESENTRY_MAILTO=you@example.com \
|
|
114
|
+
-- uvx --from citesentry citesentry-mcp
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Then in any Claude Code session, ask naturally:
|
|
118
|
+
|
|
119
|
+
> "Use citesentry to verify this reference: Vaswani et al. (2017). Attention is all you need. NeurIPS."
|
|
120
|
+
|
|
121
|
+
> "Check whether all the references in refs.bib are real."
|
|
122
|
+
|
|
123
|
+
> "Is https://arxiv.org/abs/1706.03762 still live?"
|
|
124
|
+
|
|
125
|
+
### Any MCP-compatible agent (Python example)
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
import asyncio
|
|
129
|
+
from mcp import ClientSession, StdioServerParameters
|
|
130
|
+
from mcp.client.stdio import stdio_client
|
|
131
|
+
|
|
132
|
+
server = StdioServerParameters(
|
|
133
|
+
command="uvx",
|
|
134
|
+
args=["--from", "citesentry", "citesentry-mcp"],
|
|
135
|
+
env={"CITESENTRY_MAILTO": "you@example.com"},
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
async def main():
|
|
139
|
+
async with stdio_client(server) as (read, write):
|
|
140
|
+
async with ClientSession(read, write) as session:
|
|
141
|
+
await session.initialize()
|
|
142
|
+
|
|
143
|
+
result = await session.call_tool(
|
|
144
|
+
"verify_reference",
|
|
145
|
+
{"reference": "Vaswani et al. (2017). Attention is all you need. NeurIPS."},
|
|
146
|
+
)
|
|
147
|
+
print(result.content[0].text)
|
|
148
|
+
|
|
149
|
+
asyncio.run(main())
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Environment variables
|
|
153
|
+
|
|
154
|
+
| Variable | Default | Description |
|
|
155
|
+
|---|---|---|
|
|
156
|
+
| `CITESENTRY_MAILTO` | `citesentry@example.com` | Polite email for OpenAlex/Crossref API |
|
|
157
|
+
| `DEEPSEEK_API_KEY` | — | Required for relevance checks in CLI |
|
|
158
|
+
| `DEEPSEEK_BASE_URL` | `https://api.deepseek.com/v1` | OpenAI-compatible endpoint |
|
|
159
|
+
| `DEEPSEEK_MODEL` | `deepseek-chat` | Model for relevance judgments |
|
|
160
|
+
|
|
161
|
+
## Supported input formats
|
|
162
|
+
|
|
163
|
+
- BibTeX (`.bib`) — via bibtexparser
|
|
164
|
+
- RIS (`.ris`) — via rispy; covers Zotero, Mendeley, EndNote, Web of Science
|
|
165
|
+
- CSL JSON (`.json`) — Zotero exports
|
|
166
|
+
- PubMed NBIB (`.nbib`)
|
|
167
|
+
- DOI list (`.txt` with one DOI per line)
|
|
168
|
+
- Plaintext reference sections — IEEE, APA, Vancouver, MLA, Chicago; auto-detected
|
|
169
|
+
- PDF (`.pdf`) — extracts reference section text via pdfminer.six
|
|
170
|
+
|
|
171
|
+
## Caching
|
|
172
|
+
|
|
173
|
+
Results are cached in a SQLite database (`~/.cache/citesentry/cache.db`). Pass `--no-cache` to bypass.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import sqlite3
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Cache:
    """Persistent key/value cache kept in a single SQLite table.

    Values are stored as JSON text. Each row is keyed by the SHA-256 hex
    digest of ``"<namespace>:<identifier>"``. A ``created_at`` timestamp is
    written with every row; nothing in this class reads it back.
    """

    def __init__(self, path: Path) -> None:
        """Open (or create) the SQLite database at *path*.

        The parent directory is created if it is missing, and the ``cache``
        table is created if it does not exist yet.
        """
        self._path = path
        db_dir = self._path.parent
        db_dir.mkdir(parents=True, exist_ok=True)
        # check_same_thread=False lets the connection be used from threads
        # other than the one that created it.
        self._conn = sqlite3.connect(str(path), check_same_thread=False)
        self._init()

    def _init(self) -> None:
        """Create the ``cache`` table when absent and commit."""
        schema = """CREATE TABLE IF NOT EXISTS cache (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL,
                created_at REAL NOT NULL
            )"""
        self._conn.execute(schema)
        self._conn.commit()

    @staticmethod
    def _key(namespace: str, identifier: str) -> str:
        """Return the SHA-256 hex digest of ``"<namespace>:<identifier>"``."""
        raw = f"{namespace}:{identifier}".encode()
        return hashlib.sha256(raw).hexdigest()

    def get(self, namespace: str, identifier: str) -> Any | None:
        """Return the decoded value stored for the pair, or ``None`` if absent."""
        lookup = (self._key(namespace, identifier),)
        cursor = self._conn.execute(
            "SELECT value FROM cache WHERE key = ?", lookup
        )
        hit = cursor.fetchone()
        return None if hit is None else json.loads(hit[0])

    def set(self, namespace: str, identifier: str, value: Any) -> None:
        """Store *value* (JSON-encoded) for the pair, replacing any existing row."""
        digest = self._key(namespace, identifier)
        # Encode before touching the database so an unserialisable value
        # raises without issuing any statement.
        record = (digest, json.dumps(value), time.time())
        self._conn.execute(
            "INSERT OR REPLACE INTO cache (key, value, created_at) VALUES (?, ?, ?)",
            record,
        )
        self._conn.commit()

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Module-level singleton: created lazily by get_cache() and cleared by reset_cache().
_cache: Cache | None = None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_cache(path: Path | None = None) -> Cache:
    """Return the shared :class:`Cache`, creating it on first use.

    *path* is only consulted when no cache exists yet; if it is falsy the
    location comes from ``get_settings().cache_path``. Once a cache has been
    created, later calls return that same instance and ignore *path*.
    """
    global _cache
    if _cache is not None:
        return _cache

    # Imported lazily, only on the first call that actually builds the cache.
    from citesentry.config import get_settings

    location = path if path else get_settings().cache_path
    _cache = Cache(location)
    return _cache
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def reset_cache() -> None:
    """Close the shared cache, if one exists, and forget it.

    The next call to ``get_cache`` will build a new instance.
    """
    global _cache
    if _cache is None:
        return
    _cache.close()
    _cache = None
|
|
File without changes
|