mcp-server-wayback 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_server_wayback-0.1.0/LICENSE +21 -0
- mcp_server_wayback-0.1.0/PKG-INFO +158 -0
- mcp_server_wayback-0.1.0/README.md +129 -0
- mcp_server_wayback-0.1.0/mcp_server_wayback.egg-info/PKG-INFO +158 -0
- mcp_server_wayback-0.1.0/mcp_server_wayback.egg-info/SOURCES.txt +34 -0
- mcp_server_wayback-0.1.0/mcp_server_wayback.egg-info/dependency_links.txt +1 -0
- mcp_server_wayback-0.1.0/mcp_server_wayback.egg-info/entry_points.txt +2 -0
- mcp_server_wayback-0.1.0/mcp_server_wayback.egg-info/requires.txt +4 -0
- mcp_server_wayback-0.1.0/mcp_server_wayback.egg-info/top_level.txt +1 -0
- mcp_server_wayback-0.1.0/pyproject.toml +73 -0
- mcp_server_wayback-0.1.0/setup.cfg +4 -0
- mcp_server_wayback-0.1.0/src/__init__.py +0 -0
- mcp_server_wayback-0.1.0/src/client/__init__.py +0 -0
- mcp_server_wayback-0.1.0/src/client/cache.py +58 -0
- mcp_server_wayback-0.1.0/src/client/cdx.py +48 -0
- mcp_server_wayback-0.1.0/src/client/extractor.py +90 -0
- mcp_server_wayback-0.1.0/src/client/http.py +73 -0
- mcp_server_wayback-0.1.0/src/client/parsers.py +84 -0
- mcp_server_wayback-0.1.0/src/client/rate_limiter.py +46 -0
- mcp_server_wayback-0.1.0/src/config.py +45 -0
- mcp_server_wayback-0.1.0/src/models.py +89 -0
- mcp_server_wayback-0.1.0/src/server.py +204 -0
- mcp_server_wayback-0.1.0/src/tools/__init__.py +0 -0
- mcp_server_wayback-0.1.0/src/tools/content.py +66 -0
- mcp_server_wayback-0.1.0/src/tools/search.py +73 -0
- mcp_server_wayback-0.1.0/src/tools/snapshots.py +51 -0
- mcp_server_wayback-0.1.0/tests/test_auth.py +134 -0
- mcp_server_wayback-0.1.0/tests/test_cache.py +218 -0
- mcp_server_wayback-0.1.0/tests/test_client.py +66 -0
- mcp_server_wayback-0.1.0/tests/test_content.py +313 -0
- mcp_server_wayback-0.1.0/tests/test_e2e_stdio.py +86 -0
- mcp_server_wayback-0.1.0/tests/test_extractor.py +167 -0
- mcp_server_wayback-0.1.0/tests/test_prompts.py +62 -0
- mcp_server_wayback-0.1.0/tests/test_resources.py +53 -0
- mcp_server_wayback-0.1.0/tests/test_search.py +383 -0
- mcp_server_wayback-0.1.0/tests/test_snapshots.py +154 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Lakshya Mehta
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mcp-server-wayback
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP server giving Claude and other LLM clients structured access to the Internet Archive's Wayback Machine.
|
|
5
|
+
Author-email: Lakshya Mehta <lakshyamehta03@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/lakshyamehta03/wayback-machine-mcp
|
|
8
|
+
Project-URL: Repository, https://github.com/lakshyamehta03/wayback-machine-mcp
|
|
9
|
+
Project-URL: Issues, https://github.com/lakshyamehta03/wayback-machine-mcp/issues
|
|
10
|
+
Keywords: mcp,model-context-protocol,internet-archive,wayback-machine,archive,claude,anthropic,llm
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: mcp>=1.0
|
|
25
|
+
Requires-Dist: httpx>=0.27
|
|
26
|
+
Requires-Dist: pydantic>=2.0
|
|
27
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
<div align="center">
|
|
31
|
+
|
|
32
|
+
<img src="https://raw.githubusercontent.com/lakshyamehta03/wayback-machine-mcp/main/Wayback_Machine_logo_2010.svg" alt="Wayback Machine" width="420" />
|
|
33
|
+
|
|
34
|
+
# wayback-mcp
|
|
35
|
+
|
|
36
|
+
**A Model Context Protocol server giving Claude structured access to the Internet Archive's Wayback Machine.**
|
|
37
|
+
|
|
38
|
+
[](https://github.com/lakshyamehta03/wayback-machine-mcp/actions/workflows/test.yml)
|
|
39
|
+
[](https://www.python.org/downloads/)
|
|
40
|
+
[](https://modelcontextprotocol.io/)
|
|
41
|
+
[](https://github.com/astral-sh/uv)
|
|
42
|
+
|
|
43
|
+
</div>
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Overview
|
|
48
|
+
|
|
49
|
+
`wayback-mcp` is an async Python MCP server that exposes the Internet Archive's five core APIs — Availability, CDX, Advanced Search, Metadata, and Wayback content — as first-class tools, prompts, and resources for Claude. It handles rate limiting, retry/back-off, and response shape normalisation so the model only sees structured Pydantic data.
|
|
50
|
+
|
|
51
|
+
## Features
|
|
52
|
+
|
|
53
|
+
- **Six MCP tools** covering availability checks, snapshot lookups, full-text item search, domain crawls, page-text extraction, and item metadata
|
|
54
|
+
- **Four guided prompts** — `research_topic`, `track_site_changes`, `audit_link_rot`, `setup_authentication`
|
|
55
|
+
- **One MCP resource** — `wayback://item/{identifier}` exposes IA item metadata as JSON
|
|
56
|
+
- **Async token-bucket rate limiter** with per-endpoint buckets and `Retry-After` honoring
|
|
57
|
+
- **In-memory response cache** with per-endpoint TTLs to keep token usage and IA load low
|
|
58
|
+
- **Internet Archive S3 authentication** (optional) for higher rate-limit ceilings
|
|
59
|
+
- **Structured error model** — expected failures return `ToolError`; unexpected ones raise
|
|
60
|
+
- **Tested against live IA APIs** via an opt-in `--integration` pytest flag
|
|
61
|
+
|
|
62
|
+
## Installation
|
|
63
|
+
|
|
64
|
+
Requires Python 3.11+.
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install mcp-server-wayback
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
> _Once published to PyPI._ Until then, see [Development](#development) for the from-source workflow.
|
|
71
|
+
|
|
72
|
+
## Usage
|
|
73
|
+
|
|
74
|
+
### Wire it into Claude Desktop
|
|
75
|
+
|
|
76
|
+
Add an entry to `claude_desktop_config.json` (on macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`):
|
|
77
|
+
|
|
78
|
+
```json
|
|
79
|
+
{
|
|
80
|
+
"mcpServers": {
|
|
81
|
+
"wayback": {
|
|
82
|
+
"command": "mcp-server-wayback"
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Restart Claude Desktop. The `wayback` tools, prompts, and resources will appear in the MCP picker.
|
|
89
|
+
|
|
90
|
+
If you prefer not to install globally, run it on demand with [`uvx`](https://github.com/astral-sh/uv):
|
|
91
|
+
|
|
92
|
+
```json
|
|
93
|
+
{
|
|
94
|
+
"mcpServers": {
|
|
95
|
+
"wayback": {
|
|
96
|
+
"command": "uvx",
|
|
97
|
+
"args": ["mcp-server-wayback"]
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Optional: Internet Archive authentication
|
|
104
|
+
|
|
105
|
+
Set both keys in the server's environment to authenticate every IA request and raise your rate-limit ceiling. Run the `setup_authentication` prompt from Claude to walk through it interactively.
|
|
106
|
+
|
|
107
|
+
```json
|
|
108
|
+
"env": {
|
|
109
|
+
"WAYBACK_MCP_IA_ACCESS_KEY": "<your access key>",
|
|
110
|
+
"WAYBACK_MCP_IA_SECRET_KEY": "<your secret key>"
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Get keys at <https://archive.org/account/s3.php>.
|
|
115
|
+
|
|
116
|
+
## Tools
|
|
117
|
+
|
|
118
|
+
| Tool | Purpose |
|
|
119
|
+
|---|---|
|
|
120
|
+
| `check_availability` | Is this URL archived? Returns the closest snapshot |
|
|
121
|
+
| `lookup_snapshots` | List CDX snapshots for a URL with date / status filters |
|
|
122
|
+
| `search_archive` | Lucene search across IA collections with mediatype + year range |
|
|
123
|
+
| `search_domain` | Discover archived URLs under a domain or path prefix |
|
|
124
|
+
| `get_snapshot_content` | Fetch an archived page and extract its readable text |
|
|
125
|
+
| `get_item_metadata` | Rich structured metadata for any IA item identifier |
|
|
126
|
+
|
|
127
|
+
## Prompts
|
|
128
|
+
|
|
129
|
+
| Prompt | What it does |
|
|
130
|
+
|---|---|
|
|
131
|
+
| `research_topic` | Multi-mediatype IA search → synthesised topic overview |
|
|
132
|
+
| `track_site_changes` | Sample snapshots over time → narrate how a page evolved |
|
|
133
|
+
| `audit_link_rot` | Bulk-check URLs and surface archived alternatives |
|
|
134
|
+
| `setup_authentication` | Walks the user through configuring IA S3 keys |
|
|
135
|
+
|
|
136
|
+
## Development
|
|
137
|
+
|
|
138
|
+
Requires Python 3.11+ and [`uv`](https://github.com/astral-sh/uv).
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
git clone https://github.com/lakshyamehta03/wayback-machine-mcp.git
|
|
142
|
+
cd wayback-machine-mcp
|
|
143
|
+
uv sync
|
|
144
|
+
uv run mcp-server-wayback # run the server
|
|
145
|
+
uv run pytest # unit tests (httpx mocked via respx)
|
|
146
|
+
uv run pytest --integration # also hit live Internet Archive APIs
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
CI runs the unit suite on every push and pull request via GitHub Actions.
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
[MIT](LICENSE). The Wayback Machine logo is © Internet Archive and used here under fair use to identify the upstream service this project integrates with.
|
|
154
|
+
|
|
155
|
+
## Acknowledgments
|
|
156
|
+
|
|
157
|
+
- The [Internet Archive](https://archive.org/) for the Wayback Machine and the open APIs that make this server possible
|
|
158
|
+
- [Anthropic](https://www.anthropic.com/) for the [Model Context Protocol](https://modelcontextprotocol.io/) specification and SDK
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<img src="https://raw.githubusercontent.com/lakshyamehta03/wayback-machine-mcp/main/Wayback_Machine_logo_2010.svg" alt="Wayback Machine" width="420" />
|
|
4
|
+
|
|
5
|
+
# wayback-mcp
|
|
6
|
+
|
|
7
|
+
**A Model Context Protocol server giving Claude structured access to the Internet Archive's Wayback Machine.**
|
|
8
|
+
|
|
9
|
+
[](https://github.com/lakshyamehta03/wayback-machine-mcp/actions/workflows/test.yml)
|
|
10
|
+
[](https://www.python.org/downloads/)
|
|
11
|
+
[](https://modelcontextprotocol.io/)
|
|
12
|
+
[](https://github.com/astral-sh/uv)
|
|
13
|
+
|
|
14
|
+
</div>
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Overview
|
|
19
|
+
|
|
20
|
+
`wayback-mcp` is an async Python MCP server that exposes the Internet Archive's five core APIs — Availability, CDX, Advanced Search, Metadata, and Wayback content — as first-class tools, prompts, and resources for Claude. It handles rate limiting, retry/back-off, and response shape normalisation so the model only sees structured Pydantic data.
|
|
21
|
+
|
|
22
|
+
## Features
|
|
23
|
+
|
|
24
|
+
- **Six MCP tools** covering availability checks, snapshot lookups, full-text item search, domain crawls, page-text extraction, and item metadata
|
|
25
|
+
- **Four guided prompts** — `research_topic`, `track_site_changes`, `audit_link_rot`, `setup_authentication`
|
|
26
|
+
- **One MCP resource** — `wayback://item/{identifier}` exposes IA item metadata as JSON
|
|
27
|
+
- **Async token-bucket rate limiter** with per-endpoint buckets and `Retry-After` honoring
|
|
28
|
+
- **In-memory response cache** with per-endpoint TTLs to keep token usage and IA load low
|
|
29
|
+
- **Internet Archive S3 authentication** (optional) for higher rate-limit ceilings
|
|
30
|
+
- **Structured error model** — expected failures return `ToolError`; unexpected ones raise
|
|
31
|
+
- **Tested against live IA APIs** via an opt-in `--integration` pytest flag
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
Requires Python 3.11+.
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install mcp-server-wayback
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
> _Once published to PyPI._ Until then, see [Development](#development) for the from-source workflow.
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
### Wire it into Claude Desktop
|
|
46
|
+
|
|
47
|
+
Add an entry to `claude_desktop_config.json` (on macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`):
|
|
48
|
+
|
|
49
|
+
```json
|
|
50
|
+
{
|
|
51
|
+
"mcpServers": {
|
|
52
|
+
"wayback": {
|
|
53
|
+
"command": "mcp-server-wayback"
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Restart Claude Desktop. The `wayback` tools, prompts, and resources will appear in the MCP picker.
|
|
60
|
+
|
|
61
|
+
If you prefer not to install globally, run it on demand with [`uvx`](https://github.com/astral-sh/uv):
|
|
62
|
+
|
|
63
|
+
```json
|
|
64
|
+
{
|
|
65
|
+
"mcpServers": {
|
|
66
|
+
"wayback": {
|
|
67
|
+
"command": "uvx",
|
|
68
|
+
"args": ["mcp-server-wayback"]
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Optional: Internet Archive authentication
|
|
75
|
+
|
|
76
|
+
Set both keys in the server's environment to authenticate every IA request and raise your rate-limit ceiling. Run the `setup_authentication` prompt from Claude to walk through it interactively.
|
|
77
|
+
|
|
78
|
+
```json
|
|
79
|
+
"env": {
|
|
80
|
+
"WAYBACK_MCP_IA_ACCESS_KEY": "<your access key>",
|
|
81
|
+
"WAYBACK_MCP_IA_SECRET_KEY": "<your secret key>"
|
|
82
|
+
}
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Get keys at <https://archive.org/account/s3.php>.
|
|
86
|
+
|
|
87
|
+
## Tools
|
|
88
|
+
|
|
89
|
+
| Tool | Purpose |
|
|
90
|
+
|---|---|
|
|
91
|
+
| `check_availability` | Is this URL archived? Returns the closest snapshot |
|
|
92
|
+
| `lookup_snapshots` | List CDX snapshots for a URL with date / status filters |
|
|
93
|
+
| `search_archive` | Lucene search across IA collections with mediatype + year range |
|
|
94
|
+
| `search_domain` | Discover archived URLs under a domain or path prefix |
|
|
95
|
+
| `get_snapshot_content` | Fetch an archived page and extract its readable text |
|
|
96
|
+
| `get_item_metadata` | Rich structured metadata for any IA item identifier |
|
|
97
|
+
|
|
98
|
+
## Prompts
|
|
99
|
+
|
|
100
|
+
| Prompt | What it does |
|
|
101
|
+
|---|---|
|
|
102
|
+
| `research_topic` | Multi-mediatype IA search → synthesised topic overview |
|
|
103
|
+
| `track_site_changes` | Sample snapshots over time → narrate how a page evolved |
|
|
104
|
+
| `audit_link_rot` | Bulk-check URLs and surface archived alternatives |
|
|
105
|
+
| `setup_authentication` | Walks the user through configuring IA S3 keys |
|
|
106
|
+
|
|
107
|
+
## Development
|
|
108
|
+
|
|
109
|
+
Requires Python 3.11+ and [`uv`](https://github.com/astral-sh/uv).
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
git clone https://github.com/lakshyamehta03/wayback-machine-mcp.git
|
|
113
|
+
cd wayback-machine-mcp
|
|
114
|
+
uv sync
|
|
115
|
+
uv run mcp-server-wayback # run the server
|
|
116
|
+
uv run pytest # unit tests (httpx mocked via respx)
|
|
117
|
+
uv run pytest --integration # also hit live Internet Archive APIs
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
CI runs the unit suite on every push and pull request via GitHub Actions.
|
|
121
|
+
|
|
122
|
+
## License
|
|
123
|
+
|
|
124
|
+
[MIT](LICENSE). The Wayback Machine logo is © Internet Archive and used here under fair use to identify the upstream service this project integrates with.
|
|
125
|
+
|
|
126
|
+
## Acknowledgments
|
|
127
|
+
|
|
128
|
+
- The [Internet Archive](https://archive.org/) for the Wayback Machine and the open APIs that make this server possible
|
|
129
|
+
- [Anthropic](https://www.anthropic.com/) for the [Model Context Protocol](https://modelcontextprotocol.io/) specification and SDK
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mcp-server-wayback
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP server giving Claude and other LLM clients structured access to the Internet Archive's Wayback Machine.
|
|
5
|
+
Author-email: Lakshya Mehta <lakshyamehta03@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/lakshyamehta03/wayback-machine-mcp
|
|
8
|
+
Project-URL: Repository, https://github.com/lakshyamehta03/wayback-machine-mcp
|
|
9
|
+
Project-URL: Issues, https://github.com/lakshyamehta03/wayback-machine-mcp/issues
|
|
10
|
+
Keywords: mcp,model-context-protocol,internet-archive,wayback-machine,archive,claude,anthropic,llm
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: mcp>=1.0
|
|
25
|
+
Requires-Dist: httpx>=0.27
|
|
26
|
+
Requires-Dist: pydantic>=2.0
|
|
27
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
<div align="center">
|
|
31
|
+
|
|
32
|
+
<img src="https://raw.githubusercontent.com/lakshyamehta03/wayback-machine-mcp/main/Wayback_Machine_logo_2010.svg" alt="Wayback Machine" width="420" />
|
|
33
|
+
|
|
34
|
+
# wayback-mcp
|
|
35
|
+
|
|
36
|
+
**A Model Context Protocol server giving Claude structured access to the Internet Archive's Wayback Machine.**
|
|
37
|
+
|
|
38
|
+
[](https://github.com/lakshyamehta03/wayback-machine-mcp/actions/workflows/test.yml)
|
|
39
|
+
[](https://www.python.org/downloads/)
|
|
40
|
+
[](https://modelcontextprotocol.io/)
|
|
41
|
+
[](https://github.com/astral-sh/uv)
|
|
42
|
+
|
|
43
|
+
</div>
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Overview
|
|
48
|
+
|
|
49
|
+
`wayback-mcp` is an async Python MCP server that exposes the Internet Archive's five core APIs — Availability, CDX, Advanced Search, Metadata, and Wayback content — as first-class tools, prompts, and resources for Claude. It handles rate limiting, retry/back-off, and response shape normalisation so the model only sees structured Pydantic data.
|
|
50
|
+
|
|
51
|
+
## Features
|
|
52
|
+
|
|
53
|
+
- **Six MCP tools** covering availability checks, snapshot lookups, full-text item search, domain crawls, page-text extraction, and item metadata
|
|
54
|
+
- **Four guided prompts** — `research_topic`, `track_site_changes`, `audit_link_rot`, `setup_authentication`
|
|
55
|
+
- **One MCP resource** — `wayback://item/{identifier}` exposes IA item metadata as JSON
|
|
56
|
+
- **Async token-bucket rate limiter** with per-endpoint buckets and `Retry-After` honoring
|
|
57
|
+
- **In-memory response cache** with per-endpoint TTLs to keep token usage and IA load low
|
|
58
|
+
- **Internet Archive S3 authentication** (optional) for higher rate-limit ceilings
|
|
59
|
+
- **Structured error model** — expected failures return `ToolError`; unexpected ones raise
|
|
60
|
+
- **Tested against live IA APIs** via an opt-in `--integration` pytest flag
|
|
61
|
+
|
|
62
|
+
## Installation
|
|
63
|
+
|
|
64
|
+
Requires Python 3.11+.
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install mcp-server-wayback
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
> _Once published to PyPI._ Until then, see [Development](#development) for the from-source workflow.
|
|
71
|
+
|
|
72
|
+
## Usage
|
|
73
|
+
|
|
74
|
+
### Wire it into Claude Desktop
|
|
75
|
+
|
|
76
|
+
Add an entry to `claude_desktop_config.json` (on macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`):
|
|
77
|
+
|
|
78
|
+
```json
|
|
79
|
+
{
|
|
80
|
+
"mcpServers": {
|
|
81
|
+
"wayback": {
|
|
82
|
+
"command": "mcp-server-wayback"
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Restart Claude Desktop. The `wayback` tools, prompts, and resources will appear in the MCP picker.
|
|
89
|
+
|
|
90
|
+
If you prefer not to install globally, run it on demand with [`uvx`](https://github.com/astral-sh/uv):
|
|
91
|
+
|
|
92
|
+
```json
|
|
93
|
+
{
|
|
94
|
+
"mcpServers": {
|
|
95
|
+
"wayback": {
|
|
96
|
+
"command": "uvx",
|
|
97
|
+
"args": ["mcp-server-wayback"]
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Optional: Internet Archive authentication
|
|
104
|
+
|
|
105
|
+
Set both keys in the server's environment to authenticate every IA request and raise your rate-limit ceiling. Run the `setup_authentication` prompt from Claude to walk through it interactively.
|
|
106
|
+
|
|
107
|
+
```json
|
|
108
|
+
"env": {
|
|
109
|
+
"WAYBACK_MCP_IA_ACCESS_KEY": "<your access key>",
|
|
110
|
+
"WAYBACK_MCP_IA_SECRET_KEY": "<your secret key>"
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Get keys at <https://archive.org/account/s3.php>.
|
|
115
|
+
|
|
116
|
+
## Tools
|
|
117
|
+
|
|
118
|
+
| Tool | Purpose |
|
|
119
|
+
|---|---|
|
|
120
|
+
| `check_availability` | Is this URL archived? Returns the closest snapshot |
|
|
121
|
+
| `lookup_snapshots` | List CDX snapshots for a URL with date / status filters |
|
|
122
|
+
| `search_archive` | Lucene search across IA collections with mediatype + year range |
|
|
123
|
+
| `search_domain` | Discover archived URLs under a domain or path prefix |
|
|
124
|
+
| `get_snapshot_content` | Fetch an archived page and extract its readable text |
|
|
125
|
+
| `get_item_metadata` | Rich structured metadata for any IA item identifier |
|
|
126
|
+
|
|
127
|
+
## Prompts
|
|
128
|
+
|
|
129
|
+
| Prompt | What it does |
|
|
130
|
+
|---|---|
|
|
131
|
+
| `research_topic` | Multi-mediatype IA search → synthesised topic overview |
|
|
132
|
+
| `track_site_changes` | Sample snapshots over time → narrate how a page evolved |
|
|
133
|
+
| `audit_link_rot` | Bulk-check URLs and surface archived alternatives |
|
|
134
|
+
| `setup_authentication` | Walks the user through configuring IA S3 keys |
|
|
135
|
+
|
|
136
|
+
## Development
|
|
137
|
+
|
|
138
|
+
Requires Python 3.11+ and [`uv`](https://github.com/astral-sh/uv).
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
git clone https://github.com/lakshyamehta03/wayback-machine-mcp.git
|
|
142
|
+
cd wayback-machine-mcp
|
|
143
|
+
uv sync
|
|
144
|
+
uv run mcp-server-wayback # run the server
|
|
145
|
+
uv run pytest # unit tests (httpx mocked via respx)
|
|
146
|
+
uv run pytest --integration # also hit live Internet Archive APIs
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
CI runs the unit suite on every push and pull request via GitHub Actions.
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
[MIT](LICENSE). The Wayback Machine logo is © Internet Archive and used here under fair use to identify the upstream service this project integrates with.
|
|
154
|
+
|
|
155
|
+
## Acknowledgments
|
|
156
|
+
|
|
157
|
+
- The [Internet Archive](https://archive.org/) for the Wayback Machine and the open APIs that make this server possible
|
|
158
|
+
- [Anthropic](https://www.anthropic.com/) for the [Model Context Protocol](https://modelcontextprotocol.io/) specification and SDK
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
mcp_server_wayback.egg-info/PKG-INFO
|
|
5
|
+
mcp_server_wayback.egg-info/SOURCES.txt
|
|
6
|
+
mcp_server_wayback.egg-info/dependency_links.txt
|
|
7
|
+
mcp_server_wayback.egg-info/entry_points.txt
|
|
8
|
+
mcp_server_wayback.egg-info/requires.txt
|
|
9
|
+
mcp_server_wayback.egg-info/top_level.txt
|
|
10
|
+
src/__init__.py
|
|
11
|
+
src/config.py
|
|
12
|
+
src/models.py
|
|
13
|
+
src/server.py
|
|
14
|
+
src/client/__init__.py
|
|
15
|
+
src/client/cache.py
|
|
16
|
+
src/client/cdx.py
|
|
17
|
+
src/client/extractor.py
|
|
18
|
+
src/client/http.py
|
|
19
|
+
src/client/parsers.py
|
|
20
|
+
src/client/rate_limiter.py
|
|
21
|
+
src/tools/__init__.py
|
|
22
|
+
src/tools/content.py
|
|
23
|
+
src/tools/search.py
|
|
24
|
+
src/tools/snapshots.py
|
|
25
|
+
tests/test_auth.py
|
|
26
|
+
tests/test_cache.py
|
|
27
|
+
tests/test_client.py
|
|
28
|
+
tests/test_content.py
|
|
29
|
+
tests/test_e2e_stdio.py
|
|
30
|
+
tests/test_extractor.py
|
|
31
|
+
tests/test_prompts.py
|
|
32
|
+
tests/test_resources.py
|
|
33
|
+
tests/test_search.py
|
|
34
|
+
tests/test_snapshots.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
wayback_mcp
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "mcp-server-wayback"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "MCP server giving Claude and other LLM clients structured access to the Internet Archive's Wayback Machine."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
license-files = ["LICENSE"]
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Lakshya Mehta", email = "lakshyamehta03@gmail.com" },
|
|
11
|
+
]
|
|
12
|
+
keywords = [
|
|
13
|
+
"mcp",
|
|
14
|
+
"model-context-protocol",
|
|
15
|
+
"internet-archive",
|
|
16
|
+
"wayback-machine",
|
|
17
|
+
"archive",
|
|
18
|
+
"claude",
|
|
19
|
+
"anthropic",
|
|
20
|
+
"llm",
|
|
21
|
+
]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 4 - Beta",
|
|
24
|
+
"Intended Audience :: Developers",
|
|
25
|
+
"Operating System :: OS Independent",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Programming Language :: Python :: 3.13",
|
|
30
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
31
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
32
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
33
|
+
]
|
|
34
|
+
dependencies = [
|
|
35
|
+
"mcp>=1.0",
|
|
36
|
+
"httpx>=0.27",
|
|
37
|
+
"pydantic>=2.0",
|
|
38
|
+
"beautifulsoup4>=4.12",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[project.urls]
|
|
42
|
+
Homepage = "https://github.com/lakshyamehta03/wayback-machine-mcp"
|
|
43
|
+
Repository = "https://github.com/lakshyamehta03/wayback-machine-mcp"
|
|
44
|
+
Issues = "https://github.com/lakshyamehta03/wayback-machine-mcp/issues"
|
|
45
|
+
|
|
46
|
+
[project.scripts]
|
|
47
|
+
mcp-server-wayback = "wayback_mcp.server:main"
|
|
48
|
+
|
|
49
|
+
[build-system]
|
|
50
|
+
requires = ["setuptools>=77"]
|
|
51
|
+
build-backend = "setuptools.build_meta"
|
|
52
|
+
|
|
53
|
+
[tool.setuptools]
|
|
54
|
+
packages = ["wayback_mcp", "wayback_mcp.client", "wayback_mcp.tools"]
|
|
55
|
+
|
|
56
|
+
[tool.setuptools.package-dir]
|
|
57
|
+
"wayback_mcp" = "src"
|
|
58
|
+
|
|
59
|
+
[tool.pytest.ini_options]
|
|
60
|
+
asyncio_mode = "auto"
|
|
61
|
+
testpaths = ["tests"]
|
|
62
|
+
markers = [
|
|
63
|
+
"integration: marks tests as integration (deselected by default)",
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
[dependency-groups]
|
|
67
|
+
dev = [
|
|
68
|
+
"pytest>=8.0",
|
|
69
|
+
"pytest-asyncio>=0.23",
|
|
70
|
+
"respx>=0.20",
|
|
71
|
+
"build>=1.2",
|
|
72
|
+
"twine>=5.0",
|
|
73
|
+
]
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import time
|
|
3
|
+
from collections import OrderedDict
|
|
4
|
+
from typing import Callable
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
CacheKey = tuple[str, frozenset]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _make_key(url: str, params: dict | None) -> CacheKey:
|
|
12
|
+
return (url, frozenset(params.items()) if params else frozenset())
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ResponseCache:
    """In-memory LRU cache of HTTP responses with a per-bucket time-to-live.

    Entries are keyed on ``(url, params)`` and stored as plain
    ``(expires_at, status, headers, body)`` tuples. A hit is handed back as a
    freshly constructed ``httpx.Response``. Reads and writes of the entry
    table in ``get``/``set`` are serialised with an ``asyncio.Lock``.
    """

    # Headers that describe how the original body was framed or compressed on
    # the wire. ``response.content`` is the already-decoded body, so replaying
    # e.g. ``Content-Encoding: gzip`` would make httpx try to decompress plain
    # bytes when the cached response is rebuilt in ``get``.
    _WIRE_HEADERS = frozenset({"content-encoding", "content-length", "transfer-encoding"})

    def __init__(
        self,
        max_size: int,
        ttls: dict[str, float],
        now: Callable[[], float] = time.monotonic,
    ) -> None:
        """Create an empty cache.

        Args:
            max_size: Maximum number of entries kept; the least recently used
                entries are evicted once this is exceeded.
            ttls: Maps a bucket name to its time-to-live, expressed in the
                units returned by ``now`` (seconds for the default clock).
                Buckets missing from this mapping are never cached.
            now: Clock used to stamp and check expiry. Defaults to
                ``time.monotonic``.
        """
        self._max_size = max_size
        self._ttls = ttls
        self._now = now
        # Ordered oldest-first so that popitem(last=False) evicts the least
        # recently used entry.
        self._entries: "OrderedDict[CacheKey, tuple[float, int, dict, bytes]]" = OrderedDict()
        self._lock = asyncio.Lock()

    def clear(self) -> None:
        """Drop every cached entry (synchronous; does not acquire the lock)."""
        self._entries.clear()

    async def get(self, url: str, params: dict | None) -> httpx.Response | None:
        """Return the cached response for ``(url, params)``, if still fresh.

        Returns:
            A new ``httpx.Response`` rebuilt from the stored status, headers
            and body, or ``None`` when there is no entry or it has expired.
            Expired entries are removed; a hit is marked most recently used.
        """
        key = _make_key(url, params)
        async with self._lock:
            entry = self._entries.get(key)
            if entry is None:
                return None
            expires_at, status, headers, body = entry
            if self._now() >= expires_at:
                del self._entries[key]
                return None
            self._entries.move_to_end(key)
            return httpx.Response(status, headers=headers, content=body)

    async def set(
        self, url: str, params: dict | None, bucket: str, response: httpx.Response
    ) -> None:
        """Store ``response`` under ``(url, params)`` using the TTL of ``bucket``.

        Does nothing when ``bucket`` has no configured TTL. After inserting,
        least recently used entries are evicted until at most ``max_size``
        remain.
        """
        ttl = self._ttls.get(bucket)
        if ttl is None:
            return
        key = _make_key(url, params)
        expires_at = self._now() + ttl
        # Keep only headers that still hold for the decoded body; see
        # _WIRE_HEADERS for why the framing headers are dropped.
        headers = {
            name: value
            for name, value in dict(response.headers).items()
            if name.lower() not in self._WIRE_HEADERS
        }
        body = response.content
        async with self._lock:
            self._entries[key] = (expires_at, response.status_code, headers, body)
            self._entries.move_to_end(key)
            while len(self._entries) > self._max_size:
                self._entries.popitem(last=False)
|