citekit-0.1.0.tar.gz
- citekit-0.1.0/PKG-INFO +45 -0
- citekit-0.1.0/README.md +17 -0
- citekit-0.1.0/build_log.txt +0 -0
- citekit-0.1.0/citekit/__init__.py +21 -0
- citekit-0.1.0/citekit/__main__.py +4 -0
- citekit-0.1.0/citekit/address.py +166 -0
- citekit-0.1.0/citekit/cli.py +119 -0
- citekit-0.1.0/citekit/client.py +176 -0
- citekit-0.1.0/citekit/mapper/__init__.py +1 -0
- citekit-0.1.0/citekit/mapper/base.py +36 -0
- citekit-0.1.0/citekit/mapper/gemini.py +332 -0
- citekit-0.1.0/citekit/mcp_server.py +138 -0
- citekit-0.1.0/citekit/models.py +66 -0
- citekit-0.1.0/citekit/resolvers/__init__.py +15 -0
- citekit-0.1.0/citekit/resolvers/audio.py +67 -0
- citekit-0.1.0/citekit/resolvers/base.py +33 -0
- citekit-0.1.0/citekit/resolvers/document.py +54 -0
- citekit-0.1.0/citekit/resolvers/image.py +58 -0
- citekit-0.1.0/citekit/resolvers/video.py +69 -0
- citekit-0.1.0/examples/usage.py +80 -0
- citekit-0.1.0/pyproject.toml +45 -0
- citekit-0.1.0/tests/test_address.py +119 -0
- citekit-0.1.0/tests/test_models.py +127 -0
- citekit-0.1.0/tests/test_resolvers.py +185 -0
citekit-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,45 @@
+Metadata-Version: 2.4
+Name: citekit
+Version: 0.1.0
+Summary: A local SDK that lets AI agents open specific parts of files instead of sending entire documents.
+Project-URL: Repository, https://github.com/citekit/citekit
+Author: CiteKit Team
+License: MIT
+Keywords: ai,audio,llm,mcp,pdf,rag,sdk,video
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.10
+Requires-Dist: click>=8.0
+Requires-Dist: google-genai>=1.0
+Requires-Dist: mcp>=1.0
+Requires-Dist: pillow>=10.0
+Requires-Dist: pydantic>=2.0
+Requires-Dist: pymupdf>=1.24
+Provides-Extra: dev
+Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
+Requires-Dist: pytest>=7.0; extra == 'dev'
+Description-Content-Type: text/markdown
+
+# CiteKit SDK for Python
+
+See the main [README](../../README.md) for usage instructions.
+
+## Installation
+
+```bash
+pip install citekit
+```
+
+## Quick Start
+
+```python
+from citekit import CiteKitClient, GeminiMapper
+
+# ... (see main docs)
+```
citekit-0.1.0/README.md
ADDED
@@ -0,0 +1,17 @@
+# CiteKit SDK for Python
+
+See the main [README](../../README.md) for usage instructions.
+
+## Installation
+
+```bash
+pip install citekit
+```
+
+## Quick Start
+
+```python
+from citekit import CiteKitClient, GeminiMapper
+
+# ... (see main docs)
+```
citekit-0.1.0/build_log.txt
ADDED
Binary file
citekit-0.1.0/citekit/__init__.py
ADDED
@@ -0,0 +1,21 @@
+"""CiteKit — Let AI agents open specific parts of files."""
+
+from citekit.models import ResourceMap, Node, Location, ResolvedEvidence
+from citekit.client import CiteKitClient
+from citekit.address import parse_address, build_address
+from citekit.mapper.base import MapperProvider
+from citekit.mapper.gemini import GeminiMapper
+
+__version__ = "0.1.0"
+
+__all__ = [
+    "ResourceMap",
+    "Node",
+    "Location",
+    "ResolvedEvidence",
+    "CiteKitClient",
+    "parse_address",
+    "build_address",
+    "MapperProvider",
+    "GeminiMapper",
+]
citekit-0.1.0/citekit/address.py
ADDED
@@ -0,0 +1,166 @@
+"""Address parser and builder for CiteKit URI-style addresses.
+
+Formats:
+    doc://resource_id#pages=3-5
+    video://resource_id#t=192-230
+    audio://resource_id#t=60-120
+    image://resource_id#bbox=0.2,0.3,0.8,0.7
+"""
+
+from __future__ import annotations
+
+import re
+from urllib.parse import urlparse, parse_qs
+
+from citekit.models import Location
+
+# Scheme → modality mapping
+_SCHEME_TO_MODALITY = {
+    "doc": "document",
+    "video": "video",
+    "audio": "audio",
+    "image": "image",
+}
+
+_MODALITY_TO_SCHEME = {v: k for k, v in _SCHEME_TO_MODALITY.items()}
+
+
+def parse_address(address: str) -> tuple[str, Location]:
+    """Parse a CiteKit address into (resource_id, Location).
+
+    Examples:
+        >>> parse_address("doc://calculus_book#pages=12-13")
+        ('calculus_book', Location(modality='document', pages=[12, 13]))
+
+        >>> parse_address("video://lecture1#t=192-230")
+        ('lecture1', Location(modality='video', start=192.0, end=230.0))
+
+        >>> parse_address("image://diagram#bbox=0.2,0.3,0.8,0.7")
+        ('diagram', Location(modality='image', bbox=(0.2, 0.3, 0.8, 0.7)))
+    """
+    # Parse scheme manually since urllib doesn't handle custom schemes well
+    match = re.match(r"^(\w+)://([^#]+)(?:#(.+))?$", address)
+    if not match:
+        raise ValueError(f"Invalid CiteKit address: {address}")
+
+    scheme, resource_id, fragment = match.group(1), match.group(2), match.group(3)
+
+    if scheme not in _SCHEME_TO_MODALITY:
+        raise ValueError(f"Unknown scheme '{scheme}'. Expected one of: {list(_SCHEME_TO_MODALITY.keys())}")
+
+    modality = _SCHEME_TO_MODALITY[scheme]
+
+    # Parse fragment parameters
+    pages = None
+    start = None
+    end = None
+    bbox = None
+
+    if fragment:
+        params = dict(part.split("=", 1) for part in fragment.split("&") if "=" in part)
+
+        if "pages" in params:
+            page_str = params["pages"]
+            if "-" in page_str:
+                p_start, p_end = page_str.split("-", 1)
+                pages = list(range(int(p_start), int(p_end) + 1))
+            else:
+                pages = [int(p) for p in page_str.split(",")]
+
+        if "t" in params:
+            time_str = params["t"]
+            if "-" in time_str:
+                t_start, t_end = time_str.split("-", 1)
+                start = _parse_time(t_start)
+                end = _parse_time(t_end)
+
+        if "bbox" in params:
+            parts = params["bbox"].split(",")
+            if len(parts) != 4:
+                raise ValueError(f"bbox must have 4 values, got {len(parts)}")
+            bbox = tuple(float(p) for p in parts)
+
+    return resource_id, Location(
+        modality=modality,
+        pages=pages,
+        start=start,
+        end=end,
+        bbox=bbox,
+    )
+
+
+def build_address(resource_id: str, location: Location) -> str:
+    """Build a CiteKit address from a resource ID and location.
+
+    Examples:
+        >>> build_address("book", Location(modality="document", pages=[3, 4, 5]))
+        'doc://book#pages=3-5'
+
+        >>> build_address("lecture", Location(modality="video", start=192.0, end=230.0))
+        'video://lecture#t=192-230'
+    """
+    scheme = _MODALITY_TO_SCHEME.get(location.modality)
+    if not scheme:
+        raise ValueError(f"Unknown modality: {location.modality}")
+
+    fragment_parts = []
+
+    if location.pages is not None:
+        if len(location.pages) == 0:
+            raise ValueError("Pages list cannot be empty")
+        pages_sorted = sorted(location.pages)
+        # Check if pages are consecutive for range notation
+        if pages_sorted == list(range(pages_sorted[0], pages_sorted[-1] + 1)):
+            fragment_parts.append(f"pages={pages_sorted[0]}-{pages_sorted[-1]}")
+        else:
+            fragment_parts.append(f"pages={','.join(str(p) for p in pages_sorted)}")
+
+    if location.start is not None and location.end is not None:
+        start_str = _format_time(location.start)
+        end_str = _format_time(location.end)
+        fragment_parts.append(f"t={start_str}-{end_str}")
+
+    if location.bbox is not None:
+        bbox_str = ",".join(f"{v:g}" for v in location.bbox)
+        fragment_parts.append(f"bbox={bbox_str}")
+
+    fragment = "&".join(fragment_parts)
+    if fragment:
+        return f"{scheme}://{resource_id}#{fragment}"
+    return f"{scheme}://{resource_id}"
+
+
+def _parse_time(time_str: str) -> float:
+    """Parse a time string to seconds. Supports seconds or HH:MM:SS format."""
+    time_str = time_str.strip()
+
+    # HH:MM:SS or MM:SS format
+    if ":" in time_str:
+        parts = time_str.split(":")
+        if len(parts) == 3:
+            h, m, s = parts
+            return int(h) * 3600 + int(m) * 60 + float(s)
+        elif len(parts) == 2:
+            m, s = parts
+            return int(m) * 60 + float(s)
+
+    return float(time_str)
+
+
+def _format_time(seconds: float) -> str:
+    """Format seconds as a compact time string."""
+    if seconds == int(seconds):
+        seconds = int(seconds)
+
+    # Use HH:MM:SS for large values
+    if isinstance(seconds, int) and seconds >= 3600:
+        h = seconds // 3600
+        m = (seconds % 3600) // 60
+        s = seconds % 60
+        return f"{h:02d}:{m:02d}:{s:02d}"
+    elif isinstance(seconds, int) and seconds >= 60:
+        m = seconds // 60
+        s = seconds % 60
+        return f"{m:02d}:{s:02d}"
+
+    return str(seconds)
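A short round-trip sketch of the helpers above (an illustration, not part of the package; it assumes `citekit` is importable and that `Location` accepts the keyword fields used in this module):

```python
from citekit import build_address, parse_address
from citekit.models import Location

# Parse an address into (resource_id, Location).
resource_id, loc = parse_address("doc://calculus_book#pages=12-13")
print(resource_id, loc.pages)  # calculus_book [12, 13]

# Build an address back from a Location; consecutive pages collapse to a range.
print(build_address("book", Location(modality="document", pages=[3, 4, 5])))
# doc://book#pages=3-5

# Time spans pass through _format_time, so values of 60 seconds or more come
# back in MM:SS form (e.g. t=03:12-03:50 for start=192.0, end=230.0).
print(build_address("lecture", Location(modality="video", start=192.0, end=230.0)))
```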
citekit-0.1.0/citekit/cli.py
ADDED
@@ -0,0 +1,119 @@
+import asyncio
+import functools
+import json
+import os
+import sys
+from pathlib import Path
+
+import click
+from citekit.client import CiteKitClient
+
+def async_command(f):
+    @functools.wraps(f)
+    def wrapper(*args, **kwargs):
+        return asyncio.run(f(*args, **kwargs))
+    return wrapper
+
+@click.group()
+def main():
+    """CiteKit CLI - Local AI Resource Mapper & Resolver."""
+    pass
+
+@main.command()
+@click.argument("path", type=click.Path(exists=True))
+@click.option("--type", "-t", help="Resource type (document, video, audio, image). If omitted, inferred from extension.")
+@async_command
+async def ingest(path, type):
+    """Ingest a file and generate a resource map."""
+    client = CiteKitClient()
+
+    if not type:
+        ext = Path(path).suffix.lower()
+        if ext in (".pdf", ".txt", ".md"):
+            type = "document"
+        elif ext in (".mp4", ".mov", ".avi", ".mkv"):
+            type = "video"
+        elif ext in (".mp3", ".wav", ".m4a"):
+            type = "audio"
+        elif ext in (".png", ".jpg", ".jpeg", ".webp"):
+            type = "image"
+        else:
+            click.echo(f"⚠️ Could not infer type from extension '{ext}'. Please specify --type.", err=True)
+            sys.exit(1)
+
+    click.echo(f"🔍 Ingesting {path} as '{type}'...")
+    try:
+        resource_map = await client.ingest(path, resource_type=type)
+        click.echo(f"✅ Map generated: {resource_map.resource_id}")
+        click.echo(f" Title: {resource_map.title}")
+        click.echo(f" Nodes: {len(resource_map.nodes)}")
+    except Exception as e:
+        click.echo(f"❌ Error: {e}", err=True)
+        sys.exit(1)
+
+@main.command()
+@click.argument("node_id")
+@async_command
+async def resolve(node_id):
+    """Resolve a node ID to its value (file chunk)."""
+    client = CiteKitClient()
+
+    click.echo(f"📎 Resolving node: {node_id}")
+    try:
+        evidence = await client.resolve(node_id)
+        click.echo(f"✅ Output: {evidence.output_path}")
+        click.echo(f" Modality: {evidence.modality}")
+        click.echo(f" Address: {evidence.address}")
+    except Exception as e:
+        click.echo(f"❌ Error: {e}", err=True)
+        sys.exit(1)
+
+@main.command()
+@click.argument("resource_id")
+@async_command
+async def structure(resource_id):
+    """Get the JSON structure (map) for a resource ID."""
+    client = CiteKitClient()
+
+    try:
+        resource_map = await client.get_map(resource_id)
+        click.echo(json.dumps(resource_map.model_dump(), indent=2, default=str))
+    except Exception as e:
+        click.echo(f"❌ Error: {e}", err=True)
+        sys.exit(1)
+
+@main.command("list")
+@async_command
+async def list_resources():
+    """List all ingested resources."""
+    map_dir = Path(".resource_maps")
+    if not map_dir.exists():
+        click.echo("No resources found (directory .resource_maps missing).")
+        return
+
+    maps = list(map_dir.glob("*.json"))
+    if not maps:
+        click.echo("No resources found.")
+        return
+
+    click.echo(f"found {len(maps)} resources:")
+    for map_file in maps:
+        try:
+            data = json.loads(map_file.read_text(encoding="utf-8"))
+            click.echo(f" - {data.get('resource_id')} ({data.get('type')}): {data.get('title')}")
+        except:
+            click.echo(f" - {map_file.name} (corrupt)")
+
+@main.command()
+@async_command
+async def serve():
+    """Run the MCP server (stdio mode) for AI agents."""
+    from citekit.mcp_server import create_server
+    from mcp.server.stdio import stdio_server
+
+    server = create_server()
+    async with stdio_server() as (read_stream, write_stream):
+        await server.run(read_stream, write_stream, server.create_initialization_options())
+
+if __name__ == "__main__":
+    main()
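For a quick in-process smoke test of the command group above, Click's built-in test runner can invoke it directly. A sketch that only exercises `list`, which needs no mapper or API key:

```python
from click.testing import CliRunner

from citekit.cli import main

# Runs `citekit list` in-process and captures its output; it simply reports
# whatever maps already exist under .resource_maps.
runner = CliRunner()
result = runner.invoke(main, ["list"])
print(result.exit_code)
print(result.output)
```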
citekit-0.1.0/citekit/client.py
ADDED
@@ -0,0 +1,176 @@
+"""CiteKit client — main entry point for the SDK."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from citekit.address import build_address
+from citekit.mapper.base import MapperProvider
+from citekit.models import Location, Node, ResolvedEvidence, ResourceMap
+from citekit.resolvers.audio import AudioResolver
+from citekit.resolvers.document import DocumentResolver
+from citekit.resolvers.image import ImageResolver
+from citekit.resolvers.video import VideoResolver
+
+
+class CiteKitClient:
+    """Main client for ingesting resources, reading maps, and resolving nodes.
+
+    Usage:
+        from citekit import CiteKitClient, GeminiMapper
+
+        mapper = GeminiMapper(api_key="YOUR_KEY")
+        client = CiteKitClient(mapper=mapper)
+
+        # Ingest a PDF
+        resource_map = await client.ingest("textbook.pdf", "document")
+
+        # Later: resolve a specific node
+        evidence = client.resolve("textbook", "derivatives.definition")
+        print(evidence.output_path)  # → .citekit_output/textbook_pages_12-13.pdf
+    """
+
+    def __init__(
+        self,
+        mapper: MapperProvider | None = None,
+        storage_dir: str = ".resource_maps",
+        output_dir: str = ".citekit_output",
+    ):
+        self._mapper = mapper
+        self._storage_dir = Path(storage_dir)
+        self._output_dir = Path(output_dir)
+        self._storage_dir.mkdir(parents=True, exist_ok=True)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Initialize resolvers
+        self._resolvers = {
+            "document": DocumentResolver(output_dir=output_dir),
+            "video": VideoResolver(output_dir=output_dir),
+            "audio": AudioResolver(output_dir=output_dir),
+            "image": ImageResolver(output_dir=output_dir),
+        }
+
+    # ── Ingestion ────────────────────────────────────────────────────────────
+
+    async def ingest(
+        self,
+        resource_path: str,
+        resource_type: str,
+        resource_id: str | None = None,
+    ) -> ResourceMap:
+        """Ingest a resource: analyze it with the mapper and save the map locally.
+
+        Args:
+            resource_path: Path to the resource file (PDF, video, audio, image).
+            resource_type: One of "document", "video", "audio", "image".
+            resource_id: Optional custom ID. Defaults to the filename stem.
+
+        Returns:
+            The generated ResourceMap.
+        """
+        if self._mapper is None:
+            raise RuntimeError(
+                "No mapper provider configured. "
+                "Pass a MapperProvider (e.g. GeminiMapper) to CiteKitClient()."
+            )
+
+        resource_map = await self._mapper.generate_map(
+            resource_path=resource_path,
+            resource_type=resource_type,
+            resource_id=resource_id,
+        )
+
+        # Save to local storage
+        self._save_map(resource_map)
+
+        return resource_map
+
+    # ── Map access ───────────────────────────────────────────────────────────
+
+    def get_map(self, resource_id: str) -> ResourceMap:
+        """Load a previously generated map from local storage.
+
+        Args:
+            resource_id: The resource ID to look up.
+
+        Returns:
+            The ResourceMap.
+
+        Raises:
+            FileNotFoundError: If no map exists for this resource_id.
+        """
+        map_path = self._storage_dir / f"{resource_id}.json"
+        if not map_path.exists():
+            raise FileNotFoundError(
+                f"No map found for resource '{resource_id}'. "
+                f"Expected at: {map_path}"
+            )
+
+        data = json.loads(map_path.read_text(encoding="utf-8"))
+        return ResourceMap.model_validate(data)
+
+    def list_maps(self) -> list[str]:
+        """List all available resource map IDs."""
+        return [
+            p.stem
+            for p in self._storage_dir.glob("*.json")
+        ]
+
+    def get_structure(self, resource_id: str) -> dict:
+        """Get the map as a plain dict — useful for MCP/JSON responses."""
+        return self.get_map(resource_id).model_dump(mode="json")
+
+    # ── Resolution ───────────────────────────────────────────────────────────
+
+    def resolve(self, resource_id: str, node_id: str) -> ResolvedEvidence:
+        """Resolve a node into extracted evidence.
+
+        Args:
+            resource_id: The resource to look up.
+            node_id: The node ID within that resource.
+
+        Returns:
+            ResolvedEvidence with the path to the extracted file.
+        """
+        resource_map = self.get_map(resource_id)
+        node = resource_map.get_node(node_id)
+
+        if node is None:
+            available = resource_map.list_node_ids()
+            raise ValueError(
+                f"Node '{node_id}' not found in resource '{resource_id}'. "
+                f"Available nodes: {available}"
+            )
+
+        # Pick the right resolver
+        modality = node.location.modality
+        resolver = self._resolvers.get(modality)
+
+        if resolver is None:
+            raise ValueError(f"No resolver for modality: {modality}")
+
+        # Resolve
+        output_path = resolver.resolve(node, resource_map.source_path)
+
+        # Build the URI address
+        address = build_address(resource_id, node.location)
+
+        return ResolvedEvidence(
+            output_path=output_path,
+            modality=modality,
+            address=address,
+            node=node,
+            resource_id=resource_id,
+        )
+
+    # ── Private ──────────────────────────────────────────────────────────────
+
+    def _save_map(self, resource_map: ResourceMap) -> None:
+        """Save a ResourceMap to local JSON storage."""
+        map_path = self._storage_dir / f"{resource_map.resource_id}.json"
+        data = resource_map.model_dump(mode="json")
+        map_path.write_text(
+            json.dumps(data, indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
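An end-to-end sketch following the docstring and signatures above (it assumes a valid Gemini API key and a local `textbook.pdf`; the node ID comes from the docstring's example and is illustrative):

```python
import asyncio

from citekit import CiteKitClient, GeminiMapper


async def main() -> None:
    # The mapper analyzes the file; ingest() saves the resulting map under
    # .resource_maps/textbook.json (resource_id defaults to the filename stem).
    mapper = GeminiMapper(api_key="YOUR_KEY")  # placeholder key
    client = CiteKitClient(mapper=mapper)

    resource_map = await client.ingest("textbook.pdf", "document")
    print(resource_map.resource_id, len(resource_map.nodes))

    # resolve() is synchronous: it reloads the stored map, extracts the node's
    # span with the matching resolver, and returns a ResolvedEvidence.
    evidence = client.resolve("textbook", "derivatives.definition")  # example node ID
    print(evidence.address)      # e.g. doc://textbook#pages=12-13
    print(evidence.output_path)  # e.g. .citekit_output/textbook_pages_12-13.pdf


asyncio.run(main())
```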
citekit-0.1.0/citekit/mapper/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Mapper providers for CiteKit."""
citekit-0.1.0/citekit/mapper/base.py
ADDED
@@ -0,0 +1,36 @@
+"""Abstract base class for mapper providers."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+from citekit.models import ResourceMap
+
+
+class MapperProvider(ABC):
+    """Base class for all mapper providers.
+
+    A mapper analyzes a resource file and produces a structured ResourceMap
+    containing nodes that point to physical locations within the resource.
+
+    To create a custom provider, subclass this and implement generate_map().
+    """
+
+    @abstractmethod
+    async def generate_map(
+        self,
+        resource_path: str,
+        resource_type: str,
+        resource_id: str | None = None,
+    ) -> ResourceMap:
+        """Analyze a resource and produce a structured map.
+
+        Args:
+            resource_path: Path to the resource file.
+            resource_type: Type of resource ("document", "video", "audio", "image").
+            resource_id: Optional custom ID. If not provided, derived from filename.
+
+        Returns:
+            A ResourceMap with nodes pointing to locations in the resource.
+        """
+        ...