citekit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
citekit-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,45 @@
1
+ Metadata-Version: 2.4
2
+ Name: citekit
3
+ Version: 0.1.0
4
+ Summary: A local SDK that lets AI agents open specific parts of files instead of sending entire documents.
5
+ Project-URL: Repository, https://github.com/citekit/citekit
6
+ Author: CiteKit Team
7
+ License: MIT
8
+ Keywords: ai,audio,llm,mcp,pdf,rag,sdk,video
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: click>=8.0
19
+ Requires-Dist: google-genai>=1.0
20
+ Requires-Dist: mcp>=1.0
21
+ Requires-Dist: pillow>=10.0
22
+ Requires-Dist: pydantic>=2.0
23
+ Requires-Dist: pymupdf>=1.24
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
26
+ Requires-Dist: pytest>=7.0; extra == 'dev'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # CiteKit SDK for Python
30
+
31
+ See the main [README](https://github.com/citekit/citekit) for usage instructions.
32
+
33
+ ## Installation
34
+
35
+ ```bash
36
+ pip install citekit
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from citekit import CiteKitClient, GeminiMapper
43
+
44
+ # ... (see main docs)
45
+ ```
@@ -0,0 +1,17 @@
1
+ # CiteKit SDK for Python
2
+
3
+ See the main [README](https://github.com/citekit/citekit) for usage instructions.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install citekit
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ from citekit import CiteKitClient, GeminiMapper
15
+
16
+ # ... (see main docs)
17
+ ```
Binary file
@@ -0,0 +1,21 @@
1
+ """CiteKit — Let AI agents open specific parts of files."""
2
+
3
+ from citekit.models import ResourceMap, Node, Location, ResolvedEvidence
4
+ from citekit.client import CiteKitClient
5
+ from citekit.address import parse_address, build_address
6
+ from citekit.mapper.base import MapperProvider
7
+ from citekit.mapper.gemini import GeminiMapper
8
+
9
+ __version__ = "0.1.0"
10
+
11
+ __all__ = [
12
+ "ResourceMap",
13
+ "Node",
14
+ "Location",
15
+ "ResolvedEvidence",
16
+ "CiteKitClient",
17
+ "parse_address",
18
+ "build_address",
19
+ "MapperProvider",
20
+ "GeminiMapper",
21
+ ]
@@ -0,0 +1,4 @@
1
+ from citekit.cli import main
2
+
3
+ if __name__ == "__main__":
4
+ main()
@@ -0,0 +1,166 @@
1
+ """Address parser and builder for CiteKit URI-style addresses.
2
+
3
+ Formats:
4
+ doc://resource_id#pages=3-5
5
+ video://resource_id#t=192-230
6
+ audio://resource_id#t=60-120
7
+ image://resource_id#bbox=0.2,0.3,0.8,0.7
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ from urllib.parse import urlparse, parse_qs
14
+
15
+ from citekit.models import Location
16
+
17
+ # Scheme → modality mapping
18
+ _SCHEME_TO_MODALITY = {
19
+ "doc": "document",
20
+ "video": "video",
21
+ "audio": "audio",
22
+ "image": "image",
23
+ }
24
+
25
+ _MODALITY_TO_SCHEME = {v: k for k, v in _SCHEME_TO_MODALITY.items()}
26
+
27
+
28
def parse_address(address: str) -> tuple[str, Location]:
    """Parse a CiteKit address into ``(resource_id, Location)``.

    The address has the shape ``scheme://resource_id#key=value&key=value``.
    Recognised fragment keys:

    * ``pages`` -- comma-separated page numbers and/or inclusive ranges,
      e.g. ``3-5``, ``1,2,5`` or ``1,3-5``.
    * ``t`` -- a ``start-end`` time range; each side is parsed by
      ``_parse_time``. A value without ``-`` is ignored.
    * ``bbox`` -- exactly four comma-separated floats.

    Fragment parts without ``=`` and unrecognised keys are ignored.

    Args:
        address: The address string to parse.

    Returns:
        A ``(resource_id, Location)`` tuple. Location fields whose key is
        absent from the fragment are passed as ``None``.

    Raises:
        ValueError: If the address does not match the expected shape, the
            scheme is unknown, a number cannot be parsed, a page range runs
            backwards, or ``bbox`` does not have exactly four values.

    Examples:
        >>> parse_address("doc://calculus_book#pages=12-13")
        ('calculus_book', Location(modality='document', pages=[12, 13]))

        >>> parse_address("video://lecture1#t=192-230")
        ('lecture1', Location(modality='video', start=192.0, end=230.0))

        >>> parse_address("image://diagram#bbox=0.2,0.3,0.8,0.7")
        ('diagram', Location(modality='image', bbox=(0.2, 0.3, 0.8, 0.7)))
    """
    # Parse scheme manually since urllib doesn't handle custom schemes well
    match = re.match(r"^(\w+)://([^#]+)(?:#(.+))?$", address)
    if not match:
        raise ValueError(f"Invalid CiteKit address: {address}")

    scheme, resource_id, fragment = match.groups()

    if scheme not in _SCHEME_TO_MODALITY:
        raise ValueError(f"Unknown scheme '{scheme}'. Expected one of: {list(_SCHEME_TO_MODALITY.keys())}")

    modality = _SCHEME_TO_MODALITY[scheme]

    pages = None
    start = None
    end = None
    bbox = None

    if fragment:
        params = dict(part.split("=", 1) for part in fragment.split("&") if "=" in part)

        if "pages" in params:
            pages = []
            # Each comma-separated token is either one page or an inclusive range.
            for token in params["pages"].split(","):
                if "-" in token:
                    first_str, last_str = token.split("-", 1)
                    first, last = int(first_str), int(last_str)
                    if last < first:
                        # A backwards range would otherwise expand to no pages at all.
                        raise ValueError(
                            f"Invalid page range '{token}': end page is before start page"
                        )
                    pages.extend(range(first, last + 1))
                else:
                    pages.append(int(token))

        if "t" in params:
            time_str = params["t"]
            if "-" in time_str:
                t_start, t_end = time_str.split("-", 1)
                start = _parse_time(t_start)
                end = _parse_time(t_end)

        if "bbox" in params:
            parts = params["bbox"].split(",")
            if len(parts) != 4:
                raise ValueError(f"bbox must have 4 values, got {len(parts)}")
            bbox = tuple(float(p) for p in parts)

    return resource_id, Location(
        modality=modality,
        pages=pages,
        start=start,
        end=end,
        bbox=bbox,
    )
90
+
91
+
92
def build_address(resource_id: str, location: Location) -> str:
    """Build a CiteKit address from a resource ID and a location.

    Fragment parts are emitted in the order ``pages``, ``t``, ``bbox`` and
    joined with ``&``. When the location carries none of them the address has
    no ``#fragment`` at all.

    Args:
        resource_id: Identifier placed after ``scheme://``.
        location: Location whose ``modality`` selects the scheme and whose
            ``pages`` / ``start`` + ``end`` / ``bbox`` fields fill the fragment.

    Returns:
        The address string.

    Raises:
        ValueError: If the modality has no scheme, or ``pages`` is an empty list.

    Examples:
        >>> build_address("book", Location(modality="document", pages=[3, 4, 5]))
        'doc://book#pages=3-5'

        >>> build_address("lecture", Location(modality="video", start=192.0, end=230.0))
        'video://lecture#t=03:12-03:50'
    """
    scheme = _MODALITY_TO_SCHEME.get(location.modality)
    if not scheme:
        raise ValueError(f"Unknown modality: {location.modality}")

    fragment_parts = []

    if location.pages is not None:
        if not location.pages:
            raise ValueError("Pages list cannot be empty")
        # De-duplicate so a repeated page does not defeat the range notation.
        pages_sorted = sorted(set(location.pages))
        first, last = pages_sorted[0], pages_sorted[-1]
        # Unique sorted integers are consecutive exactly when they fill first..last.
        if len(pages_sorted) == last - first + 1:
            fragment_parts.append(f"pages={first}-{last}")
        else:
            fragment_parts.append(f"pages={','.join(str(p) for p in pages_sorted)}")

    # A time range is only emitted when both ends are known.
    if location.start is not None and location.end is not None:
        start_str = _format_time(location.start)
        end_str = _format_time(location.end)
        fragment_parts.append(f"t={start_str}-{end_str}")

    if location.bbox is not None:
        bbox_str = ",".join(f"{v:g}" for v in location.bbox)
        fragment_parts.append(f"bbox={bbox_str}")

    fragment = "&".join(fragment_parts)
    if fragment:
        return f"{scheme}://{resource_id}#{fragment}"
    return f"{scheme}://{resource_id}"
131
+
132
+
133
+ def _parse_time(time_str: str) -> float:
134
+ """Parse a time string to seconds. Supports seconds or HH:MM:SS format."""
135
+ time_str = time_str.strip()
136
+
137
+ # HH:MM:SS or MM:SS format
138
+ if ":" in time_str:
139
+ parts = time_str.split(":")
140
+ if len(parts) == 3:
141
+ h, m, s = parts
142
+ return int(h) * 3600 + int(m) * 60 + float(s)
143
+ elif len(parts) == 2:
144
+ m, s = parts
145
+ return int(m) * 60 + float(s)
146
+
147
+ return float(time_str)
148
+
149
+
150
+ def _format_time(seconds: float) -> str:
151
+ """Format seconds as a compact time string."""
152
+ if seconds == int(seconds):
153
+ seconds = int(seconds)
154
+
155
+ # Use HH:MM:SS for large values
156
+ if isinstance(seconds, int) and seconds >= 3600:
157
+ h = seconds // 3600
158
+ m = (seconds % 3600) // 60
159
+ s = seconds % 60
160
+ return f"{h:02d}:{m:02d}:{s:02d}"
161
+ elif isinstance(seconds, int) and seconds >= 60:
162
+ m = seconds // 60
163
+ s = seconds % 60
164
+ return f"{m:02d}:{s:02d}"
165
+
166
+ return str(seconds)
@@ -0,0 +1,119 @@
1
+ import asyncio
2
+ import functools
3
+ import json
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ import click
9
+ from citekit.client import CiteKitClient
10
+
11
def async_command(f):
    """Wrap coroutine function *f* so it can be called synchronously.

    Each call builds the coroutine and drives it to completion with
    ``asyncio.run``, returning its result. In this module it is applied
    beneath the click decorators so commands can be written as ``async def``.
    """

    @functools.wraps(f)
    def run_sync(*args, **kwargs):
        coroutine = f(*args, **kwargs)
        return asyncio.run(coroutine)

    return run_sync
16
+
17
# Root command group; the subcommands in this module attach via @main.command().
# The docstring is kept as-is because click displays it as the help text.
@click.group()
def main():
    """CiteKit CLI - Local AI Resource Mapper & Resolver."""
21
+
22
# Lower-cased file extension -> resource type used when --type is omitted.
_EXTENSION_TO_TYPE = {
    ".pdf": "document",
    ".txt": "document",
    ".md": "document",
    ".mp4": "video",
    ".mov": "video",
    ".avi": "video",
    ".mkv": "video",
    ".mp3": "audio",
    ".wav": "audio",
    ".m4a": "audio",
    ".png": "image",
    ".jpg": "image",
    ".jpeg": "image",
    ".webp": "image",
}


@main.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--type", "-t", help="Resource type (document, video, audio, image). If omitted, inferred from extension.")
@async_command
async def ingest(path, type):
    """Ingest a file and generate a resource map."""
    # `type` shadows the builtin only because click derives the parameter
    # name from the option; work with a clearer local name from here on.
    resource_type = type

    if not resource_type:
        ext = Path(path).suffix.lower()
        resource_type = _EXTENSION_TO_TYPE.get(ext)
        if resource_type is None:
            click.echo(f"⚠️ Could not infer type from extension '{ext}'. Please specify --type.", err=True)
            sys.exit(1)

    # CiteKitClient.ingest() raises RuntimeError without a mapper, so the CLI
    # must supply one. The key is taken from the environment.
    # NOTE(review): env var names follow the google-genai convention — confirm.
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        click.echo("❌ Error: No API key found. Set GEMINI_API_KEY to enable ingestion.", err=True)
        sys.exit(1)

    click.echo(f"🔍 Ingesting {path} as '{resource_type}'...")
    try:
        # Imported here so the other commands do not need the mapper backend.
        from citekit.mapper.gemini import GeminiMapper

        client = CiteKitClient(mapper=GeminiMapper(api_key=api_key))
        resource_map = await client.ingest(path, resource_type=resource_type)
        click.echo(f"✅ Map generated: {resource_map.resource_id}")
        click.echo(f" Title: {resource_map.title}")
        click.echo(f" Nodes: {len(resource_map.nodes)}")
    except Exception as e:
        # Top-level CLI boundary: report any failure and exit non-zero.
        click.echo(f"❌ Error: {e}", err=True)
        sys.exit(1)
53
+
54
def _find_resource_for_node(client, node_id):
    """Return the ID of the single stored resource whose map contains *node_id*.

    Raises ValueError when no stored map, or more than one, contains the node.
    """
    owners = []
    for candidate in client.list_maps():
        try:
            resource_map = client.get_map(candidate)
        except (OSError, ValueError):
            # Unreadable or invalid map file: it cannot own the node, skip it.
            continue
        if resource_map.get_node(node_id) is not None:
            owners.append(candidate)

    if not owners:
        raise ValueError(f"Node '{node_id}' not found in any ingested resource.")
    if len(owners) > 1:
        raise ValueError(
            f"Node '{node_id}' exists in several resources {owners}. "
            "Use --resource to pick one."
        )
    return owners[0]


@main.command()
@click.argument("node_id")
@click.option("--resource", "-r", "resource_id", default=None, help="Resource ID that owns the node. If omitted, all ingested resources are searched.")
@async_command
async def resolve(node_id, resource_id=None):
    """Resolve a node ID to its value (file chunk)."""
    client = CiteKitClient()

    click.echo(f"📎 Resolving node: {node_id}")
    try:
        if resource_id is None:
            resource_id = _find_resource_for_node(client, node_id)
        # CiteKitClient.resolve() is synchronous and needs both IDs.
        evidence = client.resolve(resource_id, node_id)
        click.echo(f"✅ Output: {evidence.output_path}")
        click.echo(f" Modality: {evidence.modality}")
        click.echo(f" Address: {evidence.address}")
    except Exception as e:
        # Top-level CLI boundary: report any failure and exit non-zero.
        click.echo(f"❌ Error: {e}", err=True)
        sys.exit(1)
70
+
71
@main.command()
@click.argument("resource_id")
@async_command
async def structure(resource_id):
    """Get the JSON structure (map) for a resource ID."""
    client = CiteKitClient()

    try:
        # get_structure() is a plain synchronous method (awaiting the client's
        # map accessors fails) and dumps the map in JSON mode.
        payload = client.get_structure(resource_id)
        click.echo(json.dumps(payload, indent=2))
    except Exception as e:
        # Top-level CLI boundary: report any failure and exit non-zero.
        click.echo(f"❌ Error: {e}", err=True)
        sys.exit(1)
84
+
85
@main.command("list")
@async_command
async def list_resources():
    """List all ingested resources."""
    # Reads the map files directly from the default storage directory
    # relative to the current working directory.
    map_dir = Path(".resource_maps")
    if not map_dir.exists():
        click.echo("No resources found (directory .resource_maps missing).")
        return

    # Sorted so the listing order is stable between runs.
    maps = sorted(map_dir.glob("*.json"))
    if not maps:
        click.echo("No resources found.")
        return

    click.echo(f"found {len(maps)} resources:")
    for map_file in maps:
        try:
            data = json.loads(map_file.read_text(encoding="utf-8"))
            summary = f" - {data.get('resource_id')} ({data.get('type')}): {data.get('title')}"
        except (OSError, ValueError, AttributeError):
            # OSError: unreadable file; ValueError: bad JSON or bad encoding;
            # AttributeError: valid JSON whose top level is not an object.
            click.echo(f" - {map_file.name} (corrupt)")
        else:
            click.echo(summary)
106
+
107
@main.command()
@async_command
async def serve():
    """Run the MCP server (stdio mode) for AI agents."""
    # Imports live inside the command body, so the server modules are only
    # loaded when this command actually runs.
    from citekit.mcp_server import create_server
    from mcp.server.stdio import stdio_server

    app = create_server()
    async with stdio_server() as streams:
        reader, writer = streams
        init_options = app.create_initialization_options()
        await app.run(reader, writer, init_options)
117
+
118
+ if __name__ == "__main__":
119
+ main()
@@ -0,0 +1,176 @@
1
+ """CiteKit client — main entry point for the SDK."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+ from citekit.address import build_address
9
+ from citekit.mapper.base import MapperProvider
10
+ from citekit.models import Location, Node, ResolvedEvidence, ResourceMap
11
+ from citekit.resolvers.audio import AudioResolver
12
+ from citekit.resolvers.document import DocumentResolver
13
+ from citekit.resolvers.image import ImageResolver
14
+ from citekit.resolvers.video import VideoResolver
15
+
16
+
17
class CiteKitClient:
    """Facade for ingesting resources, loading their maps, and resolving nodes.

    Maps are stored as ``<resource_id>.json`` files inside ``storage_dir``.

    Usage:
        from citekit import CiteKitClient, GeminiMapper

        mapper = GeminiMapper(api_key="YOUR_KEY")
        client = CiteKitClient(mapper=mapper)

        # Ingest a PDF
        resource_map = await client.ingest("textbook.pdf", "document")

        # Later: resolve a specific node
        evidence = client.resolve("textbook", "derivatives.definition")
        print(evidence.output_path)
    """

    def __init__(
        self,
        mapper: MapperProvider | None = None,
        storage_dir: str = ".resource_maps",
        output_dir: str = ".citekit_output",
    ):
        """Create the client and make sure both working directories exist.

        Args:
            mapper: Provider used by ``ingest``; may be omitted when the
                client is only used to read maps and resolve nodes.
            storage_dir: Directory holding the saved map JSON files.
            output_dir: Directory handed to every resolver for its output.
        """
        self._mapper = mapper
        self._storage_dir = Path(storage_dir)
        self._output_dir = Path(output_dir)
        for directory in (self._storage_dir, self._output_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # One resolver per modality, all writing into the same output dir.
        resolver_classes = {
            "document": DocumentResolver,
            "video": VideoResolver,
            "audio": AudioResolver,
            "image": ImageResolver,
        }
        self._resolvers = {
            modality: resolver_cls(output_dir=output_dir)
            for modality, resolver_cls in resolver_classes.items()
        }

    # ── Ingestion ────────────────────────────────────────────────────────────

    async def ingest(
        self,
        resource_path: str,
        resource_type: str,
        resource_id: str | None = None,
    ) -> ResourceMap:
        """Generate a map for a resource with the mapper and persist it.

        Args:
            resource_path: Path to the resource file (PDF, video, audio, image).
            resource_type: One of "document", "video", "audio", "image".
            resource_id: Optional custom ID, forwarded to the mapper as given.

        Returns:
            The ResourceMap produced by the mapper.

        Raises:
            RuntimeError: If the client was created without a mapper.
        """
        mapper = self._mapper
        if mapper is None:
            raise RuntimeError(
                "No mapper provider configured. "
                "Pass a MapperProvider (e.g. GeminiMapper) to CiteKitClient()."
            )

        generated = await mapper.generate_map(
            resource_path=resource_path,
            resource_type=resource_type,
            resource_id=resource_id,
        )
        self._save_map(generated)
        return generated

    # ── Map access ───────────────────────────────────────────────────────────

    def get_map(self, resource_id: str) -> ResourceMap:
        """Read a stored map back from disk.

        Args:
            resource_id: The resource ID to look up.

        Returns:
            The validated ResourceMap.

        Raises:
            FileNotFoundError: If no map file exists for this resource_id.
        """
        map_path = self._storage_dir / f"{resource_id}.json"
        if map_path.exists():
            raw_text = map_path.read_text(encoding="utf-8")
            return ResourceMap.model_validate(json.loads(raw_text))

        raise FileNotFoundError(
            f"No map found for resource '{resource_id}'. "
            f"Expected at: {map_path}"
        )

    def list_maps(self) -> list[str]:
        """Return the IDs (file stems) of every map file in storage."""
        stored_files = self._storage_dir.glob("*.json")
        return [map_file.stem for map_file in stored_files]

    def get_structure(self, resource_id: str) -> dict:
        """Return the stored map dumped in JSON mode as a plain dict."""
        resource_map = self.get_map(resource_id)
        return resource_map.model_dump(mode="json")

    # ── Resolution ───────────────────────────────────────────────────────────

    def resolve(self, resource_id: str, node_id: str) -> ResolvedEvidence:
        """Extract the evidence a node points at.

        Args:
            resource_id: The resource whose map contains the node.
            node_id: The node ID within that resource.

        Returns:
            ResolvedEvidence carrying the resolver's output path, the
            modality, the built address, the node and the resource ID.

        Raises:
            FileNotFoundError: If the resource has no stored map.
            ValueError: If the node is not in the map, or no resolver is
                registered for the node's modality.
        """
        resource_map = self.get_map(resource_id)

        node = resource_map.get_node(node_id)
        if node is None:
            available = resource_map.list_node_ids()
            raise ValueError(
                f"Node '{node_id}' not found in resource '{resource_id}'. "
                f"Available nodes: {available}"
            )

        # The node's modality decides which resolver handles it.
        modality = node.location.modality
        resolver = self._resolvers.get(modality)
        if resolver is None:
            raise ValueError(f"No resolver for modality: {modality}")

        extracted_path = resolver.resolve(node, resource_map.source_path)
        uri = build_address(resource_id, node.location)

        return ResolvedEvidence(
            output_path=extracted_path,
            modality=modality,
            address=uri,
            node=node,
            resource_id=resource_id,
        )

    # ── Private ──────────────────────────────────────────────────────────────

    def _save_map(self, resource_map: ResourceMap) -> None:
        """Write a ResourceMap to ``<storage_dir>/<resource_id>.json``."""
        target = self._storage_dir / f"{resource_map.resource_id}.json"
        serialized = json.dumps(
            resource_map.model_dump(mode="json"),
            indent=2,
            ensure_ascii=False,
        )
        target.write_text(serialized, encoding="utf-8")
@@ -0,0 +1 @@
1
+ """Mapper providers for CiteKit."""
@@ -0,0 +1,36 @@
1
+ """Abstract base class for mapper providers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+
7
+ from citekit.models import ResourceMap
8
+
9
+
10
class MapperProvider(ABC):
    """Interface every mapper provider implements.

    A mapper inspects a resource file and returns a structured ResourceMap
    whose nodes point at physical locations inside that resource.

    Custom providers subclass this and implement ``generate_map()``.
    """

    @abstractmethod
    async def generate_map(
        self,
        resource_path: str,
        resource_type: str,
        resource_id: str | None = None,
    ) -> ResourceMap:
        """Analyze a resource and return its structured map.

        Args:
            resource_path: Path to the resource file.
            resource_type: Type of resource ("document", "video", "audio", "image").
            resource_id: Optional custom ID. If not provided, derived from filename.

        Returns:
            A ResourceMap with nodes pointing to locations in the resource.
        """
        ...