uridx-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
uridx-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,18 @@
+ Metadata-Version: 2.3
+ Name: uridx
+ Version: 0.1.0
+ Summary: uridx - Personal Reference Index
+ Author: Justyn Shull
+ Requires-Dist: fastmcp>=2.0,<3
+ Requires-Dist: sqlmodel>=0.0.16
+ Requires-Dist: httpx>=0.27
+ Requires-Dist: typer>=0.12
+ Requires-Dist: sqlite-vec>=0.1.6
+ Requires-Dist: rich>=13
+ Requires-Dist: ruff ; extra == 'dev'
+ Requires-Dist: docling>=2.66 ; extra == 'docling'
+ Requires-Dist: pdfplumber>=0.10 ; extra == 'pdf'
+ Requires-Python: >=3.12
+ Provides-Extra: dev
+ Provides-Extra: docling
+ Provides-Extra: pdf
uridx-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,36 @@
+ [project]
+ name = "uridx"
+ version = "0.1.0"
+ description = "uridx - Personal Reference Index"
+ authors = [
+     { name = "Justyn Shull" }
+ ]
+ requires-python = ">=3.12"
+ dependencies = [
+     "fastmcp>=2.0,<3",
+     "sqlmodel>=0.0.16",
+     "httpx>=0.27",
+     "typer>=0.12",
+     "sqlite-vec>=0.1.6",
+     "rich>=13",
+ ]
+
+ [project.scripts]
+ uridx = "uridx.cli.main:app"
+
+ [project.optional-dependencies]
+ dev = ["ruff"]
+ docling = ["docling>=2.66"]
+ pdf = ["pdfplumber>=0.10"]
+
+ [tool.ruff]
+ line-length = 120
+
+ [build-system]
+ requires = ["uv_build>=0.9.22,<0.10.0"]
+ build-backend = "uv_build"
+
+ [dependency-groups]
+ dev = [
+     "ruff>=0.14.10",
+ ]
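In `[project.scripts]`, the `uridx` command maps to the Typer app object at `uridx.cli.main:app`. A minimal sketch of what the installer-generated console script effectively runs (illustrative only; the real wrapper is produced by the build backend):

    import sys

    from uridx.cli.main import app

    if __name__ == "__main__":
        sys.exit(app())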
uridx-0.1.0/src/uridx/__init__.py ADDED
@@ -0,0 +1,7 @@
+ __version__ = "0.1.0"
+
+
+ def main():
+     from uridx.cli.main import app
+
+     app()
uridx-0.1.0/src/uridx/cli/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from uridx.cli.main import app
+
+ __all__ = ["app"]
uridx-0.1.0/src/uridx/cli/extract/__init__.py ADDED
@@ -0,0 +1,41 @@
+ """Extract subcommands with plugin support.
+
+ Built-in extractors and plugin discovery via entry points.
+ Plugins register under 'uridx.extractors' entry point group.
+ """
+
+ import sys
+ from importlib.metadata import entry_points
+
+ import typer
+
+ from . import claude_code, docling, image, markdown, pdf
+
+ app = typer.Typer(help="Extract content to JSONL for ingestion")
+
+ app.command("claude-code")(claude_code.extract)
+ app.command("docling")(docling.extract)
+ app.command("markdown")(markdown.extract)
+ app.command("pdf")(pdf.extract)
+ app.command("image")(image.extract)
+
+
+ def load_plugins():
+     """Load extractor plugins from entry points."""
+     try:
+         eps = entry_points(group="uridx.extractors")
+     except TypeError:
+         eps = entry_points().get("uridx.extractors", [])
+
+     for ep in eps:
+         try:
+             extractor = ep.load()
+             if isinstance(extractor, typer.Typer):
+                 app.add_typer(extractor, name=ep.name)
+             elif callable(extractor):
+                 app.command(ep.name)(extractor)
+         except Exception as e:
+             print(f"Failed to load extractor plugin '{ep.name}': {e}", file=sys.stderr)
+
+
+ load_plugins()
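Given `load_plugins()` above, a third-party package can add its own extract subcommand by exposing either a plain callable or a `typer.Typer` under the `uridx.extractors` entry point group. A minimal sketch (the `notes` name and `uridx_notes` module are hypothetical):

    # uridx_notes.py -- hypothetical plugin module
    import json
    from pathlib import Path

    import typer


    def extract(path: Path = typer.Argument(Path("."), help="Notes directory")):
        """Emit one JSONL record per .txt note under PATH."""
        for f in path.rglob("*.txt"):
            print(json.dumps({
                "source_uri": f"file://{f.resolve()}",
                "chunks": [{"text": f.read_text(encoding="utf-8"), "key": "full"}],
                "source_type": "note",
                "replace": True,
            }))

    # Registered in the plugin's pyproject.toml (hypothetical):
    # [project.entry-points."uridx.extractors"]
    # notes = "uridx_notes:extract"

Because this `extract` is a plain callable, `load_plugins()` wires it up via `app.command("notes")(extract)`; exposing a `typer.Typer` instead would mount it with `app.add_typer()`.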
uridx-0.1.0/src/uridx/cli/extract/base.py ADDED
@@ -0,0 +1,30 @@
+ """Shared utilities for extractors."""
+
+ import json
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+
+ def get_file_mtime(path: Path) -> str:
+     """Get file modification time as ISO8601 string."""
+     mtime = path.stat().st_mtime
+     return datetime.fromtimestamp(mtime, tz=timezone.utc).isoformat()
+
+
+ def output(record: dict) -> None:
+     """Output a single JSONL record to stdout."""
+     print(json.dumps(record))
+
+
+ def resolve_paths(paths: list[Path], extensions: set[str]) -> list[Path]:
+     """Resolve a list of paths to matching files."""
+     if not paths:
+         paths = [Path.cwd()]
+
+     files = []
+     for p in paths:
+         if p.is_file():
+             files.append(p)
+         elif p.is_dir():
+             files.extend(f for f in p.rglob("*") if f.suffix.lower() in extensions and f.is_file())
+     return files
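`resolve_paths` defaults to the current directory, recurses into directories, and filters by suffix case-insensitively, while explicitly listed files are kept even when their extension is not in the set. A quick illustration of that contract (the import path follows the module layout inferred above; the example paths are assumed to exist):

    from pathlib import Path

    from uridx.cli.extract.base import resolve_paths

    # Explicit file: returned as-is, the extension filter is not applied.
    resolve_paths([Path("notes.rst")], {".md"})  # -> [Path("notes.rst")]
    # Directory: recursive, keeps only matching suffixes (".MD" matches too).
    resolve_paths([Path("docs")], {".md"})       # -> all *.md files under docs/
    # Empty list: falls back to Path.cwd().
    resolve_paths([], {".md"})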
uridx-0.1.0/src/uridx/cli/extract/claude_code.py ADDED
@@ -0,0 +1,172 @@
+ """Extract Claude Code conversations from ~/.claude/projects/"""
+
+ import json
+ import sys
+ from pathlib import Path
+ from typing import Annotated, Optional
+
+ import typer
+
+ from .base import output
+
+
+ def _extract_content(message: dict) -> str:
+     content = message.get("content")
+     if isinstance(content, str):
+         return content
+     if isinstance(content, list):
+         texts = []
+         for block in content:
+             if isinstance(block, dict):
+                 if block.get("type") == "text":
+                     texts.append(block.get("text", ""))
+                 elif block.get("type") == "tool_use":
+                     texts.append(f"[Tool: {block.get('name')}]")
+         return "\n".join(texts)
+     return ""
+
+
+ def _is_tool_result(msg: dict) -> bool:
+     content = msg.get("message", {}).get("content", [])
+     if isinstance(content, list) and content:
+         first = content[0] if content else {}
+         return isinstance(first, dict) and first.get("type") == "tool_result"
+     return False
+
+
+ def _build_turns(messages: list[dict]) -> list[dict]:
+     turns = []
+     current_user = None
+     current_assistant = []
+     turn_index = 0
+
+     for msg in messages:
+         msg_type = msg.get("type")
+         if msg_type not in ("user", "assistant"):
+             continue
+
+         content = _extract_content(msg.get("message", {}))
+         if not content:
+             continue
+
+         if msg_type == "user" and not _is_tool_result(msg):
+             if current_user or current_assistant:
+                 text_parts = []
+                 if current_user:
+                     text_parts.append(f"User: {current_user}")
+                 if current_assistant:
+                     text_parts.append(f"Assistant: {' '.join(current_assistant)}")
+                 if text_parts:
+                     turns.append(
+                         {
+                             "text": "\n\n".join(text_parts),
+                             "key": f"turn-{turn_index}",
+                             "meta": {"turn_index": turn_index},
+                         }
+                     )
+                     turn_index += 1
+             current_user = content
+             current_assistant = []
+         elif msg_type == "assistant":
+             current_assistant.append(content)
+
+     if current_user or current_assistant:
+         text_parts = []
+         if current_user:
+             text_parts.append(f"User: {current_user}")
+         if current_assistant:
+             text_parts.append(f"Assistant: {' '.join(current_assistant)}")
+         if text_parts:
+             turns.append(
+                 {
+                     "text": "\n\n".join(text_parts),
+                     "key": f"turn-{turn_index}",
+                     "meta": {"turn_index": turn_index},
+                 }
+             )
+
+     return turns
+
+
+ def _parse_conversation(jsonl_path: Path) -> dict | None:
+     messages = []
+     metadata = {}
+     first_timestamp = None
+     last_timestamp = None
+
+     with open(jsonl_path, encoding="utf-8") as f:
+         for line in f:
+             line = line.strip()
+             if not line:
+                 continue
+             try:
+                 msg = json.loads(line)
+                 messages.append(msg)
+                 if first_timestamp is None:
+                     first_timestamp = msg.get("timestamp")
+                 last_timestamp = msg.get("timestamp")
+                 if not metadata and msg.get("cwd"):
+                     metadata = {
+                         "project_path": msg.get("cwd"),
+                         "agent_id": msg.get("agentId"),
+                         "session_id": msg.get("sessionId"),
+                         "git_branch": msg.get("gitBranch"),
+                         "slug": msg.get("slug"),
+                     }
+             except json.JSONDecodeError:
+                 continue
+
+     if not messages:
+         return None
+
+     chunks = _build_turns(messages)
+     if not chunks:
+         return None
+
+     title = metadata.get("slug") or jsonl_path.stem
+     metadata["started_at"] = first_timestamp
+     metadata["ended_at"] = last_timestamp
+
+     return {"chunks": chunks, "metadata": metadata, "title": title}
+
+
+ def extract(
+     path: Annotated[Optional[Path], typer.Argument(help="Projects directory")] = None,
+ ):
+     """Extract Claude Code conversations from ~/.claude/projects/"""
+     projects_dir = path or (Path.home() / ".claude" / "projects")
+
+     if not projects_dir.exists():
+         print(f"Projects directory not found: {projects_dir}", file=sys.stderr)
+         raise typer.Exit(1)
+
+     for project_dir in projects_dir.iterdir():
+         if not project_dir.is_dir():
+             continue
+
+         project_hash = project_dir.name
+
+         for jsonl_file in project_dir.glob("*.jsonl"):
+             if jsonl_file.stat().st_size == 0:
+                 continue
+
+             try:
+                 result = _parse_conversation(jsonl_file)
+             except Exception as e:
+                 print(f"Error parsing {jsonl_file}: {e}", file=sys.stderr)
+                 continue
+
+             if not result or not result["chunks"]:
+                 continue
+
+             output(
+                 {
+                     "source_uri": f"claude-code://{project_hash}/{jsonl_file.stem}",
+                     "chunks": result["chunks"],
+                     "tags": ["claude-code", "conversation"],
+                     "title": result["title"],
+                     "source_type": "claude-code",
+                     "context": json.dumps(result["metadata"]),
+                     "replace": True,
+                 }
+             )
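The turn grouping in `_build_turns` is easiest to see on a small input: a user message that is not a tool result closes the previous turn, and consecutive assistant messages accumulate into the current one. A worked example (expected values derived from the code above):

    msgs = [
        {"type": "user", "message": {"content": "Hi"}},
        {"type": "assistant", "message": {"content": "Hello!"}},
        {"type": "assistant", "message": {"content": "How can I help?"}},
        {"type": "user", "message": {"content": "Bye"}},
    ]
    turns = _build_turns(msgs)
    # turns[0]["text"] == "User: Hi\n\nAssistant: Hello! How can I help?"
    # turns[0]["key"] == "turn-0"
    # turns[1]["text"] == "User: Bye"  (flushed after the loop ends)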
uridx-0.1.0/src/uridx/cli/extract/docling.py ADDED
@@ -0,0 +1,84 @@
+ """Extract documents using docling (PDF, DOCX, XLSX, PPTX, HTML, images)."""
+
+ import json
+ import sys
+ from pathlib import Path
+ from typing import Annotated, Optional
+ from urllib.parse import urlparse
+
+ import typer
+
+ from .base import get_file_mtime, output, resolve_paths
+
+ SUPPORTED_EXTENSIONS = {
+     ".pdf", ".docx", ".xlsx", ".pptx",
+     ".html", ".xhtml", ".htm",
+     ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp",
+     ".md", ".adoc", ".csv",
+ }
+
+
+ def extract(
+     sources: Annotated[Optional[list[str]], typer.Argument(help="Files, directories, or URLs")] = None,
+ ):
+     """Extract documents using docling (requires docling)."""
+     try:
+         from docling.document_converter import DocumentConverter
+         from docling_core.transforms.chunker import HybridChunker
+     except ImportError:
+         print("docling not installed. Install with: uv pip install 'uridx[docling]'", file=sys.stderr)
+         raise typer.Exit(1)
+
+     converter = DocumentConverter()
+     chunker = HybridChunker()
+
+     sources = sources or []
+     urls = [s for s in sources if s.startswith(("http://", "https://"))]
+     local_paths = [Path(s) for s in sources if not s.startswith(("http://", "https://"))]
+
+     for url in urls:
+         _convert_source(converter, chunker, url, url, created_at=None)
+
+     for file_path in resolve_paths(local_paths, SUPPORTED_EXTENSIONS):
+         _convert_source(converter, chunker, str(file_path), f"file://{file_path.resolve()}", created_at=get_file_mtime(file_path))
+
+
+ def _convert_source(converter, chunker, source: str, source_uri: str, created_at: str | None = None):
+     """Convert a single source and output JSONL."""
+     try:
+         result = converter.convert(source)
+         doc = result.document
+         chunk_iter = chunker.chunk(dl_doc=doc)
+     except Exception as e:
+         print(f"Error processing {source}: {e}", file=sys.stderr)
+         return
+
+     chunks = []
+     for i, chunk in enumerate(chunk_iter):
+         text = chunk.text.strip() if hasattr(chunk, "text") else str(chunk).strip()
+         if text:
+             chunks.append({"text": text, "key": f"chunk-{i}"})
+
+     if not chunks:
+         return
+
+     parsed = urlparse(source)
+     if parsed.scheme in ("http", "https"):
+         title = Path(parsed.path).stem or parsed.netloc
+         ext = Path(parsed.path).suffix.lstrip(".").lower() or "html"
+     else:
+         title = Path(source).stem
+         ext = Path(source).suffix.lstrip(".").lower()
+
+     record = {
+         "source_uri": source_uri,
+         "chunks": chunks,
+         "tags": ["document", ext] if ext else ["document"],
+         "title": title,
+         "source_type": "document",
+         "context": json.dumps({"source": source}),
+         "replace": True,
+     }
+     if created_at:
+         record["created_at"] = created_at
+     output(record)
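Title and tag derivation in `_convert_source` differs for URLs and local files; the URL branch falls back to the host name and an `html` extension when the path component yields nothing. Tracing the `urlparse` logic above:

    # "https://example.com/reports/q3.pdf" -> title "q3",          ext "pdf"
    # "https://example.com/"               -> title "example.com", ext "html"
    # "/data/notes.docx"                   -> title "notes",       ext "docx"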
uridx-0.1.0/src/uridx/cli/extract/image.py ADDED
@@ -0,0 +1,67 @@
+ """Extract image descriptions via Ollama vision model."""
+
+ import base64
+ import json
+ import os
+ import sys
+ from pathlib import Path
+ from typing import Annotated, Optional
+
+ import httpx
+ import typer
+
+ from .base import get_file_mtime, output, resolve_paths
+
+ EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
+
+
+ def extract(
+     paths: Annotated[Optional[list[Path]], typer.Argument(help="Files or directories")] = None,
+     model: Annotated[str, typer.Option("--model", "-m", help="Vision model")] = "",
+     base_url: Annotated[str, typer.Option("--base-url", help="Ollama URL")] = "",
+ ):
+     """Extract image descriptions via Ollama vision model."""
+     ollama_url = base_url or os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+     vision_model = model or os.getenv("OLLAMA_VISION_MODEL", "llama3.2-vision")
+
+     for img_file in resolve_paths(paths or [], EXTENSIONS):
+         try:
+             with open(img_file, "rb") as f:
+                 image_data = base64.b64encode(f.read()).decode("utf-8")
+
+             with httpx.Client(timeout=120.0) as client:
+                 response = client.post(
+                     f"{ollama_url}/api/generate",
+                     json={
+                         "model": vision_model,
+                         "prompt": "Describe this image in detail. Include any text visible in the image.",
+                         "images": [image_data],
+                         "stream": False,
+                     },
+                 )
+                 response.raise_for_status()
+                 description = response.json()["response"]
+         except httpx.ConnectError:
+             print(f"Cannot connect to Ollama at {ollama_url}", file=sys.stderr)
+             raise typer.Exit(1)
+         except Exception as e:
+             print(f"Error describing {img_file}: {e}", file=sys.stderr)
+             continue
+
+         if not description or not description.strip():
+             continue
+
+         output(
+             {
+                 "source_uri": f"file://{img_file.resolve()}",
+                 "chunks": [
+                     {"text": description.strip(), "key": "description", "meta": {"original_filename": img_file.name}}
+                 ],
+                 "tags": ["image"],
+                 "title": img_file.stem,
+                 "source_type": "image",
+                 "context": json.dumps({"path": str(img_file), "vision_model": vision_model}),
+                 "replace": True,
+                 "created_at": get_file_mtime(img_file),
+             }
+         )
uridx-0.1.0/src/uridx/cli/extract/markdown.py ADDED
@@ -0,0 +1,100 @@
+ """Extract markdown files, splitting by headings."""
+
+ import json
+ import re
+ import sys
+ from pathlib import Path
+ from typing import Annotated, Optional
+
+ import typer
+
+ from .base import get_file_mtime, output, resolve_paths
+
+ EXTENSIONS = {".md", ".markdown"}
+
+
+ def _slugify(text: str) -> str:
+     if not text:
+         return "untitled"
+     text = re.sub(r"^#+\s*", "", text)
+     text = text.lower()
+     text = re.sub(r"[^a-z0-9]+", "-", text)
+     return text.strip("-")[:50] or "untitled"
+
+
+ def _parse(path: Path) -> list[dict]:
+     content = path.read_text(encoding="utf-8")
+     heading_pattern = r"^(#{1,6}\s+.+)$"
+     parts = re.split(heading_pattern, content, flags=re.MULTILINE)
+
+     chunks = []
+     current_heading = None
+     current_content = []
+
+     for part in parts:
+         part = part.strip()
+         if not part:
+             continue
+         if re.match(r"^#{1,6}\s+", part):
+             if current_heading or current_content:
+                 text_parts = [current_heading] if current_heading else []
+                 text_parts.extend(current_content)
+                 chunk_text = "\n\n".join(text_parts)
+                 if chunk_text.strip():
+                     chunks.append(
+                         {
+                             "text": chunk_text,
+                             "key": _slugify(current_heading) if current_heading else f"section-{len(chunks)}",
+                             "meta": {"heading": current_heading},
+                         }
+                     )
+             current_heading = part
+             current_content = []
+         else:
+             current_content.append(part)
+
+     if current_heading or current_content:
+         text_parts = [current_heading] if current_heading else []
+         text_parts.extend(current_content)
+         chunk_text = "\n\n".join(text_parts)
+         if chunk_text.strip():
+             chunks.append(
+                 {
+                     "text": chunk_text,
+                     "key": _slugify(current_heading) if current_heading else f"section-{len(chunks)}",
+                     "meta": {"heading": current_heading},
+                 }
+             )
+
+     if not chunks and content.strip():
+         chunks.append({"text": content.strip(), "key": "full", "meta": {}})
+
+     return chunks
+
+
+ def extract(
+     paths: Annotated[Optional[list[Path]], typer.Argument(help="Files or directories")] = None,
+ ):
+     """Extract markdown files, splitting by headings."""
+     for md_file in resolve_paths(paths or [], EXTENSIONS):
+         try:
+             chunks = _parse(md_file)
+         except Exception as e:
+             print(f"Error parsing {md_file}: {e}", file=sys.stderr)
+             continue
+
+         if not chunks:
+             continue
+
+         output(
+             {
+                 "source_uri": f"file://{md_file.resolve()}",
+                 "chunks": chunks,
+                 "tags": ["markdown", "document"],
+                 "title": md_file.stem,
+                 "source_type": "markdown",
+                 "context": json.dumps({"path": str(md_file)}),
+                 "replace": True,
+                 "created_at": get_file_mtime(md_file),
+             }
+         )
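Chunk keys come from `_slugify`, so each heading maps to a stable, URL-ish identifier, and a heading-less preamble falls back to a positional `section-N` key. Tracing `_parse` on a small document:

    text = "# Intro\n\nHello.\n\n## Getting Started!\n\nStep one."
    # _parse yields two chunks:
    #   key "intro":           "# Intro\n\nHello."
    #   key "getting-started": "## Getting Started!\n\nStep one."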
uridx-0.1.0/src/uridx/cli/extract/pdf.py ADDED
@@ -0,0 +1,62 @@
+ """Extract PDF files by page using pdfplumber."""
+
+ import json
+ import sys
+ from pathlib import Path
+ from typing import Annotated, Optional
+
+ import typer
+
+ from .base import output
+
+
+ def extract(
+     path: Annotated[Optional[Path], typer.Argument(help="File or directory")] = None,
+ ):
+     """Extract PDF files by page (requires pdfplumber)."""
+     try:
+         import pdfplumber
+     except ImportError:
+         print("pdfplumber not installed. Install with: uv pip install 'uridx[pdf]'", file=sys.stderr)
+         raise typer.Exit(1)
+
+     root = path or Path.cwd()
+
+     if root.is_file():
+         files = [root]
+     else:
+         files = list(root.rglob("*.pdf"))
+
+     for pdf_file in files:
+         if not pdf_file.is_file():
+             continue
+
+         chunks = []
+         try:
+             with pdfplumber.open(pdf_file) as pdf:
+                 for i, page in enumerate(pdf.pages):
+                     try:
+                         text = page.extract_text()
+                     except Exception as e:
+                         print(f"Error extracting page {i + 1} from {pdf_file}: {e}", file=sys.stderr)
+                         continue
+                     if text and text.strip():
+                         chunks.append({"text": text.strip(), "key": f"page-{i + 1}", "meta": {"page_number": i + 1}})
+         except Exception as e:
+             print(f"Error processing {pdf_file}: {e}", file=sys.stderr)
+             continue
+
+         if not chunks:
+             continue
+
+         output(
+             {
+                 "source_uri": f"file://{pdf_file.resolve()}",
+                 "chunks": chunks,
+                 "tags": ["pdf", "document"],
+                 "title": pdf_file.stem,
+                 "source_type": "pdf",
+                 "context": json.dumps({"path": str(pdf_file), "pages": len(chunks)}),
+                 "replace": True,
+             }
+         )
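All of these extractors emit the same one-record-per-line JSONL to stdout, which is what makes them composable with a downstream ingestion step. The ingest side is not part of this diff; a minimal consumer sketch for the stream (the `uridx extract pdf` invocation assumes the extract group is mounted on the main CLI as the layout suggests):

    import json
    import sys

    # e.g. uridx extract pdf ./docs | python consume.py
    for line in sys.stdin:
        record = json.loads(line)
        for chunk in record["chunks"]:
            print(record["source_uri"], chunk["key"], len(chunk["text"]))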