uridx-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uridx-0.1.0/PKG-INFO +18 -0
- uridx-0.1.0/pyproject.toml +36 -0
- uridx-0.1.0/src/uridx/__init__.py +7 -0
- uridx-0.1.0/src/uridx/cli/__init__.py +3 -0
- uridx-0.1.0/src/uridx/cli/extract/__init__.py +41 -0
- uridx-0.1.0/src/uridx/cli/extract/base.py +30 -0
- uridx-0.1.0/src/uridx/cli/extract/claude_code.py +172 -0
- uridx-0.1.0/src/uridx/cli/extract/docling.py +84 -0
- uridx-0.1.0/src/uridx/cli/extract/image.py +67 -0
- uridx-0.1.0/src/uridx/cli/extract/markdown.py +100 -0
- uridx-0.1.0/src/uridx/cli/extract/pdf.py +62 -0
- uridx-0.1.0/src/uridx/cli/main.py +112 -0
- uridx-0.1.0/src/uridx/config.py +6 -0
- uridx-0.1.0/src/uridx/db/__init__.py +3 -0
- uridx-0.1.0/src/uridx/db/engine.py +145 -0
- uridx-0.1.0/src/uridx/db/models.py +44 -0
- uridx-0.1.0/src/uridx/db/operations.py +184 -0
- uridx-0.1.0/src/uridx/embeddings/__init__.py +15 -0
- uridx-0.1.0/src/uridx/embeddings/ollama.py +51 -0
- uridx-0.1.0/src/uridx/mcp/__init__.py +3 -0
- uridx-0.1.0/src/uridx/mcp/server.py +152 -0
- uridx-0.1.0/src/uridx/search/__init__.py +3 -0
- uridx-0.1.0/src/uridx/search/hybrid.py +145 -0
uridx-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,18 @@
+Metadata-Version: 2.3
+Name: uridx
+Version: 0.1.0
+Summary: uridx - Personal Reference Index
+Author: Justyn Shull
+Requires-Dist: fastmcp>=2.0,<3
+Requires-Dist: sqlmodel>=0.0.16
+Requires-Dist: httpx>=0.27
+Requires-Dist: typer>=0.12
+Requires-Dist: sqlite-vec>=0.1.6
+Requires-Dist: rich>=13
+Requires-Dist: ruff ; extra == 'dev'
+Requires-Dist: docling>=2.66 ; extra == 'docling'
+Requires-Dist: pdfplumber>=0.10 ; extra == 'pdf'
+Requires-Python: >=3.12
+Provides-Extra: dev
+Provides-Extra: docling
+Provides-Extra: pdf
uridx-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,36 @@
+[project]
+name = "uridx"
+version = "0.1.0"
+description = "uridx - Personal Reference Index"
+authors = [
+    { name = "Justyn Shull" }
+]
+requires-python = ">=3.12"
+dependencies = [
+    "fastmcp>=2.0,<3",
+    "sqlmodel>=0.0.16",
+    "httpx>=0.27",
+    "typer>=0.12",
+    "sqlite-vec>=0.1.6",
+    "rich>=13",
+]
+
+[project.scripts]
+uridx = "uridx.cli.main:app"
+
+[project.optional-dependencies]
+dev = ["ruff"]
+docling = ["docling>=2.66"]
+pdf = ["pdfplumber>=0.10"]
+
+[tool.ruff]
+line-length = 120
+
+[build-system]
+requires = ["uv_build>=0.9.22,<0.10.0"]
+build-backend = "uv_build"
+
+[dependency-groups]
+dev = [
+    "ruff>=0.14.10",
+]
uridx-0.1.0/src/uridx/cli/extract/__init__.py
ADDED
@@ -0,0 +1,41 @@
+"""Extract subcommands with plugin support.
+
+Built-in extractors and plugin discovery via entry points.
+Plugins register under 'uridx.extractors' entry point group.
+"""
+
+import sys
+from importlib.metadata import entry_points
+
+import typer
+
+from . import claude_code, docling, image, markdown, pdf
+
+app = typer.Typer(help="Extract content to JSONL for ingestion")
+
+app.command("claude-code")(claude_code.extract)
+app.command("docling")(docling.extract)
+app.command("markdown")(markdown.extract)
+app.command("pdf")(pdf.extract)
+app.command("image")(image.extract)
+
+
+def load_plugins():
+    """Load extractor plugins from entry points."""
+    try:
+        eps = entry_points(group="uridx.extractors")
+    except TypeError:
+        eps = entry_points().get("uridx.extractors", [])
+
+    for ep in eps:
+        try:
+            extractor = ep.load()
+            if isinstance(extractor, typer.Typer):
+                app.add_typer(extractor, name=ep.name)
+            elif callable(extractor):
+                app.command(ep.name)(extractor)
+        except Exception as e:
+            print(f"Failed to load extractor plugin '{ep.name}': {e}", file=sys.stderr)
+
+
+load_plugins()
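The module above wires the built-in extractors into a Typer app and then discovers third-party ones through the `uridx.extractors` entry point group, accepting either a `typer.Typer` sub-app or a plain callable. As a rough sketch (hypothetical package and function names, not part of this release), a plugin could expose a callable like the one below and map an entry point in the `uridx.extractors` group to it in its own packaging metadata, so `load_plugins()` registers it as an extract subcommand:

# Hypothetical plugin module, e.g. uridx_notes/extract.py (illustrative only).
import json
from pathlib import Path
from typing import Annotated, Optional

import typer


def extract(
    paths: Annotated[Optional[list[Path]], typer.Argument(help="Text files")] = None,
):
    """Emit one JSONL record per plain-text note."""
    for txt_file in paths or []:
        if txt_file.is_file():
            # Mirrors the record shape emitted by the built-in extractors.
            print(json.dumps({
                "source_uri": f"file://{txt_file.resolve()}",
                "chunks": [{"text": txt_file.read_text(encoding="utf-8"), "key": "full"}],
                "tags": ["note"],
                "title": txt_file.stem,
                "source_type": "note",
                "replace": True,
            }))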
uridx-0.1.0/src/uridx/cli/extract/base.py
ADDED
@@ -0,0 +1,30 @@
+"""Shared utilities for extractors."""
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+
+
+def get_file_mtime(path: Path) -> str:
+    """Get file modification time as ISO8601 string."""
+    mtime = path.stat().st_mtime
+    return datetime.fromtimestamp(mtime, tz=timezone.utc).isoformat()
+
+
+def output(record: dict) -> None:
+    """Output a single JSONL record to stdout."""
+    print(json.dumps(record))
+
+
+def resolve_paths(paths: list[Path], extensions: set[str]) -> list[Path]:
+    """Resolve a list of paths to matching files."""
+    if not paths:
+        paths = [Path.cwd()]
+
+    files = []
+    for p in paths:
+        if p.is_file():
+            files.append(p)
+        elif p.is_dir():
+            files.extend(f for f in p.rglob("*") if f.suffix.lower() in extensions and f.is_file())
+    return files
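These helpers define the shared JSONL contract: `resolve_paths` expands files and directories, `output` prints one JSON object per line to stdout, and `get_file_mtime` supplies an ISO8601 UTC timestamp used as `created_at`. A minimal sketch of that pattern (illustrative paths only; the record keys match those used by the built-in extractors):

# Illustrative use of the base.py helpers; not shipped in the package.
from pathlib import Path

from uridx.cli.extract.base import get_file_mtime, output, resolve_paths

for f in resolve_paths([Path("notes")], {".txt"}):
    output({
        "source_uri": f"file://{f.resolve()}",
        "chunks": [{"text": f.read_text(encoding="utf-8"), "key": "full"}],
        "title": f.stem,
        "created_at": get_file_mtime(f),  # ISO8601 UTC mtime
    })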
uridx-0.1.0/src/uridx/cli/extract/claude_code.py
ADDED
@@ -0,0 +1,172 @@
+"""Extract Claude Code conversations from ~/.claude/projects/"""
+
+import json
+import sys
+from pathlib import Path
+from typing import Annotated, Optional
+
+import typer
+
+from .base import output
+
+
+def _extract_content(message: dict) -> str:
+    content = message.get("content")
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        texts = []
+        for block in content:
+            if isinstance(block, dict):
+                if block.get("type") == "text":
+                    texts.append(block.get("text", ""))
+                elif block.get("type") == "tool_use":
+                    texts.append(f"[Tool: {block.get('name')}]")
+        return "\n".join(texts)
+    return ""
+
+
+def _is_tool_result(msg: dict) -> bool:
+    content = msg.get("message", {}).get("content", [])
+    if isinstance(content, list) and content:
+        first = content[0] if content else {}
+        return isinstance(first, dict) and first.get("type") == "tool_result"
+    return False
+
+
+def _build_turns(messages: list[dict]) -> list[dict]:
+    turns = []
+    current_user = None
+    current_assistant = []
+    turn_index = 0
+
+    for msg in messages:
+        msg_type = msg.get("type")
+        if msg_type not in ("user", "assistant"):
+            continue
+
+        content = _extract_content(msg.get("message", {}))
+        if not content:
+            continue
+
+        if msg_type == "user" and not _is_tool_result(msg):
+            if current_user or current_assistant:
+                text_parts = []
+                if current_user:
+                    text_parts.append(f"User: {current_user}")
+                if current_assistant:
+                    text_parts.append(f"Assistant: {' '.join(current_assistant)}")
+                if text_parts:
+                    turns.append(
+                        {
+                            "text": "\n\n".join(text_parts),
+                            "key": f"turn-{turn_index}",
+                            "meta": {"turn_index": turn_index},
+                        }
+                    )
+                    turn_index += 1
+            current_user = content
+            current_assistant = []
+        elif msg_type == "assistant":
+            current_assistant.append(content)
+
+    if current_user or current_assistant:
+        text_parts = []
+        if current_user:
+            text_parts.append(f"User: {current_user}")
+        if current_assistant:
+            text_parts.append(f"Assistant: {' '.join(current_assistant)}")
+        if text_parts:
+            turns.append(
+                {
+                    "text": "\n\n".join(text_parts),
+                    "key": f"turn-{turn_index}",
+                    "meta": {"turn_index": turn_index},
+                }
+            )
+
+    return turns
+
+
+def _parse_conversation(jsonl_path: Path) -> dict | None:
+    messages = []
+    metadata = {}
+    first_timestamp = None
+    last_timestamp = None
+
+    with open(jsonl_path, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                msg = json.loads(line)
+                messages.append(msg)
+                if first_timestamp is None:
+                    first_timestamp = msg.get("timestamp")
+                last_timestamp = msg.get("timestamp")
+                if not metadata and msg.get("cwd"):
+                    metadata = {
+                        "project_path": msg.get("cwd"),
+                        "agent_id": msg.get("agentId"),
+                        "session_id": msg.get("sessionId"),
+                        "git_branch": msg.get("gitBranch"),
+                        "slug": msg.get("slug"),
+                    }
+            except json.JSONDecodeError:
+                continue
+
+    if not messages:
+        return None
+
+    chunks = _build_turns(messages)
+    if not chunks:
+        return None
+
+    title = metadata.get("slug") or jsonl_path.stem
+    metadata["started_at"] = first_timestamp
+    metadata["ended_at"] = last_timestamp
+
+    return {"chunks": chunks, "metadata": metadata, "title": title}
+
+
+def extract(
+    path: Annotated[Optional[Path], typer.Argument(help="Projects directory")] = None,
+):
+    """Extract Claude Code conversations from ~/.claude/projects/"""
+    projects_dir = path or (Path.home() / ".claude" / "projects")
+
+    if not projects_dir.exists():
+        print(f"Projects directory not found: {projects_dir}", file=sys.stderr)
+        raise typer.Exit(1)
+
+    for project_dir in projects_dir.iterdir():
+        if not project_dir.is_dir():
+            continue
+
+        project_hash = project_dir.name
+
+        for jsonl_file in project_dir.glob("*.jsonl"):
+            if jsonl_file.stat().st_size == 0:
+                continue
+
+            try:
+                result = _parse_conversation(jsonl_file)
+            except Exception as e:
+                print(f"Error parsing {jsonl_file}: {e}", file=sys.stderr)
+                continue
+
+            if not result or not result["chunks"]:
+                continue
+
+            output(
+                {
+                    "source_uri": f"claude-code://{project_hash}/{jsonl_file.stem}",
+                    "chunks": result["chunks"],
+                    "tags": ["claude-code", "conversation"],
+                    "title": result["title"],
+                    "source_type": "claude-code",
+                    "context": json.dumps(result["metadata"]),
+                    "replace": True,
+                }
+            )
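This extractor groups transcript messages into user/assistant turns before emitting a single record per conversation. A small worked example of `_build_turns` (invented transcript, shown only to illustrate the chunk shape):

# Illustrative only: the chunk _build_turns() yields for a two-message transcript.
messages = [
    {"type": "user", "message": {"content": "How do I list files?"}},
    {"type": "assistant", "message": {"content": [{"type": "text", "text": "Use ls."}]}},
]
# _build_turns(messages) returns:
# [{"text": "User: How do I list files?\n\nAssistant: Use ls.",
#   "key": "turn-0",
#   "meta": {"turn_index": 0}}]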
uridx-0.1.0/src/uridx/cli/extract/docling.py
ADDED
@@ -0,0 +1,84 @@
+"""Extract documents using docling (PDF, DOCX, XLSX, PPTX, HTML, images)."""
+
+import json
+import sys
+from pathlib import Path
+from typing import Annotated, Optional
+from urllib.parse import urlparse
+
+import typer
+
+from .base import get_file_mtime, output, resolve_paths
+
+SUPPORTED_EXTENSIONS = {
+    ".pdf", ".docx", ".xlsx", ".pptx",
+    ".html", ".xhtml", ".htm",
+    ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp",
+    ".md", ".adoc", ".csv",
+}
+
+
+def extract(
+    sources: Annotated[Optional[list[str]], typer.Argument(help="Files, directories, or URLs")] = None,
+):
+    """Extract documents using docling (requires docling)."""
+    try:
+        from docling.document_converter import DocumentConverter
+        from docling_core.transforms.chunker import HybridChunker
+    except ImportError:
+        print("docling not installed. Install with: uv pip install 'uridx[docling]'", file=sys.stderr)
+        raise typer.Exit(1)
+
+    converter = DocumentConverter()
+    chunker = HybridChunker()
+
+    sources = sources or []
+    urls = [s for s in sources if s.startswith(("http://", "https://"))]
+    local_paths = [Path(s) for s in sources if not s.startswith(("http://", "https://"))]
+
+    for url in urls:
+        _convert_source(converter, chunker, url, url, created_at=None)
+
+    for file_path in resolve_paths(local_paths, SUPPORTED_EXTENSIONS):
+        _convert_source(converter, chunker, str(file_path), f"file://{file_path.resolve()}", created_at=get_file_mtime(file_path))
+
+
+def _convert_source(converter, chunker, source: str, source_uri: str, created_at: str | None = None):
+    """Convert a single source and output JSONL."""
+    try:
+        result = converter.convert(source)
+        doc = result.document
+        chunk_iter = chunker.chunk(dl_doc=doc)
+    except Exception as e:
+        print(f"Error processing {source}: {e}", file=sys.stderr)
+        return
+
+    chunks = []
+    for i, chunk in enumerate(chunk_iter):
+        text = chunk.text.strip() if hasattr(chunk, "text") else str(chunk).strip()
+        if text:
+            chunks.append({"text": text, "key": f"chunk-{i}"})
+
+    if not chunks:
+        return
+
+    parsed = urlparse(source)
+    if parsed.scheme in ("http", "https"):
+        title = Path(parsed.path).stem or parsed.netloc
+        ext = Path(parsed.path).suffix.lstrip(".").lower() or "html"
+    else:
+        title = Path(source).stem
+        ext = Path(source).suffix.lstrip(".").lower()
+
+    record = {
+        "source_uri": source_uri,
+        "chunks": chunks,
+        "tags": ["document", ext] if ext else ["document"],
+        "title": title,
+        "source_type": "document",
+        "context": json.dumps({"source": source}),
+        "replace": True,
+    }
+    if created_at:
+        record["created_at"] = created_at
+    output(record)
uridx-0.1.0/src/uridx/cli/extract/image.py
ADDED
@@ -0,0 +1,67 @@
+"""Extract image descriptions via Ollama vision model."""
+
+import base64
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Annotated, Optional
+
+import httpx
+import typer
+
+from .base import get_file_mtime, output, resolve_paths
+
+EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
+
+
+def extract(
+    paths: Annotated[Optional[list[Path]], typer.Argument(help="Files or directories")] = None,
+    model: Annotated[str, typer.Option("--model", "-m", help="Vision model")] = "",
+    base_url: Annotated[str, typer.Option("--base-url", help="Ollama URL")] = "",
+):
+    """Extract image descriptions via Ollama vision model."""
+    ollama_url = base_url or os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+    vision_model = model or os.getenv("OLLAMA_VISION_MODEL", "llama3.2-vision")
+
+    for img_file in resolve_paths(paths or [], EXTENSIONS):
+        try:
+            with open(img_file, "rb") as f:
+                image_data = base64.b64encode(f.read()).decode("utf-8")
+
+            with httpx.Client(timeout=120.0) as client:
+                response = client.post(
+                    f"{ollama_url}/api/generate",
+                    json={
+                        "model": vision_model,
+                        "prompt": "Describe this image in detail. Include any text visible in the image.",
+                        "images": [image_data],
+                        "stream": False,
+                    },
+                )
+                response.raise_for_status()
+                description = response.json()["response"]
+        except httpx.ConnectError:
+            print(f"Cannot connect to Ollama at {ollama_url}", file=sys.stderr)
+            raise typer.Exit(1)
+        except Exception as e:
+            print(f"Error describing {img_file}: {e}", file=sys.stderr)
+            continue
+
+        if not description or not description.strip():
+            continue
+
+        output(
+            {
+                "source_uri": f"file://{img_file.resolve()}",
+                "chunks": [
+                    {"text": description.strip(), "key": "description", "meta": {"original_filename": img_file.name}}
+                ],
+                "tags": ["image"],
+                "title": img_file.stem,
+                "source_type": "image",
+                "context": json.dumps({"path": str(img_file), "vision_model": vision_model}),
+                "replace": True,
+                "created_at": get_file_mtime(img_file),
+            }
+        )
uridx-0.1.0/src/uridx/cli/extract/markdown.py
ADDED
@@ -0,0 +1,100 @@
+"""Extract markdown files, splitting by headings."""
+
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Annotated, Optional
+
+import typer
+
+from .base import get_file_mtime, output, resolve_paths
+
+EXTENSIONS = {".md", ".markdown"}
+
+
+def _slugify(text: str) -> str:
+    if not text:
+        return "untitled"
+    text = re.sub(r"^#+\s*", "", text)
+    text = text.lower()
+    text = re.sub(r"[^a-z0-9]+", "-", text)
+    return text.strip("-")[:50] or "untitled"
+
+
+def _parse(path: Path) -> list[dict]:
+    content = path.read_text(encoding="utf-8")
+    heading_pattern = r"^(#{1,6}\s+.+)$"
+    parts = re.split(heading_pattern, content, flags=re.MULTILINE)
+
+    chunks = []
+    current_heading = None
+    current_content = []
+
+    for part in parts:
+        part = part.strip()
+        if not part:
+            continue
+        if re.match(r"^#{1,6}\s+", part):
+            if current_heading or current_content:
+                text_parts = [current_heading] if current_heading else []
+                text_parts.extend(current_content)
+                chunk_text = "\n\n".join(text_parts)
+                if chunk_text.strip():
+                    chunks.append(
+                        {
+                            "text": chunk_text,
+                            "key": _slugify(current_heading) if current_heading else f"section-{len(chunks)}",
+                            "meta": {"heading": current_heading},
+                        }
+                    )
+            current_heading = part
+            current_content = []
+        else:
+            current_content.append(part)
+
+    if current_heading or current_content:
+        text_parts = [current_heading] if current_heading else []
+        text_parts.extend(current_content)
+        chunk_text = "\n\n".join(text_parts)
+        if chunk_text.strip():
+            chunks.append(
+                {
+                    "text": chunk_text,
+                    "key": _slugify(current_heading) if current_heading else f"section-{len(chunks)}",
+                    "meta": {"heading": current_heading},
+                }
+            )
+
+    if not chunks and content.strip():
+        chunks.append({"text": content.strip(), "key": "full", "meta": {}})
+
+    return chunks
+
+
+def extract(
+    paths: Annotated[Optional[list[Path]], typer.Argument(help="Files or directories")] = None,
+):
+    """Extract markdown files, splitting by headings."""
+    for md_file in resolve_paths(paths or [], EXTENSIONS):
+        try:
+            chunks = _parse(md_file)
+        except Exception as e:
+            print(f"Error parsing {md_file}: {e}", file=sys.stderr)
+            continue
+
+        if not chunks:
+            continue
+
+        output(
+            {
+                "source_uri": f"file://{md_file.resolve()}",
+                "chunks": chunks,
+                "tags": ["markdown", "document"],
+                "title": md_file.stem,
+                "source_type": "markdown",
+                "context": json.dumps({"path": str(md_file)}),
+                "replace": True,
+                "created_at": get_file_mtime(md_file),
+            }
+        )
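The markdown extractor splits on ATX headings and slugifies each heading into a chunk key. For a small file, the behaviour of `_parse` looks roughly like this (invented content, for illustration only):

# Illustrative only; exercises the private _parse() helper shown above.
from pathlib import Path

from uridx.cli.extract.markdown import _parse

sample = Path("example.md")
sample.write_text("# Intro\nHello.\n## Usage\nRun it.\n", encoding="utf-8")
print(_parse(sample))
# [{'text': '# Intro\n\nHello.', 'key': 'intro', 'meta': {'heading': '# Intro'}},
#  {'text': '## Usage\n\nRun it.', 'key': 'usage', 'meta': {'heading': '## Usage'}}]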
uridx-0.1.0/src/uridx/cli/extract/pdf.py
ADDED
@@ -0,0 +1,62 @@
+"""Extract PDF files by page using pdfplumber."""
+
+import json
+import sys
+from pathlib import Path
+from typing import Annotated, Optional
+
+import typer
+
+from .base import output
+
+
+def extract(
+    path: Annotated[Optional[Path], typer.Argument(help="File or directory")] = None,
+):
+    """Extract PDF files by page (requires pdfplumber)."""
+    try:
+        import pdfplumber
+    except ImportError:
+        print("pdfplumber not installed. Install with: uv pip install 'uridx[pdf]'", file=sys.stderr)
+        raise typer.Exit(1)
+
+    root = path or Path.cwd()
+
+    if root.is_file():
+        files = [root]
+    else:
+        files = list(root.rglob("*.pdf"))
+
+    for pdf_file in files:
+        if not pdf_file.is_file():
+            continue
+
+        chunks = []
+        try:
+            with pdfplumber.open(pdf_file) as pdf:
+                for i, page in enumerate(pdf.pages):
+                    try:
+                        text = page.extract_text()
+                    except Exception as e:
+                        print(f"Error extracting page {i + 1} from {pdf_file}: {e}", file=sys.stderr)
+                        continue
+                    if text and text.strip():
+                        chunks.append({"text": text.strip(), "key": f"page-{i + 1}", "meta": {"page_number": i + 1}})
+        except Exception as e:
+            print(f"Error processing {pdf_file}: {e}", file=sys.stderr)
+            continue
+
+        if not chunks:
+            continue
+
+        output(
+            {
+                "source_uri": f"file://{pdf_file.resolve()}",
+                "chunks": chunks,
+                "tags": ["pdf", "document"],
+                "title": pdf_file.stem,
+                "source_type": "pdf",
+                "context": json.dumps({"path": str(pdf_file), "pages": len(chunks)}),
+                "replace": True,
+            }
+        )