ragtime-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ragtime-cli might be problematic. Click here for more details.

src/indexers/docs.py ADDED
@@ -0,0 +1,129 @@
1
+ """
2
+ Docs indexer - parses markdown files with YAML frontmatter.
3
+
4
+ Designed for .claude/memory/ style files but works with any markdown.
5
+ """
6
+
7
+ import re
8
+ from pathlib import Path
9
+ from dataclasses import dataclass
10
+ import yaml
11
+
12
+
13
+ @dataclass
14
+ class DocEntry:
15
+ """A parsed document ready for indexing."""
16
+ content: str
17
+ file_path: str
18
+ namespace: str | None = None
19
+ category: str | None = None
20
+ component: str | None = None
21
+ title: str | None = None
22
+
23
+ def to_metadata(self) -> dict:
24
+ """Convert to ChromaDB metadata dict."""
25
+ return {
26
+ "type": "docs",
27
+ "file": self.file_path,
28
+ "namespace": self.namespace or "default",
29
+ "category": self.category or "",
30
+ "component": self.component or "",
31
+ "title": self.title or Path(self.file_path).stem,
32
+ }
33
+
34
+
35
def parse_frontmatter(content: str) -> tuple[dict, str]:
    """
    Parse YAML frontmatter from markdown content.

    Frontmatter is a YAML block enclosed by ``---`` lines at the very start
    of the text. The closing ``---`` may be the last line of the text, in
    which case the body is empty.

    Returns (metadata_dict, body_content).
    If there is no frontmatter, the block is not valid YAML, or it parses to
    something other than a mapping (e.g. plain text between two ``---``
    rules), returns ({}, full_content).
    """
    # The body group is optional so a file that ends right after the closing
    # delimiter (no trailing newline) is still recognised as frontmatter.
    pattern = r'^---\s*\n(.*?)\n---\s*(?:\n(.*))?$'
    match = re.match(pattern, content, re.DOTALL)

    if not match:
        return {}, content

    try:
        # safe_load never constructs arbitrary Python objects; an empty block
        # yields None, which is normalised to an empty dict.
        metadata = yaml.safe_load(match.group(1)) or {}
    except yaml.YAMLError:
        return {}, content

    # safe_load returns a str/list/number for non-mapping YAML. Callers use
    # dict methods on the result, so such a block is treated as ordinary
    # document text rather than metadata.
    if not isinstance(metadata, dict):
        return {}, content

    return metadata, match.group(2) or ""
54
+
55
+
56
+ def index_file(file_path: Path) -> DocEntry | None:
57
+ """
58
+ Parse a single markdown file into a DocEntry.
59
+
60
+ Returns None if file can't be parsed.
61
+ """
62
+ try:
63
+ content = file_path.read_text(encoding='utf-8')
64
+ except (IOError, UnicodeDecodeError):
65
+ return None
66
+
67
+ metadata, body = parse_frontmatter(content)
68
+
69
+ # Skip empty documents
70
+ if not body.strip():
71
+ return None
72
+
73
+ return DocEntry(
74
+ content=body.strip(),
75
+ file_path=str(file_path),
76
+ namespace=metadata.get("namespace"),
77
+ category=metadata.get("category"),
78
+ component=metadata.get("component"),
79
+ title=metadata.get("title"),
80
+ )
81
+
82
+
83
+ def discover_docs(
84
+ root: Path,
85
+ patterns: list[str] | None = None,
86
+ exclude: list[str] | None = None,
87
+ ) -> list[Path]:
88
+ """
89
+ Find all markdown files to index.
90
+
91
+ Args:
92
+ root: Directory to search
93
+ patterns: Glob patterns to include (default: ["**/*.md"])
94
+ exclude: Patterns to exclude (default: ["**/node_modules/**", "**/.git/**"])
95
+ """
96
+ patterns = patterns or ["**/*.md"]
97
+ exclude = exclude or ["**/node_modules/**", "**/.git/**", "**/.ragtime/**"]
98
+
99
+ files = []
100
+ for pattern in patterns:
101
+ for path in root.glob(pattern):
102
+ if path.is_file():
103
+ # Check exclusions
104
+ skip = False
105
+ for ex in exclude:
106
+ if path.match(ex):
107
+ skip = True
108
+ break
109
+ if not skip:
110
+ files.append(path)
111
+
112
+ return files
113
+
114
+
115
def index_directory(root: Path, **kwargs) -> list[DocEntry]:
    """
    Build DocEntry objects for every discoverable file under a directory.

    Keyword arguments are passed straight through to discover_docs. Files
    for which index_file yields no entry are left out of the result.
    """
    return [
        entry
        for candidate in discover_docs(root, **kwargs)
        if (entry := index_file(candidate))
    ]