wikigen-1.0.0-py3-none-any.whl

wikigen/mcp/server.py ADDED
@@ -0,0 +1,232 @@
+"""MCP server implementation for wikigen."""
+
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from mcp.server.fastmcp import FastMCP
+
+from .output_resources import discover_all_projects
+from ..config import get_output_dir
+from .search_index import FileIndexer
+
+# Initialize the MCP server
+# Instructions help editors/clients understand what this server provides
+app = FastMCP(
+    "wikigen",
+    instructions=(
+        "Expose local wiki markdown files as MCP tools. "
+        "Available tools: search_docs (semantic search across indexed directories), "
+        "get_docs (fetch content by resource name or file path), and index_directories (index dirs for search). "
+        "Doc names mirror paths under your configured output_dir (without .md extension)."
+    ),
+)
+
+# Initialize search indexer for fast machine-wide search
+_indexer: Optional[FileIndexer] = None
+
+
+def _get_indexer() -> FileIndexer:
+    """Get or create the search indexer instance."""
+    global _indexer
+    if _indexer is None:
+        _indexer = FileIndexer()
+    return _indexer
+
+
+# Store discovered projects (refreshed on each request)
+_projects: Dict[str, Path] = {}
+
+
+def _refresh_projects():
+    """Refresh the project registry."""
+    global _projects
+    _projects = discover_all_projects()
+
+
+def _get_project_resources():
+    """Get list of available project resources."""
+    _refresh_projects()
+    return _projects
+
+
+# MCP Tools - executable actions for interacting with documentation
+@app.tool()
+def get_docs(identifier: str) -> str:
+    """
+    Get the full content of a documentation file by resource name or absolute file path.
+
+    This tool can fetch documentation using either:
+    - Resource name (e.g., 'README', 'Order Management/Felis Stream') - searches output_dir
+    - Absolute file path (e.g., '/Users/name/Documents/doc.md') - works with any indexed file
+
+    Args:
+        identifier: Either a resource name (from output_dir) or an absolute file path
+
+    Returns:
+        The markdown content of the requested documentation file.
+    """
+    # Check if it looks like an absolute path
+    path_obj = Path(identifier)
+    if path_obj.is_absolute() and path_obj.exists():
+        # Treat as absolute file path
+        try:
+            content = path_obj.read_text(encoding="utf-8")
+            return content
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to read file at path '{identifier}': {e}"
+            ) from e
+
+    # Treat as resource name from output_dir
+    projects = _get_project_resources()
+
+    if identifier not in projects:
+        # Provide helpful error message
+        available = ", ".join(sorted(projects.keys())[:10]) if projects else "none"
+        if len(projects) > 10:
+            available += f" ... (and {len(projects) - 10} more)"
+        raise ValueError(
+            f"Documentation '{identifier}' not found in output directory. "
+            f"Available resources: {available}. "
+            f"If you meant to use a file path, provide an absolute path starting with '/'."
+        )
+
+    doc_path = projects[identifier]
+    try:
+        content = doc_path.read_text(encoding="utf-8")
+        return content
+    except Exception as e:
+        raise RuntimeError(
+            f"Failed to read documentation for '{identifier}': {e}"
+        ) from e
+
+
+@app.tool()
+def search_docs(
+    query: str,
+    limit: int = 20,
+    directory_filter: Optional[str] = None,
+    chunk_limit: int = 5,
+) -> str:
+    """
+    Search for markdown files across indexed directories using semantic search.
+
+    This tool uses semantic search to find relevant chunks from indexed documentation.
+    It returns relevant chunks with content snippets instead of entire files.
+    Index directories first using index_directories, or files are auto-indexed from
+    the configured output_dir on first search.
+
+    Args:
+        query: Search query (supports multi-word queries and natural language)
+        limit: Maximum number of chunks to return (default: 20)
+        directory_filter: Optional directory path to filter results
+        chunk_limit: Maximum chunks per file (default: 5)
+
+    Returns:
+        Formatted list of relevant chunks with their paths, resource names, scores, and content snippets.
+    """
+    indexer = _get_indexer()
+
+    # Auto-index default output directory if no files indexed yet
+    stats = indexer.get_stats()
+    if stats["total_files"] == 0:
+        output_dir = get_output_dir()
+        if output_dir.exists():
+            added, updated, skipped = indexer.index_directory(output_dir)
+            if added > 0 or updated > 0:
+                return f"Indexed {added} new files, updated {updated}. Try searching again."
+
+    # Always use semantic search
+    if not indexer.enable_semantic_search or not indexer.vector_index:
+        # Fallback to keyword search if semantic search is not available
+        results = indexer.search(query, limit=limit, directory_filter=directory_filter)
+        if not results:
+            return f"No files found matching '{query}'."
+
+        # Format file results
+        lines = [f"Found {len(results)} file(s) matching '{query}':\n"]
+        for i, result in enumerate(results, 1):
+            lines.append(
+                f"{i}. {result['resource_name']}\n"
+                f" Path: {result['file_path']}\n"
+                f" Directory: {result['directory']}"
+            )
+        return "\n".join(lines)
+
+    # Use semantic search
+    results = indexer.search_semantic(
+        query,
+        limit=limit,
+        directory_filter=directory_filter,
+        max_chunks_per_file=chunk_limit,
+    )
+
+    if not results:
+        return f"No chunks found matching '{query}'."
+
+    # Format chunk results
+    lines = [f"Found {len(results)} relevant chunk(s) matching '{query}':\n"]
+    for i, result in enumerate(results, 1):
+        # Truncate chunk content for display (first 200 chars)
+        content_snippet = result.get("content", "")[:200]
+        if len(result.get("content", "")) > 200:
+            content_snippet += "..."
+
+        lines.append(
+            f"{i}. {result['resource_name']} (chunk {result.get('chunk_index', 0)})\n"
+            f" Path: {result['file_path']}\n"
+            f" Score: {result.get('score', 0):.4f}\n"
+            f" Content: {content_snippet}"
+        )
+
+    return "\n".join(lines)
+
+
+@app.tool()
+def index_directories(directories: List[str], max_depth: Optional[int] = None) -> str:
+    """
+    Index markdown files from specified directories for fast searching.
+
+    Args:
+        directories: List of directory paths to index
+        max_depth: Maximum recursion depth (None = unlimited)
+
+    Returns:
+        Summary of indexing results.
+    """
+    indexer = _get_indexer()
+
+    total_added = 0
+    total_updated = 0
+    total_skipped = 0
+
+    results = []
+
+    for dir_path in directories:
+        path = Path(dir_path).expanduser()
+        if not path.exists():
+            results.append(f"✗ {dir_path}: Directory does not exist")
+            continue
+
+        if not path.is_dir():
+            results.append(f"✗ {dir_path}: Path is not a directory")
+            continue
+
+        added, updated, skipped = indexer.index_directory(path, max_depth=max_depth)
+        total_added += added
+        total_updated += updated
+        total_skipped += skipped
+
+        results.append(
+            f"✓ {dir_path}: {added} added, {updated} updated, {skipped} skipped"
+        )
+
+    summary = "\n".join(results)
+    summary += f"\n\nTotal: {total_added} added, {total_updated} updated, {total_skipped} skipped"
+
+    return summary
+
+
+def run_mcp_server():
+    """Entry point to run MCP server."""
+    app.run()
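
A minimal usage sketch (illustrative, not one of the packaged files): assuming wikigen and the MCP Python SDK (`mcp`) are installed, the server can be launched via its run_mcp_server entry point, which delegates to FastMCP.run() and serves the registered tools (get_docs, search_docs, index_directories) over stdio by default.

    # Sketch only - assumes the wikigen package is importable in this environment.
    from wikigen.mcp.server import run_mcp_server

    if __name__ == "__main__":
        run_mcp_server()  # blocks; an MCP client connects over stdio and calls the tools
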
@@ -0,0 +1,297 @@
+"""FAISS vector index management for semantic search.
+
+This module provides FAISS index management for storing and searching
+document chunk embeddings.
+"""
+
+import pickle
+from pathlib import Path
+from typing import List, Dict, Any, Optional, Tuple
+from threading import Lock
+import numpy as np
+
+try:
+    import faiss
+
+    FAISS_AVAILABLE = True
+except ImportError:
+    FAISS_AVAILABLE = False
+    faiss = None
+
+from ..config import CONFIG_DIR
+
+
+class VectorIndex:
+    """
+    FAISS vector index manager for semantic search.
+
+    Manages a FAISS index for storing and searching document chunk embeddings.
+    Also maintains metadata mapping chunk IDs to file paths and chunk information.
+    """
+
+    def __init__(self, index_path: Optional[Path] = None, embedding_dim: int = 384):
+        """
+        Initialize the vector index.
+
+        Args:
+            index_path: Path to save/load the FAISS index. Defaults to config_dir/vector_index.faiss
+            embedding_dim: Dimension of embeddings (default: 384 for all-MiniLM-L6-v2)
+        """
+        if not FAISS_AVAILABLE:
+            raise ImportError(
+                "FAISS is not available. Please install faiss-cpu: pip install faiss-cpu"
+            )
+
+        if index_path is None:
+            index_path = CONFIG_DIR / "vector_index.faiss"
+
+        self.index_path = index_path
+        self.metadata_path = index_path.with_suffix(".metadata.pkl")
+        self.embedding_dim = embedding_dim
+        self._lock = Lock()
+
+        # FAISS index (FlatIndex for exact search)
+        self.index: Optional[faiss.Index] = None
+
+        # Metadata: chunk_id -> {file_path, chunk_index, content, start_pos, end_pos}
+        self.metadata: Dict[int, Dict[str, Any]] = {}
+
+        # File to chunk IDs mapping: file_path -> [chunk_id, ...]
+        self.file_to_chunks: Dict[str, List[int]] = {}
+
+        # Next chunk ID
+        self.next_chunk_id = 0
+
+        # Load existing index if available
+        self._load()
+
+    def _load(self) -> None:
+        """Load FAISS index and metadata from disk."""
+        with self._lock:
+            if self.index_path.exists() and self.metadata_path.exists():
+                try:
+                    # Load FAISS index
+                    self.index = faiss.read_index(str(self.index_path))
+
+                    # Load metadata
+                    with open(self.metadata_path, "rb") as f:
+                        data = pickle.load(f)
+                        self.metadata = data.get("metadata", {})
+                        self.file_to_chunks = data.get("file_to_chunks", {})
+                        self.next_chunk_id = data.get("next_chunk_id", 0)
+
+                    # Verify embedding dimension matches
+                    if self.index.d != self.embedding_dim:
+                        raise ValueError(
+                            f"Index embedding dimension ({self.index.d}) "
+                            f"does not match expected ({self.embedding_dim})"
+                        )
+                except Exception as e:
+                    # If loading fails, start fresh
+                    print(f"Warning: Could not load vector index: {e}")
+                    self._init_index()
+            else:
+                self._init_index()
+
+    def _init_index(self) -> None:
+        """Initialize a new FAISS index."""
+        # Use FlatIndex for exact search (good for <1M vectors)
+        # For larger datasets, consider IVF or HNSW
+        self.index = faiss.IndexFlatL2(self.embedding_dim)
+        self.metadata = {}
+        self.file_to_chunks = {}
+        self.next_chunk_id = 0
+
+    def add_chunks(
+        self,
+        file_path: str,
+        chunks: List[Dict[str, Any]],
+        embeddings: np.ndarray,
+    ) -> None:
+        """
+        Add chunks and their embeddings to the index.
+
+        Args:
+            file_path: Path to the file these chunks belong to
+            chunks: List of chunk dictionaries with 'content', 'start_pos', 'end_pos', 'chunk_index'
+            embeddings: NumPy array of embeddings (shape: (len(chunks), embedding_dim))
+        """
+        if not FAISS_AVAILABLE:
+            raise ImportError("FAISS is not available")
+
+        if len(chunks) != len(embeddings):
+            raise ValueError(
+                f"Number of chunks ({len(chunks)}) does not match "
+                f"number of embeddings ({len(embeddings)})"
+            )
+
+        with self._lock:
+            # Remove existing chunks for this file
+            if file_path in self.file_to_chunks:
+                self._remove_file(file_path)
+
+            # Ensure embeddings are float32 and 2D
+            if embeddings.dtype != np.float32:
+                embeddings = embeddings.astype(np.float32)
+            if len(embeddings.shape) == 1:
+                embeddings = embeddings.reshape(1, -1)
+
+            # Add embeddings to FAISS index
+            self.index.add(embeddings)
+
+            # Add metadata for each chunk
+            chunk_ids = []
+            for i, chunk in enumerate(chunks):
+                chunk_id = self.next_chunk_id
+                chunk_ids.append(chunk_id)
+
+                self.metadata[chunk_id] = {
+                    "file_path": file_path,
+                    "chunk_index": chunk.get("chunk_index", i),
+                    "content": chunk.get("content", ""),
+                    "start_pos": chunk.get("start_pos", 0),
+                    "end_pos": chunk.get("end_pos", 0),
+                }
+
+                self.next_chunk_id += 1
+
+            # Update file to chunks mapping
+            self.file_to_chunks[file_path] = chunk_ids
+
+    def search(
+        self,
+        query_embedding: np.ndarray,
+        k: int = 10,
+        file_filter: Optional[List[str]] = None,
+    ) -> List[Tuple[int, float, Dict[str, Any]]]:
+        """
+        Search for similar chunks.
+
+        Args:
+            query_embedding: Query embedding vector
+            k: Number of results to return
+            file_filter: Optional list of file paths to filter results
+
+        Returns:
+            List of tuples: (chunk_id, distance, metadata_dict)
+            Sorted by distance (lower is better)
+        """
+        if not FAISS_AVAILABLE:
+            raise ImportError("FAISS is not available")
+
+        if self.index is None or self.index.ntotal == 0:
+            return []
+
+        with self._lock:
+            # Ensure query embedding is float32 and 2D
+            if query_embedding.dtype != np.float32:
+                query_embedding = query_embedding.astype(np.float32)
+            if len(query_embedding.shape) == 1:
+                query_embedding = query_embedding.reshape(1, -1)
+
+            # Search in FAISS
+            distances, indices = self.index.search(
+                query_embedding, k * 2
+            )  # Get more, filter later
+
+            # Filter and format results
+            results = []
+            seen_files = {}  # Track chunks per file for file_filter
+
+            for idx, dist in zip(indices[0], distances[0]):
+                if idx < 0:  # Invalid index
+                    continue
+
+                chunk_id = idx
+                if chunk_id not in self.metadata:
+                    continue
+
+                metadata = self.metadata[chunk_id]
+                file_path = metadata["file_path"]
+
+                # Apply file filter if provided
+                if file_filter is not None and file_path not in file_filter:
+                    continue
+
+                # Limit chunks per file (if file_filter is used)
+                if file_filter is not None:
+                    if file_path not in seen_files:
+                        seen_files[file_path] = 0
+                    if seen_files[file_path] >= 5:  # Max chunks per file
+                        continue
+                    seen_files[file_path] += 1
+
+                results.append((chunk_id, float(dist), metadata))
+
+                if len(results) >= k:
+                    break
+
+            return results
+
+    def _remove_file(self, file_path: str) -> None:
+        """Remove all chunks for a file (internal method, not thread-safe)."""
+        if file_path not in self.file_to_chunks:
+            return
+
+        chunk_ids = self.file_to_chunks[file_path]
+
+        # Note: FAISS doesn't support removing individual vectors efficiently
+        # For now, we'll mark them as removed in metadata and rebuild on next save
+        # A better approach would be to rebuild the index, but that's expensive
+        # For production, consider using a more advanced index type that supports deletion
+
+        # Remove from metadata
+        for chunk_id in chunk_ids:
+            self.metadata.pop(chunk_id, None)
+
+        # Remove from file mapping
+        del self.file_to_chunks[file_path]
+
+    def remove_file(self, file_path: str) -> None:
+        """
+        Remove all chunks for a file from the index.
+
+        Note: This marks chunks as removed but doesn't actually remove them
+        from the FAISS index. The index will be rebuilt on next save.
+
+        Args:
+            file_path: Path to the file to remove
+        """
+        with self._lock:
+            self._remove_file(file_path)
+
+    def save(self) -> None:
+        """Save FAISS index and metadata to disk."""
+        if not FAISS_AVAILABLE:
+            return
+
+        with self._lock:
+            if self.index is None:
+                return
+
+            # Ensure directory exists
+            self.index_path.parent.mkdir(parents=True, exist_ok=True)
+
+            # Save FAISS index
+            faiss.write_index(self.index, str(self.index_path))
+
+            # Save metadata
+            with open(self.metadata_path, "wb") as f:
+                pickle.dump(
+                    {
+                        "metadata": self.metadata,
+                        "file_to_chunks": self.file_to_chunks,
+                        "next_chunk_id": self.next_chunk_id,
+                    },
+                    f,
+                )
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get statistics about the index."""
+        with self._lock:
+            return {
+                "total_chunks": len(self.metadata),
+                "total_files_with_chunks": len(self.file_to_chunks),
+                "index_size": self.index.ntotal if self.index else 0,
+                "embedding_dim": self.embedding_dim,
+            }
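
A rough usage sketch for the VectorIndex class above (illustrative, not shipped code): chunks are plain dicts, embeddings must be a float32 NumPy array whose width matches embedding_dim, and search() takes a query embedding rather than raw text. The import path, file path, and toy 8-dimensional vectors below are assumptions made for the example; the real pipeline would supply 384-dimensional sentence-transformer embeddings, and faiss-cpu must be installed.

    from pathlib import Path
    import numpy as np
    from wikigen.mcp.vector_index import VectorIndex  # module path assumed; not shown in this diff

    # Small throwaway index with toy dimensions.
    index = VectorIndex(index_path=Path("/tmp/demo_index.faiss"), embedding_dim=8)
    chunks = [
        {"chunk_index": 0, "content": "first chunk", "start_pos": 0, "end_pos": 11},
        {"chunk_index": 1, "content": "second chunk", "start_pos": 12, "end_pos": 24},
    ]
    embeddings = np.random.rand(len(chunks), 8).astype(np.float32)
    index.add_chunks("docs/example.md", chunks, embeddings)  # re-adding a path replaces its chunks

    query = np.random.rand(8).astype(np.float32)
    for chunk_id, distance, meta in index.search(query, k=2):
        print(chunk_id, distance, meta["content"])  # lower L2 distance means a closer match

    index.save()  # writes the .faiss index plus the .metadata.pkl sidecar
    print(index.get_stats())
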
@@ -0,0 +1,35 @@
+"""
+Metadata package for WikiGen.
+Centralized source of truth for project information.
+"""
+
+from .project import (
+    PROJECT_NAME,
+    AUTHOR_NAME,
+    ORGANIZATION,
+    DESCRIPTION,
+    REPOSITORY_URL,
+    HOMEPAGE_URL,
+    ISSUES_URL,
+    COPYRIGHT_TEXT,
+    MIN_PYTHON_VERSION,
+    CLI_ENTRY_POINT,
+)
+
+from .version import get_version, __version__
+
+# Re-export commonly used items
+__all__ = [
+    "PROJECT_NAME",
+    "AUTHOR_NAME",
+    "ORGANIZATION",
+    "DESCRIPTION",
+    "REPOSITORY_URL",
+    "HOMEPAGE_URL",
+    "ISSUES_URL",
+    "COPYRIGHT_TEXT",
+    "MIN_PYTHON_VERSION",
+    "CLI_ENTRY_POINT",
+    "get_version",
+    "__version__",
+]
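
For completeness, a small sketch of consuming this centralized metadata elsewhere in the codebase (illustrative; the wikigen.metadata import path is inferred from the relative imports above and may differ in the actual package layout):

    from wikigen.metadata import PROJECT_NAME, COPYRIGHT_TEXT, get_version  # path assumed

    print(f"{PROJECT_NAME} v{get_version()} - {COPYRIGHT_TEXT}")
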
@@ -0,0 +1,28 @@
+"""ASCII logo for WikiGen CLI."""
+
+from .project import DESCRIPTION
+from .version import get_version
+
+
+def print_logo():
+    """Print the WikiGen ASCII logo with simple gray colors."""
+    # Simple colors that work well on both light and dark backgrounds
+    LOGO_COLOR = "\033[38;5;240m"  # Medium gray - visible everywhere
+    ATTRIB_COLOR = "\033[38;5;245m"  # Light gray
+    RESET = "\033[0m"
+
+    version = get_version()
+    logo = f"""
+    {ATTRIB_COLOR}INTRODUCING
+    {RESET}{LOGO_COLOR}
+    ██╗ ██╗ ██╗ ██╗ ██╗ ██╗ ██╗ ██╗ ██████╗ ███████╗ ███╗ ██╗
+    ██║ ██║ ██║ ██║ ██║ ██║ ██╔╝ ██║ ██╔════╝ ██╔════╝ ████╗ ██║
+    ██║ ██║ ██║ █╗ ██║ ██║ █████╔╝ ██║ ██║ ███╗ ██████╗ ██╔██╗ ██║
+    ██║ ██║ ██║███╗██║ ██║ ██╔═██╗ ██║ ██║ ██║ ██╔══╝ ██║╚██╗██║
+    ██║ ██║ ╚███╔███╔╝ ██║ ██║ ██╗ ██║ ╚██████╔╝ ███████╗ ██║ ╚████║
+    ╚═╝ ╚═╝ ╚══╝╚══╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝ ╚══════╝ ╚═╝ ╚═══╝
+    {RESET}
+    {ATTRIB_COLOR}{DESCRIPTION} ♥ {RESET}
+    {ATTRIB_COLOR}v{version}{RESET}
+    """
+    print(logo)
@@ -0,0 +1,28 @@
+"""
+Project metadata for WikiGen.
+Single source of truth for project information.
+"""
+
+import datetime
+
+# Project info
+PROJECT_NAME = "wikigen"
+AUTHOR_NAME = "Mithun Ramesh"
+ORGANIZATION = "USEWIKIGEN.CO"
+DESCRIPTION = "WIKI'S FOR NERDS, BY NERDS"
+
+# Repository info
+REPOSITORY_URL = "https://github.com/usesalt/wikigen"
+HOMEPAGE_URL = "https://usesalt.co"
+ISSUES_URL = "https://github.com/usesalt/wikigen/issues"
+
+# Dynamic values
+CURRENT_YEAR = datetime.datetime.now().year
+COPYRIGHT_TEXT = f"Copyright (c) {CURRENT_YEAR} {AUTHOR_NAME}"
+
+# Python requirements
+MIN_PYTHON_VERSION = "3.12"
+
+# Package info
+PACKAGE_NAME = "wikigen"
+CLI_ENTRY_POINT = "wikigen"
@@ -0,0 +1,17 @@
+"""
+Version management for WikiGen.
+Centralized version definition for consistency.
+"""
+
+# Current version - update this when releasing
+__version__ = "1.0.0"
+
+
+def get_version():
+    """
+    Get the current version.
+
+    Returns:
+        str: The current version string (e.g., "1.0.0")
+    """
+    return __version__
@@ -0,0 +1 @@
+"""Nodes module for WikiGen."""