wikigen 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikigen/__init__.py +7 -0
- wikigen/cli.py +690 -0
- wikigen/config.py +526 -0
- wikigen/defaults.py +78 -0
- wikigen/flows/__init__.py +1 -0
- wikigen/flows/flow.py +38 -0
- wikigen/formatter/help_formatter.py +194 -0
- wikigen/formatter/init_formatter.py +56 -0
- wikigen/formatter/output_formatter.py +290 -0
- wikigen/mcp/__init__.py +12 -0
- wikigen/mcp/chunking.py +127 -0
- wikigen/mcp/embeddings.py +69 -0
- wikigen/mcp/output_resources.py +65 -0
- wikigen/mcp/search_index.py +826 -0
- wikigen/mcp/server.py +232 -0
- wikigen/mcp/vector_index.py +297 -0
- wikigen/metadata/__init__.py +35 -0
- wikigen/metadata/logo.py +28 -0
- wikigen/metadata/project.py +28 -0
- wikigen/metadata/version.py +17 -0
- wikigen/nodes/__init__.py +1 -0
- wikigen/nodes/nodes.py +1080 -0
- wikigen/utils/__init__.py +0 -0
- wikigen/utils/adjust_headings.py +72 -0
- wikigen/utils/call_llm.py +271 -0
- wikigen/utils/crawl_github_files.py +450 -0
- wikigen/utils/crawl_local_files.py +151 -0
- wikigen/utils/llm_providers.py +101 -0
- wikigen/utils/version_check.py +84 -0
- wikigen-1.0.0.dist-info/METADATA +352 -0
- wikigen-1.0.0.dist-info/RECORD +35 -0
- wikigen-1.0.0.dist-info/WHEEL +5 -0
- wikigen-1.0.0.dist-info/entry_points.txt +2 -0
- wikigen-1.0.0.dist-info/licenses/LICENSE +21 -0
- wikigen-1.0.0.dist-info/top_level.txt +1 -0
wikigen/mcp/server.py
ADDED
@@ -0,0 +1,232 @@
"""MCP server implementation for wikigen."""

from pathlib import Path
from typing import Dict, List, Optional

from mcp.server.fastmcp import FastMCP

from .output_resources import discover_all_projects
from ..config import get_output_dir
from .search_index import FileIndexer

# Initialize the MCP server
# Instructions help editors/clients understand what this server provides
app = FastMCP(
    "wikigen",
    instructions=(
        "Expose local wiki markdown files as MCP tools. "
        "Available tools: search_docs (semantic search across indexed directories), "
        "get_docs (fetch content by resource name or file path), and index_directories (index dirs for search). "
        "Doc names mirror paths under your configured output_dir (without .md extension)."
    ),
)

# Initialize search indexer for fast machine-wide search
_indexer: Optional[FileIndexer] = None


def _get_indexer() -> FileIndexer:
    """Get or create the search indexer instance."""
    global _indexer
    if _indexer is None:
        _indexer = FileIndexer()
    return _indexer


# Store discovered projects (refreshed on each request)
_projects: Dict[str, Path] = {}


def _refresh_projects():
    """Refresh the project registry."""
    global _projects
    _projects = discover_all_projects()


def _get_project_resources():
    """Get list of available project resources."""
    _refresh_projects()
    return _projects


# MCP Tools - executable actions for interacting with documentation
@app.tool()
def get_docs(identifier: str) -> str:
    """
    Get the full content of a documentation file by resource name or absolute file path.

    This tool can fetch documentation using either:
    - Resource name (e.g., 'README', 'Order Management/Felis Stream') - searches output_dir
    - Absolute file path (e.g., '/Users/name/Documents/doc.md') - works with any indexed file

    Args:
        identifier: Either a resource name (from output_dir) or an absolute file path

    Returns:
        The markdown content of the requested documentation file.
    """
    # Check if it looks like an absolute path
    path_obj = Path(identifier)
    if path_obj.is_absolute() and path_obj.exists():
        # Treat as absolute file path
        try:
            content = path_obj.read_text(encoding="utf-8")
            return content
        except Exception as e:
            raise RuntimeError(
                f"Failed to read file at path '{identifier}': {e}"
            ) from e

    # Treat as resource name from output_dir
    projects = _get_project_resources()

    if identifier not in projects:
        # Provide helpful error message
        available = ", ".join(sorted(projects.keys())[:10]) if projects else "none"
        if len(projects) > 10:
            available += f" ... (and {len(projects) - 10} more)"
        raise ValueError(
            f"Documentation '{identifier}' not found in output directory. "
            f"Available resources: {available}. "
            f"If you meant to use a file path, provide an absolute path starting with '/'."
        )

    doc_path = projects[identifier]
    try:
        content = doc_path.read_text(encoding="utf-8")
        return content
    except Exception as e:
        raise RuntimeError(
            f"Failed to read documentation for '{identifier}': {e}"
        ) from e


@app.tool()
def search_docs(
    query: str,
    limit: int = 20,
    directory_filter: Optional[str] = None,
    chunk_limit: int = 5,
) -> str:
    """
    Search for markdown files across indexed directories using semantic search.

    This tool uses semantic search to find relevant chunks from indexed documentation.
    It returns relevant chunks with content snippets instead of entire files.
    Index directories first using index_directories, or files are auto-indexed from
    the configured output_dir on first search.

    Args:
        query: Search query (supports multi-word queries and natural language)
        limit: Maximum number of chunks to return (default: 20)
        directory_filter: Optional directory path to filter results
        chunk_limit: Maximum chunks per file (default: 5)

    Returns:
        Formatted list of relevant chunks with their paths, resource names, scores, and content snippets.
    """
    indexer = _get_indexer()

    # Auto-index default output directory if no files indexed yet
    stats = indexer.get_stats()
    if stats["total_files"] == 0:
        output_dir = get_output_dir()
        if output_dir.exists():
            added, updated, skipped = indexer.index_directory(output_dir)
            if added > 0 or updated > 0:
                return f"Indexed {added} new files, updated {updated}. Try searching again."

    # Always use semantic search
    if not indexer.enable_semantic_search or not indexer.vector_index:
        # Fallback to keyword search if semantic search is not available
        results = indexer.search(query, limit=limit, directory_filter=directory_filter)
        if not results:
            return f"No files found matching '{query}'."

        # Format file results
        lines = [f"Found {len(results)} file(s) matching '{query}':\n"]
        for i, result in enumerate(results, 1):
            lines.append(
                f"{i}. {result['resource_name']}\n"
                f" Path: {result['file_path']}\n"
                f" Directory: {result['directory']}"
            )
        return "\n".join(lines)

    # Use semantic search
    results = indexer.search_semantic(
        query,
        limit=limit,
        directory_filter=directory_filter,
        max_chunks_per_file=chunk_limit,
    )

    if not results:
        return f"No chunks found matching '{query}'."

    # Format chunk results
    lines = [f"Found {len(results)} relevant chunk(s) matching '{query}':\n"]
    for i, result in enumerate(results, 1):
        # Truncate chunk content for display (first 200 chars)
        content_snippet = result.get("content", "")[:200]
        if len(result.get("content", "")) > 200:
            content_snippet += "..."

        lines.append(
            f"{i}. {result['resource_name']} (chunk {result.get('chunk_index', 0)})\n"
            f" Path: {result['file_path']}\n"
            f" Score: {result.get('score', 0):.4f}\n"
            f" Content: {content_snippet}"
        )

    return "\n".join(lines)


@app.tool()
def index_directories(directories: List[str], max_depth: Optional[int] = None) -> str:
    """
    Index markdown files from specified directories for fast searching.

    Args:
        directories: List of directory paths to index
        max_depth: Maximum recursion depth (None = unlimited)

    Returns:
        Summary of indexing results.
    """
    indexer = _get_indexer()

    total_added = 0
    total_updated = 0
    total_skipped = 0

    results = []

    for dir_path in directories:
        path = Path(dir_path).expanduser()
        if not path.exists():
            results.append(f"✗ {dir_path}: Directory does not exist")
            continue

        if not path.is_dir():
            results.append(f"✗ {dir_path}: Path is not a directory")
            continue

        added, updated, skipped = indexer.index_directory(path, max_depth=max_depth)
        total_added += added
        total_updated += updated
        total_skipped += skipped

        results.append(
            f"✓ {dir_path}: {added} added, {updated} updated, {skipped} skipped"
        )

    summary = "\n".join(results)
    summary += f"\n\nTotal: {total_added} added, {total_updated} updated, {total_skipped} skipped"

    return summary


def run_mcp_server():
    """Entry point to run MCP server."""
    app.run()
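For orientation, here is a minimal standalone sketch of the resolution order that get_docs applies to its identifier argument: an absolute path is read directly, anything else is treated as a resource name mirroring a path under the configured output directory (without the .md extension). The names resolve_doc and docs_root are illustrative only and are not part of the package.

from pathlib import Path

def resolve_doc(identifier: str, docs_root: Path) -> str:
    """Illustrative sketch only: mirrors get_docs' resolution order."""
    candidate = Path(identifier)
    if candidate.is_absolute() and candidate.exists():
        # Absolute paths win and are read directly.
        return candidate.read_text(encoding="utf-8")
    # Otherwise treat the identifier as a resource name: a path relative to
    # docs_root without the .md extension (e.g. "Order Management/Felis Stream").
    resource_path = docs_root / f"{identifier}.md"
    if not resource_path.exists():
        raise ValueError(f"Documentation '{identifier}' not found under {docs_root}")
    return resource_path.read_text(encoding="utf-8")

# Example call (hypothetical docs root): resolve_doc("README", Path.home() / "wikigen-output")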
wikigen/mcp/vector_index.py
ADDED
@@ -0,0 +1,297 @@
"""FAISS vector index management for semantic search.

This module provides FAISS index management for storing and searching
document chunk embeddings.
"""

import pickle
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from threading import Lock
import numpy as np

try:
    import faiss

    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False
    faiss = None

from ..config import CONFIG_DIR


class VectorIndex:
    """
    FAISS vector index manager for semantic search.

    Manages a FAISS index for storing and searching document chunk embeddings.
    Also maintains metadata mapping chunk IDs to file paths and chunk information.
    """

    def __init__(self, index_path: Optional[Path] = None, embedding_dim: int = 384):
        """
        Initialize the vector index.

        Args:
            index_path: Path to save/load the FAISS index. Defaults to config_dir/vector_index.faiss
            embedding_dim: Dimension of embeddings (default: 384 for all-MiniLM-L6-v2)
        """
        if not FAISS_AVAILABLE:
            raise ImportError(
                "FAISS is not available. Please install faiss-cpu: pip install faiss-cpu"
            )

        if index_path is None:
            index_path = CONFIG_DIR / "vector_index.faiss"

        self.index_path = index_path
        self.metadata_path = index_path.with_suffix(".metadata.pkl")
        self.embedding_dim = embedding_dim
        self._lock = Lock()

        # FAISS index (FlatIndex for exact search)
        self.index: Optional[faiss.Index] = None

        # Metadata: chunk_id -> {file_path, chunk_index, content, start_pos, end_pos}
        self.metadata: Dict[int, Dict[str, Any]] = {}

        # File to chunk IDs mapping: file_path -> [chunk_id, ...]
        self.file_to_chunks: Dict[str, List[int]] = {}

        # Next chunk ID
        self.next_chunk_id = 0

        # Load existing index if available
        self._load()

    def _load(self) -> None:
        """Load FAISS index and metadata from disk."""
        with self._lock:
            if self.index_path.exists() and self.metadata_path.exists():
                try:
                    # Load FAISS index
                    self.index = faiss.read_index(str(self.index_path))

                    # Load metadata
                    with open(self.metadata_path, "rb") as f:
                        data = pickle.load(f)
                        self.metadata = data.get("metadata", {})
                        self.file_to_chunks = data.get("file_to_chunks", {})
                        self.next_chunk_id = data.get("next_chunk_id", 0)

                    # Verify embedding dimension matches
                    if self.index.d != self.embedding_dim:
                        raise ValueError(
                            f"Index embedding dimension ({self.index.d}) "
                            f"does not match expected ({self.embedding_dim})"
                        )
                except Exception as e:
                    # If loading fails, start fresh
                    print(f"Warning: Could not load vector index: {e}")
                    self._init_index()
            else:
                self._init_index()

    def _init_index(self) -> None:
        """Initialize a new FAISS index."""
        # Use FlatIndex for exact search (good for <1M vectors)
        # For larger datasets, consider IVF or HNSW
        self.index = faiss.IndexFlatL2(self.embedding_dim)
        self.metadata = {}
        self.file_to_chunks = {}
        self.next_chunk_id = 0

    def add_chunks(
        self,
        file_path: str,
        chunks: List[Dict[str, Any]],
        embeddings: np.ndarray,
    ) -> None:
        """
        Add chunks and their embeddings to the index.

        Args:
            file_path: Path to the file these chunks belong to
            chunks: List of chunk dictionaries with 'content', 'start_pos', 'end_pos', 'chunk_index'
            embeddings: NumPy array of embeddings (shape: (len(chunks), embedding_dim))
        """
        if not FAISS_AVAILABLE:
            raise ImportError("FAISS is not available")

        if len(chunks) != len(embeddings):
            raise ValueError(
                f"Number of chunks ({len(chunks)}) does not match "
                f"number of embeddings ({len(embeddings)})"
            )

        with self._lock:
            # Remove existing chunks for this file
            if file_path in self.file_to_chunks:
                self._remove_file(file_path)

            # Ensure embeddings are float32 and 2D
            if embeddings.dtype != np.float32:
                embeddings = embeddings.astype(np.float32)
            if len(embeddings.shape) == 1:
                embeddings = embeddings.reshape(1, -1)

            # Add embeddings to FAISS index
            self.index.add(embeddings)

            # Add metadata for each chunk
            chunk_ids = []
            for i, chunk in enumerate(chunks):
                chunk_id = self.next_chunk_id
                chunk_ids.append(chunk_id)

                self.metadata[chunk_id] = {
                    "file_path": file_path,
                    "chunk_index": chunk.get("chunk_index", i),
                    "content": chunk.get("content", ""),
                    "start_pos": chunk.get("start_pos", 0),
                    "end_pos": chunk.get("end_pos", 0),
                }

                self.next_chunk_id += 1

            # Update file to chunks mapping
            self.file_to_chunks[file_path] = chunk_ids

    def search(
        self,
        query_embedding: np.ndarray,
        k: int = 10,
        file_filter: Optional[List[str]] = None,
    ) -> List[Tuple[int, float, Dict[str, Any]]]:
        """
        Search for similar chunks.

        Args:
            query_embedding: Query embedding vector
            k: Number of results to return
            file_filter: Optional list of file paths to filter results

        Returns:
            List of tuples: (chunk_id, distance, metadata_dict)
            Sorted by distance (lower is better)
        """
        if not FAISS_AVAILABLE:
            raise ImportError("FAISS is not available")

        if self.index is None or self.index.ntotal == 0:
            return []

        with self._lock:
            # Ensure query embedding is float32 and 2D
            if query_embedding.dtype != np.float32:
                query_embedding = query_embedding.astype(np.float32)
            if len(query_embedding.shape) == 1:
                query_embedding = query_embedding.reshape(1, -1)

            # Search in FAISS
            distances, indices = self.index.search(
                query_embedding, k * 2
            )  # Get more, filter later

            # Filter and format results
            results = []
            seen_files = {}  # Track chunks per file for file_filter

            for idx, dist in zip(indices[0], distances[0]):
                if idx < 0:  # Invalid index
                    continue

                chunk_id = idx
                if chunk_id not in self.metadata:
                    continue

                metadata = self.metadata[chunk_id]
                file_path = metadata["file_path"]

                # Apply file filter if provided
                if file_filter is not None and file_path not in file_filter:
                    continue

                # Limit chunks per file (if file_filter is used)
                if file_filter is not None:
                    if file_path not in seen_files:
                        seen_files[file_path] = 0
                    if seen_files[file_path] >= 5:  # Max chunks per file
                        continue
                    seen_files[file_path] += 1

                results.append((chunk_id, float(dist), metadata))

                if len(results) >= k:
                    break

            return results

    def _remove_file(self, file_path: str) -> None:
        """Remove all chunks for a file (internal method, not thread-safe)."""
        if file_path not in self.file_to_chunks:
            return

        chunk_ids = self.file_to_chunks[file_path]

        # Note: FAISS doesn't support removing individual vectors efficiently
        # For now, we'll mark them as removed in metadata and rebuild on next save
        # A better approach would be to rebuild the index, but that's expensive
        # For production, consider using a more advanced index type that supports deletion

        # Remove from metadata
        for chunk_id in chunk_ids:
            self.metadata.pop(chunk_id, None)

        # Remove from file mapping
        del self.file_to_chunks[file_path]

    def remove_file(self, file_path: str) -> None:
        """
        Remove all chunks for a file from the index.

        Note: This marks chunks as removed but doesn't actually remove them
        from the FAISS index. The index will be rebuilt on next save.

        Args:
            file_path: Path to the file to remove
        """
        with self._lock:
            self._remove_file(file_path)

    def save(self) -> None:
        """Save FAISS index and metadata to disk."""
        if not FAISS_AVAILABLE:
            return

        with self._lock:
            if self.index is None:
                return

            # Ensure directory exists
            self.index_path.parent.mkdir(parents=True, exist_ok=True)

            # Save FAISS index
            faiss.write_index(self.index, str(self.index_path))

            # Save metadata
            with open(self.metadata_path, "wb") as f:
                pickle.dump(
                    {
                        "metadata": self.metadata,
                        "file_to_chunks": self.file_to_chunks,
                        "next_chunk_id": self.next_chunk_id,
                    },
                    f,
                )

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the index."""
        with self._lock:
            return {
                "total_chunks": len(self.metadata),
                "total_files_with_chunks": len(self.file_to_chunks),
                "index_size": self.index.ntotal if self.index else 0,
                "embedding_dim": self.embedding_dim,
            }
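A hedged usage sketch of VectorIndex follows, assuming faiss-cpu and numpy are installed. The chunk dictionaries, file path, and random vectors are placeholders for real chunker and embedding output (normally 384-dim vectors from all-MiniLM-L6-v2); the index path is chosen here just to keep the sketch self-contained.

import numpy as np
from pathlib import Path
from wikigen.mcp.vector_index import VectorIndex

# Illustrative only: index two placeholder chunks for one file, then query.
index = VectorIndex(index_path=Path("/tmp/wikigen_vector_index.faiss"), embedding_dim=384)

chunks = [
    {"chunk_index": 0, "content": "Configuring the LLM provider", "start_pos": 0, "end_pos": 29},
    {"chunk_index": 1, "content": "Running the MCP server", "start_pos": 30, "end_pos": 52},
]
embeddings = np.random.rand(len(chunks), 384).astype(np.float32)  # stand-in for real embeddings
index.add_chunks("/tmp/example.md", chunks, embeddings)

query = np.random.rand(384).astype(np.float32)  # stand-in for an embedded query
for chunk_id, distance, meta in index.search(query, k=2):
    print(chunk_id, round(distance, 3), meta["content"])

index.save()  # writes the .faiss index plus its .metadata.pkl side file

Because the class uses IndexFlatL2, the search is an exact L2 scan, so results here vary only with the random placeholder vectors.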
wikigen/metadata/__init__.py
ADDED
@@ -0,0 +1,35 @@
"""
Metadata package for WikiGen.
Centralized source of truth for project information.
"""

from .project import (
    PROJECT_NAME,
    AUTHOR_NAME,
    ORGANIZATION,
    DESCRIPTION,
    REPOSITORY_URL,
    HOMEPAGE_URL,
    ISSUES_URL,
    COPYRIGHT_TEXT,
    MIN_PYTHON_VERSION,
    CLI_ENTRY_POINT,
)

from .version import get_version, __version__

# Re-export commonly used items
__all__ = [
    "PROJECT_NAME",
    "AUTHOR_NAME",
    "ORGANIZATION",
    "DESCRIPTION",
    "REPOSITORY_URL",
    "HOMEPAGE_URL",
    "ISSUES_URL",
    "COPYRIGHT_TEXT",
    "MIN_PYTHON_VERSION",
    "CLI_ENTRY_POINT",
    "get_version",
    "__version__",
]
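Since the metadata package re-exports the project constants and version helpers, callers can import them from wikigen.metadata directly rather than from the individual submodules. A minimal sketch:

from wikigen.metadata import PROJECT_NAME, COPYRIGHT_TEXT, get_version

print(f"{PROJECT_NAME} v{get_version()}")  # "wikigen v1.0.0"
print(COPYRIGHT_TEXT)                      # "Copyright (c) <current year> Mithun Ramesh"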
wikigen/metadata/logo.py
ADDED
@@ -0,0 +1,28 @@
"""ASCII logo for WikiGen CLI."""

from .project import DESCRIPTION
from .version import get_version


def print_logo():
    """Print the WikiGen ASCII logo with simple gray colors."""
    # Simple colors that work well on both light and dark backgrounds
    LOGO_COLOR = "\033[38;5;240m"  # Medium gray - visible everywhere
    ATTRIB_COLOR = "\033[38;5;245m"  # Light gray
    RESET = "\033[0m"

    version = get_version()
    logo = f"""
{ATTRIB_COLOR}INTRODUCING
{RESET}{LOGO_COLOR}
██╗ ██╗ ██╗ ██╗ ██╗ ██╗ ██╗ ██╗ ██████╗ ███████╗ ███╗ ██╗
██║ ██║ ██║ ██║ ██║ ██║ ██╔╝ ██║ ██╔════╝ ██╔════╝ ████╗ ██║
██║ ██║ ██║ █╗ ██║ ██║ █████╔╝ ██║ ██║ ███╗ ██████╗ ██╔██╗ ██║
██║ ██║ ██║███╗██║ ██║ ██╔═██╗ ██║ ██║ ██║ ██╔══╝ ██║╚██╗██║
██║ ██║ ╚███╔███╔╝ ██║ ██║ ██╗ ██║ ╚██████╔╝ ███████╗ ██║ ╚████║
╚═╝ ╚═╝ ╚══╝╚══╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝ ╚══════╝ ╚═╝ ╚═══╝
{RESET}
{ATTRIB_COLOR}{DESCRIPTION} ♥ {RESET}
{ATTRIB_COLOR}v{version}{RESET}
"""
    print(logo)
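The logo's colors are 8-bit ANSI escapes: a sequence of the form "\033[38;5;N m" (without the space) selects foreground color N from the terminal's 256-color palette, and "\033[0m" resets. A minimal sketch of the same pattern, independent of the logo text:

GRAY = "\033[38;5;240m"   # medium gray, as used for the logo glyphs
LIGHT = "\033[38;5;245m"  # light gray, as used for the attribution line
RESET = "\033[0m"

print(f"{GRAY}wikigen{RESET} {LIGHT}v1.0.0{RESET}")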
wikigen/metadata/project.py
ADDED
@@ -0,0 +1,28 @@
"""
Project metadata for WikiGen.
Single source of truth for project information.
"""

import datetime

# Project info
PROJECT_NAME = "wikigen"
AUTHOR_NAME = "Mithun Ramesh"
ORGANIZATION = "USEWIKIGEN.CO"
DESCRIPTION = "WIKI'S FOR NERDS, BY NERDS"

# Repository info
REPOSITORY_URL = "https://github.com/usesalt/wikigen"
HOMEPAGE_URL = "https://usesalt.co"
ISSUES_URL = "https://github.com/usesalt/wikigen/issues"

# Dynamic values
CURRENT_YEAR = datetime.datetime.now().year
COPYRIGHT_TEXT = f"Copyright (c) {CURRENT_YEAR} {AUTHOR_NAME}"

# Python requirements
MIN_PYTHON_VERSION = "3.12"

# Package info
PACKAGE_NAME = "wikigen"
CLI_ENTRY_POINT = "wikigen"
wikigen/metadata/version.py
ADDED
@@ -0,0 +1,17 @@
"""
Version management for WikiGen.
Centralized version definition for consistency.
"""

# Current version - update this when releasing
__version__ = "1.0.0"


def get_version():
    """
    Get the current version.

    Returns:
        str: The current version string (e.g., "1.0.0")
    """
    return __version__
wikigen/nodes/__init__.py
ADDED
@@ -0,0 +1 @@
"""Nodes module for WikiGen."""