fusesearch 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusesearch-0.1.0/LICENSE +21 -0
- fusesearch-0.1.0/PKG-INFO +74 -0
- fusesearch-0.1.0/README.md +45 -0
- fusesearch-0.1.0/fusesearch/__init__.py +0 -0
- fusesearch-0.1.0/fusesearch/__main__.py +120 -0
- fusesearch-0.1.0/fusesearch/api/__init__.py +0 -0
- fusesearch-0.1.0/fusesearch/api/server.py +54 -0
- fusesearch-0.1.0/fusesearch/core/__init__.py +0 -0
- fusesearch-0.1.0/fusesearch/core/chunker.py +89 -0
- fusesearch-0.1.0/fusesearch/core/embedder.py +37 -0
- fusesearch-0.1.0/fusesearch/indexer.py +57 -0
- fusesearch-0.1.0/fusesearch/mcp_server.py +84 -0
- fusesearch-0.1.0/fusesearch/models.py +34 -0
- fusesearch-0.1.0/fusesearch/sources/__init__.py +0 -0
- fusesearch-0.1.0/fusesearch/sources/base.py +25 -0
- fusesearch-0.1.0/fusesearch/sources/local_files.py +59 -0
- fusesearch-0.1.0/fusesearch/store/__init__.py +0 -0
- fusesearch-0.1.0/fusesearch/store/qdrant.py +187 -0
- fusesearch-0.1.0/fusesearch.egg-info/PKG-INFO +74 -0
- fusesearch-0.1.0/fusesearch.egg-info/SOURCES.txt +23 -0
- fusesearch-0.1.0/fusesearch.egg-info/dependency_links.txt +1 -0
- fusesearch-0.1.0/fusesearch.egg-info/requires.txt +18 -0
- fusesearch-0.1.0/fusesearch.egg-info/top_level.txt +1 -0
- fusesearch-0.1.0/pyproject.toml +38 -0
- fusesearch-0.1.0/setup.cfg +4 -0
fusesearch-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anton Lebedev
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fusesearch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multi-source search aggregation tool with AI-powered retrieval and response synthesis
|
|
5
|
+
Author-email: Anton Lebedev <pypi@katzo.net>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: pydantic>=2.12
|
|
15
|
+
Requires-Dist: qdrant-client>=1.16
|
|
16
|
+
Requires-Dist: fastapi>=0.129
|
|
17
|
+
Requires-Dist: uvicorn>=0.40
|
|
18
|
+
Requires-Dist: tqdm>=4.67
|
|
19
|
+
Provides-Extra: mcp
|
|
20
|
+
Requires-Dist: mcp[cli]>=1.26; extra == "mcp"
|
|
21
|
+
Provides-Extra: local
|
|
22
|
+
Requires-Dist: sentence-transformers>=5.2; extra == "local"
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Requires-Dist: fusesearch[local,mcp]; extra == "all"
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=9.0; extra == "dev"
|
|
27
|
+
Requires-Dist: ruff>=0.15; extra == "dev"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# FuseSearch
|
|
31
|
+
|
|
32
|
+
Multi-source search aggregation tool that unifies retrieval across diverse data sources — Confluence, MCP servers, local files, and more — using AI-powered search and response synthesis through a single query interface.
|
|
33
|
+
|
|
34
|
+
## Quick Start
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
make build
|
|
38
|
+
make start
|
|
39
|
+
make index # index docs from data/docs
|
|
40
|
+
make search "your query"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## MCP Server
|
|
44
|
+
|
|
45
|
+
The `fusesearch-mcp` Docker service exposes a streamable HTTP endpoint on port 8001. Tools: `search` (hybrid search), `count` (indexed chunks).
|
|
46
|
+
|
|
47
|
+
### Claude Code
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
claude mcp add fusesearch http://localhost:8001/mcp --transport http
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Claude Desktop
|
|
54
|
+
|
|
55
|
+
**Option 1: Connectors UI (recommended)**
|
|
56
|
+
|
|
57
|
+
In Claude Desktop, go to **Settings > Connectors > Add custom connector** and enter `https://localhost:8001/mcp`.
|
|
58
|
+
|
|
59
|
+
**Option 2: Config file with `mcp-remote` bridge (local dev)**
|
|
60
|
+
|
|
61
|
+
Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
|
|
62
|
+
|
|
63
|
+
```json
|
|
64
|
+
{
|
|
65
|
+
"mcpServers": {
|
|
66
|
+
"fusesearch": {
|
|
67
|
+
"command": "npx",
|
|
68
|
+
"args": ["-y", "mcp-remote", "http://localhost:8001/mcp", "--allow-http"]
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Requires Node.js >= 18. `--allow-http` is required for plain HTTP (not needed for HTTPS).
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# FuseSearch
|
|
2
|
+
|
|
3
|
+
Multi-source search aggregation tool that unifies retrieval across diverse data sources — Confluence, MCP servers, local files, and more — using AI-powered search and response synthesis through a single query interface.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
make build
|
|
9
|
+
make start
|
|
10
|
+
make index # index docs from data/docs
|
|
11
|
+
make search "your query"
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## MCP Server
|
|
15
|
+
|
|
16
|
+
The `fusesearch-mcp` Docker service exposes a streamable HTTP endpoint on port 8001. Tools: `search` (hybrid search), `count` (indexed chunks).
|
|
17
|
+
|
|
18
|
+
### Claude Code
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
claude mcp add fusesearch http://localhost:8001/mcp --transport http
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Claude Desktop
|
|
25
|
+
|
|
26
|
+
**Option 1: Connectors UI (recommended)**
|
|
27
|
+
|
|
28
|
+
In Claude Desktop, go to **Settings > Connectors > Add custom connector** and enter `https://localhost:8001/mcp`.
|
|
29
|
+
|
|
30
|
+
**Option 2: Config file with `mcp-remote` bridge (local dev)**
|
|
31
|
+
|
|
32
|
+
Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
|
|
33
|
+
|
|
34
|
+
```json
|
|
35
|
+
{
|
|
36
|
+
"mcpServers": {
|
|
37
|
+
"fusesearch": {
|
|
38
|
+
"command": "npx",
|
|
39
|
+
"args": ["-y", "mcp-remote", "http://localhost:8001/mcp", "--allow-http"]
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Requires Node.js >= 18. `--allow-http` is required for plain HTTP (not needed for HTTPS).
|
|
File without changes
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _make_embedder():
    """Construct the default local embedding backend.

    The import is deferred so the CLI starts fast and works even when the
    optional `local` extra (sentence-transformers) is not installed, as
    long as this code path is never hit.
    """
    from fusesearch.core.embedder import LocalEmbedder

    return LocalEmbedder()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _make_store(embedder):
    """Connect to Qdrant using env-configured host/port.

    The vector dimension is taken from the embedder so the collection
    always matches the model producing the vectors.
    """
    from fusesearch.store.qdrant import QdrantStore

    qdrant_host = os.getenv("QDRANT_HOST", "localhost")
    qdrant_port = int(os.getenv("QDRANT_PORT", "6333"))
    return QdrantStore(host=qdrant_host, port=qdrant_port, dimension=embedder.dimension)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def cmd_serve(args):
    """Launch the FastAPI app under uvicorn.

    CLI flags win over env vars (FUSESEARCH_HOST / FUSESEARCH_PORT),
    which in turn fall back to 0.0.0.0:8000.
    """
    import uvicorn

    bind_host = args.host or os.getenv("FUSESEARCH_HOST", "0.0.0.0")
    bind_port = int(args.port or os.getenv("FUSESEARCH_PORT", "8000"))
    uvicorn.run("fusesearch.api.server:app", host=bind_host, port=bind_port)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def cmd_index(args):
    """Index every supported document under the given directories."""
    from fusesearch.indexer import Indexer
    from fusesearch.sources.local_files import LocalFilesAdapter

    embedder = _make_embedder()
    store = _make_store(embedder)

    docs = list(LocalFilesAdapter(directories=args.paths).fetch())
    print(f"Found {len(docs)} documents")

    stats = Indexer(store=store, embedder=embedder).index_documents(docs)
    print(
        f"Indexed: {stats['new']} new, {stats['skipped']} skipped, {stats['deleted']} deleted"
    )
    print(f"Total chunks in store: {store.count()}")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def cmd_search(args):
    """Run one query against the store and pretty-print the top hits."""
    embedder = _make_embedder()
    store = _make_store(embedder)
    use_hybrid = not args.no_hybrid

    vec = embedder.embed_one(args.query)
    if use_hybrid:
        hits = store.hybrid_search(vec, args.query, limit=args.limit)
    else:
        hits = store.search(vec, limit=args.limit)

    print(f"Search mode: {'hybrid' if use_hybrid else 'vector-only'}")
    for rank, hit in enumerate(hits, 1):
        print(f"\n--- Result {rank} (score: {hit['score']:.4f}) ---")
        print(f"Title: {hit['title']}")
        if hit["heading_path"]:
            print(f"Section: {' > '.join(hit['heading_path'])}")
        # Truncate content so a wide chunk doesn't flood the terminal.
        print(f"Content: {hit['content'][:300]}...")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def main():
    """CLI entry point: build the argument parser and dispatch subcommands."""
    parser = argparse.ArgumentParser(
        prog="fusesearch", description="FuseSearch - multi-source search"
    )
    subparsers = parser.add_subparsers(dest="command")

    # serve: run the HTTP API (also the default when no subcommand is given)
    serve_parser = subparsers.add_parser("serve", help="Start the API server")
    serve_parser.add_argument("--host", default=None, help="Host to bind to")
    serve_parser.add_argument("--port", default=None, help="Port to bind to")

    # index: ingest documents from local directories
    index_parser = subparsers.add_parser(
        "index", help="Index documents from local files"
    )
    index_parser.add_argument("paths", nargs="+", help="Directories to index")

    # search: query the store from the terminal
    search_parser = subparsers.add_parser("search", help="Search indexed documents")
    search_parser.add_argument("query", help="Search query")
    search_parser.add_argument("--limit", type=int, default=5, help="Number of results")
    search_parser.add_argument(
        "--no-hybrid", action="store_true", help="Disable hybrid search (vector-only)"
    )

    # mcp: expose the store over the Model Context Protocol
    mcp_parser = subparsers.add_parser("mcp", help="Start the MCP server")
    mcp_parser.add_argument(
        "--transport",
        choices=["stdio", "sse", "streamable-http"],
        default="stdio",
        help="Transport type",
    )

    args = parser.parse_args()

    if args.command == "index":
        cmd_index(args)
    elif args.command == "search":
        cmd_search(args)
    elif args.command == "mcp":
        # Imported on demand: the mcp extra is optional.
        from fusesearch.mcp_server import main as mcp_main

        mcp_main(transport=args.transport)
    elif args.command == "serve":
        cmd_serve(args)
    else:
        # No subcommand given: behave like `serve` with env-driven defaults.
        cmd_serve(argparse.Namespace(host=None, port=None))


if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import os

from fastapi import FastAPI
from pydantic import BaseModel

from fusesearch.core.embedder import LocalEmbedder
from fusesearch.indexer import Indexer
from fusesearch.sources.local_files import LocalFilesAdapter
from fusesearch.store.qdrant import QdrantStore

app = FastAPI(title="FuseSearch", version="0.1.0")

# Module-level singletons: constructed at import time, shared by all requests.
# NOTE(review): importing this module loads the embedding model and connects
# to Qdrant as a side effect — confirm that is acceptable for all entry points.
embedder = LocalEmbedder()

qdrant_host = os.getenv("QDRANT_HOST", "localhost")
qdrant_port = int(os.getenv("QDRANT_PORT", "6333"))
store = QdrantStore(host=qdrant_host, port=qdrant_port, dimension=embedder.dimension)
indexer = Indexer(store=store, embedder=embedder)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SearchRequest(BaseModel):
    """Request body for POST /search."""

    # Free-text query; embedded for the vector leg and matched for keywords.
    query: str
    limit: int = 5
    # When True, fuse vector and keyword results; otherwise vector-only.
    hybrid: bool = True
    # Relative weight of the vector leg in hybrid fusion (keyword gets 1 - w).
    vector_weight: float = 0.7
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class IndexRequest(BaseModel):
    """Request body for POST /index."""

    # Directories to scan recursively for supported files.
    paths: list[str]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@app.get("/health")
|
|
33
|
+
def health():
|
|
34
|
+
return {"status": "ok", "chunks_indexed": store.count()}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@app.post("/search")
|
|
38
|
+
def search(req: SearchRequest):
|
|
39
|
+
query_vector = embedder.embed_one(req.query)
|
|
40
|
+
if req.hybrid:
|
|
41
|
+
results = store.hybrid_search(
|
|
42
|
+
query_vector, req.query, limit=req.limit, vector_weight=req.vector_weight
|
|
43
|
+
)
|
|
44
|
+
else:
|
|
45
|
+
results = store.search(query_vector, limit=req.limit)
|
|
46
|
+
return {"results": results}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@app.post("/index")
|
|
50
|
+
def index(req: IndexRequest):
|
|
51
|
+
adapter = LocalFilesAdapter(directories=req.paths)
|
|
52
|
+
documents = list(adapter.fetch())
|
|
53
|
+
stats = indexer.index_documents(documents)
|
|
54
|
+
return {"documents_found": len(documents), **stats}
|
|
File without changes
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from fusesearch.models import Chunk, Document
|
|
4
|
+
|
|
5
|
+
HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def chunk_document(document: Document, max_chunk_size: int = 1000) -> list[Chunk]:
    """Split a document into chunks along markdown headings.

    Heading boundaries are the primary split points (semantic units); any
    section still larger than `max_chunk_size` is broken up further on
    paragraph breaks. Empty pieces are dropped.
    """
    out: list[Chunk] = []
    for heading_path, section_text in _split_by_headings(document.content):
        for piece in _split_by_size(section_text, max_chunk_size):
            body = piece.strip()
            if not body:
                continue
            out.append(
                Chunk(
                    document_source_id=document.source_id,
                    source_type=document.source_type,
                    title=document.title,
                    content=body,
                    url=document.url,
                    metadata=document.metadata,
                    heading_path=heading_path,
                    # Index counts only the chunks actually kept.
                    chunk_index=len(out),
                )
            )
    return out
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _split_by_headings(text: str) -> list[tuple[list[str], str]]:
|
|
36
|
+
"""Split markdown text by headings, tracking the heading hierarchy."""
|
|
37
|
+
matches = list(HEADING_PATTERN.finditer(text))
|
|
38
|
+
|
|
39
|
+
if not matches:
|
|
40
|
+
return [([], text)]
|
|
41
|
+
|
|
42
|
+
sections: list[tuple[list[str], str]] = []
|
|
43
|
+
heading_stack: list[tuple[int, str]] = []
|
|
44
|
+
|
|
45
|
+
# Content before the first heading
|
|
46
|
+
preamble = text[: matches[0].start()].strip()
|
|
47
|
+
if preamble:
|
|
48
|
+
sections.append(([], preamble))
|
|
49
|
+
|
|
50
|
+
for i, match in enumerate(matches):
|
|
51
|
+
level = len(match.group(1))
|
|
52
|
+
title = match.group(2).strip()
|
|
53
|
+
|
|
54
|
+
# Update heading stack — pop headings at same or deeper level
|
|
55
|
+
heading_stack = [(lvl, t) for lvl, t in heading_stack if lvl < level]
|
|
56
|
+
heading_stack.append((level, title))
|
|
57
|
+
|
|
58
|
+
# Extract content between this heading and the next
|
|
59
|
+
start = match.end()
|
|
60
|
+
end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
|
|
61
|
+
content = text[start:end].strip()
|
|
62
|
+
|
|
63
|
+
if content:
|
|
64
|
+
path = [t for _, t in heading_stack]
|
|
65
|
+
sections.append((path, content))
|
|
66
|
+
|
|
67
|
+
return sections
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _split_by_size(text: str, max_size: int) -> list[str]:
|
|
71
|
+
"""Split text into pieces that fit within max_size, splitting on paragraphs."""
|
|
72
|
+
if len(text) <= max_size:
|
|
73
|
+
return [text]
|
|
74
|
+
|
|
75
|
+
paragraphs = text.split("\n\n")
|
|
76
|
+
pieces: list[str] = []
|
|
77
|
+
current = ""
|
|
78
|
+
|
|
79
|
+
for paragraph in paragraphs:
|
|
80
|
+
if current and len(current) + len(paragraph) + 2 > max_size:
|
|
81
|
+
pieces.append(current)
|
|
82
|
+
current = paragraph
|
|
83
|
+
else:
|
|
84
|
+
current = f"{current}\n\n{paragraph}" if current else paragraph
|
|
85
|
+
|
|
86
|
+
if current:
|
|
87
|
+
pieces.append(current)
|
|
88
|
+
|
|
89
|
+
return pieces
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Embedder(ABC):
    """Interface for embedding backends.

    Subclasses supply a fixed vector dimension and a batch embed; the
    single-text convenience wrapper is provided here.
    """

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Length of each embedding vector."""

    @abstractmethod
    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts, returning one vector per input."""

    def embed_one(self, text: str) -> list[float]:
        """Embed a single text and return its vector."""
        vectors = self.embed([text])
        return vectors[0]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class LocalEmbedder(Embedder):
    """Embedder backed by an in-process sentence-transformers model."""

    def __init__(self, model: str | None = None, local_files_only: bool = False):
        # Imported lazily so the package works without the optional
        # sentence-transformers dependency unless this class is used.
        from sentence_transformers import SentenceTransformer

        name = model or os.getenv("FUSESEARCH_EMBED_MODEL", "all-MiniLM-L6-v2")
        self.model = SentenceTransformer(name, local_files_only=local_files_only)
        self._dimension = self.model.get_sentence_embedding_dimension()

    @property
    def dimension(self) -> int:
        """Vector length reported by the loaded model."""
        return self._dimension

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts; returns plain Python lists."""
        vectors = self.model.encode(texts)
        return vectors.tolist()
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from tqdm import tqdm
|
|
2
|
+
|
|
3
|
+
from fusesearch.core.chunker import chunk_document
|
|
4
|
+
from fusesearch.core.embedder import Embedder
|
|
5
|
+
from fusesearch.models import Chunk, Document
|
|
6
|
+
from fusesearch.store.qdrant import QdrantStore, hash_to_uuid
|
|
7
|
+
|
|
8
|
+
EMBED_BATCH_SIZE = 64
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Indexer:
    """Drives the chunk → diff → embed → store pipeline."""

    def __init__(self, store: QdrantStore, embedder: Embedder):
        self.store = store
        self.embedder = embedder

    def index_documents(self, documents: list[Document]) -> dict:
        """Chunk the documents, sync the store to match, and report stats.

        NOTE(review): this is a *full* sync — any chunk already in the store
        whose hash is absent from `documents` gets deleted. Callers should
        pass the complete corpus, not an incremental subset.

        Returns a dict with total_chunks, new, skipped, deleted counts.
        """
        chunks: list[Chunk] = []
        for document in tqdm(documents, desc="Chunking", unit="doc"):
            chunks.extend(chunk_document(document))

        # Diff incoming chunk ids against what the store already holds.
        incoming_ids = {hash_to_uuid(c.content_hash) for c in chunks}
        stored_ids = self.store.get_existing_hashes()

        fresh = [c for c in chunks if hash_to_uuid(c.content_hash) not in stored_ids]
        stale = stored_ids - incoming_ids

        if stale:
            self.store.delete_by_hashes(stale)

        # Embed and upsert only the chunks the store has not seen.
        if fresh:
            with tqdm(total=len(fresh), desc="Embedding", unit="chunk") as progress:
                for start in range(0, len(fresh), EMBED_BATCH_SIZE):
                    batch = fresh[start : start + EMBED_BATCH_SIZE]
                    vectors = self.embedder.embed([c.content for c in batch])
                    self.store.upsert(batch, vectors)
                    progress.update(len(batch))

        return {
            "total_chunks": len(chunks),
            "new": len(fresh),
            "skipped": len(chunks) - len(fresh),
            "deleted": len(stale),
        }
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
from mcp.server.fastmcp import FastMCP
|
|
5
|
+
|
|
6
|
+
# Server instance; the instructions string is shown to MCP clients to guide
# when they should reach for this server's tools.
mcp = FastMCP(
    "FuseSearch",
    instructions=(
        "FuseSearch is a knowledge base with indexed documents, blog posts, and notes. "
        "Use this server when the user asks factual questions, wants to look up a topic, "
        "or needs information that might exist in indexed sources."
    ),
    host=os.getenv("MCP_HOST", "0.0.0.0"),
    port=int(os.getenv("MCP_PORT", "8001")),
)

# Lazy-initialized globals
# Created on first tool call so importing this module stays cheap and does
# not require the embedding model or a Qdrant connection.
_embedder = None
_store = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _get_embedder():
    """Return the process-wide embedder, creating it on first use."""
    global _embedder
    if _embedder is not None:
        return _embedder

    from fusesearch.core.embedder import LocalEmbedder

    _embedder = LocalEmbedder()
    return _embedder
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _get_store():
    """Return the process-wide Qdrant store, creating it on first use."""
    global _store
    if _store is not None:
        return _store

    from fusesearch.store.qdrant import QdrantStore

    _store = QdrantStore(
        host=os.getenv("QDRANT_HOST", "localhost"),
        port=int(os.getenv("QDRANT_PORT", "6333")),
        dimension=_get_embedder().dimension,
    )
    return _store
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@mcp.tool()
def search(query: str, limit: int = 5) -> str:
    """Search the FuseSearch knowledge base. Use this tool whenever the user asks a factual or knowledge question — about a topic, concept, person, event, or anything that indexed documents might answer. Returns relevant document chunks with source titles and scores. Always search BEFORE answering knowledge questions."""
    # Docstring above is the MCP tool description sent to clients — keep verbatim.
    vec = _get_embedder().embed_one(query)
    hits = _get_store().hybrid_search(vec, query, limit=limit)

    if not hits:
        return "No results found."

    rendered = []
    for rank, hit in enumerate(hits, 1):
        title = hit.get("title", "Untitled")
        crumb = ""
        if hit.get("heading_path"):
            crumb = f" > {' > '.join(hit['heading_path'])}"
        score = hit.get("score", 0)
        body = hit.get("content", "")
        rendered.append(f"[{rank}] {title}{crumb} (score: {score:.4f})\n{body}")

    return "\n\n---\n\n".join(rendered)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@mcp.tool()
def count() -> str:
    """Return the number of indexed chunks in the store."""
    total = _get_store().count()
    return f"{total} chunks indexed"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Transports supported by the underlying FastMCP server.
Transport = Literal["stdio", "sse", "streamable-http"]


def main(transport: Transport = "streamable-http"):
    # Default differs from the CLI entry point (which passes "stdio");
    # running this module directly serves streamable HTTP.
    mcp.run(transport=transport)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from hashlib import sha256
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field, computed_field
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Document(BaseModel):
    """A normalized document from any source."""

    # Identifier of the adapter kind that produced this document
    # (e.g. "local_files").
    source_type: str
    # Source-specific unique id; for local files this is the absolute path.
    source_id: str
    title: str
    content: str
    url: str | None = None
    metadata: dict = Field(default_factory=dict)
    # Time this Document object was created, NOT the underlying resource's
    # modification time. NOTE(review): naive local time — confirm no caller
    # compares it against timezone-aware datetimes.
    fetched_at: datetime = Field(default_factory=datetime.now)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Chunk(BaseModel):
    """A piece of a document, ready for embedding and indexing."""

    # source_id of the parent Document this chunk was cut from.
    document_source_id: str
    source_type: str
    title: str
    content: str
    url: str | None = None
    metadata: dict = Field(default_factory=dict)
    # Titles of the markdown headings enclosing this chunk, outermost first.
    heading_path: list[str] = Field(default_factory=list)
    # Position of this chunk within its document's chunk list.
    chunk_index: int = 0

    @computed_field
    @property
    def content_hash(self) -> str:
        """SHA-256 hex digest of the chunk text; serves as its stable identity."""
        return sha256(self.content.encode()).hexdigest()
|
|
File without changes
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from collections.abc import Iterator
|
|
3
|
+
|
|
4
|
+
from fusesearch.models import Document
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class SourceAdapter(ABC):
    """Abstract base class for all source adapters.

    A concrete adapter wraps one data source and normalizes its contents
    into Document objects.
    """

    @property
    @abstractmethod
    def source_type(self) -> str:
        """Unique identifier for this source type (e.g. 'local_files')."""

    @abstractmethod
    def fetch(self) -> Iterator[Document]:
        """Fetch all documents from this source."""

    @abstractmethod
    def fetch_updated(self, since: str | None = None) -> Iterator[Document]:
        """Fetch only documents updated since the given cursor.

        The cursor format is source-specific (e.g. timestamp, page token).
        If None, behaves like fetch() (full sync).
        """
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from collections.abc import Iterator
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from fusesearch.models import Document
|
|
6
|
+
from fusesearch.sources.base import SourceAdapter
|
|
7
|
+
|
|
8
|
+
SUPPORTED_EXTENSIONS = {".md", ".txt", ".rst"}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class LocalFilesAdapter(SourceAdapter):
    """Source adapter for local markdown and text files.

    Recursively scans the configured directories and yields every file whose
    extension is in SUPPORTED_EXTENSIONS as a normalized Document.
    """

    def __init__(self, directories: list[str | Path]):
        self.directories = [Path(d) for d in directories]

    @property
    def source_type(self) -> str:
        return "local_files"

    def fetch(self) -> Iterator[Document]:
        """Yield every supported file under every configured directory."""
        for directory in self.directories:
            yield from self._scan_directory(directory)

    def fetch_updated(self, since: str | None = None) -> Iterator[Document]:
        """Yield documents whose file mtime is at or after `since` (ISO-8601).

        With None, behaves like fetch() (full sync).

        Bug fix: this previously compared `document.fetched_at`, which is
        set to "now" at scan time, so every document always passed the
        cutoff and incremental sync degenerated into a full sync. It now
        compares the file's modification time recorded by _scan_directory
        in metadata["modified_at"].
        """
        if since is None:
            yield from self.fetch()
            return

        cutoff = datetime.fromisoformat(since)
        for document in self.fetch():
            modified_at = datetime.fromisoformat(document.metadata["modified_at"])
            if modified_at >= cutoff:
                yield document

    def _scan_directory(self, directory: Path) -> Iterator[Document]:
        """Walk one directory tree and yield a Document per supported file."""
        if not directory.is_dir():
            return

        for path in directory.rglob("*"):
            if path.suffix not in SUPPORTED_EXTENSIONS:
                continue
            if not path.is_file():
                continue

            # errors="replace" so a stray non-UTF-8 byte can't abort the scan.
            content = path.read_text(encoding="utf-8", errors="replace")
            stat = path.stat()

            yield Document(
                source_type=self.source_type,
                source_id=str(path.resolve()),
                title=path.stem,
                content=content,
                metadata={
                    "path": str(path.resolve()),
                    "extension": path.suffix,
                    "size_bytes": stat.st_size,
                    "modified_at": datetime.fromtimestamp(stat.st_mtime).isoformat(),
                },
            )
|
|
File without changes
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import uuid
|
|
3
|
+
|
|
4
|
+
from qdrant_client import QdrantClient
|
|
5
|
+
from qdrant_client.models import (
|
|
6
|
+
Distance,
|
|
7
|
+
FieldCondition,
|
|
8
|
+
Filter,
|
|
9
|
+
MatchText,
|
|
10
|
+
PayloadSchemaType,
|
|
11
|
+
PointStruct,
|
|
12
|
+
VectorParams,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
from fusesearch.models import Chunk
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def hash_to_uuid(content_hash: str) -> str:
    """Map a SHA-256 hex digest onto a deterministic UUID string.

    Only the first 128 bits (32 hex chars) of the digest are used, which is
    plenty to keep collisions negligible for chunk identity.
    """
    return str(uuid.UUID(hex=content_hash[:32]))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class QdrantStore:
|
|
24
|
+
"""Vector store backed by Qdrant."""
|
|
25
|
+
|
|
26
|
+
    def __init__(self, host: str = "localhost", port: int = 6333, dimension: int = 384):
        # `dimension` must match the embedder producing the vectors
        # (the default 384 fits all-MiniLM-L6-v2).
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = os.getenv("FUSESEARCH_COLLECTION", "fusesearch")
        self.dimension = dimension
        # Create the collection and text index up front if they don't exist.
        self._ensure_collection()
|
|
31
|
+
|
|
32
|
+
def _ensure_collection(self):
|
|
33
|
+
collections = [c.name for c in self.client.get_collections().collections]
|
|
34
|
+
if self.collection_name not in collections:
|
|
35
|
+
self.client.create_collection(
|
|
36
|
+
collection_name=self.collection_name,
|
|
37
|
+
vectors_config=VectorParams(
|
|
38
|
+
size=self.dimension,
|
|
39
|
+
distance=Distance.COSINE,
|
|
40
|
+
),
|
|
41
|
+
)
|
|
42
|
+
self._ensure_text_index()
|
|
43
|
+
|
|
44
|
+
    def _ensure_text_index(self):
        """Create a full-text index on the content field for keyword search."""
        # payload_schema lists existing payload indexes; skip if already present,
        # making this safe to call repeatedly.
        collection_info = self.client.get_collection(self.collection_name)
        if "content" not in (collection_info.payload_schema or {}):
            self.client.create_payload_index(
                collection_name=self.collection_name,
                field_name="content",
                field_schema=PayloadSchemaType.TEXT,
            )
|
|
53
|
+
|
|
54
|
+
def upsert(self, chunks: list[Chunk], embeddings: list[list[float]]):
|
|
55
|
+
"""Insert or update chunks with their embeddings."""
|
|
56
|
+
points = [
|
|
57
|
+
PointStruct(
|
|
58
|
+
id=hash_to_uuid(chunk.content_hash),
|
|
59
|
+
vector=embedding,
|
|
60
|
+
payload={
|
|
61
|
+
"content": chunk.content,
|
|
62
|
+
"title": chunk.title,
|
|
63
|
+
"source_type": chunk.source_type,
|
|
64
|
+
"document_source_id": chunk.document_source_id,
|
|
65
|
+
"heading_path": chunk.heading_path,
|
|
66
|
+
"chunk_index": chunk.chunk_index,
|
|
67
|
+
"url": chunk.url,
|
|
68
|
+
"metadata": chunk.metadata,
|
|
69
|
+
},
|
|
70
|
+
)
|
|
71
|
+
for chunk, embedding in zip(chunks, embeddings)
|
|
72
|
+
]
|
|
73
|
+
self.client.upsert(collection_name=self.collection_name, points=points)
|
|
74
|
+
|
|
75
|
+
def search(self, query_vector: list[float], limit: int = 5) -> list[dict]:
|
|
76
|
+
"""Search for similar chunks by vector similarity."""
|
|
77
|
+
results = self.client.query_points(
|
|
78
|
+
collection_name=self.collection_name,
|
|
79
|
+
query=query_vector,
|
|
80
|
+
limit=limit,
|
|
81
|
+
)
|
|
82
|
+
return [
|
|
83
|
+
{
|
|
84
|
+
"_id": point.id,
|
|
85
|
+
"score": point.score,
|
|
86
|
+
"content": point.payload["content"],
|
|
87
|
+
"title": point.payload["title"],
|
|
88
|
+
"source_type": point.payload["source_type"],
|
|
89
|
+
"heading_path": point.payload["heading_path"],
|
|
90
|
+
"metadata": point.payload["metadata"],
|
|
91
|
+
}
|
|
92
|
+
for point in results.points
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
def keyword_search(self, query: str, limit: int = 20) -> list[dict]:
    """Keyword search over the full-text index on ``content``.

    NOTE(review): ``scroll`` with a ``MatchText`` filter only *matches* —
    it does not compute BM25-style relevance, so results carry no ``score``
    key and arrive in storage order, not ranked order.
    """
    matched, _next_offset = self.client.scroll(
        collection_name=self.collection_name,
        scroll_filter=Filter(
            must=[FieldCondition(key="content", match=MatchText(text=query))]
        ),
        limit=limit,
        with_payload=True,
        with_vectors=False,
    )
    hits = []
    for point in matched:
        payload = point.payload
        hits.append(
            {
                "_id": point.id,
                "content": payload["content"],
                "title": payload["title"],
                "source_type": payload["source_type"],
                "heading_path": payload["heading_path"],
                "metadata": payload["metadata"],
            }
        )
    return hits
|
|
117
|
+
|
|
118
|
+
def hybrid_search(
    self,
    query_vector: list[float],
    query_text: str,
    limit: int = 5,
    vector_weight: float = 0.7,
) -> list[dict]:
    """Combine vector and keyword search, fusing the two lists with RRF."""
    # Over-fetch from each leg so fusion has enough candidates to re-rank.
    per_leg = limit * 2
    by_vector = self.search(query_vector, limit=per_leg)
    by_keyword = self.keyword_search(query_text, limit=per_leg)
    return self._rrf_fuse(by_vector, by_keyword, vector_weight)[:limit]
|
|
130
|
+
|
|
131
|
+
@staticmethod
def _rrf_fuse(
    vector_results: list[dict],
    keyword_results: list[dict],
    vector_weight: float = 0.7,
    k: int = 60,
) -> list[dict]:
    """Fuse two ranked result lists with weighted Reciprocal Rank Fusion.

    Each appearance contributes ``weight / (k + rank + 1)``; a result found
    by both legs (same ``_id``) accumulates both contributions. The fused
    score replaces any pre-existing ``score`` on the returned dicts.
    """
    kw_weight = 1.0 - vector_weight
    fused: dict[str, float] = {}
    by_id: dict[str, dict] = {}

    for rank, hit in enumerate(vector_results):
        rid = str(hit["_id"])
        fused[rid] = fused.get(rid, 0) + vector_weight / (k + rank + 1)
        by_id[rid] = hit

    for rank, hit in enumerate(keyword_results):
        rid = str(hit["_id"])
        fused[rid] = fused.get(rid, 0) + kw_weight / (k + rank + 1)
        # Prefer the vector-leg dict (it carries a similarity score).
        by_id.setdefault(rid, hit)

    ordered = sorted(fused.items(), key=lambda item: item[1], reverse=True)
    return [{**by_id[rid], "score": score} for rid, score in ordered]
|
|
156
|
+
|
|
157
|
+
def get_existing_hashes(self) -> set[str]:
    """Return the IDs of all points currently in the collection.

    NOTE(review): despite the name, these are Qdrant point IDs — upsert
    stores points under ``hash_to_uuid(content_hash)`` — not the raw
    content hashes. Confirm callers compare against the same UUID form.
    """
    seen: set[str] = set()
    cursor = None
    while True:
        # Page through the collection 100 points at a time, IDs only.
        points, cursor = self.client.scroll(
            collection_name=self.collection_name,
            limit=100,
            offset=cursor,
            with_payload=False,
            with_vectors=False,
        )
        seen.update(point.id for point in points)
        if cursor is None:  # Qdrant signals the last page with a None offset
            return seen
|
|
175
|
+
|
|
176
|
+
def delete_by_hashes(self, hashes: set[str]):
    """Delete points whose IDs are in ``hashes``; no-op on an empty set.

    NOTE(review): the values are used directly as point IDs, and points are
    stored under ``hash_to_uuid(content_hash)`` — callers must pass IDs in
    that same form (e.g. values from ``get_existing_hashes``).
    """
    if hashes:
        self.client.delete(
            collection_name=self.collection_name,
            points_selector=list(hashes),
        )
|
|
184
|
+
|
|
185
|
+
def count(self) -> int:
    """Number of chunks (points) currently indexed in the collection."""
    result = self.client.count(collection_name=self.collection_name)
    return result.count
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fusesearch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multi-source search aggregation tool with AI-powered retrieval and response synthesis
|
|
5
|
+
Author-email: Anton Lebedev <pypi@katzo.net>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: pydantic>=2.12
|
|
15
|
+
Requires-Dist: qdrant-client>=1.16
|
|
16
|
+
Requires-Dist: fastapi>=0.129
|
|
17
|
+
Requires-Dist: uvicorn>=0.40
|
|
18
|
+
Requires-Dist: tqdm>=4.67
|
|
19
|
+
Provides-Extra: mcp
|
|
20
|
+
Requires-Dist: mcp[cli]>=1.26; extra == "mcp"
|
|
21
|
+
Provides-Extra: local
|
|
22
|
+
Requires-Dist: sentence-transformers>=5.2; extra == "local"
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Requires-Dist: fusesearch[local,mcp]; extra == "all"
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=9.0; extra == "dev"
|
|
27
|
+
Requires-Dist: ruff>=0.15; extra == "dev"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# FuseSearch
|
|
31
|
+
|
|
32
|
+
Multi-source search aggregation tool that unifies retrieval across diverse data sources — Confluence, MCP servers, local files, and more — using AI-powered search and response synthesis through a single query interface.
|
|
33
|
+
|
|
34
|
+
## Quick Start
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
make build
|
|
38
|
+
make start
|
|
39
|
+
make index # index docs from data/docs
|
|
40
|
+
make search "your query"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## MCP Server
|
|
44
|
+
|
|
45
|
+
The `fusesearch-mcp` Docker service exposes a streamable HTTP endpoint on port 8001. Tools: `search` (hybrid search), `count` (indexed chunks).
|
|
46
|
+
|
|
47
|
+
### Claude Code
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
claude mcp add fusesearch http://localhost:8001/mcp --transport http
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Claude Desktop
|
|
54
|
+
|
|
55
|
+
**Option 1: Connectors UI (recommended)**
|
|
56
|
+
|
|
57
|
+
In Claude Desktop, go to **Settings > Connectors > Add custom connector** and enter `http://localhost:8001/mcp`.
|
|
58
|
+
|
|
59
|
+
**Option 2: Config file with `mcp-remote` bridge (local dev)**
|
|
60
|
+
|
|
61
|
+
Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
|
|
62
|
+
|
|
63
|
+
```json
|
|
64
|
+
{
|
|
65
|
+
"mcpServers": {
|
|
66
|
+
"fusesearch": {
|
|
67
|
+
"command": "npx",
|
|
68
|
+
"args": ["-y", "mcp-remote", "http://localhost:8001/mcp", "--allow-http"]
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Requires Node.js >= 18. `--allow-http` is required for plain HTTP (not needed for HTTPS).
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
fusesearch/__init__.py
|
|
5
|
+
fusesearch/__main__.py
|
|
6
|
+
fusesearch/indexer.py
|
|
7
|
+
fusesearch/mcp_server.py
|
|
8
|
+
fusesearch/models.py
|
|
9
|
+
fusesearch.egg-info/PKG-INFO
|
|
10
|
+
fusesearch.egg-info/SOURCES.txt
|
|
11
|
+
fusesearch.egg-info/dependency_links.txt
|
|
12
|
+
fusesearch.egg-info/requires.txt
|
|
13
|
+
fusesearch.egg-info/top_level.txt
|
|
14
|
+
fusesearch/api/__init__.py
|
|
15
|
+
fusesearch/api/server.py
|
|
16
|
+
fusesearch/core/__init__.py
|
|
17
|
+
fusesearch/core/chunker.py
|
|
18
|
+
fusesearch/core/embedder.py
|
|
19
|
+
fusesearch/sources/__init__.py
|
|
20
|
+
fusesearch/sources/base.py
|
|
21
|
+
fusesearch/sources/local_files.py
|
|
22
|
+
fusesearch/store/__init__.py
|
|
23
|
+
fusesearch/store/qdrant.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
fusesearch
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "fusesearch"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Multi-source search aggregation tool with AI-powered retrieval and response synthesis"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
requires-python = ">=3.12"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Anton Lebedev", email = "pypi@katzo.net" },
|
|
10
|
+
]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 3 - Alpha",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.12",
|
|
16
|
+
]
|
|
17
|
+
dependencies = [
|
|
18
|
+
"pydantic>=2.12",
|
|
19
|
+
"qdrant-client>=1.16",
|
|
20
|
+
"fastapi>=0.129",
|
|
21
|
+
"uvicorn>=0.40",
|
|
22
|
+
"tqdm>=4.67",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.optional-dependencies]
|
|
26
|
+
mcp = [
|
|
27
|
+
"mcp[cli]>=1.26",
|
|
28
|
+
]
|
|
29
|
+
local = [
|
|
30
|
+
"sentence-transformers>=5.2",
|
|
31
|
+
]
|
|
32
|
+
all = [
|
|
33
|
+
"fusesearch[mcp,local]",
|
|
34
|
+
]
|
|
35
|
+
dev = [
|
|
36
|
+
"pytest>=9.0",
|
|
37
|
+
"ruff>=0.15",
|
|
38
|
+
]
|