fusesearch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Anton Lebedev
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.4
2
+ Name: fusesearch
3
+ Version: 0.1.0
4
+ Summary: Multi-source search aggregation tool with AI-powered retrieval and response synthesis
5
+ Author-email: Anton Lebedev <pypi@katzo.net>
6
+ License-Expression: MIT
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pydantic>=2.12
15
+ Requires-Dist: qdrant-client>=1.16
16
+ Requires-Dist: fastapi>=0.129
17
+ Requires-Dist: uvicorn>=0.40
18
+ Requires-Dist: tqdm>=4.67
19
+ Provides-Extra: mcp
20
+ Requires-Dist: mcp[cli]>=1.26; extra == "mcp"
21
+ Provides-Extra: local
22
+ Requires-Dist: sentence-transformers>=5.2; extra == "local"
23
+ Provides-Extra: all
24
+ Requires-Dist: fusesearch[local,mcp]; extra == "all"
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=9.0; extra == "dev"
27
+ Requires-Dist: ruff>=0.15; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # FuseSearch
31
+
32
+ Multi-source search aggregation tool that unifies retrieval across diverse data sources — Confluence, MCP servers, local files, and more — using AI-powered search and response synthesis through a single query interface.
33
+
34
+ ## Quick Start
35
+
36
+ ```bash
37
+ make build
38
+ make start
39
+ make index # index docs from data/docs
40
+ make search "your query"
41
+ ```
42
+
43
+ ## MCP Server
44
+
45
+ The `fusesearch-mcp` Docker service exposes a streamable HTTP endpoint on port 8001. Tools: `search` (hybrid search), `count` (indexed chunks).
46
+
47
+ ### Claude Code
48
+
49
+ ```bash
50
+ claude mcp add fusesearch http://localhost:8001/mcp --transport http
51
+ ```
52
+
53
+ ### Claude Desktop
54
+
55
+ **Option 1: Connectors UI (recommended)**
56
+
57
+ In Claude Desktop, go to **Settings > Connectors > Add custom connector** and enter `http://localhost:8001/mcp`.
58
+
59
+ **Option 2: Config file with `mcp-remote` bridge (local dev)**
60
+
61
+ Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
62
+
63
+ ```json
64
+ {
65
+ "mcpServers": {
66
+ "fusesearch": {
67
+ "command": "npx",
68
+ "args": ["-y", "mcp-remote", "http://localhost:8001/mcp", "--allow-http"]
69
+ }
70
+ }
71
+ }
72
+ ```
73
+
74
+ Requires Node.js >= 18. `--allow-http` is required for plain HTTP (not needed for HTTPS).
@@ -0,0 +1,45 @@
1
+ # FuseSearch
2
+
3
+ Multi-source search aggregation tool that unifies retrieval across diverse data sources — Confluence, MCP servers, local files, and more — using AI-powered search and response synthesis through a single query interface.
4
+
5
+ ## Quick Start
6
+
7
+ ```bash
8
+ make build
9
+ make start
10
+ make index # index docs from data/docs
11
+ make search "your query"
12
+ ```
13
+
14
+ ## MCP Server
15
+
16
+ The `fusesearch-mcp` Docker service exposes a streamable HTTP endpoint on port 8001. Tools: `search` (hybrid search), `count` (indexed chunks).
17
+
18
+ ### Claude Code
19
+
20
+ ```bash
21
+ claude mcp add fusesearch http://localhost:8001/mcp --transport http
22
+ ```
23
+
24
+ ### Claude Desktop
25
+
26
+ **Option 1: Connectors UI (recommended)**
27
+
28
+ In Claude Desktop, go to **Settings > Connectors > Add custom connector** and enter `http://localhost:8001/mcp`.
29
+
30
+ **Option 2: Config file with `mcp-remote` bridge (local dev)**
31
+
32
+ Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
33
+
34
+ ```json
35
+ {
36
+ "mcpServers": {
37
+ "fusesearch": {
38
+ "command": "npx",
39
+ "args": ["-y", "mcp-remote", "http://localhost:8001/mcp", "--allow-http"]
40
+ }
41
+ }
42
+ }
43
+ ```
44
+
45
+ Requires Node.js >= 18. `--allow-http` is required for plain HTTP (not needed for HTTPS).
File without changes
@@ -0,0 +1,120 @@
1
+ import argparse
2
+ import os
3
+
4
+
5
def _make_embedder():
    """Construct the local sentence-transformers embedder.

    Imported lazily so plain CLI invocations (e.g. --help) do not pay the
    cost of loading the embedding stack.
    """
    from fusesearch.core.embedder import LocalEmbedder

    return LocalEmbedder()
9
+
10
+
11
def _make_store(embedder):
    """Construct a QdrantStore from QDRANT_HOST / QDRANT_PORT env vars.

    The vector dimension is taken from the embedder so the collection
    matches whatever embedding model is configured.
    """
    from fusesearch.store.qdrant import QdrantStore

    qdrant_host = os.getenv("QDRANT_HOST", "localhost")
    qdrant_port = int(os.getenv("QDRANT_PORT", "6333"))
    return QdrantStore(host=qdrant_host, port=qdrant_port, dimension=embedder.dimension)
19
+
20
+
21
def cmd_serve(args):
    """Run the FastAPI server; CLI flags take precedence over env vars."""
    import uvicorn

    # NOTE(review): default bind is 0.0.0.0 (all interfaces) — appropriate
    # inside Docker; confirm for bare-metal deployments.
    bind_host = args.host if args.host else os.getenv("FUSESEARCH_HOST", "0.0.0.0")
    bind_port = int(args.port if args.port else os.getenv("FUSESEARCH_PORT", "8000"))
    uvicorn.run("fusesearch.api.server:app", host=bind_host, port=bind_port)
27
+
28
+
29
def cmd_index(args):
    """Index documents from the directories given on the command line."""
    from fusesearch.indexer import Indexer
    from fusesearch.sources.local_files import LocalFilesAdapter

    embedder = _make_embedder()
    store = _make_store(embedder)

    docs = list(LocalFilesAdapter(directories=args.paths).fetch())
    print(f"Found {len(docs)} documents")

    stats = Indexer(store=store, embedder=embedder).index_documents(docs)
    print(
        f"Indexed: {stats['new']} new, {stats['skipped']} skipped, {stats['deleted']} deleted"
    )
    print(f"Total chunks in store: {store.count()}")
46
+
47
+
48
def cmd_search(args):
    """Search the index and pretty-print the top results.

    Uses hybrid (vector + keyword) retrieval unless --no-hybrid was given.
    """
    embedder = _make_embedder()
    store = _make_store(embedder)
    hybrid = not args.no_hybrid

    query_vector = embedder.embed_one(args.query)
    if hybrid:
        results = store.hybrid_search(query_vector, args.query, limit=args.limit)
    else:
        results = store.search(query_vector, limit=args.limit)

    mode = "hybrid" if hybrid else "vector-only"
    print(f"Search mode: {mode}")
    for i, result in enumerate(results, 1):
        print(f"\n--- Result {i} (score: {result['score']:.4f}) ---")
        print(f"Title: {result['title']}")
        if result["heading_path"]:
            print(f"Section: {' > '.join(result['heading_path'])}")
        # Fix: only show an ellipsis when the preview is actually truncated
        # (the previous code appended "..." unconditionally).
        content = result["content"]
        preview = content[:300] + ("..." if len(content) > 300 else "")
        print(f"Content: {preview}")
67
+
68
+
69
def main():
    """CLI entry point: serve | index | search | mcp (defaults to serve)."""
    parser = argparse.ArgumentParser(
        prog="fusesearch", description="FuseSearch - multi-source search"
    )
    subparsers = parser.add_subparsers(dest="command")

    # "serve" — run the HTTP API (also the default when no command is given).
    serve_parser = subparsers.add_parser("serve", help="Start the API server")
    serve_parser.add_argument("--host", default=None, help="Host to bind to")
    serve_parser.add_argument("--port", default=None, help="Port to bind to")

    # "index" — ingest local files into the vector store.
    index_parser = subparsers.add_parser(
        "index", help="Index documents from local files"
    )
    index_parser.add_argument("paths", nargs="+", help="Directories to index")

    # "search" — query the store from the command line.
    search_parser = subparsers.add_parser("search", help="Search indexed documents")
    search_parser.add_argument("query", help="Search query")
    search_parser.add_argument("--limit", type=int, default=5, help="Number of results")
    search_parser.add_argument(
        "--no-hybrid", action="store_true", help="Disable hybrid search (vector-only)"
    )

    # "mcp" — run the MCP server over the chosen transport.
    mcp_parser = subparsers.add_parser("mcp", help="Start the MCP server")
    mcp_parser.add_argument(
        "--transport",
        choices=["stdio", "sse", "streamable-http"],
        default="stdio",
        help="Transport type",
    )

    args = parser.parse_args()

    if args.command == "index":
        cmd_index(args)
    elif args.command == "search":
        cmd_search(args)
    elif args.command == "mcp":
        # Imported here so the "mcp" extra is only required for this command.
        from fusesearch.mcp_server import main as mcp_main

        mcp_main(transport=args.transport)
    elif args.command == "serve":
        cmd_serve(args)
    else:
        # No subcommand given: behave like "serve" with env-derived defaults.
        cmd_serve(argparse.Namespace(host=None, port=None))


if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,54 @@
1
+ import os
2
+
3
+ from fastapi import FastAPI
4
+ from pydantic import BaseModel
5
+
6
+ from fusesearch.core.embedder import LocalEmbedder
7
+ from fusesearch.indexer import Indexer
8
+ from fusesearch.sources.local_files import LocalFilesAdapter
9
+ from fusesearch.store.qdrant import QdrantStore
10
+
11
# Application wiring happens at import time: the embedding model is loaded
# and the Qdrant collection ensured as soon as this module is imported.
app = FastAPI(title="FuseSearch", version="0.1.0")

embedder = LocalEmbedder()

qdrant_host = os.getenv("QDRANT_HOST", "localhost")
qdrant_port = int(os.getenv("QDRANT_PORT", "6333"))
store = QdrantStore(host=qdrant_host, port=qdrant_port, dimension=embedder.dimension)
indexer = Indexer(store=store, embedder=embedder)
19
+
20
+
21
class SearchRequest(BaseModel):
    """Body of POST /search."""

    query: str  # free-text query
    limit: int = 5  # maximum number of results to return
    hybrid: bool = True  # fuse vector + keyword rankings when True
    vector_weight: float = 0.7  # RRF weight given to the vector ranking
26
+
27
+
28
class IndexRequest(BaseModel):
    """Body of POST /index."""

    paths: list[str]  # directories to scan for indexable files
30
+
31
+
32
@app.get("/health")
def health():
    """Liveness check; also reports how many chunks are currently indexed."""
    return {"status": "ok", "chunks_indexed": store.count()}
35
+
36
+
37
@app.post("/search")
def search(req: SearchRequest):
    """Embed the query and run vector-only or hybrid retrieval."""
    query_vector = embedder.embed_one(req.query)
    if not req.hybrid:
        results = store.search(query_vector, limit=req.limit)
    else:
        results = store.hybrid_search(
            query_vector, req.query, limit=req.limit, vector_weight=req.vector_weight
        )
    return {"results": results}
47
+
48
+
49
@app.post("/index")
def index(req: IndexRequest):
    """Fetch documents from the given directories and (re)index them."""
    documents = list(LocalFilesAdapter(directories=req.paths).fetch())
    stats = indexer.index_documents(documents)
    return {"documents_found": len(documents), **stats}
File without changes
@@ -0,0 +1,89 @@
1
+ import re
2
+
3
+ from fusesearch.models import Chunk, Document
4
+
5
HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)  # ATX headings, levels 1-6
6
+
7
+
8
def chunk_document(document: Document, max_chunk_size: int = 1000) -> list[Chunk]:
    """Split a document into chunks by markdown headings.

    Headings give semantic boundaries; sections still larger than
    max_chunk_size are further split on paragraph breaks.  Whitespace-only
    fragments are dropped.
    """
    chunks: list[Chunk] = []

    for heading_path, section_text in _split_by_headings(document.content):
        for piece in _split_by_size(section_text, max_chunk_size):
            text = piece.strip()
            if not text:
                continue  # skip fragments that are empty after trimming
            chunks.append(
                Chunk(
                    document_source_id=document.source_id,
                    source_type=document.source_type,
                    title=document.title,
                    content=text,
                    url=document.url,
                    metadata=document.metadata,
                    heading_path=heading_path,
                    chunk_index=len(chunks),
                )
            )

    return chunks
33
+
34
+
35
+ def _split_by_headings(text: str) -> list[tuple[list[str], str]]:
36
+ """Split markdown text by headings, tracking the heading hierarchy."""
37
+ matches = list(HEADING_PATTERN.finditer(text))
38
+
39
+ if not matches:
40
+ return [([], text)]
41
+
42
+ sections: list[tuple[list[str], str]] = []
43
+ heading_stack: list[tuple[int, str]] = []
44
+
45
+ # Content before the first heading
46
+ preamble = text[: matches[0].start()].strip()
47
+ if preamble:
48
+ sections.append(([], preamble))
49
+
50
+ for i, match in enumerate(matches):
51
+ level = len(match.group(1))
52
+ title = match.group(2).strip()
53
+
54
+ # Update heading stack — pop headings at same or deeper level
55
+ heading_stack = [(lvl, t) for lvl, t in heading_stack if lvl < level]
56
+ heading_stack.append((level, title))
57
+
58
+ # Extract content between this heading and the next
59
+ start = match.end()
60
+ end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
61
+ content = text[start:end].strip()
62
+
63
+ if content:
64
+ path = [t for _, t in heading_stack]
65
+ sections.append((path, content))
66
+
67
+ return sections
68
+
69
+
70
+ def _split_by_size(text: str, max_size: int) -> list[str]:
71
+ """Split text into pieces that fit within max_size, splitting on paragraphs."""
72
+ if len(text) <= max_size:
73
+ return [text]
74
+
75
+ paragraphs = text.split("\n\n")
76
+ pieces: list[str] = []
77
+ current = ""
78
+
79
+ for paragraph in paragraphs:
80
+ if current and len(current) + len(paragraph) + 2 > max_size:
81
+ pieces.append(current)
82
+ current = paragraph
83
+ else:
84
+ current = f"{current}\n\n{paragraph}" if current else paragraph
85
+
86
+ if current:
87
+ pieces.append(current)
88
+
89
+ return pieces
@@ -0,0 +1,37 @@
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+
4
+
5
class Embedder(ABC):
    """Interface for embedding providers.

    Concrete providers supply the vector dimension and a batch embed();
    embed_one() is derived from the batch method.
    """

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Dimension of the embedding vectors."""

    @abstractmethod
    def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for a batch of texts."""

    def embed_one(self, text: str) -> list[float]:
        """Embed a single text via the batch API."""
        vectors = self.embed([text])
        return vectors[0]
19
+
20
+
21
class LocalEmbedder(Embedder):
    """Embedding provider backed by a local sentence-transformers model.

    The model name comes from FUSESEARCH_EMBED_MODEL (default
    "all-MiniLM-L6-v2"); pass local_files_only=True to forbid downloads.
    """

    def __init__(self, model: str | None = None, local_files_only: bool = False):
        # Imported lazily so the package works without the "local" extra.
        from sentence_transformers import SentenceTransformer

        model_name = model or os.getenv("FUSESEARCH_EMBED_MODEL", "all-MiniLM-L6-v2")
        self.model = SentenceTransformer(model_name, local_files_only=local_files_only)
        self._dimension = self.model.get_sentence_embedding_dimension()

    @property
    def dimension(self) -> int:
        """Dimension reported by the loaded model."""
        return self._dimension

    def embed(self, texts: list[str]) -> list[list[float]]:
        # encode() returns a numpy array; convert to plain Python lists.
        return self.model.encode(texts).tolist()
@@ -0,0 +1,57 @@
1
+ from tqdm import tqdm
2
+
3
+ from fusesearch.core.chunker import chunk_document
4
+ from fusesearch.core.embedder import Embedder
5
+ from fusesearch.models import Chunk, Document
6
+ from fusesearch.store.qdrant import QdrantStore, hash_to_uuid
7
+
8
EMBED_BATCH_SIZE = 64  # chunks embedded per model call
9
+
10
+
11
class Indexer:
    """Chunks documents, diffs them against the store, and indexes the delta."""

    def __init__(self, store: QdrantStore, embedder: Embedder):
        self.store = store
        self.embedder = embedder

    def index_documents(self, documents: list[Document]) -> dict:
        """Chunk, diff, embed, and store documents.

        Only chunks whose content-hash-derived id is not already present
        are embedded; stored chunks that no longer appear in the input are
        deleted.

        Returns stats: total_chunks, new, skipped, deleted.
        """
        # Chunk all documents up front.
        all_chunks: list[Chunk] = []
        for doc in tqdm(documents, desc="Chunking", unit="doc"):
            all_chunks.extend(chunk_document(doc))

        # Compute each chunk's point id once (previously hashed twice per chunk).
        ids_and_chunks = [(hash_to_uuid(c.content_hash), c) for c in all_chunks]
        new_ids = {cid for cid, _ in ids_and_chunks}
        existing_ids = self.store.get_existing_hashes()

        to_add = [c for cid, c in ids_and_chunks if cid not in existing_ids]
        to_delete = existing_ids - new_ids

        # Remove chunks that disappeared from the source documents.
        if to_delete:
            self.store.delete_by_hashes(to_delete)

        # Embed and upsert new chunks in batches; the "with" block guarantees
        # the progress bar is closed even if embedding fails mid-way.
        if to_add:
            with tqdm(total=len(to_add), desc="Embedding", unit="chunk") as pbar:
                for start in range(0, len(to_add), EMBED_BATCH_SIZE):
                    batch = to_add[start : start + EMBED_BATCH_SIZE]
                    embeddings = self.embedder.embed([c.content for c in batch])
                    self.store.upsert(batch, embeddings)
                    pbar.update(len(batch))

        return {
            "total_chunks": len(all_chunks),
            "new": len(to_add),
            "skipped": len(all_chunks) - len(to_add),
            "deleted": len(to_delete),
        }
@@ -0,0 +1,84 @@
1
+ import os
2
+ from typing import Literal
3
+
4
+ from mcp.server.fastmcp import FastMCP
5
+
6
# The FastMCP server is created at import time; host/port only matter for
# the HTTP-based transports (sse / streamable-http).
mcp = FastMCP(
    "FuseSearch",
    instructions=(
        "FuseSearch is a knowledge base with indexed documents, blog posts, and notes. "
        "Use this server when the user asks factual questions, wants to look up a topic, "
        "or needs information that might exist in indexed sources."
    ),
    host=os.getenv("MCP_HOST", "0.0.0.0"),
    port=int(os.getenv("MCP_PORT", "8001")),
)

# Lazy-initialized globals: the embedding model and the store are only
# created on first tool call, keeping server startup fast.
_embedder = None
_store = None
20
+
21
+
22
def _get_embedder():
    """Return the process-wide embedder, creating it on first use."""
    global _embedder
    if _embedder is not None:
        return _embedder
    from fusesearch.core.embedder import LocalEmbedder

    _embedder = LocalEmbedder()
    return _embedder
29
+
30
+
31
def _get_store():
    """Return the process-wide QdrantStore, creating it on first use."""
    global _store
    if _store is not None:
        return _store
    from fusesearch.store.qdrant import QdrantStore

    _store = QdrantStore(
        host=os.getenv("QDRANT_HOST", "localhost"),
        port=int(os.getenv("QDRANT_PORT", "6333")),
        dimension=_get_embedder().dimension,
    )
    return _store
42
+
43
+
44
@mcp.tool()
def search(query: str, limit: int = 5) -> str:
    """Search the FuseSearch knowledge base. Use this tool whenever the user asks a factual or knowledge question — about a topic, concept, person, event, or anything that indexed documents might answer. Returns relevant document chunks with source titles and scores. Always search BEFORE answering knowledge questions."""
    # (Docstring doubles as the MCP tool description shown to clients.)
    store = _get_store()
    query_vector = _get_embedder().embed_one(query)
    hits = store.hybrid_search(query_vector, query, limit=limit)

    if not hits:
        return "No results found."

    rendered = []
    for pos, hit in enumerate(hits, 1):
        title = hit.get("title", "Untitled")
        section = ""
        if hit.get("heading_path"):
            section = f" > {' > '.join(hit['heading_path'])}"
        score = hit.get("score", 0)
        body = hit.get("content", "")
        rendered.append(f"[{pos}] {title}{section} (score: {score:.4f})\n{body}")

    return "\n\n---\n\n".join(rendered)
67
+
68
+
69
@mcp.tool()
def count() -> str:
    """Return the number of indexed chunks in the store."""
    total = _get_store().count()
    return f"{total} chunks indexed"
74
+
75
+
76
# Transport names accepted by FastMCP.run().
Transport = Literal["stdio", "sse", "streamable-http"]


def main(transport: Transport = "streamable-http"):
    """Run the MCP server on the given transport (HTTP streaming by default)."""
    mcp.run(transport=transport)


if __name__ == "__main__":
    main()
@@ -0,0 +1,34 @@
1
+ from datetime import datetime
2
+ from hashlib import sha256
3
+
4
+ from pydantic import BaseModel, Field, computed_field
5
+
6
+
7
class Document(BaseModel):
    """A normalized document from any source.

    Produced by source adapters; consumed by the chunker and indexer.
    """

    source_type: str  # adapter identifier, e.g. "local_files"
    source_id: str  # stable unique id within the source (e.g. absolute path)
    title: str
    content: str
    url: str | None = None
    metadata: dict = Field(default_factory=dict)  # source-specific extras
    # Naive local timestamp of when the adapter produced this object.
    fetched_at: datetime = Field(default_factory=datetime.now)
17
+
18
+
19
class Chunk(BaseModel):
    """A piece of a document, ready for embedding and indexing."""

    document_source_id: str  # source_id of the parent Document
    source_type: str
    title: str  # title of the parent document
    content: str
    url: str | None = None
    metadata: dict = Field(default_factory=dict)
    heading_path: list[str] = Field(default_factory=list)  # markdown heading trail
    chunk_index: int = 0  # position of this chunk within its document

    @computed_field
    @property
    def content_hash(self) -> str:
        """SHA-256 of the content; serves as a stable identity for diffing."""
        return sha256(self.content.encode()).hexdigest()
File without changes
@@ -0,0 +1,25 @@
1
+ from abc import ABC, abstractmethod
2
+ from collections.abc import Iterator
3
+
4
+ from fusesearch.models import Document
5
+
6
+
7
class SourceAdapter(ABC):
    """Abstract base class for all source adapters.

    An adapter turns an external system (local files, wiki, ...) into a
    stream of normalized Document objects.
    """

    @property
    @abstractmethod
    def source_type(self) -> str:
        """Unique identifier for this source type (e.g. 'local_files')."""

    @abstractmethod
    def fetch(self) -> Iterator[Document]:
        """Fetch all documents from this source."""

    @abstractmethod
    def fetch_updated(self, since: str | None = None) -> Iterator[Document]:
        """Fetch only documents updated since the given cursor.

        The cursor format is source-specific (e.g. timestamp, page token).
        If None, behaves like fetch() (full sync).
        """
@@ -0,0 +1,59 @@
1
+ from collections.abc import Iterator
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+
5
+ from fusesearch.models import Document
6
+ from fusesearch.sources.base import SourceAdapter
7
+
8
SUPPORTED_EXTENSIONS = {".md", ".txt", ".rst"}  # file types the adapter will index
9
+
10
+
11
class LocalFilesAdapter(SourceAdapter):
    """Source adapter for local markdown and text files.

    Recursively scans the configured directories for files whose extension
    is in SUPPORTED_EXTENSIONS and yields them as Documents.
    """

    def __init__(self, directories: list[str | Path]):
        self.directories = [Path(d) for d in directories]

    @property
    def source_type(self) -> str:
        return "local_files"

    def fetch(self) -> Iterator[Document]:
        """Yield every supported file in every configured directory."""
        for directory in self.directories:
            yield from self._scan_directory(directory)

    def fetch_updated(self, since: str | None = None) -> Iterator[Document]:
        """Yield only files modified at or after the ISO-format cursor.

        Fix: the previous implementation compared Document.fetched_at (the
        time the file was read, i.e. "now") against the cutoff, so every
        file always passed and incremental sync degenerated to a full sync.
        We now compare the file's mtime, recorded in metadata["modified_at"].
        """
        if since is None:
            yield from self.fetch()
            return

        cutoff = datetime.fromisoformat(since)
        for document in self.fetch():
            modified_at = datetime.fromisoformat(document.metadata["modified_at"])
            if modified_at >= cutoff:
                yield document

    def _scan_directory(self, directory: Path) -> Iterator[Document]:
        """Recursively yield Documents for supported files under directory."""
        if not directory.is_dir():
            return  # silently skip missing or non-directory paths

        for path in directory.rglob("*"):
            # Extension check first (cheap); lower-cased so ".MD" matches too.
            if path.suffix.lower() not in SUPPORTED_EXTENSIONS:
                continue
            if not path.is_file():
                continue

            # errors="replace" keeps indexing going on badly-encoded files.
            content = path.read_text(encoding="utf-8", errors="replace")
            stat = path.stat()

            yield Document(
                source_type=self.source_type,
                source_id=str(path.resolve()),
                title=path.stem,
                content=content,
                metadata={
                    "path": str(path.resolve()),
                    "extension": path.suffix,
                    "size_bytes": stat.st_size,
                    # Naive local timestamp; consumed by fetch_updated().
                    "modified_at": datetime.fromtimestamp(stat.st_mtime).isoformat(),
                },
            )
File without changes
@@ -0,0 +1,187 @@
1
+ import os
2
+ import uuid
3
+
4
+ from qdrant_client import QdrantClient
5
+ from qdrant_client.models import (
6
+ Distance,
7
+ FieldCondition,
8
+ Filter,
9
+ MatchText,
10
+ PayloadSchemaType,
11
+ PointStruct,
12
+ VectorParams,
13
+ )
14
+
15
+ from fusesearch.models import Chunk
16
+
17
+
18
def hash_to_uuid(content_hash: str) -> str:
    """Derive a deterministic point UUID from a SHA-256 hex digest.

    Qdrant point ids must be UUIDs or integers, so the first 32 hex
    characters of the digest are reinterpreted as a UUID.
    """
    return str(uuid.UUID(hex=content_hash[:32]))
21
+
22
+
23
class QdrantStore:
    """Vector store backed by Qdrant.

    Point ids are UUIDs derived from each chunk's content hash, so identical
    content maps to the same point and upserts are idempotent.
    """

    def __init__(self, host: str = "localhost", port: int = 6333, dimension: int = 384):
        self.client = QdrantClient(host=host, port=port)
        self.collection_name = os.getenv("FUSESEARCH_COLLECTION", "fusesearch")
        self.dimension = dimension
        self._ensure_collection()

    def _ensure_collection(self):
        """Create the collection (cosine distance) if it does not exist yet."""
        existing = {c.name for c in self.client.get_collections().collections}
        if self.collection_name not in existing:
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=self.dimension,
                    distance=Distance.COSINE,
                ),
            )
        self._ensure_text_index()

    def _ensure_text_index(self):
        """Create a full-text index on the content field for keyword search."""
        info = self.client.get_collection(self.collection_name)
        if "content" not in (info.payload_schema or {}):
            self.client.create_payload_index(
                collection_name=self.collection_name,
                field_name="content",
                field_schema=PayloadSchemaType.TEXT,
            )

    @staticmethod
    def _payload_fields(payload: dict) -> dict:
        """Payload fields exposed in every search result row."""
        return {
            "content": payload["content"],
            "title": payload["title"],
            "source_type": payload["source_type"],
            "heading_path": payload["heading_path"],
            "metadata": payload["metadata"],
        }

    def upsert(self, chunks: list[Chunk], embeddings: list[list[float]]):
        """Insert or update chunks; point id is the UUID of the content hash."""
        points = []
        for chunk, vector in zip(chunks, embeddings):
            points.append(
                PointStruct(
                    id=hash_to_uuid(chunk.content_hash),
                    vector=vector,
                    payload={
                        "content": chunk.content,
                        "title": chunk.title,
                        "source_type": chunk.source_type,
                        "document_source_id": chunk.document_source_id,
                        "heading_path": chunk.heading_path,
                        "chunk_index": chunk.chunk_index,
                        "url": chunk.url,
                        "metadata": chunk.metadata,
                    },
                )
            )
        self.client.upsert(collection_name=self.collection_name, points=points)

    def search(self, query_vector: list[float], limit: int = 5) -> list[dict]:
        """Dense-vector similarity search (cosine)."""
        response = self.client.query_points(
            collection_name=self.collection_name,
            query=query_vector,
            limit=limit,
        )
        return [
            {"_id": p.id, "score": p.score, **self._payload_fields(p.payload)}
            for p in response.points
        ]

    def keyword_search(self, query: str, limit: int = 20) -> list[dict]:
        """Full-text keyword search via Qdrant's text index.

        NOTE(review): this scrolls with a MatchText filter, so results are
        filtered but not relevance-ranked — it is not BM25 scoring as the
        original docstring claimed.
        """
        points, _next = self.client.scroll(
            collection_name=self.collection_name,
            scroll_filter=Filter(
                must=[FieldCondition(key="content", match=MatchText(text=query))]
            ),
            limit=limit,
            with_payload=True,
            with_vectors=False,
        )
        return [{"_id": p.id, **self._payload_fields(p.payload)} for p in points]

    def hybrid_search(
        self,
        query_vector: list[float],
        query_text: str,
        limit: int = 5,
        vector_weight: float = 0.7,
    ) -> list[dict]:
        """Run vector + keyword search and fuse the rankings with weighted RRF."""
        dense = self.search(query_vector, limit=limit * 2)
        sparse = self.keyword_search(query_text, limit=limit * 2)
        return self._rrf_fuse(dense, sparse, vector_weight)[:limit]

    @staticmethod
    def _rrf_fuse(
        vector_results: list[dict],
        keyword_results: list[dict],
        vector_weight: float = 0.7,
        k: int = 60,
    ) -> list[dict]:
        """Weighted Reciprocal Rank Fusion of two ranked result lists.

        Each list contributes weight / (k + rank + 1) per item; items found
        by both rankings accumulate both contributions.
        """
        keyword_weight = 1.0 - vector_weight
        fused_scores: dict[str, float] = {}
        by_id: dict[str, dict] = {}

        for weight, ranking in (
            (vector_weight, vector_results),
            (keyword_weight, keyword_results),
        ):
            for rank, item in enumerate(ranking):
                rid = str(item["_id"])
                fused_scores[rid] = fused_scores.get(rid, 0) + weight / (k + rank + 1)
                by_id.setdefault(rid, item)

        ordered = sorted(fused_scores.items(), key=lambda kv: kv[1], reverse=True)
        return [{**by_id[rid], "score": score} for rid, score in ordered]

    def get_existing_hashes(self) -> set[str]:
        """Return the point ids (hash-derived UUIDs) of all stored chunks."""
        ids: set[str] = set()
        cursor = None
        while True:
            points, cursor = self.client.scroll(
                collection_name=self.collection_name,
                limit=100,
                offset=cursor,
                with_payload=False,
                with_vectors=False,
            )
            ids.update(point.id for point in points)
            if cursor is None:
                break
        return ids

    def delete_by_hashes(self, hashes: set[str]):
        """Delete chunks by their hash-derived point ids (no-op on empty input)."""
        if not hashes:
            return
        self.client.delete(
            collection_name=self.collection_name,
            points_selector=list(hashes),
        )

    def count(self) -> int:
        """Number of indexed chunks in the collection."""
        return self.client.count(collection_name=self.collection_name).count
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.4
2
+ Name: fusesearch
3
+ Version: 0.1.0
4
+ Summary: Multi-source search aggregation tool with AI-powered retrieval and response synthesis
5
+ Author-email: Anton Lebedev <pypi@katzo.net>
6
+ License-Expression: MIT
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pydantic>=2.12
15
+ Requires-Dist: qdrant-client>=1.16
16
+ Requires-Dist: fastapi>=0.129
17
+ Requires-Dist: uvicorn>=0.40
18
+ Requires-Dist: tqdm>=4.67
19
+ Provides-Extra: mcp
20
+ Requires-Dist: mcp[cli]>=1.26; extra == "mcp"
21
+ Provides-Extra: local
22
+ Requires-Dist: sentence-transformers>=5.2; extra == "local"
23
+ Provides-Extra: all
24
+ Requires-Dist: fusesearch[local,mcp]; extra == "all"
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=9.0; extra == "dev"
27
+ Requires-Dist: ruff>=0.15; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # FuseSearch
31
+
32
+ Multi-source search aggregation tool that unifies retrieval across diverse data sources — Confluence, MCP servers, local files, and more — using AI-powered search and response synthesis through a single query interface.
33
+
34
+ ## Quick Start
35
+
36
+ ```bash
37
+ make build
38
+ make start
39
+ make index # index docs from data/docs
40
+ make search "your query"
41
+ ```
42
+
43
+ ## MCP Server
44
+
45
+ The `fusesearch-mcp` Docker service exposes a streamable HTTP endpoint on port 8001. Tools: `search` (hybrid search), `count` (indexed chunks).
46
+
47
+ ### Claude Code
48
+
49
+ ```bash
50
+ claude mcp add fusesearch http://localhost:8001/mcp --transport http
51
+ ```
52
+
53
+ ### Claude Desktop
54
+
55
+ **Option 1: Connectors UI (recommended)**
56
+
57
+ In Claude Desktop, go to **Settings > Connectors > Add custom connector** and enter `http://localhost:8001/mcp`.
58
+
59
+ **Option 2: Config file with `mcp-remote` bridge (local dev)**
60
+
61
+ Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
62
+
63
+ ```json
64
+ {
65
+ "mcpServers": {
66
+ "fusesearch": {
67
+ "command": "npx",
68
+ "args": ["-y", "mcp-remote", "http://localhost:8001/mcp", "--allow-http"]
69
+ }
70
+ }
71
+ }
72
+ ```
73
+
74
+ Requires Node.js >= 18. `--allow-http` is required for plain HTTP (not needed for HTTPS).
@@ -0,0 +1,23 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ fusesearch/__init__.py
5
+ fusesearch/__main__.py
6
+ fusesearch/indexer.py
7
+ fusesearch/mcp_server.py
8
+ fusesearch/models.py
9
+ fusesearch.egg-info/PKG-INFO
10
+ fusesearch.egg-info/SOURCES.txt
11
+ fusesearch.egg-info/dependency_links.txt
12
+ fusesearch.egg-info/requires.txt
13
+ fusesearch.egg-info/top_level.txt
14
+ fusesearch/api/__init__.py
15
+ fusesearch/api/server.py
16
+ fusesearch/core/__init__.py
17
+ fusesearch/core/chunker.py
18
+ fusesearch/core/embedder.py
19
+ fusesearch/sources/__init__.py
20
+ fusesearch/sources/base.py
21
+ fusesearch/sources/local_files.py
22
+ fusesearch/store/__init__.py
23
+ fusesearch/store/qdrant.py
@@ -0,0 +1,18 @@
1
+ pydantic>=2.12
2
+ qdrant-client>=1.16
3
+ fastapi>=0.129
4
+ uvicorn>=0.40
5
+ tqdm>=4.67
6
+
7
+ [all]
8
+ fusesearch[local,mcp]
9
+
10
+ [dev]
11
+ pytest>=9.0
12
+ ruff>=0.15
13
+
14
+ [local]
15
+ sentence-transformers>=5.2
16
+
17
+ [mcp]
18
+ mcp[cli]>=1.26
@@ -0,0 +1 @@
1
+ fusesearch
@@ -0,0 +1,38 @@
1
+ [project]
2
+ name = "fusesearch"
3
+ version = "0.1.0"
4
+ description = "Multi-source search aggregation tool with AI-powered retrieval and response synthesis"
5
+ readme = "README.md"
6
+ license = "MIT"
7
+ requires-python = ">=3.12"
8
+ authors = [
9
+ { name = "Anton Lebedev", email = "pypi@katzo.net" },
10
+ ]
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "Intended Audience :: Developers",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.12",
16
+ ]
17
+ dependencies = [
18
+ "pydantic>=2.12",
19
+ "qdrant-client>=1.16",
20
+ "fastapi>=0.129",
21
+ "uvicorn>=0.40",
22
+ "tqdm>=4.67",
23
+ ]
24
+
25
+ [project.optional-dependencies]
26
+ mcp = [
27
+ "mcp[cli]>=1.26",
28
+ ]
29
+ local = [
30
+ "sentence-transformers>=5.2",
31
+ ]
32
+ all = [
33
+ "fusesearch[mcp,local]",
34
+ ]
35
+ dev = [
36
+ "pytest>=9.0",
37
+ "ruff>=0.15",
38
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+