open-db 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. open_db-1.3.0/LICENSE +21 -0
  2. open_db-1.3.0/PKG-INFO +455 -0
  3. open_db-1.3.0/README.md +397 -0
  4. open_db-1.3.0/mcp_server/__init__.py +0 -0
  5. open_db-1.3.0/mcp_server/__main__.py +34 -0
  6. open_db-1.3.0/mcp_server/client.py +363 -0
  7. open_db-1.3.0/mcp_server/models.py +140 -0
  8. open_db-1.3.0/mcp_server/server.py +370 -0
  9. open_db-1.3.0/open_db.egg-info/PKG-INFO +455 -0
  10. open_db-1.3.0/open_db.egg-info/SOURCES.txt +75 -0
  11. open_db-1.3.0/open_db.egg-info/dependency_links.txt +1 -0
  12. open_db-1.3.0/open_db.egg-info/entry_points.txt +2 -0
  13. open_db-1.3.0/open_db.egg-info/requires.txt +42 -0
  14. open_db-1.3.0/open_db.egg-info/top_level.txt +4 -0
  15. open_db-1.3.0/opendb/__init__.py +21 -0
  16. open_db-1.3.0/opendb/cli.py +230 -0
  17. open_db-1.3.0/opendb_core/__init__.py +0 -0
  18. open_db-1.3.0/opendb_core/config.py +51 -0
  19. open_db-1.3.0/opendb_core/database.py +46 -0
  20. open_db-1.3.0/opendb_core/main.py +96 -0
  21. open_db-1.3.0/opendb_core/middleware/__init__.py +0 -0
  22. open_db-1.3.0/opendb_core/middleware/auth.py +40 -0
  23. open_db-1.3.0/opendb_core/parsers/__init__.py +0 -0
  24. open_db-1.3.0/opendb_core/parsers/base.py +23 -0
  25. open_db-1.3.0/opendb_core/parsers/docx.py +166 -0
  26. open_db-1.3.0/opendb_core/parsers/image.py +56 -0
  27. open_db-1.3.0/opendb_core/parsers/pdf.py +174 -0
  28. open_db-1.3.0/opendb_core/parsers/pptx.py +116 -0
  29. open_db-1.3.0/opendb_core/parsers/registry.py +28 -0
  30. open_db-1.3.0/opendb_core/parsers/spreadsheet.py +278 -0
  31. open_db-1.3.0/opendb_core/parsers/text.py +91 -0
  32. open_db-1.3.0/opendb_core/routers/__init__.py +0 -0
  33. open_db-1.3.0/opendb_core/routers/files.py +107 -0
  34. open_db-1.3.0/opendb_core/routers/glob.py +68 -0
  35. open_db-1.3.0/opendb_core/routers/health.py +13 -0
  36. open_db-1.3.0/opendb_core/routers/index.py +83 -0
  37. open_db-1.3.0/opendb_core/routers/info.py +16 -0
  38. open_db-1.3.0/opendb_core/routers/memory.py +98 -0
  39. open_db-1.3.0/opendb_core/routers/read.py +75 -0
  40. open_db-1.3.0/opendb_core/routers/search.py +81 -0
  41. open_db-1.3.0/opendb_core/services/__init__.py +0 -0
  42. open_db-1.3.0/opendb_core/services/grep_service.py +156 -0
  43. open_db-1.3.0/opendb_core/services/index_service.py +179 -0
  44. open_db-1.3.0/opendb_core/services/ingest_service.py +295 -0
  45. open_db-1.3.0/opendb_core/services/memory_service.py +108 -0
  46. open_db-1.3.0/opendb_core/services/read_service.py +230 -0
  47. open_db-1.3.0/opendb_core/services/search_service.py +21 -0
  48. open_db-1.3.0/opendb_core/services/vision_service.py +147 -0
  49. open_db-1.3.0/opendb_core/services/watch_service.py +305 -0
  50. open_db-1.3.0/opendb_core/storage/__init__.py +130 -0
  51. open_db-1.3.0/opendb_core/storage/base.py +234 -0
  52. open_db-1.3.0/opendb_core/storage/postgres.py +893 -0
  53. open_db-1.3.0/opendb_core/storage/shared.py +165 -0
  54. open_db-1.3.0/opendb_core/storage/sqlite.py +994 -0
  55. open_db-1.3.0/opendb_core/utils/__init__.py +0 -0
  56. open_db-1.3.0/opendb_core/utils/hashing.py +11 -0
  57. open_db-1.3.0/opendb_core/utils/text.py +181 -0
  58. open_db-1.3.0/opendb_core/utils/tokenizer.py +108 -0
  59. open_db-1.3.0/opendb_core/workspace.py +286 -0
  60. open_db-1.3.0/opendb_integration/__init__.py +27 -0
  61. open_db-1.3.0/opendb_integration/client.py +500 -0
  62. open_db-1.3.0/opendb_integration/index.py +77 -0
  63. open_db-1.3.0/opendb_integration/tools.py +652 -0
  64. open_db-1.3.0/pyproject.toml +87 -0
  65. open_db-1.3.0/setup.cfg +4 -0
  66. open_db-1.3.0/tests/test_classification.py +92 -0
  67. open_db-1.3.0/tests/test_database.py +27 -0
  68. open_db-1.3.0/tests/test_e2e.py +173 -0
  69. open_db-1.3.0/tests/test_hashing.py +37 -0
  70. open_db-1.3.0/tests/test_index.py +108 -0
  71. open_db-1.3.0/tests/test_ingest_service.py +88 -0
  72. open_db-1.3.0/tests/test_parsers.py +168 -0
  73. open_db-1.3.0/tests/test_read_structured.py +171 -0
  74. open_db-1.3.0/tests/test_resolve_filename.py +103 -0
  75. open_db-1.3.0/tests/test_sqlite_backend.py +244 -0
  76. open_db-1.3.0/tests/test_utils.py +149 -0
  77. open_db-1.3.0/tests/test_watch_service.py +183 -0
open_db-1.3.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024-2026 wuwangzhang1216
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
open_db-1.3.0/PKG-INFO ADDED
@@ -0,0 +1,455 @@
1
+ Metadata-Version: 2.4
2
+ Name: open-db
3
+ Version: 1.3.0
4
+ Summary: The AI-native file database and memory store. Built for LLM agents to read, search, and remember.
5
+ Author-email: wuwangzhang1216 <wuwangzhang1216@users.noreply.github.com>
6
+ Maintainer-email: wuwangzhang1216 <wuwangzhang1216@users.noreply.github.com>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/wuwangzhang1216/openDB
9
+ Project-URL: Repository, https://github.com/wuwangzhang1216/openDB
10
+ Project-URL: Issues, https://github.com/wuwangzhang1216/openDB/issues
11
+ Project-URL: Documentation, https://github.com/wuwangzhang1216/openDB#readme
12
+ Keywords: file,parser,pdf,docx,pptx,xlsx,ocr,ai,agents,api
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Text Processing
21
+ Requires-Python: >=3.11
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: fastapi>=0.115.0
25
+ Requires-Dist: uvicorn[standard]>=0.30.0
26
+ Requires-Dist: asyncpg>=0.29.0
27
+ Requires-Dist: python-multipart>=0.0.9
28
+ Requires-Dist: PyMuPDF>=1.24.0
29
+ Requires-Dist: python-docx>=1.1.0
30
+ Requires-Dist: python-pptx>=0.6.23
31
+ Requires-Dist: openpyxl>=3.1.0
32
+ Requires-Dist: pandas>=2.2.0
33
+ Requires-Dist: pytesseract>=0.3.10
34
+ Requires-Dist: Pillow>=10.0.0
35
+ Requires-Dist: python-magic-bin>=0.4.14; sys_platform == "win32"
36
+ Requires-Dist: python-magic>=0.4.27; sys_platform != "win32"
37
+ Requires-Dist: pydantic-settings>=2.0.0
38
+ Requires-Dist: watchdog>=4.0.0
39
+ Requires-Dist: jieba>=0.42.1
40
+ Provides-Extra: mcp
41
+ Requires-Dist: mcp>=1.0.0; extra == "mcp"
42
+ Requires-Dist: httpx>=0.28.0; extra == "mcp"
43
+ Provides-Extra: integration
44
+ Requires-Dist: httpx>=0.28.0; extra == "integration"
45
+ Provides-Extra: embedded
46
+ Requires-Dist: aiosqlite>=0.19.0; extra == "embedded"
47
+ Requires-Dist: httpx>=0.27.0; extra == "embedded"
48
+ Provides-Extra: cli
49
+ Requires-Dist: aiosqlite>=0.19.0; extra == "cli"
50
+ Requires-Dist: typer>=0.12.0; extra == "cli"
51
+ Provides-Extra: dev
52
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
53
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
54
+ Requires-Dist: httpx>=0.27.0; extra == "dev"
55
+ Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
56
+ Requires-Dist: aiosqlite>=0.19.0; extra == "dev"
57
+ Dynamic: license-file
58
+
59
+ <p align="center">
60
+ <a href="https://github.com/wuwangzhang1216/openDB">
61
+ <img loading="lazy" alt="OpenDB" src="https://github.com/wuwangzhang1216/openDB/raw/main/docs/assets/opendb-banner.svg" width="100%"/>
62
+ </a>
63
+ </p>
64
+
65
+ <p align="center">
66
+ <strong>3 lines to give your AI agent a file database and long-term memory.</strong><br/>
67
+ Read any file. Search any workspace. Remember everything.
68
+ </p>
69
+
70
+ <p align="center">
71
+ <a href="https://pypi.org/project/open-db/"><img src="https://img.shields.io/pypi/v/open-db" alt="PyPI version"/></a>
72
+ <a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.11+-blue.svg" alt="Python 3.11+"/></a>
73
+ <a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License: MIT"/></a>
74
+ <a href="https://github.com/wuwangzhang1216/openDB/stargazers"><img src="https://img.shields.io/github/stars/wuwangzhang1216/openDB" alt="GitHub stars"/></a>
75
+ </p>
76
+
77
+ ---
78
+
79
+ ```bash
80
+ pip install "open-db[cli]"
81
+ opendb index ./my_workspace
82
+ opendb serve-mcp
83
+ ```
84
+
85
+ That's it. Your agent now has 7 MCP tools — read any file format, search across documents and code, and store/recall persistent memories. Works with every major agent framework out of the box.
86
+
87
+ ## Works with Every Agent Framework
88
+
89
+ OpenDB speaks [MCP](https://modelcontextprotocol.io/) — the universal standard supported by all major frameworks. Pick yours:
90
+
91
+ <details>
92
+ <summary><b>Claude Code / Cursor / Windsurf</b></summary>
93
+
94
+ Add to your MCP config (`.mcp.json`, `mcp_servers` in settings, etc.):
95
+
96
+ ```json
97
+ {
98
+ "mcpServers": {
99
+ "opendb": {
100
+ "command": "opendb",
101
+ "args": ["serve-mcp", "--workspace", "/path/to/workspace"]
102
+ }
103
+ }
104
+ }
105
+ ```
106
+
107
+ </details>
108
+
109
+ <details>
110
+ <summary><b>Claude Agent SDK (Anthropic)</b></summary>
111
+
112
+ ```python
113
+ from claude_agent_sdk import query, ClaudeAgentOptions
114
+ from claude_agent_sdk.mcp import MCPServerStdio
115
+
116
+ async with MCPServerStdio("opendb", ["serve-mcp", "--workspace", "./docs"]) as opendb:
117
+ options = ClaudeAgentOptions(
118
+ model="claude-sonnet-4-6",
119
+ mcp_servers={"opendb": opendb},
120
+ allowed_tools=["mcp__opendb__*"],
121
+ )
122
+ async for msg in query(prompt="Summarize the Q4 report", options=options):
123
+ print(msg.content)
124
+ ```
125
+
126
+ </details>
127
+
128
+ <details>
129
+ <summary><b>OpenAI Agents SDK</b></summary>
130
+
131
+ ```python
132
+ from agents import Agent, Runner
133
+ from agents.mcp import MCPServerStdio
134
+
135
+ async with MCPServerStdio(name="opendb", params={
136
+ "command": "opendb", "args": ["serve-mcp", "--workspace", "./docs"]
137
+ }) as opendb:
138
+ agent = Agent(name="Analyst", model="gpt-4.1", mcp_servers=[opendb])
139
+ result = await Runner.run(agent, "Find all revenue mentions in the PDF reports")
140
+ print(result.final_output)
141
+ ```
142
+
143
+ </details>
144
+
145
+ <details>
146
+ <summary><b>LangChain / LangGraph</b></summary>
147
+
148
+ ```python
149
+ from langchain_mcp_adapters.client import MultiServerMCPClient
150
+ from langgraph.prebuilt import create_react_agent
151
+
152
+ async with MultiServerMCPClient({
153
+ "opendb": {"command": "opendb", "args": ["serve-mcp", "--workspace", "./docs"], "transport": "stdio"}
154
+ }) as client:
155
+ agent = create_react_agent("anthropic:claude-sonnet-4-6", await client.get_tools())
156
+ result = await agent.ainvoke({"messages": [("user", "What changed in the latest spec?")]})
157
+ ```
158
+
159
+ </details>
160
+
161
+ <details>
162
+ <summary><b>CrewAI</b></summary>
163
+
164
+ ```python
165
+ from crewai import Agent, Task, Crew
166
+ from crewai.tools import MCPServerStdio
167
+
168
+ opendb = MCPServerStdio(command="opendb", args=["serve-mcp", "--workspace", "./docs"])
169
+
170
+ analyst = Agent(role="Document Analyst", goal="Analyze workspace files", mcps=[opendb])
171
+ task = Task(description="Summarize all PDF reports in the workspace", agent=analyst)
172
+ Crew(agents=[analyst], tasks=[task]).kickoff()
173
+ ```
174
+
175
+ </details>
176
+
177
+ <details>
178
+ <summary><b>AutoGen (Microsoft)</b></summary>
179
+
180
+ ```python
181
+ from autogen_ext.tools.mcp import mcp_server_tools, StdioServerParams
182
+ from autogen_agentchat.agents import AssistantAgent
183
+
184
+ tools = await mcp_server_tools(StdioServerParams(command="opendb", args=["serve-mcp", "--workspace", "./docs"]))
185
+ agent = AssistantAgent(name="analyst", model_client=client, tools=tools)
186
+ await agent.run("Search for deployment-related memories")
187
+ ```
188
+
189
+ </details>
190
+
191
+ <details>
192
+ <summary><b>Google ADK</b></summary>
193
+
194
+ ```python
195
+ from google.adk.agents import LlmAgent
196
+ from google.adk.tools.mcp_tool import McpToolset
197
+ from google.adk.tools.mcp_tool.mcp_session_manager import StdioConnectionParams
198
+
199
+ agent = LlmAgent(
200
+ model="gemini-2.5-flash",
201
+ name="analyst",
202
+ tools=[McpToolset(connection_params=StdioConnectionParams(command="opendb", args=["serve-mcp", "--workspace", "./docs"]))],
203
+ )
204
+ ```
205
+
206
+ </details>
207
+
208
+ <details>
209
+ <summary><b>Mastra (TypeScript)</b></summary>
210
+
211
+ ```typescript
212
+ import { MCPClient } from "@mastra/mcp";
213
+ import { Agent } from "@mastra/core/agent";
214
+
215
+ const mcp = new MCPClient({
216
+ servers: { opendb: { command: "opendb", args: ["serve-mcp", "--workspace", "./docs"] } },
217
+ });
218
+
219
+ const agent = new Agent({
220
+ name: "Analyst",
221
+ model: "openai/gpt-4.1",
222
+ tools: await mcp.listTools(),
223
+ });
224
+ ```
225
+
226
+ </details>
227
+
228
+ <details>
229
+ <summary><b>Python (direct, no framework)</b></summary>
230
+
231
+ ```python
232
+ from opendb import OpenDB
233
+
234
+ db = OpenDB.open("./my_workspace")
235
+ await db.init()
236
+ await db.index()
237
+
238
+ text = await db.read("report.pdf", pages="1-3")
239
+ results = await db.search("quarterly revenue")
240
+ await db.memory_store("User prefers concise answers")
241
+ memories = await db.memory_recall("user preferences")
242
+
243
+ await db.close()
244
+ ```
245
+
246
+ </details>
247
+
248
+ ## Why OpenDB?
249
+
250
+ Without OpenDB, agents write inline parsing code for every document:
251
+
252
+ ```python
253
+ # Agent writes this every time — 500+ tokens, often fails
254
+ run_command("""python -c "
255
+ import fitz; doc = fitz.open('report.pdf')
256
+ for page in doc: print(page.get_text())
257
+ " """)
258
+ ```
259
+
260
+ With OpenDB:
261
+
262
+ ```python
263
+ read_file("report.pdf") # 50 tokens, always works
264
+ ```
265
+
266
+ **Benchmarked across 4 LLMs on 24 document tasks:**
267
+
268
+ | Metric | Without OpenDB | With OpenDB |
269
+ |--------|---------------|-------------|
270
+ | Tokens used | 100% | **27-45%** (55-73% saved) |
271
+ | Task speed | 100% | **36-58%** faster |
272
+ | Answer quality | 2.4-3.2 / 5 | **3.4-3.9 / 5** |
273
+ | Success rate | 79% | **100%** |
274
+
275
+ **FTS vs RAG vector retrieval (25-325 documents):**
276
+
277
+ | Scale | FTS Tokens Saved | FTS Quality | RAG Quality |
278
+ |-------|-----------------|------------|------------|
279
+ | 25 docs | **47%** | 3.9/5 | 4.2/5 |
280
+ | 125 docs | **44%** | **4.7/5** | 4.0/5 |
281
+ | 325 docs | **45%** | **4.6/5** | 3.5/5 |
282
+
283
+ FTS quality **improves with scale** while RAG degrades from distractor noise. See [benchmark/REPORT.md](https://github.com/wuwangzhang1216/openDB/blob/main/benchmark/REPORT.md) for methodology.
284
+
285
+ ## MCP Tools
286
+
287
+ 7 tools, auto-discovered by any MCP-compatible agent:
288
+
289
+ ### `opendb_info` — Workspace overview
290
+
291
+ ```
292
+ opendb_info()
293
+ → Workspace: 47 files (ready: 45, processing: 1, failed: 1)
294
+ By type: Python (.py) 20 | PDF 12 | Excel (.xlsx) 5 | ...
295
+ Recently updated: config.yaml (2 min ago) | main.py (1 hr ago)
296
+ ```
297
+
298
+ ### `opendb_read` — Read any file
299
+
300
+ Code with line numbers, documents as plain text, spreadsheets as structured JSON.
301
+
302
+ ```
303
+ opendb_read(filename="main.py") # Code with line numbers
304
+ opendb_read(filename="report.pdf", pages="1-3") # PDF pages
305
+ opendb_read(filename="report.pdf", grep="revenue+growth") # Search within file
306
+ opendb_read(filename="budget.xlsx", format="json") # Structured spreadsheet
307
+ opendb_read(filename="app.py", offset=50, limit=31) # Lines 50-80
308
+ ```
309
+
310
+ ### `opendb_search` — Search across code and documents
311
+
312
+ Regex grep for code, full-text search for documents. Auto-detects mode.
313
+
314
+ ```
315
+ opendb_search(query="def main", path="/workspace", glob="*.py") # Grep code
316
+ opendb_search(query="quarterly revenue") # FTS documents
317
+ opendb_search(query="TODO", path="/src", case_insensitive=True) # Case insensitive
318
+ ```
319
+
320
+ ### `opendb_glob` — Find files
321
+
322
+ ```
323
+ opendb_glob(pattern="**/*.py", path="/workspace")
324
+ opendb_glob(pattern="src/**/*.{ts,tsx}", path="/workspace")
325
+ ```
326
+
327
+ ### `opendb_memory_store` — Store a memory
328
+
329
+ ```
330
+ opendb_memory_store(content="User prefers dark mode", memory_type="semantic")
331
+ opendb_memory_store(content="Deployed v2.1, rollback required", memory_type="episodic", tags=["deploy"])
332
+ opendb_memory_store(content="Always run tests before merging", memory_type="procedural")
333
+ opendb_memory_store(content="User is a senior engineer at Acme", pinned=true)
334
+ ```
335
+
336
+ Three memory types: **semantic** (facts/knowledge), **episodic** (events/outcomes), **procedural** (workflows/rules).
337
+
338
+ Set `pinned=true` for critical facts — they get a 10x ranking boost and can be retrieved instantly with `pinned_only=true`.
339
+
340
+ ### `opendb_memory_recall` — Search memories
341
+
342
+ Results ranked by **relevance × recency**. Pinned memories always surface first.
343
+
344
+ ```
345
+ opendb_memory_recall(query="user preferences")
346
+ opendb_memory_recall(query="deploy", memory_type="episodic")
347
+ opendb_memory_recall(pinned_only=true) # Instant — no search needed, ideal for agent startup
348
+ ```
349
+
350
+ ### `opendb_memory_forget` — Delete memories
351
+
352
+ ```
353
+ opendb_memory_forget(memory_id="abc-123-def")
354
+ opendb_memory_forget(query="outdated preferences")
355
+ ```
356
+
357
+ ## Agent Memory
358
+
359
+ OpenDB doubles as a **long-term memory store** for AI agents — persistent across sessions, ranked by relevance and recency, with pinned priorities.
360
+
361
+ ### Why not Markdown files?
362
+
363
+ | | Markdown files | OpenDB Memory |
364
+ |---|---|---|
365
+ | **Search** | Full-file scan, substring match | FTS5 BM25 index, O(log n) |
366
+ | **Ranking** | None — all matches are equal | Relevance × recency decay |
367
+ | **Capacity** | Claude Code: 200-line hard limit | No hard limit, indexed |
368
+ | **CJK** | Broken (no word segmentation) | jieba tokenization, native CJK |
369
+ | **Staleness** | Old = new, manual cleanup | `0.5^(age/30)` auto-decay |
370
+ | **Structure** | Free text + frontmatter | tags[], metadata{}, memory_type, pinned |
371
+ | **Agent cost** | Tokens spent on file management | 3 API calls: store/recall/forget |
372
+
373
+ ### Why not vector databases?
374
+
375
+ FTS quality **improves with scale** while vector/RAG degrades. Vector similarity retrieves topically-similar noise; FTS retrieves exactly what the agent asked for.
376
+
377
+ ### LongMemEval benchmark
378
+
379
+ Tested against [LongMemEval](https://github.com/xiaowu0162/LongMemEval) (ICLR 2025) — 470 questions across 6 types:
380
+
381
+ | | OpenDB (FTS5) | MemPalace (ChromaDB) |
382
+ |---|---|---|
383
+ | **R@5** | **100%** (470/470) | 96.6% |
384
+ | Embedding model | None (keyword index) | all-MiniLM-L6-v2 |
385
+ | API calls | 0 | 0 |
386
+ | Median recall latency | **0.9 ms** | — |
387
+ | Total benchmark time | **32 s** | ~5 min |
388
+
389
+ All 6 question types score 100%. To reproduce, run this from a checkout of the repository: `python benchmark/longmemeval_bench.py`
390
+
391
+ ## Supported Formats
392
+
393
+ | Format | Extensions | Features |
394
+ |--------|-----------|----------|
395
+ | PDF | `.pdf` | Pages, tables, OCR for scanned docs |
396
+ | Word | `.docx` | Page breaks, tables, headings |
397
+ | PowerPoint | `.pptx` | Slides, speaker notes, tables |
398
+ | Excel | `.xlsx` | Multiple sheets, structured JSON output |
399
+ | CSV | `.csv` | Auto-encoding detection, structured JSON |
400
+ | Code | `.py` `.js` `.ts` `.go` `.rs` `.java` ... | Line-numbered output |
401
+ | Text | `.txt` `.md` `.html` `.json` `.xml` | Paragraph chunking |
402
+ | Images | `.png` `.jpg` `.tiff` `.bmp` | OCR (English + Chinese) |
403
+
404
+ ## Key Features
405
+
406
+ - **3-line setup** — `pip install`, `index`, `serve-mcp` — works with every agent framework
407
+ - **7 MCP tools** — `read`, `search`, `glob`, `info` for files + `memory_store`, `memory_recall`, `memory_forget` for memory
408
+ - **Agent memory** — FTS + time-decay ranking, pinned memories, 100% on LongMemEval; no vector DB needed
409
+ - **Dual-mode** — Embedded (SQLite, zero-config) or Server (PostgreSQL, shared access); same API
410
+ - **Real-time sync** — Directories are watched via OS-native events after indexing
411
+ - **Full-text search** — FTS5 / tsvector with jieba CJK tokenization
412
+ - **Structured output** — Spreadsheets as `{sheets: [{columns, rows}]}` for direct analysis
413
+ - **Fuzzy filename resolution** — Find files by exact name, partial match, path, or UUID
414
+
415
+ ## REST API
416
+
417
+ OpenDB also exposes a full HTTP API. Run with `opendb serve` (embedded) or `docker-compose up` (PostgreSQL).
418
+
419
+ | Endpoint | Method | Description |
420
+ |----------|--------|-------------|
421
+ | `/info` | `GET` | Workspace statistics |
422
+ | `/read/{filename}` | `GET` | Read file (`?pages=`, `?lines=`, `?grep=`, `?format=json`) |
423
+ | `/search` | `POST` | Full-text search or regex grep |
424
+ | `/glob` | `GET` | Find files by glob pattern |
425
+ | `/index` | `POST` | Index a directory and start watching |
426
+ | `/files` | `POST`/`GET` | Upload or list files |
427
+ | `/memory` | `POST`/`GET` | Store or list memories |
428
+ | `/memory/recall` | `POST` | Search memories with ranking |
429
+ | `/memory/forget` | `POST` | Delete memories |
430
+ | `/health` | `GET` | Health check |
431
+
432
+ ## Configuration
433
+
434
+ Environment variables (`FILEDB_` prefix, except `OPENDB_URL`):
435
+
436
+ | Variable | Default | Description |
437
+ |----------|---------|-------------|
438
+ | `FILEDB_BACKEND` | `postgres` | `postgres` or `sqlite` |
439
+ | `FILEDB_DATABASE_URL` | `postgresql://...` | PostgreSQL connection |
440
+ | `FILEDB_OCR_ENABLED` | `true` | Enable Tesseract OCR |
441
+ | `FILEDB_OCR_LANGUAGES` | `eng+chi_sim+chi_tra` | OCR languages |
442
+ | `FILEDB_MAX_FILE_SIZE` | `104857600` | Max file size (100MB) |
443
+ | `FILEDB_INDEX_EXCLUDE_PATTERNS` | `[]` | Exclude patterns for indexing |
444
+ | `OPENDB_URL` | `http://localhost:8000` | MCP server → REST API URL |
445
+
446
+ ## Development
447
+
448
+ ```bash
449
+ pip install -e ".[dev]"
450
+ pytest
451
+ ```
452
+
453
+ ## License
454
+
455
+ [MIT](LICENSE)