dirsql 0.0.16 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +227 -0
- package/package.json +1 -1
package/README.md
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
# dirsql
|
|
2
|
+
|
|
3
|
+
Ephemeral SQL index over a local directory. Watches a filesystem, ingests structured files into an in-memory SQLite database, and exposes a SQL query interface. On shutdown, the database is discarded -- the filesystem remains the source of truth.
|
|
4
|
+
|
|
5
|
+
## Why
|
|
6
|
+
|
|
7
|
+
Structured data stored as flat files (JSONL, JSON) is easy to read, write, diff, and version. But querying across many files is slow -- "show me all unresolved comments across 50 documents" requires opening and parsing every file.
|
|
8
|
+
|
|
9
|
+
dirsql bridges this gap: files remain the source of truth, but you get SQL queries and real-time change events for free.
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install dirsql
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Rust (library only, no Python bindings):
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
cargo add dirsql
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
import json
|
|
27
|
+
import os
|
|
28
|
+
import tempfile
|
|
29
|
+
from dirsql import DirSQL, Table
|
|
30
|
+
|
|
31
|
+
# Create some data files
|
|
32
|
+
root = tempfile.mkdtemp()
|
|
33
|
+
os.makedirs(os.path.join(root, "comments", "abc"), exist_ok=True)
|
|
34
|
+
os.makedirs(os.path.join(root, "comments", "def"), exist_ok=True)
|
|
35
|
+
|
|
36
|
+
with open(os.path.join(root, "comments", "abc", "index.jsonl"), "w") as f:
|
|
37
|
+
    f.write(json.dumps({"body": "looks good", "author": "alice"}) + "\n")
|
|
38
|
+
    f.write(json.dumps({"body": "needs work", "author": "bob"}) + "\n")
|
|
39
|
+
|
|
40
|
+
with open(os.path.join(root, "comments", "def", "index.jsonl"), "w") as f:
|
|
41
|
+
    f.write(json.dumps({"body": "agreed", "author": "carol"}) + "\n")
|
|
42
|
+
|
|
43
|
+
# Define a table: DDL, glob pattern, and an extract function
|
|
44
|
+
db = DirSQL(
|
|
45
|
+
root,
|
|
46
|
+
tables=[
|
|
47
|
+
Table(
|
|
48
|
+
ddl="CREATE TABLE comments (id TEXT, body TEXT, author TEXT)",
|
|
49
|
+
glob="comments/**/index.jsonl",
|
|
50
|
+
extract=lambda path, content: [
|
|
51
|
+
{
|
|
52
|
+
"id": os.path.basename(os.path.dirname(path)),
|
|
53
|
+
"body": row["body"],
|
|
54
|
+
"author": row["author"],
|
|
55
|
+
}
|
|
56
|
+
for line in content.splitlines()
|
|
57
|
+
for row in [json.loads(line)]
|
|
58
|
+
],
|
|
59
|
+
),
|
|
60
|
+
],
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
# Query with SQL
|
|
64
|
+
results = db.query("SELECT * FROM comments WHERE author = 'alice'")
|
|
65
|
+
# [{"id": "abc", "body": "looks good", "author": "alice"}]
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Multiple Tables and Joins
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
db = DirSQL(
|
|
72
|
+
root,
|
|
73
|
+
tables=[
|
|
74
|
+
Table(
|
|
75
|
+
ddl="CREATE TABLE posts (title TEXT, author_id TEXT)",
|
|
76
|
+
glob="posts/*.json",
|
|
77
|
+
extract=lambda path, content: [json.loads(content)],
|
|
78
|
+
),
|
|
79
|
+
Table(
|
|
80
|
+
ddl="CREATE TABLE authors (id TEXT, name TEXT)",
|
|
81
|
+
glob="authors/*.json",
|
|
82
|
+
extract=lambda path, content: [json.loads(content)],
|
|
83
|
+
),
|
|
84
|
+
],
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
results = db.query("""
|
|
88
|
+
SELECT posts.title, authors.name
|
|
89
|
+
FROM posts JOIN authors ON posts.author_id = authors.id
|
|
90
|
+
""")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Async API
|
|
94
|
+
|
|
95
|
+
`AsyncDirSQL` wraps the synchronous API for use with asyncio. The constructor returns immediately; call `await db.ready()` to wait for the initial scan to complete. `watch()` returns an async iterator of row-level change events.
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
import asyncio
|
|
99
|
+
import json
|
|
100
|
+
import os
|
|
101
|
+
from dirsql import AsyncDirSQL, Table
|
|
102
|
+
|
|
103
|
+
async def main():
|
|
104
|
+
    db = AsyncDirSQL(
|
|
105
|
+
        "/path/to/data",
|
|
106
|
+
        tables=[
|
|
107
|
+
            Table(
|
|
108
|
+
                ddl="CREATE TABLE items (name TEXT)",
|
|
109
|
+
                glob="**/*.json",
|
|
110
|
+
                extract=lambda path, content: [json.loads(content)],
|
|
111
|
+
            ),
|
|
112
|
+
        ],
|
|
113
|
+
    )
|
|
114
|
+
    await db.ready() # wait for initial scan to complete
|
|
115
|
+
|
|
116
|
+
    # Query works the same way
|
|
117
|
+
    results = await db.query("SELECT * FROM items")
|
|
118
|
+
|
|
119
|
+
    # Watch for file changes (insert/update/delete/error events)
|
|
120
|
+
    async for event in db.watch():
|
|
121
|
+
        print(f"{event.action} on {event.table}: {event.row}")
|
|
122
|
+
        if event.action == "error":
|
|
123
|
+
            print(f" error: {event.error}")
|
|
124
|
+
|
|
125
|
+
asyncio.run(main())
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Ignoring Files
|
|
129
|
+
|
|
130
|
+
Pass `ignore` patterns to skip files during scanning and watching:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
db = DirSQL(
|
|
134
|
+
root,
|
|
135
|
+
ignore=["**/drafts/**", "**/.git/**"],
|
|
136
|
+
tables=[...],
|
|
137
|
+
)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## API Reference
|
|
141
|
+
|
|
142
|
+
### `Table(*, ddl, glob, extract)`
|
|
143
|
+
|
|
144
|
+
Defines how files map to a SQL table.
|
|
145
|
+
|
|
146
|
+
- **`ddl`** (`str`): A `CREATE TABLE` statement defining the schema.
|
|
147
|
+
- **`glob`** (`str`): A glob pattern matched against file paths relative to root.
|
|
148
|
+
- **`extract`** (`Callable[[str, str], list[dict]]`): A function receiving `(relative_path, file_content)` and returning a list of row dicts. Each dict's keys must match the DDL column names.
|
|
149
|
+
|
|
150
|
+
### `DirSQL(root, *, tables, ignore=None)`
|
|
151
|
+
|
|
152
|
+
Creates an in-memory SQLite database indexed from the directory at `root`.
|
|
153
|
+
|
|
154
|
+
- **`root`** (`str`): Path to the directory to index.
|
|
155
|
+
- **`tables`** (`list[Table]`): Table definitions.
|
|
156
|
+
- **`ignore`** (`list[str] | None`): Glob patterns for paths to skip.
|
|
157
|
+
|
|
158
|
+
#### `DirSQL.query(sql) -> list[dict]`
|
|
159
|
+
|
|
160
|
+
Execute a SQL query. Returns a list of dicts keyed by column name. Internal tracking columns (`_dirsql_*`) are excluded from results.
|
|
161
|
+
|
|
162
|
+
### `AsyncDirSQL(root, *, tables, ignore=None)`
|
|
163
|
+
|
|
164
|
+
Async wrapper around `DirSQL`. The constructor is synchronous and returns immediately; call `await db.ready()` to wait for the initial scan.
|
|
165
|
+
|
|
166
|
+
#### `await AsyncDirSQL.ready()`
|
|
167
|
+
|
|
168
|
+
Wait for the initial scan to complete. Idempotent -- safe to call multiple times. Raises any exception that occurred during initialization.
|
|
169
|
+
|
|
170
|
+
#### `await AsyncDirSQL.query(sql) -> list[dict]`
|
|
171
|
+
|
|
172
|
+
Same as `DirSQL.query`, but async.
|
|
173
|
+
|
|
174
|
+
#### `AsyncDirSQL.watch() -> AsyncIterator[RowEvent]`
|
|
175
|
+
|
|
176
|
+
Returns an async iterator that yields `RowEvent` objects as files change on disk. Starts the filesystem watcher on first iteration.
|
|
177
|
+
|
|
178
|
+
### `RowEvent`
|
|
179
|
+
|
|
180
|
+
Emitted by `watch()` when a file change produces row-level diffs or an error (see `action` below).
|
|
181
|
+
|
|
182
|
+
- **`table`** (`str`): The affected table name.
|
|
183
|
+
- **`action`** (`str`): One of `"insert"`, `"update"`, `"delete"`, `"error"`.
|
|
184
|
+
- **`row`** (`dict | None`): The new row (for insert/update) or deleted row (for delete).
|
|
185
|
+
- **`old_row`** (`dict | None`): The previous row (for update only).
|
|
186
|
+
- **`error`** (`str | None`): Error message (for error events).
|
|
187
|
+
- **`file_path`** (`str | None`): The relative file path that triggered the event.
|
|
188
|
+
|
|
189
|
+
## How It Works
|
|
190
|
+
|
|
191
|
+
The Rust core (`rusqlite` + `notify` + `walkdir`) does the heavy lifting:
|
|
192
|
+
|
|
193
|
+
1. **Startup scan**: Walks the directory tree, matches files to tables via glob patterns, calls the user-provided `extract` function for each file, and inserts rows into an in-memory SQLite database.
|
|
194
|
+
2. **File watching**: Uses the `notify` crate (inotify on Linux, FSEvents on macOS) to detect file creates, modifications, and deletions.
|
|
195
|
+
3. **Row diffing**: When a file changes, the new rows are diffed against the previous rows for that file, producing granular insert/update/delete events.
|
|
196
|
+
4. **Python bindings**: PyO3 exposes the Rust core as a native Python extension module. The async layer runs blocking operations in a thread pool via `asyncio.to_thread`.
|
|
197
|
+
|
|
198
|
+
The SQLite database is purely ephemeral -- it exists only in memory and is discarded when the `DirSQL` instance is garbage collected. The filesystem is always the source of truth.
|
|
199
|
+
|
|
200
|
+
## Development
|
|
201
|
+
|
|
202
|
+
### Prerequisites
|
|
203
|
+
|
|
204
|
+
- Rust (stable)
|
|
205
|
+
- Python >= 3.12
|
|
206
|
+
- [maturin](https://github.com/PyO3/maturin) for building the Python extension
|
|
207
|
+
- [just](https://github.com/casey/just) as a task runner
|
|
208
|
+
|
|
209
|
+
### Build and Test
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
# Build the Python extension (dev mode)
|
|
213
|
+
maturin develop
|
|
214
|
+
|
|
215
|
+
# Run all CI checks
|
|
216
|
+
just ci
|
|
217
|
+
|
|
218
|
+
# Individual targets
|
|
219
|
+
just test-rust # Rust unit tests
|
|
220
|
+
just test-integration # Python integration tests
|
|
221
|
+
just clippy # Rust lints
|
|
222
|
+
just lint # Python lints (ruff)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## License
|
|
226
|
+
|
|
227
|
+
MIT
|