dirsql 0.0.17__tar.gz → 0.0.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {dirsql-0.0.17 → dirsql-0.0.19}/Cargo.toml +1 -0
  2. {dirsql-0.0.17 → dirsql-0.0.19}/PKG-INFO +1 -1
  3. dirsql-0.0.19/README.md +227 -0
  4. {dirsql-0.0.17 → dirsql-0.0.19}/pyproject.toml +2 -2
  5. {dirsql-0.0.17 → dirsql-0.0.19}/python/dirsql/_async.py +26 -10
  6. {dirsql-0.0.17 → dirsql-0.0.19}/tests/integration/test_async_dirsql.py +60 -25
  7. {dirsql-0.0.17 → dirsql-0.0.19}/.claude/CLAUDE.md +0 -0
  8. {dirsql-0.0.17 → dirsql-0.0.19}/.github/workflows/minor-release.yml +0 -0
  9. {dirsql-0.0.17 → dirsql-0.0.19}/.github/workflows/patch-release.yml +0 -0
  10. {dirsql-0.0.17 → dirsql-0.0.19}/.github/workflows/pr-monitor.yml +0 -0
  11. {dirsql-0.0.17 → dirsql-0.0.19}/.github/workflows/publish.yml +0 -0
  12. {dirsql-0.0.17 → dirsql-0.0.19}/.github/workflows/python-lint.yml +0 -0
  13. {dirsql-0.0.17 → dirsql-0.0.19}/.github/workflows/python-test.yml +0 -0
  14. {dirsql-0.0.17 → dirsql-0.0.19}/.github/workflows/rust-test.yml +0 -0
  15. {dirsql-0.0.17 → dirsql-0.0.19}/.gitignore +0 -0
  16. {dirsql-0.0.17 → dirsql-0.0.19}/.npmignore +0 -0
  17. {dirsql-0.0.17 → dirsql-0.0.19}/Cargo.lock +0 -0
  18. {dirsql-0.0.17 → dirsql-0.0.19}/LICENSE +0 -0
  19. {dirsql-0.0.17 → dirsql-0.0.19}/SUMMARY.md +0 -0
  20. {dirsql-0.0.17 → dirsql-0.0.19}/python/dirsql/__init__.py +0 -0
  21. {dirsql-0.0.17 → dirsql-0.0.19}/src/db.rs +0 -0
  22. {dirsql-0.0.17 → dirsql-0.0.19}/src/differ.rs +0 -0
  23. {dirsql-0.0.17 → dirsql-0.0.19}/src/lib.rs +0 -0
  24. {dirsql-0.0.17 → dirsql-0.0.19}/src/matcher.rs +0 -0
  25. {dirsql-0.0.17 → dirsql-0.0.19}/src/scanner.rs +0 -0
  26. {dirsql-0.0.17 → dirsql-0.0.19}/src/watcher.rs +0 -0
  27. {dirsql-0.0.17 → dirsql-0.0.19}/tests/__init__.py +0 -0
  28. {dirsql-0.0.17 → dirsql-0.0.19}/tests/conftest.py +0 -0
  29. {dirsql-0.0.17 → dirsql-0.0.19}/tests/integration/__init__.py +0 -0
  30. {dirsql-0.0.17 → dirsql-0.0.19}/tests/integration/test_dirsql.py +0 -0
@@ -7,6 +7,7 @@ license = "MIT"
7
7
  repository = "https://github.com/thekevinscott/dirsql"
8
8
  keywords = ["sql", "filesystem", "directory", "sqlite", "index"]
9
9
  categories = ["filesystem", "database"]
10
+ readme = "README.md"
10
11
 
11
12
  [lib]
12
13
  crate-type = ["cdylib", "rlib"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dirsql
3
- Version: 0.0.17
3
+ Version: 0.0.19
4
4
  Requires-Dist: pytest>=8 ; extra == 'dev'
5
5
  Requires-Dist: pytest-describe>=2 ; extra == 'dev'
6
6
  Requires-Dist: pytest-asyncio>=0.23 ; extra == 'dev'
@@ -0,0 +1,227 @@
1
+ # dirsql
2
+
3
+ Ephemeral SQL index over a local directory. Watches a filesystem, ingests structured files into an in-memory SQLite database, and exposes a SQL query interface. On shutdown, the database is discarded -- the filesystem remains the source of truth.
4
+
5
+ ## Why
6
+
7
+ Structured data stored as flat files (JSONL, JSON) is easy to read, write, diff, and version. But querying across many files is slow -- "show me all unresolved comments across 50 documents" requires opening and parsing every file.
8
+
9
+ dirsql bridges this gap: files remain the source of truth, but you get SQL queries and real-time change events for free.
10
+
11
+ ## Installation
12
+
13
+ ```bash
14
+ pip install dirsql
15
+ ```
16
+
17
+ Rust (library only, no Python bindings):
18
+
19
+ ```bash
20
+ cargo add dirsql
21
+ ```
22
+
23
+ ## Quick Start
24
+
25
+ ```python
26
+ import json
27
+ import os
28
+ import tempfile
29
+ from dirsql import DirSQL, Table
30
+
31
+ # Create some data files
32
+ root = tempfile.mkdtemp()
33
+ os.makedirs(os.path.join(root, "comments", "abc"), exist_ok=True)
34
+ os.makedirs(os.path.join(root, "comments", "def"), exist_ok=True)
35
+
36
+ with open(os.path.join(root, "comments", "abc", "index.jsonl"), "w") as f:
37
+ f.write(json.dumps({"body": "looks good", "author": "alice"}) + "\n")
38
+ f.write(json.dumps({"body": "needs work", "author": "bob"}) + "\n")
39
+
40
+ with open(os.path.join(root, "comments", "def", "index.jsonl"), "w") as f:
41
+ f.write(json.dumps({"body": "agreed", "author": "carol"}) + "\n")
42
+
43
+ # Define a table: DDL, glob pattern, and an extract function
44
+ db = DirSQL(
45
+ root,
46
+ tables=[
47
+ Table(
48
+ ddl="CREATE TABLE comments (id TEXT, body TEXT, author TEXT)",
49
+ glob="comments/**/index.jsonl",
50
+ extract=lambda path, content: [
51
+ {
52
+ "id": os.path.basename(os.path.dirname(path)),
53
+ "body": row["body"],
54
+ "author": row["author"],
55
+ }
56
+ for line in content.splitlines()
57
+ for row in [json.loads(line)]
58
+ ],
59
+ ),
60
+ ],
61
+ )
62
+
63
+ # Query with SQL
64
+ results = db.query("SELECT * FROM comments WHERE author = 'alice'")
65
+ # [{"id": "abc", "body": "looks good", "author": "alice"}]
66
+ ```
67
+
68
+ ## Multiple Tables and Joins
69
+
70
+ ```python
71
+ db = DirSQL(
72
+ root,
73
+ tables=[
74
+ Table(
75
+ ddl="CREATE TABLE posts (title TEXT, author_id TEXT)",
76
+ glob="posts/*.json",
77
+ extract=lambda path, content: [json.loads(content)],
78
+ ),
79
+ Table(
80
+ ddl="CREATE TABLE authors (id TEXT, name TEXT)",
81
+ glob="authors/*.json",
82
+ extract=lambda path, content: [json.loads(content)],
83
+ ),
84
+ ],
85
+ )
86
+
87
+ results = db.query("""
88
+ SELECT posts.title, authors.name
89
+ FROM posts JOIN authors ON posts.author_id = authors.id
90
+ """)
91
+ ```
92
+
93
+ ## Async API
94
+
95
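+ `AsyncDirSQL` wraps the synchronous API for use with asyncio. The constructor returns immediately and starts the initial scan in the background; call `await db.ready()` to wait for the scan to finish. `watch()` returns an async iterator of row-level change events.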
96
+
97
+ ```python
98
+ import asyncio
99
+ import json
100
+ import os
101
+ from dirsql import AsyncDirSQL, Table
102
+
103
+ async def main():
104
+ db = AsyncDirSQL(
105
+ "/path/to/data",
106
+ tables=[
107
+ Table(
108
+ ddl="CREATE TABLE items (name TEXT)",
109
+ glob="**/*.json",
110
+ extract=lambda path, content: [json.loads(content)],
111
+ ),
112
+ ],
113
+ )
114
+ await db.ready() # wait for initial scan to complete
115
+
116
+ # Query works the same way
117
+ results = await db.query("SELECT * FROM items")
118
+
119
+ # Watch for file changes (insert/update/delete/error events)
120
+ async for event in db.watch():
121
+ print(f"{event.action} on {event.table}: {event.row}")
122
+ if event.action == "error":
123
+ print(f" error: {event.error}")
124
+
125
+ asyncio.run(main())
126
+ ```
127
+
128
+ ## Ignoring Files
129
+
130
+ Pass `ignore` patterns to skip files during scanning and watching:
131
+
132
+ ```python
133
+ db = DirSQL(
134
+ root,
135
+ ignore=["**/drafts/**", "**/.git/**"],
136
+ tables=[...],
137
+ )
138
+ ```
139
+
140
+ ## API Reference
141
+
142
+ ### `Table(*, ddl, glob, extract)`
143
+
144
+ Defines how files map to a SQL table.
145
+
146
+ - **`ddl`** (`str`): A `CREATE TABLE` statement defining the schema.
147
+ - **`glob`** (`str`): A glob pattern matched against file paths relative to root.
148
+ - **`extract`** (`Callable[[str, str], list[dict]]`): A function receiving `(relative_path, file_content)` and returning a list of row dicts. Each dict's keys must match the DDL column names.
149
+
150
+ ### `DirSQL(root, *, tables, ignore=None)`
151
+
152
+ Creates an in-memory SQLite database indexed from the directory at `root`.
153
+
154
+ - **`root`** (`str`): Path to the directory to index.
155
+ - **`tables`** (`list[Table]`): Table definitions.
156
+ - **`ignore`** (`list[str] | None`): Glob patterns for paths to skip.
157
+
158
+ #### `DirSQL.query(sql) -> list[dict]`
159
+
160
+ Execute a SQL query. Returns a list of dicts keyed by column name. Internal tracking columns (`_dirsql_*`) are excluded from results.
161
+
162
+ ### `AsyncDirSQL(root, *, tables, ignore=None)`
163
+
164
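+ Async wrapper around `DirSQL`. The constructor is synchronous and returns immediately: it schedules the initial scan as a background asyncio task, so it should be called from code running inside an event loop (for example, within a coroutine). Call `await db.ready()` to wait for the initial scan.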
165
+
166
+ #### `await AsyncDirSQL.ready()`
167
+
168
+ Wait for the initial scan to complete. Idempotent -- safe to call multiple times. Raises any exception that occurred during init.
169
+
170
+ #### `await AsyncDirSQL.query(sql) -> list[dict]`
171
+
172
+ Same as `DirSQL.query`, but async.
173
+
174
+ #### `AsyncDirSQL.watch() -> AsyncIterator[RowEvent]`
175
+
176
+ Returns an async iterator that yields `RowEvent` objects as files change on disk. Starts the filesystem watcher on first iteration.
177
+
178
+ ### `RowEvent`
179
+
180
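+ Emitted by `watch()` when a file change produces row-level diffs (`"insert"`, `"update"`, `"delete"`), or when processing a changed file fails, e.g. the `extract` function raises (`"error"`).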
181
+
182
+ - **`table`** (`str`): The affected table name.
183
+ - **`action`** (`str`): One of `"insert"`, `"update"`, `"delete"`, `"error"`.
184
+ - **`row`** (`dict | None`): The new row (for insert/update) or deleted row (for delete).
185
+ - **`old_row`** (`dict | None`): The previous row (for update only).
186
+ - **`error`** (`str | None`): Error message (for error events).
187
+ - **`file_path`** (`str | None`): The relative file path that triggered the event.
188
+
189
+ ## How It Works
190
+
191
+ The Rust core (`rusqlite` + `notify` + `walkdir`) does the heavy lifting:
192
+
193
+ 1. **Startup scan**: Walks the directory tree, matches files to tables via glob patterns, calls the user-provided `extract` function for each file, and inserts rows into an in-memory SQLite database.
194
+ 2. **File watching**: Uses the `notify` crate (inotify on Linux, FSEvents on macOS) to detect file creates, modifications, and deletions.
195
+ 3. **Row diffing**: When a file changes, the new rows are diffed against the previous rows for that file, producing granular insert/update/delete events.
196
+ 4. **Python bindings**: PyO3 exposes the Rust core as a native Python extension module. The async layer runs blocking operations in a thread pool via `asyncio.to_thread`.
197
+
198
+ The SQLite database is purely ephemeral -- it exists only in memory and is discarded when the `DirSQL` instance is garbage collected. The filesystem is always the source of truth.
199
+
200
+ ## Development
201
+
202
+ ### Prerequisites
203
+
204
+ - Rust (stable)
205
+ - Python >= 3.12
206
+ - [maturin](https://github.com/PyO3/maturin) for building the Python extension
207
+ - [just](https://github.com/casey/just) as a task runner
208
+
209
+ ### Build and Test
210
+
211
+ ```bash
212
+ # Build the Python extension (dev mode)
213
+ maturin develop
214
+
215
+ # Run all CI checks
216
+ just ci
217
+
218
+ # Individual targets
219
+ just test-rust # Rust unit tests
220
+ just test-integration # Python integration tests
221
+ just clippy # Rust lints
222
+ just lint # Python lints (ruff)
223
+ ```
224
+
225
+ ## License
226
+
227
+ MIT
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "dirsql"
7
- version = "0.0.17"
7
+ version = "0.0.19"
8
8
  description = "Ephemeral SQL index over a local directory"
9
9
  license = "MIT"
10
10
  requires-python = ">=3.12"
@@ -24,7 +24,7 @@ dev = [
24
24
  ]
25
25
 
26
26
  [tool.maturin]
27
- features = ["pyo3/extension-module"]
27
+ features = ["extension-module"]
28
28
  exclude = [
29
29
  ".github/",
30
30
  ".claude/",
@@ -33,7 +33,8 @@ class AsyncDirSQL:
33
33
  """Async wrapper around DirSQL.
34
34
 
35
35
  Usage:
36
- db = await AsyncDirSQL(root, tables=[...])
36
+ db = AsyncDirSQL(root, tables=[...])
37
+ await db.ready()
37
38
  results = await db.query("SELECT ...")
38
39
  async for event in db.watch():
39
40
  ...
@@ -44,15 +45,30 @@ class AsyncDirSQL:
44
45
  self._tables = tables
45
46
  self._ignore = ignore
46
47
  self._db = None
47
-
48
- def __await__(self):
49
- return self._init().__await__()
50
-
51
- async def _init(self):
52
- self._db = await asyncio.to_thread(
53
- DirSQL, self._root, tables=self._tables, ignore=self._ignore
54
- )
55
- return self
48
+ self._ready_event = asyncio.Event()
49
+ self._init_error = None
50
+ self._task = asyncio.ensure_future(self._init_bg())
51
+
52
+ async def _init_bg(self):
53
+ """Run the scan in the background."""
54
+ try:
55
+ self._db = await asyncio.to_thread(
56
+ DirSQL, self._root, tables=self._tables, ignore=self._ignore
57
+ )
58
+ except Exception as exc:
59
+ self._init_error = exc
60
+ finally:
61
+ self._ready_event.set()
62
+
63
+ async def ready(self):
64
+ """Wait until the initial scan is complete.
65
+
66
+ Raises any exception that occurred during init.
67
+ Can be called multiple times safely.
68
+ """
69
+ await self._ready_event.wait()
70
+ if self._init_error is not None:
71
+ raise self._init_error
56
72
 
57
73
  async def query(self, sql):
58
74
  """Execute a SQL query asynchronously."""
@@ -12,9 +12,9 @@ from dirsql import AsyncDirSQL, Table
12
12
  def describe_AsyncDirSQL():
13
13
  def describe_init():
14
14
  @pytest.mark.asyncio
15
- async def it_creates_instance_with_await(jsonl_dir):
16
- """AsyncDirSQL can be initialized with await."""
17
- db = await AsyncDirSQL(
15
+ async def it_creates_instance_synchronously(jsonl_dir):
16
+ """AsyncDirSQL constructor is sync and returns immediately."""
17
+ db = AsyncDirSQL(
18
18
  jsonl_dir,
19
19
  tables=[
20
20
  Table(
@@ -35,9 +35,9 @@ def describe_AsyncDirSQL():
35
35
  assert db is not None
36
36
 
37
37
  @pytest.mark.asyncio
38
- async def it_indexes_files_on_init(jsonl_dir):
39
- """Async init scans and indexes directory contents."""
40
- db = await AsyncDirSQL(
38
+ async def it_indexes_files_after_ready(jsonl_dir):
39
+ """Data is available after awaiting ready()."""
40
+ db = AsyncDirSQL(
41
41
  jsonl_dir,
42
42
  tables=[
43
43
  Table(
@@ -55,33 +55,61 @@ def describe_AsyncDirSQL():
55
55
  ),
56
56
  ],
57
57
  )
58
+ await db.ready()
58
59
  results = await db.query("SELECT * FROM comments")
59
60
  assert len(results) == 3
60
61
 
61
62
  @pytest.mark.asyncio
62
- async def it_raises_on_extract_error_during_init(tmp_dir):
63
- """Extract lambda errors during init raise exceptions."""
63
+ async def it_raises_on_extract_error_during_ready(tmp_dir):
64
+ """Extract lambda errors during ready() raise exceptions."""
64
65
  os.makedirs(os.path.join(tmp_dir, "data"), exist_ok=True)
65
66
  with open(os.path.join(tmp_dir, "data", "bad.json"), "w") as f:
66
67
  f.write("not valid json")
67
68
 
69
+ db = AsyncDirSQL(
70
+ tmp_dir,
71
+ tables=[
72
+ Table(
73
+ ddl="CREATE TABLE items (name TEXT)",
74
+ glob="data/*.json",
75
+ extract=lambda path, content: [json.loads(content)],
76
+ ),
77
+ ],
78
+ )
68
79
  with pytest.raises(Exception):
69
- await AsyncDirSQL(
70
- tmp_dir,
71
- tables=[
72
- Table(
73
- ddl="CREATE TABLE items (name TEXT)",
74
- glob="data/*.json",
75
- extract=lambda path, content: [json.loads(content)],
76
- ),
77
- ],
78
- )
80
+ await db.ready()
81
+
82
+ @pytest.mark.asyncio
83
+ async def it_allows_multiple_ready_calls(jsonl_dir):
84
+ """Calling ready() multiple times is safe and idempotent."""
85
+ db = AsyncDirSQL(
86
+ jsonl_dir,
87
+ tables=[
88
+ Table(
89
+ ddl="CREATE TABLE comments (id TEXT, body TEXT, author TEXT)",
90
+ glob="comments/**/index.jsonl",
91
+ extract=lambda path, content: [
92
+ {
93
+ "id": os.path.basename(os.path.dirname(path)),
94
+ "body": row["body"],
95
+ "author": row["author"],
96
+ }
97
+ for line in content.splitlines()
98
+ for row in [json.loads(line)]
99
+ ],
100
+ ),
101
+ ],
102
+ )
103
+ await db.ready()
104
+ await db.ready()
105
+ results = await db.query("SELECT * FROM comments")
106
+ assert len(results) == 3
79
107
 
80
108
  def describe_query():
81
109
  @pytest.mark.asyncio
82
110
  async def it_returns_results_as_list_of_dicts(jsonl_dir):
83
111
  """Async query returns list of dicts with column names."""
84
- db = await AsyncDirSQL(
112
+ db = AsyncDirSQL(
85
113
  jsonl_dir,
86
114
  tables=[
87
115
  Table(
@@ -99,6 +127,7 @@ def describe_AsyncDirSQL():
99
127
  ),
100
128
  ],
101
129
  )
130
+ await db.ready()
102
131
  results = await db.query(
103
132
  "SELECT author FROM comments WHERE body = 'first comment'"
104
133
  )
@@ -108,7 +137,7 @@ def describe_AsyncDirSQL():
108
137
  @pytest.mark.asyncio
109
138
  async def it_raises_on_invalid_sql(jsonl_dir):
110
139
  """Invalid SQL raises an exception."""
111
- db = await AsyncDirSQL(
140
+ db = AsyncDirSQL(
112
141
  jsonl_dir,
113
142
  tables=[
114
143
  Table(
@@ -126,6 +155,7 @@ def describe_AsyncDirSQL():
126
155
  ),
127
156
  ],
128
157
  )
158
+ await db.ready()
129
159
  with pytest.raises(Exception):
130
160
  await db.query("NOT VALID SQL")
131
161
 
@@ -133,7 +163,7 @@ def describe_AsyncDirSQL():
133
163
  @pytest.mark.asyncio
134
164
  async def it_emits_insert_events_for_new_files(tmp_dir):
135
165
  """watch() yields insert events when a new file is created."""
136
- db = await AsyncDirSQL(
166
+ db = AsyncDirSQL(
137
167
  tmp_dir,
138
168
  tables=[
139
169
  Table(
@@ -143,6 +173,7 @@ def describe_AsyncDirSQL():
143
173
  ),
144
174
  ],
145
175
  )
176
+ await db.ready()
146
177
 
147
178
  events = []
148
179
 
@@ -179,7 +210,7 @@ def describe_AsyncDirSQL():
179
210
  with open(os.path.join(tmp_dir, "doomed.json"), "w") as f:
180
211
  json.dump({"name": "doomed"}, f)
181
212
 
182
- db = await AsyncDirSQL(
213
+ db = AsyncDirSQL(
183
214
  tmp_dir,
184
215
  tables=[
185
216
  Table(
@@ -189,6 +220,7 @@ def describe_AsyncDirSQL():
189
220
  ),
190
221
  ],
191
222
  )
223
+ await db.ready()
192
224
 
193
225
  # Confirm initial data
194
226
  results = await db.query("SELECT * FROM items")
@@ -228,7 +260,7 @@ def describe_AsyncDirSQL():
228
260
  with open(os.path.join(tmp_dir, "item.json"), "w") as f:
229
261
  json.dump({"name": "draft"}, f)
230
262
 
231
- db = await AsyncDirSQL(
263
+ db = AsyncDirSQL(
232
264
  tmp_dir,
233
265
  tables=[
234
266
  Table(
@@ -238,6 +270,7 @@ def describe_AsyncDirSQL():
238
270
  ),
239
271
  ],
240
272
  )
273
+ await db.ready()
241
274
 
242
275
  events = []
243
276
 
@@ -267,7 +300,7 @@ def describe_AsyncDirSQL():
267
300
  @pytest.mark.asyncio
268
301
  async def it_emits_error_events_for_bad_extract(tmp_dir):
269
302
  """watch() yields error events when extract lambda fails."""
270
- db = await AsyncDirSQL(
303
+ db = AsyncDirSQL(
271
304
  tmp_dir,
272
305
  tables=[
273
306
  Table(
@@ -277,6 +310,7 @@ def describe_AsyncDirSQL():
277
310
  ),
278
311
  ],
279
312
  )
313
+ await db.ready()
280
314
 
281
315
  events = []
282
316
 
@@ -305,7 +339,7 @@ def describe_AsyncDirSQL():
305
339
  @pytest.mark.asyncio
306
340
  async def it_updates_db_on_file_changes(tmp_dir):
307
341
  """The database is kept in sync with file system changes."""
308
- db = await AsyncDirSQL(
342
+ db = AsyncDirSQL(
309
343
  tmp_dir,
310
344
  tables=[
311
345
  Table(
@@ -315,6 +349,7 @@ def describe_AsyncDirSQL():
315
349
  ),
316
350
  ],
317
351
  )
352
+ await db.ready()
318
353
 
319
354
  # Initially empty
320
355
  results = await db.query("SELECT * FROM items")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes