dirsql 0.2.9__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. {dirsql-0.2.9 → dirsql-0.3.0}/Cargo.lock +43 -1
  2. {dirsql-0.2.9 → dirsql-0.3.0}/PKG-INFO +1 -1
  3. {dirsql-0.2.9 → dirsql-0.3.0}/docs/.vitepress/config.ts +1 -0
  4. {dirsql-0.2.9 → dirsql-0.3.0}/docs/guide/config.md +14 -0
  5. dirsql-0.3.0/docs/guide/persistence.md +177 -0
  6. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/Cargo.toml +1 -1
  7. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/.vitepress/config.ts +1 -0
  8. {dirsql-0.2.9/packages/rust → dirsql-0.3.0/packages/python}/docs/guide/config.md +14 -0
  9. dirsql-0.3.0/packages/python/docs/guide/persistence.md +177 -0
  10. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/src/lib.rs +10 -1
  11. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/tests/integration/test_async_dirsql.py +12 -0
  12. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/tests/integration/test_binding.py +35 -1
  13. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/tests/integration/test_docs_examples.py +14 -0
  14. dirsql-0.3.0/packages/python/tests/integration/test_persist.py +300 -0
  15. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/Cargo.toml +1 -0
  16. {dirsql-0.2.9/packages/python → dirsql-0.3.0/packages/rust}/docs/guide/config.md +14 -0
  17. dirsql-0.3.0/packages/rust/docs/guide/persistence.md +177 -0
  18. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/config.rs +49 -3
  19. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/db.rs +15 -0
  20. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/lib.rs +376 -66
  21. dirsql-0.3.0/packages/rust/src/persist.rs +603 -0
  22. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/scanner.rs +33 -1
  23. dirsql-0.3.0/packages/rust/tests/persist.rs +393 -0
  24. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/tests/sdk.rs +6 -3
  25. {dirsql-0.2.9 → dirsql-0.3.0}/python/dirsql/_async.py +17 -1
  26. {dirsql-0.2.9 → dirsql-0.3.0}/python/dirsql/test_async.py +12 -1
  27. {dirsql-0.2.9 → dirsql-0.3.0}/Cargo.toml +0 -0
  28. {dirsql-0.2.9 → dirsql-0.3.0}/README.md +0 -0
  29. {dirsql-0.2.9 → dirsql-0.3.0}/docs/.claude/CLAUDE.md +0 -0
  30. {dirsql-0.2.9 → dirsql-0.3.0}/docs/.vitepress/theme/index.ts +0 -0
  31. {dirsql-0.2.9 → dirsql-0.3.0}/docs/.vitepress/theme/lang.ts +0 -0
  32. {dirsql-0.2.9 → dirsql-0.3.0}/docs/AGENTS.md +0 -0
  33. {dirsql-0.2.9 → dirsql-0.3.0}/docs/api/index.md +0 -0
  34. {dirsql-0.2.9 → dirsql-0.3.0}/docs/getting-started.md +0 -0
  35. {dirsql-0.2.9 → dirsql-0.3.0}/docs/guide/async.md +0 -0
  36. {dirsql-0.2.9 → dirsql-0.3.0}/docs/guide/cli.md +0 -0
  37. {dirsql-0.2.9 → dirsql-0.3.0}/docs/guide/crdt.md +0 -0
  38. {dirsql-0.2.9 → dirsql-0.3.0}/docs/guide/querying.md +0 -0
  39. {dirsql-0.2.9 → dirsql-0.3.0}/docs/guide/tables.md +0 -0
  40. {dirsql-0.2.9 → dirsql-0.3.0}/docs/guide/watching.md +0 -0
  41. {dirsql-0.2.9 → dirsql-0.3.0}/docs/index.md +0 -0
  42. {dirsql-0.2.9 → dirsql-0.3.0}/docs/migrations.md +0 -0
  43. {dirsql-0.2.9 → dirsql-0.3.0}/docs/package.json +0 -0
  44. {dirsql-0.2.9 → dirsql-0.3.0}/docs/playwright.config.ts +0 -0
  45. {dirsql-0.2.9 → dirsql-0.3.0}/docs/pnpm-lock.yaml +0 -0
  46. {dirsql-0.2.9 → dirsql-0.3.0}/docs/pnpm-workspace.yaml +0 -0
  47. {dirsql-0.2.9 → dirsql-0.3.0}/docs/tests/integration/home.spec.ts +0 -0
  48. {dirsql-0.2.9 → dirsql-0.3.0}/docs/tests/integration/language-flag.spec.ts +0 -0
  49. {dirsql-0.2.9 → dirsql-0.3.0}/docs/tests/unit/config.test.ts +0 -0
  50. {dirsql-0.2.9 → dirsql-0.3.0}/docs/tests/unit/lang.test.ts +0 -0
  51. {dirsql-0.2.9 → dirsql-0.3.0}/docs/vitest.config.ts +0 -0
  52. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/README.md +0 -0
  53. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/.claude/CLAUDE.md +0 -0
  54. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/.vitepress/theme/index.ts +0 -0
  55. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/.vitepress/theme/lang.ts +0 -0
  56. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/AGENTS.md +0 -0
  57. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/api/index.md +0 -0
  58. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/getting-started.md +0 -0
  59. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/guide/async.md +0 -0
  60. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/guide/cli.md +0 -0
  61. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/guide/crdt.md +0 -0
  62. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/guide/querying.md +0 -0
  63. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/guide/tables.md +0 -0
  64. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/guide/watching.md +0 -0
  65. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/index.md +0 -0
  66. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/migrations.md +0 -0
  67. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/package.json +0 -0
  68. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/playwright.config.ts +0 -0
  69. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/pnpm-lock.yaml +0 -0
  70. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/pnpm-workspace.yaml +0 -0
  71. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/tests/integration/home.spec.ts +0 -0
  72. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/tests/integration/language-flag.spec.ts +0 -0
  73. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/tests/unit/config.test.ts +0 -0
  74. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/tests/unit/lang.test.ts +0 -0
  75. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/docs/vitest.config.ts +0 -0
  76. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/python/conftest.py +0 -0
  77. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/tests/__init__.py +0 -0
  78. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/tests/conftest.py +0 -0
  79. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/tests/integration/__init__.py +0 -0
  80. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/tests/integration/test_dirsql.py +0 -0
  81. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/tests/integration/test_docs_gaps.py +0 -0
  82. {dirsql-0.2.9 → dirsql-0.3.0}/packages/python/tests/integration/test_from_config.py +0 -0
  83. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/README.md +0 -0
  84. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/benches/db_bench.rs +0 -0
  85. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/benches/differ_bench.rs +0 -0
  86. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/benches/matcher_bench.rs +0 -0
  87. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/benches/scanner_bench.rs +0 -0
  88. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/docs/api/index.md +0 -0
  89. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/docs/getting-started.md +0 -0
  90. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/docs/guide/async.md +0 -0
  91. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/docs/guide/cli.md +0 -0
  92. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/docs/guide/crdt.md +0 -0
  93. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/docs/guide/querying.md +0 -0
  94. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/docs/guide/tables.md +0 -0
  95. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/docs/guide/watching.md +0 -0
  96. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/docs/index.md +0 -0
  97. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/docs/migrations.md +0 -0
  98. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/bin/dirsql.rs +0 -0
  99. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/cli/mod.rs +0 -0
  100. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/cli/router.rs +0 -0
  101. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/cli/serialize.rs +0 -0
  102. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/cli/server.rs +0 -0
  103. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/differ.rs +0 -0
  104. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/matcher.rs +0 -0
  105. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/parser.rs +0 -0
  106. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/src/watcher.rs +0 -0
  107. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/tests/async_sdk.rs +0 -0
  108. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/tests/cli_e2e.rs +0 -0
  109. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/tests/cli_integration.rs +0 -0
  110. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/tests/docs_examples.rs +0 -0
  111. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/tests/docs_gaps.rs +0 -0
  112. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/tests/from_config.rs +0 -0
  113. {dirsql-0.2.9 → dirsql-0.3.0}/packages/rust/tests/readonly_query.rs +0 -0
  114. {dirsql-0.2.9 → dirsql-0.3.0}/pyproject.toml +0 -0
  115. {dirsql-0.2.9 → dirsql-0.3.0}/python/dirsql/__init__.py +0 -0
  116. {dirsql-0.2.9 → dirsql-0.3.0}/python/dirsql/_cli/__init__.py +0 -0
  117. {dirsql-0.2.9 → dirsql-0.3.0}/python/dirsql/_cli/binary_path.py +0 -0
  118. {dirsql-0.2.9 → dirsql-0.3.0}/python/dirsql/_cli/binary_path_test.py +0 -0
  119. {dirsql-0.2.9 → dirsql-0.3.0}/python/dirsql/_cli/is_windows.py +0 -0
  120. {dirsql-0.2.9 → dirsql-0.3.0}/python/dirsql/_cli/is_windows_test.py +0 -0
  121. {dirsql-0.2.9 → dirsql-0.3.0}/python/dirsql/_cli/main.py +0 -0
  122. {dirsql-0.2.9 → dirsql-0.3.0}/python/dirsql/_cli/main_test.py +0 -0
@@ -73,6 +73,18 @@ version = "1.0.102"
73
73
  source = "registry+https://github.com/rust-lang/crates.io-index"
74
74
  checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
75
75
 
76
+ [[package]]
77
+ name = "arrayref"
78
+ version = "0.3.9"
79
+ source = "registry+https://github.com/rust-lang/crates.io-index"
80
+ checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
81
+
82
+ [[package]]
83
+ name = "arrayvec"
84
+ version = "0.7.6"
85
+ source = "registry+https://github.com/rust-lang/crates.io-index"
86
+ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
87
+
76
88
  [[package]]
77
89
  name = "assert_cmd"
78
90
  version = "2.2.1"
@@ -176,6 +188,20 @@ version = "2.11.1"
176
188
  source = "registry+https://github.com/rust-lang/crates.io-index"
177
189
  checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
178
190
 
191
+ [[package]]
192
+ name = "blake3"
193
+ version = "1.8.4"
194
+ source = "registry+https://github.com/rust-lang/crates.io-index"
195
+ checksum = "4d2d5991425dfd0785aed03aedcf0b321d61975c9b5b3689c774a2610ae0b51e"
196
+ dependencies = [
197
+ "arrayref",
198
+ "arrayvec",
199
+ "cc",
200
+ "cfg-if",
201
+ "constant_time_eq",
202
+ "cpufeatures",
203
+ ]
204
+
179
205
  [[package]]
180
206
  name = "bstr"
181
207
  version = "1.12.1"
@@ -300,6 +326,12 @@ version = "1.0.5"
300
326
  source = "registry+https://github.com/rust-lang/crates.io-index"
301
327
  checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
302
328
 
329
+ [[package]]
330
+ name = "constant_time_eq"
331
+ version = "0.4.2"
332
+ source = "registry+https://github.com/rust-lang/crates.io-index"
333
+ checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
334
+
303
335
  [[package]]
304
336
  name = "convert_case"
305
337
  version = "0.11.0"
@@ -325,6 +357,15 @@ version = "0.8.7"
325
357
  source = "registry+https://github.com/rust-lang/crates.io-index"
326
358
  checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
327
359
 
360
+ [[package]]
361
+ name = "cpufeatures"
362
+ version = "0.3.0"
363
+ source = "registry+https://github.com/rust-lang/crates.io-index"
364
+ checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
365
+ dependencies = [
366
+ "libc",
367
+ ]
368
+
328
369
  [[package]]
329
370
  name = "criterion"
330
371
  version = "0.5.1"
@@ -441,6 +482,7 @@ version = "0.2.7"
441
482
  dependencies = [
442
483
  "assert_cmd",
443
484
  "axum",
485
+ "blake3",
444
486
  "clap",
445
487
  "criterion",
446
488
  "csv",
@@ -480,7 +522,7 @@ dependencies = [
480
522
 
481
523
  [[package]]
482
524
  name = "dirsql-py-ext"
483
- version = "0.2.9"
525
+ version = "0.3.0"
484
526
  dependencies = [
485
527
  "dirsql",
486
528
  "pyo3",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dirsql
3
- Version: 0.2.9
3
+ Version: 0.3.0
4
4
  Requires-Dist: pytest>=8 ; extra == 'dev'
5
5
  Requires-Dist: pytest-describe>=2 ; extra == 'dev'
6
6
  Requires-Dist: pytest-asyncio>=0.23 ; extra == 'dev'
@@ -32,6 +32,7 @@ export default defineConfig({
32
32
  { text: 'Defining Tables', link: '/guide/tables' },
33
33
  { text: 'Querying', link: '/guide/querying' },
34
34
  { text: 'File Watching', link: '/guide/watching' },
35
+ { text: 'Persistence', link: '/guide/persistence' },
35
36
  { text: 'Async API', link: '/guide/async' },
36
37
  { text: 'Command-Line Interface', link: '/guide/cli' },
37
38
  { text: 'Collaboration with CRDTs', link: '/guide/crdt' }
@@ -169,6 +169,20 @@ The `ignore` list skips files and directories entirely (not even scanned):
169
169
  ignore = ["node_modules/**", ".git/**", "*.pyc", "__pycache__/**"]
170
170
  ```
171
171
 
172
+ The top-level `.dirsql/` directory is always excluded, whether you list it or not -- it is a reserved namespace for `dirsql`'s own metadata (see [Persistence](./persistence.md)).
173
+
174
+ ## Persistence
175
+
176
+ Set `persist = true` to keep the SQLite database on disk between runs instead of rebuilding from scratch on every startup:
177
+
178
+ ```toml
179
+ [dirsql]
180
+ persist = true
181
+ # persist_path = ".dirsql/cache.db" # optional; this is the default
182
+ ```
183
+
184
+ See [Persistence](./persistence.md) for the full reconcile algorithm, storage layout, and limitations.
185
+
172
186
  ## Strict Mode
173
187
 
174
188
  By default, extra keys in file content are ignored and missing keys become NULL. Enable strict mode to error on mismatches:
@@ -0,0 +1,177 @@
1
+ # Persistence
2
+
3
+ By default `dirsql` keeps its SQLite database in memory and rebuilds it from scratch every time the process starts. For large directories this can take seconds to minutes -- nearly all of which is spent re-parsing files that haven't changed since the previous run.
4
+
5
+ Persistence stores the SQLite database on disk so that subsequent startups only re-parse the files that have actually changed.
6
+
7
+ ::: tip Same answers, faster startup
8
+ The rows returned by `query()` after a persistent startup are equivalent to those produced by a from-scratch rebuild. Persistence is a startup-time optimization, not a correctness compromise. The reconcile algorithm is the same one `git status` uses to decide which files have changed since the last index write.
9
+ :::
10
+
11
+ ## Quick start
12
+
13
+ ::: code-group
14
+
15
+ ```toml [.dirsql.toml]
16
+ [dirsql]
17
+ persist = true
18
+ ```
19
+
20
+ ```python [Python]
21
+ from dirsql import DirSQL
22
+
23
+ db = DirSQL("./my-project", tables=[...], persist=True)
24
+ await db.ready()
25
+ ```
26
+
27
+ ```rust [Rust]
28
+ use dirsql::DirSQL;
29
+
30
+ let db = DirSQL::builder()
31
+ .root("./my-project")
32
+ .tables(vec![/* ... */])
33
+ .persist(true)
34
+ .build()?;
35
+ ```
36
+
37
+ ```typescript [TypeScript]
38
+ import { DirSQL } from "dirsql";
39
+
40
+ const db = new DirSQL({ root: "./my-project", tables: [/* ... */], persist: true });
41
+ await db.ready;
42
+ ```
43
+
44
+ :::
45
+
46
+ That's it. The first run writes the database to `./my-project/.dirsql/cache.db`. Every subsequent startup uses the cache.
47
+
48
+ ## Configuration
49
+
50
+ | Option | Type | Default | Meaning |
51
+ |---|---|---|---|
52
+ | `persist` | boolean | `false` | Enable persistent on-disk storage. |
53
+ | `persist_path` (Python, Rust) / `persistPath` (TypeScript) | string | `<root>/.dirsql/cache.db` | Override the database file path. Ignored when `persist` is `false`. |
54
+
55
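+ The default location keeps the cache alongside the data it indexes, so it needs no extra setup and travels with the project directory when that directory is copied or moved. Note that the cache is tied to the canonical root path, so the first startup at a new location performs a full rebuild, and a fresh clone starts without a cache when `.dirsql/` is git-ignored as recommended. Override `persist_path` if you want the cache somewhere else -- a CI cache directory, a tmpfs mount, an XDG cache dir, etc.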
56
+
57
+ ::: code-group
58
+
59
+ ```toml [.dirsql.toml]
60
+ [dirsql]
61
+ persist = true
62
+ persist_path = "/var/cache/dirsql/myproject.db"
63
+ ```
64
+
65
+ ```python [Python]
66
+ db = DirSQL(
67
+ "./my-project",
68
+ tables=[...],
69
+ persist=True,
70
+ persist_path="/var/cache/dirsql/myproject.db",
71
+ )
72
+ ```
73
+
74
+ ```rust [Rust]
75
+ let db = DirSQL::builder()
76
+ .root("./my-project")
77
+ .tables(vec![/* ... */])
78
+ .persist(true)
79
+ .persist_path("/var/cache/dirsql/myproject.db")
80
+ .build()?;
81
+ ```
82
+
83
+ ```typescript [TypeScript]
84
+ const db = new DirSQL({
85
+ root: "./my-project",
86
+ tables: [/* ... */],
87
+ persist: true,
88
+ persistPath: "/var/cache/dirsql/myproject.db",
89
+ });
90
+ ```
91
+
92
+ :::
93
+
94
+ ## The `.dirsql/` directory
95
+
96
+ `dirsql` reserves the top-level `.dirsql/` directory inside every scanned root. It is **unconditionally excluded from the directory walk**, whether persistence is enabled or not. This means:
97
+
98
+ - The default cache path `<root>/.dirsql/cache.db` cannot accidentally be ingested as a data file.
99
+ - You can place additional `dirsql`-related files in `.dirsql/` (e.g. a project-local config snapshot) without them being parsed.
100
+ - You should not put your own data files in `.dirsql/` -- they will be silently ignored.
101
+
102
+ If you persist into `.dirsql/`, add it to your `.gitignore`:
103
+
104
+ ```
105
+ .dirsql/
106
+ ```
107
+
108
+ The cache file should never be committed -- it is reproducible from the source tree and frequently large.
109
+
110
+ ## How the startup reconcile works
111
+
112
+ When a persistent cache exists, `dirsql` does not blindly trust it. On startup it:
113
+
114
+ 1. **Checks compatibility metadata.** If the cached `dirsql` version, schema version, glob configuration, parser versions, or canonical root path differs from the current build, the cache is wiped and rebuilt from scratch.
115
+ 2. **Walks the tree and stats every matching file.** This is metadata-only -- no file contents are read.
116
+ 3. **For each file, compares the live `(size, mtime, ctime, inode, dev)` tuple against the cached row:**
117
+ - **Trust the cache** when every field matches *and* the file's mtime is older than the cache's snapshot time (outside the racy window).
118
+ - **Hash-confirm** when the tuple matches but the file's mtime falls inside the racy window. `dirsql` reads and hashes the file; if the hash matches the cached hash, the cache is trusted; otherwise the file is re-parsed.
119
+ - **Re-parse** when any field of the tuple differs.
120
+ 4. **Deletes** rows for files that were in the cache but are no longer on disk.
121
+ 5. **Inserts** rows for files that are on disk but were not in the cache.
122
+
123
+ This is the same algorithm `git status` uses to decide which files have changed since the last index write. The "racy window" handling is what closes the gap when a file is modified within the same filesystem-timestamp resolution as the cache write.
124
+
125
+ ## When `dirsql` does a full rebuild
126
+
127
+ Any of the following will cause the cache to be discarded and rebuilt from scratch on the next startup:
128
+
129
+ - The `dirsql` library was upgraded between runs.
130
+ - The glob configuration changed (a new table, a removed table, a modified glob, a changed `ignore` list).
131
+ - A built-in parser version changed (this generally only happens on `dirsql` upgrades).
132
+ - The cache was written for a different root directory than the one currently configured.
133
+ - The internal schema of the cache changed (i.e. you upgraded `dirsql` across a schema version bump).
134
+
135
+ Full rebuilds take exactly as long as a non-persistent startup -- there is no penalty for them, only a missed optimization.
136
+
137
+ ## Limitations
138
+
139
+ ### Network filesystems
140
+
141
+ NFS, SMB/CIFS, and similar network filesystems cache file attributes on the client and can return stale `stat` results. Persistent mode is **not supported** on network filesystems and may produce stale rows. Use in-memory mode (the default) if your `root` lives on a network mount.
142
+
143
+ ### The mtime-preservation edge case
144
+
145
+ Racy-stat detection misses changes only when **all** of the following are true:
146
+
147
+ - A file's contents are modified.
148
+ - The file's size after modification is identical to its size before.
149
+ - The file's `mtime` is externally reset to a value older than the cache's snapshot time (e.g. via `touch -r` or a backup-restore tool that preserves mtime).
150
+
151
+ If you cannot tolerate this edge case, disable persistence (`persist = false`). This is the same trade-off `git` makes with `core.trustctime` / `core.checkStat`.
152
+
153
+ ### Single writer
154
+
155
+ Only one `dirsql` process should write to a given cache file at a time. Multiple read-only processes can query the same file safely once the writer finishes the initial reconcile. Coordinated multi-writer access is not supported in v0.3.0.
156
+
157
+ ## Inspecting the cache
158
+
159
+ The persistent database is a normal SQLite file. You can open it with any SQLite client:
160
+
161
+ ```bash
162
+ sqlite3 .dirsql/cache.db
163
+ ```
164
+
165
+ ```sql
166
+ .tables
167
+ -- comments documents metrics _dirsql_files _dirsql_meta
168
+
169
+ SELECT * FROM _dirsql_meta;
170
+ -- schema_version | 1
171
+ -- dirsql_version | 0.3.0
172
+ -- glob_config_hash | <hex>
173
+ -- parser_versions | {"json":"1","jsonl":"1","csv":"1",...}
174
+ -- root_canonical | /home/alice/my-project
175
+ ```
176
+
177
+ The `_dirsql_files` and `_dirsql_meta` tables are managed by `dirsql`. Do not modify them by hand -- on the next startup, `dirsql` will detect the inconsistency and rebuild from scratch.
@@ -4,7 +4,7 @@ name = "dirsql-py-ext"
4
4
  # pypi/maturin handler can rewrite it via `write-version` before
5
5
  # `maturin build`. `pyproject.toml` declares `dynamic = ["version"]`
6
6
  # and maturin reads this field. Mirrors `packages/rust/Cargo.toml`.
7
- version = "0.2.9"
7
+ version = "0.3.0"
8
8
  edition.workspace = true
9
9
  publish = false
10
10
  readme = "README.md"
@@ -32,6 +32,7 @@ export default defineConfig({
32
32
  { text: 'Defining Tables', link: '/guide/tables' },
33
33
  { text: 'Querying', link: '/guide/querying' },
34
34
  { text: 'File Watching', link: '/guide/watching' },
35
+ { text: 'Persistence', link: '/guide/persistence' },
35
36
  { text: 'Async API', link: '/guide/async' },
36
37
  { text: 'Command-Line Interface', link: '/guide/cli' },
37
38
  { text: 'Collaboration with CRDTs', link: '/guide/crdt' }
@@ -169,6 +169,20 @@ The `ignore` list skips files and directories entirely (not even scanned):
169
169
  ignore = ["node_modules/**", ".git/**", "*.pyc", "__pycache__/**"]
170
170
  ```
171
171
 
172
+ The top-level `.dirsql/` directory is always excluded, whether you list it or not -- it is a reserved namespace for `dirsql`'s own metadata (see [Persistence](./persistence.md)).
173
+
174
+ ## Persistence
175
+
176
+ Set `persist = true` to keep the SQLite database on disk between runs instead of rebuilding from scratch on every startup:
177
+
178
+ ```toml
179
+ [dirsql]
180
+ persist = true
181
+ # persist_path = ".dirsql/cache.db" # optional; this is the default
182
+ ```
183
+
184
+ See [Persistence](./persistence.md) for the full reconcile algorithm, storage layout, and limitations.
185
+
172
186
  ## Strict Mode
173
187
 
174
188
  By default, extra keys in file content are ignored and missing keys become NULL. Enable strict mode to error on mismatches:
@@ -0,0 +1,177 @@
1
+ # Persistence
2
+
3
+ By default `dirsql` keeps its SQLite database in memory and rebuilds it from scratch every time the process starts. For large directories this can take seconds to minutes -- nearly all of which is spent re-parsing files that haven't changed since the previous run.
4
+
5
+ Persistence stores the SQLite database on disk so that subsequent startups only re-parse the files that have actually changed.
6
+
7
+ ::: tip Same answers, faster startup
8
+ The rows returned by `query()` after a persistent startup are equivalent to those produced by a from-scratch rebuild. Persistence is a startup-time optimization, not a correctness compromise. The reconcile algorithm is the same one `git status` uses to decide which files have changed since the last index write.
9
+ :::
10
+
11
+ ## Quick start
12
+
13
+ ::: code-group
14
+
15
+ ```toml [.dirsql.toml]
16
+ [dirsql]
17
+ persist = true
18
+ ```
19
+
20
+ ```python [Python]
21
+ from dirsql import DirSQL
22
+
23
+ db = DirSQL("./my-project", tables=[...], persist=True)
24
+ await db.ready()
25
+ ```
26
+
27
+ ```rust [Rust]
28
+ use dirsql::DirSQL;
29
+
30
+ let db = DirSQL::builder()
31
+ .root("./my-project")
32
+ .tables(vec![/* ... */])
33
+ .persist(true)
34
+ .build()?;
35
+ ```
36
+
37
+ ```typescript [TypeScript]
38
+ import { DirSQL } from "dirsql";
39
+
40
+ const db = new DirSQL({ root: "./my-project", tables: [/* ... */], persist: true });
41
+ await db.ready;
42
+ ```
43
+
44
+ :::
45
+
46
+ That's it. The first run writes the database to `./my-project/.dirsql/cache.db`. Every subsequent startup uses the cache.
47
+
48
+ ## Configuration
49
+
50
+ | Option | Type | Default | Meaning |
51
+ |---|---|---|---|
52
+ | `persist` | boolean | `false` | Enable persistent on-disk storage. |
53
+ | `persist_path` (Python, Rust) / `persistPath` (TypeScript) | string | `<root>/.dirsql/cache.db` | Override the database file path. Ignored when `persist` is `false`. |
54
+
55
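+ The default location keeps the cache alongside the data it indexes, so it needs no extra setup and travels with the project directory when that directory is copied or moved. Note that the cache is tied to the canonical root path, so the first startup at a new location performs a full rebuild, and a fresh clone starts without a cache when `.dirsql/` is git-ignored as recommended. Override `persist_path` if you want the cache somewhere else -- a CI cache directory, a tmpfs mount, an XDG cache dir, etc.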
56
+
57
+ ::: code-group
58
+
59
+ ```toml [.dirsql.toml]
60
+ [dirsql]
61
+ persist = true
62
+ persist_path = "/var/cache/dirsql/myproject.db"
63
+ ```
64
+
65
+ ```python [Python]
66
+ db = DirSQL(
67
+ "./my-project",
68
+ tables=[...],
69
+ persist=True,
70
+ persist_path="/var/cache/dirsql/myproject.db",
71
+ )
72
+ ```
73
+
74
+ ```rust [Rust]
75
+ let db = DirSQL::builder()
76
+ .root("./my-project")
77
+ .tables(vec![/* ... */])
78
+ .persist(true)
79
+ .persist_path("/var/cache/dirsql/myproject.db")
80
+ .build()?;
81
+ ```
82
+
83
+ ```typescript [TypeScript]
84
+ const db = new DirSQL({
85
+ root: "./my-project",
86
+ tables: [/* ... */],
87
+ persist: true,
88
+ persistPath: "/var/cache/dirsql/myproject.db",
89
+ });
90
+ ```
91
+
92
+ :::
93
+
94
+ ## The `.dirsql/` directory
95
+
96
+ `dirsql` reserves the top-level `.dirsql/` directory inside every scanned root. It is **unconditionally excluded from the directory walk**, whether persistence is enabled or not. This means:
97
+
98
+ - The default cache path `<root>/.dirsql/cache.db` cannot accidentally be ingested as a data file.
99
+ - You can place additional `dirsql`-related files in `.dirsql/` (e.g. a project-local config snapshot) without them being parsed.
100
+ - You should not put your own data files in `.dirsql/` -- they will be silently ignored.
101
+
102
+ If you persist into `.dirsql/`, add it to your `.gitignore`:
103
+
104
+ ```
105
+ .dirsql/
106
+ ```
107
+
108
+ The cache file should never be committed -- it is reproducible from the source tree and frequently large.
109
+
110
+ ## How the startup reconcile works
111
+
112
+ When a persistent cache exists, `dirsql` does not blindly trust it. On startup it:
113
+
114
+ 1. **Checks compatibility metadata.** If the cached `dirsql` version, schema version, glob configuration, parser versions, or canonical root path differs from the current build, the cache is wiped and rebuilt from scratch.
115
+ 2. **Walks the tree and stats every matching file.** This is metadata-only -- no file contents are read.
116
+ 3. **For each file, compares the live `(size, mtime, ctime, inode, dev)` tuple against the cached row:**
117
+ - **Trust the cache** when every field matches *and* the file's mtime is older than the cache's snapshot time (outside the racy window).
118
+ - **Hash-confirm** when the tuple matches but the file's mtime falls inside the racy window. `dirsql` reads and hashes the file; if the hash matches the cached hash, the cache is trusted; otherwise the file is re-parsed.
119
+ - **Re-parse** when any field of the tuple differs.
120
+ 4. **Deletes** rows for files that were in the cache but are no longer on disk.
121
+ 5. **Inserts** rows for files that are on disk but were not in the cache.
122
+
123
+ This is the same algorithm `git status` uses to decide which files have changed since the last index write. The "racy window" handling is what closes the gap when a file is modified within the same filesystem-timestamp resolution as the cache write.
124
+
125
+ ## When `dirsql` does a full rebuild
126
+
127
+ Any of the following will cause the cache to be discarded and rebuilt from scratch on the next startup:
128
+
129
+ - The `dirsql` library was upgraded between runs.
130
+ - The glob configuration changed (a new table, a removed table, a modified glob, a changed `ignore` list).
131
+ - A built-in parser version changed (this generally only happens on `dirsql` upgrades).
132
+ - The cache was written for a different root directory than the one currently configured.
133
+ - The internal schema of the cache changed (i.e. you upgraded `dirsql` across a schema version bump).
134
+
135
+ Full rebuilds take exactly as long as a non-persistent startup -- there is no penalty for them, only a missed optimization.
136
+
137
+ ## Limitations
138
+
139
+ ### Network filesystems
140
+
141
+ NFS, SMB/CIFS, and similar network filesystems cache file attributes on the client and can return stale `stat` results. Persistent mode is **not supported** on network filesystems and may produce stale rows. Use in-memory mode (the default) if your `root` lives on a network mount.
142
+
143
+ ### The mtime-preservation edge case
144
+
145
+ Racy-stat detection misses changes only when **all** of the following are true:
146
+
147
+ - A file's contents are modified.
148
+ - The file's size after modification is identical to its size before.
149
+ - The file's `mtime` is externally reset to a value older than the cache's snapshot time (e.g. via `touch -r` or a backup-restore tool that preserves mtime).
150
+
151
+ If you cannot tolerate this edge case, disable persistence (`persist = false`). This is the same trade-off `git` makes with `core.trustctime` / `core.checkStat`.
152
+
153
+ ### Single writer
154
+
155
+ Only one `dirsql` process should write to a given cache file at a time. Multiple read-only processes can query the same file safely once the writer finishes the initial reconcile. Coordinated multi-writer access is not supported in v0.3.0.
156
+
157
+ ## Inspecting the cache
158
+
159
+ The persistent database is a normal SQLite file. You can open it with any SQLite client:
160
+
161
+ ```bash
162
+ sqlite3 .dirsql/cache.db
163
+ ```
164
+
165
+ ```sql
166
+ .tables
167
+ -- comments documents metrics _dirsql_files _dirsql_meta
168
+
169
+ SELECT * FROM _dirsql_meta;
170
+ -- schema_version | 1
171
+ -- dirsql_version | 0.3.0
172
+ -- glob_config_hash | <hex>
173
+ -- parser_versions | {"json":"1","jsonl":"1","csv":"1",...}
174
+ -- root_canonical | /home/alice/my-project
175
+ ```
176
+
177
+ The `_dirsql_files` and `_dirsql_meta` tables are managed by `dirsql`. Do not modify them by hand -- on the next startup, `dirsql` will detect the inconsistency and rebuild from scratch.
@@ -17,6 +17,7 @@ mod python {
17
17
  use pyo3::prelude::*;
18
18
  use pyo3::types::{PyDict, PyList};
19
19
  use std::collections::HashMap;
20
+ use std::path::PathBuf;
20
21
  use std::time::Duration;
21
22
 
22
23
  // -- Public PyO3 classes ------------------------------------------------
@@ -86,13 +87,15 @@ mod python {
86
87
  #[pymethods]
87
88
  impl PyDirSQL {
88
89
  #[new]
89
- #[pyo3(signature = (root=None, *, tables=None, ignore=None, config=None))]
90
+ #[pyo3(signature = (root=None, *, tables=None, ignore=None, config=None, persist=false, persist_path=None))]
90
91
  fn new(
91
92
  py: Python<'_>,
92
93
  root: Option<String>,
93
94
  tables: Option<Vec<PyRef<'_, PyTable>>>,
94
95
  ignore: Option<Vec<String>>,
95
96
  config: Option<String>,
97
+ persist: bool,
98
+ persist_path: Option<PathBuf>,
96
99
  ) -> PyResult<Self> {
97
100
  let rust_tables: Vec<Table> = tables
98
101
  .as_deref()
@@ -114,6 +117,12 @@ mod python {
114
117
  if let Some(c) = config {
115
118
  builder = builder.config(c);
116
119
  }
120
+ if persist {
121
+ builder = builder.persist(true);
122
+ }
123
+ if let Some(p) = persist_path {
124
+ builder = builder.persist_path(p);
125
+ }
117
126
  builder.build()
118
127
  })
119
128
  .map_err(to_py_err)?;
@@ -282,6 +282,11 @@ def describe_DirSQL_async():
282
282
 
283
283
  async def collect_events():
284
284
  async for event in db.watch():
285
+ # Mid-write the watcher can deliver a spurious error
286
+ # event before the real diff lands; only update / delete
287
+ # / insert are meaningful here.
288
+ if event.action not in ("update", "delete", "insert"):
289
+ continue
285
290
  events.append(event)
286
291
  if len(events) >= 1:
287
292
  break
@@ -322,6 +327,8 @@ def describe_DirSQL_async():
322
327
 
323
328
  async def collect_events():
324
329
  async for event in db.watch():
330
+ if event.action != "error":
331
+ continue
325
332
  events.append(event)
326
333
  if len(events) >= 1:
327
334
  break
@@ -369,6 +376,11 @@ def describe_DirSQL_async():
369
376
 
370
377
  async def collect_events():
371
378
  async for event in db.watch():
379
+ # Filter to insert events only: mid-write the watcher can
380
+ # deliver a spurious error/update event before the insert
381
+ # fires, which would race the query below.
382
+ if event.action != "insert":
383
+ continue
372
384
  events.append(event)
373
385
  if len(events) >= 1:
374
386
  break
@@ -27,11 +27,22 @@ class _FakeRustDirSQL:
27
27
 
28
28
  instances: list = []
29
29
 
30
- def __init__(self, root=None, *, tables=None, ignore=None, config=None):
30
+ def __init__(
31
+ self,
32
+ root=None,
33
+ *,
34
+ tables=None,
35
+ ignore=None,
36
+ config=None,
37
+ persist=False,
38
+ persist_path=None,
39
+ ):
31
40
  self.root = root
32
41
  self.tables = tables
33
42
  self.ignore = ignore
34
43
  self.config = config
44
+ self.persist = persist
45
+ self.persist_path = persist_path
35
46
  self.queries: list[str] = []
36
47
  self.query_results: list = (
37
48
  [{"from_config": config}] if config is not None else [{"ok": 1}]
@@ -254,3 +265,26 @@ def describe_binding_layer():
254
265
  db = async_mod.DirSQL("/root", tables=["t"])
255
266
  await db.ready()
256
267
  assert _FakeRustDirSQL.instances[0].ignore is None
268
+
269
+ def describe_persist_kwargs():
270
+ # Feature: persist / persist_path. See docs/guide/persistence.md.
271
+ @pytest.mark.asyncio
272
+ async def it_forwards_persist_kwargs_to_core(mock_core):
273
+ db = async_mod.DirSQL(
274
+ "/root",
275
+ tables=["t"],
276
+ persist=True,
277
+ persist_path="/tmp/cache.db",
278
+ )
279
+ await db.ready()
280
+ inst = _FakeRustDirSQL.instances[0]
281
+ assert inst.persist is True
282
+ assert inst.persist_path == "/tmp/cache.db"
283
+
284
+ @pytest.mark.asyncio
285
+ async def it_defaults_persist_to_false(mock_core):
286
+ db = async_mod.DirSQL("/root", tables=["t"])
287
+ await db.ready()
288
+ inst = _FakeRustDirSQL.instances[0]
289
+ assert inst.persist is False
290
+ assert inst.persist_path is None