dirsql 0.3.44__tar.gz → 0.3.45__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. {dirsql-0.3.44 → dirsql-0.3.45}/Cargo.lock +1 -1
  2. {dirsql-0.3.44 → dirsql-0.3.45}/PKG-INFO +1 -1
  3. {dirsql-0.3.44/packages/python → dirsql-0.3.45}/docs/cli/config.md +82 -0
  4. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/Cargo.toml +1 -1
  5. {dirsql-0.3.44 → dirsql-0.3.45/packages/python}/docs/cli/config.md +82 -0
  6. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/cli/config.md +82 -0
  7. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/config.rs +62 -0
  8. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/lib.rs +213 -19
  9. {dirsql-0.3.44 → dirsql-0.3.45}/Cargo.toml +0 -0
  10. {dirsql-0.3.44 → dirsql-0.3.45}/README.md +0 -0
  11. {dirsql-0.3.44 → dirsql-0.3.45}/dirsql/__init__.py +0 -0
  12. {dirsql-0.3.44 → dirsql-0.3.45}/dirsql/_async.py +0 -0
  13. {dirsql-0.3.44 → dirsql-0.3.45}/dirsql/_dirsql.pyi +0 -0
  14. {dirsql-0.3.44 → dirsql-0.3.45}/dirsql/cli/__init__.py +0 -0
  15. {dirsql-0.3.44 → dirsql-0.3.45}/dirsql/cli/binary_path.py +0 -0
  16. {dirsql-0.3.44 → dirsql-0.3.45}/dirsql/cli/interpret/__init__.py +0 -0
  17. {dirsql-0.3.44 → dirsql-0.3.45}/dirsql/cli/is_windows.py +0 -0
  18. {dirsql-0.3.44 → dirsql-0.3.45}/dirsql/cli/main.py +0 -0
  19. {dirsql-0.3.44 → dirsql-0.3.45}/dirsql/py.typed +0 -0
  20. {dirsql-0.3.44 → dirsql-0.3.45}/docs/.claude/CLAUDE.md +0 -0
  21. {dirsql-0.3.44 → dirsql-0.3.45}/docs/.vitepress/config.ts +0 -0
  22. {dirsql-0.3.44 → dirsql-0.3.45}/docs/.vitepress/theme/index.ts +0 -0
  23. {dirsql-0.3.44 → dirsql-0.3.45}/docs/.vitepress/theme/lang.ts +0 -0
  24. {dirsql-0.3.44 → dirsql-0.3.45}/docs/AGENTS.md +0 -0
  25. {dirsql-0.3.44 → dirsql-0.3.45}/docs/api/index.md +0 -0
  26. {dirsql-0.3.44 → dirsql-0.3.45}/docs/cli/http-api.md +0 -0
  27. {dirsql-0.3.44 → dirsql-0.3.45}/docs/cli/index.md +0 -0
  28. {dirsql-0.3.44 → dirsql-0.3.45}/docs/cli/init.md +0 -0
  29. {dirsql-0.3.44 → dirsql-0.3.45}/docs/cli/server.md +0 -0
  30. {dirsql-0.3.44 → dirsql-0.3.45}/docs/getting-started.md +0 -0
  31. {dirsql-0.3.44 → dirsql-0.3.45}/docs/guide/async.md +0 -0
  32. {dirsql-0.3.44 → dirsql-0.3.45}/docs/guide/crdt.md +0 -0
  33. {dirsql-0.3.44 → dirsql-0.3.45}/docs/guide/persistence.md +0 -0
  34. {dirsql-0.3.44 → dirsql-0.3.45}/docs/guide/querying.md +0 -0
  35. {dirsql-0.3.44 → dirsql-0.3.45}/docs/guide/tables.md +0 -0
  36. {dirsql-0.3.44 → dirsql-0.3.45}/docs/guide/watching.md +0 -0
  37. {dirsql-0.3.44 → dirsql-0.3.45}/docs/index.md +0 -0
  38. {dirsql-0.3.44 → dirsql-0.3.45}/docs/migrations.md +0 -0
  39. {dirsql-0.3.44 → dirsql-0.3.45}/docs/package.json +0 -0
  40. {dirsql-0.3.44 → dirsql-0.3.45}/docs/playwright.config.ts +0 -0
  41. {dirsql-0.3.44 → dirsql-0.3.45}/docs/pnpm-lock.yaml +0 -0
  42. {dirsql-0.3.44 → dirsql-0.3.45}/docs/pnpm-workspace.yaml +0 -0
  43. {dirsql-0.3.44 → dirsql-0.3.45}/docs/tests/integration/home.spec.ts +0 -0
  44. {dirsql-0.3.44 → dirsql-0.3.45}/docs/tests/integration/language-flag.spec.ts +0 -0
  45. {dirsql-0.3.44 → dirsql-0.3.45}/docs/tests/integration/sidebar.spec.ts +0 -0
  46. {dirsql-0.3.44 → dirsql-0.3.45}/docs/tests/unit/config.test.ts +0 -0
  47. {dirsql-0.3.44 → dirsql-0.3.45}/docs/tests/unit/lang.test.ts +0 -0
  48. {dirsql-0.3.44 → dirsql-0.3.45}/docs/vitest.config.ts +0 -0
  49. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/README.md +0 -0
  50. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/conftest.py +0 -0
  51. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/.claude/CLAUDE.md +0 -0
  52. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/.vitepress/config.ts +0 -0
  53. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/.vitepress/theme/index.ts +0 -0
  54. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/.vitepress/theme/lang.ts +0 -0
  55. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/AGENTS.md +0 -0
  56. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/api/index.md +0 -0
  57. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/cli/http-api.md +0 -0
  58. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/cli/index.md +0 -0
  59. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/cli/init.md +0 -0
  60. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/cli/server.md +0 -0
  61. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/getting-started.md +0 -0
  62. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/guide/async.md +0 -0
  63. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/guide/crdt.md +0 -0
  64. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/guide/persistence.md +0 -0
  65. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/guide/querying.md +0 -0
  66. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/guide/tables.md +0 -0
  67. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/guide/watching.md +0 -0
  68. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/index.md +0 -0
  69. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/migrations.md +0 -0
  70. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/package.json +0 -0
  71. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/playwright.config.ts +0 -0
  72. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/pnpm-lock.yaml +0 -0
  73. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/pnpm-workspace.yaml +0 -0
  74. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/tests/integration/home.spec.ts +0 -0
  75. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/tests/integration/language-flag.spec.ts +0 -0
  76. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/tests/integration/sidebar.spec.ts +0 -0
  77. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/tests/unit/config.test.ts +0 -0
  78. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/tests/unit/lang.test.ts +0 -0
  79. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/docs/vitest.config.ts +0 -0
  80. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/e2e-attestation.json +0 -0
  81. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/src/lib.rs +0 -0
  82. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/tests/__init__.py +0 -0
  83. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/tests/conftest.py +0 -0
  84. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/tests/e2e/__init__.py +0 -0
  85. {dirsql-0.3.44 → dirsql-0.3.45}/packages/python/tests/integration/__init__.py +0 -0
  86. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/Cargo.toml +0 -0
  87. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/README.md +0 -0
  88. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/benches/db_bench.rs +0 -0
  89. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/benches/differ_bench.rs +0 -0
  90. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/benches/matcher_bench.rs +0 -0
  91. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/benches/scanner_bench.rs +0 -0
  92. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/api/index.md +0 -0
  93. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/cli/http-api.md +0 -0
  94. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/cli/index.md +0 -0
  95. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/cli/init.md +0 -0
  96. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/cli/server.md +0 -0
  97. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/getting-started.md +0 -0
  98. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/guide/async.md +0 -0
  99. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/guide/crdt.md +0 -0
  100. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/guide/persistence.md +0 -0
  101. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/guide/querying.md +0 -0
  102. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/guide/tables.md +0 -0
  103. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/guide/watching.md +0 -0
  104. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/index.md +0 -0
  105. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/docs/migrations.md +0 -0
  106. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/bin/dirsql.rs +0 -0
  107. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/cli/init.rs +0 -0
  108. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/cli/mod.rs +0 -0
  109. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/cli/router.rs +0 -0
  110. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/cli/serialize.rs +0 -0
  111. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/cli/server.rs +0 -0
  112. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/command.rs +0 -0
  113. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/db.rs +0 -0
  114. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/differ.rs +0 -0
  115. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/matcher.rs +0 -0
  116. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/persist.rs +0 -0
  117. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/scanner.rs +0 -0
  118. {dirsql-0.3.44 → dirsql-0.3.45}/packages/rust/src/watcher.rs +0 -0
  119. {dirsql-0.3.44 → dirsql-0.3.45}/pyproject.toml +0 -0
@@ -501,7 +501,7 @@ dependencies = [
501
501
 
502
502
  [[package]]
503
503
  name = "dirsql-py-ext"
504
- version = "0.3.44"
504
+ version = "0.3.45"
505
505
  dependencies = [
506
506
  "dirsql",
507
507
  "pyo3",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dirsql
3
- Version: 0.3.44
3
+ Version: 0.3.45
4
4
  Requires-Dist: pytest>=8 ; extra == 'dev'
5
5
  Requires-Dist: pytest-describe>=2 ; extra == 'dev'
6
6
  Requires-Dist: pytest-asyncio>=0.23 ; extra == 'dev'
@@ -191,6 +191,57 @@ always filtered to the DDL's declared columns regardless. Strict mode
191
191
  applies only to keys produced by an extract callback (relevant for
192
192
  programmatic [tables](../guide/tables.md)).
193
193
 
194
+ ### Per-file commands (`on-file`)
195
+
196
+ Reach for `on-file` when a table's rows come from the *contents* of each
197
+ matched file, not just its path and stat metadata. A filesystem-fact table
198
+ gives you one row per file; `on-file` runs a command per file that reads the
199
+ file and emits as many rows as it likes.
200
+
201
+ ```toml
202
+ [[table]]
203
+ ddl = "CREATE TABLE papers (paper_id TEXT, title TEXT)"
204
+ glob = "**/meta.json"
205
+ on-file = "uv run python extract_papers.py {path}"
206
+ ```
207
+
208
+ For every file matched by `glob`, `dirsql` runs the command. **The command
209
+ reads the file itself and prints a JSON array of row objects on stdout**; each
210
+ object becomes one row, its fields mapped to columns:
211
+
212
+ ```json
213
+ [
214
+ { "paper_id": "arXiv:2401.001", "title": "On Directories" },
215
+ { "paper_id": "arXiv:2401.002", "title": "SQL All The Way Down" }
216
+ ]
217
+ ```
218
+
219
+ Placeholders substituted into the command:
220
+
221
+ | Placeholder | Value |
222
+ |-------------|-------|
223
+ | `{path}` | The matched file's path **relative to the index root**. Appended automatically when the command omits it, so `extract.py` and `extract.py {path}` behave identically. |
224
+ | `{abspath}` | The matched file's absolute path. |
225
+ | `{root}` | The index root directory. |
226
+
227
+ Filesystem facts (stat virtuals and glob captures) are still merged onto every
228
+ `on-file` row, so you can declare `_path`, `_basename`, `{capture}`, etc. in the
229
+ DDL alongside the command's own columns — a column emitted by the command wins
230
+ over a same-named filesystem fact.
231
+
232
+ JSON values map to SQLite as follows: `null` → NULL; `true`/`false` → `1`/`0`;
233
+ an integer → INTEGER, any other number → REAL; a string → TEXT; a nested array
234
+ or object → its JSON text as TEXT.
235
+
236
+ **Per-file error isolation.** If a file's command fails — a non-zero exit, a
237
+ timeout, a spawn error, or output that isn't a JSON array of objects — that
238
+ file is skipped (it contributes no rows) and a one-line warning naming the file
239
+ and the error is written to stderr. One bad file never aborts the scan; the
240
+ other files' rows are indexed normally.
241
+
242
+ See [Command execution](#command-execution) for the full contract (argv
243
+ splitting, injection safety, cwd, environment, timeout, and output framing).
244
+
194
245
  ### Full Example
195
246
 
196
247
  ```toml
@@ -209,3 +260,34 @@ glob = "**/index.md"
209
260
  ddl = "CREATE TABLE logs (_path TEXT, _size INTEGER, _mtime INTEGER)"
210
261
  glob = "logs/*.csv"
211
262
  ```
263
+
264
+ ## Command execution
265
+
266
+ Config keys that run an external command — today `on-file`, with more events to
267
+ follow — share one execution contract:
268
+
269
+ - **argv, not a shell.** The command string is split into an argv with
270
+ shell-like quoting (spaces separate arguments; quotes group them), but **no
271
+ shell is invoked** — there is no globbing, piping, `$VAR` expansion, or
272
+ `&&`/`;` chaining. To get those, ask for a shell explicitly:
273
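+ `sh -c 'grep foo {path} | sort'` — the quoted script stays a single argument. Because that shell then parses the substituted text itself, pass untrusted values as separate arguments instead: `sh -c 'grep foo "$1" | sort' sh {path}`.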
274
+ - **Injection-safe placeholders.** Each placeholder (`{path}`, `{abspath}`,
275
+ `{root}`, …) is substituted into whole argv tokens, every occurrence, in a
276
+ single left-to-right pass. A substituted value is always exactly one argv
277
+ element, so a path with spaces — or untrusted content that itself contains
278
+ `{…}` or shell metacharacters — is inert and never re-scanned. An unknown
279
+ `{…}` is left literal.
280
+ - **Working directory.** The command runs in the **config file's directory**,
281
+ so relative paths in the command resolve predictably regardless of where you
282
+ launched `dirsql`.
283
+ - **Environment.** The command inherits `dirsql`'s environment, so tools like
284
+ `uvx --with …` / `npx …` resolve their dependencies as usual.
285
+ - **Output framing.** The command's result is the **last non-empty line of
286
+ stdout**; any log/chatter lines above it are ignored. stderr is never data —
287
+ it is captured only to enrich error messages.
288
+ - **Timeout.** Each command run is bounded by a fixed **30-second** timeout (no
289
+ per-table override yet); a command that exceeds it is killed and treated as a
290
+ failure.
291
+ - **Errors.** A non-zero exit, a timeout, a spawn failure, or output that does
292
+ not parse as expected is a per-file failure: the file is skipped with a
293
+ stderr warning and the scan continues.
@@ -4,7 +4,7 @@ name = "dirsql-py-ext"
4
4
  # pypi/maturin handler can rewrite it via `write-version` before
5
5
  # `maturin build`. `pyproject.toml` declares `dynamic = ["version"]`
6
6
  # and maturin reads this field. Mirrors `packages/rust/Cargo.toml`.
7
- version = "0.3.44"
7
+ version = "0.3.45"
8
8
  edition.workspace = true
9
9
  publish = false
10
10
  readme = "README.md"
@@ -191,6 +191,57 @@ always filtered to the DDL's declared columns regardless. Strict mode
191
191
  applies only to keys produced by an extract callback (relevant for
192
192
  programmatic [tables](../guide/tables.md)).
193
193
 
194
+ ### Per-file commands (`on-file`)
195
+
196
+ Reach for `on-file` when a table's rows come from the *contents* of each
197
+ matched file, not just its path and stat metadata. A filesystem-fact table
198
+ gives you one row per file; `on-file` runs a command per file that reads the
199
+ file and emits as many rows as it likes.
200
+
201
+ ```toml
202
+ [[table]]
203
+ ddl = "CREATE TABLE papers (paper_id TEXT, title TEXT)"
204
+ glob = "**/meta.json"
205
+ on-file = "uv run python extract_papers.py {path}"
206
+ ```
207
+
208
+ For every file matched by `glob`, `dirsql` runs the command. **The command
209
+ reads the file itself and prints a JSON array of row objects on stdout**; each
210
+ object becomes one row, its fields mapped to columns:
211
+
212
+ ```json
213
+ [
214
+ { "paper_id": "arXiv:2401.001", "title": "On Directories" },
215
+ { "paper_id": "arXiv:2401.002", "title": "SQL All The Way Down" }
216
+ ]
217
+ ```
218
+
219
+ Placeholders substituted into the command:
220
+
221
+ | Placeholder | Value |
222
+ |-------------|-------|
223
+ | `{path}` | The matched file's path **relative to the index root**. Appended automatically when the command omits it, so `extract.py` and `extract.py {path}` behave identically. |
224
+ | `{abspath}` | The matched file's absolute path. |
225
+ | `{root}` | The index root directory. |
226
+
227
+ Filesystem facts (stat virtuals and glob captures) are still merged onto every
228
+ `on-file` row, so you can declare `_path`, `_basename`, `{capture}`, etc. in the
229
+ DDL alongside the command's own columns — a column emitted by the command wins
230
+ over a same-named filesystem fact.
231
+
232
+ JSON values map to SQLite as follows: `null` → NULL; `true`/`false` → `1`/`0`;
233
+ an integer → INTEGER, any other number → REAL; a string → TEXT; a nested array
234
+ or object → its JSON text as TEXT.
235
+
236
+ **Per-file error isolation.** If a file's command fails — a non-zero exit, a
237
+ timeout, a spawn error, or output that isn't a JSON array of objects — that
238
+ file is skipped (it contributes no rows) and a one-line warning naming the file
239
+ and the error is written to stderr. One bad file never aborts the scan; the
240
+ other files' rows are indexed normally.
241
+
242
+ See [Command execution](#command-execution) for the full contract (argv
243
+ splitting, injection safety, cwd, environment, timeout, and output framing).
244
+
194
245
  ### Full Example
195
246
 
196
247
  ```toml
@@ -209,3 +260,34 @@ glob = "**/index.md"
209
260
  ddl = "CREATE TABLE logs (_path TEXT, _size INTEGER, _mtime INTEGER)"
210
261
  glob = "logs/*.csv"
211
262
  ```
263
+
264
+ ## Command execution
265
+
266
+ Config keys that run an external command — today `on-file`, with more events to
267
+ follow — share one execution contract:
268
+
269
+ - **argv, not a shell.** The command string is split into an argv with
270
+ shell-like quoting (spaces separate arguments; quotes group them), but **no
271
+ shell is invoked** — there is no globbing, piping, `$VAR` expansion, or
272
+ `&&`/`;` chaining. To get those, ask for a shell explicitly:
273
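+ `sh -c 'grep foo {path} | sort'` — the quoted script stays a single argument. Because that shell then parses the substituted text itself, pass untrusted values as separate arguments instead: `sh -c 'grep foo "$1" | sort' sh {path}`.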
274
+ - **Injection-safe placeholders.** Each placeholder (`{path}`, `{abspath}`,
275
+ `{root}`, …) is substituted into whole argv tokens, every occurrence, in a
276
+ single left-to-right pass. A substituted value is always exactly one argv
277
+ element, so a path with spaces — or untrusted content that itself contains
278
+ `{…}` or shell metacharacters — is inert and never re-scanned. An unknown
279
+ `{…}` is left literal.
280
+ - **Working directory.** The command runs in the **config file's directory**,
281
+ so relative paths in the command resolve predictably regardless of where you
282
+ launched `dirsql`.
283
+ - **Environment.** The command inherits `dirsql`'s environment, so tools like
284
+ `uvx --with …` / `npx …` resolve their dependencies as usual.
285
+ - **Output framing.** The command's result is the **last non-empty line of
286
+ stdout**; any log/chatter lines above it are ignored. stderr is never data —
287
+ it is captured only to enrich error messages.
288
+ - **Timeout.** Each command run is bounded by a fixed **30-second** timeout (no
289
+ per-table override yet); a command that exceeds it is killed and treated as a
290
+ failure.
291
+ - **Errors.** A non-zero exit, a timeout, a spawn failure, or output that does
292
+ not parse as expected is a per-file failure: the file is skipped with a
293
+ stderr warning and the scan continues.
@@ -191,6 +191,57 @@ always filtered to the DDL's declared columns regardless. Strict mode
191
191
  applies only to keys produced by an extract callback (relevant for
192
192
  programmatic [tables](../guide/tables.md)).
193
193
 
194
+ ### Per-file commands (`on-file`)
195
+
196
+ Reach for `on-file` when a table's rows come from the *contents* of each
197
+ matched file, not just its path and stat metadata. A filesystem-fact table
198
+ gives you one row per file; `on-file` runs a command per file that reads the
199
+ file and emits as many rows as it likes.
200
+
201
+ ```toml
202
+ [[table]]
203
+ ddl = "CREATE TABLE papers (paper_id TEXT, title TEXT)"
204
+ glob = "**/meta.json"
205
+ on-file = "uv run python extract_papers.py {path}"
206
+ ```
207
+
208
+ For every file matched by `glob`, `dirsql` runs the command. **The command
209
+ reads the file itself and prints a JSON array of row objects on stdout**; each
210
+ object becomes one row, its fields mapped to columns:
211
+
212
+ ```json
213
+ [
214
+ { "paper_id": "arXiv:2401.001", "title": "On Directories" },
215
+ { "paper_id": "arXiv:2401.002", "title": "SQL All The Way Down" }
216
+ ]
217
+ ```
218
+
219
+ Placeholders substituted into the command:
220
+
221
+ | Placeholder | Value |
222
+ |-------------|-------|
223
+ | `{path}` | The matched file's path **relative to the index root**. Appended automatically when the command omits it, so `extract.py` and `extract.py {path}` behave identically. |
224
+ | `{abspath}` | The matched file's absolute path. |
225
+ | `{root}` | The index root directory. |
226
+
227
+ Filesystem facts (stat virtuals and glob captures) are still merged onto every
228
+ `on-file` row, so you can declare `_path`, `_basename`, `{capture}`, etc. in the
229
+ DDL alongside the command's own columns — a column emitted by the command wins
230
+ over a same-named filesystem fact.
231
+
232
+ JSON values map to SQLite as follows: `null` → NULL; `true`/`false` → `1`/`0`;
233
+ an integer → INTEGER, any other number → REAL; a string → TEXT; a nested array
234
+ or object → its JSON text as TEXT.
235
+
236
+ **Per-file error isolation.** If a file's command fails — a non-zero exit, a
237
+ timeout, a spawn error, or output that isn't a JSON array of objects — that
238
+ file is skipped (it contributes no rows) and a one-line warning naming the file
239
+ and the error is written to stderr. One bad file never aborts the scan; the
240
+ other files' rows are indexed normally.
241
+
242
+ See [Command execution](#command-execution) for the full contract (argv
243
+ splitting, injection safety, cwd, environment, timeout, and output framing).
244
+
194
245
  ### Full Example
195
246
 
196
247
  ```toml
@@ -209,3 +260,34 @@ glob = "**/index.md"
209
260
  ddl = "CREATE TABLE logs (_path TEXT, _size INTEGER, _mtime INTEGER)"
210
261
  glob = "logs/*.csv"
211
262
  ```
263
+
264
+ ## Command execution
265
+
266
+ Config keys that run an external command — today `on-file`, with more events to
267
+ follow — share one execution contract:
268
+
269
+ - **argv, not a shell.** The command string is split into an argv with
270
+ shell-like quoting (spaces separate arguments; quotes group them), but **no
271
+ shell is invoked** — there is no globbing, piping, `$VAR` expansion, or
272
+ `&&`/`;` chaining. To get those, ask for a shell explicitly:
273
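+ `sh -c 'grep foo {path} | sort'` — the quoted script stays a single argument. Because that shell then parses the substituted text itself, pass untrusted values as separate arguments instead: `sh -c 'grep foo "$1" | sort' sh {path}`.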
274
+ - **Injection-safe placeholders.** Each placeholder (`{path}`, `{abspath}`,
275
+ `{root}`, …) is substituted into whole argv tokens, every occurrence, in a
276
+ single left-to-right pass. A substituted value is always exactly one argv
277
+ element, so a path with spaces — or untrusted content that itself contains
278
+ `{…}` or shell metacharacters — is inert and never re-scanned. An unknown
279
+ `{…}` is left literal.
280
+ - **Working directory.** The command runs in the **config file's directory**,
281
+ so relative paths in the command resolve predictably regardless of where you
282
+ launched `dirsql`.
283
+ - **Environment.** The command inherits `dirsql`'s environment, so tools like
284
+ `uvx --with …` / `npx …` resolve their dependencies as usual.
285
+ - **Output framing.** The command's result is the **last non-empty line of
286
+ stdout**; any log/chatter lines above it are ignored. stderr is never data —
287
+ it is captured only to enrich error messages.
288
+ - **Timeout.** Each command run is bounded by a fixed **30-second** timeout (no
289
+ per-table override yet); a command that exceeds it is killed and treated as a
290
+ failure.
291
+ - **Errors.** A non-zero exit, a timeout, a spawn failure, or output that does
292
+ not parse as expected is a per-file failure: the file is skipped with a
293
+ stderr warning and the scan continues.
@@ -16,6 +16,9 @@ pub enum ConfigError {
16
16
 
17
17
  #[error("Missing required field '{0}' in [[dirsql.extension]] entry")]
18
18
  MissingExtensionField(&'static str),
19
+
20
+ #[error("Field '{0}' in [[table]] entry must not be empty")]
21
+ EmptyField(&'static str),
19
22
  }
20
23
 
21
24
  pub type Result<T> = std::result::Result<T, ConfigError>;
@@ -75,6 +78,11 @@ pub struct TableConfig {
75
78
  pub ddl: String,
76
79
  pub glob: String,
77
80
  pub strict: Option<bool>,
81
+ /// Optional per-file command (`on-file`). When set, each matched file's
82
+ /// rows come from running this command (which reads the file and prints a
83
+ /// JSON array of row objects) instead of the empty filesystem-facts-only
84
+ /// row. See `dirsql::command` for the execution contract.
85
+ pub on_file: Option<String>,
78
86
  }
79
87
 
80
88
  // --- Raw deserialization types (serde) ---
@@ -105,6 +113,8 @@ struct RawTable {
105
113
  ddl: Option<String>,
106
114
  glob: Option<String>,
107
115
  strict: Option<bool>,
116
+ #[serde(rename = "on-file")]
117
+ on_file: Option<String>,
108
118
  }
109
119
 
110
120
  /// Load and parse a `.dirsql.toml` config file from the given path.
@@ -149,10 +159,20 @@ pub fn load_config_str(content: &str) -> Result<Config> {
149
159
  let ddl = raw_table.ddl.ok_or(ConfigError::MissingField("ddl"))?;
150
160
  let glob = raw_table.glob.ok_or(ConfigError::MissingField("glob"))?;
151
161
 
162
+ // A present-but-empty `on-file = ""` is as unusable as a missing key:
163
+ // reject it at parse time rather than spawning an empty command later.
164
+ let on_file = match raw_table.on_file {
165
+ Some(cmd) if cmd.trim().is_empty() => {
166
+ return Err(ConfigError::EmptyField("on-file"));
167
+ }
168
+ other => other,
169
+ };
170
+
152
171
  tables.push(TableConfig {
153
172
  ddl,
154
173
  glob,
155
174
  strict: raw_table.strict,
175
+ on_file,
156
176
  });
157
177
  }
158
178
 
@@ -444,6 +464,48 @@ path = "b.so"
444
464
  assert_eq!(config.extensions[1].path, PathBuf::from("b.so"));
445
465
  }
446
466
 
467
+ #[test]
468
+ fn on_file_parses_when_present() {
469
+ let toml = r#"
470
+ [[table]]
471
+ ddl = "CREATE TABLE papers (paper_id TEXT, title TEXT)"
472
+ glob = "**/meta.json"
473
+ on-file = "uv run python extract_papers.py {path}"
474
+ "#;
475
+ let config = load_config_str(toml).unwrap();
476
+ assert_eq!(config.tables.len(), 1);
477
+ assert_eq!(
478
+ config.tables[0].on_file.as_deref(),
479
+ Some("uv run python extract_papers.py {path}")
480
+ );
481
+ }
482
+
483
+ #[test]
484
+ fn on_file_absent_is_none() {
485
+ let toml = r#"
486
+ [[table]]
487
+ ddl = "CREATE TABLE t (_path TEXT)"
488
+ glob = "*.json"
489
+ "#;
490
+ let config = load_config_str(toml).unwrap();
491
+ assert!(config.tables[0].on_file.is_none());
492
+ }
493
+
494
+ #[test]
495
+ fn on_file_empty_errors() {
496
+ let toml = r#"
497
+ [[table]]
498
+ ddl = "CREATE TABLE t (_path TEXT)"
499
+ glob = "*.json"
500
+ on-file = " "
501
+ "#;
502
+ let err = load_config_str(toml).unwrap_err();
503
+ assert!(
504
+ matches!(err, ConfigError::EmptyField("on-file")),
505
+ "got: {err:?}"
506
+ );
507
+ }
508
+
447
509
  #[test]
448
510
  fn extension_empty_path_errors() {
449
511
  // An empty `path = ""` is as unusable as a missing key — it must be
@@ -27,6 +27,7 @@ pub mod watcher;
27
27
  #[cfg(feature = "cli")]
28
28
  pub mod cli;
29
29
 
30
+ use crate::command::Placeholder;
30
31
  use crate::db::{Db, parse_table_name};
31
32
  use crate::matcher::TableMatcher;
32
33
  use crate::persist::{
@@ -1038,18 +1039,20 @@ impl DirSQLBuilder {
1038
1039
  .parent()
1039
1040
  .map(PathBuf::from)
1040
1041
  .unwrap_or_else(|| PathBuf::from("."));
1041
- if let Some(cfg_root) = cfg.root.clone() {
1042
- let resolved = if cfg_root.is_absolute() {
1042
+ let resolved_root = if let Some(cfg_root) = cfg.root.clone() {
1043
+ if cfg_root.is_absolute() {
1043
1044
  cfg_root
1044
1045
  } else {
1045
1046
  cfg_parent.join(cfg_root)
1046
- };
1047
- config_root = Some(resolved);
1047
+ }
1048
1048
  } else {
1049
- config_root = Some(cfg_parent.clone());
1050
- }
1049
+ cfg_parent.clone()
1050
+ };
1051
+ config_root = Some(resolved_root.clone());
1051
1052
 
1052
- let cfg_tables = build_tables_from_config(&cfg)?;
1053
+ // `on-file` commands run in the config file's directory and compute
1054
+ // `{path}` relative to the resolved index root.
1055
+ let cfg_tables = build_tables_from_config(&cfg, &cfg_parent, &resolved_root)?;
1053
1056
  tables.extend(cfg_tables);
1054
1057
  ignore.extend(cfg.ignore);
1055
1058
 
@@ -1433,25 +1436,53 @@ fn relative_path(root: &Path, path: &Path) -> String {
1433
1436
  .to_string()
1434
1437
  }
1435
1438
 
1439
+ /// Fixed timeout for an `on-file` command. There is no per-table timeout key
1440
+ /// yet (#327); this module constant is the documented current default.
1441
+ const ON_FILE_TIMEOUT: Duration = Duration::from_secs(30);
1442
+
1436
1443
  /// Build [`Table`] objects from a parsed config.
1437
1444
  ///
1438
- /// Config-defined tables produce one row per matched file. The row is built
1445
+ /// A plain config-defined table produces one row per matched file built
1439
1446
  /// entirely from filesystem facts: glob path captures and stat virtuals
1440
1447
  /// (`_path`, `_basename`, `_dir`, `_ext`, `_size`, `_mtime`, `_ctime`) are
1441
- /// injected by the core pipeline ([`merge_filesystem_facts`]). The
1442
- /// synthesized extract therefore emits a single empty row per file; the
1443
- /// fact-injection layer fills it in. Content interpretation is not a dirsql
1444
- /// concern — for that, register a programmatic [`Table`] with your own
1445
- /// extract closure.
1446
- fn build_tables_from_config(cfg: &config::Config) -> Result<Vec<Table>> {
1448
+ /// injected by the core pipeline ([`merge_filesystem_facts`]). Its synthesized
1449
+ /// extract emits a single empty row per file; the fact-injection layer fills it
1450
+ /// in.
1451
+ ///
1452
+ /// A table with an `on-file` command instead runs that command once per matched
1453
+ /// file (see [`run_on_file`]): the command reads the file and prints a JSON
1454
+ /// array of row objects on stdout, which becomes the file's rows (filesystem
1455
+ /// facts are still merged on top, user values winning). `config_dir` is the
1456
+ /// command's working directory (the config file's parent) and `root` is the
1457
+ /// resolved index root used to compute the `{path}` placeholder.
1458
+ fn build_tables_from_config(
1459
+ cfg: &config::Config,
1460
+ config_dir: &Path,
1461
+ root: &Path,
1462
+ ) -> Result<Vec<Table>> {
1447
1463
  let mut tables = Vec::with_capacity(cfg.tables.len());
1448
1464
 
1449
1465
  for table_cfg in &cfg.tables {
1450
- let mut table = Table::new(
1451
- table_cfg.ddl.clone(),
1452
- table_cfg.glob.clone(),
1453
- |_path: &str| vec![Row::new()],
1454
- );
1466
+ let mut table = match &table_cfg.on_file {
1467
+ Some(command) => {
1468
+ let command = command.clone();
1469
+ let config_dir = config_dir.to_path_buf();
1470
+ let root = root.to_path_buf();
1471
+ // `Table::new` (infallible): `run_on_file` isolates its own
1472
+ // errors to an empty row set so one bad file never aborts the
1473
+ // scan (the scan aborts on an extract `Err`).
1474
+ Table::new(
1475
+ table_cfg.ddl.clone(),
1476
+ table_cfg.glob.clone(),
1477
+ move |abs_path: &str| run_on_file(&command, abs_path, &config_dir, &root),
1478
+ )
1479
+ }
1480
+ None => Table::new(
1481
+ table_cfg.ddl.clone(),
1482
+ table_cfg.glob.clone(),
1483
+ |_path: &str| vec![Row::new()],
1484
+ ),
1485
+ };
1455
1486
 
1456
1487
  if table_cfg.strict == Some(true) {
1457
1488
  table.strict = true;
@@ -1463,6 +1494,89 @@ fn build_tables_from_config(cfg: &config::Config) -> Result<Vec<Table>> {
1463
1494
  Ok(tables)
1464
1495
  }
1465
1496
 
1497
+ /// Run a table's `on-file` command for one matched file and parse its output
1498
+ /// into rows.
1499
+ ///
1500
+ /// Placeholders: `{path}` (the file relative to `root`, append-if-absent so
1501
+ /// `cmd` and `cmd {path}` behave identically), `{abspath}` (the absolute path),
1502
+ /// and `{root}` (the index root). The relative path is computed with a single
1503
+ /// [`Path::strip_prefix`] (#251/#252), falling back to the absolute path when
1504
+ /// the file is not under `root`.
1505
+ ///
1506
+ /// Per-file isolation: any failure — a spawn/exit/timeout error from
1507
+ /// [`command::run_command`], or output that is not a JSON array of objects —
1508
+ /// is logged to stderr and yields no rows (`vec![]`). Returning `Err` here
1509
+ /// would abort the whole scan, so it never does.
1510
+ fn run_on_file(command: &str, abs_path: &str, config_dir: &Path, root: &Path) -> Vec<Row> {
1511
+ let abs = Path::new(abs_path);
1512
+ let rel = abs
1513
+ .strip_prefix(root)
1514
+ .map(|p| p.to_string_lossy().into_owned())
1515
+ .unwrap_or_else(|_| abs_path.to_string());
1516
+ let placeholders = [
1517
+ Placeholder::append("path", rel),
1518
+ Placeholder::new("abspath", abs_path),
1519
+ Placeholder::new("root", root.to_string_lossy().into_owned()),
1520
+ ];
1521
+
1522
+ match command::run_command(command, &placeholders, config_dir, ON_FILE_TIMEOUT, None) {
1523
+ Ok(output) => match parse_command_rows(&output.payload) {
1524
+ Ok(rows) => rows,
1525
+ Err(message) => {
1526
+ eprintln!(
1527
+ "dirsql: skipping `{abs_path}`: on-file output was not a JSON array of rows: {message}"
1528
+ );
1529
+ Vec::new()
1530
+ }
1531
+ },
1532
+ Err(error) => {
1533
+ eprintln!("dirsql: skipping `{abs_path}`: on-file command failed: {error}");
1534
+ Vec::new()
1535
+ }
1536
+ }
1537
+ }
1538
+
1539
+ /// Parse an `on-file` command's stdout payload — a JSON array of row objects —
1540
+ /// into [`Row`]s. Returns `Err(msg)` when the top-level JSON is not an array or
1541
+ /// any element is not an object. Pure (no IO), so it stays colocated-unit-
1542
+ /// testable; the effectful spawn lives in [`run_on_file`].
1543
+ fn parse_command_rows(payload: &str) -> std::result::Result<Vec<Row>, String> {
1544
+ let parsed: serde_json::Value =
1545
+ serde_json::from_str(payload).map_err(|e| format!("invalid JSON: {e}"))?;
1546
+ let array = parsed
1547
+ .as_array()
1548
+ .ok_or_else(|| "expected a JSON array of row objects".to_string())?;
1549
+
1550
+ let mut rows = Vec::with_capacity(array.len());
1551
+ for element in array {
1552
+ let object = element
1553
+ .as_object()
1554
+ .ok_or_else(|| "expected each array element to be a JSON object".to_string())?;
1555
+ let mut row = Row::with_capacity(object.len());
1556
+ for (key, value) in object {
1557
+ row.insert(key.clone(), json_to_value(value));
1558
+ }
1559
+ rows.push(row);
1560
+ }
1561
+ Ok(rows)
1562
+ }
1563
+
1564
+ /// Map a JSON value to a SQLite [`Value`]: `null` → `Null`; `bool` → `Integer`
1565
+ /// (0/1); an integral number → `Integer`, otherwise `Real`; `string` → `Text`;
1566
+ /// an array/object → its JSON text as `Text`. Pure.
1567
+ fn json_to_value(value: &serde_json::Value) -> Value {
1568
+ match value {
1569
+ serde_json::Value::Null => Value::Null,
1570
+ serde_json::Value::Bool(b) => Value::Integer(i64::from(*b)),
1571
+ serde_json::Value::Number(n) => match n.as_i64() {
1572
+ Some(i) => Value::Integer(i),
1573
+ None => Value::Real(n.as_f64().unwrap_or(f64::NAN)),
1574
+ },
1575
+ serde_json::Value::String(s) => Value::Text(s.clone()),
1576
+ other => Value::Text(other.to_string()),
1577
+ }
1578
+ }
1579
+
1466
1580
  /// Reserved column names for filesystem-derived virtual columns. These are
1467
1581
  /// always available on every row when declared in the table DDL; if not
1468
1582
  /// declared, they are silently dropped during normalization.
@@ -2519,3 +2633,83 @@ mod internal_tests {
2519
2633
  assert!(rx.try_recv().is_err(), "loop should have ended");
2520
2634
  }
2521
2635
  }
2636
+
2637
/// Unit tests for the pure `on-file` payload helpers: [`parse_command_rows`]
/// and [`json_to_value`].
#[cfg(test)]
mod command_rows_tests {
    use super::*;

    /// Two well-formed row objects yield two rows with typed cells.
    #[test]
    fn parses_an_array_of_row_objects() {
        let payload = r#"[{"id":"a","n":1},{"id":"b","n":2}]"#;
        let parsed = parse_command_rows(payload).unwrap();
        assert_eq!(parsed.len(), 2);

        let first = &parsed[0];
        assert_eq!(first["id"], Value::Text("a".into()));
        assert_eq!(first["n"], Value::Integer(1));

        let second = &parsed[1];
        assert_eq!(second["id"], Value::Text("b".into()));
        assert_eq!(second["n"], Value::Integer(2));
    }

    /// An empty JSON array is valid and produces no rows.
    #[test]
    fn parses_an_empty_array_to_no_rows() {
        let parsed = parse_command_rows("[]").unwrap();
        assert_eq!(parsed, Vec::<Row>::new());
    }

    /// Every JSON value kind maps to its SQLite counterpart; nested
    /// arrays/objects are stored as their JSON text.
    #[test]
    fn maps_every_json_value_type_including_nested_to_text_json() {
        let payload = r#"[{"nul":null,"t":true,"f":false,"i":42,"r":1.5,"s":"hi","arr":[1,2],"obj":{"k":"v"}}]"#;
        let parsed = parse_command_rows(payload).unwrap();
        let cells = &parsed[0];

        assert_eq!(cells["nul"], Value::Null);
        assert_eq!(cells["t"], Value::Integer(1));
        assert_eq!(cells["f"], Value::Integer(0));
        assert_eq!(cells["i"], Value::Integer(42));
        assert_eq!(cells["r"], Value::Real(1.5));
        assert_eq!(cells["s"], Value::Text("hi".into()));
        assert_eq!(cells["arr"], Value::Text("[1,2]".into()));
        assert_eq!(cells["obj"], Value::Text(r#"{"k":"v"}"#.into()));
    }

    /// 10^19 exceeds i64::MAX (~9.2e18) but fits u64, so `as_i64` is `None`
    /// and the value falls through to `Real`.
    #[test]
    fn a_number_that_does_not_fit_i64_becomes_real() {
        let parsed = parse_command_rows(r#"[{"big":10000000000000000000}]"#).unwrap();
        let big = &parsed[0]["big"];
        assert!(matches!(big, Value::Real(_)));
    }

    /// A top-level object (rather than an array) is rejected.
    #[test]
    fn a_non_array_payload_is_an_error() {
        let message = parse_command_rows(r#"{"id":"a"}"#).unwrap_err();
        assert!(message.contains("array"), "got: {message}");
    }

    /// A scalar among the array elements is rejected.
    #[test]
    fn an_element_that_is_not_an_object_is_an_error() {
        let message = parse_command_rows(r#"[{"id":"a"}, 3]"#).unwrap_err();
        assert!(message.contains("object"), "got: {message}");
    }

    /// Text that is not JSON at all is rejected with an "invalid JSON" message.
    #[test]
    fn invalid_json_is_an_error() {
        let message = parse_command_rows("not json at all").unwrap_err();
        assert!(message.contains("invalid JSON"), "got: {message}");
    }

    /// Direct checks of `json_to_value` for each JSON variant.
    #[test]
    fn json_to_value_maps_each_variant() {
        let null = serde_json::Value::Null;
        assert_eq!(json_to_value(&null), Value::Null);

        let yes = serde_json::json!(true);
        assert_eq!(json_to_value(&yes), Value::Integer(1));

        let no = serde_json::json!(false);
        assert_eq!(json_to_value(&no), Value::Integer(0));

        let integer = serde_json::json!(7);
        assert_eq!(json_to_value(&integer), Value::Integer(7));

        let real = serde_json::json!(2.5);
        assert_eq!(json_to_value(&real), Value::Real(2.5));

        let text = serde_json::json!("x");
        assert_eq!(json_to_value(&text), Value::Text("x".into()));

        let list = serde_json::json!([1, 2]);
        assert_eq!(json_to_value(&list), Value::Text("[1,2]".into()));
    }
}
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes