flowquery 1.0.69 → 1.0.71

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. package/README.md +682 -86
  2. package/dist/compute/provenance.d.ts +201 -0
  3. package/dist/compute/provenance.d.ts.map +1 -0
  4. package/dist/compute/provenance.js +216 -0
  5. package/dist/compute/provenance.js.map +1 -0
  6. package/dist/compute/runner.d.ts +202 -3
  7. package/dist/compute/runner.d.ts.map +1 -1
  8. package/dist/compute/runner.js +309 -2
  9. package/dist/compute/runner.js.map +1 -1
  10. package/dist/flowquery.min.js +1 -1
  11. package/dist/graph/bindings.d.ts +78 -0
  12. package/dist/graph/bindings.d.ts.map +1 -0
  13. package/dist/graph/bindings.js +224 -0
  14. package/dist/graph/bindings.js.map +1 -0
  15. package/dist/graph/data_cache.d.ts +4 -0
  16. package/dist/graph/data_cache.d.ts.map +1 -1
  17. package/dist/graph/data_cache.js +15 -3
  18. package/dist/graph/data_cache.js.map +1 -1
  19. package/dist/graph/data_resolver.d.ts.map +1 -1
  20. package/dist/graph/data_resolver.js +23 -4
  21. package/dist/graph/data_resolver.js.map +1 -1
  22. package/dist/graph/database.d.ts +4 -2
  23. package/dist/graph/database.d.ts.map +1 -1
  24. package/dist/graph/database.js +45 -9
  25. package/dist/graph/database.js.map +1 -1
  26. package/dist/graph/physical_node.d.ts +7 -2
  27. package/dist/graph/physical_node.d.ts.map +1 -1
  28. package/dist/graph/physical_node.js +22 -26
  29. package/dist/graph/physical_node.js.map +1 -1
  30. package/dist/graph/physical_relationship.d.ts +7 -2
  31. package/dist/graph/physical_relationship.d.ts.map +1 -1
  32. package/dist/graph/physical_relationship.js +22 -26
  33. package/dist/graph/physical_relationship.js.map +1 -1
  34. package/dist/graph/relationship_match_collector.d.ts.map +1 -1
  35. package/dist/graph/relationship_match_collector.js +9 -0
  36. package/dist/graph/relationship_match_collector.js.map +1 -1
  37. package/dist/graph/virtual_sources.d.ts +8 -0
  38. package/dist/graph/virtual_sources.d.ts.map +1 -0
  39. package/dist/graph/virtual_sources.js +31 -0
  40. package/dist/graph/virtual_sources.js.map +1 -0
  41. package/dist/graph/virtual_statement.d.ts +41 -0
  42. package/dist/graph/virtual_statement.d.ts.map +1 -0
  43. package/dist/graph/virtual_statement.js +143 -0
  44. package/dist/graph/virtual_statement.js.map +1 -0
  45. package/dist/index.browser.d.ts +3 -3
  46. package/dist/index.browser.d.ts.map +1 -1
  47. package/dist/index.browser.js.map +1 -1
  48. package/dist/index.node.d.ts +3 -3
  49. package/dist/index.node.d.ts.map +1 -1
  50. package/dist/index.node.js.map +1 -1
  51. package/dist/parsing/ast_node.d.ts +9 -0
  52. package/dist/parsing/ast_node.d.ts.map +1 -1
  53. package/dist/parsing/ast_node.js +21 -4
  54. package/dist/parsing/ast_node.js.map +1 -1
  55. package/dist/parsing/expressions/binding_reference.d.ts +16 -0
  56. package/dist/parsing/expressions/binding_reference.d.ts.map +1 -0
  57. package/dist/parsing/expressions/binding_reference.js +34 -0
  58. package/dist/parsing/expressions/binding_reference.js.map +1 -0
  59. package/dist/parsing/operations/aggregated_return.d.ts +10 -0
  60. package/dist/parsing/operations/aggregated_return.d.ts.map +1 -1
  61. package/dist/parsing/operations/aggregated_return.js +56 -3
  62. package/dist/parsing/operations/aggregated_return.js.map +1 -1
  63. package/dist/parsing/operations/aggregated_with.d.ts +20 -0
  64. package/dist/parsing/operations/aggregated_with.d.ts.map +1 -1
  65. package/dist/parsing/operations/aggregated_with.js +33 -1
  66. package/dist/parsing/operations/aggregated_with.js.map +1 -1
  67. package/dist/parsing/operations/create_node.d.ts +5 -1
  68. package/dist/parsing/operations/create_node.d.ts.map +1 -1
  69. package/dist/parsing/operations/create_node.js +12 -2
  70. package/dist/parsing/operations/create_node.js.map +1 -1
  71. package/dist/parsing/operations/create_relationship.d.ts +5 -1
  72. package/dist/parsing/operations/create_relationship.d.ts.map +1 -1
  73. package/dist/parsing/operations/create_relationship.js +12 -2
  74. package/dist/parsing/operations/create_relationship.js.map +1 -1
  75. package/dist/parsing/operations/drop_binding.d.ts +15 -0
  76. package/dist/parsing/operations/drop_binding.d.ts.map +1 -0
  77. package/dist/parsing/operations/drop_binding.js +42 -0
  78. package/dist/parsing/operations/drop_binding.js.map +1 -0
  79. package/dist/parsing/operations/group_by.d.ts +32 -0
  80. package/dist/parsing/operations/group_by.d.ts.map +1 -1
  81. package/dist/parsing/operations/group_by.js +103 -0
  82. package/dist/parsing/operations/group_by.js.map +1 -1
  83. package/dist/parsing/operations/let.d.ts +36 -0
  84. package/dist/parsing/operations/let.d.ts.map +1 -0
  85. package/dist/parsing/operations/let.js +154 -0
  86. package/dist/parsing/operations/let.js.map +1 -0
  87. package/dist/parsing/operations/load.d.ts +31 -0
  88. package/dist/parsing/operations/load.d.ts.map +1 -1
  89. package/dist/parsing/operations/load.js +102 -2
  90. package/dist/parsing/operations/load.js.map +1 -1
  91. package/dist/parsing/operations/merge.d.ts +158 -0
  92. package/dist/parsing/operations/merge.d.ts.map +1 -0
  93. package/dist/parsing/operations/merge.js +338 -0
  94. package/dist/parsing/operations/merge.js.map +1 -0
  95. package/dist/parsing/operations/order_by.d.ts +6 -0
  96. package/dist/parsing/operations/order_by.d.ts.map +1 -1
  97. package/dist/parsing/operations/order_by.js +10 -1
  98. package/dist/parsing/operations/order_by.js.map +1 -1
  99. package/dist/parsing/operations/refresh_binding.d.ts +15 -0
  100. package/dist/parsing/operations/refresh_binding.d.ts.map +1 -0
  101. package/dist/parsing/operations/refresh_binding.js +42 -0
  102. package/dist/parsing/operations/refresh_binding.js.map +1 -0
  103. package/dist/parsing/operations/refresh_node.d.ts +11 -0
  104. package/dist/parsing/operations/refresh_node.d.ts.map +1 -0
  105. package/dist/parsing/operations/refresh_node.js +46 -0
  106. package/dist/parsing/operations/refresh_node.js.map +1 -0
  107. package/dist/parsing/operations/refresh_relationship.d.ts +11 -0
  108. package/dist/parsing/operations/refresh_relationship.d.ts.map +1 -0
  109. package/dist/parsing/operations/refresh_relationship.js +46 -0
  110. package/dist/parsing/operations/refresh_relationship.js.map +1 -0
  111. package/dist/parsing/operations/return.d.ts +34 -0
  112. package/dist/parsing/operations/return.d.ts.map +1 -1
  113. package/dist/parsing/operations/return.js +102 -3
  114. package/dist/parsing/operations/return.js.map +1 -1
  115. package/dist/parsing/operations/union.d.ts +19 -2
  116. package/dist/parsing/operations/union.d.ts.map +1 -1
  117. package/dist/parsing/operations/union.js +47 -11
  118. package/dist/parsing/operations/union.js.map +1 -1
  119. package/dist/parsing/operations/union_all.d.ts +5 -1
  120. package/dist/parsing/operations/union_all.d.ts.map +1 -1
  121. package/dist/parsing/operations/union_all.js +8 -2
  122. package/dist/parsing/operations/union_all.js.map +1 -1
  123. package/dist/parsing/operations/update.d.ts +27 -0
  124. package/dist/parsing/operations/update.d.ts.map +1 -0
  125. package/dist/parsing/operations/update.js +88 -0
  126. package/dist/parsing/operations/update.js.map +1 -0
  127. package/dist/parsing/operations/update_delete.d.ts +43 -0
  128. package/dist/parsing/operations/update_delete.d.ts.map +1 -0
  129. package/dist/parsing/operations/update_delete.js +105 -0
  130. package/dist/parsing/operations/update_delete.js.map +1 -0
  131. package/dist/parsing/parser.d.ts +70 -1
  132. package/dist/parsing/parser.d.ts.map +1 -1
  133. package/dist/parsing/parser.js +704 -10
  134. package/dist/parsing/parser.js.map +1 -1
  135. package/dist/parsing/statement_info_crawler.d.ts +106 -0
  136. package/dist/parsing/statement_info_crawler.d.ts.map +1 -1
  137. package/dist/parsing/statement_info_crawler.js +231 -12
  138. package/dist/parsing/statement_info_crawler.js.map +1 -1
  139. package/dist/tokenization/keyword.d.ts +21 -1
  140. package/dist/tokenization/keyword.d.ts.map +1 -1
  141. package/dist/tokenization/keyword.js +20 -0
  142. package/dist/tokenization/keyword.js.map +1 -1
  143. package/dist/tokenization/token.d.ts +24 -0
  144. package/dist/tokenization/token.d.ts.map +1 -1
  145. package/dist/tokenization/token.js +73 -0
  146. package/dist/tokenization/token.js.map +1 -1
  147. package/package.json +1 -1
package/README.md CHANGED
@@ -12,11 +12,11 @@ Beyond graphs, FlowQuery provides a full data processing pipeline language with
12
12
 
13
13
  The combination of graph querying and pipeline processing makes FlowQuery ideal for the retrieval stage of Retrieval Augmented Generation (RAG). A typical graph RAG flow works as follows:
14
14
 
15
- 1. **User query** — The user asks a question in natural language.
16
- 2. **Schema retrieval** — The application retrieves the virtual graph schema via `CALL schema()` and injects it into the system instructions of the query-generation LLM, so it knows which node labels, relationship types, and properties are available.
17
- 3. **Query generation** — The LLM, grounded in the schema, generates a precise OpenCypher query to retrieve the data needed to answer the question.
18
- 4. **Query execution** — The FlowQuery engine executes the generated OpenCypher query against the virtual graph and returns the results as grounding data.
19
- 5. **Response formulation** — The LLM formulates a final response informed by the grounding data.
15
+ 1. **User query** - The user asks a question in natural language.
16
+ 2. **Schema retrieval** - The application retrieves the virtual graph schema via `CALL schema()` and injects it into the system instructions of the query-generation LLM, so it knows which node labels, relationship types, and properties are available.
17
+ 3. **Query generation** - The LLM, grounded in the schema, generates a precise OpenCypher query to retrieve the data needed to answer the question.
18
+ 4. **Query execution** - The FlowQuery engine executes the generated OpenCypher query against the virtual graph and returns the results as grounding data.
19
+ 5. **Response formulation** - The LLM formulates a final response informed by the grounding data.
20
20
 
21
21
  ```
22
22
  ┌───────────────────┐
@@ -37,7 +37,7 @@ The combination of graph querying and pipeline processing makes FlowQuery ideal
37
37
  └──────────┘
38
38
  ```
39
39
 
40
- The schema is retrieved using FlowQuery's built-in `schema()` function, which returns the structure of all registered virtual nodes and relationships — including labels, types, endpoint labels, property names, and sample values. This schema is then included in the LLM's system instructions so it can generate correct queries grounded in the actual graph model:
40
+ The schema is retrieved using FlowQuery's built-in `schema()` function, which returns the structure of all registered virtual nodes and relationships - including labels, types, endpoint labels, property names, and sample values. This schema is then included in the LLM's system instructions so it can generate correct queries grounded in the actual graph model:
41
41
 
42
42
  ```cypher
43
43
  CALL schema() YIELD kind, label, type, from_label, to_label, properties, sample
@@ -227,7 +227,7 @@ WITH 1 AS x RETURN x UNION ALL WITH 1 AS x RETURN x
227
227
 
228
228
  #### Multi-Statement Queries
229
229
 
230
- Multiple statements can be separated by semicolons. Only `CREATE VIRTUAL` and `DELETE VIRTUAL` statements may appear before the last statement. The last statement can be any valid query.
230
+ Multiple statements can be separated by semicolons. Only declaration statements - `CREATE VIRTUAL`, `DELETE VIRTUAL` (alias: `DROP VIRTUAL`), `REFRESH VIRTUAL`, `LET`, `UPDATE`, and `MERGE INTO` - may appear before the last statement. The last statement can be any valid query.
231
231
 
232
232
  ```cypher
233
233
  CREATE VIRTUAL (:Person) AS {
@@ -253,87 +253,96 @@ console.log(runner.metadata);
253
253
  // info: { node_labels: ["X"], relationship_types: [], sources: [], ... } }
254
254
  ```
255
255
 
256
- #### Statement Info: Labels, Properties, and Source Lineage
256
+ #### Caching Virtual Entities: `STATIC` and `REFRESH`
257
257
 
258
- `metadata.info` carries a `StatementInfo` describing the _structure_ the
259
- query touches — independent of execution. It captures:
258
+ By default, every `MATCH` against a virtual node or relationship re-executes
259
+ its backing sub-query. For expensive sources (HTTP endpoints, large CSV files)
260
+ you can opt in to persistent caching with the `STATIC` keyword:
260
261
 
261
- - The node labels and relationship types referenced.
262
- - The data sources backing the underlying virtual definitions.
263
- - The node/relationship properties **consumed** by the query —
264
- `alias.prop` accesses anywhere in `MATCH`, `WHERE`, `WITH`, `RETURN`,
265
- `ORDER BY`, or function arguments, plus inline pattern properties
266
- like `(u:User {id: 'rick.o'})`.
267
- - The properties **declared** by each virtual's `RETURN` clause via
268
- `info.declared`, so you can validate that a query references only
269
- declared properties.
270
- - Literal values supplied for properties at the call site via
271
- `info.nodes[Label].literal_values` — collected from inline pattern
272
- properties and from equality / `IN` predicates such as
273
- `WHERE u.id = 'rick.o'` or `WHERE u.id IN ['a', 'b']`.
262
+ ```cypher
263
+ CREATE STATIC VIRTUAL (:Country) AS {
264
+ LOAD JSON FROM 'https://restcountries.com/v3.1/all?fields=name,cca2,population' AS c
265
+ RETURN c.cca2 AS code, c.name.common AS name
266
+ };
267
+ ```
274
268
 
275
- This is useful for governance, lineage UIs, query-cost estimation, schema
276
- validation, or routing decisions before the query runs.
269
+ The sub-query runs once on first access and the result is reused for every
270
+ subsequent query in the same process - across `Runner` instances. STATIC
271
+ virtual entities are protected: re-running `CREATE STATIC VIRTUAL (:Country)`
272
+ without first dropping the existing entry raises an error. Use
273
+ `DROP VIRTUAL (:Country)` (an alias for `DELETE VIRTUAL`) to remove it.
277
274
 
278
- The same `StatementInfoCrawler` can also be used directly on any parsed
279
- AST without going through a `Runner`:
275
+ To refresh on a schedule, add a `REFRESH EVERY <n> <unit>` clause. Supported
276
+ units are `SECOND[S]`, `MINUTE[S]`, `HOUR[S]`, and `DAY[S]`:
280
277
 
281
- ```javascript
282
- import { StatementInfoCrawler } from "flowquery";
278
+ ```cypher
279
+ CREATE STATIC VIRTUAL (:Country) AS {
280
+ LOAD JSON FROM 'https://restcountries.com/v3.1/all?fields=name,cca2,population' AS c
281
+ RETURN c.cca2 AS code, c.name.common AS name
282
+ } REFRESH EVERY 1 HOUR;
283
+ ```
283
284
 
284
- const crawler = new StatementInfoCrawler();
285
- const info = crawler.crawl(parsedAst);
285
+ Refresh is lazy: the cache is re-populated on the first access after the TTL
286
+ elapses; no background timers are scheduled. `REFRESH EVERY` requires
287
+ `STATIC` (caching must be enabled to refresh).
288
+
289
+ To invalidate the cache on demand from anywhere in a query, use
290
+ `REFRESH VIRTUAL (...)`:
291
+
292
+ ```cypher
293
+ REFRESH VIRTUAL (:Country);
294
+ MATCH (c:Country) RETURN c.name
286
295
  ```
287
296
 
288
- For end-to-end lineage from a property to its data source, use the
289
- per-entity `nodes` and `relationships` maps:
297
+ `REFRESH VIRTUAL` works on both nodes and relationships and clears the cache
298
+ so that the next access re-executes the backing sub-query.
290
299
 
291
- ```javascript
292
- const runner = new FlowQuery(`
293
- CREATE VIRTUAL (:City) AS {
294
- LOAD JSON FROM "https://example.com/cities" AS c
295
- RETURN c.id AS id, c.name AS name, c.country AS country
296
- };
297
- CREATE VIRTUAL (:City)-[:FLIGHT]-(:City) AS {
298
- LOAD JSON FROM "https://example.com/flights" AS f
299
- RETURN f.left_id AS left_id, f.right_id AS right_id, f.airline AS airline
300
- };
301
- MATCH (a:City {name: 'NYC'})-[r:FLIGHT]->(b:City)
302
- WHERE b.country IN ['US', 'CA']
303
- RETURN a.name AS origin, b.name AS destination, r.airline AS airline
304
- `);
305
- const { info } = runner.metadata;
300
+ #### Refreshable `LET` Bindings: `REFRESH EVERY`, `REFRESH BINDING`, `DROP BINDING`
306
301
 
307
- console.log(info.nodes);
308
- // {
309
- // City: {
310
- // properties: ["country", "name"],
311
- // sources: ["https://example.com/cities"],
312
- // literal_values: { country: ["US", "CA"], name: ["NYC"] }
313
- // }
314
- // }
315
- console.log(info.relationships);
316
- // {
317
- // FLIGHT: {
318
- // properties: ["airline"],
319
- // sources: ["https://example.com/flights"],
320
- // literal_values: {}
321
- // }
322
- // }
323
- console.log(info.declared.nodes.City);
324
- // { properties: ["country", "id", "name"], sources: ["https://example.com/cities"] }
325
- console.log(info.sources);
326
- // ["https://example.com/cities", "https://example.com/flights"]
302
+ `LET` bindings live for the lifetime of the process, just like virtual
303
+ nodes and relationships, and the same caching primitives apply. A plain
304
+ `LET name = { ... }` evaluates the sub-query once, when the `LET`
305
+ statement executes, and stores the result in the global binding store;
306
+ all subsequent reads return that cached value without re-running the
307
+ sub-query. To opt into TTL-based re-evaluation, add a trailing
308
+ `REFRESH EVERY <n> <unit>` clause:
309
+
310
+ ```cypher
311
+ LET users = {
312
+ LOAD JSON FROM 'https://example.com/users.json' AS u
313
+ RETURN u.id AS id, u.name AS name
314
+ } REFRESH EVERY 5 MINUTES;
315
+ LOAD JSON FROM users AS u RETURN u.id AS id, u.name AS name
316
+ ```
317
+
318
+ The sub-query still runs eagerly at `LET` time and the result is cached
319
+ just like a plain binding; the `REFRESH EVERY` clause additionally
320
+ arranges for the next read after the TTL has elapsed to re-execute the
321
+ sub-query.
322
+
323
+ Refreshable bindings (those with a `REFRESH EVERY` clause) cannot be
324
+ silently overwritten: re-running `LET name = { ... } REFRESH EVERY ...`
325
+ without first dropping the existing binding raises an error, and so
326
+ does `UPDATE name = ...` or `MERGE INTO name ...` against the same
327
+ name. `LET ... REFRESH EVERY` requires a sub-query right-hand side
328
+ (a plain expression, as in `LET x = 42 REFRESH EVERY 1 MINUTE`, is rejected). To force
329
+ an immediate refresh outside the TTL schedule, use `REFRESH BINDING`:
330
+
331
+ ```cypher
332
+ REFRESH BINDING users;
333
+ LOAD JSON FROM users AS u RETURN u.id AS id
327
334
  ```
328
335
 
329
- `StatementInfo` resolves sources and declared schemas for **any** virtual
330
- the query touches — both inline `CREATE VIRTUAL` clauses and
331
- previously-registered virtuals reached via `MATCH` or `DELETE`. The flat
332
- `node_labels`, `relationship_types`, `sources`, `node_properties`, and
333
- `relationship_properties` fields stay in sync with the per-entity `nodes` /
334
- `relationships` maps and are convenient for quick aggregate checks. Only
335
- purely literal AST subtrees end up in `literal_values` — values that depend
336
- on parameters, references, f-strings, or subqueries are skipped.
336
+ To remove any binding (plain or refreshable), use `DROP BINDING`:
337
+
338
+ ```cypher
339
+ DROP BINDING users;
340
+ ```
341
+
342
+ `UPDATE` and `MERGE INTO` against a refreshable binding are blocked
343
+ because the mutation would be invisibly overwritten by the next
344
+ refresh. Use `REFRESH BINDING name` to re-evaluate the source, or
345
+ `DROP BINDING name` and redefine the binding plainly.
337
346
 
338
347
  ### WHERE Clause
339
348
 
@@ -391,6 +400,115 @@ UNWIND [1, 1, 2, 2] AS i RETURN DISTINCT i
391
400
  // [{ i: 1 }, { i: 2 }]
392
401
  ```
393
402
 
403
+ ### Bindings (`LET` / `UPDATE` / `MERGE INTO`)
404
+
405
+ A **binding** is a named, mutable value that persists across statements in a multi-statement query. Bindings live in a flat per-query namespace, are introduced with `LET`, wholesale-replaced with `UPDATE`, row-filtered with `UPDATE … AS x DELETE WHERE …`, and per-row upserted/merged with `MERGE INTO … USING … ON … WHEN …`. Once bound, the value can be referenced anywhere an expression is allowed (e.g. as the source of `LOAD JSON FROM`, `UNWIND`, `MERGE INTO`'s `USING`, or directly inside an expression).
406
+
407
+ #### LET
408
+
409
+ `LET name = <expression-or-subquery>` introduces a new binding. The right-hand side can be any expression or a braced sub-query whose final `RETURN` provides the value.
410
+
411
+ ```cypher
412
+ LET data = [{id: 1, name: 'Alice'}, {id: 2, name: 'Bob'}];
413
+ LET threshold = 10;
414
+ LET fresh = {
415
+ UNWIND [1, 2, 3] AS n
416
+ RETURN n AS n
417
+ };
418
+ LOAD JSON FROM data AS d
419
+ WITH d WHERE d.id >= threshold OR d.id <= 2
420
+ RETURN d.id AS id, d.name AS name
421
+ ```
422
+
423
+ `LET` fails if the binding already exists - use `UPDATE` to overwrite.
424
+
425
+ #### UPDATE
426
+
427
+ `UPDATE name = <expression-or-subquery>` replaces the value of an existing binding wholesale. Works for any value (scalars, maps, arrays).
428
+
429
+ ```cypher
430
+ LET counter = 0;
431
+ UPDATE counter = counter + 1;
432
+ RETURN counter AS counter
433
+ // [{ counter: 1 }]
434
+ ```
435
+
436
+ `UPDATE` fails if the binding doesn't exist - use `LET` first.
437
+
438
+ #### UPDATE ... AS alias DELETE WHERE ...
439
+
440
+ Filters rows out of an array binding in place. The alias names each row during predicate evaluation.
441
+
442
+ ```cypher
443
+ LET users = [
444
+ {id: 1, name: 'Alice', expired: false},
445
+ {id: 2, name: 'Bob', expired: true}
446
+ ];
447
+ UPDATE users AS u DELETE WHERE u.expired;
448
+ LOAD JSON FROM users AS u
449
+ RETURN u.id AS id, u.name AS name
450
+ // [{ id: 1, name: 'Alice' }]
451
+ ```
452
+
453
+ #### MERGE INTO ... USING ... ON ... WHEN ...
454
+
455
+ SQL-style keyed merge. For each row of the source, find matching rows in the target (by key or predicate) and apply per-row branches:
456
+
457
+ ```
458
+ MERGE INTO target [AS t]
459
+ USING <source-expression-or-subquery> [AS s]
460
+ ON <key-or-key-list> | <predicate>
461
+ [WHEN MATCHED THEN UPDATE SET <field-list>]
462
+ [WHEN MATCHED THEN DELETE]
463
+ [WHEN NOT MATCHED THEN INSERT [<row-expression>]]
464
+ ```
465
+
466
+ ```cypher
467
+ // Upsert by key: replace listed fields on matches; append unmatched source rows
468
+ LET users = [{id: 1, name: 'Alice'}, {id: 2, name: 'Bob'}];
469
+ MERGE INTO users
470
+ USING [{id: 2, name: 'Bobby'}, {id: 3, name: 'Charlie'}]
471
+ ON id
472
+ WHEN MATCHED THEN UPDATE SET .id, .name
473
+ WHEN NOT MATCHED THEN INSERT;
474
+ // users → [{id:1,name:'Alice'}, {id:2,name:'Bobby'}, {id:3,name:'Charlie'}]
475
+
476
+ // Composite key
477
+ MERGE INTO rows
478
+ USING incoming
479
+ ON (tenant, id)
480
+ WHEN MATCHED THEN UPDATE SET .v
481
+ WHEN NOT MATCHED THEN INSERT;
482
+
483
+ // Per-row expressions across target (u) and source (s) aliases
484
+ MERGE INTO users AS u
485
+ USING incoming AS s
486
+ ON id
487
+ WHEN MATCHED THEN UPDATE SET .name = s.name + ' (' + u.name + ')'
488
+ WHEN NOT MATCHED THEN INSERT {id: s.id, name: 'New: ' + s.name};
489
+
490
+ // Predicate-based join
491
+ MERGE INTO users AS u
492
+ USING incoming AS s
493
+ ON u.tenant = s.tenant AND u.email = s.email
494
+ WHEN MATCHED THEN UPDATE SET .v = s.v
495
+ WHEN NOT MATCHED THEN INSERT;
496
+
497
+ // Tombstone delete: rows in target that also appear in source are removed
498
+ MERGE INTO users
499
+ USING [{id: 2}, {id: 3}]
500
+ ON id
501
+ WHEN MATCHED THEN DELETE;
502
+ ```
503
+
504
+ Notes:
505
+
506
+ - The source may be any expression (array literal, binding name) or a braced sub-query. When the source is a bare binding name, give it an alias: `USING incoming AS s`.
507
+ - `ON id` is shorthand for the equality predicate `t.id = s.id`; `ON (a, b)` requires equality on every listed key. Anything else is treated as a Boolean predicate evaluated per `(target, source)` pair.
508
+ - `WHEN MATCHED THEN UPDATE SET .field` overwrites only the listed fields, preserving the rest from the existing row. `SET .field = expr` evaluates `expr` per matched pair, with target and source aliases in scope.
509
+ - `WHEN NOT MATCHED THEN INSERT` (no row expression) appends the source row as-is. `INSERT { … }` appends an explicit row expression instead.
510
+ - A `MERGE INTO` must declare at least one `WHEN` clause. Branches are independent - omit `WHEN NOT MATCHED` to skip insertion, omit `WHEN MATCHED` to skip updates/deletes.
511
+
394
512
  ### Expressions
395
513
 
396
514
  #### Arithmetic
@@ -504,16 +622,16 @@ RETURN sum(n IN [1+2+3, 2, 3] | n^2) AS sum // 49
504
622
  Test list elements against a condition. Follow standard Cypher syntax.
505
623
 
506
624
  ```cypher
507
- // any — true if at least one element matches
625
+ // any - true if at least one element matches
508
626
  RETURN any(n IN [1, 2, 3] WHERE n > 2) // true
509
627
 
510
- // all — true if every element matches
628
+ // all - true if every element matches
511
629
  RETURN all(n IN [2, 4, 6] WHERE n > 0) // true
512
630
 
513
- // none — true if no element matches
631
+ // none - true if no element matches
514
632
  RETURN none(n IN [1, 2, 3] WHERE n > 5) // true
515
633
 
516
- // single — true if exactly one element matches
634
+ // single - true if exactly one element matches
517
635
  RETURN single(n IN [1, 2, 3] WHERE n > 2) // true
518
636
 
519
637
  // In a WHERE clause
@@ -698,7 +816,7 @@ MATCH (a:Person) WHERE NOT (a)-[:KNOWS]->(:Person) RETURN a.name
698
816
  **Subquery Expressions:** `EXISTS`, `COUNT`, and `COLLECT` evaluate a full subquery as an expression. The subquery can reference outer-scope variables and supports the complete FlowQuery pipeline (MATCH, WITH, WHERE, UNWIND, LOAD, etc.).
699
817
 
700
818
  ```cypher
701
- // EXISTS — returns true if the subquery produces any rows
819
+ // EXISTS - returns true if the subquery produces any rows
702
820
  MATCH (p:Person)
703
821
  WHERE EXISTS {
704
822
  MATCH (p)-[:KNOWS]->(friend:Person)
@@ -706,12 +824,12 @@ WHERE EXISTS {
706
824
  }
707
825
  RETURN p.name
708
826
 
709
- // NOT EXISTS — negate with NOT
827
+ // NOT EXISTS - negate with NOT
710
828
  MATCH (p:Person)
711
829
  WHERE NOT EXISTS { MATCH (p)-[:KNOWS]->(:Person) }
712
830
  RETURN p.name
713
831
 
714
- // COUNT — returns the number of rows the subquery produces
832
+ // COUNT - returns the number of rows the subquery produces
715
833
  MATCH (p:Person)
716
834
  WHERE COUNT { MATCH (p)-[:KNOWS]->(:Person) } > 2
717
835
  RETURN p.name
@@ -720,7 +838,7 @@ RETURN p.name
720
838
  MATCH (p:Person)
721
839
  RETURN p.name, COUNT { MATCH (p)-[:KNOWS]->(:Person) } AS friendCount
722
840
 
723
- // COLLECT — returns a list of single-column values from the subquery
841
+ // COLLECT - returns a list of single-column values from the subquery
724
842
  MATCH (p:Person)
725
843
  RETURN COLLECT {
726
844
  MATCH (p)-[:KNOWS]->(friend:Person)
@@ -832,6 +950,18 @@ RETURN f.name, f.description, f.category
832
950
  │ stmt1; stmt2; ... stmtN -- multi-statement │
833
951
  │ LIMIT n │
834
952
  ├─────────────────────────────────────────────────────────────┤
953
+ │ BINDINGS │
954
+ ├─────────────────────────────────────────────────────────────┤
955
+ │ LET name = expr | { subquery } -- new binding │
956
+ │ UPDATE name = expr | { subquery } -- replace existing │
957
+ │ UPDATE name AS x DELETE WHERE cond -- row-filter binding │
958
+ │ MERGE INTO target [AS t] │
959
+ │ USING <expr | subquery> [AS s] │
960
+ │ ON key | (k1,k2,...) | predicate │
961
+ │ [WHEN MATCHED THEN UPDATE SET .f [= expr], ...] │
962
+ │ [WHEN MATCHED THEN DELETE] │
963
+ │ [WHEN NOT MATCHED THEN INSERT [ {row-expr} ]] │
964
+ ├─────────────────────────────────────────────────────────────┤
835
965
  │ GRAPH OPERATIONS │
836
966
  ├─────────────────────────────────────────────────────────────┤
837
967
  │ CREATE VIRTUAL (:Label) AS { subquery } │
@@ -1084,7 +1214,7 @@ RETURN f.name AS name, f.description AS description, f.category AS category
1084
1214
 
1085
1215
  ### Virtual Org Chart
1086
1216
 
1087
- This single multi-statement query creates a virtual graph for a fictitious company complete with employees, skills, phone numbers, and a management chain then queries it to produce an org chart. [Try live!](https://microsoft.github.io/FlowQuery/?rZXPbtNAEMbFhUMOReXAeW4OwkH521ZBQgolrVQlSogDQlRVtdjTeKm9a603Bavqw_AAPEVfDO16s_EmEVElfPHuzGf7N9-M7UN4BfX-MM0SXiC-hkEA9zUAgJdwqc_quKdRH1o-MJJiH7yACAKnMTLP1-kf_PucykSlTocTE6weEWZEyBSZ7IM3_IXhUtI71MIs5kxd-KbV6PV6jWar2fR8yG9pkuR9uPQCKYjEReH54I2QRCjymGZqd0YZYSF6Vw--C9q2oGMiwmUOM3qHgnh-FfTLFPgNDNmCMkRB2ULTuKBOchO03Ww5oAMRxlRiKJdCVeadJnwZqcUYmeT6JlukHUs6FbQgMCUSEw2yRToVPFqG0prrkLrJTdLOBumMkyglWVbW5Q0YSQpJw1xtPn_dpuxayguSYg4fBOe3Sl2lDJBRLqyhBuVJfrYdynmRYRAKmqmyvGkhY67mzTsXJIs_jbYxexZzRBl8i4np6f_G7LhmLnMNGBS5xFR7-BHvJlm-DXhkAQepeoMueMxyXVQF0LQSxoSRxW7Afd12fQxCsUw1FpEEdLNzqjkDSW4x5kmEAsaLVG4TH1viOU8ff-dwTkT4-Ids9p7fyJ9EoGvrk0ztuqYi0fVtDMEcc7nzPTqxnGdE0pTAIGkEpMDI9XYX55Moew7leibHI2vwlGaYUIaq_xrySn1SBYZcRHp_YDZvaaQyNPJXAVWCCqmzDa7wVWK1XlcPsNKt61DK9c7eSFejcnphw2U5Kl6uag_vaofuP6Fx2Z8Np5PZPLieT64a-_8WCd7Ia_MlFnQRl5tWtWtW0tkv6VYl7Z2S3n7JUVXS2Sk5rkq6OyUnruTfHTZXqbRZWttXN9GXmrWy_gXUce1v7RnUt_x_X08XoqI50A_FcnzWxyAANBrfKOwsWYVcjxNWR8ikK2NkNOUUVR9SjpNJm2mqpMtImU8XwqXUvmVcyHzOa88dBN9U9Bc)
1217
+ This single multi-statement query creates a virtual graph for a fictitious company - complete with employees, skills, phone numbers, and a management chain - then queries it to produce an org chart. [Try live!](https://microsoft.github.io/FlowQuery/?rZXPbtNAEMbFhUMOReXAeW4OwkH521ZBQgolrVQlSogDQlRVtdjTeKm9a603Bavqw_AAPEVfDO16s_EmEVElfPHuzGf7N9-M7UN4BfX-MM0SXiC-hkEA9zUAgJdwqc_quKdRH1o-MJJiH7yACAKnMTLP1-kf_PucykSlTocTE6weEWZEyBSZ7IM3_IXhUtI71MIs5kxd-KbV6PV6jWar2fR8yG9pkuR9uPQCKYjEReH54I2QRCjymGZqd0YZYSF6Vw--C9q2oGMiwmUOM3qHgnh-FfTLFPgNDNmCMkRB2ULTuKBOchO03Ww5oAMRxlRiKJdCVeadJnwZqcUYmeT6JlukHUs6FbQgMCUSEw2yRToVPFqG0prrkLrJTdLOBumMkyglWVbW5Q0YSQpJw1xtPn_dpuxayguSYg4fBOe3Sl2lDJBRLqyhBuVJfrYdynmRYRAKmqmyvGkhY67mzTsXJIs_jbYxexZzRBl8i4np6f_G7LhmLnMNGBS5xFR7-BHvJlm-DXhkAQepeoMueMxyXVQF0LQSxoSRxW7Afd12fQxCsUw1FpEEdLNzqjkDSW4x5kmEAsaLVG4TH1viOU8ff-dwTkT4-Ids9p7fyJ9EoGvrk0ztuqYi0fVtDMEcc7nzPTqxnGdE0pTAIGkEpMDI9XYX55Moew7leibHI2vwlGaYUIaq_xrySn1SBYZcRHp_YDZvaaQyNPJXAVWCCqmzDa7wVWK1XlcPsNKt61DK9c7eSFejcnphw2U5Kl6uag_vaofuP6Fx2Z8Np5PZPLieT64a-_8WCd7Ia_MlFnQRl5tWtWtW0tkv6VYl7Z2S3n7JUVXS2Sk5rkq6OyUnruTfHTZXqbRZWttXN9GXmrWy_gXUce1v7RnUt_x_X08XoqI50A_FcnzWxyAANBrfKOwsWYVcjxNWR8ikK2NkNOUUVR9SjpNJm2mqpMtImU8XwqXUvmVcyHzOa88dBN9U9Bc)
1088
1218
 
1089
1219
  ```cypher
1090
1220
  CREATE VIRTUAL (:Employee) AS {
@@ -1138,7 +1268,7 @@ Output:
1138
1268
  | Amara Johnson | Product Manager | Product | +1-555-0302 | [Scrum, Data Analysis, Stakeholder Mgmt] | Priya Patel |
1139
1269
  | Priya Patel | VP of Product | Product | +1-555-0301 | [Roadmapping, Analytics, UX] | Sara Chen |
1140
1270
 
1141
- You can further explore the graph — for example, find the full management chain from any employee up to the CEO:
1271
+ You can further explore the graph - for example, find the full management chain from any employee up to the CEO:
1142
1272
 
1143
1273
  ```cypher
1144
1274
  MATCH (e:Employee)-[:REPORTS_TO*1..]->(mgr:Employee)
@@ -1166,6 +1296,472 @@ RETURN a.name AS employee1, b.name AS employee2, shared AS sharedSkills
1166
1296
  ORDER BY size(shared) DESC
1167
1297
  ```
1168
1298
 
1299
+ ### Virtual Country Borders Graph
1300
+
1301
+ This example pulls live data from the public [REST Countries](https://restcountries.com/) API and projects it into a `(:Country)-[:BORDERS]-(:Country)` virtual graph in a single semicolon-chained query, then ranks European countries by how many direct neighbors they have. [Try live!](https://microsoft.github.io/FlowQuery/?nZBRa4MwFIXf8yvumx1YXelbRxlWM-qwCtGujDGKxrQVrJFoB2Xsv49Eq3Ure1iekuvxnu8cD0dA-amoRcYqmMMnAgDwAsuB5zDw4YkEK9AOdV1WM9MUrKo7tUH50fyYGhMzzvPHXcbytJoX8ZHplMZTXbB9xgu95OUpj2t5TbhImag0sEKgyofgaE18oIb8A9pjhZClOoL-UEOulX5HXsjvymWoaOy6Da37UNOzKM0V2lDXgl52tU_09YCQTbAVYXhxSbS2PBjNbFXH-U4Kb5bXt_s7dpa2aduIXbY-UJ9lGGDA_zfa-G22CIiDSfg-_h_w2t-4vnPVjGzlVpSc7eqtzJMo8Gx_kE-Ft7Iiewmj-DZW0o3RZokJhvhSwBw0fBK8ZBpqzeKuqwb1rDeXUaJCFSzbHxIutmqIlAUsXn_MwcGhjTx35UYwuf8G)
1302
+
1303
+ A single shared `LET` binding fetches the data once. Both virtuals (the `(:Country)` nodes and the `(:Country)-[:BORDERS]-(:Country)` relationships) are projected from the same binding, so there is exactly one HTTP round-trip per Runner invocation. Add `REFRESH EVERY 1 HOUR` to the `LET` if you want the cache to auto-refresh on a schedule.
1304
+
1305
+ ```cypher
1306
+ LET countries = {
1307
+ LOAD JSON FROM 'https://restcountries.com/v3.1/all?fields=name,cca3,region,population,borders' AS c
1308
+ RETURN c.cca3 AS id,
1309
+ c.name.common AS name,
1310
+ c.region AS region,
1311
+ c.population AS population,
1312
+ c.borders AS borders
1313
+ };
1314
+ CREATE VIRTUAL (:Country) AS {
1315
+ LOAD JSON FROM countries AS c
1316
+ RETURN c.id AS id, c.name AS name, c.region AS region, c.population AS population
1317
+ };
1318
+ CREATE VIRTUAL (:Country)-[:BORDERS]-(:Country) AS {
1319
+ LOAD JSON FROM countries AS c
1320
+ UNWIND c.borders AS b
1321
+ RETURN c.id AS left_id, b AS right_id
1322
+ };
1323
+ MATCH (a:Country)-[:BORDERS]-(b:Country)
1324
+ WHERE a.region = 'Europe'
1325
+ RETURN a.name AS country, count(b) AS neighbor_count
1326
+ ORDER BY neighbor_count DESC
1327
+ LIMIT 10
1328
+ ```
1329
+
1330
+ ## Lineage and Provenance
1331
+
1332
+ FlowQuery exposes two complementary forms of lineage:
1333
+
1334
+ - **Structural lineage** (`metadata.info`) - what labels, types,
1335
+ properties, and sources the parsed query touches. Available without
1336
+ running the query and with zero runtime overhead.
1337
+ - **Row-level provenance** (`runner.provenance`) - opt-in via
1338
+ `{ provenance: true }`; for every emitted result row, the concrete
1339
+ node ids and relationship hops bound to it, their matched property
1340
+ values, and (for virtual-backed records) the inner sub-query lineage
1341
+ that produced them.
1342
+
1343
+ Combined, they let you trace every cell of a result back to the source
1344
+ record, the source virtual, and ultimately the URL or call-site that
1345
+ backs it.
1346
+
1347
+ ### Statement Info: Labels, Properties, and Source Lineage
1348
+
1349
+ `metadata.info` carries a `StatementInfo` describing the _structure_ the
1350
+ query touches - independent of execution. It captures:
1351
+
1352
+ - The node labels and relationship types referenced.
1353
+ - The data sources backing the underlying virtual definitions.
1354
+ - The node/relationship properties **consumed** by the query -
1355
+ `alias.prop` accesses anywhere in `MATCH`, `WHERE`, `WITH`, `RETURN`,
1356
+ `ORDER BY`, or function arguments, plus inline pattern properties
1357
+ like `(u:User {id: 'rick.o'})`.
1358
+ - The properties **declared** by each virtual's `RETURN` clause via
1359
+ `info.declared`, so you can validate that a query references only
1360
+ declared properties.
1361
+ - Literal values supplied for properties at the call site via
1362
+ `info.nodes[Label].literal_values` - collected from inline pattern
1363
+ properties and from equality / `IN` predicates such as
1364
+ `WHERE u.id = 'rick.o'` or `WHERE u.id IN ['a', 'b']`.
1365
+
1366
+ This is useful for governance, lineage UIs, query-cost estimation, schema
1367
+ validation, or routing decisions before the query runs.
1368
+
1369
+ The same `StatementInfoCrawler` can also be used directly on any parsed
1370
+ AST without going through a `Runner`:
1371
+
1372
+ ```javascript
1373
+ import { StatementInfoCrawler } from "flowquery";
1374
+
1375
+ const crawler = new StatementInfoCrawler();
1376
+ const info = crawler.crawl(parsedAst);
1377
+ ```
1378
+
1379
+ For end-to-end lineage from a property to its data source, use the
1380
+ per-entity `nodes` and `relationships` maps:
1381
+
1382
+ ```javascript
1383
+ const runner = new FlowQuery(`
1384
+ CREATE VIRTUAL (:City) AS {
1385
+ LOAD JSON FROM "https://example.com/cities" AS c
1386
+ RETURN c.id AS id, c.name AS name, c.country AS country
1387
+ };
1388
+ CREATE VIRTUAL (:City)-[:FLIGHT]-(:City) AS {
1389
+ LOAD JSON FROM "https://example.com/flights" AS f
1390
+ RETURN f.left_id AS left_id, f.right_id AS right_id, f.airline AS airline
1391
+ };
1392
+ MATCH (a:City {name: 'NYC'})-[r:FLIGHT]->(b:City)
1393
+ WHERE b.country IN ['US', 'CA']
1394
+ RETURN a.name AS origin, b.name AS destination, r.airline AS airline
1395
+ `);
1396
+ const { info } = runner.metadata;
1397
+
1398
+ console.log(info.nodes);
1399
+ // {
1400
+ // City: {
1401
+ // properties: ["country", "name"],
1402
+ // sources: ["https://example.com/cities"],
1403
+ // literal_values: { country: ["US", "CA"], name: ["NYC"] }
1404
+ // }
1405
+ // }
1406
+ console.log(info.relationships);
1407
+ // {
1408
+ // FLIGHT: {
1409
+ // properties: ["airline"],
1410
+ // sources: ["https://example.com/flights"],
1411
+ // literal_values: {}
1412
+ // }
1413
+ // }
1414
+ console.log(info.declared.nodes.City);
1415
+ // { properties: ["country", "id", "name"], sources: ["https://example.com/cities"] }
1416
+ console.log(info.sources);
1417
+ // ["https://example.com/cities", "https://example.com/flights"]
1418
+ ```
1419
+
1420
+ `StatementInfo` resolves sources and declared schemas for **any** virtual
1421
+ the query touches - both inline `CREATE VIRTUAL` clauses and
1422
+ previously-registered virtuals reached via `MATCH` or `DELETE`. The flat
1423
+ `node_labels`, `relationship_types`, `sources`, `node_properties`, and
1424
+ `relationship_properties` fields stay in sync with the per-entity `nodes` /
1425
+ `relationships` maps and are convenient for quick aggregate checks. Only
1426
+ purely literal AST subtrees end up in `literal_values` - values that depend
1427
+ on parameters, references, f-strings, or subqueries are skipped.
1428
+
1429
+ ### Row-level Provenance: Node and Relationship IDs Behind Each Result
1430
+
1431
+ `StatementInfo` describes the _structural_ lineage of a query - which
1432
+ labels, types, and sources back it. To get the _row-level_ lineage -
1433
+ which concrete node ids and relationship `(left_id, right_id, type)`
1434
+ hops actually flowed into each result row - pass `{ provenance: true }`
1435
+ when constructing the runner and read `runner.provenance`:
1436
+
1437
+ ```javascript
1438
+ const fq = new FlowQuery(
1439
+ `
1440
+ MATCH (a:City {name: 'NYC'})-[r:FLIGHT]->(b:City)
1441
+ RETURN a.name AS origin, b.name AS destination
1442
+ `,
1443
+ null,
1444
+ null,
1445
+ { provenance: true }
1446
+ );
1447
+ await fq.run();
1448
+
1449
+ fq.results;
1450
+ // [{ origin: 'NYC', destination: 'LAX' }, { origin: 'NYC', destination: 'YYZ' }]
1451
+
1452
+ fq.provenance;
1453
+ // [
1454
+ // {
1455
+ // nodes: [
1456
+ // { alias: 'a', label: 'City', id: 'nyc' },
1457
+ // { alias: 'b', label: 'City', id: 'lax' }
1458
+ // ],
1459
+ // relationships: [
1460
+ // { alias: 'r', type: 'FLIGHT',
1461
+ // hops: [{ left_id: 'nyc', right_id: 'lax', type: 'FLIGHT' }],
1462
+ // path: ['nyc', 'lax'] }
1463
+ // ],
1464
+ // rows: [/* per-input-row segments, see below */]
1465
+ // },
1466
+ // { nodes: [...], relationships: [{ alias: 'r', type: 'FLIGHT',
1467
+ // hops: [{ left_id: 'nyc', right_id: 'yyz', type: 'FLIGHT' }],
1468
+ // path: ['nyc', 'yyz'] }], rows: [...] }
1469
+ // ]
1470
+ ```
1471
+
1472
+ Semantics:
1473
+
1474
+ - `runner.provenance` is aligned **by index** with `runner.results`.
1475
+ - Each `NodeBinding.id` preserves the original scalar type of the
1476
+ underlying record (a numeric id stays a number).
1477
+ - Anonymous nodes / relationships from the pattern are included with
1478
+ `alias: null`.
1479
+ - Variable-length matches (`[:T*m..n]`) populate `hops` with every
1480
+ traversed edge in path order.
1481
+ - Every `RelationshipBinding` also carries a `path` field listing every
1482
+ visited node id in order: `[hops[0].left_id, hops[0].right_id,
1483
+ hops[1].right_id, …]`. For single-hop matches `path` has exactly two
1484
+ entries; for variable-length matches `path.length === hops.length + 1`.
1485
+ - `OPTIONAL MATCH` misses surface as `id: null` for the unmatched node
1486
+ and an empty `hops: []` for the unmatched relationship.
1487
+ - `ORDER BY` and `LIMIT` permute and truncate `provenance` in lockstep
1488
+ with `results`.
1489
+ - Aggregate `RETURN` (e.g. `count`, `collect`, `sum`) unions all
1490
+ contributing bindings into the output group's provenance, deduplicated
1491
+ per `(alias, id)` for nodes and per `(alias, hops)` for relationships.
1492
+ - `UNION ALL` concatenates branch provenance; `UNION` keeps the first
1493
+ branch's lineage for deduplicated rows.
1494
+ - Aggregating `WITH` clauses carry lineage forward. Inside the group, the
1495
+ contributing bindings (the upstream `MATCH` nodes and relationships)
1496
+ are deduplicated and frozen; any subsequent `MATCH` adds its own
1497
+ live bindings on top, so the final `RETURN` row's provenance shows
1498
+ both the pre-aggregation sources and the post-aggregation bindings.
1499
+ Chained aggregating `WITH` clauses compose transitively - the original
1500
+ ids and hops survive every aggregation hop.
1501
+
1502
+ When the option is omitted or set to `false`, the runner has zero
1503
+ provenance overhead and `runner.provenance` returns an empty array.
1504
+
1505
+ #### Per-Input-Row Segments: Aligning `collect()` with its Sources
1506
+
1507
+ Each `RowProvenance` also carries a `rows` array: one **segment** per
1508
+ input row that contributed to the result. A segment is just the
1509
+ `{ nodes, relationships }` slice for that single contributing row.
1510
+
1511
+ For non-aggregate rows `rows` always has length 1 and mirrors the
1512
+ top-level `nodes`/`relationships`. For aggregate rows the array
1513
+ positionally aligns with array-valued aggregates such as `collect`:
1514
+
1515
+ ```cypher
1516
+ MATCH (a:City)-[:FLIGHT]->(b:City)
1517
+ RETURN a.country AS country, collect(b.name) AS destinations
1518
+ ```
1519
+
1520
+ ```javascript
1521
+ fq.results[0];
1522
+ // { country: 'US', destinations: ['LAX', 'YYZ'] }
1523
+ fq.provenance[0].rows.length; // 2
1524
+ fq.provenance[0].rows[0].nodes; // contributed LAX: includes b = lax
1525
+ fq.provenance[0].rows[1].nodes; // contributed YYZ: includes b = yyz
1526
+ ```
1527
+
1528
+ This lets you map each element of a `collect` result (or each input value
1529
+ folded into a `sum`/`avg`) back to the exact node / relationship ids that produced it.
1530
+
1531
+ #### Property-Level Lineage
1532
+
1533
+ Each `NodeBinding` and `RelationshipHop` produced under `{ provenance:
1534
+ true }` also carries the matched **property values** alongside the ids:
1535
+
1536
+ ```javascript
1537
+ const fq = new FlowQuery(query, null, null, { provenance: true });
1538
+ await fq.run();
1539
+
1540
+ fq.provenance[0].nodes[0];
1541
+ // {
1542
+ // alias: 'a', label: 'City', id: 'nyc',
1543
+ // properties: { name: 'New York', country: 'US' }
1544
+ // }
1545
+ fq.provenance[0].relationships[0].hops[0];
1546
+ // {
1547
+ // left_id: 'nyc', right_id: 'lax', type: 'FLIGHT',
1548
+ // properties: { airline: 'AA' }
1549
+ // }
1550
+ ```
1551
+
1552
+ - `NodeBinding.properties` is a shallow copy of the matched record with
1553
+ `id` and `_label` stripped. `RelationshipHop.properties` is a shallow
1554
+ copy of the matched relationship's user-visible properties.
1555
+
1556
+ #### Threading Lineage Through Virtual Sub-Queries
1557
+
1558
+ A `CREATE VIRTUAL (:X) AS { ... }` block wraps an inner FlowQuery that
1559
+ produces the synthesised records exposed under the `:X` label. By
1560
+ default, a downstream `MATCH (x:X)` only sees the synthesised row's
1561
+ `id` - the upstream query that produced it is opaque.
1562
+
1563
+ When `{ provenance: true }` is set, the inner runner's `RowProvenance`
1564
+ is threaded onto every binding whose record came from a virtual. Each
1565
+ `NodeBinding` and each `RelationshipHop` gains an optional `source:
1566
+ RowProvenance` field carrying the inner row's full lineage -
1567
+ recursively, when a virtual matches another virtual:
1568
+
1569
+ ```typescript
1570
+ import { Runner } from "flowquery";
1571
+
1572
+ // Virtual graph: derived city = US-only subset of SrcCity.
1573
+ await new Runner(`
1574
+ CREATE VIRTUAL (:SrcCity) AS {
1575
+ UNWIND [
1576
+ { id: 'nyc', country: 'US' },
1577
+ { id: 'lhr', country: 'UK' }
1578
+ ] AS c
1579
+ RETURN c.id AS id, c.country AS country
1580
+ }
1581
+ `).run();
1582
+ await new Runner(`
1583
+ CREATE VIRTUAL (:DerivedCity) AS {
1584
+ MATCH (s:SrcCity)
1585
+ WHERE s.country = 'US'
1586
+ RETURN s.id AS id
1587
+ }
1588
+ `).run();
1589
+
1590
+ const fq = new Runner(`MATCH (d:DerivedCity) RETURN d.id AS id`, null, null, { provenance: true });
1591
+ await fq.run();
1592
+
1593
+ fq.provenance[0].nodes[0];
1594
+ // {
1595
+ // alias: 'd', label: 'DerivedCity', id: 'nyc',
1596
+ // source: {
1597
+ // nodes: [{ alias: 's', label: 'SrcCity', id: 'nyc' }],
1598
+ // relationships: []
1599
+ // }
1600
+ // }
1601
+ ```
1602
+
1603
+ Semantics:
1604
+
1605
+ - The `source` field is **omitted** when the binding's record did not
1606
+ come from a virtual sub-query. By contrast, records from `UNWIND … RETURN`
1607
+ inside a virtual produce a `source` with empty `nodes` and
1608
+ `relationships`, signalling "lineage was threaded but no graph slots
1609
+ were bound at this level".
1610
+ - Sub-query lineage is **recursive**: a virtual that matches another
1611
+ virtual carries nested `source` chains all the way down.
1612
+ - Provenance mode bypasses the static-virtual cache because each
1613
+ invocation must produce fresh records to back the lineage weak-map.
1614
+ Static caching continues to apply when `provenance` is off.
1615
+
1616
+ #### Data Sources and LET Chaining
1617
+
1618
+ `LOAD` operations contribute row-level `data_sources` entries on each
1619
+ emitted `RowSegment`: one `{ source, source_provenance? }` per loaded
1620
+ record. `source` is the URL, file URI, async-function name, or
1621
+ `let://<name>` reference; `source_provenance` is the inner
1622
+ `RowProvenance` of the source row when the `LOAD` was reading from a
1623
+ `LET`-bound dataset whose right-hand side itself produced lineage.
1624
+
1625
+ The same chain surfaces structurally. `info.sources` and the per-label
1626
+ `info.nodes[Label].sources` arrays follow `LOAD FROM <letName>`
1627
+ references and emit a `let://<name>` entry, plus the underlying
1628
+ sources the `LET` sub-query touched (when both definitions are
1629
+ visible to the crawler in the same query).
1630
+
1631
+ ```cypher
1632
+ LET cities = { LOAD JSON FROM 'file:///data/cities.json' AS c RETURN c.id AS id, c.name AS name };
1633
+ CREATE VIRTUAL (:City) AS { LOAD JSON FROM cities AS c RETURN c.id AS id, c.name AS name };
1634
+ MATCH (c:City) RETURN c.name AS name
1635
+ ```
1636
+
1637
+ ```javascript
1638
+ runner.metadata.info.nodes.City.sources;
1639
+ // ['file:///data/cities.json', 'let://cities']
1640
+
1641
+ runner.provenance[0].nodes[0].source.data_sources[0];
1642
+ // {
1643
+ // source: 'let://cities',
1644
+ // source_provenance: {
1645
+ // nodes: [], relationships: [],
1646
+ // data_sources: [{ source: 'file:///data/cities.json' }]
1647
+ // }
1648
+ // }
1649
+ ```
1650
+
1651
+ #### Column-Level Lineage: Tracing Each Result Cell to Its Source
1652
+
1653
+ `runner.info.returns` (added to `StatementInfo`) maps every output
1654
+ column to the `alias.property` accesses that compose it. Combined with
1655
+ row provenance it gives you per-cell traceability - value → source
1656
+ binding → node id → source URL - without any runtime AST inspection.
1657
+
1658
+ For a query
1659
+
1660
+ ```cypher
1661
+ MATCH (c:City)-[f:FLIGHT]->(d:City)
1662
+ WHERE c.country = 'US'
1663
+ RETURN c.name AS origin, d.name AS destination, f.airline AS airline
1664
+ ```
1665
+
1666
+ `runner.info.returns` is:
1667
+
1668
+ ```javascript
1669
+ {
1670
+ origin: {
1671
+ references: [{ alias: 'c', kind: 'node', labels: ['City'], property: 'name' }],
1672
+ kind: 'property'
1673
+ },
1674
+ destination: {
1675
+ references: [{ alias: 'd', kind: 'node', labels: ['City'], property: 'name' }],
1676
+ kind: 'property'
1677
+ },
1678
+ airline: {
1679
+ references: [{
1680
+ alias: 'f', kind: 'relationship',
1681
+ labels: ['FLIGHT'], property: 'airline'
1682
+ }],
1683
+ kind: 'property'
1684
+ }
1685
+ }
1686
+ ```
1687
+
1688
+ `kind` summarises how the column was built:
1689
+
1690
+ | `kind` | Meaning |
1691
+ | -------------- | ------------------------------------------------------------------------- |
1692
+ | `'literal'` | Pure literal expression, no bindings (e.g. `42 AS answer`). |
1693
+ | `'property'` | Direct `alias.property` projection (or pass-through). |
1694
+ | `'expression'` | Computed from one or more `alias.property` accesses. |
1695
+ | `'aggregate'` | Aggregate function (`count`, `sum`, `collect`, …); see `aggregate` field. |
1696
+
1697
+ Notes:
1698
+
1699
+ - The map is keyed by the column's output alias (the part after `AS`),
1700
+ falling back to `expr0`, `expr1`, … for unnamed columns.
1701
+ - `count(c)` and similar aggregates over a bare binding produce
1702
+ `kind: 'aggregate'`, `aggregate: 'count'`, and an empty `references`
1703
+ list (no specific property is read). Use `count(c.id)` to surface a
1704
+ property reference.
1705
+ - Multi-label intersection matches (`MATCH (n:A:B)`) populate every
1706
+ label in `references[i].labels`.
1707
+
1708
+ #### Combining Lineage and Provenance: `traceRow()` and `lineage()`
1709
+
1710
+ `info.returns` (structural) and `runner.provenance` (runtime) are kept
1711
+ as separate streams so each is useful on its own. When you want both
1712
+ joined per cell, the `Runner` exposes two convenience methods.
1713
+
1714
+ `runner.traceRow(rowIndex)` returns one `CellTrace` per output column
1715
+ for a single row, pairing the column's structural lineage with the
1716
+ node / relationship bindings whose alias matches it. The matched
1717
+ property value is extracted for you (including the built-ins `id`,
1718
+ `left_id`, `right_id`, `type`):
1719
+
1720
+ ```javascript
1721
+ const runner = new FlowQuery(
1722
+ `
1723
+ MATCH (c:City)-[f:FLIGHT]->(d:City)
1724
+ WHERE c.country = 'US'
1725
+ RETURN c.name AS origin, d.name AS destination, f.airline AS airline
1726
+ `,
1727
+ null,
1728
+ null,
1729
+ { provenance: true }
1730
+ );
1731
+ await runner.run();
1732
+
1733
+ const trace = runner.traceRow(0);
1734
+ // trace.origin = {
1735
+ // column: 'origin',
1736
+ // value: 'New York',
1737
+ // lineage: { references: [{ alias: 'c', kind: 'node', labels: ['City'], property: 'name' }], kind: 'property' },
1738
+ // bindings: [{
1739
+ // reference: { alias: 'c', kind: 'node', labels: ['City'], property: 'name' },
1740
+ // value: 'New York',
1741
+ // node: { alias: 'c', label: 'City', id: 'nyc', properties: { name: 'New York', country: 'US' }, source: {...} }
1742
+ // }]
1743
+ // }
1744
+ // trace.airline.bindings[0].relationship.hops[0].properties.airline === 'AA'
1745
+ ```
1746
+
1747
+ `runner.lineage()` is the one-shot equivalent over the entire result
1748
+ set:
1749
+
1750
+ ```javascript
1751
+ const report = runner.lineage();
1752
+ // report.columns: same shape as info.returns (structural per-column).
1753
+ // report.rows[i]: same shape as traceRow(i) (per-cell trace).
1754
+ ```
1755
+
1756
+ `bindings` is empty for literal columns, when the runner was
1757
+ constructed without `{ provenance: true }`, or when a row's bindings
1758
+ don't intersect the column's references. Aggregate columns like
1759
+ `collect(c.id)` surface one binding per contributing input row, so
1760
+ `trace.ids.bindings.map(b => b.value)` aligns with the collected
1761
+ array. `info`, `provenance`, and `metadata` remain available unchanged:
1762
+ `traceRow` / `lineage` are purely additive helpers.
1763
+
1764
+
1169
1765
  ## Contributing
1170
1766
 
1171
1767
  This project welcomes contributions and suggestions. Most contributions require you to agree to a