flowquery 1.0.70 → 1.0.71

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/README.md +456 -103
  2. package/dist/compute/provenance.d.ts +201 -0
  3. package/dist/compute/provenance.d.ts.map +1 -0
  4. package/dist/compute/provenance.js +216 -0
  5. package/dist/compute/provenance.js.map +1 -0
  6. package/dist/compute/runner.d.ts +196 -3
  7. package/dist/compute/runner.d.ts.map +1 -1
  8. package/dist/compute/runner.js +283 -2
  9. package/dist/compute/runner.js.map +1 -1
  10. package/dist/flowquery.min.js +1 -1
  11. package/dist/graph/bindings.d.ts.map +1 -1
  12. package/dist/graph/bindings.js +15 -1
  13. package/dist/graph/bindings.js.map +1 -1
  14. package/dist/graph/data_cache.d.ts +4 -0
  15. package/dist/graph/data_cache.d.ts.map +1 -1
  16. package/dist/graph/data_cache.js +15 -3
  17. package/dist/graph/data_cache.js.map +1 -1
  18. package/dist/graph/data_resolver.d.ts.map +1 -1
  19. package/dist/graph/data_resolver.js +23 -4
  20. package/dist/graph/data_resolver.js.map +1 -1
  21. package/dist/graph/physical_node.d.ts +4 -8
  22. package/dist/graph/physical_node.d.ts.map +1 -1
  23. package/dist/graph/physical_node.js +14 -85
  24. package/dist/graph/physical_node.js.map +1 -1
  25. package/dist/graph/physical_relationship.d.ts +4 -8
  26. package/dist/graph/physical_relationship.d.ts.map +1 -1
  27. package/dist/graph/physical_relationship.js +14 -85
  28. package/dist/graph/physical_relationship.js.map +1 -1
  29. package/dist/graph/relationship_match_collector.d.ts.map +1 -1
  30. package/dist/graph/relationship_match_collector.js +9 -0
  31. package/dist/graph/relationship_match_collector.js.map +1 -1
  32. package/dist/graph/virtual_sources.d.ts +8 -0
  33. package/dist/graph/virtual_sources.d.ts.map +1 -0
  34. package/dist/graph/virtual_sources.js +31 -0
  35. package/dist/graph/virtual_sources.js.map +1 -0
  36. package/dist/graph/virtual_statement.d.ts +41 -0
  37. package/dist/graph/virtual_statement.d.ts.map +1 -0
  38. package/dist/graph/virtual_statement.js +143 -0
  39. package/dist/graph/virtual_statement.js.map +1 -0
  40. package/dist/index.browser.d.ts +3 -3
  41. package/dist/index.browser.d.ts.map +1 -1
  42. package/dist/index.browser.js.map +1 -1
  43. package/dist/index.node.d.ts +3 -3
  44. package/dist/index.node.d.ts.map +1 -1
  45. package/dist/index.node.js.map +1 -1
  46. package/dist/parsing/operations/aggregated_return.d.ts +10 -0
  47. package/dist/parsing/operations/aggregated_return.d.ts.map +1 -1
  48. package/dist/parsing/operations/aggregated_return.js +56 -3
  49. package/dist/parsing/operations/aggregated_return.js.map +1 -1
  50. package/dist/parsing/operations/aggregated_with.d.ts +20 -0
  51. package/dist/parsing/operations/aggregated_with.d.ts.map +1 -1
  52. package/dist/parsing/operations/aggregated_with.js +33 -1
  53. package/dist/parsing/operations/aggregated_with.js.map +1 -1
  54. package/dist/parsing/operations/group_by.d.ts +32 -0
  55. package/dist/parsing/operations/group_by.d.ts.map +1 -1
  56. package/dist/parsing/operations/group_by.js +103 -0
  57. package/dist/parsing/operations/group_by.js.map +1 -1
  58. package/dist/parsing/operations/let.d.ts.map +1 -1
  59. package/dist/parsing/operations/let.js +53 -0
  60. package/dist/parsing/operations/let.js.map +1 -1
  61. package/dist/parsing/operations/load.d.ts +20 -0
  62. package/dist/parsing/operations/load.d.ts.map +1 -1
  63. package/dist/parsing/operations/load.js +71 -0
  64. package/dist/parsing/operations/load.js.map +1 -1
  65. package/dist/parsing/operations/order_by.d.ts +6 -0
  66. package/dist/parsing/operations/order_by.d.ts.map +1 -1
  67. package/dist/parsing/operations/order_by.js +10 -1
  68. package/dist/parsing/operations/order_by.js.map +1 -1
  69. package/dist/parsing/operations/return.d.ts +34 -0
  70. package/dist/parsing/operations/return.d.ts.map +1 -1
  71. package/dist/parsing/operations/return.js +95 -2
  72. package/dist/parsing/operations/return.js.map +1 -1
  73. package/dist/parsing/operations/union.d.ts +19 -2
  74. package/dist/parsing/operations/union.d.ts.map +1 -1
  75. package/dist/parsing/operations/union.js +47 -11
  76. package/dist/parsing/operations/union.js.map +1 -1
  77. package/dist/parsing/operations/union_all.d.ts +5 -1
  78. package/dist/parsing/operations/union_all.d.ts.map +1 -1
  79. package/dist/parsing/operations/union_all.js +8 -2
  80. package/dist/parsing/operations/union_all.js.map +1 -1
  81. package/dist/parsing/statement_info_crawler.d.ts +106 -0
  82. package/dist/parsing/statement_info_crawler.d.ts.map +1 -1
  83. package/dist/parsing/statement_info_crawler.js +231 -12
  84. package/dist/parsing/statement_info_crawler.js.map +1 -1
  85. package/package.json +1 -1
package/README.md CHANGED
@@ -12,11 +12,11 @@ Beyond graphs, FlowQuery provides a full data processing pipeline language with
12
12
 
13
13
  The combination of graph querying and pipeline processing makes FlowQuery ideal for the retrieval stage of Retrieval Augmented Generation (RAG). A typical graph RAG flow works as follows:
14
14
 
15
- 1. **User query** The user asks a question in natural language.
16
- 2. **Schema retrieval** The application retrieves the virtual graph schema via `CALL schema()` and injects it into the system instructions of the query-generation LLM, so it knows which node labels, relationship types, and properties are available.
17
- 3. **Query generation** The LLM, grounded in the schema, generates a precise OpenCypher query to retrieve the data needed to answer the question.
18
- 4. **Query execution** The FlowQuery engine executes the generated OpenCypher query against the virtual graph and returns the results as grounding data.
19
- 5. **Response formulation** The LLM formulates a final response informed by the grounding data.
15
+ 1. **User query** - The user asks a question in natural language.
16
+ 2. **Schema retrieval** - The application retrieves the virtual graph schema via `CALL schema()` and injects it into the system instructions of the query-generation LLM, so it knows which node labels, relationship types, and properties are available.
17
+ 3. **Query generation** - The LLM, grounded in the schema, generates a precise OpenCypher query to retrieve the data needed to answer the question.
18
+ 4. **Query execution** - The FlowQuery engine executes the generated OpenCypher query against the virtual graph and returns the results as grounding data.
19
+ 5. **Response formulation** - The LLM formulates a final response informed by the grounding data.
20
20
 
21
21
  ```
22
22
  ┌───────────────────┐
@@ -37,7 +37,7 @@ The combination of graph querying and pipeline processing makes FlowQuery ideal
37
37
  └──────────┘
38
38
  ```
39
39
 
40
- The schema is retrieved using FlowQuery's built-in `schema()` function, which returns the structure of all registered virtual nodes and relationships including labels, types, endpoint labels, property names, and sample values. This schema is then included in the LLM's system instructions so it can generate correct queries grounded in the actual graph model:
40
+ The schema is retrieved using FlowQuery's built-in `schema()` function, which returns the structure of all registered virtual nodes and relationships - including labels, types, endpoint labels, property names, and sample values. This schema is then included in the LLM's system instructions so it can generate correct queries grounded in the actual graph model:
41
41
 
42
42
  ```cypher
43
43
  CALL schema() YIELD kind, label, type, from_label, to_label, properties, sample
@@ -227,7 +227,7 @@ WITH 1 AS x RETURN x UNION ALL WITH 1 AS x RETURN x
227
227
 
228
228
  #### Multi-Statement Queries
229
229
 
230
- Multiple statements can be separated by semicolons. Only declaration statements `CREATE VIRTUAL`, `DELETE VIRTUAL` (alias: `DROP VIRTUAL`), `REFRESH VIRTUAL`, `LET`, `UPDATE`, and `MERGE INTO` may appear before the last statement. The last statement can be any valid query.
230
+ Multiple statements can be separated by semicolons. Only declaration statements - `CREATE VIRTUAL`, `DELETE VIRTUAL` (alias: `DROP VIRTUAL`), `REFRESH VIRTUAL`, `LET`, `UPDATE`, and `MERGE INTO` - may appear before the last statement. The last statement can be any valid query.
231
231
 
232
232
  ```cypher
233
233
  CREATE VIRTUAL (:Person) AS {
@@ -267,7 +267,7 @@ CREATE STATIC VIRTUAL (:Country) AS {
267
267
  ```
268
268
 
269
269
  The sub-query runs once on first access and the result is reused for every
270
- subsequent query in the same process across `Runner` instances. STATIC
270
+ subsequent query in the same process - across `Runner` instances. STATIC
271
271
  virtual entities are protected: re-running `CREATE STATIC VIRTUAL (:Country)`
272
272
  without first dropping the existing entry raises an error. Use
273
273
  `DROP VIRTUAL (:Country)` (an alias for `DELETE VIRTUAL`) to remove it.
@@ -344,88 +344,6 @@ because the mutation would be invisibly overwritten by the next
344
344
  refresh. Use `REFRESH BINDING name` to re-evaluate the source, or
345
345
  `DROP BINDING name` and redefine the binding plainly.
346
346
 
347
- #### Statement Info: Labels, Properties, and Source Lineage
348
-
349
- `metadata.info` carries a `StatementInfo` describing the _structure_ the
350
- query touches — independent of execution. It captures:
351
-
352
- - The node labels and relationship types referenced.
353
- - The data sources backing the underlying virtual definitions.
354
- - The node/relationship properties **consumed** by the query —
355
- `alias.prop` accesses anywhere in `MATCH`, `WHERE`, `WITH`, `RETURN`,
356
- `ORDER BY`, or function arguments, plus inline pattern properties
357
- like `(u:User {id: 'rick.o'})`.
358
- - The properties **declared** by each virtual's `RETURN` clause via
359
- `info.declared`, so you can validate that a query references only
360
- declared properties.
361
- - Literal values supplied for properties at the call site via
362
- `info.nodes[Label].literal_values` — collected from inline pattern
363
- properties and from equality / `IN` predicates such as
364
- `WHERE u.id = 'rick.o'` or `WHERE u.id IN ['a', 'b']`.
365
-
366
- This is useful for governance, lineage UIs, query-cost estimation, schema
367
- validation, or routing decisions before the query runs.
368
-
369
- The same `StatementInfoCrawler` can also be used directly on any parsed
370
- AST without going through a `Runner`:
371
-
372
- ```javascript
373
- import { StatementInfoCrawler } from "flowquery";
374
-
375
- const crawler = new StatementInfoCrawler();
376
- const info = crawler.crawl(parsedAst);
377
- ```
378
-
379
- For end-to-end lineage from a property to its data source, use the
380
- per-entity `nodes` and `relationships` maps:
381
-
382
- ```javascript
383
- const runner = new FlowQuery(`
384
- CREATE VIRTUAL (:City) AS {
385
- LOAD JSON FROM "https://example.com/cities" AS c
386
- RETURN c.id AS id, c.name AS name, c.country AS country
387
- };
388
- CREATE VIRTUAL (:City)-[:FLIGHT]-(:City) AS {
389
- LOAD JSON FROM "https://example.com/flights" AS f
390
- RETURN f.left_id AS left_id, f.right_id AS right_id, f.airline AS airline
391
- };
392
- MATCH (a:City {name: 'NYC'})-[r:FLIGHT]->(b:City)
393
- WHERE b.country IN ['US', 'CA']
394
- RETURN a.name AS origin, b.name AS destination, r.airline AS airline
395
- `);
396
- const { info } = runner.metadata;
397
-
398
- console.log(info.nodes);
399
- // {
400
- // City: {
401
- // properties: ["country", "name"],
402
- // sources: ["https://example.com/cities"],
403
- // literal_values: { country: ["US", "CA"], name: ["NYC"] }
404
- // }
405
- // }
406
- console.log(info.relationships);
407
- // {
408
- // FLIGHT: {
409
- // properties: ["airline"],
410
- // sources: ["https://example.com/flights"],
411
- // literal_values: {}
412
- // }
413
- // }
414
- console.log(info.declared.nodes.City);
415
- // { properties: ["country", "id", "name"], sources: ["https://example.com/cities"] }
416
- console.log(info.sources);
417
- // ["https://example.com/cities", "https://example.com/flights"]
418
- ```
419
-
420
- `StatementInfo` resolves sources and declared schemas for **any** virtual
421
- the query touches — both inline `CREATE VIRTUAL` clauses and
422
- previously-registered virtuals reached via `MATCH` or `DELETE`. The flat
423
- `node_labels`, `relationship_types`, `sources`, `node_properties`, and
424
- `relationship_properties` fields stay in sync with the per-entity `nodes` /
425
- `relationships` maps and are convenient for quick aggregate checks. Only
426
- purely literal AST subtrees end up in `literal_values` — values that depend
427
- on parameters, references, f-strings, or subqueries are skipped.
428
-
429
347
  ### WHERE Clause
430
348
 
431
349
  Filters rows based on conditions. Supports the following operators:
@@ -502,7 +420,7 @@ WITH d WHERE d.id >= threshold OR d.id <= 2
502
420
  RETURN d.id AS id, d.name AS name
503
421
  ```
504
422
 
505
- `LET` fails if the binding already exists use `UPDATE` to overwrite.
423
+ `LET` fails if the binding already exists - use `UPDATE` to overwrite.
506
424
 
507
425
  #### UPDATE
508
426
 
@@ -515,7 +433,7 @@ RETURN counter AS counter
515
433
  // [{ counter: 1 }]
516
434
  ```
517
435
 
518
- `UPDATE` fails if the binding doesn't exist use `LET` first.
436
+ `UPDATE` fails if the binding doesn't exist - use `LET` first.
519
437
 
520
438
  #### UPDATE ... AS alias DELETE WHERE ...
521
439
 
@@ -589,7 +507,7 @@ Notes:
589
507
  - `ON id` is shorthand for the equality predicate `t.id = s.id`; `ON (a, b)` requires equality on every listed key. Anything else is treated as a Boolean predicate evaluated per `(target, source)` pair.
590
508
  - `WHEN MATCHED THEN UPDATE SET .field` overwrites only the listed fields, preserving the rest from the existing row. `SET .field = expr` evaluates `expr` per matched pair, with target and source aliases in scope.
591
509
  - `WHEN NOT MATCHED THEN INSERT` (no row expression) appends the source row as-is. `INSERT { … }` appends an explicit row expression instead.
592
- - A `MERGE INTO` must declare at least one `WHEN` clause. Branches are independent omit `WHEN NOT MATCHED` to skip insertion, omit `WHEN MATCHED` to skip updates/deletes.
510
+ - A `MERGE INTO` must declare at least one `WHEN` clause. Branches are independent - omit `WHEN NOT MATCHED` to skip insertion, omit `WHEN MATCHED` to skip updates/deletes.
593
511
 
594
512
  ### Expressions
595
513
 
@@ -704,16 +622,16 @@ RETURN sum(n IN [1+2+3, 2, 3] | n^2) AS sum // 49
704
622
  Test list elements against a condition. Follow standard Cypher syntax.
705
623
 
706
624
  ```cypher
707
- // any true if at least one element matches
625
+ // any - true if at least one element matches
708
626
  RETURN any(n IN [1, 2, 3] WHERE n > 2) // true
709
627
 
710
- // all true if every element matches
628
+ // all - true if every element matches
711
629
  RETURN all(n IN [2, 4, 6] WHERE n > 0) // true
712
630
 
713
- // none true if no element matches
631
+ // none - true if no element matches
714
632
  RETURN none(n IN [1, 2, 3] WHERE n > 5) // true
715
633
 
716
- // single true if exactly one element matches
634
+ // single - true if exactly one element matches
717
635
  RETURN single(n IN [1, 2, 3] WHERE n > 2) // true
718
636
 
719
637
  // In a WHERE clause
@@ -898,7 +816,7 @@ MATCH (a:Person) WHERE NOT (a)-[:KNOWS]->(:Person) RETURN a.name
898
816
  **Subquery Expressions:** `EXISTS`, `COUNT`, and `COLLECT` evaluate a full subquery as an expression. The subquery can reference outer-scope variables and supports the complete FlowQuery pipeline (MATCH, WITH, WHERE, UNWIND, LOAD, etc.).
899
817
 
900
818
  ```cypher
901
- // EXISTS returns true if the subquery produces any rows
819
+ // EXISTS - returns true if the subquery produces any rows
902
820
  MATCH (p:Person)
903
821
  WHERE EXISTS {
904
822
  MATCH (p)-[:KNOWS]->(friend:Person)
@@ -906,12 +824,12 @@ WHERE EXISTS {
906
824
  }
907
825
  RETURN p.name
908
826
 
909
- // NOT EXISTS negate with NOT
827
+ // NOT EXISTS - negate with NOT
910
828
  MATCH (p:Person)
911
829
  WHERE NOT EXISTS { MATCH (p)-[:KNOWS]->(:Person) }
912
830
  RETURN p.name
913
831
 
914
- // COUNT returns the number of rows the subquery produces
832
+ // COUNT - returns the number of rows the subquery produces
915
833
  MATCH (p:Person)
916
834
  WHERE COUNT { MATCH (p)-[:KNOWS]->(:Person) } > 2
917
835
  RETURN p.name
@@ -920,7 +838,7 @@ RETURN p.name
920
838
  MATCH (p:Person)
921
839
  RETURN p.name, COUNT { MATCH (p)-[:KNOWS]->(:Person) } AS friendCount
922
840
 
923
- // COLLECT returns a list of single-column values from the subquery
841
+ // COLLECT - returns a list of single-column values from the subquery
924
842
  MATCH (p:Person)
925
843
  RETURN COLLECT {
926
844
  MATCH (p)-[:KNOWS]->(friend:Person)
@@ -1296,7 +1214,7 @@ RETURN f.name AS name, f.description AS description, f.category AS category
1296
1214
 
1297
1215
  ### Virtual Org Chart
1298
1216
 
1299
- This single multi-statement query creates a virtual graph for a fictitious company complete with employees, skills, phone numbers, and a management chain then queries it to produce an org chart. [Try live!](https://microsoft.github.io/FlowQuery/?rZXPbtNAEMbFhUMOReXAeW4OwkH521ZBQgolrVQlSogDQlRVtdjTeKm9a603Bavqw_AAPEVfDO16s_EmEVElfPHuzGf7N9-M7UN4BfX-MM0SXiC-hkEA9zUAgJdwqc_quKdRH1o-MJJiH7yACAKnMTLP1-kf_PucykSlTocTE6weEWZEyBSZ7IM3_IXhUtI71MIs5kxd-KbV6PV6jWar2fR8yG9pkuR9uPQCKYjEReH54I2QRCjymGZqd0YZYSF6Vw--C9q2oGMiwmUOM3qHgnh-FfTLFPgNDNmCMkRB2ULTuKBOchO03Ww5oAMRxlRiKJdCVeadJnwZqcUYmeT6JlukHUs6FbQgMCUSEw2yRToVPFqG0prrkLrJTdLOBumMkyglWVbW5Q0YSQpJw1xtPn_dpuxayguSYg4fBOe3Sl2lDJBRLqyhBuVJfrYdynmRYRAKmqmyvGkhY67mzTsXJIs_jbYxexZzRBl8i4np6f_G7LhmLnMNGBS5xFR7-BHvJlm-DXhkAQepeoMueMxyXVQF0LQSxoSRxW7Afd12fQxCsUw1FpEEdLNzqjkDSW4x5kmEAsaLVG4TH1viOU8ff-dwTkT4-Ids9p7fyJ9EoGvrk0ztuqYi0fVtDMEcc7nzPTqxnGdE0pTAIGkEpMDI9XYX55Moew7leibHI2vwlGaYUIaq_xrySn1SBYZcRHp_YDZvaaQyNPJXAVWCCqmzDa7wVWK1XlcPsNKt61DK9c7eSFejcnphw2U5Kl6uag_vaofuP6Fx2Z8Np5PZPLieT64a-_8WCd7Ia_MlFnQRl5tWtWtW0tkv6VYl7Z2S3n7JUVXS2Sk5rkq6OyUnruTfHTZXqbRZWttXN9GXmrWy_gXUce1v7RnUt_x_X08XoqI50A_FcnzWxyAANBrfKOwsWYVcjxNWR8ikK2NkNOUUVR9SjpNJm2mqpMtImU8XwqXUvmVcyHzOa88dBN9U9Bc)
1217
+ This single multi-statement query creates a virtual graph for a fictitious company - complete with employees, skills, phone numbers, and a management chain - then queries it to produce an org chart. [Try live!](https://microsoft.github.io/FlowQuery/?rZXPbtNAEMbFhUMOReXAeW4OwkH521ZBQgolrVQlSogDQlRVtdjTeKm9a603Bavqw_AAPEVfDO16s_EmEVElfPHuzGf7N9-M7UN4BfX-MM0SXiC-hkEA9zUAgJdwqc_quKdRH1o-MJJiH7yACAKnMTLP1-kf_PucykSlTocTE6weEWZEyBSZ7IM3_IXhUtI71MIs5kxd-KbV6PV6jWar2fR8yG9pkuR9uPQCKYjEReH54I2QRCjymGZqd0YZYSF6Vw--C9q2oGMiwmUOM3qHgnh-FfTLFPgNDNmCMkRB2ULTuKBOchO03Ww5oAMRxlRiKJdCVeadJnwZqcUYmeT6JlukHUs6FbQgMCUSEw2yRToVPFqG0prrkLrJTdLOBumMkyglWVbW5Q0YSQpJw1xtPn_dpuxayguSYg4fBOe3Sl2lDJBRLqyhBuVJfrYdynmRYRAKmqmyvGkhY67mzTsXJIs_jbYxexZzRBl8i4np6f_G7LhmLnMNGBS5xFR7-BHvJlm-DXhkAQepeoMueMxyXVQF0LQSxoSRxW7Afd12fQxCsUw1FpEEdLNzqjkDSW4x5kmEAsaLVG4TH1viOU8ff-dwTkT4-Ids9p7fyJ9EoGvrk0ztuqYi0fVtDMEcc7nzPTqxnGdE0pTAIGkEpMDI9XYX55Moew7leibHI2vwlGaYUIaq_xrySn1SBYZcRHp_YDZvaaQyNPJXAVWCCqmzDa7wVWK1XlcPsNKt61DK9c7eSFejcnphw2U5Kl6uag_vaofuP6Fx2Z8Np5PZPLieT64a-_8WCd7Ia_MlFnQRl5tWtWtW0tkv6VYl7Z2S3n7JUVXS2Sk5rkq6OyUnruTfHTZXqbRZWttXN9GXmrWy_gXUce1v7RnUt_x_X08XoqI50A_FcnzWxyAANBrfKOwsWYVcjxNWR8ikK2NkNOUUVR9SjpNJm2mqpMtImU8XwqXUvmVcyHzOa88dBN9U9Bc)
1300
1218
 
1301
1219
  ```cypher
1302
1220
  CREATE VIRTUAL (:Employee) AS {
@@ -1350,7 +1268,7 @@ Output:
1350
1268
  | Amara Johnson | Product Manager | Product | +1-555-0302 | [Scrum, Data Analysis, Stakeholder Mgmt] | Priya Patel |
1351
1269
  | Priya Patel | VP of Product | Product | +1-555-0301 | [Roadmapping, Analytics, UX] | Sara Chen |
1352
1270
 
1353
- You can further explore the graph for example, find the full management chain from any employee up to the CEO:
1271
+ You can further explore the graph - for example, find the full management chain from any employee up to the CEO:
1354
1272
 
1355
1273
  ```cypher
1356
1274
  MATCH (e:Employee)-[:REPORTS_TO*1..]->(mgr:Employee)
@@ -1409,6 +1327,441 @@ ORDER BY neighbor_count DESC
1409
1327
  LIMIT 10
1410
1328
  ```
1411
1329
 
1330
+ ## Lineage and Provenance
1331
+
1332
+ FlowQuery exposes two complementary forms of lineage:
1333
+
1334
+ - **Structural lineage** (`metadata.info`) - what labels, types,
1335
+ properties, and sources the parsed query touches. Available without
1336
+ running the query and with zero runtime overhead.
1337
+ - **Row-level provenance** (`runner.provenance`) - opt-in via
1338
+ `{ provenance: true }`; for every emitted result row, the concrete
1339
+ node ids and relationship hops bound to it, their matched property
1340
+ values, and (for virtual-backed records) the inner sub-query lineage
1341
+ that produced them.
1342
+
1343
+ Combined, they let you trace every cell of a result back to the source
1344
+ record, the source virtual, and ultimately the URL or call-site that
1345
+ backs it.
1346
+
1347
+ ### Statement Info: Labels, Properties, and Source Lineage
1348
+
1349
+ `metadata.info` carries a `StatementInfo` describing the _structure_ the
1350
+ query touches - independent of execution. It captures:
1351
+
1352
+ - The node labels and relationship types referenced.
1353
+ - The data sources backing the underlying virtual definitions.
1354
+ - The node/relationship properties **consumed** by the query -
1355
+ `alias.prop` accesses anywhere in `MATCH`, `WHERE`, `WITH`, `RETURN`,
1356
+ `ORDER BY`, or function arguments, plus inline pattern properties
1357
+ like `(u:User {id: 'rick.o'})`.
1358
+ - The properties **declared** by each virtual's `RETURN` clause via
1359
+ `info.declared`, so you can validate that a query references only
1360
+ declared properties.
1361
+ - Literal values supplied for properties at the call site via
1362
+ `info.nodes[Label].literal_values` - collected from inline pattern
1363
+ properties and from equality / `IN` predicates such as
1364
+ `WHERE u.id = 'rick.o'` or `WHERE u.id IN ['a', 'b']`.
1365
+
1366
+ This is useful for governance, lineage UIs, query-cost estimation, schema
1367
+ validation, or routing decisions before the query runs.
1368
+
1369
+ The same `StatementInfoCrawler` can also be used directly on any parsed
1370
+ AST without going through a `Runner`:
1371
+
1372
+ ```javascript
1373
+ import { StatementInfoCrawler } from "flowquery";
1374
+
1375
+ const crawler = new StatementInfoCrawler();
1376
+ const info = crawler.crawl(parsedAst);
1377
+ ```
1378
+
1379
+ For end-to-end lineage from a property to its data source, use the
1380
+ per-entity `nodes` and `relationships` maps:
1381
+
1382
+ ```javascript
1383
+ const runner = new FlowQuery(`
1384
+ CREATE VIRTUAL (:City) AS {
1385
+ LOAD JSON FROM "https://example.com/cities" AS c
1386
+ RETURN c.id AS id, c.name AS name, c.country AS country
1387
+ };
1388
+ CREATE VIRTUAL (:City)-[:FLIGHT]-(:City) AS {
1389
+ LOAD JSON FROM "https://example.com/flights" AS f
1390
+ RETURN f.left_id AS left_id, f.right_id AS right_id, f.airline AS airline
1391
+ };
1392
+ MATCH (a:City {name: 'NYC'})-[r:FLIGHT]->(b:City)
1393
+ WHERE b.country IN ['US', 'CA']
1394
+ RETURN a.name AS origin, b.name AS destination, r.airline AS airline
1395
+ `);
1396
+ const { info } = runner.metadata;
1397
+
1398
+ console.log(info.nodes);
1399
+ // {
1400
+ // City: {
1401
+ // properties: ["country", "name"],
1402
+ // sources: ["https://example.com/cities"],
1403
+ // literal_values: { country: ["US", "CA"], name: ["NYC"] }
1404
+ // }
1405
+ // }
1406
+ console.log(info.relationships);
1407
+ // {
1408
+ // FLIGHT: {
1409
+ // properties: ["airline"],
1410
+ // sources: ["https://example.com/flights"],
1411
+ // literal_values: {}
1412
+ // }
1413
+ // }
1414
+ console.log(info.declared.nodes.City);
1415
+ // { properties: ["country", "id", "name"], sources: ["https://example.com/cities"] }
1416
+ console.log(info.sources);
1417
+ // ["https://example.com/cities", "https://example.com/flights"]
1418
+ ```
1419
+
1420
+ `StatementInfo` resolves sources and declared schemas for **any** virtual
1421
+ the query touches - both inline `CREATE VIRTUAL` clauses and
1422
+ previously-registered virtuals reached via `MATCH` or `DELETE`. The flat
1423
+ `node_labels`, `relationship_types`, `sources`, `node_properties`, and
1424
+ `relationship_properties` fields stay in sync with the per-entity `nodes` /
1425
+ `relationships` maps and are convenient for quick aggregate checks. Only
1426
+ purely literal AST subtrees end up in `literal_values` - values that depend
1427
+ on parameters, references, f-strings, or subqueries are skipped.
1428
+
1429
+ ### Row-level Provenance: Node and Relationship IDs Behind Each Result
1430
+
1431
+ `StatementInfo` describes the _structural_ lineage of a query - which
1432
+ labels, types, and sources back it. To get the _row-level_ lineage -
1433
+ which concrete node ids and relationship `(left_id, right_id, type)`
1434
+ hops actually flowed into each result row - pass `{ provenance: true }`
1435
+ when constructing the runner and read `runner.provenance`:
1436
+
1437
+ ```javascript
1438
+ const fq = new FlowQuery(
1439
+ `
1440
+ MATCH (a:City {name: 'NYC'})-[r:FLIGHT]->(b:City)
1441
+ RETURN a.name AS origin, b.name AS destination
1442
+ `,
1443
+ null,
1444
+ null,
1445
+ { provenance: true }
1446
+ );
1447
+ await fq.run();
1448
+
1449
+ fq.results;
1450
+ // [{ origin: 'NYC', destination: 'LAX' }, { origin: 'NYC', destination: 'YYZ' }]
1451
+
1452
+ fq.provenance;
1453
+ // [
1454
+ // {
1455
+ // nodes: [
1456
+ // { alias: 'a', label: 'City', id: 'nyc' },
1457
+ // { alias: 'b', label: 'City', id: 'lax' }
1458
+ // ],
1459
+ // relationships: [
1460
+ // { alias: 'r', type: 'FLIGHT',
1461
+ // hops: [{ left_id: 'nyc', right_id: 'lax', type: 'FLIGHT' }],
1462
+ // path: ['nyc', 'lax'] }
1463
+ // ],
1464
+ // rows: [/* per-input-row segments, see below */]
1465
+ // },
1466
+ // { nodes: [...], relationships: [{ alias: 'r', type: 'FLIGHT',
1467
+ // hops: [{ left_id: 'nyc', right_id: 'yyz', type: 'FLIGHT' }],
1468
+ // path: ['nyc', 'yyz'] }], rows: [...] }
1469
+ // ]
1470
+ ```
1471
+
1472
+ Semantics:
1473
+
1474
+ - `runner.provenance` is aligned **by index** with `runner.results`.
1475
+ - Each `NodeBinding.id` preserves the original scalar type of the
1476
+ underlying record (a numeric id stays a number).
1477
+ - Anonymous nodes / relationships from the pattern are included with
1478
+ `alias: null`.
1479
+ - Variable-length matches (`[:T*m..n]`) populate `hops` with every
1480
+ traversed edge in path order.
1481
+ - Every `RelationshipBinding` also carries a `path` field listing every
1482
+ visited node id in order: `[hops[0].left_id, hops[0].right_id,
1483
+ hops[1].right_id, …]`. For single-hop matches `path` has exactly two
1484
+ entries; for variable-length matches `path.length === hops.length + 1`.
1485
+ - `OPTIONAL MATCH` misses surface as `id: null` for the unmatched node
1486
+ and an empty `hops: []` for the unmatched relationship.
1487
+ - `ORDER BY` and `LIMIT` permute and truncate `provenance` in lockstep
1488
+ with `results`.
1489
+ - Aggregate `RETURN` (e.g. `count`, `collect`, `sum`) unions all
1490
+ contributing bindings into the output group's provenance, deduplicated
1491
+ per `(alias, id)` for nodes and per `(alias, hops)` for relationships.
1492
+ - `UNION ALL` concatenates branch provenance; `UNION` keeps the first
1493
+ branch's lineage for deduplicated rows.
1494
+ - Aggregating `WITH` clauses carry lineage forward. Inside the group, the
1495
+ contributing bindings (the upstream `MATCH` nodes and relationships)
1496
+ are deduplicated and frozen; any subsequent `MATCH` adds its own
1497
+ live bindings on top, so the final `RETURN` row's provenance shows
1498
+ both the pre-aggregation sources and the post-aggregation bindings.
1499
+ Chained aggregating `WITH` clauses compose transitively - the original
1500
+ ids and hops survive every aggregation hop.
1501
+
1502
+ When the option is omitted or set to `false`, the runner has zero
1503
+ provenance overhead and `runner.provenance` returns an empty array.
1504
+
1505
+ #### Per-Input-Row Segments: Aligning `collect()` with its Sources
1506
+
1507
+ Each `RowProvenance` also carries a `rows` array: one **segment** per
1508
+ input row that contributed to the result. A segment is just the
1509
+ `{ nodes, relationships }` slice for that single contributing row.
1510
+
1511
+ For non-aggregate rows `rows` always has length 1 and mirrors the
1512
+ top-level `nodes`/`relationships`. For aggregate rows the array
1513
+ positionally aligns with array-valued aggregates such as `collect`:
1514
+
1515
+ ```cypher
1516
+ MATCH (a:City)-[:FLIGHT]->(b:City)
1517
+ RETURN a.country AS country, collect(b.name) AS destinations
1518
+ ```
1519
+
1520
+ ```javascript
1521
+ fq.results[0];
1522
+ // { country: 'US', destinations: ['LAX', 'YYZ'] }
1523
+ fq.provenance[0].rows.length; // 2
1524
+ fq.provenance[0].rows[0].nodes; // contributed LAX: includes b = lax
1525
+ fq.provenance[0].rows[1].nodes; // contributed YYZ: includes b = yyz
1526
+ ```
1527
+
1528
+ This lets you map each element of a `collect` result - or each input row
1529
+ folded into a `sum`/`avg` - back to the exact node / relationship ids behind it.
1530
+
1531
+ #### Property-Level Lineage
1532
+
1533
+ Each `NodeBinding` and `RelationshipHop` produced under `{ provenance:
1534
+ true }` also carries the matched **property values** alongside the ids:
1535
+
1536
+ ```javascript
1537
+ const fq = new FlowQuery(query, null, null, { provenance: true });
1538
+ await fq.run();
1539
+
1540
+ fq.provenance[0].nodes[0];
1541
+ // {
1542
+ // alias: 'a', label: 'City', id: 'nyc',
1543
+ // properties: { name: 'New York', country: 'US' }
1544
+ // }
1545
+ fq.provenance[0].relationships[0].hops[0];
1546
+ // {
1547
+ // left_id: 'nyc', right_id: 'lax', type: 'FLIGHT',
1548
+ // properties: { airline: 'AA' }
1549
+ // }
1550
+ ```
1551
+
1552
+ - `NodeBinding.properties` is a shallow copy of the matched record with
1553
+ `id` and `_label` stripped. `RelationshipHop.properties` is a shallow
1554
+ copy of the matched relationship's user-visible properties.
1555
+
1556
+ #### Threading Lineage Through Virtual Sub-Queries
1557
+
1558
+ A `CREATE VIRTUAL (:X) AS { ... }` block wraps an inner FlowQuery that
1559
+ produces the synthesised records exposed under the `:X` label. By
1560
+ default, a downstream `MATCH (x:X)` only sees the synthesised row's
1561
+ `id` - the upstream query that produced it is opaque.
1562
+
1563
+ When `{ provenance: true }` is set, the inner runner's `RowProvenance`
1564
+ is threaded onto every binding whose record came from a virtual. Each
1565
+ `NodeBinding` and each `RelationshipHop` gains an optional `source:
1566
+ RowProvenance` field carrying the inner row's full lineage -
1567
+ recursively, when a virtual matches another virtual:
1568
+
1569
+ ```typescript
1570
+ import { Runner } from "flowquery";
1571
+
1572
+ // Virtual graph: derived city = US-only subset of SrcCity.
1573
+ await new Runner(`
1574
+ CREATE VIRTUAL (:SrcCity) AS {
1575
+ UNWIND [
1576
+ { id: 'nyc', country: 'US' },
1577
+ { id: 'lhr', country: 'UK' }
1578
+ ] AS c
1579
+ RETURN c.id AS id, c.country AS country
1580
+ }
1581
+ `).run();
1582
+ await new Runner(`
1583
+ CREATE VIRTUAL (:DerivedCity) AS {
1584
+ MATCH (s:SrcCity)
1585
+ WHERE s.country = 'US'
1586
+ RETURN s.id AS id
1587
+ }
1588
+ `).run();
1589
+
1590
+ const fq = new Runner(`MATCH (d:DerivedCity) RETURN d.id AS id`, null, null, { provenance: true });
1591
+ await fq.run();
1592
+
1593
+ fq.provenance[0].nodes[0];
1594
+ // {
1595
+ // alias: 'd', label: 'DerivedCity', id: 'nyc',
1596
+ // source: {
1597
+ // nodes: [{ alias: 's', label: 'SrcCity', id: 'nyc' }],
1598
+ // relationships: []
1599
+ // }
1600
+ // }
1601
+ ```
1602
+
1603
+ Semantics:
1604
+
1605
+ - The `source` field is **omitted** when the binding's record did not
1606
+ come from a virtual sub-query. By contrast, records from `UNWIND … RETURN`
1607
+ inside a virtual do produce a `source`, with empty `nodes` and
1608
+ `relationships`, signalling "lineage was threaded but no graph slots
1609
+ were bound at this level".
1610
+ - Sub-query lineage is **recursive**: a virtual that matches another
1611
+ virtual carries nested `source` chains all the way down.
1612
+ - Provenance mode bypasses the static-virtual cache because each
1613
+ invocation must produce fresh records to back the lineage weak-map.
1614
+ Static caching continues to apply when `provenance` is off.
1615
+
1616
+ #### Data Sources and LET Chaining
1617
+
1618
+ `LOAD` operations contribute row-level `data_sources` entries on each
1619
+ emitted `RowSegment`: one `{ source, source_provenance? }` per loaded
1620
+ record. `source` is the URL, file URI, async-function name, or
1621
+ `let://<name>` reference; `source_provenance` is the inner
1622
+ `RowProvenance` of the source row when the `LOAD` was reading from a
1623
+ `LET`-bound dataset whose right-hand side itself produced lineage.
1624
+
1625
+ The same chain also surfaces in the structural metadata. `info.sources` and the per-label
1626
+ `info.nodes[Label].sources` arrays follow `LOAD FROM <letName>`
1627
+ references and emit a `let://<name>` entry, plus the underlying
1628
+ sources the `LET` sub-query touched (when both definitions are
1629
+ visible to the crawler in the same query).
1630
+
1631
+ ```cypher
1632
+ LET cities = { LOAD JSON FROM 'file:///data/cities.json' AS c RETURN c.id AS id, c.name AS name };
1633
+ CREATE VIRTUAL (:City) AS { LOAD JSON FROM cities AS c RETURN c.id AS id, c.name AS name };
1634
+ MATCH (c:City) RETURN c.name AS name
1635
+ ```
1636
+
1637
+ ```javascript
1638
+ runner.metadata.info.nodes.City.sources;
1639
+ // ['file:///data/cities.json', 'let://cities']
1640
+
1641
+ runner.provenance[0].nodes[0].source.data_sources[0];
1642
+ // {
1643
+ // source: 'let://cities',
1644
+ // source_provenance: {
1645
+ // nodes: [], relationships: [],
1646
+ // data_sources: [{ source: 'file:///data/cities.json' }]
1647
+ // }
1648
+ // }
1649
+ ```
1650
+
1651
+ #### Column-Level Lineage: Tracing Each Result Cell to Its Source
1652
+
1653
+ `runner.info.returns` (added to `StatementInfo`) maps every output
1654
+ column to the `alias.property` accesses that compose it. Combined with
1655
+ row provenance it gives you per-cell traceability - value → source
1656
+ binding → node id → source URL - without any runtime AST inspection.
1657
+
1658
+ For a query
1659
+
1660
+ ```cypher
1661
+ MATCH (c:City)-[f:FLIGHT]->(d:City)
1662
+ WHERE c.country = 'US'
1663
+ RETURN c.name AS origin, d.name AS destination, f.airline AS airline
1664
+ ```
1665
+
1666
+ `runner.info.returns` is:
1667
+
1668
+ ```javascript
1669
+ {
1670
+ origin: {
1671
+ references: [{ alias: 'c', kind: 'node', labels: ['City'], property: 'name' }],
1672
+ kind: 'property'
1673
+ },
1674
+ destination: {
1675
+ references: [{ alias: 'd', kind: 'node', labels: ['City'], property: 'name' }],
1676
+ kind: 'property'
1677
+ },
1678
+ airline: {
1679
+ references: [{
1680
+ alias: 'f', kind: 'relationship',
1681
+ labels: ['FLIGHT'], property: 'airline'
1682
+ }],
1683
+ kind: 'property'
1684
+ }
1685
+ }
1686
+ ```
1687
+
1688
+ `kind` summarises how the column was built:
1689
+
1690
+ | `kind` | Meaning |
1691
+ | -------------- | ------------------------------------------------------------------------- |
1692
+ | `'literal'` | Pure literal expression, no bindings (e.g. `42 AS answer`). |
1693
+ | `'property'` | Direct `alias.property` projection (or pass-through). |
1694
+ | `'expression'` | Computed from one or more `alias.property` accesses. |
1695
+ | `'aggregate'` | Aggregate function (`count`, `sum`, `collect`, …); see `aggregate` field. |
1696
+
1697
+ Notes:
1698
+
1699
+ - The map is keyed by the column's output alias (the part after `AS`),
1700
+ falling back to `expr0`, `expr1`, … for unnamed columns.
1701
+ - `count(c)` and similar aggregates over a bare binding produce
1702
+ `kind: 'aggregate'`, `aggregate: 'count'`, and an empty `references`
1703
+ list (no specific property is read). Use `count(c.id)` to surface a
1704
+ property reference.
1705
+ - Multi-label intersection matches (`MATCH (n:A:B)`) populate every
1706
+ label in `references[i].labels`.
1707
+
1708
+ #### Combining Lineage and Provenance: `traceRow()` and `lineage()`
1709
+
1710
+ `info.returns` (structural) and `runner.provenance` (runtime) are kept
1711
+ as separate streams so each is useful on its own. When you want both
1712
+ joined per cell, the `Runner` exposes two convenience methods.
1713
+
1714
+ `runner.traceRow(rowIndex)` returns one `CellTrace` per output column
1715
+ for a single row, pairing the column's structural lineage with the
1716
+ node / relationship bindings whose alias matches it. The matched
1717
+ property value is extracted for you (including the built-ins `id`,
1718
+ `left_id`, `right_id`, `type`):
1719
+
1720
+ ```javascript
1721
+ const runner = new FlowQuery(
1722
+ `
1723
+ MATCH (c:City)-[f:FLIGHT]->(d:City)
1724
+ WHERE c.country = 'US'
1725
+ RETURN c.name AS origin, d.name AS destination, f.airline AS airline
1726
+ `,
1727
+ null,
1728
+ null,
1729
+ { provenance: true }
1730
+ );
1731
+ await runner.run();
1732
+
1733
+ const trace = runner.traceRow(0);
1734
+ // trace.origin = {
1735
+ // column: 'origin',
1736
+ // value: 'New York',
1737
+ // lineage: { references: [{ alias: 'c', kind: 'node', labels: ['City'], property: 'name' }], kind: 'property' },
1738
+ // bindings: [{
1739
+ // reference: { alias: 'c', kind: 'node', labels: ['City'], property: 'name' },
1740
+ // value: 'New York',
1741
+ // node: { alias: 'c', label: 'City', id: 'nyc', properties: { name: 'New York', country: 'US' }, source: {...} }
1742
+ // }]
1743
+ // }
1744
+ // trace.airline.bindings[0].relationship.hops[0].properties.airline === 'AA'
1745
+ ```
1746
+
1747
+ `runner.lineage()` is the one-shot equivalent over the entire result
1748
+ set:
1749
+
1750
+ ```javascript
1751
+ const report = runner.lineage();
1752
+ // report.columns: same shape as info.returns (structural per-column).
1753
+ // report.rows[i]: same shape as traceRow(i) (per-cell trace).
1754
+ ```
1755
+
1756
+ `bindings` is empty for literal columns, when the runner was
1757
+ constructed without `{ provenance: true }`, or when a row's bindings
1758
+ don't intersect the column's references. Aggregate columns like
1759
+ `collect(c.id)` surface one binding per contributing input row, so
1760
+ `trace.ids.bindings.map(b => b.value)` aligns with the collected
1761
+ array. `info`, `provenance`, and `metadata` remain available
1762
+ unchanged: `traceRow` / `lineage` are purely additive
1763
+ helpers.
1764
+
1412
1765
  ## Contributing
1413
1766
 
1414
1767
  This project welcomes contributions and suggestions. Most contributions require you to agree to a