codespine 1.0.5.tar.gz → 1.0.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codespine-1.0.5 → codespine-1.0.6}/PKG-INFO +1 -1
- {codespine-1.0.5 → codespine-1.0.6}/codespine/__init__.py +1 -1
- {codespine-1.0.5 → codespine-1.0.6}/codespine/cli.py +75 -11
- codespine-1.0.6/codespine/db/_cypher_compat.py +523 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/call_resolver.py +11 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/engine.py +44 -8
- {codespine-1.0.5 → codespine-1.0.6}/codespine/sharding/store.py +9 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/PKG-INFO +1 -1
- {codespine-1.0.5 → codespine-1.0.6}/pyproject.toml +1 -1
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_cypher_compat.py +168 -0
- codespine-1.0.5/codespine/db/_cypher_compat.py +0 -351
- {codespine-1.0.5 → codespine-1.0.6}/LICENSE +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/README.md +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/community.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/context.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/coupling.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/crossmodule.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/deadcode.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/flow.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/impact.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/cache/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/cache/result_cache.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/config.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/db/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/db/duckdb_store.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/db/schema.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/db/store.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/diff/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/diff/branch_diff.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/guide.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/di_resolver.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/java_parser.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/symbol_builder.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/mcp/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/mcp/server.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/noise/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/noise/blocklist.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/overlay/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/overlay/git_state.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/overlay/merge.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/overlay/store.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/search/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/search/bm25.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/search/fuzzy.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/search/hybrid.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/search/rrf.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/search/vector.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/sharding/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/sharding/router.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/watch/__init__.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/watch/git_hook.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine/watch/watcher.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/SOURCES.txt +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/dependency_links.txt +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/entry_points.txt +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/requires.txt +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/top_level.txt +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/gindex.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/setup.cfg +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_branch_diff_normalize.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_call_resolver.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_community_detection.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_deadcode.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_duckdb_store.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_index_and_hybrid.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_java_parser.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_multimodule_index.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_overlay.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_result_cache.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_search_ranking.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_sharding.py +0 -0
- {codespine-1.0.5 → codespine-1.0.6}/tests/test_store_recovery.py +0 -0
{codespine-1.0.5 → codespine-1.0.6}/codespine/cli.py

@@ -192,6 +192,21 @@ def _index_shard_group(
             with output_lock:
                 _phase(f"{prefix}Tracing calls...", "starting...")
             return
+        if event == "resolve_calls_heartbeat":
+            # Fires every 2 s from a daemon thread so the spinner stays
+            # alive even when the resolver produces no new edges.
+            scanned = int(payload.get("scanned", 0))
+            edges = int(payload.get("edges", 0))
+            elapsed_s = float(payload.get("elapsed", 0.0))
+            if not parallel:
+                click.echo(
+                    f"\r{_spinner_char()} {prefix}Tracing calls... "
+                    f"{edges:>6} resolved / {scanned} scanned {elapsed_s:.1f}s ",
+                    nl=False,
+                )
+            call_state["shown"] = True
+            call_state["last_ts"] = now
+            return
         if event == "resolve_calls_progress":
             call_state["count"] = int(payload.get("calls_resolved", 0))
             if (now - call_state["last_ts"]) >= 0.25:
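The hunk above only consumes `resolve_calls_heartbeat` events; the emitting side lives in codespine/indexer/engine.py (also changed in this release, hunk not shown here). A minimal sketch of how such a 2-second daemon-thread heartbeat can be produced; `emit` and `get_stats` are hypothetical stand-ins, not names from the package:

    import threading
    import time

    def start_heartbeat(emit, get_stats, interval: float = 2.0) -> threading.Event:
        # emit(event, payload) stands in for the engine's progress callback,
        # get_stats() for its (scanned, edges) counters.
        stop = threading.Event()
        started = time.monotonic()

        def _beat() -> None:
            # wait() returns False on timeout, True once stop is set.
            while not stop.wait(interval):
                scanned, edges = get_stats()
                emit("resolve_calls_heartbeat", {
                    "scanned": scanned,
                    "edges": edges,
                    "elapsed": time.monotonic() - started,
                })

        threading.Thread(target=_beat, daemon=True).start()
        return stop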
@@ -345,6 +360,37 @@ def analyse(path: str, full: bool, deep: bool, incremental_deep: bool, embed: bo
     # For single-project analysis this is transparent — shard() always
     # returns a GraphStore pointing to the correct shard path.
     sg = ShardedGraphStore(read_only=False)
+
+    # ── SIGINT handler: flush partial index on Ctrl+C ────────────────────
+    # The handler captures `sg` by closure. On interrupt it snapshots all
+    # open shards so `codespine stats` and MCP see the partial result, then
+    # calls os._exit(130) to bypass Python cleanup (safe for CLI process).
+    # A second Ctrl+C hard-exits immediately.
+    _sigint_pressed: list[bool] = [False]
+    _old_sigint_handler = signal.getsignal(signal.SIGINT)
+
+    def _sigint_flush(signum: int, frame: object) -> None:  # noqa: ARG001
+        if _sigint_pressed[0]:
+            os._exit(130)
+        _sigint_pressed[0] = True
+        # Restore default handler so a second Ctrl+C exits immediately.
+        signal.signal(signal.SIGINT, signal.default_int_handler)
+        click.secho(
+            "\n\n⚠ Interrupted — flushing partial index to read replica…",
+            fg="yellow",
+        )
+        try:
+            sg.snapshot_all(background=False)
+            click.secho(
+                "✓ Partial index saved. Run 'codespine stats' to see what was indexed.",
+                fg="yellow",
+            )
+        except Exception:  # noqa: BLE001
+            pass
+        os._exit(130)
+
+    signal.signal(signal.SIGINT, _sigint_flush)
+
     # The indexer is initialised per-module below with the right shard store.
     # We keep a single ShardedGraphStore to fan-out cross-module linking later.
 
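The double-Ctrl+C idiom above is self-contained and reusable: first interrupt flushes and exits 130, second interrupt hard-exits at once. A stripped-down sketch of the same pattern, with `flush` standing in for `sg.snapshot_all(background=False)` (illustrative only, not package code):

    import os
    import signal

    def install_flush_on_interrupt(flush):
        # First Ctrl+C: flush partial state, then exit with 130 (SIGINT).
        # Second Ctrl+C while flushing: hard-exit immediately.
        pressed = [False]
        previous = signal.getsignal(signal.SIGINT)

        def _handler(signum, frame):
            if pressed[0]:
                os._exit(130)
            pressed[0] = True
            # From here on, another Ctrl+C raises KeyboardInterrupt normally.
            signal.signal(signal.SIGINT, signal.default_int_handler)
            try:
                flush()
            except Exception:
                pass
            os._exit(130)

        signal.signal(signal.SIGINT, _handler)
        return previous  # caller restores this after a clean finish

The returned previous handler matches what the next hunk does: after a clean run the CLI puts the original SIGINT handler back.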
@@ -594,6 +640,9 @@ def analyse(path: str, full: bool, deep: bool, incremental_deep: bool, embed: bo
     sg.snapshot_all(background=False)
     _finish_phase(snap_label, "MCP will reload automatically")
 
+    # Restore original SIGINT handler now that we've finished cleanly.
+    signal.signal(signal.SIGINT, _old_sigint_handler)
+
 
 @main.command()
 @click.argument("query")
@@ -741,15 +790,27 @@ def stats(as_json: bool, show_shards: bool) -> None:
         click.secho("No projects indexed yet. Run 'codespine analyse <path>'.", fg="yellow")
         return
 
+    def _stat_count(store, query: str, params: dict) -> int:
+        """Run a stats count query — returns 0 on any failure."""
+        try:
+            rows = store.query_records(query, params)
+            return int(rows[0]["n"]) if rows else 0
+        except Exception as exc:  # noqa: BLE001
+            click.secho(f" (stat unavailable: {exc})", fg="yellow")
+            return 0
+
     rows = []
     for p in all_projects_meta:
         pid = p["id"]
         # Route each query to the project's owning shard.
         ps = _project_store(pid)
-
-
+        n_files = _stat_count(
+            ps,
+            "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n",
+            {"pid": pid},
         )
-
+        n_classes = _stat_count(
+            ps,
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -758,7 +819,8 @@ def stats(as_json: bool, show_shards: bool) -> None:
             """,
             {"pid": pid},
         )
-
+        n_methods = _stat_count(
+            ps,
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -769,7 +831,8 @@ def stats(as_json: bool, show_shards: bool) -> None:
             """,
             {"pid": pid},
         )
-
+        n_calls = _stat_count(
+            ps,
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -780,7 +843,8 @@ def stats(as_json: bool, show_shards: bool) -> None:
             """,
             {"pid": pid},
         )
-
+        n_emb = _stat_count(
+            ps,
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -793,11 +857,11 @@ def stats(as_json: bool, show_shards: bool) -> None:
             "project": pid,
             "path": p["path"],
             "shard": sg.router.shard_for(pid),
-            "files":
-            "classes":
-            "methods":
-            "calls_out":
-            "embeddings":
+            "files": n_files,
+            "classes": n_classes,
+            "methods": n_methods,
+            "calls_out": n_calls,
+            "embeddings": n_emb,
         })
 
     if as_json:
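The effect of the `_stat_count` wrapper is that one failing shard degrades a single statistic to 0 instead of aborting the whole `codespine stats` run. An illustrative sketch of that contract against stub stores (hypothetical classes, not package code):

    class OkStore:
        def query_records(self, query, params):
            return [{"n": 42}]

    class BrokenStore:
        def query_records(self, query, params):
            raise RuntimeError("shard schema is mid-migration")

    # With the helper defined in the diff above:
    #   _stat_count(OkStore(), "MATCH ... RETURN count(f) as n", {})      -> 42
    #   _stat_count(BrokenStore(), "MATCH ... RETURN count(f) as n", {})  -> warns, returns 0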
codespine-1.0.6/codespine/db/_cypher_compat.py

@@ -0,0 +1,523 @@
+"""Cypher-to-SQL translation for CodeSpine's DuckDB backend.
+
+Translates the specific subset of OpenCypher used by CodeSpine into
+equivalent DuckDB SQL so that every ``store.query_records(cypher, params)``
+call continues to work without touching the call-sites.
+
+Supported constructs
+--------------------
+- Node patterns           MATCH (alias:Label) or (a:L {prop: $v})
+- Anonymous nodes         (:Label) in NOT-EXISTS subqueries
+- Relationship patterns   (a)-[r:REL]->(b) directed
+- Undirected edges        (a)-[r:REL]-(b) → OR of both directions
+- Virtual FK edges        (a)-[:HAS_METHOD]->(b) → b.class_id = a.id (no edge table)
+- Multi-hop patterns      (a)-[:R1]->(x)-[:R2]->(b)
+- Anonymous destination   (a)-[:CALLS]->()
+- Multi-MATCH + WITH      Multiple MATCH clauses joined by WITH pipeline stages
+- WHERE                   =, <>, IN, CONTAINS, lower(), coalesce(),
+                          IS NULL, IS NOT NULL, >=, <=
+- NOT EXISTS subqueries   NOT EXISTS { MATCH (:N)-[:R]->(m) }
+- WITH … ORDER BY         Kuzu paging construct → plain ORDER BY
+- DISTINCT, ORDER BY, LIMIT
+- Aggregates              count(n) → count(*)
+- Literal values          'string' in RETURN (e.g. 'method' as kind)
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+# ---------------------------------------------------------------------------
+# Schema mappings
+# ---------------------------------------------------------------------------
+
+# Kuzu node label → DuckDB table name
+_LABEL_TABLE: dict[str, str] = {
+    "Project": "projects",
+    "File": "files",
+    "Class": "classes",
+    "Method": "methods",
+    "Symbol": "symbols",
+    "Community": "communities",
+    "Flow": "flows",
+}
+
+# Kuzu relationship type → (edge_table, src_col, dst_col, extra_where | None)
+_REL_EDGE: dict[str, tuple[str, str, str, str | None]] = {
+    "CALLS": ("calls", "source_id", "target_id", None),
+    "OVERRIDES": ("references_type", "src_id", "dst_id", "rel = 'OVERRIDES'"),
+    "IMPLEMENTS": ("references_type", "src_id", "dst_id", "rel = 'IMPLEMENTS'"),
+    "INJECTS": ("injects", "src_class_id", "dst_class_id", None),
+    "BINDS_INTERFACE": ("binds_interface", "src_class_id", "dst_class_id", None),
+    "IN_COMMUNITY": ("community_members", "symbol_id", "community_id", None),
+    "IN_FLOW": ("flow_members", "symbol_id", "flow_id", None),
+    "CO_CHANGED_WITH": ("co_changed_with", "file_a", "file_b", None),
+}
+
+# Virtual FK edges: backed by a foreign-key column rather than a separate
+# edge table. Format: (src_label_table, dst_label_table, dst_fk_col)
+# e.g. HAS_METHOD: methods.class_id = class.id
+_VIRTUAL_REL_EDGE: dict[str, tuple[str, str, str]] = {
+    "HAS_METHOD": ("classes", "methods", "class_id"),
+    "HAS_CLASS": ("files", "classes", "file_id"),
+    "DECLARES": ("files", "symbols", "file_id"),
+}
+
+# All real edge tables (used for anonymous total-count query)
+_ALL_EDGE_TABLES = (
+    "calls",
+    "references_type",
+    "injects",
+    "binds_interface",
+    "community_members",
+    "flow_members",
+    "co_changed_with",
+)
+
+# Top-level Cypher keywords recognised by the clause splitter.
+# Order matters: longer / more-specific patterns must come before shorter ones.
+_TOP_KEYWORDS = (
+    "OPTIONAL MATCH",
+    "ORDER BY",
+    "MATCH",
+    "WITH",
+    "WHERE",
+    "RETURN",
+    "LIMIT",
+)
+
+
+def is_cypher(query: str) -> bool:
+    """Return True if *query* looks like Cypher rather than SQL."""
+    return bool(re.match(r"(?i)\s*MATCH\b", query.lstrip()))
+
+
+# ---------------------------------------------------------------------------
+# Main entry point
+# ---------------------------------------------------------------------------
+
+def translate(cypher: str, params: dict[str, Any] | None = None) -> tuple[str, dict[str, Any]]:
+    """Translate *cypher* to DuckDB SQL.
+
+    Returns ``(sql, params_dict)`` where *params_dict* preserves the original
+    ``$name`` bindings so they can be passed directly to
+    ``duckdb.connect().execute(sql, params_dict)``.
+    """
+    sql = _translate(cypher)
+    return sql, (params or {})
+
+
+# ---------------------------------------------------------------------------
+# Clause splitter
+# ---------------------------------------------------------------------------
+
+def _split_clauses(q: str) -> list[tuple[str, str]]:
+    """Tokenise a (whitespace-normalised) Cypher query into clause pairs.
+
+    Returns a list of ``(keyword, body)`` tuples at the TOP level of the
+    query. Keywords inside ``()``, ``[]``, ``{}`` or quoted strings are
+    NOT treated as clause boundaries.
+
+    Example::
+
+        "MATCH (f:File) WHERE f.id = $x WITH f MATCH (c:Class) RETURN c.name"
+        → [("MATCH", "(f:File)"),
+           ("WHERE", "f.id = $x"),
+           ("WITH", "f"),
+           ("MATCH", "(c:Class)"),
+           ("RETURN","c.name")]
+    """
+    results: list[tuple[str, str]] = []
+    n = len(q)
+    i = 0
+    depth_paren = depth_sq = depth_brace = 0
+    in_quote = False
+    quote_char = ""
+    current_kw: str | None = None
+    current_start = 0
+
+    while i < n:
+        ch = q[i]
+
+        # ── Quote handling ────────────────────────────────────────────────
+        if in_quote:
+            if ch == "\\" and i + 1 < n:
+                i += 2  # skip escaped char
+                continue
+            if ch == quote_char:
+                in_quote = False
+            i += 1
+            continue
+        if ch in ('"', "'"):
+            in_quote = True
+            quote_char = ch
+            i += 1
+            continue
+
+        # ── Depth tracking ────────────────────────────────────────────────
+        if ch == "(":
+            depth_paren += 1
+        elif ch == ")":
+            depth_paren = max(0, depth_paren - 1)
+        elif ch == "[":
+            depth_sq += 1
+        elif ch == "]":
+            depth_sq = max(0, depth_sq - 1)
+        elif ch == "{":
+            depth_brace += 1
+        elif ch == "}":
+            depth_brace = max(0, depth_brace - 1)
+
+        # ── Keyword detection (top-level only) ───────────────────────────
+        if depth_paren == 0 and depth_sq == 0 and depth_brace == 0:
+            for kw in _TOP_KEYWORDS:
+                kl = len(kw)
+                if q[i : i + kl].upper() == kw:
+                    end_pos = i + kl
+                    # Require word boundary: end-of-string or non-word char
+                    if end_pos < n and (q[end_pos].isalnum() or q[end_pos] == "_"):
+                        continue  # e.g. "MATCHING" is not "MATCH"
+                    # Flush previous clause
+                    if current_kw is not None:
+                        body = q[current_start:i].strip()
+                        results.append((current_kw, body))
+                    current_kw = kw
+                    current_start = end_pos
+                    i = end_pos
+                    break
+            else:
+                i += 1
+        else:
+            i += 1
+
+    # Flush final clause
+    if current_kw is not None:
+        body = q[current_start:].strip()
+        results.append((current_kw, body))
+
+    return results
+
+
+# ---------------------------------------------------------------------------
+# Internal translation pipeline
+# ---------------------------------------------------------------------------
+
+def _translate_anonymous_edge_count(cypher: str) -> str | None:
+    """Fast-path for ``MATCH ()-[r]->() RETURN count(r) [as X]``.
+
+    Anonymous edge patterns carry no labels so the generic translator
+    cannot derive a FROM table. We special-case the count-all-edges
+    pattern by summing row-counts across every edge table.
+    """
+    q = re.sub(r"\s+", " ", cypher.strip())
+    m = re.match(
+        r"(?i)MATCH\s*\(\s*\)\s*-\s*\[\s*\w*\s*\]\s*->\s*\(\s*\)\s*"
+        r"RETURN\s+count\s*\(\s*[*]?\w*\s*\)\s*(?:as\s+(\w+))?\s*$",
+        q,
+    )
+    if not m:
+        return None
+    alias = m.group(1) or "count"
+    unions = " UNION ALL ".join(
+        f"SELECT COUNT(*) AS c FROM {tbl}" for tbl in _ALL_EDGE_TABLES
+    )
+    return f"SELECT COALESCE(SUM(c), 0) AS {alias} FROM ({unions}) t"
+
+
+def _translate(cypher: str) -> str:
+    # ── Fast-path: anonymous total-edge-count ────────────────────────────
+    special = _translate_anonymous_edge_count(cypher)
+    if special is not None:
+        return special
+
+    q = re.sub(r"\s+", " ", cypher.strip())
+    clauses = _split_clauses(q)
+
+    aliases: dict[str, str] = {}    # alias → table name
+    inline_conds: list[str] = []    # WHERE from inline {prop: $val}
+    edge_conds: list[str] = []      # join conditions from real edge tables
+    virtual_conds: list[str] = []   # FK conditions from virtual edges
+    where_parts: list[str] = []     # collected WHERE bodies
+    ret_cols = "*"
+    ret_distinct = ""
+    order_clause = ""
+    limit_clause = ""
+    _rel_counter = {"n": 0}
+
+    for kw, body in clauses:
+        if kw == "MATCH":
+            _absorb_match(body, aliases, inline_conds, edge_conds,
+                          virtual_conds, _rel_counter)
+        elif kw == "OPTIONAL MATCH":
+            # Degenerate: register new node aliases so their columns are
+            # selectable, but don't add INNER JOIN constraints.
+            # Full LEFT JOIN support is a future enhancement.
+            _absorb_match_nodes_only(body, aliases)
+        elif kw == "WITH":
+            # Paging idiom: WITH x ORDER BY x.col LIMIT n
+            ob = re.search(r"(?i)ORDER\s+BY\s+(.+?)(?:\s+LIMIT\s+\S+)?\s*$", body)
+            if ob and not order_clause:
+                order_clause = "ORDER BY " + ob.group(1).strip()
+            lm = re.search(r"(?i)LIMIT\s+(\S+)", body)
+            if lm and not limit_clause:
+                limit_clause = "LIMIT " + lm.group(1)
+            # Pipeline-separator WITH (no ORDER BY) is simply dropped.
+        elif kw == "WHERE":
+            where_parts.append(body)
+        elif kw == "RETURN":
+            # DISTINCT
+            dm = re.match(r"(?i)DISTINCT\s+(.*)", body)
+            if dm:
+                ret_distinct = "DISTINCT "
+                body = dm.group(1)
+            # Trailing ORDER BY inside RETURN
+            ob = re.search(r"(?i)\s+ORDER\s+BY\s+(.+?)(?=\s*(?:LIMIT|$))", body)
+            if ob and not order_clause:
+                order_clause = "ORDER BY " + ob.group(1).strip()
+                body = body[: ob.start()].strip()
+            # Trailing LIMIT inside RETURN
+            lm = re.search(r"(?i)\s+LIMIT\s+(\S+)", body)
+            if lm and not limit_clause:
+                limit_clause = "LIMIT " + lm.group(1)
+                body = body[: lm.start()].strip()
+            ret_cols = body.strip()
+        elif kw == "ORDER BY":
+            if not order_clause:
+                order_clause = "ORDER BY " + body
+        elif kw == "LIMIT":
+            if not limit_clause:
+                limit_clause = "LIMIT " + body.split()[0]
+
+    # ── FROM clause ───────────────────────────────────────────────────────
+    seen: set[str] = set()
+    from_parts: list[str] = []
+    for alias, tbl in aliases.items():
+        entry = f"{tbl} {alias}"
+        if entry not in seen:
+            from_parts.append(entry)
+            seen.add(entry)
+    from_str = ", ".join(from_parts) if from_parts else "(SELECT 1 WHERE 1=0) _empty(x)"
+
+    # ── WHERE clause ──────────────────────────────────────────────────────
+    all_conds: list[str] = []
+    all_conds.extend(edge_conds)
+    all_conds.extend(virtual_conds)
+    all_conds.extend(inline_conds)
+    for wp in where_parts:
+        expanded = _expand_not_exists(wp, aliases)
+        transformed = _transform_where(expanded)
+        if transformed:
+            all_conds.append(transformed)
+    where_str = " AND ".join(c for c in all_conds if c)
+
+    # ── SELECT ────────────────────────────────────────────────────────────
+    select_str = _transform_select(ret_cols)
+
+    # ── Assemble ──────────────────────────────────────────────────────────
+    parts = [f"SELECT {ret_distinct}{select_str}", f"FROM {from_str}"]
+    if where_str:
+        parts.append(f"WHERE {where_str}")
+    if order_clause:
+        parts.append(order_clause)
+    if limit_clause:
+        parts.append(limit_clause)
+    return " ".join(parts)
+
+
+# ---------------------------------------------------------------------------
+# MATCH body processing
+# ---------------------------------------------------------------------------
+
+# Rel pattern: (src)-[alias:TYPE]->(dst) or (src)-[alias:TYPE]-(dst)
+# dst alias is optional (anonymous destination)
+_REL_DIRECTED_RE = re.compile(
+    r"\((\w+)(?::\w+(?:\s*\{[^}]*\})?)?\)"   # src node
+    r"\s*-\[(\w*):(\w+)\]->\s*"              # -[alias:TYPE]->
+    r"\((\w*)(?::\w+(?:\s*\{[^}]*\})?)?\)"   # dst node (alias optional)
+)
+_REL_UNDIRECTED_RE = re.compile(
+    r"\((\w+)(?::\w+(?:\s*\{[^}]*\})?)?\)"   # src node
+    r"\s*-\[(\w*):(\w+)\]-\s*"               # -[alias:TYPE]-
+    r"\((\w*)(?::\w+(?:\s*\{[^}]*\})?)?\)"   # dst node
+)
+_NODE_RE = re.compile(r"\((\w+):(\w+)(?:\s*\{([^}]+)\})?\)")
+
+
+def _absorb_match(
+    body: str,
+    aliases: dict[str, str],
+    inline_conds: list[str],
+    edge_conds: list[str],
+    virtual_conds: list[str],
+    _rel_counter: dict[str, int],
+) -> None:
+    """Extract node aliases and relationship patterns from one MATCH body."""
+
+    # 1. Register named node patterns (alias:Label [{prop: $val}])
+    for m in _NODE_RE.finditer(body):
+        alias, label, inline = m.group(1), m.group(2), m.group(3)
+        if label in _LABEL_TABLE and alias not in aliases:
+            aliases[alias] = _LABEL_TABLE[label]
+        if inline:
+            for kv in re.finditer(r"(\w+)\s*:\s*(\$\w+)", inline):
+                inline_conds.append(f"{alias}.{kv.group(1)} = {kv.group(2)}")
+
+    # 2. Process directed rel patterns iteratively (handles multi-hop chains).
+    def _do_directed(m_obj: re.Match) -> str:
+        src, ralias, rtype, dst = (
+            m_obj.group(1), m_obj.group(2), m_obj.group(3), m_obj.group(4)
+        )
+        _process_rel(src, ralias, rtype, dst,
+                     aliases, edge_conds, virtual_conds, _rel_counter,
+                     undirected=False)
+        # Return just the dst node so multi-hop chains resolve left→right.
+        return f"({dst})" if dst else "()"
+
+    q = body
+    prev = None
+    while prev != q:
+        prev = q
+        q = _REL_DIRECTED_RE.sub(_do_directed, q)
+
+    # 3. Process undirected rel patterns.
+    for m in _REL_UNDIRECTED_RE.finditer(body):
+        src, ralias, rtype, dst = (
+            m.group(1), m.group(2), m.group(3), m.group(4)
+        )
+        _process_rel(src, ralias, rtype, dst,
+                     aliases, edge_conds, virtual_conds, _rel_counter,
+                     undirected=True)
+
+
+def _absorb_match_nodes_only(body: str, aliases: dict[str, str]) -> None:
+    """Register node aliases from an OPTIONAL MATCH without adding joins.
+
+    This ensures columns from OPTIONAL MATCH nodes are reachable in SELECT/
+    WHERE even though we don't yet emit a proper LEFT JOIN.
+    """
+    for m in _NODE_RE.finditer(body):
+        alias, label = m.group(1), m.group(2)
+        if label in _LABEL_TABLE and alias not in aliases:
+            aliases[alias] = _LABEL_TABLE[label]
+
+
+def _process_rel(
+    src_alias: str,
+    rel_alias: str,
+    rel_type: str,
+    dst_alias: str,
+    aliases: dict[str, str],
+    edge_conds: list[str],
+    virtual_conds: list[str],
+    _rel_counter: dict[str, int],
+    *,
+    undirected: bool,
+) -> None:
+    """Emit join conditions for one relationship hop."""
+
+    # ── Virtual FK edge (no edge table) ──────────────────────────────────
+    if rel_type in _VIRTUAL_REL_EDGE:
+        _, dst_tbl, dst_fk_col = _VIRTUAL_REL_EDGE[rel_type]
+        # Register dst alias if it carries a label (already done in
+        # _absorb_match's node scan, but also handle un-labelled refs).
+        if dst_alias and dst_alias not in aliases:
+            aliases[dst_alias] = dst_tbl
+        if dst_alias:
+            virtual_conds.append(f"{dst_alias}.{dst_fk_col} = {src_alias}.id")
+        return
+
+    # ── Real edge table ───────────────────────────────────────────────────
+    if rel_type not in _REL_EDGE:
+        return  # unknown relationship type — skip silently
+
+    edge_tbl, src_col, dst_col, extra = _REL_EDGE[rel_type]
+    _rel_counter["n"] += 1
+    ra = rel_alias or f"_r{_rel_counter['n']}"
+
+    if ra not in aliases:
+        aliases[ra] = edge_tbl
+    edge_conds.append(f"{ra}.{src_col} = {src_alias}.id")
+    if dst_alias:
+        if undirected:
+            # Undirected: match either direction.
+            edge_conds.append(
+                f"({ra}.{dst_col} = {dst_alias}.id"
+                f" OR {ra}.{src_col} = {dst_alias}.id)"
+            )
+        else:
+            edge_conds.append(f"{ra}.{dst_col} = {dst_alias}.id")
+    if extra:
+        edge_conds.append(f"{ra}.{extra}")
+
+
+# ---------------------------------------------------------------------------
+# WHERE expansion
+# ---------------------------------------------------------------------------
+
+def _expand_not_exists(body: str, aliases: dict[str, str]) -> str:
+    """Replace ``NOT EXISTS { MATCH ... }`` with a SQL subquery."""
+
+    def _replace(m_obj: re.Match) -> str:
+        inner = m_obj.group(1)
+        rel_m = re.search(
+            r"\((\w*):?(\w*)\)-\[:(\w+)\]->\((\w*):?(\w*)\)", inner
+        )
+        if not rel_m:
+            return m_obj.group(0)
+        src_alias = rel_m.group(1)
+        rel_type = rel_m.group(3)
+        dst_alias = rel_m.group(4)
+        if rel_type not in _REL_EDGE:
+            return m_obj.group(0)
+        edge_tbl, src_col, dst_col, _ = _REL_EDGE[rel_type]
+        if dst_alias and dst_alias in aliases:
+            return (
+                f"NOT EXISTS (SELECT 1 FROM {edge_tbl} _ne"
+                f" WHERE _ne.{dst_col} = {dst_alias}.id)"
+            )
+        if src_alias and src_alias in aliases:
+            return (
+                f"NOT EXISTS (SELECT 1 FROM {edge_tbl} _ne"
+                f" WHERE _ne.{src_col} = {src_alias}.id)"
+            )
+        return m_obj.group(0)
+
+    return re.sub(
+        r"NOT\s+EXISTS\s*\{\s*MATCH\s*([^}]+)\}",
+        _replace,
+        body,
+        flags=re.IGNORECASE,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Clause-level transformations
+# ---------------------------------------------------------------------------
+
+def _transform_where(where: str) -> str:
+    if not where:
+        return ""
+    # CONTAINS → LIKE
+    where = re.sub(
+        r"(\w+\.\w+|\blower\([^)]+\)|\bcoalesce\([^)]+\))\s+CONTAINS\s+(\$\w+|'[^']*')",
+        lambda m: f"{m.group(1)} LIKE '%' || {m.group(2)} || '%'",
+        where,
+        flags=re.IGNORECASE,
+    )
+    # IN $list → = ANY($list)
+    where = re.sub(
+        r"\bIN\s+(\$\w+)\b",
+        r"= ANY(\1)",
+        where,
+        flags=re.IGNORECASE,
+    )
+    return where
+
+
+def _transform_select(ret: str) -> str:
+    if not ret:
+        return "*"
+    # count(alias) → count(*)
+    ret = re.sub(r"\bcount\s*\(\s*\w+\s*\)", "count(*)", ret, flags=re.IGNORECASE)
+    return ret