codespine 1.0.5__tar.gz → 1.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. {codespine-1.0.5 → codespine-1.0.6}/PKG-INFO +1 -1
  2. {codespine-1.0.5 → codespine-1.0.6}/codespine/__init__.py +1 -1
  3. {codespine-1.0.5 → codespine-1.0.6}/codespine/cli.py +75 -11
  4. codespine-1.0.6/codespine/db/_cypher_compat.py +523 -0
  5. {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/call_resolver.py +11 -0
  6. {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/engine.py +44 -8
  7. {codespine-1.0.5 → codespine-1.0.6}/codespine/sharding/store.py +9 -0
  8. {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/PKG-INFO +1 -1
  9. {codespine-1.0.5 → codespine-1.0.6}/pyproject.toml +1 -1
  10. {codespine-1.0.5 → codespine-1.0.6}/tests/test_cypher_compat.py +168 -0
  11. codespine-1.0.5/codespine/db/_cypher_compat.py +0 -351
  12. {codespine-1.0.5 → codespine-1.0.6}/LICENSE +0 -0
  13. {codespine-1.0.5 → codespine-1.0.6}/README.md +0 -0
  14. {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/__init__.py +0 -0
  15. {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/community.py +0 -0
  16. {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/context.py +0 -0
  17. {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/coupling.py +0 -0
  18. {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/crossmodule.py +0 -0
  19. {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/deadcode.py +0 -0
  20. {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/flow.py +0 -0
  21. {codespine-1.0.5 → codespine-1.0.6}/codespine/analysis/impact.py +0 -0
  22. {codespine-1.0.5 → codespine-1.0.6}/codespine/cache/__init__.py +0 -0
  23. {codespine-1.0.5 → codespine-1.0.6}/codespine/cache/result_cache.py +0 -0
  24. {codespine-1.0.5 → codespine-1.0.6}/codespine/config.py +0 -0
  25. {codespine-1.0.5 → codespine-1.0.6}/codespine/db/__init__.py +0 -0
  26. {codespine-1.0.5 → codespine-1.0.6}/codespine/db/duckdb_store.py +0 -0
  27. {codespine-1.0.5 → codespine-1.0.6}/codespine/db/schema.py +0 -0
  28. {codespine-1.0.5 → codespine-1.0.6}/codespine/db/store.py +0 -0
  29. {codespine-1.0.5 → codespine-1.0.6}/codespine/diff/__init__.py +0 -0
  30. {codespine-1.0.5 → codespine-1.0.6}/codespine/diff/branch_diff.py +0 -0
  31. {codespine-1.0.5 → codespine-1.0.6}/codespine/guide.py +0 -0
  32. {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/__init__.py +0 -0
  33. {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/di_resolver.py +0 -0
  34. {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/java_parser.py +0 -0
  35. {codespine-1.0.5 → codespine-1.0.6}/codespine/indexer/symbol_builder.py +0 -0
  36. {codespine-1.0.5 → codespine-1.0.6}/codespine/mcp/__init__.py +0 -0
  37. {codespine-1.0.5 → codespine-1.0.6}/codespine/mcp/server.py +0 -0
  38. {codespine-1.0.5 → codespine-1.0.6}/codespine/noise/__init__.py +0 -0
  39. {codespine-1.0.5 → codespine-1.0.6}/codespine/noise/blocklist.py +0 -0
  40. {codespine-1.0.5 → codespine-1.0.6}/codespine/overlay/__init__.py +0 -0
  41. {codespine-1.0.5 → codespine-1.0.6}/codespine/overlay/git_state.py +0 -0
  42. {codespine-1.0.5 → codespine-1.0.6}/codespine/overlay/merge.py +0 -0
  43. {codespine-1.0.5 → codespine-1.0.6}/codespine/overlay/store.py +0 -0
  44. {codespine-1.0.5 → codespine-1.0.6}/codespine/search/__init__.py +0 -0
  45. {codespine-1.0.5 → codespine-1.0.6}/codespine/search/bm25.py +0 -0
  46. {codespine-1.0.5 → codespine-1.0.6}/codespine/search/fuzzy.py +0 -0
  47. {codespine-1.0.5 → codespine-1.0.6}/codespine/search/hybrid.py +0 -0
  48. {codespine-1.0.5 → codespine-1.0.6}/codespine/search/rrf.py +0 -0
  49. {codespine-1.0.5 → codespine-1.0.6}/codespine/search/vector.py +0 -0
  50. {codespine-1.0.5 → codespine-1.0.6}/codespine/sharding/__init__.py +0 -0
  51. {codespine-1.0.5 → codespine-1.0.6}/codespine/sharding/router.py +0 -0
  52. {codespine-1.0.5 → codespine-1.0.6}/codespine/watch/__init__.py +0 -0
  53. {codespine-1.0.5 → codespine-1.0.6}/codespine/watch/git_hook.py +0 -0
  54. {codespine-1.0.5 → codespine-1.0.6}/codespine/watch/watcher.py +0 -0
  55. {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/SOURCES.txt +0 -0
  56. {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/dependency_links.txt +0 -0
  57. {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/entry_points.txt +0 -0
  58. {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/requires.txt +0 -0
  59. {codespine-1.0.5 → codespine-1.0.6}/codespine.egg-info/top_level.txt +0 -0
  60. {codespine-1.0.5 → codespine-1.0.6}/gindex.py +0 -0
  61. {codespine-1.0.5 → codespine-1.0.6}/setup.cfg +0 -0
  62. {codespine-1.0.5 → codespine-1.0.6}/tests/test_branch_diff_normalize.py +0 -0
  63. {codespine-1.0.5 → codespine-1.0.6}/tests/test_call_resolver.py +0 -0
  64. {codespine-1.0.5 → codespine-1.0.6}/tests/test_community_detection.py +0 -0
  65. {codespine-1.0.5 → codespine-1.0.6}/tests/test_deadcode.py +0 -0
  66. {codespine-1.0.5 → codespine-1.0.6}/tests/test_duckdb_store.py +0 -0
  67. {codespine-1.0.5 → codespine-1.0.6}/tests/test_index_and_hybrid.py +0 -0
  68. {codespine-1.0.5 → codespine-1.0.6}/tests/test_java_parser.py +0 -0
  69. {codespine-1.0.5 → codespine-1.0.6}/tests/test_multimodule_index.py +0 -0
  70. {codespine-1.0.5 → codespine-1.0.6}/tests/test_overlay.py +0 -0
  71. {codespine-1.0.5 → codespine-1.0.6}/tests/test_result_cache.py +0 -0
  72. {codespine-1.0.5 → codespine-1.0.6}/tests/test_search_ranking.py +0 -0
  73. {codespine-1.0.5 → codespine-1.0.6}/tests/test_sharding.py +0 -0
  74. {codespine-1.0.5 → codespine-1.0.6}/tests/test_store_recovery.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 1.0.5
3
+ Version: 1.0.6
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -1,4 +1,4 @@
1
1
  """CodeSpine package."""
2
2
 
3
3
  __all__ = ["__version__"]
4
- __version__ = "1.0.5"
4
+ __version__ = "1.0.6"
@@ -192,6 +192,21 @@ def _index_shard_group(
192
192
  with output_lock:
193
193
  _phase(f"{prefix}Tracing calls...", "starting...")
194
194
  return
195
+ if event == "resolve_calls_heartbeat":
196
+ # Fires every 2 s from a daemon thread so the spinner stays
197
+ # alive even when the resolver produces no new edges.
198
+ scanned = int(payload.get("scanned", 0))
199
+ edges = int(payload.get("edges", 0))
200
+ elapsed_s = float(payload.get("elapsed", 0.0))
201
+ if not parallel:
202
+ click.echo(
203
+ f"\r{_spinner_char()} {prefix}Tracing calls... "
204
+ f"{edges:>6} resolved / {scanned} scanned {elapsed_s:.1f}s ",
205
+ nl=False,
206
+ )
207
+ call_state["shown"] = True
208
+ call_state["last_ts"] = now
209
+ return
195
210
  if event == "resolve_calls_progress":
196
211
  call_state["count"] = int(payload.get("calls_resolved", 0))
197
212
  if (now - call_state["last_ts"]) >= 0.25:
@@ -345,6 +360,37 @@ def analyse(path: str, full: bool, deep: bool, incremental_deep: bool, embed: bo
345
360
  # For single-project analysis this is transparent — shard() always
346
361
  # returns a GraphStore pointing to the correct shard path.
347
362
  sg = ShardedGraphStore(read_only=False)
363
+
364
+ # ── SIGINT handler: flush partial index on Ctrl+C ────────────────────
365
+ # The handler captures `sg` by closure. On interrupt it snapshots all
366
+ # open shards so `codespine stats` and MCP see the partial result, then
367
+ # calls os._exit(130) to bypass Python cleanup (safe for CLI process).
368
+ # A second Ctrl+C hard-exits immediately.
369
+ _sigint_pressed: list[bool] = [False]
370
+ _old_sigint_handler = signal.getsignal(signal.SIGINT)
371
+
372
+ def _sigint_flush(signum: int, frame: object) -> None: # noqa: ARG001
373
+ if _sigint_pressed[0]:
374
+ os._exit(130)
375
+ _sigint_pressed[0] = True
376
+ # Restore default handler so a second Ctrl+C exits immediately.
377
+ signal.signal(signal.SIGINT, signal.default_int_handler)
378
+ click.secho(
379
+ "\n\n⚠ Interrupted — flushing partial index to read replica…",
380
+ fg="yellow",
381
+ )
382
+ try:
383
+ sg.snapshot_all(background=False)
384
+ click.secho(
385
+ "✓ Partial index saved. Run 'codespine stats' to see what was indexed.",
386
+ fg="yellow",
387
+ )
388
+ except Exception: # noqa: BLE001
389
+ pass
390
+ os._exit(130)
391
+
392
+ signal.signal(signal.SIGINT, _sigint_flush)
393
+
348
394
  # The indexer is initialised per-module below with the right shard store.
349
395
  # We keep a single ShardedGraphStore to fan-out cross-module linking later.
350
396
 
@@ -594,6 +640,9 @@ def analyse(path: str, full: bool, deep: bool, incremental_deep: bool, embed: bo
594
640
  sg.snapshot_all(background=False)
595
641
  _finish_phase(snap_label, "MCP will reload automatically")
596
642
 
643
+ # Restore original SIGINT handler now that we've finished cleanly.
644
+ signal.signal(signal.SIGINT, _old_sigint_handler)
645
+
597
646
 
598
647
  @main.command()
599
648
  @click.argument("query")
@@ -741,15 +790,27 @@ def stats(as_json: bool, show_shards: bool) -> None:
741
790
  click.secho("No projects indexed yet. Run 'codespine analyse <path>'.", fg="yellow")
742
791
  return
743
792
 
793
+ def _stat_count(store, query: str, params: dict) -> int:
794
+ """Run a stats count query — returns 0 on any failure."""
795
+ try:
796
+ rows = store.query_records(query, params)
797
+ return int(rows[0]["n"]) if rows else 0
798
+ except Exception as exc: # noqa: BLE001
799
+ click.secho(f" (stat unavailable: {exc})", fg="yellow")
800
+ return 0
801
+
744
802
  rows = []
745
803
  for p in all_projects_meta:
746
804
  pid = p["id"]
747
805
  # Route each query to the project's owning shard.
748
806
  ps = _project_store(pid)
749
- files = ps.query_records(
750
- "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n", {"pid": pid}
807
+ n_files = _stat_count(
808
+ ps,
809
+ "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n",
810
+ {"pid": pid},
751
811
  )
752
- classes = ps.query_records(
812
+ n_classes = _stat_count(
813
+ ps,
753
814
  """
754
815
  MATCH (f:File) WHERE f.project_id = $pid
755
816
  WITH f
@@ -758,7 +819,8 @@ def stats(as_json: bool, show_shards: bool) -> None:
758
819
  """,
759
820
  {"pid": pid},
760
821
  )
761
- methods = ps.query_records(
822
+ n_methods = _stat_count(
823
+ ps,
762
824
  """
763
825
  MATCH (f:File) WHERE f.project_id = $pid
764
826
  WITH f
@@ -769,7 +831,8 @@ def stats(as_json: bool, show_shards: bool) -> None:
769
831
  """,
770
832
  {"pid": pid},
771
833
  )
772
- calls = ps.query_records(
834
+ n_calls = _stat_count(
835
+ ps,
773
836
  """
774
837
  MATCH (f:File) WHERE f.project_id = $pid
775
838
  WITH f
@@ -780,7 +843,8 @@ def stats(as_json: bool, show_shards: bool) -> None:
780
843
  """,
781
844
  {"pid": pid},
782
845
  )
783
- emb = ps.query_records(
846
+ n_emb = _stat_count(
847
+ ps,
784
848
  """
785
849
  MATCH (f:File) WHERE f.project_id = $pid
786
850
  WITH f
@@ -793,11 +857,11 @@ def stats(as_json: bool, show_shards: bool) -> None:
793
857
  "project": pid,
794
858
  "path": p["path"],
795
859
  "shard": sg.router.shard_for(pid),
796
- "files": files[0]["n"] if files else 0,
797
- "classes": classes[0]["n"] if classes else 0,
798
- "methods": methods[0]["n"] if methods else 0,
799
- "calls_out": calls[0]["n"] if calls else 0,
800
- "embeddings": emb[0]["n"] if emb else 0,
860
+ "files": n_files,
861
+ "classes": n_classes,
862
+ "methods": n_methods,
863
+ "calls_out": n_calls,
864
+ "embeddings": n_emb,
801
865
  })
802
866
 
803
867
  if as_json:
@@ -0,0 +1,523 @@
1
+ """Cypher-to-SQL translation for CodeSpine's DuckDB backend.
2
+
3
+ Translates the specific subset of OpenCypher used by CodeSpine into
4
+ equivalent DuckDB SQL so that every ``store.query_records(cypher, params)``
5
+ call continues to work without touching the call-sites.
6
+
7
+ Supported constructs
8
+ --------------------
9
+ - Node patterns MATCH (alias:Label) or (a:L {prop: $v})
10
+ - Anonymous nodes (:Label) in NOT-EXISTS subqueries
11
+ - Relationship patterns (a)-[r:REL]->(b) directed
12
+ - Undirected edges (a)-[r:REL]-(b) → OR of both directions
13
+ - Virtual FK edges (a)-[:HAS_METHOD]->(b) → b.class_id = a.id (no edge table)
14
+ - Multi-hop patterns (a)-[:R1]->(x)-[:R2]->(b)
15
+ - Anonymous destination (a)-[:CALLS]->()
16
+ - Multi-MATCH + WITH Multiple MATCH clauses joined by WITH pipeline stages
17
+ - WHERE =, <>, IN, CONTAINS, lower(), coalesce(),
18
+ IS NULL, IS NOT NULL, >=, <=
19
+ - NOT EXISTS subqueries NOT EXISTS { MATCH (:N)-[:R]->(m) }
20
+ - WITH … ORDER BY Kuzu paging construct → plain ORDER BY
21
+ - DISTINCT, ORDER BY, LIMIT
22
+ - Aggregates count(n) → count(*)
23
+ - Literal values 'string' in RETURN (e.g. 'method' as kind)
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import re
29
+ from typing import Any
30
+
31
# ---------------------------------------------------------------------------
# Schema mappings
# ---------------------------------------------------------------------------

# Kuzu node label → DuckDB table name
_LABEL_TABLE: dict[str, str] = {
    "Project": "projects",
    "File": "files",
    "Class": "classes",
    "Method": "methods",
    "Symbol": "symbols",
    "Community": "communities",
    "Flow": "flows",
}

# Kuzu relationship type → (edge_table, src_col, dst_col, extra_where | None)
# ``extra_where`` discriminates relationship kinds that share one physical
# table (references_type stores both OVERRIDES and IMPLEMENTS rows).
_REL_EDGE: dict[str, tuple[str, str, str, str | None]] = {
    "CALLS": ("calls", "source_id", "target_id", None),
    "OVERRIDES": ("references_type", "src_id", "dst_id", "rel = 'OVERRIDES'"),
    "IMPLEMENTS": ("references_type", "src_id", "dst_id", "rel = 'IMPLEMENTS'"),
    "INJECTS": ("injects", "src_class_id", "dst_class_id", None),
    "BINDS_INTERFACE": ("binds_interface", "src_class_id", "dst_class_id", None),
    "IN_COMMUNITY": ("community_members", "symbol_id", "community_id", None),
    "IN_FLOW": ("flow_members", "symbol_id", "flow_id", None),
    "CO_CHANGED_WITH": ("co_changed_with", "file_a", "file_b", None),
}

# Virtual FK edges: backed by a foreign-key column rather than a separate
# edge table. Format: (src_label_table, dst_label_table, dst_fk_col)
# e.g. HAS_METHOD: methods.class_id = class.id
_VIRTUAL_REL_EDGE: dict[str, tuple[str, str, str]] = {
    "HAS_METHOD": ("classes", "methods", "class_id"),
    "HAS_CLASS": ("files", "classes", "file_id"),
    "DECLARES": ("files", "symbols", "file_id"),
}

# All real edge tables (used for the anonymous total-edge-count fast path).
_ALL_EDGE_TABLES = (
    "calls",
    "references_type",
    "injects",
    "binds_interface",
    "community_members",
    "flow_members",
    "co_changed_with",
)

# Top-level Cypher keywords recognised by the clause splitter.
# Order matters: longer / more-specific patterns must come before shorter
# ones ("OPTIONAL MATCH" before "MATCH", "ORDER BY" before anything that
# might otherwise be split at "ORDER").
_TOP_KEYWORDS = (
    "OPTIONAL MATCH",
    "ORDER BY",
    "MATCH",
    "WITH",
    "WHERE",
    "RETURN",
    "LIMIT",
)
89
+
90
+
91
def is_cypher(query: str) -> bool:
    """Heuristic dialect sniff: a query whose first token is MATCH is Cypher."""
    stripped = query.lstrip()
    return re.match(r"MATCH\b", stripped, flags=re.IGNORECASE) is not None
94
+
95
+
96
+ # ---------------------------------------------------------------------------
97
+ # Main entry point
98
+ # ---------------------------------------------------------------------------
99
+
100
def translate(cypher: str, params: dict[str, Any] | None = None) -> tuple[str, dict[str, Any]]:
    """Translate *cypher* into DuckDB SQL.

    The returned ``(sql, params)`` pair keeps the original ``$name``
    bindings untouched, so callers can feed it straight into
    ``duckdb.connect().execute(sql, params)``.
    """
    return _translate(cypher), (params or {})
109
+
110
+
111
+ # ---------------------------------------------------------------------------
112
+ # Clause splitter
113
+ # ---------------------------------------------------------------------------
114
+
115
+ def _split_clauses(q: str) -> list[tuple[str, str]]:
116
+ """Tokenise a (whitespace-normalised) Cypher query into clause pairs.
117
+
118
+ Returns a list of ``(keyword, body)`` tuples at the TOP level of the
119
+ query. Keywords inside ``()``, ``[]``, ``{}`` or quoted strings are
120
+ NOT treated as clause boundaries.
121
+
122
+ Example::
123
+
124
+ "MATCH (f:File) WHERE f.id = $x WITH f MATCH (c:Class) RETURN c.name"
125
+ → [("MATCH", "(f:File)"),
126
+ ("WHERE", "f.id = $x"),
127
+ ("WITH", "f"),
128
+ ("MATCH", "(c:Class)"),
129
+ ("RETURN","c.name")]
130
+ """
131
+ results: list[tuple[str, str]] = []
132
+ n = len(q)
133
+ i = 0
134
+ depth_paren = depth_sq = depth_brace = 0
135
+ in_quote = False
136
+ quote_char = ""
137
+ current_kw: str | None = None
138
+ current_start = 0
139
+
140
+ while i < n:
141
+ ch = q[i]
142
+
143
+ # ── Quote handling ────────────────────────────────────────────────
144
+ if in_quote:
145
+ if ch == "\\" and i + 1 < n:
146
+ i += 2 # skip escaped char
147
+ continue
148
+ if ch == quote_char:
149
+ in_quote = False
150
+ i += 1
151
+ continue
152
+ if ch in ('"', "'"):
153
+ in_quote = True
154
+ quote_char = ch
155
+ i += 1
156
+ continue
157
+
158
+ # ── Depth tracking ────────────────────────────────────────────────
159
+ if ch == "(":
160
+ depth_paren += 1
161
+ elif ch == ")":
162
+ depth_paren = max(0, depth_paren - 1)
163
+ elif ch == "[":
164
+ depth_sq += 1
165
+ elif ch == "]":
166
+ depth_sq = max(0, depth_sq - 1)
167
+ elif ch == "{":
168
+ depth_brace += 1
169
+ elif ch == "}":
170
+ depth_brace = max(0, depth_brace - 1)
171
+
172
+ # ── Keyword detection (top-level only) ───────────────────────────
173
+ if depth_paren == 0 and depth_sq == 0 and depth_brace == 0:
174
+ for kw in _TOP_KEYWORDS:
175
+ kl = len(kw)
176
+ if q[i : i + kl].upper() == kw:
177
+ end_pos = i + kl
178
+ # Require word boundary: end-of-string or non-word char
179
+ if end_pos < n and (q[end_pos].isalnum() or q[end_pos] == "_"):
180
+ continue # e.g. "MATCHING" is not "MATCH"
181
+ # Flush previous clause
182
+ if current_kw is not None:
183
+ body = q[current_start:i].strip()
184
+ results.append((current_kw, body))
185
+ current_kw = kw
186
+ current_start = end_pos
187
+ i = end_pos
188
+ break
189
+ else:
190
+ i += 1
191
+ else:
192
+ i += 1
193
+
194
+ # Flush final clause
195
+ if current_kw is not None:
196
+ body = q[current_start:].strip()
197
+ results.append((current_kw, body))
198
+
199
+ return results
200
+
201
+
202
+ # ---------------------------------------------------------------------------
203
+ # Internal translation pipeline
204
+ # ---------------------------------------------------------------------------
205
+
206
+ def _translate_anonymous_edge_count(cypher: str) -> str | None:
207
+ """Fast-path for ``MATCH ()-[r]->() RETURN count(r) [as X]``.
208
+
209
+ Anonymous edge patterns carry no labels so the generic translator
210
+ cannot derive a FROM table. We special-case the count-all-edges
211
+ pattern by summing row-counts across every edge table.
212
+ """
213
+ q = re.sub(r"\s+", " ", cypher.strip())
214
+ m = re.match(
215
+ r"(?i)MATCH\s*\(\s*\)\s*-\s*\[\s*\w*\s*\]\s*->\s*\(\s*\)\s*"
216
+ r"RETURN\s+count\s*\(\s*[*]?\w*\s*\)\s*(?:as\s+(\w+))?\s*$",
217
+ q,
218
+ )
219
+ if not m:
220
+ return None
221
+ alias = m.group(1) or "count"
222
+ unions = " UNION ALL ".join(
223
+ f"SELECT COUNT(*) AS c FROM {tbl}" for tbl in _ALL_EDGE_TABLES
224
+ )
225
+ return f"SELECT COALESCE(SUM(c), 0) AS {alias} FROM ({unions}) t"
226
+
227
+
228
def _translate(cypher: str) -> str:
    """Translate one Cypher query into a single DuckDB SELECT statement.

    Pipeline: normalise whitespace → split into top-level clauses →
    absorb MATCH patterns into an alias→table map plus join conditions →
    fold WHERE / RETURN / ORDER BY / LIMIT into SQL fragments → assemble.
    The first ORDER BY / LIMIT encountered wins; later ones are ignored.
    """
    # ── Fast-path: anonymous total-edge-count ────────────────────────────
    special = _translate_anonymous_edge_count(cypher)
    if special is not None:
        return special

    q = re.sub(r"\s+", " ", cypher.strip())
    clauses = _split_clauses(q)

    aliases: dict[str, str] = {}   # alias → table name
    inline_conds: list[str] = []   # WHERE from inline {prop: $val}
    edge_conds: list[str] = []     # join conditions from real edge tables
    virtual_conds: list[str] = []  # FK conditions from virtual edges
    where_parts: list[str] = []    # collected WHERE bodies
    ret_cols = "*"
    ret_distinct = ""
    order_clause = ""
    limit_clause = ""
    _rel_counter = {"n": 0}        # numbers auto-generated edge aliases

    for kw, body in clauses:
        if kw == "MATCH":
            _absorb_match(body, aliases, inline_conds, edge_conds,
                          virtual_conds, _rel_counter)
        elif kw == "OPTIONAL MATCH":
            # Degenerate: register new node aliases so their columns are
            # selectable, but don't add INNER JOIN constraints.
            # Full LEFT JOIN support is a future enhancement.
            _absorb_match_nodes_only(body, aliases)
        elif kw == "WITH":
            # Paging idiom: WITH x ORDER BY x.col LIMIT n
            # (NOTE(review): the splitter normally emits ORDER BY / LIMIT
            # as their own top-level clauses, so this branch is mostly a
            # defensive net — confirm against the splitter's behaviour.)
            ob = re.search(r"(?i)ORDER\s+BY\s+(.+?)(?:\s+LIMIT\s+\S+)?\s*$", body)
            if ob and not order_clause:
                order_clause = "ORDER BY " + ob.group(1).strip()
            lm = re.search(r"(?i)LIMIT\s+(\S+)", body)
            if lm and not limit_clause:
                limit_clause = "LIMIT " + lm.group(1)
            # Pipeline-separator WITH (no ORDER BY) is simply dropped.
        elif kw == "WHERE":
            where_parts.append(body)
        elif kw == "RETURN":
            # DISTINCT prefix
            dm = re.match(r"(?i)DISTINCT\s+(.*)", body)
            if dm:
                ret_distinct = "DISTINCT "
                body = dm.group(1)
            # Trailing ORDER BY inside RETURN
            ob = re.search(r"(?i)\s+ORDER\s+BY\s+(.+?)(?=\s*(?:LIMIT|$))", body)
            if ob and not order_clause:
                order_clause = "ORDER BY " + ob.group(1).strip()
                body = body[: ob.start()].strip()
            # Trailing LIMIT inside RETURN
            lm = re.search(r"(?i)\s+LIMIT\s+(\S+)", body)
            if lm and not limit_clause:
                limit_clause = "LIMIT " + lm.group(1)
                body = body[: lm.start()].strip()
            ret_cols = body.strip()
        elif kw == "ORDER BY":
            if not order_clause:
                order_clause = "ORDER BY " + body
        elif kw == "LIMIT":
            if not limit_clause:
                limit_clause = "LIMIT " + body.split()[0]

    # ── FROM clause ───────────────────────────────────────────────────────
    # Plain comma-joins; the edge / virtual / inline conditions collected
    # above do the actual joining in WHERE.  The fallback FROM yields zero
    # rows for pattern-less queries instead of raising.
    seen: set[str] = set()
    from_parts: list[str] = []
    for alias, tbl in aliases.items():
        entry = f"{tbl} {alias}"
        if entry not in seen:
            from_parts.append(entry)
            seen.add(entry)
    from_str = ", ".join(from_parts) if from_parts else "(SELECT 1 WHERE 1=0) _empty(x)"

    # ── WHERE clause ──────────────────────────────────────────────────────
    all_conds: list[str] = []
    all_conds.extend(edge_conds)
    all_conds.extend(virtual_conds)
    all_conds.extend(inline_conds)
    for wp in where_parts:
        expanded = _expand_not_exists(wp, aliases)
        transformed = _transform_where(expanded)
        if transformed:
            all_conds.append(transformed)
    where_str = " AND ".join(c for c in all_conds if c)

    # ── SELECT ────────────────────────────────────────────────────────────
    select_str = _transform_select(ret_cols)

    # ── Assemble ──────────────────────────────────────────────────────────
    parts = [f"SELECT {ret_distinct}{select_str}", f"FROM {from_str}"]
    if where_str:
        parts.append(f"WHERE {where_str}")
    if order_clause:
        parts.append(order_clause)
    if limit_clause:
        parts.append(limit_clause)
    return " ".join(parts)
326
+
327
+
328
+ # ---------------------------------------------------------------------------
329
+ # MATCH body processing
330
+ # ---------------------------------------------------------------------------
331
+
332
# Rel pattern: (src)-[alias:TYPE]->(dst) or (src)-[alias:TYPE]-(dst)
# dst alias is optional (anonymous destination)
_REL_DIRECTED_RE = re.compile(
    r"\((\w+)(?::\w+(?:\s*\{[^}]*\})?)?\)"  # src node
    r"\s*-\[(\w*):(\w+)\]->\s*"  # -[alias:TYPE]->
    r"\((\w*)(?::\w+(?:\s*\{[^}]*\})?)?\)"  # dst node (alias optional)
)
# Same shape without the arrow head.  A directed "]->(" cannot match here
# because "]-" must be followed (after optional whitespace) by "(",
# never ">", so only genuinely undirected hops are captured.
_REL_UNDIRECTED_RE = re.compile(
    r"\((\w+)(?::\w+(?:\s*\{[^}]*\})?)?\)"  # src node
    r"\s*-\[(\w*):(\w+)\]-\s*"  # -[alias:TYPE]-
    r"\((\w*)(?::\w+(?:\s*\{[^}]*\})?)?\)"  # dst node
)
# Named node pattern: (alias:Label) with an optional inline {prop: $val} map.
_NODE_RE = re.compile(r"\((\w+):(\w+)(?:\s*\{([^}]+)\})?\)")
345
+
346
+
347
def _absorb_match(
    body: str,
    aliases: dict[str, str],
    inline_conds: list[str],
    edge_conds: list[str],
    virtual_conds: list[str],
    _rel_counter: dict[str, int],
) -> None:
    """Fold one MATCH body into the alias table and join-condition lists."""

    # 1. Named node patterns: (alias:Label [{prop: $val}]).  Inline maps
    #    become simple equality conditions on the alias.
    for node in _NODE_RE.finditer(body):
        alias, label, props = node.group(1), node.group(2), node.group(3)
        table = _LABEL_TABLE.get(label)
        if table is not None and alias not in aliases:
            aliases[alias] = table
        if props:
            for pair in re.finditer(r"(\w+)\s*:\s*(\$\w+)", props):
                inline_conds.append(f"{alias}.{pair.group(1)} = {pair.group(2)}")

    # 2. Directed relationship hops.  Each substitution collapses one hop
    #    down to its destination node, so repeated passes walk a multi-hop
    #    chain left→right until the text stops changing.
    def _consume(match_: re.Match) -> str:
        src, rel_name, rel_kind, dst = match_.group(1, 2, 3, 4)
        _process_rel(src, rel_name, rel_kind, dst,
                     aliases, edge_conds, virtual_conds, _rel_counter,
                     undirected=False)
        return f"({dst})" if dst else "()"

    remaining = body
    while True:
        collapsed = _REL_DIRECTED_RE.sub(_consume, remaining)
        if collapsed == remaining:
            break
        remaining = collapsed

    # 3. Undirected relationship hops (scanned on the original body).
    for um in _REL_UNDIRECTED_RE.finditer(body):
        _process_rel(um.group(1), um.group(2), um.group(3), um.group(4),
                     aliases, edge_conds, virtual_conds, _rel_counter,
                     undirected=True)
391
+
392
+
393
def _absorb_match_nodes_only(body: str, aliases: dict[str, str]) -> None:
    """Register node aliases from an OPTIONAL MATCH without adding joins.

    Columns of OPTIONAL MATCH nodes stay reachable in SELECT/WHERE even
    though no proper LEFT JOIN is emitted yet.
    """
    for node in _NODE_RE.finditer(body):
        table = _LABEL_TABLE.get(node.group(2))
        if table is not None:
            aliases.setdefault(node.group(1), table)
403
+
404
+
405
def _process_rel(
    src_alias: str,
    rel_alias: str,
    rel_type: str,
    dst_alias: str,
    aliases: dict[str, str],
    edge_conds: list[str],
    virtual_conds: list[str],
    _rel_counter: dict[str, int],
    *,
    undirected: bool,
) -> None:
    """Emit join conditions for one relationship hop."""

    # ── Virtual FK edge: no edge table, just a foreign-key equality ──────
    virtual = _VIRTUAL_REL_EDGE.get(rel_type)
    if virtual is not None:
        _, fk_table, fk_col = virtual
        if dst_alias:
            # Register an un-labelled dst reference; labelled ones were
            # already registered by the node scan in _absorb_match.
            aliases.setdefault(dst_alias, fk_table)
            virtual_conds.append(f"{dst_alias}.{fk_col} = {src_alias}.id")
        return

    # ── Real edge table ───────────────────────────────────────────────────
    spec = _REL_EDGE.get(rel_type)
    if spec is None:
        return  # unknown relationship type — skip silently

    edge_tbl, src_col, dst_col, extra = spec
    _rel_counter["n"] += 1
    join_alias = rel_alias or f"_r{_rel_counter['n']}"
    aliases.setdefault(join_alias, edge_tbl)

    edge_conds.append(f"{join_alias}.{src_col} = {src_alias}.id")
    if dst_alias:
        if undirected:
            # Undirected hop: accept the edge in either orientation.
            edge_conds.append(
                f"({join_alias}.{dst_col} = {dst_alias}.id"
                f" OR {join_alias}.{src_col} = {dst_alias}.id)"
            )
        else:
            edge_conds.append(f"{join_alias}.{dst_col} = {dst_alias}.id")
    if extra:
        # Shared-table discriminator, e.g. rel = 'OVERRIDES'.
        edge_conds.append(f"{join_alias}.{extra}")
452
+
453
+
454
+ # ---------------------------------------------------------------------------
455
+ # WHERE expansion
456
+ # ---------------------------------------------------------------------------
457
+
458
+ def _expand_not_exists(body: str, aliases: dict[str, str]) -> str:
459
+ """Replace ``NOT EXISTS { MATCH ... }`` with a SQL subquery."""
460
+
461
+ def _replace(m_obj: re.Match) -> str:
462
+ inner = m_obj.group(1)
463
+ rel_m = re.search(
464
+ r"\((\w*):?(\w*)\)-\[:(\w+)\]->\((\w*):?(\w*)\)", inner
465
+ )
466
+ if not rel_m:
467
+ return m_obj.group(0)
468
+ src_alias = rel_m.group(1)
469
+ rel_type = rel_m.group(3)
470
+ dst_alias = rel_m.group(4)
471
+ if rel_type not in _REL_EDGE:
472
+ return m_obj.group(0)
473
+ edge_tbl, src_col, dst_col, _ = _REL_EDGE[rel_type]
474
+ if dst_alias and dst_alias in aliases:
475
+ return (
476
+ f"NOT EXISTS (SELECT 1 FROM {edge_tbl} _ne"
477
+ f" WHERE _ne.{dst_col} = {dst_alias}.id)"
478
+ )
479
+ if src_alias and src_alias in aliases:
480
+ return (
481
+ f"NOT EXISTS (SELECT 1 FROM {edge_tbl} _ne"
482
+ f" WHERE _ne.{src_col} = {src_alias}.id)"
483
+ )
484
+ return m_obj.group(0)
485
+
486
+ return re.sub(
487
+ r"NOT\s+EXISTS\s*\{\s*MATCH\s*([^}]+)\}",
488
+ _replace,
489
+ body,
490
+ flags=re.IGNORECASE,
491
+ )
492
+
493
+
494
+ # ---------------------------------------------------------------------------
495
+ # Clause-level transformations
496
+ # ---------------------------------------------------------------------------
497
+
498
+ def _transform_where(where: str) -> str:
499
+ if not where:
500
+ return ""
501
+ # CONTAINS → LIKE
502
+ where = re.sub(
503
+ r"(\w+\.\w+|\blower\([^)]+\)|\bcoalesce\([^)]+\))\s+CONTAINS\s+(\$\w+|'[^']*')",
504
+ lambda m: f"{m.group(1)} LIKE '%' || {m.group(2)} || '%'",
505
+ where,
506
+ flags=re.IGNORECASE,
507
+ )
508
+ # IN $list → = ANY($list)
509
+ where = re.sub(
510
+ r"\bIN\s+(\$\w+)\b",
511
+ r"= ANY(\1)",
512
+ where,
513
+ flags=re.IGNORECASE,
514
+ )
515
+ return where
516
+
517
+
518
+ def _transform_select(ret: str) -> str:
519
+ if not ret:
520
+ return "*"
521
+ # count(alias) → count(*)
522
+ ret = re.sub(r"\bcount\s*\(\s*\w+\s*\)", "count(*)", ret, flags=re.IGNORECASE)
523
+ return ret