@rubytech/create-maxy 1.0.714 → 1.0.716

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. package/dist/index.js +33 -0
  2. package/package.json +1 -1
  3. package/payload/platform/lib/graph-search/dist/index.d.ts +27 -11
  4. package/payload/platform/lib/graph-search/dist/index.d.ts.map +1 -1
  5. package/payload/platform/lib/graph-search/dist/index.js +29 -13
  6. package/payload/platform/lib/graph-search/dist/index.js.map +1 -1
  7. package/payload/platform/lib/graph-search/src/__tests__/fulltext-coverage.test.ts +267 -0
  8. package/payload/platform/lib/graph-search/src/index.ts +27 -12
  9. package/payload/platform/neo4j/schema.cypher +53 -7
  10. package/payload/platform/plugins/docs/references/internals.md +4 -4
  11. package/payload/platform/plugins/memory/references/graph-primitives.md +9 -3
  12. package/payload/platform/scripts/embed-backfill.sh +370 -0
  13. package/payload/platform/scripts/seed-neo4j.sh +10 -4
  14. package/payload/server/public/assets/{Checkbox-DD2mv2dU.js → Checkbox-DzNre1pt.js} +1 -1
  15. package/payload/server/public/assets/{admin-BmuLrDs2.js → admin-BtlXk1uc.js} +1 -1
  16. package/payload/server/public/assets/{data-BiexCwhp.js → data-BTKZGy84.js} +1 -1
  17. package/payload/server/public/assets/{file-CXzgVus7.js → file-DTW7TZ7G.js} +1 -1
  18. package/payload/server/public/assets/{graph-DOWTWpqy.js → graph-CkcuUDtA.js} +1 -1
  19. package/payload/server/public/assets/{house-DuPFyfb8.js → house-DKctRD9t.js} +1 -1
  20. package/payload/server/public/assets/{jsx-runtime-UA_LoTFj.css → jsx-runtime-97_Z_wBZ.css} +1 -1
  21. package/payload/server/public/assets/{public-Tjx3543Z.js → public-CYQSF6Xf.js} +1 -1
  22. package/payload/server/public/assets/{share-2-C_nKw-UZ.js → share-2--beVI26B.js} +1 -1
  23. package/payload/server/public/assets/{useVoiceRecorder-aDBUrrIb.js → useVoiceRecorder-CACPhPdW.js} +1 -1
  24. package/payload/server/public/assets/{x-Do7BO9Ow.js → x-DGk8kvbj.js} +1 -1
  25. package/payload/server/public/data.html +6 -6
  26. package/payload/server/public/graph.html +7 -7
  27. package/payload/server/public/index.html +8 -8
  28. package/payload/server/public/public.html +5 -5
  29. package/payload/server/server.js +1 -1
  30. /package/payload/server/public/assets/{jsx-runtime-DV3X_CC7.js → jsx-runtime-C-H-0vwA.js} +0 -0
@@ -258,13 +258,59 @@ OPTIONS {
258
258
  }
259
259
  };
260
260
 
261
- // Full-text BM25 index for hybrid keyword search across document levels.
262
- // Post-Task 740: sections carry their body inline so the index covers
263
- // KnowledgeDocument.summary, Section.summary, Section.body, and the legacy
264
- // Chunk.content for any Chunks still present from pre-740 ingests.
265
- CREATE FULLTEXT INDEX knowledge_fulltext IF NOT EXISTS
266
- FOR (k:KnowledgeDocument|Section|Chunk)
267
- ON EACH [k.summary, k.content, k.body];
261
+ // Universal full-text BM25 index for hybrid keyword search (Task 748).
262
+ //
263
+ // Every operator-meaningful label written by the platform is in the index union;
264
+ // every textual property a writer assigns is in the property union. Neo4j silently
265
+ // ignores absent properties on a given label, so over-inclusion is harmless.
266
+ //
267
+ // **Doctrine.** Search is "find any node in my graph that mentions this term" —
268
+ // not "find a knowledge document". Pre-Task-748 the index, named `knowledge_fulltext`,
269
+ // covered only `KnowledgeDocument | Section | Chunk` (3 of ~40 written labels), so
270
+ // BM25 silently returned zero hits for Person/Organization/Task/Conversation/etc.
271
+ // regardless of query. Universal coverage is the doctrine; the doctrine test at
272
+ // `platform/lib/graph-search/src/__tests__/fulltext-coverage.test.ts` parses this
273
+ // declaration and asserts label-set ⊇ union(GRAPH_LABEL_COLOURS, schema-declared)
274
+ // so future label additions cannot silently re-narrow.
275
+ //
276
+ // Label union — every operator-meaningful label:
277
+ // - Business identity: LocalBusiness, Service, PriceSpecification, OpeningHoursSpecification, Organization
278
+ // - People: Person, UserProfile, Preference, AdminUser, AccessGrant
279
+ // - Knowledge: KnowledgeDocument, Section, Chunk (legacy), DigitalDocument, CreativeWork,
280
+ // Question, FAQPage, DefinedTerm, Review, ImageObject
281
+ // - Conversational: Conversation, AdminConversation, PublicConversation, Message,
282
+ // UserMessage, AssistantMessage, ToolCall
283
+ // - Tasks/projects/events: Task, Project, Event
284
+ // - Workflows: Workflow, WorkflowStep, WorkflowRun, StepResult
285
+ // - Onboarding: OnboardingState
286
+ // - Email: Email, EmailAccount
287
+ // - Review signals: ReviewAlert
288
+ // - CV/career sublabels: Position, Credential
289
+ //
290
+ // Property union — every textual property the schema's writers assign:
291
+ // - Generic: name, title, summary, body, content, text, description, headline, abstract,
292
+ // note, label, value, message, preview, tagline
293
+ // - Person: firstName, lastName, givenName, familyName, email
294
+ // - Email: subject, bodyPreview, fromName, fromAddress
295
+ // - EmailAccount: agentAddress
296
+ // - Email: screeningReason
297
+ // - Credential: authority
298
+ // - AccessGrant: contactValue
299
+ // - ToolCall: toolName
300
+ CREATE FULLTEXT INDEX entity_search IF NOT EXISTS
301
+ FOR (n:LocalBusiness|Service|PriceSpecification|OpeningHoursSpecification|Organization
302
+ |Person|UserProfile|Preference|AdminUser|AccessGrant
303
+ |KnowledgeDocument|Section|Chunk|DigitalDocument|CreativeWork|Question|FAQPage|DefinedTerm|Review|ImageObject
304
+ |Conversation|AdminConversation|PublicConversation|Message|UserMessage|AssistantMessage|ToolCall
305
+ |Task|Project|Event
306
+ |Workflow|WorkflowStep|WorkflowRun|StepResult
307
+ |OnboardingState|Email|EmailAccount|ReviewAlert
308
+ |Position|Credential)
309
+ ON EACH [n.name, n.firstName, n.lastName, n.givenName, n.familyName,
310
+ n.title, n.summary, n.body, n.content, n.text, n.description, n.headline, n.abstract,
311
+ n.email, n.note, n.label, n.value, n.message, n.preview, n.tagline,
312
+ n.subject, n.bodyPreview, n.fromName, n.fromAddress, n.agentAddress, n.screeningReason,
313
+ n.authority, n.contactValue, n.toolName];
268
314
 
269
315
  // Project node (Task 740) — a standalone creative-output node distinct from
270
316
  // :Section. Anchored via (:UserProfile)-[:CREATED]->(:Project), with optional
@@ -18,7 +18,7 @@ QUERY
18
18
  │ ├──► MERGE ──► EXPAND ──► RESULTS
19
19
  │ │
20
20
  └── ESCAPE (Lucene special chars) ──────► BM25 FULL-TEXT ──┘
21
- (knowledge_fulltext index)
21
+ (entity_search index — universal coverage)
22
22
 
23
23
  Merge formula: combined = 0.7 × vector_score + 0.3 × normalised_bm25_score
24
24
  Deduplication: by nodeId — when a node appears in both paths, keep the max score from each method independently, then combine.
@@ -29,7 +29,7 @@ Fallback: if the full-text index doesn't exist, vector-only results are returned
29
29
 
30
30
  **Vector path:** The query is embedded via Ollama (model per `EMBED_MODEL` env var, default `nomic-embed-text`). The resulting vector is compared against Neo4j's HNSW cosine indexes — one per indexed label. Dimensions are configured at install time (default 768). The search runs against all discovered indexes (or a subset if the caller specifies label filters). Scores are in [0, 1] (cosine similarity).
31
31
 
32
- **BM25 path:** The raw query text is escaped for Lucene special characters and run against the `knowledge_fulltext` full-text index, which spans `KnowledgeDocument`, `Section`, and `Chunk` labels on their `summary` and `content` properties. Raw BM25 scores are in [0, infinity) — they are normalised to [0, 1] via min-max scaling within the result set before merging. When all scores are equal (or a single result), all normalise to 1.0.
32
+ **BM25 path:** The raw query text is escaped for Lucene special characters and run against the `entity_search` full-text index (Task 748 — universal coverage), which spans every operator-meaningful label written by the platform on the canonical text-property union (~28 properties: `name`, `firstName`, `lastName`, `givenName`, `familyName`, `title`, `summary`, `body`, `content`, `description`, `headline`, `email`, `subject`, `bodyPreview`, etc.). Pre-Task-748 the index was named `knowledge_fulltext` and covered only `KnowledgeDocument | Section | Chunk` — that gap silently hid Person/Organization/Task/Event/etc. from BM25 regardless of query. Raw BM25 scores are in [0, infinity) — they are normalised to [0, 1] via min-max scaling within the result set before merging. When all scores are equal (or a single result), all normalise to 1.0.
33
33
 
34
34
  **Merge:** Results from both paths are collected in a single map keyed by `nodeId`. A node appearing in both paths accumulates the max vector score and max BM25 score independently. The combined score is `0.7 * vectorScore + 0.3 * bm25Score`. Results are sorted descending by combined score, then sliced to the requested limit (default 10).
35
35
 
@@ -59,7 +59,7 @@ Indexed labels: `Question`, `DefinedTerm`, `Review`, `Service`, `Person`, `Local
59
59
 
60
60
  | Index name | Labels | Properties | Purpose |
61
61
  |---|---|---|---|
62
- | `knowledge_fulltext` | KnowledgeDocument, Section, Chunk | `summary`, `content` | BM25 keyword matching for the hybrid pipeline |
62
+ | `entity_search` | All operator-meaningful labels (~40, see [`schema.cypher`](../../../neo4j/schema.cypher)) | Canonical text-property union (~28) | Universal BM25 keyword matching across the whole graph (Task 748) |
63
63
 
64
64
  ### Embedding lifecycle
65
65
 
@@ -282,7 +282,7 @@ Each public agent can subscribe to up to 5 keywords via `knowledgeKeywords` in i
282
282
 
283
283
  For each subscription keyword, two complementary searches run:
284
284
 
285
- 1. **BM25 full-text search** — queries the `knowledge_fulltext` index with the keyword as the search term. Catches content that mentions the keyword in its text.
285
+ 1. **BM25 full-text search** — queries the universal `entity_search` index (Task 748) with the keyword as the search term. Catches content that mentions the keyword in its text across every operator-meaningful label.
286
286
 
287
287
  2. **Property-based search** — finds nodes whose `keywords` array property contains the subscription keyword (case-insensitive). Catches nodes explicitly tagged with that keyword topic. These matches are boosted to maximum BM25 score (1.0) since they are exact tag matches.
288
288
 
@@ -292,11 +292,13 @@ Or use `maxy-graph-get_neo4j_schema` for a richer one-shot structural summary.
292
292
 
293
293
  ### Fulltext
294
294
 
295
- Use the `knowledge_fulltext` index for keyword-style search across
296
- KnowledgeDocument / Section / Chunk content:
295
+ Use the universal `entity_search` index (Task 748) for keyword-style search
296
+ across every operator-meaningful label — Person, Organization, Task, Event,
297
+ Conversation, KnowledgeDocument, Email, etc. — on every textual property
298
+ the platform's writers assign:
297
299
 
298
300
  ```cypher
299
- CALL db.index.fulltext.queryNodes('knowledge_fulltext', $query)
301
+ CALL db.index.fulltext.queryNodes('entity_search', $query)
300
302
  YIELD node, score
301
303
  WHERE score > 0.5
302
304
  RETURN labels(node)[0] AS type,
@@ -306,6 +308,10 @@ RETURN labels(node)[0] AS type,
306
308
  LIMIT 20
307
309
  ```
308
310
 
311
+ Pre-Task-748 the index was named `knowledge_fulltext` and covered only
312
+ `KnowledgeDocument | Section | Chunk`. Existing Pis pick up the rename on
313
+ the next install via `seed-neo4j.sh`.
314
+
309
315
  ### Filter by status or category
310
316
 
311
317
  Events that are cancelled:
@@ -0,0 +1,370 @@
1
+ #!/usr/bin/env bash
2
+ # ============================================================
3
+ # embed-backfill.sh — populate embeddings on legacy nodes (Task 748)
4
+ #
5
+ # Walks the Neo4j graph for nodes carrying any registered Maxy label that
6
+ # lack `n.embedding` and have at least one populated text property. For
7
+ # each such node the script takes the first non-null property from a
8
+ # subset of the fulltext index's property union (`name`, `title`, `summary`,
9
+ # `headline`, `body`, `content`, `text`), POSTs it to Ollama's `/api/embed`
10
+ # endpoint, and writes the resulting vector back to the node.
11
+ #
12
+ # Why it exists. Pre-Task-748 bulk-import paths (notably `memory-archive-write`
13
+ # for LinkedIn Connections.csv, ~5096 Persons per import) skipped per-row
14
+ # embedding to keep import latency under five minutes. With Task 748's
15
+ # universal fulltext coverage in place, BM25 catches those nodes immediately
16
+ # but vector ranking is sparse until embeddings exist. This script heals
17
+ # both the legacy backlog and any future bulk-imported population.
18
+ #
19
+ # Idempotent. Re-running picks up exactly where a prior run left off because
20
+ # the gating predicate is `n.embedding IS NULL` — nodes embedded by the
21
+ # previous run are excluded from the next batch query.
22
+ #
23
+ # Loud failure (per feedback_loud_failures.md). Any Ollama HTTP failure or
24
+ # cypher-shell error aborts the script with a non-zero exit and prints a
25
+ # precise re-run instruction. Partial-state-on-abort is safe: nodes whose
26
+ # embedding was committed before the abort stay embedded; the rest fall back
27
+ # into the next run's batch.
28
+ #
29
+ # Concurrent-run safety. flock-guarded — a second concurrent invocation
30
+ # exits immediately with a clear message, no work attempted. Protects
31
+ # against operator double-clicks and against the installer running it
32
+ # while a manual run is in flight.
33
+ #
34
+ # Usage. Stand-alone re-run: `bash platform/scripts/embed-backfill.sh`.
35
+ # Installer-driven: invoked automatically post-`seed-neo4j.sh` on every
36
+ # install (the no-op fast path returns in milliseconds when nothing is
37
+ # pending, so re-running on every install is harmless).
38
+ # ============================================================
39
+
40
+ set -euo pipefail
41
+
42
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
43
+ PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
44
+
45
+ NEO4J_URI="${NEO4J_URI:-bolt://localhost:7687}"
46
+ NEO4J_USER="${NEO4J_USER:-neo4j}"
47
+ OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
48
+ EMBED_MODEL="${EMBED_MODEL:-nomic-embed-text}"
49
+ BATCH_SIZE="${EMBED_BACKFILL_BATCH_SIZE:-50}"
50
+
51
+ # Lock file is brand-scoped via the install directory hash so concurrent
52
+ # Maxy + Real Agent installs (or any two brand installs sharing the device)
53
+ # do not block each other unnecessarily — they target separate Neo4j
54
+ # instances under separate INSTALL_DIRs and have zero shared state. The
55
+ # explicit env var override stays for operator-driven workflows.
56
+ INSTALL_DIR_HASH="$(echo -n "$PROJECT_DIR" | shasum | cut -c1-12)"
57
+ LOCK_FILE="${EMBED_BACKFILL_LOCK_FILE:-/tmp/maxy-embed-backfill-${INSTALL_DIR_HASH}.lock}"
58
+
59
+ # Resolve Neo4j password the same way seed-neo4j.sh does. Explicit env var
60
+ # takes precedence so the installer can pass it through without writing the
61
+ # file twice.
62
+ NEO4J_PASSWORD_FILE="$PROJECT_DIR/config/.neo4j-password"
63
+ if [ -z "${NEO4J_PASSWORD:-}" ]; then
64
+ if [ -f "$NEO4J_PASSWORD_FILE" ]; then
65
+ NEO4J_PASSWORD=$(cat "$NEO4J_PASSWORD_FILE")
66
+ else
67
+ echo "[embed-backfill] FAILED: NEO4J_PASSWORD env var unset and $NEO4J_PASSWORD_FILE missing"
68
+ echo "[embed-backfill] re-run after the seed step writes the password file, or set NEO4J_PASSWORD explicitly"
69
+ exit 1
70
+ fi
71
+ fi
72
+ export NEO4J_URI NEO4J_USER NEO4J_PASSWORD OLLAMA_URL EMBED_MODEL BATCH_SIZE
73
+
74
+ if ! command -v cypher-shell >/dev/null 2>&1; then
75
+ echo "[embed-backfill] FAILED: cypher-shell not on PATH; install Neo4j or add cypher-shell to PATH"
76
+ exit 1
77
+ fi
78
+ if ! command -v python3 >/dev/null 2>&1; then
79
+ echo "[embed-backfill] FAILED: python3 not on PATH; the installer requires it"
80
+ exit 1
81
+ fi
82
+
83
+ # flock guard — second concurrent invocation exits cleanly. The exec on
84
+ # fd 200 keeps the lock held for the lifetime of this process; flock -n
85
+ # is non-blocking so a busy lock returns immediately rather than queueing.
86
+ exec 200>"$LOCK_FILE"
87
+ if ! flock -n 200; then
88
+ echo "[embed-backfill] another instance is already running (lock=$LOCK_FILE), skipping"
89
+ exit 0
90
+ fi
91
+
92
+ # The python heredoc owns the per-batch loop. It uses subprocess to call
93
+ # cypher-shell (avoids re-implementing Bolt) and urllib to call Ollama
94
+ # (no extra deps). cypher-shell `--format plain` returns CSV; the csv
95
+ # module handles quoting/escaping reliably so node text containing commas,
96
+ # quotes, or newlines round-trips correctly.
97
+ #
98
+ # Cypher contract:
99
+ # READ: one row per unembedded node — { id: elementId, text: coalesced }
100
+ # gated by `n.embedding IS NULL` AND `any(label IN labels(n)
101
+ # WHERE label IN $registered)` AND a non-empty coalesce of the
102
+ # text property union. Nodes carrying an :Trashed label are
103
+ # excluded explicitly. READ params (`registered` list of strings,
104
+ # `batchSize` int) are passed via cypher-shell `--param` as plain
105
+ # Cypher expressions (string list literals + integer literal).
106
+ # WRITE: one batched UNWIND per chunk — pairs of (id, embedding[])
107
+ # interpolated into the Cypher payload as bare-key map literals
108
+ # (`{id: '...', embedding: [...]}`). Cypher does NOT accept
109
+ # double-quoted-string map keys, so JSON-serialised values cannot
110
+ # be passed via `--param` for the WRITE side; the inline literal
111
+ # path is the apoc-free alternative.
112
+ #
113
+ # The script does NOT shell out to the existing TS embed() helper because
114
+ # that would require booting Node + the platform/lib build. Calling the
115
+ # Ollama HTTP endpoint directly preserves the same behaviour with zero
116
+ # build dependency.
117
+ exec python3 - <<'PYEOF'
118
+ import json
119
+ import os
120
+ import sys
121
+ import time
122
+ import urllib.error
123
+ import urllib.request
124
+ from subprocess import PIPE, Popen
125
+ from io import StringIO
126
+ import csv
127
+
128
+ NEO4J_URI = os.environ["NEO4J_URI"]
129
+ NEO4J_USER = os.environ["NEO4J_USER"]
130
+ NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]
131
+ OLLAMA_URL = os.environ["OLLAMA_URL"]
132
+ EMBED_MODEL = os.environ["EMBED_MODEL"]
133
+ BATCH_SIZE = int(os.environ["BATCH_SIZE"])
134
+
135
+ # Mirrors the FOR (n:...) clause of `entity_search` in schema.cypher.
136
+ # Doctrine: every label written by the platform is searchable AND embeddable.
137
+ # Future label additions must extend BOTH this list and schema.cypher; the
138
+ # fulltext-coverage doctrine test catches the schema half but not this list.
139
+ REGISTERED_LABELS = [
140
+ "LocalBusiness", "Service", "PriceSpecification", "OpeningHoursSpecification", "Organization",
141
+ "Person", "UserProfile", "Preference", "AdminUser", "AccessGrant",
142
+ "KnowledgeDocument", "Section", "Chunk", "DigitalDocument", "CreativeWork",
143
+ "Question", "FAQPage", "DefinedTerm", "Review", "ImageObject",
144
+ "Conversation", "AdminConversation", "PublicConversation", "Message",
145
+ "UserMessage", "AssistantMessage", "ToolCall",
146
+ "Task", "Project", "Event",
147
+ "Workflow", "WorkflowStep", "WorkflowRun", "StepResult",
148
+ "OnboardingState", "Email", "EmailAccount", "ReviewAlert",
149
+ "Position", "Credential",
150
+ ]
151
+
152
+ # Properties to coalesce for the embedding text. Ordered: most identifying
153
+ # property first. Matches the canonical text-property list pinned by the
154
+ # fulltext-coverage doctrine test.
155
+ EMBED_TEXT_PROPS = ["name", "title", "summary", "headline", "body", "content", "text"]
156
+
157
+
158
def cypher(query: str, params: dict | None = None) -> str:
    """Pipe one Cypher statement into ``cypher-shell --format plain``.

    Each entry of ``params`` is forwarded as ``--param "<key> => <json>"``,
    the value being serialised with ``json.dumps``. The query text is written
    to the child's stdin as UTF-8.

    Returns the child's stdout decoded as UTF-8 (undecodable bytes replaced).
    On a non-zero exit status a ``[embed-backfill] FAILED`` line and the
    child's stderr are written to stderr, then the script exits with status 1.
    """
    argv = [
        "cypher-shell",
        "-u", NEO4J_USER,
        "-p", NEO4J_PASSWORD,
        "-a", NEO4J_URI,
        "--format", "plain",
    ]
    # An empty or missing params mapping adds no --param flags at all.
    for name, value in (params or {}).items():
        argv += ["--param", f"{name} => {json.dumps(value)}"]

    child = Popen(argv, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    stdout_bytes, stderr_bytes = child.communicate(query.encode("utf-8"))
    if child.returncode == 0:
        return stdout_bytes.decode("utf-8", errors="replace")

    sys.stderr.write(f"[embed-backfill] FAILED: cypher-shell exited {child.returncode}\n")
    sys.stderr.write(stderr_bytes.decode("utf-8", errors="replace"))
    sys.exit(1)
176
+
177
+
178
def parse_csv_rows(stdout: str) -> list[dict]:
    """Parse ``cypher-shell --format plain`` output into one dict per row.

    The first line is treated as the header and supplies the dict keys.
    Blank or whitespace-only output yields an empty list.

    ``skipinitialspace=True`` makes the reader tolerate a ``", "`` (comma +
    space) column separator. Without it, every column after the first is
    keyed with a leading space (``" text"`` instead of ``"text"``) and a
    quoted value keeps its quotes, because the quote is no longer the first
    character of the field. Output that uses a bare ``,`` separator parses
    exactly as before.

    NOTE(review): cypher-shell's plain format appears to emit ``", "`` between
    columns — confirm against the installed cypher-shell version.
    """
    if not stdout.strip():
        return []
    reader = csv.DictReader(StringIO(stdout), skipinitialspace=True)
    return list(reader)
185
+
186
+
187
def ollama_embed(text: str, *, timeout: int = 30, retry_on_timeout: bool = True) -> list[float]:
    """POST ``text`` to Ollama's ``/api/embed`` and return the first embedding.

    Args:
        text: sent as the ``input`` field, with ``EMBED_MODEL`` as ``model``.
        timeout: seconds passed to ``urlopen`` for this attempt.
        retry_on_timeout: when True (the default), a ``TimeoutError`` triggers
            exactly one further attempt with a 180s timeout and retry
            disabled, so a slow first response does not abort the run.
            Callers that already pass a long timeout (the warmup probe)
            set this to False.

    Returns:
        ``payload["embeddings"][0]`` from the JSON response.

    Exits the script with status 1, after writing a ``[embed-backfill] FAILED``
    line to stderr, when the request times out with retry disabled, when
    urllib raises ``URLError``/``HTTPError``, or when the response carries no
    embedding. The timeout and unreachable cases also print a re-run
    instruction.
    """

    def rerun_command() -> str:
        # This program is fed to `python3 -` on stdin, where `__file__` is not
        # defined; referencing it directly would raise NameError in the very
        # path that is meant to print the re-run instruction. Fall back to the
        # repo-relative invocation documented in the script header.
        script_path = globals().get("__file__")
        if script_path:
            return f"bash {os.path.dirname(os.path.realpath(script_path))}/embed-backfill.sh"
        return "bash platform/scripts/embed-backfill.sh"

    def abort(reason: str, *, with_rerun: bool):
        sys.stderr.write(f"[embed-backfill] FAILED: {reason}\n")
        if with_rerun:
            sys.stderr.write(f"[embed-backfill] re-run via: {rerun_command()}\n")
        sys.exit(1)

    body = json.dumps({"model": EMBED_MODEL, "input": text}).encode("utf-8")
    req = urllib.request.Request(
        f"{OLLAMA_URL}/api/embed",
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
    except TimeoutError as e:
        if retry_on_timeout:
            sys.stderr.write(
                f"[embed-backfill] WARN: Ollama timeout after {timeout}s — likely cold-start; retrying with 180s timeout\n"
            )
            # Second attempt disables retry, so the recursion depth is at most one.
            return ollama_embed(text, timeout=180, retry_on_timeout=False)
        abort(f"Ollama timeout after {timeout}s ({e})", with_rerun=True)
    except (urllib.error.URLError, urllib.error.HTTPError) as e:
        abort(f"Ollama unreachable ({e})", with_rerun=True)

    embeddings = payload.get("embeddings", [])
    if not embeddings or not embeddings[0]:
        abort(f"Ollama returned no embedding for text length={len(text)}", with_rerun=False)
    return embeddings[0]
233
+
234
+
235
def cypher_string_literal(s: str) -> str:
    """Wrap ``s`` in single quotes for inline use in a Cypher statement.

    Every backslash becomes ``\\\\`` and every single quote becomes ``\\'``;
    all other characters pass through unchanged. The mapping is applied in a
    single pass, so the backslashes introduced for quotes are never escaped
    a second time.
    """
    escapes = {ord("\\"): "\\\\", ord("'"): "\\'"}
    return f"'{s.translate(escapes)}'"
245
+
246
+
247
def cypher_float_list(values: list[float]) -> str:
    """Format a sequence of numbers as a Cypher list literal ``[v1,v2,...]``.

    Each element is coerced with ``float()`` before ``repr()``, so a value
    that JSON decoding produced as an ``int`` (for example ``0``) is written
    as ``0.0`` rather than ``0``. This keeps the literal a homogeneous list
    of floats instead of a mix of integer and float literals.

    Non-finite values are not validated here: ``repr`` renders them as
    ``inf`` / ``nan``. NOTE(review): whether Cypher accepts those tokens as
    number literals is not established by this script — confirm before
    relying on it.

    An empty input yields ``[]``.
    """
    return "[" + ",".join(repr(float(v)) for v in values) + "]"
257
+
258
+
259
+ # Build the WHERE clause once. The $registered parameter is interpolated
260
+ # into Cypher as a list literal; cypher-shell --param gives us a typed pass.
261
+ COALESCE_TEXT = "coalesce(" + ", ".join(f"n.{p}" for p in EMBED_TEXT_PROPS) + ", '')"
262
+ COUNT_QUERY = f"""
263
+ MATCH (n) WHERE n.embedding IS NULL
264
+ AND NOT n:Trashed
265
+ AND any(label IN labels(n) WHERE label IN $registered)
266
+ AND {COALESCE_TEXT} <> ''
267
+ RETURN count(n) AS remaining;
268
+ """
269
+ BATCH_QUERY = f"""
270
+ MATCH (n) WHERE n.embedding IS NULL
271
+ AND NOT n:Trashed
272
+ AND any(label IN labels(n) WHERE label IN $registered)
273
+ AND {COALESCE_TEXT} <> ''
274
+ RETURN elementId(n) AS id,
275
+ labels(n)[0] AS firstLabel,
276
+ {COALESCE_TEXT} AS text
277
+ LIMIT $batchSize;
278
+ """
279
+
280
+ count_out = cypher(COUNT_QUERY, {"registered": REGISTERED_LABELS})
281
+ total_remaining = 0
282
+ for row in parse_csv_rows(count_out):
283
+ total_remaining = int(row["remaining"])
284
+
285
+ print(f"[embed-backfill] start total={total_remaining} model={EMBED_MODEL}")
286
+
287
+ if total_remaining == 0:
288
+ print("[embed-backfill] done remaining=0 (nothing to backfill)")
289
+ sys.exit(0)
290
+
291
+ # Pre-warm Ollama so the first per-node call doesn't pay the model-load
292
+ # latency. The cold-start window for nomic-embed-text on a Pi 5 can exceed
293
+ # 30s; calling once with a tiny throwaway input loads the weights into
294
+ # memory before the loop begins. Failure here is treated identically to
295
+ # any other Ollama failure — loud abort with re-run instruction.
296
+ print(f"[embed-backfill] pre-warm model={EMBED_MODEL} timeout=180s")
297
+ ollama_embed("warmup", timeout=180, retry_on_timeout=False)
298
+
299
+ processed_total = 0
300
+ batch_index = 0
301
+ while True:
302
+ batch_start = time.time()
303
+ batch_out = cypher(
304
+ BATCH_QUERY,
305
+ {"registered": REGISTERED_LABELS, "batchSize": BATCH_SIZE},
306
+ )
307
+ rows = parse_csv_rows(batch_out)
308
+ if not rows:
309
+ break
310
+
311
+ # Compute embeddings serially. Ollama on a Pi 5 handles ~3-10 embeds
312
+ # per second with nomic-embed-text; concurrent requests just queue
313
+ # behind the GPU/CPU bottleneck so parallelism wouldn't help.
314
+ pairs: list[tuple[str, list[float]]] = []
315
+ label_counts: dict[str, int] = {}
316
+ for row in rows:
317
+ node_id = row["id"]
318
+ text = row["text"]
319
+ first_label = row["firstLabel"]
320
+ if not text:
321
+ continue
322
+ embedding = ollama_embed(text)
323
+ pairs.append((node_id, embedding))
324
+ label_counts[first_label] = label_counts.get(first_label, 0) + 1
325
+
326
+ if not pairs:
327
+ # Defensive: query said rows exist but all text was empty after
328
+ # the python read — means the COALESCE_TEXT predicate is wider
329
+ # than the python check. Stop to avoid an infinite loop.
330
+ sys.stderr.write("[embed-backfill] WARN: batch returned rows with empty text — stopping to avoid infinite loop\n")
331
+ break
332
+
333
+ # Build the WRITE batch as a Cypher literal payload rather than a
334
+ # `--param` map. cypher-shell's `--param` parses the value as a Cypher
335
+ # expression, and Cypher map keys must be bare identifiers (or backtick-
336
+ # quoted) — NOT double-quoted strings as JSON would emit. Interpolating
337
+ # bare-key map literals directly avoids the question entirely:
338
+ #
339
+ # UNWIND [{id: '4:abc:1', embedding: [0.1, 0.2, ...]}, ...] AS pair
340
+ # MATCH (n) WHERE elementId(n) = pair.id
341
+ # SET n.embedding = pair.embedding;
342
+ #
343
+ # cypher_string_literal escapes any backslash/quote in elementIds
344
+ # defensively; cypher_float_list serialises the embedding via repr()
345
+ # which Cypher accepts as a number literal.
346
+ pair_literals = ",".join(
347
+ f"{{id: {cypher_string_literal(node_id)}, embedding: {cypher_float_list(embedding)}}}"
348
+ for node_id, embedding in pairs
349
+ )
350
+ cypher(
351
+ f"""
352
+ UNWIND [{pair_literals}] AS pair
353
+ MATCH (n) WHERE elementId(n) = pair.id
354
+ SET n.embedding = pair.embedding;
355
+ """
356
+ )
357
+ elapsed_ms = int((time.time() - batch_start) * 1000)
358
+ batch_index += 1
359
+ processed_total += len(pairs)
360
+ label_summary = ", ".join(f"{k}={v}" for k, v in sorted(label_counts.items()))
361
+ print(f"[embed-backfill] batch={batch_index} processed={len(pairs)} elapsed-ms={elapsed_ms} labels={label_summary}")
362
+
363
+ # Final remaining check — should be zero or the diff between original
364
+ # total and processed_total (e.g. if new writes landed mid-run).
365
+ final_out = cypher(COUNT_QUERY, {"registered": REGISTERED_LABELS})
366
+ final_remaining = 0
367
+ for row in parse_csv_rows(final_out):
368
+ final_remaining = int(row["remaining"])
369
+ print(f"[embed-backfill] done processed={processed_total} remaining={final_remaining}")
370
+ PYEOF
@@ -411,13 +411,19 @@ fi
411
411
 
412
412
  echo "==> Connecting to Neo4j at $NEO4J_URI as $NEO4J_USER"
413
413
 
414
- # Migration: drop single-key UserProfile constraint (replaced by composite
415
- # (accountId, userId) in Task 249). Also drop the old preference_category
416
- # index replaced by (accountId, userId, category) composite.
417
- echo "==> Migrating schema: dropping single-key UserProfile constraint..."
414
+ # Schema migrations run before the main schema apply so renames don't collide
415
+ # with the new declarations. Each statement is idempotent (`IF EXISTS`):
416
+ # - Task 249: `user_profile_account_unique` replaced by composite (accountId, userId).
417
+ # - Task 249: `preference_category` index replaced by (accountId, userId, category).
418
+ # - Task 748: `knowledge_fulltext` (3 labels) replaced by `entity_search` (~40 labels)
419
+ # with the universal label/property union. The new index is created by the
420
+ # schema apply below; dropping the old name here is what lets cypher-shell
421
+ # run both in one pass without conflict.
422
+ echo "==> Migrating schema: dropping renamed/obsolete constraints + indexes..."
418
423
  "$CYPHER_SHELL" -u "$NEO4J_USER" -p "$NEO4J_PASSWORD" -a "$NEO4J_URI" << 'MIGRATE_EOF'
419
424
  DROP CONSTRAINT user_profile_account_unique IF EXISTS;
420
425
  DROP INDEX preference_category IF EXISTS;
426
+ DROP INDEX knowledge_fulltext IF EXISTS;
421
427
  MIGRATE_EOF
422
428
 
423
429
  # Vector index dimensions — configurable at install time via --embed-model.
@@ -1 +1 @@
1
- import{t as e}from"./jsx-runtime-DV3X_CC7.js";var t=e();function n({checked:e,onChange:n,label:r,disabled:i}){return(0,t.jsxs)(`label`,{className:`maxy-checkbox${i?` maxy-checkbox--disabled`:``}`,children:[(0,t.jsx)(`input`,{type:`checkbox`,checked:e,onChange:e=>n(e.target.checked),disabled:i}),(0,t.jsx)(`span`,{className:`maxy-checkbox__box`,children:`✱`}),r&&(0,t.jsx)(`span`,{className:`maxy-checkbox__label`,children:r})]})}export{n as t};
1
import { t as loadJsxRuntime } from "./jsx-runtime-C-H-0vwA.js";

const runtime = loadJsxRuntime();

/**
 * Labelled checkbox control.
 *
 * Renders a `<label>` wrapping a native checkbox input, a decorative box
 * span, and — when `label` is truthy — a text span. `onChange` receives the
 * input's new `checked` boolean rather than the raw event.
 */
function Checkbox({ checked, onChange, label, disabled }) {
  const rootClass = `maxy-checkbox${disabled ? ` maxy-checkbox--disabled` : ``}`;

  const input = (0, runtime.jsx)(`input`, {
    type: `checkbox`,
    checked,
    onChange: (event) => onChange(event.target.checked),
    disabled,
  });
  const box = (0, runtime.jsx)(`span`, {
    className: `maxy-checkbox__box`,
    children: `✱`,
  });
  // A falsy label is passed through as the child itself, matching `label && …`.
  const caption =
    label &&
    (0, runtime.jsx)(`span`, {
      className: `maxy-checkbox__label`,
      children: label,
    });

  return (0, runtime.jsxs)(`label`, {
    className: rootClass,
    children: [input, box, caption],
  });
}

export { Checkbox as t };