npm - @rubytech/create-maxy - Versions diffs - 1.0.714 → 1.0.716 - Mend

@rubytech/create-maxy 1.0.714 → 1.0.716

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

package/payload/platform/neo4j/schema.cypher CHANGED Viewed

@@ -258,13 +258,59 @@ OPTIONS {
   }
 };
-// Full-text BM25 index for hybrid keyword search across document levels.
-// Post-Task 740: sections carry their body inline so the index covers
-// KnowledgeDocument.summary, Section.summary, Section.body, and the legacy
-// Chunk.content for any Chunks still present from pre-740 ingests.
-CREATE FULLTEXT INDEX knowledge_fulltext IF NOT EXISTS
-FOR (k:KnowledgeDocument|Section|Chunk)
-ON EACH [k.summary, k.content, k.body];
+// Universal full-text BM25 index for hybrid keyword search (Task 748).
+//
+// Every operator-meaningful label written by the platform is in the index union;
+// every textual property a writer assigns is in the property union. Neo4j silently
+// ignores absent properties on a given label, so over-inclusion is harmless.
+//
+// **Doctrine.** Search is "find any node in my graph that mentions this term" —
+// not "find a knowledge document". Pre-Task-748 the index name `knowledge_fulltext`
+// covered only `KnowledgeDocument | Section | Chunk` (3 of ~40 written labels), so
+// BM25 silently returned zero hits for Person/Organization/Task/Conversation/etc.
+// regardless of query. Universal coverage is the doctrine; the doctrine test at
+// `platform/lib/graph-search/src/__tests__/fulltext-coverage.test.ts` parses this
+// declaration and asserts label-set ⊇ union(GRAPH_LABEL_COLOURS, schema-declared)
+// so future label additions cannot silently re-narrow.
+//
+// Label union — every operator-meaningful label:
+//   - Business identity: LocalBusiness, Service, PriceSpecification, OpeningHoursSpecification, Organization
+//   - People: Person, UserProfile, Preference, AdminUser, AccessGrant
+//   - Knowledge: KnowledgeDocument, Section, Chunk (legacy), DigitalDocument, CreativeWork,
+//     Question, FAQPage, DefinedTerm, Review, ImageObject
+//   - Conversational: Conversation, AdminConversation, PublicConversation, Message,
+//     UserMessage, AssistantMessage, ToolCall
+//   - Tasks/projects/events: Task, Project, Event
+//   - Workflows: Workflow, WorkflowStep, WorkflowRun, StepResult
+//   - Onboarding: OnboardingState
+//   - Email: Email, EmailAccount
+//   - Review signals: ReviewAlert
+//   - CV/career sublabels: Position, Credential
+//
+// Property union — every textual property the schema's writers assign:
+//   - Generic: name, title, summary, body, content, text, description, headline, abstract,
+//     note, label, value, message, preview, tagline
+//   - Person: firstName, lastName, givenName, familyName, email
+//   - Email: subject, bodyPreview, fromName, fromAddress, screeningReason
+//   - EmailAccount: agentAddress
+//   - Credential: authority
+//   - AccessGrant: contactValue
+//   - ToolCall: toolName
+CREATE FULLTEXT INDEX entity_search IF NOT EXISTS
+FOR (n:LocalBusiness|Service|PriceSpecification|OpeningHoursSpecification|Organization
+    |Person|UserProfile|Preference|AdminUser|AccessGrant
+    |KnowledgeDocument|Section|Chunk|DigitalDocument|CreativeWork|Question|FAQPage|DefinedTerm|Review|ImageObject
+    |Conversation|AdminConversation|PublicConversation|Message|UserMessage|AssistantMessage|ToolCall
+    |Task|Project|Event
+    |Workflow|WorkflowStep|WorkflowRun|StepResult
+    |OnboardingState|Email|EmailAccount|ReviewAlert
+    |Position|Credential)
+ON EACH [n.name, n.firstName, n.lastName, n.givenName, n.familyName,
+         n.title, n.summary, n.body, n.content, n.text, n.description, n.headline, n.abstract,
+         n.email, n.note, n.label, n.value, n.message, n.preview, n.tagline,
+         n.subject, n.bodyPreview, n.fromName, n.fromAddress, n.agentAddress, n.screeningReason,
+         n.authority, n.contactValue, n.toolName];
 // Project node (Task 740) — a standalone creative-output node distinct from
 // :Section. Anchored via (:UserProfile)-[:CREATED]->(:Project), with optional

package/payload/platform/plugins/docs/references/internals.md CHANGED Viewed

@@ -18,7 +18,7 @@ QUERY
   │                                                          ├──► MERGE ──► EXPAND ──► RESULTS
   │                                                          │
   └── ESCAPE (Lucene special chars) ──────► BM25 FULL-TEXT ──┘
-                                            (knowledge_fulltext index)
+                                            (entity_search index — universal coverage)
 Merge formula: combined = 0.7 × vector_score + 0.3 × normalised_bm25_score
 Deduplication: by nodeId — when a node appears in both paths, keep the max score from each method independently, then combine.
@@ -29,7 +29,7 @@ Fallback: if the full-text index doesn't exist, vector-only results are returned
 **Vector path:** The query is embedded via Ollama (model per `EMBED_MODEL` env var, default `nomic-embed-text`). The resulting vector is compared against Neo4j's HNSW cosine indexes — one per indexed label. Dimensions are configured at install time (default 768). The search runs against all discovered indexes (or a subset if the caller specifies label filters). Scores are in [0, 1] (cosine similarity).
-**BM25 path:** The raw query text is escaped for Lucene special characters and run against the `knowledge_fulltext` full-text index, which spans `KnowledgeDocument`, `Section`, and `Chunk` labels on their `summary` and `content` properties. Raw BM25 scores are in [0, infinity) — they are normalised to [0, 1] via min-max scaling within the result set before merging. When all scores are equal (or a single result), all normalise to 1.0.
+**BM25 path:** The raw query text is escaped for Lucene special characters and run against the `entity_search` full-text index (Task 748 — universal coverage), which spans every operator-meaningful label written by the platform on the canonical text-property union (~28 properties: `name`, `firstName`, `lastName`, `givenName`, `familyName`, `title`, `summary`, `body`, `content`, `description`, `headline`, `email`, `subject`, `bodyPreview`, etc.). Pre-Task-748 the index was named `knowledge_fulltext` and covered only `KnowledgeDocument | Section | Chunk` — that gap silently hid Person/Organization/Task/Event/etc. from BM25 regardless of query. Raw BM25 scores are in [0, infinity) — they are normalised to [0, 1] via min-max scaling within the result set before merging. When all scores are equal (or a single result), all normalise to 1.0.
 **Merge:** Results from both paths are collected in a single map keyed by `nodeId`. A node appearing in both paths accumulates the max vector score and max BM25 score independently. The combined score is `0.7 * vectorScore + 0.3 * bm25Score`. Results are sorted descending by combined score, then sliced to the requested limit (default 10).
@@ -59,7 +59,7 @@ Indexed labels: `Question`, `DefinedTerm`, `Review`, `Service`, `Person`, `Local
 | Index name | Labels | Properties | Purpose |
 |---|---|---|---|
-| `knowledge_fulltext` | KnowledgeDocument, Section, Chunk | `summary`, `content` | BM25 keyword matching for the hybrid pipeline |
+| `entity_search` | All operator-meaningful labels (~40, see [`schema.cypher`](../../../neo4j/schema.cypher)) | Canonical text-property union (~28) | Universal BM25 keyword matching across the whole graph (Task 748) |
 ### Embedding lifecycle
@@ -282,7 +282,7 @@ Each public agent can subscribe to up to 5 keywords via `knowledgeKeywords` in i
 For each subscription keyword, two complementary searches run:
-1. **BM25 full-text search** — queries the `knowledge_fulltext` index with the keyword as the search term. Catches content that mentions the keyword in its text.
+1. **BM25 full-text search** — queries the universal `entity_search` index (Task 748) with the keyword as the search term. Catches content that mentions the keyword in its text across every operator-meaningful label.
 2. **Property-based search** — finds nodes whose `keywords` array property contains the subscription keyword (case-insensitive). Catches nodes explicitly tagged with that keyword topic. These matches are boosted to maximum BM25 score (1.0) since they are exact tag matches.

package/payload/platform/plugins/memory/references/graph-primitives.md CHANGED Viewed

@@ -292,11 +292,13 @@ Or use `maxy-graph-get_neo4j_schema` for a richer one-shot structural summary.
 ### Fulltext
-Use the `knowledge_fulltext` index for keyword-style search across
-KnowledgeDocument / Section / Chunk content:
+Use the universal `entity_search` index (Task 748) for keyword-style search
+across every operator-meaningful label — Person, Organization, Task, Event,
+Conversation, KnowledgeDocument, Email, etc. — on every textual property
+the platform's writers assign:
 ```cypher
-CALL db.index.fulltext.queryNodes('knowledge_fulltext', $query)
+CALL db.index.fulltext.queryNodes('entity_search', $query)
 YIELD node, score
 WHERE score > 0.5
 RETURN labels(node)[0] AS type,
@@ -306,6 +308,10 @@ RETURN labels(node)[0] AS type,
 LIMIT 20
 ```
+Pre-Task-748 the index was named `knowledge_fulltext` and covered only
+`KnowledgeDocument | Section | Chunk`. Existing Pis pick up the rename on
+the next install via `seed-neo4j.sh`.
 ### Filter by status or category
 Events that are cancelled:

package/payload/platform/scripts/embed-backfill.sh ADDED Viewed

@@ -0,0 +1,370 @@
+#!/usr/bin/env bash
+# ============================================================
+# embed-backfill.sh — populate embeddings on legacy nodes (Task 748)
+#
+# Walks the Neo4j graph for nodes carrying any registered Maxy label that
+# lack `n.embedding` and have at least one populated text property. For
+# each such node the script takes the first populated property from a fixed,
+# ordered subset of the fulltext index's property union (`name`, `title`,
+# `summary`, `headline`, `body`, `content`, `text`), POSTs that text to
+# Ollama's `/api/embed` endpoint, and writes the resulting vector back to
+# the node. Nodes whose only text lives in other indexed properties (e.g.
+# `firstName`/`lastName`, `subject`) are not selected by this script.
+#
+# Why it exists. Pre-Task-748 bulk-import paths (notably `memory-archive-write`
+# for LinkedIn Connections.csv, ~5096 Persons per import) skipped per-row
+# embedding to keep import latency under five minutes. With Task 748's
+# universal fulltext coverage in place, BM25 catches those nodes immediately
+# but vector ranking is sparse until embeddings exist. This script heals
+# both the legacy backlog and any future bulk-imported population.
+#
+# Idempotent. Re-running picks up exactly where a prior run left off because
+# the gating predicate is `n.embedding IS NULL` — nodes embedded by the
+# previous run are excluded from the next batch query.
+#
+# Loud failure (per feedback_loud_failures.md). Any Ollama HTTP failure or
+# cypher-shell error aborts the script with a non-zero exit and prints a
+# precise re-run instruction. Partial-state-on-abort is safe: nodes whose
+# embedding was committed before the abort stay embedded; the rest fall back
+# into the next run's batch.
+#
+# Concurrent-run safety. flock-guarded — a second concurrent invocation
+# exits immediately with a clear message, no work attempted. Protects
+# against operator double-clicks and against the installer running it
+# while a manual run is in flight.
+#
+# Usage. Stand-alone re-run: `bash platform/scripts/embed-backfill.sh`.
+# Installer-driven: invoked automatically post-`seed-neo4j.sh` on every
+# install (the no-op fast path returns in milliseconds when nothing is
+# pending, so re-running on every install is harmless).
+# ============================================================
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+NEO4J_URI="${NEO4J_URI:-bolt://localhost:7687}"
+NEO4J_USER="${NEO4J_USER:-neo4j}"
+OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
+EMBED_MODEL="${EMBED_MODEL:-nomic-embed-text}"
+BATCH_SIZE="${EMBED_BACKFILL_BATCH_SIZE:-50}"
+# Lock file is brand-scoped via the install directory hash so concurrent
+# Maxy + Real Agent installs (or any two brand installs sharing the device)
+# do not block each other unnecessarily — they target separate Neo4j
+# instances under separate INSTALL_DIRs and have zero shared state. The
+# explicit env var override stays for operator-driven workflows.
+INSTALL_DIR_HASH="$(echo -n "$PROJECT_DIR" | shasum | cut -c1-12)"
+LOCK_FILE="${EMBED_BACKFILL_LOCK_FILE:-/tmp/maxy-embed-backfill-${INSTALL_DIR_HASH}.lock}"
+# Resolve Neo4j password the same way seed-neo4j.sh does. Explicit env var
+# takes precedence so the installer can pass it through without writing the
+# file twice.
+NEO4J_PASSWORD_FILE="$PROJECT_DIR/config/.neo4j-password"
+if [ -z "${NEO4J_PASSWORD:-}" ]; then
+  if [ -f "$NEO4J_PASSWORD_FILE" ]; then
+    NEO4J_PASSWORD=$(cat "$NEO4J_PASSWORD_FILE")
+  else
+    echo "[embed-backfill] FAILED: NEO4J_PASSWORD env var unset and $NEO4J_PASSWORD_FILE missing"
+    echo "[embed-backfill] re-run after the seed step writes the password file, or set NEO4J_PASSWORD explicitly"
+    exit 1
+  fi
+fi
+export NEO4J_URI NEO4J_USER NEO4J_PASSWORD OLLAMA_URL EMBED_MODEL BATCH_SIZE
+# Preflight: every external tool the script depends on must be present
+# before any work starts, so a missing binary is reported as a failure
+# rather than surfacing later as a confusing side effect.
+if ! command -v cypher-shell >/dev/null 2>&1; then
+  echo "[embed-backfill] FAILED: cypher-shell not on PATH; install Neo4j or add cypher-shell to PATH"
+  exit 1
+fi
+if ! command -v python3 >/dev/null 2>&1; then
+  echo "[embed-backfill] FAILED: python3 not on PATH; the installer requires it"
+  exit 1
+fi
+# flock must be checked explicitly: if the binary is missing, `flock -n 200`
+# exits 127, and the `if !` guard below would report that as "another
+# instance is already running" and exit 0 — a silent skip, not a loud failure.
+if ! command -v flock >/dev/null 2>&1; then
+  echo "[embed-backfill] FAILED: flock not on PATH; install it (util-linux) or add flock to PATH"
+  exit 1
+fi
+# flock guard — second concurrent invocation exits cleanly. The exec on
+# fd 200 keeps the lock held for the lifetime of this process; flock -n
+# is non-blocking so a busy lock returns immediately rather than queueing.
+exec 200>"$LOCK_FILE"
+if ! flock -n 200; then
+  echo "[embed-backfill] another instance is already running (lock=$LOCK_FILE), skipping"
+  exit 0
+fi
+# The python heredoc owns the per-batch loop. It uses subprocess to call
+# cypher-shell (avoids re-implementing Bolt) and urllib to call Ollama
+# (no extra deps). cypher-shell `--format plain` returns CSV; the csv
+# module handles quoting/escaping reliably so node text containing commas,
+# quotes, or newlines round-trips correctly.
+#
+# Cypher contract:
+#   READ:  one row per unembedded node — { id: elementId, text: coalesced }
+#          gated by `n.embedding IS NULL` AND `any(label IN labels(n)
+#          WHERE label IN $registered)` AND a non-empty coalesce of the
+#          text property union. Nodes carrying an :Trashed label are
+#          excluded explicitly. READ params (`registered` list of strings,
+#          `batchSize` int) are passed via cypher-shell `--param` as plain
+#          Cypher expressions (string list literals + integer literal).
+#   WRITE: one batched UNWIND per chunk — pairs of (id, embedding[])
+#          interpolated into the Cypher payload as bare-key map literals
+#          (`{id: '...', embedding: [...]}`). Cypher does NOT accept
+#          double-quoted-string map keys, so JSON-serialised values cannot
+#          be passed via `--param` for the WRITE side; the inline literal
+#          path is the apoc-free alternative.
+#
+# The script does NOT shell out to the existing TS embed() helper because
+# that would require booting Node + the platform/lib build. Calling the
+# Ollama HTTP endpoint directly preserves the same behaviour with zero
+# build dependency.
+exec python3 - <<'PYEOF'
+import json
+import os
+import sys
+import time
+import urllib.error
+import urllib.request
+from subprocess import PIPE, Popen
+from io import StringIO
+import csv
+NEO4J_URI = os.environ["NEO4J_URI"]
+NEO4J_USER = os.environ["NEO4J_USER"]
+NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]
+OLLAMA_URL = os.environ["OLLAMA_URL"]
+EMBED_MODEL = os.environ["EMBED_MODEL"]
+BATCH_SIZE = int(os.environ["BATCH_SIZE"])
+# Mirrors the FOR (n:...) clause of `entity_search` in schema.cypher.
+# Doctrine: every label written by the platform is searchable AND embeddable.
+# Future label additions must extend BOTH this list and schema.cypher; the
+# fulltext-coverage doctrine test catches the schema half but not this list.
+REGISTERED_LABELS = [
+    "LocalBusiness", "Service", "PriceSpecification", "OpeningHoursSpecification", "Organization",
+    "Person", "UserProfile", "Preference", "AdminUser", "AccessGrant",
+    "KnowledgeDocument", "Section", "Chunk", "DigitalDocument", "CreativeWork",
+    "Question", "FAQPage", "DefinedTerm", "Review", "ImageObject",
+    "Conversation", "AdminConversation", "PublicConversation", "Message",
+    "UserMessage", "AssistantMessage", "ToolCall",
+    "Task", "Project", "Event",
+    "Workflow", "WorkflowStep", "WorkflowRun", "StepResult",
+    "OnboardingState", "Email", "EmailAccount", "ReviewAlert",
+    "Position", "Credential",
+]
+# Properties to coalesce for the embedding text. Ordered: most identifying
+# property first. Matches the canonical text-property list pinned by the
+# fulltext-coverage doctrine test.
+EMBED_TEXT_PROPS = ["name", "title", "summary", "headline", "body", "content", "text"]
+def cypher(query: str, params: dict | None = None) -> str:
+    """Execute `query` through cypher-shell and return its decoded stdout.
+
+    The statement is fed to cypher-shell on stdin with `--format plain`.
+    Each entry of `params` becomes a `--param "<key> => <json value>"`
+    argument. On a non-zero exit the captured stderr is echoed and the
+    whole script terminates with status 1, so Cypher errors and Neo4j
+    outages are never swallowed.
+    """
+    # Flatten {key: value} into ["--param", "key => <json>", ...].
+    param_args = [
+        arg
+        for key, value in (params or {}).items()
+        for arg in ("--param", f"{key} => {json.dumps(value)}")
+    ]
+    argv = [
+        "cypher-shell", "-u", NEO4J_USER, "-p", NEO4J_PASSWORD, "-a", NEO4J_URI,
+        "--format", "plain",
+        *param_args,
+    ]
+    shell = Popen(argv, stdin=PIPE, stdout=PIPE, stderr=PIPE)
+    stdout_bytes, stderr_bytes = shell.communicate(query.encode("utf-8"))
+    if shell.returncode == 0:
+        return stdout_bytes.decode("utf-8", errors="replace")
+    sys.stderr.write(f"[embed-backfill] FAILED: cypher-shell exited {shell.returncode}\n")
+    sys.stderr.write(stderr_bytes.decode("utf-8", errors="replace"))
+    sys.exit(1)
+def parse_csv_rows(stdout: str) -> list[dict]:
+    """Parse cypher-shell `--format plain` output into one dict per result row.
+
+    The first line is the header; each following record becomes a dict keyed
+    by the header's column names (all values are strings). Returns an empty
+    list when the output is empty or whitespace-only.
+
+    cypher-shell's plain formatter separates columns with ", " (comma plus
+    space). `skipinitialspace=True` is therefore required: without it the csv
+    module keeps the leading space, so every column after the first gets a
+    key like " text" and a value that still carries its surrounding quotes.
+    """
+    if not stdout.strip():
+        return []
+    # NOTE(review): cypher-shell appears to escape embedded double quotes as
+    # \" rather than "" — confirm against real output and add an escapechar
+    # if node text containing quotes must round-trip exactly.
+    reader = csv.DictReader(StringIO(stdout), skipinitialspace=True)
+    return list(reader)
+def _rerun_hint() -> str:
+    """Return the shell command an operator should use to re-run the backfill."""
+    # This program is fed to `python3 -` on stdin, where `__file__` is not
+    # defined. Referencing it directly raised NameError inside the failure
+    # handlers, replacing the intended re-run instruction with a traceback.
+    script_file = globals().get("__file__")
+    if script_file:
+        return f"bash {os.path.dirname(os.path.realpath(script_file))}/embed-backfill.sh"
+    return "bash platform/scripts/embed-backfill.sh"
+def ollama_embed(text: str, *, timeout: int = 30, retry_on_timeout: bool = True) -> list[float]:
+    """POST `text` to Ollama's /api/embed and return the first embedding vector.
+
+    Args:
+        text: the string to embed.
+        timeout: per-request timeout in seconds passed to urlopen.
+        retry_on_timeout: when True, a TimeoutError triggers exactly one retry
+            with a 180s timeout (covers a cold model load). The retry itself
+            and the pre-warm call pass False, so there is never more than one
+            retry per call.
+
+    Returns:
+        The first entry of the response's `embeddings` list.
+
+    Exits the script with status 1 (after writing a FAILED line to stderr)
+    when the request times out without a retry left, when Ollama cannot be
+    reached or returns an HTTP error, or when the response carries no
+    embedding — so the operator never mistakes a failure for completion.
+    """
+    body = json.dumps({"model": EMBED_MODEL, "input": text}).encode("utf-8")
+    req = urllib.request.Request(
+        f"{OLLAMA_URL}/api/embed",
+        data=body,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            payload = json.loads(resp.read().decode("utf-8"))
+    except TimeoutError as e:
+        if retry_on_timeout:
+            sys.stderr.write(
+                f"[embed-backfill] WARN: Ollama timeout after {timeout}s — likely cold-start; retrying with 180s timeout\n"
+            )
+            return ollama_embed(text, timeout=180, retry_on_timeout=False)
+        sys.stderr.write(f"[embed-backfill] FAILED: Ollama timeout after {timeout}s ({e})\n")
+        sys.stderr.write(f"[embed-backfill] re-run via: {_rerun_hint()}\n")
+        sys.exit(1)
+    except (urllib.error.URLError, urllib.error.HTTPError) as e:
+        sys.stderr.write(f"[embed-backfill] FAILED: Ollama unreachable ({e})\n")
+        sys.stderr.write(f"[embed-backfill] re-run via: {_rerun_hint()}\n")
+        sys.exit(1)
+    embeddings = payload.get("embeddings", [])
+    if not embeddings or not embeddings[0]:
+        sys.stderr.write(f"[embed-backfill] FAILED: Ollama returned no embedding for text length={len(text)}\n")
+        sys.exit(1)
+    return embeddings[0]
+def cypher_string_literal(s: str) -> str:
+    """Return `s` wrapped in single quotes as a Cypher string literal.
+
+    Backslashes and single quotes are each prefixed with a backslash; every
+    other character passes through unchanged. The function is used for the
+    elementId values interpolated into the WRITE batch, so an id containing
+    either character cannot break out of the literal.
+    """
+    # Single-pass translation: each source character is escaped independently,
+    # which is equivalent to escaping backslashes first and quotes second.
+    escapes = str.maketrans({"\\": "\\\\", "'": "\\'"})
+    return f"'{s.translate(escapes)}'"
+def cypher_float_list(values: list[float]) -> str:
+    """Render `values` as a Cypher list literal with no spaces: `[v1,v2,...]`.
+
+    Each element is serialised with repr(), which preserves the sign and
+    uses scientific notation where Python does. An empty input yields `[]`.
+
+    NOTE(review): repr() produces `inf` / `nan` for non-finite floats; whether
+    Cypher parses those tokens as number literals is not verified here.
+    """
+    rendered = map(repr, values)
+    return f"[{','.join(rendered)}]"
+# Build the text expression once and reuse it in both queries.
+#
+# coalesce() only skips NULL, so with a plain `coalesce(n.name, n.title, ...)`
+# a node whose first present property is an empty string (e.g. name = '')
+# would evaluate to '' and be excluded even when a later property holds real
+# text. Each property is therefore mapped to NULL when empty (a CASE with no
+# ELSE yields NULL), letting coalesce fall through to the next populated one.
+#
+# $registered (label list) and $batchSize (int) are supplied at run time
+# through cypher-shell `--param`.
+COALESCE_TEXT = (
+    "coalesce("
+    + ", ".join(f"CASE WHEN n.{p} <> '' THEN n.{p} END" for p in EMBED_TEXT_PROPS)
+    + ", '')"
+)
+# Counts the nodes still waiting for an embedding.
+COUNT_QUERY = f"""
+MATCH (n) WHERE n.embedding IS NULL
+  AND NOT n:Trashed
+  AND any(label IN labels(n) WHERE label IN $registered)
+  AND {COALESCE_TEXT} <> ''
+RETURN count(n) AS remaining;
+"""
+# Fetches up to $batchSize pending nodes: elementId, first label, embed text.
+BATCH_QUERY = f"""
+MATCH (n) WHERE n.embedding IS NULL
+  AND NOT n:Trashed
+  AND any(label IN labels(n) WHERE label IN $registered)
+  AND {COALESCE_TEXT} <> ''
+RETURN elementId(n) AS id,
+       labels(n)[0] AS firstLabel,
+       {COALESCE_TEXT} AS text
+LIMIT $batchSize;
+"""
+# --- Driver: count pending nodes, pre-warm the model, embed in batches. ---
+count_out = cypher(COUNT_QUERY, {"registered": REGISTERED_LABELS})
+total_remaining = 0
+for row in parse_csv_rows(count_out):
+    total_remaining = int(row["remaining"])
+print(f"[embed-backfill] start total={total_remaining} model={EMBED_MODEL}")
+if total_remaining == 0:
+    print("[embed-backfill] done remaining=0 (nothing to backfill)")
+    sys.exit(0)
+# Pre-warm Ollama so the first per-node call doesn't pay the model-load
+# latency. The cold-start window for nomic-embed-text on a Pi 5 can exceed
+# 30s; calling once with a tiny throwaway input loads the weights into
+# memory before the loop begins. Failure here is treated identically to
+# any other Ollama failure — loud abort with re-run instruction.
+print(f"[embed-backfill] pre-warm model={EMBED_MODEL} timeout=180s")
+ollama_embed("warmup", timeout=180, retry_on_timeout=False)
+processed_total = 0
+batch_index = 0
+# elementIds whose embedding this run has already written. A written node no
+# longer satisfies `n.embedding IS NULL`, so it must never come back in a
+# later batch; if one does, the write did not take effect.
+written_ids: set[str] = set()
+while True:
+    batch_start = time.time()
+    batch_out = cypher(
+        BATCH_QUERY,
+        {"registered": REGISTERED_LABELS, "batchSize": BATCH_SIZE},
+    )
+    rows = parse_csv_rows(batch_out)
+    if not rows:
+        break
+    # Stall guard: the WRITE below does not report how many nodes it matched,
+    # so a write that silently matches nothing would make this loop fetch and
+    # re-embed the same batch forever. Abort loudly instead.
+    repeated = sum(1 for row in rows if row["id"] in written_ids)
+    if repeated:
+        sys.stderr.write(
+            f"[embed-backfill] FAILED: {repeated} node(s) written earlier in this run still have no embedding — aborting to avoid an infinite loop\n"
+        )
+        sys.exit(1)
+    # Compute embeddings serially. Ollama on a Pi 5 handles ~3-10 embeds
+    # per second with nomic-embed-text; concurrent requests just queue
+    # behind the GPU/CPU bottleneck so parallelism wouldn't help.
+    pairs: list[tuple[str, list[float]]] = []
+    label_counts: dict[str, int] = {}
+    for row in rows:
+        node_id = row["id"]
+        text = row["text"]
+        first_label = row["firstLabel"]
+        if not text:
+            continue
+        embedding = ollama_embed(text)
+        pairs.append((node_id, embedding))
+        label_counts[first_label] = label_counts.get(first_label, 0) + 1
+    if not pairs:
+        # Defensive: query said rows exist but all text was empty after
+        # the python read — means the COALESCE_TEXT predicate is wider
+        # than the python check. Stop to avoid an infinite loop.
+        sys.stderr.write("[embed-backfill] WARN: batch returned rows with empty text — stopping to avoid infinite loop\n")
+        break
+    # Build the WRITE batch as a Cypher literal payload rather than a
+    # `--param` map. cypher-shell's `--param` parses the value as a Cypher
+    # expression, and Cypher map keys must be bare identifiers (or backtick-
+    # quoted) — NOT double-quoted strings as JSON would emit. Interpolating
+    # bare-key map literals directly avoids the question entirely:
+    #
+    #   UNWIND [{id: '4:abc:1', embedding: [0.1, 0.2, ...]}, ...] AS pair
+    #   MATCH (n) WHERE elementId(n) = pair.id
+    #   SET n.embedding = pair.embedding;
+    #
+    # cypher_string_literal escapes any backslash/quote in elementIds
+    # defensively; cypher_float_list serialises the embedding via repr().
+    pair_literals = ",".join(
+        f"{{id: {cypher_string_literal(node_id)}, embedding: {cypher_float_list(embedding)}}}"
+        for node_id, embedding in pairs
+    )
+    cypher(
+        f"""
+        UNWIND [{pair_literals}] AS pair
+        MATCH (n) WHERE elementId(n) = pair.id
+        SET n.embedding = pair.embedding;
+        """
+    )
+    written_ids.update(node_id for node_id, _ in pairs)
+    elapsed_ms = int((time.time() - batch_start) * 1000)
+    batch_index += 1
+    processed_total += len(pairs)
+    label_summary = ", ".join(f"{k}={v}" for k, v in sorted(label_counts.items()))
+    print(f"[embed-backfill] batch={batch_index} processed={len(pairs)} elapsed-ms={elapsed_ms} labels={label_summary}")
+# Final remaining check — should be zero or the diff between original
+# total and processed_total (e.g. if new writes landed mid-run).
+final_out = cypher(COUNT_QUERY, {"registered": REGISTERED_LABELS})
+final_remaining = 0
+for row in parse_csv_rows(final_out):
+    final_remaining = int(row["remaining"])
+print(f"[embed-backfill] done processed={processed_total} remaining={final_remaining}")
+PYEOF

package/payload/platform/scripts/seed-neo4j.sh CHANGED Viewed

@@ -411,13 +411,19 @@ fi
 echo "==> Connecting to Neo4j at $NEO4J_URI as $NEO4J_USER"
-# Migration: drop single-key UserProfile constraint (replaced by composite
-# (accountId, userId) in Task 249). Also drop the old preference_category
-# index — replaced by (accountId, userId, category) composite.
-echo "==> Migrating schema: dropping single-key UserProfile constraint..."
+# Schema migrations run before the main schema apply so renames don't collide
+# with the new declarations. Each statement is idempotent (`IF EXISTS`):
+#   - Task 249: `user_profile_account_unique` replaced by composite (accountId, userId).
+#   - Task 249: `preference_category` index replaced by (accountId, userId, category).
+#   - Task 748: `knowledge_fulltext` (3 labels) replaced by `entity_search` (~40 labels)
+#     with the universal label/property union. The new index is created by the
+#     schema apply below; dropping the old name here is what lets cypher-shell
+#     run both in one pass without conflict.
+echo "==> Migrating schema: dropping renamed/obsolete constraints + indexes..."
 "$CYPHER_SHELL" -u "$NEO4J_USER" -p "$NEO4J_PASSWORD" -a "$NEO4J_URI" << 'MIGRATE_EOF'
 DROP CONSTRAINT user_profile_account_unique IF EXISTS;
 DROP INDEX preference_category IF EXISTS;
+DROP INDEX knowledge_fulltext IF EXISTS;
 MIGRATE_EOF
 # Vector index dimensions — configurable at install time via --embed-model.

package/payload/server/public/assets/{Checkbox-DD2mv2dU.js → Checkbox-DzNre1pt.js} RENAMED Viewed

	@@ -1 +1 @@
1	- import{t as e}from"./jsx-runtime-~~DV3X_CC7~~.js";var t=e();function n({checked:e,onChange:n,label:r,disabled:i}){return(0,t.jsxs)(`label`,{className:`maxy-checkbox${i?` maxy-checkbox--disabled`:``}`,children:[(0,t.jsx)(`input`,{type:`checkbox`,checked:e,onChange:e=>n(e.target.checked),disabled:i}),(0,t.jsx)(`span`,{className:`maxy-checkbox__box`,children:`✱`}),r&&(0,t.jsx)(`span`,{className:`maxy-checkbox__label`,children:r})]})}export{n as t};
1	+ import{t as e}from"./jsx-runtime-C-H-0vwA.js";var t=e();function n({checked:e,onChange:n,label:r,disabled:i}){return(0,t.jsxs)(`label`,{className:`maxy-checkbox${i?` maxy-checkbox--disabled`:``}`,children:[(0,t.jsx)(`input`,{type:`checkbox`,checked:e,onChange:e=>n(e.target.checked),disabled:i}),(0,t.jsx)(`span`,{className:`maxy-checkbox__box`,children:`✱`}),r&&(0,t.jsx)(`span`,{className:`maxy-checkbox__label`,children:r})]})}export{n as t};