@rubytech/create-maxy 1.0.776 → 1.0.778

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1541,11 +1541,11 @@ function setupVncViewer() {
1541
1541
  }
1542
1542
  function setupAccount() {
1543
1543
  log("10", TOTAL, "Setting up...");
1544
- // Tasks 787 + 788 — both seed-neo4j.sh and embed-backfill.sh hard-exit
1545
- // without NEO4J_URI. The installer owns the brand-correct URI and password,
1546
- // so we derive them once and pass to both call sites. Missing password file
1547
- // is a hard error: ensureNeo4jPassword() ran upstream and would have thrown
1548
- // already if it couldn't reach the brand's Neo4j.
1544
+ // Task 787 — seed-neo4j.sh hard-exits without NEO4J_URI. The installer
1545
+ // owns the brand-correct URI and password, so we derive them once.
1546
+ // Missing password file is a hard error: ensureNeo4jPassword() ran
1547
+ // upstream and would have thrown already if it couldn't reach the
1548
+ // brand's Neo4j.
1549
1549
  const passwordFile = join(INSTALL_DIR, "platform/config/.neo4j-password");
1550
1550
  if (!existsSync(passwordFile)) {
1551
1551
  throw new Error(`Neo4j password file missing at ${passwordFile} — required by setup step.`);
@@ -1559,40 +1559,6 @@ function setupAccount() {
1559
1559
  logFile(` [neo4j] passing NEO4J_URI=${neo4jUri} to seed`);
1560
1560
  shell("bash", [seedScript], { cwd: INSTALL_DIR, env: neo4jEnv });
1561
1561
  }
1562
- // Task 748 — universal embedding coverage backfill. Run after seed so the
1563
- // entity_search index is in place and any pre-Task-748 nodes (e.g. the
1564
- // 5096 LinkedIn-imported Persons on existing Pis that bulk-import skipped
1565
- // embedding for) get a vector populated. Idempotent — instant no-op when
1566
- // nothing is pending, so re-running on every install is harmless.
1567
- //
1568
- // Failure-mode policy: WARN, do not abort. The fulltext index is already
1569
- // applied above, so BM25 search works end-to-end without embeddings; the
1570
- // only gap is vector ranking quality on legacy nodes. Aborting the
1571
- // installer on an Ollama hiccup would block every install for a
1572
- // strictly-degradable feature. The script's own loud-failure output
1573
- // tells the operator how to re-run.
1574
- const backfillScript = join(INSTALL_DIR, "platform/scripts/embed-backfill.sh");
1575
- if (existsSync(backfillScript)) {
1576
- const start = Date.now();
1577
- logFile(`> bash ${backfillScript} (warn-not-abort)`);
1578
- const result = spawnSync("bash", [backfillScript], {
1579
- stdio: "inherit",
1580
- timeout: 30 * 60_000,
1581
- cwd: INSTALL_DIR,
1582
- env: neo4jEnv,
1583
- });
1584
- const dur = ((Date.now() - start) / 1000).toFixed(1);
1585
- if (result.status !== 0 || result.signal) {
1586
- const reason = result.signal ? `signal=${result.signal}` : `exit=${result.status}`;
1587
- logFile(` WARN: embed-backfill non-zero (${reason}) after ${dur}s`);
1588
- console.warn(`\n WARNING: embed-backfill did not complete (${reason}) — BM25 search works,\n` +
1589
- ` but vector ranking on legacy nodes will be sparse until you re-run:\n` +
1590
- ` bash ${backfillScript}\n`);
1591
- }
1592
- else {
1593
- logFile(` OK embed-backfill in ${dur}s`);
1594
- }
1595
- }
1596
1562
  }
1597
1563
  // ---------------------------------------------------------------------------
1598
1564
  // Tunnel script shortcuts
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@rubytech/create-maxy",
3
- "version": "1.0.776",
3
+ "version": "1.0.778",
4
4
  "description": "Install Maxy — AI for Productive People",
5
5
  "bin": {
6
6
  "create-maxy": "./dist/index.js"
@@ -125,7 +125,7 @@ If the initial Cloudflare login fails during setup, {{productName}} will fall ba
125
125
 
126
126
  Task 795 — `maxy-edge.service` (always-on front door) classifies upstream errors and serves a brand-aware response. There are two distinct user-visible shapes; the right one depends on what failed.
127
127
 
128
- **Branded holding page ("{{productName}} is starting") for ~10 s during an upgrade — this is expected and self-healing.** The edge process binds the public port immediately, but `maxy.service` (the upstream UI) takes ~10 s after restart to apply the neo4j schema and mount its 11 routes. Any browser navigation that lands during that window gets a self-contained HTML holding page that polls `/api/health` and reloads automatically once the upstream binds. No operator action required. The diagnostic line in `~/.maxy/logs/edge.log` is `[edge] upstream http error path=… err=connect ECONNREFUSED 127.0.0.1:<UPSTREAM_PORT> err-class=econnrefused-coldstart upstream=…` and disappears as soon as upstream binds.
128
+ **Branded holding page (brand logo + "Starting") for ~10 s during an upgrade — this is expected and self-healing.** The edge process binds the public port immediately, but `maxy.service` (the upstream UI) takes ~10 s after restart to apply the neo4j schema and mount its 11 routes. Any browser navigation that lands during that window gets a self-contained HTML holding page that polls `/api/health` and reloads automatically once the upstream binds. The page renders the brand logo (inlined as a base64 data URI at edge boot from `<install>/server/public/brand/<assets.logo>`) and the brand display/body fonts (loaded from fonts.googleapis.com) — both paths bypass the unavailable upstream so the page never makes a same-origin asset fetch. When `brand.logoContainsName` is true and the logo was inlined, the logo replaces the productName text and the page shows "Starting"; otherwise the page falls back to "{{productName}} is starting". No operator action required. The diagnostic line in `~/.maxy/logs/edge.log` is `[edge] upstream http error path=… err=connect ECONNREFUSED 127.0.0.1:<UPSTREAM_PORT> err-class=econnrefused-coldstart upstream=…` and disappears as soon as upstream binds. Boot-time confirmation that the logo resolved: `[edge] brand=<name> holding-logo=inlined assets-dir=<path>`. `holding-logo=missing` means the logo file wasn't found at `assets-dir`; the page then degrades to text-only.
129
129
 
130
130
  **Branded plain-text 502 ("Bad Gateway ({{productName}} unavailable)") — real upstream failure, not cold-start.** Any error class other than `ECONNREFUSED` (timeouts, resets, host-unreachable) returns the existing 502 path. The diagnostic line carries `err-class=other`. Read the log with `tail -200 ~/.maxy/logs/edge.log | rg 'err-class=other'` and check `~/.maxy/logs/server.log` for upstream stack traces — the upstream itself is the source.
131
131
 
@@ -335,7 +335,10 @@ handle_connect_requests() {
335
335
  # Attempt WiFi connection. Capture exit code before || true so we
336
336
  # get nmcli's actual exit status, not the unconditional 0 from || true.
337
337
  local connect_output connect_exit
338
- connect_output=$(nmcli device wifi connect "$target_ssid" password "$target_password" --wait 30 2>&1)
338
+ # `--wait` / `-w` is a top-level nmcli option (must precede the
339
+ # subcommand), not an argument to `device wifi connect`. Putting it
340
+ # after the subcommand fails with "invalid extra argument '--wait'".
341
+ connect_output=$(nmcli --wait 30 device wifi connect "$target_ssid" password "$target_password" 2>&1)
339
342
  connect_exit=$?
340
343
 
341
344
  if [ $connect_exit -eq 0 ] && wifi_is_connected; then
@@ -24,7 +24,7 @@ import { createServer, request as httpRequest } from "http";
24
24
  import { createConnection as createConnection2 } from "net";
25
25
  import { readFileSync as readFileSync4, existsSync as existsSync5, watchFile } from "fs";
26
26
  import { homedir } from "os";
27
- import { join as join3 } from "path";
27
+ import { join as join4 } from "path";
28
28
 
29
29
  // server/ws-proxy.ts
30
30
  import { createConnection } from "net";
@@ -668,14 +668,50 @@ function createEdgeAdminApp(opts) {
668
668
 
669
669
  // server/edge-fallback.ts
670
670
  import { existsSync as existsSync4, readFileSync as readFileSync3 } from "fs";
671
- var BRAND_DEFAULTS = { configDir: ".maxy", productName: "Maxy" };
672
- function loadBrand(path) {
673
- if (!path || !existsSync4(path)) return { ...BRAND_DEFAULTS };
671
+ import { extname, join as join3 } from "path";
672
+ var BRAND_DEFAULTS = {
673
+ configDir: ".maxy",
674
+ productName: "Maxy",
675
+ background: "#FAFAF8",
676
+ textColor: "#2A2A2A",
677
+ displayFont: "'Cormorant', Georgia, serif",
678
+ bodyFont: "'DM Sans', -apple-system, BlinkMacSystemFont, sans-serif",
679
+ logoContainsName: false,
680
+ logoDataUri: null
681
+ };
682
+ var MIME_BY_EXT = {
683
+ ".png": "image/png",
684
+ ".jpg": "image/jpeg",
685
+ ".jpeg": "image/jpeg",
686
+ ".svg": "image/svg+xml",
687
+ ".webp": "image/webp",
688
+ ".gif": "image/gif"
689
+ };
690
+ function inlineAsset(filePath) {
691
+ if (!existsSync4(filePath)) return null;
692
+ const mime = MIME_BY_EXT[extname(filePath).toLowerCase()];
693
+ if (!mime) return null;
694
+ const bytes = readFileSync3(filePath);
695
+ return `data:${mime};base64,${bytes.toString("base64")}`;
696
+ }
697
+ function readString(value, fallback) {
698
+ return typeof value === "string" && value.length > 0 ? value : fallback;
699
+ }
700
+ function loadBrand(brandJsonPath2, assetsDir = "") {
701
+ if (!brandJsonPath2 || !existsSync4(brandJsonPath2)) return { ...BRAND_DEFAULTS };
674
702
  try {
675
- const parsed = JSON.parse(readFileSync3(path, "utf-8"));
703
+ const parsed = JSON.parse(readFileSync3(brandJsonPath2, "utf-8"));
704
+ const logoFile = typeof parsed.assets?.logo === "string" ? parsed.assets.logo : null;
705
+ const logoDataUri = logoFile && assetsDir ? inlineAsset(join3(assetsDir, logoFile)) : null;
676
706
  return {
677
- configDir: typeof parsed.configDir === "string" ? parsed.configDir : BRAND_DEFAULTS.configDir,
678
- productName: typeof parsed.productName === "string" ? parsed.productName : BRAND_DEFAULTS.productName
707
+ configDir: readString(parsed.configDir, BRAND_DEFAULTS.configDir),
708
+ productName: readString(parsed.productName, BRAND_DEFAULTS.productName),
709
+ background: readString(parsed.defaultColors?.background, BRAND_DEFAULTS.background),
710
+ textColor: BRAND_DEFAULTS.textColor,
711
+ displayFont: readString(parsed.defaultFonts?.display, BRAND_DEFAULTS.displayFont),
712
+ bodyFont: readString(parsed.defaultFonts?.body, BRAND_DEFAULTS.bodyFont),
713
+ logoContainsName: parsed.logoContainsName === true,
714
+ logoDataUri
679
715
  };
680
716
  } catch (err) {
681
717
  console.error(`[edge] brand.json parse error: ${err.message}`);
@@ -689,40 +725,59 @@ var HTML_ESCAPES = { "<": "&lt;", ">": "&gt;", "&": "&amp;", '"': "&quot;", "'":
689
725
  function escapeHtml(s) {
690
726
  return String(s).replace(/[<>&"']/g, (c) => HTML_ESCAPES[c] ?? c);
691
727
  }
692
- function buildHoldingPage(productName) {
693
- const safeName = escapeHtml(productName);
728
+ function firstFontFamily(stack) {
729
+ const match = stack.match(/^\s*['"]([^'"]+)['"]/);
730
+ return match ? match[1] : null;
731
+ }
732
+ function googleFontsHref(displayFont, bodyFont) {
733
+ const families = [firstFontFamily(displayFont), firstFontFamily(bodyFont)].filter((f) => f !== null);
734
+ if (families.length === 0) return null;
735
+ const params = families.map((name) => `family=${encodeURIComponent(name)}:wght@400;500;600`).join("&");
736
+ return `https://fonts.googleapis.com/css2?${params}&display=swap`;
737
+ }
738
+ function buildHoldingPage(brand) {
739
+ const safeName = escapeHtml(brand.productName);
740
+ const fontsHref = googleFontsHref(brand.displayFont, brand.bodyFont);
741
+ const fontsLink = fontsHref ? `<link rel="preconnect" href="https://fonts.googleapis.com"><link rel="preconnect" href="https://fonts.gstatic.com" crossorigin><link rel="stylesheet" href="${escapeHtml(fontsHref)}">` : "";
742
+ const logoBlock = brand.logoDataUri ? `<img class="logo" src="${brand.logoDataUri}" alt="${safeName}">` : "";
743
+ const headline = brand.logoContainsName && brand.logoDataUri ? '<p class="sub">Starting</p>' : `<p class="name">${safeName} is starting</p>`;
694
744
  return `<!doctype html>
695
745
  <html lang="en">
696
746
  <head>
697
747
  <meta charset="utf-8">
698
748
  <meta name="viewport" content="width=device-width,initial-scale=1">
699
749
  <title>${safeName}</title>
750
+ ${fontsLink}
700
751
  <style>
701
752
  html, body { margin: 0; padding: 0; height: 100%; }
702
753
  body {
703
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
704
- background: #fafaf8;
705
- color: #2a2a2a;
754
+ font-family: ${brand.bodyFont};
755
+ background: ${brand.background};
756
+ color: ${brand.textColor};
706
757
  display: flex;
707
758
  align-items: center;
708
759
  justify-content: center;
709
760
  flex-direction: column;
761
+ gap: 1.25rem;
710
762
  }
711
- .name { font-size: 1.4rem; font-weight: 500; margin: 0 0 0.5rem; }
712
- .sub { font-size: 0.95rem; color: #888; margin: 0; }
763
+ .logo { display: block; max-width: 240px; max-height: 120px; width: auto; height: auto; }
764
+ .name { font-family: ${brand.displayFont}; font-size: 2rem; font-weight: 500; margin: 0; letter-spacing: -0.01em; }
765
+ .sub { font-family: ${brand.bodyFont}; font-size: 0.95rem; color: ${brand.textColor}; opacity: 0.55; margin: 0; letter-spacing: 0.04em; text-transform: uppercase; }
766
+ .dots { display: flex; gap: 6px; align-items: center; }
713
767
  .dot {
714
768
  display: inline-block; width: 6px; height: 6px; border-radius: 50%;
715
- background: currentColor; margin: 0 2px; opacity: 0.3;
769
+ background: currentColor; opacity: 0.3;
716
770
  animation: pulse 1.4s infinite ease-in-out;
717
771
  }
718
772
  .dot:nth-child(2) { animation-delay: 0.2s; }
719
773
  .dot:nth-child(3) { animation-delay: 0.4s; }
720
- @keyframes pulse { 0%, 80%, 100% { opacity: 0.3; } 40% { opacity: 1; } }
774
+ @keyframes pulse { 0%, 80%, 100% { opacity: 0.25; } 40% { opacity: 0.85; } }
721
775
  </style>
722
776
  </head>
723
777
  <body>
724
- <p class="name">${safeName} is starting</p>
725
- <p class="sub"><span class="dot"></span><span class="dot"></span><span class="dot"></span></p>
778
+ ${logoBlock}
779
+ ${headline}
780
+ <p class="dots"><span class="dot"></span><span class="dot"></span><span class="dot"></span></p>
726
781
  <script>
727
782
  (function () {
728
783
  var attempts = 0;
@@ -743,9 +798,10 @@ function buildHoldingPage(productName) {
743
798
 
744
799
  // server/edge.ts
745
800
  var PLATFORM_ROOT2 = process.env.MAXY_PLATFORM_ROOT || "";
746
- var BRAND_JSON_PATH = PLATFORM_ROOT2 ? join3(PLATFORM_ROOT2, "config", "brand.json") : "";
747
- var BRAND = loadBrand(BRAND_JSON_PATH);
748
- var ALIAS_DOMAINS_PATH = join3(homedir(), BRAND.configDir, "alias-domains.json");
801
+ var BRAND_JSON_PATH = PLATFORM_ROOT2 ? join4(PLATFORM_ROOT2, "config", "brand.json") : "";
802
+ var BRAND_ASSETS_DIR = PLATFORM_ROOT2 ? join4(PLATFORM_ROOT2, "..", "server", "public", "brand") : "";
803
+ var BRAND = loadBrand(BRAND_JSON_PATH, BRAND_ASSETS_DIR);
804
+ var ALIAS_DOMAINS_PATH = join4(homedir(), BRAND.configDir, "alias-domains.json");
749
805
  function loadAliasDomains() {
750
806
  try {
751
807
  if (!existsSync5(ALIAS_DOMAINS_PATH)) return /* @__PURE__ */ new Set();
@@ -814,7 +870,7 @@ function forwardHttp(clientReq, clientRes) {
814
870
  const isHtmlNavigation = errClass === "econnrefused-coldstart" && clientReq.method === "GET" && accept.includes("text/html");
815
871
  if (isHtmlNavigation) {
816
872
  clientRes.writeHead(200, { "content-type": "text/html; charset=utf-8", "cache-control": "no-store" });
817
- clientRes.end(buildHoldingPage(BRAND.productName));
873
+ clientRes.end(buildHoldingPage(BRAND));
818
874
  } else {
819
875
  clientRes.writeHead(502, { "content-type": "text/plain" });
820
876
  clientRes.end(`Bad Gateway (${BRAND.productName} unavailable)`);
@@ -904,7 +960,7 @@ server.on("upgrade", (req, socket, head) => {
904
960
  });
905
961
  server.listen(EDGE_PORT, EDGE_HOSTNAME, () => {
906
962
  console.log(`[edge] listening on http://${EDGE_HOSTNAME}:${EDGE_PORT}`);
907
- console.log(`[edge] brand=${BRAND.productName}`);
963
+ console.log(`[edge] brand=${BRAND.productName} holding-logo=${BRAND.logoDataUri ? "inlined" : "missing"} assets-dir=${BRAND_ASSETS_DIR || "(none)"}`);
908
964
  console.log(`[edge] /websockify \u2192 ${WEBSOCKIFY_HOST}:${WEBSOCKIFY_PORT}`);
909
965
  console.log(`[edge] everything else \u2192 ${UPSTREAM_HOST}:${UPSTREAM_PORT}`);
910
966
  });
@@ -1,382 +0,0 @@
1
- #!/usr/bin/env bash
2
- # ============================================================
3
- # embed-backfill.sh — populate embeddings on legacy nodes (Task 748)
4
- #
5
- # Walks the Neo4j graph for nodes carrying any registered Maxy label that
6
- # lack `n.embedding` and have at least one populated text property. For
7
- # each such node the script builds a text representation from the same
8
- # property union the fulltext index covers (`name`, `title`, `summary`,
9
- # `headline`, `body`, `content`, `text`), POSTs it to Ollama's `/api/embed`
10
- # endpoint, and writes the resulting vector back to the node.
11
- #
12
- # Why it exists. Pre-Task-748 bulk-import paths (notably `memory-archive-write`
13
- # for LinkedIn Connections.csv, ~5096 Persons per import) skipped per-row
14
- # embedding to keep import latency under five minutes. With Task 748's
15
- # universal fulltext coverage in place, BM25 catches those nodes immediately
16
- # but vector ranking is sparse until embeddings exist. This script heals
17
- # both the legacy backlog and any future bulk-imported population.
18
- #
19
- # Idempotent. Re-running picks up exactly where a prior run left off because
20
- # the gating predicate is `n.embedding IS NULL` — nodes embedded by the
21
- # previous run are excluded from the next batch query.
22
- #
23
- # Loud failure (per feedback_loud_failures.md). Any Ollama HTTP failure or
24
- # cypher-shell error aborts the script with a non-zero exit and prints a
25
- # precise re-run instruction. Partial-state-on-abort is safe: nodes whose
26
- # embedding was committed before the abort stay embedded; the rest fall back
27
- # into the next run's batch.
28
- #
29
- # Concurrent-run safety. flock-guarded — a second concurrent invocation
30
- # exits immediately with a clear message, no work attempted. Protects
31
- # against operator double-clicks and against the installer running it
32
- # while a manual run is in flight.
33
- #
34
- # Usage. Stand-alone re-run: `bash platform/scripts/embed-backfill.sh`.
35
- # Installer-driven: invoked automatically post-`seed-neo4j.sh` on every
36
- # install (the no-op fast path returns in milliseconds when nothing is
37
- # pending, so re-running on every install is harmless).
38
- # ============================================================
39
-
40
- set -euo pipefail
41
-
42
- SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
43
- PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
44
-
45
- # NEO4J_URI is hard-required (Task 788). The previous default
46
- # `bolt://localhost:7687` would silently route the backfill to the wrong Neo4j
47
- # on any brand-dedicated install, masking the actual configuration error.
48
- if [ -z "${NEO4J_URI:-}" ]; then
49
- echo "Error: NEO4J_URI required (no default — see Task 788)" >&2
50
- echo " Set NEO4J_URI=bolt://localhost:<brand.neo4jPort> before running." >&2
51
- exit 1
52
- fi
53
- NEO4J_USER="${NEO4J_USER:-neo4j}"
54
- OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
55
- EMBED_MODEL="${EMBED_MODEL:-nomic-embed-text}"
56
- BATCH_SIZE="${EMBED_BACKFILL_BATCH_SIZE:-50}"
57
-
58
- # Lock file is brand-scoped via the install directory hash so concurrent
59
- # Maxy + Real Agent installs (or any two brand installs sharing the device)
60
- # do not block each other unnecessarily — they target separate Neo4j
61
- # instances under separate INSTALL_DIRs and have zero shared state. The
62
- # explicit env var override stays for operator-driven workflows.
63
- INSTALL_DIR_HASH="$(echo -n "$PROJECT_DIR" | shasum | cut -c1-12)"
64
- LOCK_FILE="${EMBED_BACKFILL_LOCK_FILE:-/tmp/maxy-embed-backfill-${INSTALL_DIR_HASH}.lock}"
65
-
66
- # Resolve Neo4j password the same way seed-neo4j.sh does. Explicit env var
67
- # takes precedence so the installer can pass it through without writing the
68
- # file twice.
69
- NEO4J_PASSWORD_FILE="$PROJECT_DIR/config/.neo4j-password"
70
- if [ -z "${NEO4J_PASSWORD:-}" ]; then
71
- if [ -f "$NEO4J_PASSWORD_FILE" ]; then
72
- NEO4J_PASSWORD=$(cat "$NEO4J_PASSWORD_FILE")
73
- else
74
- echo "[embed-backfill] FAILED: NEO4J_PASSWORD env var unset and $NEO4J_PASSWORD_FILE missing"
75
- echo "[embed-backfill] re-run after the seed step writes the password file, or set NEO4J_PASSWORD explicitly"
76
- exit 1
77
- fi
78
- fi
79
- export NEO4J_URI NEO4J_USER NEO4J_PASSWORD OLLAMA_URL EMBED_MODEL BATCH_SIZE
80
-
81
- if ! command -v cypher-shell >/dev/null 2>&1; then
82
- echo "[embed-backfill] FAILED: cypher-shell not on PATH; install Neo4j or add cypher-shell to PATH"
83
- exit 1
84
- fi
85
- if ! command -v python3 >/dev/null 2>&1; then
86
- echo "[embed-backfill] FAILED: python3 not on PATH; the installer requires it"
87
- exit 1
88
- fi
89
-
90
- # flock guard — second concurrent invocation exits cleanly. The exec on
91
- # fd 200 keeps the lock held for the lifetime of this process; flock -n
92
- # is non-blocking so a busy lock returns immediately rather than queueing.
93
- exec 200>"$LOCK_FILE"
94
- if ! flock -n 200; then
95
- echo "[embed-backfill] another instance is already running (lock=$LOCK_FILE), skipping"
96
- exit 0
97
- fi
98
-
99
- # The python heredoc owns the per-batch loop. It uses subprocess to call
100
- # cypher-shell (avoids re-implementing Bolt) and urllib to call Ollama
101
- # (no extra deps). cypher-shell `--format plain` returns CSV; the csv
102
- # module handles quoting/escaping reliably so node text containing commas,
103
- # quotes, or newlines round-trips correctly.
104
- #
105
- # Cypher contract:
106
- # READ: one row per unembedded node — { id: elementId, text: coalesced }
107
- # gated by `n.embedding IS NULL` AND `any(label IN labels(n)
108
- # WHERE label IN $registered)` AND a non-empty coalesce of the
109
- # text property union. Nodes carrying an :Trashed label are
110
- # excluded explicitly. READ params (`registered` list of strings,
111
- # `batchSize` int) are passed via cypher-shell `--param` as plain
112
- # Cypher expressions (string list literals + integer literal).
113
- # WRITE: one batched UNWIND per chunk — pairs of (id, embedding[])
114
- # interpolated into the Cypher payload as bare-key map literals
115
- # (`{id: '...', embedding: [...]}`). Cypher does NOT accept
116
- # double-quoted-string map keys, so JSON-serialised values cannot
117
- # be passed via `--param` for the WRITE side; the inline literal
118
- # path is the apoc-free alternative.
119
- #
120
- # The script does NOT shell out to the existing TS embed() helper because
121
- # that would require booting Node + the platform/lib build. Calling the
122
- # Ollama HTTP endpoint directly preserves the same behaviour with zero
123
- # build dependency.
124
- exec python3 - <<'PYEOF'
125
- import json
126
- import os
127
- import sys
128
- import time
129
- import urllib.error
130
- import urllib.request
131
- from subprocess import PIPE, Popen
132
- from io import StringIO
133
- import csv
134
-
135
- NEO4J_URI = os.environ["NEO4J_URI"]
136
- NEO4J_USER = os.environ["NEO4J_USER"]
137
- NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]
138
- OLLAMA_URL = os.environ["OLLAMA_URL"]
139
- EMBED_MODEL = os.environ["EMBED_MODEL"]
140
- BATCH_SIZE = int(os.environ["BATCH_SIZE"])
141
-
142
- # Mirrors the FOR (n:...) clause of `entity_search` in schema.cypher.
143
- # Doctrine: every label written by the platform is searchable AND embeddable.
144
- # Future label additions must extend BOTH this list and schema.cypher; the
145
- # fulltext-coverage doctrine test catches the schema half but not this list.
146
- REGISTERED_LABELS = [
147
- "LocalBusiness", "Service", "PriceSpecification", "OpeningHoursSpecification", "Organization",
148
- "Person", "UserProfile", "Preference", "AdminUser", "AccessGrant",
149
- "KnowledgeDocument", "Section", "Chunk", "DigitalDocument", "CreativeWork",
150
- "Question", "FAQPage", "DefinedTerm", "Review", "ImageObject",
151
- "Conversation", "AdminConversation", "PublicConversation", "Message",
152
- "UserMessage", "AssistantMessage", "ToolCall",
153
- "Task", "Project", "Event",
154
- "Workflow", "WorkflowStep", "WorkflowRun", "StepResult",
155
- "OnboardingState", "Email", "EmailAccount", "ReviewAlert",
156
- "Position", "Credential",
157
- ]
158
-
159
- # Properties to coalesce for the embedding text. Ordered: most identifying
160
- # property first. Matches the canonical text-property list pinned by the
161
- # fulltext-coverage doctrine test.
162
- EMBED_TEXT_PROPS = ["name", "title", "summary", "headline", "body", "content", "text"]
163
-
164
-
165
- def cypher(query: str, params: dict | None = None) -> str:
166
- """Run a Cypher statement via cypher-shell --format plain.
167
- Returns stdout as a single string. Aborts the script on non-zero exit
168
- so a Cypher syntax error or a Neo4j outage surfaces immediately."""
169
- cmd = [
170
- "cypher-shell", "-u", NEO4J_USER, "-p", NEO4J_PASSWORD, "-a", NEO4J_URI,
171
- "--format", "plain",
172
- ]
173
- if params:
174
- for key, value in params.items():
175
- cmd.extend(["--param", f"{key} => {json.dumps(value)}"])
176
- proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
177
- out, err = proc.communicate(query.encode("utf-8"))
178
- if proc.returncode != 0:
179
- sys.stderr.write(f"[embed-backfill] FAILED: cypher-shell exited {proc.returncode}\n")
180
- sys.stderr.write(err.decode("utf-8", errors="replace"))
181
- sys.exit(1)
182
- return out.decode("utf-8", errors="replace")
183
-
184
-
185
- def parse_csv_rows(stdout: str) -> list[dict]:
186
- """cypher-shell --format plain emits a CSV header + rows. The csv module
187
- handles quoting reliably even when text contains commas/quotes/newlines.
188
-
189
- skipinitialspace=True is required because cypher-shell emits a space
190
- after each comma in both header and data rows (`id, firstLabel, text`),
191
- and DictReader otherwise treats the spaces as part of the column name —
192
- `row["text"]` raises KeyError because the actual key is " text"."""
193
- if not stdout.strip():
194
- return []
195
- reader = csv.DictReader(StringIO(stdout), skipinitialspace=True)
196
- return list(reader)
197
-
198
-
199
- def ollama_embed(text: str, *, timeout: int = 30, retry_on_timeout: bool = True) -> list[float]:
200
- """POST text to Ollama /api/embed.
201
-
202
- Cold-start tolerance: when nomic-embed-text is not yet loaded into Ollama's
203
- process memory, the first request for the model after a fresh boot can
204
- exceed 30s while the model loads. Subsequent requests are fast. We retry
205
- ONCE on TimeoutError with a longer (180s) timeout so a cold model load
206
- does not abort the entire backfill at the first node. Retry is OFF by
207
- default for the warmup probe to avoid recursion.
208
-
209
- Aborts the script (non-zero exit) on any non-recoverable HTTP failure
210
- with a precise message + re-run instruction so the operator never thinks
211
- the backfill silently completed.
212
- """
213
- body = json.dumps({"model": EMBED_MODEL, "input": text}).encode("utf-8")
214
- req = urllib.request.Request(
215
- f"{OLLAMA_URL}/api/embed",
216
- data=body,
217
- headers={"Content-Type": "application/json"},
218
- method="POST",
219
- )
220
- try:
221
- with urllib.request.urlopen(req, timeout=timeout) as resp:
222
- payload = json.loads(resp.read().decode("utf-8"))
223
- except TimeoutError as e:
224
- if retry_on_timeout:
225
- sys.stderr.write(
226
- f"[embed-backfill] WARN: Ollama timeout after {timeout}s — likely cold-start; retrying with 180s timeout\n"
227
- )
228
- return ollama_embed(text, timeout=180, retry_on_timeout=False)
229
- sys.stderr.write(f"[embed-backfill] FAILED: Ollama timeout after {timeout}s ({e})\n")
230
- sys.stderr.write(
231
- f"[embed-backfill] re-run via: bash {os.path.dirname(os.path.realpath(__file__))}/embed-backfill.sh\n"
232
- )
233
- sys.exit(1)
234
- except (urllib.error.URLError, urllib.error.HTTPError) as e:
235
- sys.stderr.write(f"[embed-backfill] FAILED: Ollama unreachable ({e})\n")
236
- sys.stderr.write(
237
- f"[embed-backfill] re-run via: bash {os.path.dirname(os.path.realpath(__file__))}/embed-backfill.sh\n"
238
- )
239
- sys.exit(1)
240
- embeddings = payload.get("embeddings", [])
241
- if not embeddings or not embeddings[0]:
242
- sys.stderr.write(f"[embed-backfill] FAILED: Ollama returned no embedding for text length={len(text)}\n")
243
- sys.exit(1)
244
- return embeddings[0]
245
-
246
-
247
- def cypher_string_literal(s: str) -> str:
248
- """Format a Python string as a Cypher single-quoted string literal.
249
-
250
- Escapes the two characters Cypher requires escaping inside single-quoted
251
- strings: backslash and single quote. elementId values from Neo4j 5 are
252
- typically `<dbprefix>:<uuid>:<recordId>` (alphanumeric + colon + dash) and
253
- will not normally contain either, but escape defensively so a future
254
- elementId format change cannot break the WRITE batch with a syntax error.
255
- """
256
- return "'" + s.replace("\\", "\\\\").replace("'", "\\'") + "'"
257
-
258
-
259
- def cypher_float_list(values: list[float]) -> str:
260
- """Format a list of floats as a Cypher list literal `[v1, v2, ...]`.
261
-
262
- repr() on a Python float emits a decimal that Cypher accepts as a number
263
- literal — including the negative sign, scientific notation, and infinity
264
- edge cases. nomic-embed-text returns finite cosine-bounded floats so
265
- inf/nan are not expected, but Python's repr is stable for any case that
266
- does occur.
267
- """
268
- return "[" + ",".join(repr(v) for v in values) + "]"
269
-
270
-
271
- # Build the WHERE clause once. The $registered parameter is interpolated
272
- # into Cypher as a list literal; cypher-shell --param gives us a typed pass.
273
- COALESCE_TEXT = "coalesce(" + ", ".join(f"n.{p}" for p in EMBED_TEXT_PROPS) + ", '')"
274
- COUNT_QUERY = f"""
275
- MATCH (n) WHERE n.embedding IS NULL
276
- AND NOT n:Trashed
277
- AND any(label IN labels(n) WHERE label IN $registered)
278
- AND {COALESCE_TEXT} <> ''
279
- RETURN count(n) AS remaining;
280
- """
281
- BATCH_QUERY = f"""
282
- MATCH (n) WHERE n.embedding IS NULL
283
- AND NOT n:Trashed
284
- AND any(label IN labels(n) WHERE label IN $registered)
285
- AND {COALESCE_TEXT} <> ''
286
- RETURN elementId(n) AS id,
287
- labels(n)[0] AS firstLabel,
288
- {COALESCE_TEXT} AS text
289
- LIMIT $batchSize;
290
- """
291
-
292
- count_out = cypher(COUNT_QUERY, {"registered": REGISTERED_LABELS})
293
- total_remaining = 0
294
- for row in parse_csv_rows(count_out):
295
- total_remaining = int(row["remaining"])
296
-
297
- print(f"[embed-backfill] start total={total_remaining} model={EMBED_MODEL}")
298
-
299
- if total_remaining == 0:
300
- print("[embed-backfill] done remaining=0 (nothing to backfill)")
301
- sys.exit(0)
302
-
303
- # Pre-warm Ollama so the first per-node call doesn't pay the model-load
304
- # latency. The cold-start window for nomic-embed-text on a Pi 5 can exceed
305
- # 30s; calling once with a tiny throwaway input loads the weights into
306
- # memory before the loop begins. Failure here is treated identically to
307
- # any other Ollama failure — loud abort with re-run instruction.
308
- print(f"[embed-backfill] pre-warm model={EMBED_MODEL} timeout=180s")
309
- ollama_embed("warmup", timeout=180, retry_on_timeout=False)
310
-
311
- processed_total = 0
312
- batch_index = 0
313
- while True:
314
- batch_start = time.time()
315
- batch_out = cypher(
316
- BATCH_QUERY,
317
- {"registered": REGISTERED_LABELS, "batchSize": BATCH_SIZE},
318
- )
319
- rows = parse_csv_rows(batch_out)
320
- if not rows:
321
- break
322
-
323
- # Compute embeddings serially. Ollama on a Pi 5 handles ~3-10 embeds
324
- # per second with nomic-embed-text; concurrent requests just queue
325
- # behind the GPU/CPU bottleneck so parallelism wouldn't help.
326
- pairs: list[tuple[str, list[float]]] = []
327
- label_counts: dict[str, int] = {}
328
- for row in rows:
329
- node_id = row["id"]
330
- text = row["text"]
331
- first_label = row["firstLabel"]
332
- if not text:
333
- continue
334
- embedding = ollama_embed(text)
335
- pairs.append((node_id, embedding))
336
- label_counts[first_label] = label_counts.get(first_label, 0) + 1
337
-
338
- if not pairs:
339
- # Defensive: query said rows exist but all text was empty after
340
- # the python read — means the COALESCE_TEXT predicate is wider
341
- # than the python check. Stop to avoid an infinite loop.
342
- sys.stderr.write("[embed-backfill] WARN: batch returned rows with empty text — stopping to avoid infinite loop\n")
343
- break
344
-
345
- # Build the WRITE batch as a Cypher literal payload rather than a
346
- # `--param` map. cypher-shell's `--param` parses the value as a Cypher
347
- # expression, and Cypher map keys must be bare identifiers (or backtick-
348
- # quoted) — NOT double-quoted strings as JSON would emit. Interpolating
349
- # bare-key map literals directly avoids the question entirely:
350
- #
351
- # UNWIND [{id: '4:abc:1', embedding: [0.1, 0.2, ...]}, ...] AS pair
352
- # MATCH (n) WHERE elementId(n) = pair.id
353
- # SET n.embedding = pair.embedding;
354
- #
355
- # cypher_string_literal escapes any backslash/quote in elementIds
356
- # defensively; cypher_float_list serialises the embedding via repr()
357
- # which Cypher accepts as a number literal.
358
- pair_literals = ",".join(
359
- f"{{id: {cypher_string_literal(node_id)}, embedding: {cypher_float_list(embedding)}}}"
360
- for node_id, embedding in pairs
361
- )
362
- cypher(
363
- f"""
364
- UNWIND [{pair_literals}] AS pair
365
- MATCH (n) WHERE elementId(n) = pair.id
366
- SET n.embedding = pair.embedding;
367
- """
368
- )
369
- elapsed_ms = int((time.time() - batch_start) * 1000)
370
- batch_index += 1
371
- processed_total += len(pairs)
372
- label_summary = ", ".join(f"{k}={v}" for k, v in sorted(label_counts.items()))
373
- print(f"[embed-backfill] batch={batch_index} processed={len(pairs)} elapsed-ms={elapsed_ms} labels={label_summary}")
374
-
375
- # Final remaining check — should be zero or the diff between original
376
- # total and processed_total (e.g. if new writes landed mid-run).
377
- final_out = cypher(COUNT_QUERY, {"registered": REGISTERED_LABELS})
378
- final_remaining = 0
379
- for row in parse_csv_rows(final_out):
380
- final_remaining = int(row["remaining"])
381
- print(f"[embed-backfill] done processed={processed_total} remaining={final_remaining}")
382
- PYEOF