@rubytech/create-maxy 1.0.776 → 1.0.777
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +5 -39
- package/package.json +1 -1
- package/payload/platform/scripts/embed-backfill.sh +0 -382
package/dist/index.js
CHANGED
|
@@ -1541,11 +1541,11 @@ function setupVncViewer() {
|
|
|
1541
1541
|
}
|
|
1542
1542
|
function setupAccount() {
|
|
1543
1543
|
log("10", TOTAL, "Setting up...");
|
|
1544
|
-
//
|
|
1545
|
-
//
|
|
1546
|
-
//
|
|
1547
|
-
//
|
|
1548
|
-
//
|
|
1544
|
+
// Task 787 — seed-neo4j.sh hard-exits without NEO4J_URI. The installer
|
|
1545
|
+
// owns the brand-correct URI and password, so we derive them once.
|
|
1546
|
+
// Missing password file is a hard error: ensureNeo4jPassword() ran
|
|
1547
|
+
// upstream and would have thrown already if it couldn't reach the
|
|
1548
|
+
// brand's Neo4j.
|
|
1549
1549
|
const passwordFile = join(INSTALL_DIR, "platform/config/.neo4j-password");
|
|
1550
1550
|
if (!existsSync(passwordFile)) {
|
|
1551
1551
|
throw new Error(`Neo4j password file missing at ${passwordFile} — required by setup step.`);
|
|
@@ -1559,40 +1559,6 @@ function setupAccount() {
|
|
|
1559
1559
|
logFile(` [neo4j] passing NEO4J_URI=${neo4jUri} to seed`);
|
|
1560
1560
|
shell("bash", [seedScript], { cwd: INSTALL_DIR, env: neo4jEnv });
|
|
1561
1561
|
}
|
|
1562
|
-
// Task 748 — universal embedding coverage backfill. Run after seed so the
|
|
1563
|
-
// entity_search index is in place and any pre-Task-748 nodes (e.g. the
|
|
1564
|
-
// 5096 LinkedIn-imported Persons on existing Pis that bulk-import skipped
|
|
1565
|
-
// embedding for) get a vector populated. Idempotent — instant no-op when
|
|
1566
|
-
// nothing is pending, so re-running on every install is harmless.
|
|
1567
|
-
//
|
|
1568
|
-
// Failure-mode policy: WARN, do not abort. The fulltext index is already
|
|
1569
|
-
// applied above, so BM25 search works end-to-end without embeddings; the
|
|
1570
|
-
// only gap is vector ranking quality on legacy nodes. Aborting the
|
|
1571
|
-
// installer on an Ollama hiccup would block every install for a
|
|
1572
|
-
// strictly-degradable feature. The script's own loud-failure output
|
|
1573
|
-
// tells the operator how to re-run.
|
|
1574
|
-
const backfillScript = join(INSTALL_DIR, "platform/scripts/embed-backfill.sh");
|
|
1575
|
-
if (existsSync(backfillScript)) {
|
|
1576
|
-
const start = Date.now();
|
|
1577
|
-
logFile(`> bash ${backfillScript} (warn-not-abort)`);
|
|
1578
|
-
const result = spawnSync("bash", [backfillScript], {
|
|
1579
|
-
stdio: "inherit",
|
|
1580
|
-
timeout: 30 * 60_000,
|
|
1581
|
-
cwd: INSTALL_DIR,
|
|
1582
|
-
env: neo4jEnv,
|
|
1583
|
-
});
|
|
1584
|
-
const dur = ((Date.now() - start) / 1000).toFixed(1);
|
|
1585
|
-
if (result.status !== 0 || result.signal) {
|
|
1586
|
-
const reason = result.signal ? `signal=${result.signal}` : `exit=${result.status}`;
|
|
1587
|
-
logFile(` WARN: embed-backfill non-zero (${reason}) after ${dur}s`);
|
|
1588
|
-
console.warn(`\n WARNING: embed-backfill did not complete (${reason}) — BM25 search works,\n` +
|
|
1589
|
-
` but vector ranking on legacy nodes will be sparse until you re-run:\n` +
|
|
1590
|
-
` bash ${backfillScript}\n`);
|
|
1591
|
-
}
|
|
1592
|
-
else {
|
|
1593
|
-
logFile(` OK embed-backfill in ${dur}s`);
|
|
1594
|
-
}
|
|
1595
|
-
}
|
|
1596
1562
|
}
|
|
1597
1563
|
// ---------------------------------------------------------------------------
|
|
1598
1564
|
// Tunnel script shortcuts
|
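package/package.json
CHANGED
(hunk not captured in this extract; the file summary above lists +1 -1)
package/payload/platform/scripts/embed-backfill.sh
DELETED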
|
@@ -1,382 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
# ============================================================
|
|
3
|
-
# embed-backfill.sh — populate embeddings on legacy nodes (Task 748)
|
|
4
|
-
#
|
|
5
|
-
# Walks the Neo4j graph for nodes carrying any registered Maxy label that
|
|
6
|
-
# lack `n.embedding` and have at least one populated text property. For
|
|
7
|
-
# each such node the script builds a text representation from the same
|
|
8
|
-
# property union the fulltext index covers (`name`, `title`, `summary`,
|
|
9
|
-
# `headline`, `body`, `content`, `text`), POSTs it to Ollama's `/api/embed`
|
|
10
|
-
# endpoint, and writes the resulting vector back to the node.
|
|
11
|
-
#
|
|
12
|
-
# Why it exists. Pre-Task-748 bulk-import paths (notably `memory-archive-write`
|
|
13
|
-
# for LinkedIn Connections.csv, ~5096 Persons per import) skipped per-row
|
|
14
|
-
# embedding to keep import latency under five minutes. With Task 748's
|
|
15
|
-
# universal fulltext coverage in place, BM25 catches those nodes immediately
|
|
16
|
-
# but vector ranking is sparse until embeddings exist. This script heals
|
|
17
|
-
# both the legacy backlog and any future bulk-imported population.
|
|
18
|
-
#
|
|
19
|
-
# Idempotent. Re-running picks up exactly where a prior run left off because
|
|
20
|
-
# the gating predicate is `n.embedding IS NULL` — nodes embedded by the
|
|
21
|
-
# previous run are excluded from the next batch query.
|
|
22
|
-
#
|
|
23
|
-
# Loud failure (per feedback_loud_failures.md). Any Ollama HTTP failure or
|
|
24
|
-
# cypher-shell error aborts the script with a non-zero exit and prints a
|
|
25
|
-
# precise re-run instruction. Partial-state-on-abort is safe: nodes whose
|
|
26
|
-
# embedding was committed before the abort stay embedded; the rest fall back
|
|
27
|
-
# into the next run's batch.
|
|
28
|
-
#
|
|
29
|
-
# Concurrent-run safety. flock-guarded — a second concurrent invocation
|
|
30
|
-
# exits immediately with a clear message, no work attempted. Protects
|
|
31
|
-
# against operator double-clicks and against the installer running it
|
|
32
|
-
# while a manual run is in flight.
|
|
33
|
-
#
|
|
34
|
-
# Usage. Stand-alone re-run: `bash platform/scripts/embed-backfill.sh`.
|
|
35
|
-
# Installer-driven: invoked automatically post-`seed-neo4j.sh` on every
|
|
36
|
-
# install (the no-op fast path returns in milliseconds when nothing is
|
|
37
|
-
# pending, so re-running on every install is harmless).
|
|
38
|
-
# ============================================================
|
|
39
|
-
|
|
40
|
-
set -euo pipefail
|
|
41
|
-
|
|
42
|
-
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
43
|
-
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
|
44
|
-
|
|
45
|
-
# NEO4J_URI is hard-required (Task 788). The previous default
|
|
46
|
-
# `bolt://localhost:7687` would silently route the backfill to the wrong Neo4j
|
|
47
|
-
# on any brand-dedicated install, masking the actual configuration error.
|
|
48
|
-
if [ -z "${NEO4J_URI:-}" ]; then
|
|
49
|
-
echo "Error: NEO4J_URI required (no default — see Task 788)" >&2
|
|
50
|
-
echo " Set NEO4J_URI=bolt://localhost:<brand.neo4jPort> before running." >&2
|
|
51
|
-
exit 1
|
|
52
|
-
fi
|
|
53
|
-
NEO4J_USER="${NEO4J_USER:-neo4j}"
|
|
54
|
-
OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
|
|
55
|
-
EMBED_MODEL="${EMBED_MODEL:-nomic-embed-text}"
|
|
56
|
-
BATCH_SIZE="${EMBED_BACKFILL_BATCH_SIZE:-50}"
|
|
57
|
-
|
|
58
|
-
# Lock file is brand-scoped via the install directory hash so concurrent
|
|
59
|
-
# Maxy + Real Agent installs (or any two brand installs sharing the device)
|
|
60
|
-
# do not block each other unnecessarily — they target separate Neo4j
|
|
61
|
-
# instances under separate INSTALL_DIRs and have zero shared state. The
|
|
62
|
-
# explicit env var override stays for operator-driven workflows.
|
|
63
|
-
INSTALL_DIR_HASH="$(echo -n "$PROJECT_DIR" | shasum | cut -c1-12)"
|
|
64
|
-
LOCK_FILE="${EMBED_BACKFILL_LOCK_FILE:-/tmp/maxy-embed-backfill-${INSTALL_DIR_HASH}.lock}"
|
|
65
|
-
|
|
66
|
-
# Resolve Neo4j password the same way seed-neo4j.sh does. Explicit env var
|
|
67
|
-
# takes precedence so the installer can pass it through without writing the
|
|
68
|
-
# file twice.
|
|
69
|
-
NEO4J_PASSWORD_FILE="$PROJECT_DIR/config/.neo4j-password"
|
|
70
|
-
if [ -z "${NEO4J_PASSWORD:-}" ]; then
|
|
71
|
-
if [ -f "$NEO4J_PASSWORD_FILE" ]; then
|
|
72
|
-
NEO4J_PASSWORD=$(cat "$NEO4J_PASSWORD_FILE")
|
|
73
|
-
else
|
|
74
|
-
echo "[embed-backfill] FAILED: NEO4J_PASSWORD env var unset and $NEO4J_PASSWORD_FILE missing"
|
|
75
|
-
echo "[embed-backfill] re-run after the seed step writes the password file, or set NEO4J_PASSWORD explicitly"
|
|
76
|
-
exit 1
|
|
77
|
-
fi
|
|
78
|
-
fi
|
|
79
|
-
export NEO4J_URI NEO4J_USER NEO4J_PASSWORD OLLAMA_URL EMBED_MODEL BATCH_SIZE
|
|
80
|
-
|
|
81
|
-
if ! command -v cypher-shell >/dev/null 2>&1; then
|
|
82
|
-
echo "[embed-backfill] FAILED: cypher-shell not on PATH; install Neo4j or add cypher-shell to PATH"
|
|
83
|
-
exit 1
|
|
84
|
-
fi
|
|
85
|
-
if ! command -v python3 >/dev/null 2>&1; then
|
|
86
|
-
echo "[embed-backfill] FAILED: python3 not on PATH; the installer requires it"
|
|
87
|
-
exit 1
|
|
88
|
-
fi
|
|
89
|
-
|
|
90
|
-
# flock guard — second concurrent invocation exits cleanly. The exec on
|
|
91
|
-
# fd 200 keeps the lock held for the lifetime of this process; flock -n
|
|
92
|
-
# is non-blocking so a busy lock returns immediately rather than queueing.
|
|
93
|
-
exec 200>"$LOCK_FILE"
|
|
94
|
-
if ! flock -n 200; then
|
|
95
|
-
echo "[embed-backfill] another instance is already running (lock=$LOCK_FILE), skipping"
|
|
96
|
-
exit 0
|
|
97
|
-
fi
|
|
98
|
-
|
|
99
|
-
# The python heredoc owns the per-batch loop. It uses subprocess to call
|
|
100
|
-
# cypher-shell (avoids re-implementing Bolt) and urllib to call Ollama
|
|
101
|
-
# (no extra deps). cypher-shell `--format plain` returns CSV; the csv
|
|
102
|
-
# module handles quoting/escaping reliably so node text containing commas,
|
|
103
|
-
# quotes, or newlines round-trips correctly.
|
|
104
|
-
#
|
|
105
|
-
# Cypher contract:
|
|
106
|
-
# READ: one row per unembedded node — { id: elementId, text: coalesced }
|
|
107
|
-
# gated by `n.embedding IS NULL` AND `any(label IN labels(n)
|
|
108
|
-
# WHERE label IN $registered)` AND a non-empty coalesce of the
|
|
109
|
-
# text property union. Nodes carrying an :Trashed label are
|
|
110
|
-
# excluded explicitly. READ params (`registered` list of strings,
|
|
111
|
-
# `batchSize` int) are passed via cypher-shell `--param` as plain
|
|
112
|
-
# Cypher expressions (string list literals + integer literal).
|
|
113
|
-
# WRITE: one batched UNWIND per chunk — pairs of (id, embedding[])
|
|
114
|
-
# interpolated into the Cypher payload as bare-key map literals
|
|
115
|
-
# (`{id: '...', embedding: [...]}`). Cypher does NOT accept
|
|
116
|
-
# double-quoted-string map keys, so JSON-serialised values cannot
|
|
117
|
-
# be passed via `--param` for the WRITE side; the inline literal
|
|
118
|
-
# path is the apoc-free alternative.
|
|
119
|
-
#
|
|
120
|
-
# The script does NOT shell out to the existing TS embed() helper because
|
|
121
|
-
# that would require booting Node + the platform/lib build. Calling the
|
|
122
|
-
# Ollama HTTP endpoint directly preserves the same behaviour with zero
|
|
123
|
-
# build dependency.
|
|
124
|
-
exec python3 - <<'PYEOF'
|
|
125
|
-
import json
|
|
126
|
-
import os
|
|
127
|
-
import sys
|
|
128
|
-
import time
|
|
129
|
-
import urllib.error
|
|
130
|
-
import urllib.request
|
|
131
|
-
from subprocess import PIPE, Popen
|
|
132
|
-
from io import StringIO
|
|
133
|
-
import csv
|
|
134
|
-
|
|
135
|
-
NEO4J_URI = os.environ["NEO4J_URI"]
|
|
136
|
-
NEO4J_USER = os.environ["NEO4J_USER"]
|
|
137
|
-
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]
|
|
138
|
-
OLLAMA_URL = os.environ["OLLAMA_URL"]
|
|
139
|
-
EMBED_MODEL = os.environ["EMBED_MODEL"]
|
|
140
|
-
BATCH_SIZE = int(os.environ["BATCH_SIZE"])
|
|
141
|
-
|
|
142
|
-
# Mirrors the FOR (n:...) clause of `entity_search` in schema.cypher.
|
|
143
|
-
# Doctrine: every label written by the platform is searchable AND embeddable.
|
|
144
|
-
# Future label additions must extend BOTH this list and schema.cypher; the
|
|
145
|
-
# fulltext-coverage doctrine test catches the schema half but not this list.
|
|
146
|
-
REGISTERED_LABELS = [
|
|
147
|
-
"LocalBusiness", "Service", "PriceSpecification", "OpeningHoursSpecification", "Organization",
|
|
148
|
-
"Person", "UserProfile", "Preference", "AdminUser", "AccessGrant",
|
|
149
|
-
"KnowledgeDocument", "Section", "Chunk", "DigitalDocument", "CreativeWork",
|
|
150
|
-
"Question", "FAQPage", "DefinedTerm", "Review", "ImageObject",
|
|
151
|
-
"Conversation", "AdminConversation", "PublicConversation", "Message",
|
|
152
|
-
"UserMessage", "AssistantMessage", "ToolCall",
|
|
153
|
-
"Task", "Project", "Event",
|
|
154
|
-
"Workflow", "WorkflowStep", "WorkflowRun", "StepResult",
|
|
155
|
-
"OnboardingState", "Email", "EmailAccount", "ReviewAlert",
|
|
156
|
-
"Position", "Credential",
|
|
157
|
-
]
|
|
158
|
-
|
|
159
|
-
# Properties to coalesce for the embedding text. Ordered: most identifying
|
|
160
|
-
# property first. Matches the canonical text-property list pinned by the
|
|
161
|
-
# fulltext-coverage doctrine test.
|
|
162
|
-
EMBED_TEXT_PROPS = ["name", "title", "summary", "headline", "body", "content", "text"]
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
def cypher(query: str, params: dict | None = None) -> str:
|
|
166
|
-
"""Run a Cypher statement via cypher-shell --format plain.
|
|
167
|
-
Returns stdout as a single string. Aborts the script on non-zero exit
|
|
168
|
-
so a Cypher syntax error or a Neo4j outage surfaces immediately."""
|
|
169
|
-
cmd = [
|
|
170
|
-
"cypher-shell", "-u", NEO4J_USER, "-p", NEO4J_PASSWORD, "-a", NEO4J_URI,
|
|
171
|
-
"--format", "plain",
|
|
172
|
-
]
|
|
173
|
-
if params:
|
|
174
|
-
for key, value in params.items():
|
|
175
|
-
cmd.extend(["--param", f"{key} => {json.dumps(value)}"])
|
|
176
|
-
proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
|
|
177
|
-
out, err = proc.communicate(query.encode("utf-8"))
|
|
178
|
-
if proc.returncode != 0:
|
|
179
|
-
sys.stderr.write(f"[embed-backfill] FAILED: cypher-shell exited {proc.returncode}\n")
|
|
180
|
-
sys.stderr.write(err.decode("utf-8", errors="replace"))
|
|
181
|
-
sys.exit(1)
|
|
182
|
-
return out.decode("utf-8", errors="replace")
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
def parse_csv_rows(stdout: str) -> list[dict]:
|
|
186
|
-
"""cypher-shell --format plain emits a CSV header + rows. The csv module
|
|
187
|
-
handles quoting reliably even when text contains commas/quotes/newlines.
|
|
188
|
-
|
|
189
|
-
skipinitialspace=True is required because cypher-shell emits a space
|
|
190
|
-
after each comma in both header and data rows (`id, firstLabel, text`),
|
|
191
|
-
and DictReader otherwise treats the spaces as part of the column name —
|
|
192
|
-
`row["text"]` raises KeyError because the actual key is " text"."""
|
|
193
|
-
if not stdout.strip():
|
|
194
|
-
return []
|
|
195
|
-
reader = csv.DictReader(StringIO(stdout), skipinitialspace=True)
|
|
196
|
-
return list(reader)
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
def ollama_embed(text: str, *, timeout: int = 30, retry_on_timeout: bool = True) -> list[float]:
|
|
200
|
-
"""POST text to Ollama /api/embed.
|
|
201
|
-
|
|
202
|
-
Cold-start tolerance: when nomic-embed-text is not yet loaded into Ollama's
|
|
203
|
-
process memory, the first request for the model after a fresh boot can
|
|
204
|
-
exceed 30s while the model loads. Subsequent requests are fast. We retry
|
|
205
|
-
ONCE on TimeoutError with a longer (180s) timeout so a cold model load
|
|
206
|
-
does not abort the entire backfill at the first node. Retry is OFF by
|
|
207
|
-
default for the warmup probe to avoid recursion.
|
|
208
|
-
|
|
209
|
-
Aborts the script (non-zero exit) on any non-recoverable HTTP failure
|
|
210
|
-
with a precise message + re-run instruction so the operator never thinks
|
|
211
|
-
the backfill silently completed.
|
|
212
|
-
"""
|
|
213
|
-
body = json.dumps({"model": EMBED_MODEL, "input": text}).encode("utf-8")
|
|
214
|
-
req = urllib.request.Request(
|
|
215
|
-
f"{OLLAMA_URL}/api/embed",
|
|
216
|
-
data=body,
|
|
217
|
-
headers={"Content-Type": "application/json"},
|
|
218
|
-
method="POST",
|
|
219
|
-
)
|
|
220
|
-
try:
|
|
221
|
-
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
|
222
|
-
payload = json.loads(resp.read().decode("utf-8"))
|
|
223
|
-
except TimeoutError as e:
|
|
224
|
-
if retry_on_timeout:
|
|
225
|
-
sys.stderr.write(
|
|
226
|
-
f"[embed-backfill] WARN: Ollama timeout after {timeout}s — likely cold-start; retrying with 180s timeout\n"
|
|
227
|
-
)
|
|
228
|
-
return ollama_embed(text, timeout=180, retry_on_timeout=False)
|
|
229
|
-
sys.stderr.write(f"[embed-backfill] FAILED: Ollama timeout after {timeout}s ({e})\n")
|
|
230
|
-
sys.stderr.write(
|
|
231
|
-
f"[embed-backfill] re-run via: bash {os.path.dirname(os.path.realpath(__file__))}/embed-backfill.sh\n"
|
|
232
|
-
)
|
|
233
|
-
sys.exit(1)
|
|
234
|
-
except (urllib.error.URLError, urllib.error.HTTPError) as e:
|
|
235
|
-
sys.stderr.write(f"[embed-backfill] FAILED: Ollama unreachable ({e})\n")
|
|
236
|
-
sys.stderr.write(
|
|
237
|
-
f"[embed-backfill] re-run via: bash {os.path.dirname(os.path.realpath(__file__))}/embed-backfill.sh\n"
|
|
238
|
-
)
|
|
239
|
-
sys.exit(1)
|
|
240
|
-
embeddings = payload.get("embeddings", [])
|
|
241
|
-
if not embeddings or not embeddings[0]:
|
|
242
|
-
sys.stderr.write(f"[embed-backfill] FAILED: Ollama returned no embedding for text length={len(text)}\n")
|
|
243
|
-
sys.exit(1)
|
|
244
|
-
return embeddings[0]
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
def cypher_string_literal(s: str) -> str:
|
|
248
|
-
"""Format a Python string as a Cypher single-quoted string literal.
|
|
249
|
-
|
|
250
|
-
Escapes the two characters Cypher requires escaping inside single-quoted
|
|
251
|
-
strings: backslash and single quote. elementId values from Neo4j 5 are
|
|
252
|
-
typically `<dbprefix>:<uuid>:<recordId>` (alphanumeric + colon + dash) and
|
|
253
|
-
will not normally contain either, but escape defensively so a future
|
|
254
|
-
elementId format change cannot break the WRITE batch with a syntax error.
|
|
255
|
-
"""
|
|
256
|
-
return "'" + s.replace("\\", "\\\\").replace("'", "\\'") + "'"
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
def cypher_float_list(values: list[float]) -> str:
|
|
260
|
-
"""Format a list of floats as a Cypher list literal `[v1, v2, ...]`.
|
|
261
|
-
|
|
262
|
-
repr() on a Python float emits a decimal that Cypher accepts as a number
|
|
263
|
-
literal — including the negative sign, scientific notation, and infinity
|
|
264
|
-
edge cases. nomic-embed-text returns finite cosine-bounded floats so
|
|
265
|
-
inf/nan are not expected, but Python's repr is stable for any case that
|
|
266
|
-
does occur.
|
|
267
|
-
"""
|
|
268
|
-
return "[" + ",".join(repr(v) for v in values) + "]"
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
# Build the WHERE clause once. The $registered parameter is interpolated
|
|
272
|
-
# into Cypher as a list literal; cypher-shell --param gives us a typed pass.
|
|
273
|
-
COALESCE_TEXT = "coalesce(" + ", ".join(f"n.{p}" for p in EMBED_TEXT_PROPS) + ", '')"
|
|
274
|
-
COUNT_QUERY = f"""
|
|
275
|
-
MATCH (n) WHERE n.embedding IS NULL
|
|
276
|
-
AND NOT n:Trashed
|
|
277
|
-
AND any(label IN labels(n) WHERE label IN $registered)
|
|
278
|
-
AND {COALESCE_TEXT} <> ''
|
|
279
|
-
RETURN count(n) AS remaining;
|
|
280
|
-
"""
|
|
281
|
-
BATCH_QUERY = f"""
|
|
282
|
-
MATCH (n) WHERE n.embedding IS NULL
|
|
283
|
-
AND NOT n:Trashed
|
|
284
|
-
AND any(label IN labels(n) WHERE label IN $registered)
|
|
285
|
-
AND {COALESCE_TEXT} <> ''
|
|
286
|
-
RETURN elementId(n) AS id,
|
|
287
|
-
labels(n)[0] AS firstLabel,
|
|
288
|
-
{COALESCE_TEXT} AS text
|
|
289
|
-
LIMIT $batchSize;
|
|
290
|
-
"""
|
|
291
|
-
|
|
292
|
-
count_out = cypher(COUNT_QUERY, {"registered": REGISTERED_LABELS})
|
|
293
|
-
total_remaining = 0
|
|
294
|
-
for row in parse_csv_rows(count_out):
|
|
295
|
-
total_remaining = int(row["remaining"])
|
|
296
|
-
|
|
297
|
-
print(f"[embed-backfill] start total={total_remaining} model={EMBED_MODEL}")
|
|
298
|
-
|
|
299
|
-
if total_remaining == 0:
|
|
300
|
-
print("[embed-backfill] done remaining=0 (nothing to backfill)")
|
|
301
|
-
sys.exit(0)
|
|
302
|
-
|
|
303
|
-
# Pre-warm Ollama so the first per-node call doesn't pay the model-load
|
|
304
|
-
# latency. The cold-start window for nomic-embed-text on a Pi 5 can exceed
|
|
305
|
-
# 30s; calling once with a tiny throwaway input loads the weights into
|
|
306
|
-
# memory before the loop begins. Failure here is treated identically to
|
|
307
|
-
# any other Ollama failure — loud abort with re-run instruction.
|
|
308
|
-
print(f"[embed-backfill] pre-warm model={EMBED_MODEL} timeout=180s")
|
|
309
|
-
ollama_embed("warmup", timeout=180, retry_on_timeout=False)
|
|
310
|
-
|
|
311
|
-
processed_total = 0
|
|
312
|
-
batch_index = 0
|
|
313
|
-
while True:
|
|
314
|
-
batch_start = time.time()
|
|
315
|
-
batch_out = cypher(
|
|
316
|
-
BATCH_QUERY,
|
|
317
|
-
{"registered": REGISTERED_LABELS, "batchSize": BATCH_SIZE},
|
|
318
|
-
)
|
|
319
|
-
rows = parse_csv_rows(batch_out)
|
|
320
|
-
if not rows:
|
|
321
|
-
break
|
|
322
|
-
|
|
323
|
-
# Compute embeddings serially. Ollama on a Pi 5 handles ~3-10 embeds
|
|
324
|
-
# per second with nomic-embed-text; concurrent requests just queue
|
|
325
|
-
# behind the GPU/CPU bottleneck so parallelism wouldn't help.
|
|
326
|
-
pairs: list[tuple[str, list[float]]] = []
|
|
327
|
-
label_counts: dict[str, int] = {}
|
|
328
|
-
for row in rows:
|
|
329
|
-
node_id = row["id"]
|
|
330
|
-
text = row["text"]
|
|
331
|
-
first_label = row["firstLabel"]
|
|
332
|
-
if not text:
|
|
333
|
-
continue
|
|
334
|
-
embedding = ollama_embed(text)
|
|
335
|
-
pairs.append((node_id, embedding))
|
|
336
|
-
label_counts[first_label] = label_counts.get(first_label, 0) + 1
|
|
337
|
-
|
|
338
|
-
if not pairs:
|
|
339
|
-
# Defensive: query said rows exist but all text was empty after
|
|
340
|
-
# the python read — means the COALESCE_TEXT predicate is wider
|
|
341
|
-
# than the python check. Stop to avoid an infinite loop.
|
|
342
|
-
sys.stderr.write("[embed-backfill] WARN: batch returned rows with empty text — stopping to avoid infinite loop\n")
|
|
343
|
-
break
|
|
344
|
-
|
|
345
|
-
# Build the WRITE batch as a Cypher literal payload rather than a
|
|
346
|
-
# `--param` map. cypher-shell's `--param` parses the value as a Cypher
|
|
347
|
-
# expression, and Cypher map keys must be bare identifiers (or backtick-
|
|
348
|
-
# quoted) — NOT double-quoted strings as JSON would emit. Interpolating
|
|
349
|
-
# bare-key map literals directly avoids the question entirely:
|
|
350
|
-
#
|
|
351
|
-
# UNWIND [{id: '4:abc:1', embedding: [0.1, 0.2, ...]}, ...] AS pair
|
|
352
|
-
# MATCH (n) WHERE elementId(n) = pair.id
|
|
353
|
-
# SET n.embedding = pair.embedding;
|
|
354
|
-
#
|
|
355
|
-
# cypher_string_literal escapes any backslash/quote in elementIds
|
|
356
|
-
# defensively; cypher_float_list serialises the embedding via repr()
|
|
357
|
-
# which Cypher accepts as a number literal.
|
|
358
|
-
pair_literals = ",".join(
|
|
359
|
-
f"{{id: {cypher_string_literal(node_id)}, embedding: {cypher_float_list(embedding)}}}"
|
|
360
|
-
for node_id, embedding in pairs
|
|
361
|
-
)
|
|
362
|
-
cypher(
|
|
363
|
-
f"""
|
|
364
|
-
UNWIND [{pair_literals}] AS pair
|
|
365
|
-
MATCH (n) WHERE elementId(n) = pair.id
|
|
366
|
-
SET n.embedding = pair.embedding;
|
|
367
|
-
"""
|
|
368
|
-
)
|
|
369
|
-
elapsed_ms = int((time.time() - batch_start) * 1000)
|
|
370
|
-
batch_index += 1
|
|
371
|
-
processed_total += len(pairs)
|
|
372
|
-
label_summary = ", ".join(f"{k}={v}" for k, v in sorted(label_counts.items()))
|
|
373
|
-
print(f"[embed-backfill] batch={batch_index} processed={len(pairs)} elapsed-ms={elapsed_ms} labels={label_summary}")
|
|
374
|
-
|
|
375
|
-
# Final remaining check — should be zero or the diff between original
|
|
376
|
-
# total and processed_total (e.g. if new writes landed mid-run).
|
|
377
|
-
final_out = cypher(COUNT_QUERY, {"registered": REGISTERED_LABELS})
|
|
378
|
-
final_remaining = 0
|
|
379
|
-
for row in parse_csv_rows(final_out):
|
|
380
|
-
final_remaining = int(row["remaining"])
|
|
381
|
-
print(f"[embed-backfill] done processed={processed_total} remaining={final_remaining}")
|
|
382
|
-
PYEOF
|