@semiont/jobs 0.5.5 → 0.5.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -49
- package/dist/index.d.ts +82 -101
- package/dist/index.js +430 -518
- package/dist/index.js.map +1 -1
- package/dist/worker-main.js +240 -304
- package/dist/worker-main.js.map +1 -1
- package/package.json +9 -5
package/dist/index.js
CHANGED
|
@@ -1,23 +1,30 @@
|
|
|
1
|
-
import { promises
|
|
1
|
+
import { promises } from 'fs';
|
|
2
2
|
import * as path from 'path';
|
|
3
|
-
import {
|
|
3
|
+
import { jobId, reconcileSelector, getLocaleEnglishName, didToAgent } from '@semiont/core';
|
|
4
4
|
import { generateAnnotationId } from '@semiont/event-sourcing';
|
|
5
5
|
|
|
6
6
|
// src/fs-job-queue.ts
|
|
7
|
+
var REANNOUNCE_INTERVAL_MS = 3e4;
|
|
8
|
+
var STALE_RUNNING_MS = 30 * 6e4;
|
|
9
|
+
var PROGRESS_WRITE_MIN_INTERVAL_MS = 5e3;
|
|
10
|
+
var RETENTION_HOURS = 24;
|
|
11
|
+
var CLEANUP_INTERVAL_MS = 36e5;
|
|
7
12
|
var FsJobQueue = class {
|
|
8
13
|
constructor(project, logger, eventBus) {
|
|
9
14
|
this.eventBus = eventBus;
|
|
10
15
|
this.jobsDir = project.jobsDir;
|
|
11
16
|
this.logger = logger;
|
|
12
17
|
}
|
|
18
|
+
eventBus;
|
|
13
19
|
jobsDir;
|
|
14
20
|
logger;
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
21
|
+
reannounceTimer = null;
|
|
22
|
+
cleanupTimer = null;
|
|
23
|
+
/** Per-job timestamp of the last progress write, for throttling. */
|
|
24
|
+
lastProgressWrite = /* @__PURE__ */ new Map();
|
|
19
25
|
/**
|
|
20
|
-
* Initialize job queue directories,
|
|
26
|
+
* Initialize job queue directories, announce any pending backlog,
|
|
27
|
+
* and start the re-announce interval. Idempotent.
|
|
21
28
|
*/
|
|
22
29
|
async initialize() {
|
|
23
30
|
const statuses = ["pending", "running", "complete", "failed", "cancelled"];
|
|
@@ -25,62 +32,83 @@ var FsJobQueue = class {
|
|
|
25
32
|
const dir = path.join(this.jobsDir, status);
|
|
26
33
|
await promises.mkdir(dir, { recursive: true });
|
|
27
34
|
}
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
35
|
+
if (this.eventBus && !this.reannounceTimer) {
|
|
36
|
+
await this.announcePendingJobs();
|
|
37
|
+
this.reannounceTimer = setInterval(() => {
|
|
38
|
+
this.announcePendingJobs().catch((error) => {
|
|
39
|
+
this.logger.warn("Pending-job re-announce failed", {
|
|
40
|
+
error: error instanceof Error ? error.message : String(error)
|
|
41
|
+
});
|
|
42
|
+
});
|
|
43
|
+
this.recoverStaleRunningJobs().catch((error) => {
|
|
44
|
+
this.logger.warn("Stale-running recovery failed", {
|
|
45
|
+
error: error instanceof Error ? error.message : String(error)
|
|
46
|
+
});
|
|
47
|
+
});
|
|
48
|
+
}, REANNOUNCE_INTERVAL_MS);
|
|
49
|
+
this.reannounceTimer.unref?.();
|
|
50
|
+
}
|
|
51
|
+
if (!this.cleanupTimer) {
|
|
52
|
+
this.cleanupTimer = setInterval(() => {
|
|
53
|
+
this.cleanupOldJobs(RETENTION_HOURS).catch((error) => {
|
|
54
|
+
this.logger.warn("Job retention cleanup failed", {
|
|
55
|
+
error: error instanceof Error ? error.message : String(error)
|
|
56
|
+
});
|
|
57
|
+
});
|
|
58
|
+
}, CLEANUP_INTERVAL_MS);
|
|
59
|
+
this.cleanupTimer.unref?.();
|
|
38
60
|
}
|
|
39
61
|
this.logger.info("Job queue initialized");
|
|
40
62
|
}
|
|
41
63
|
/**
|
|
42
|
-
*
|
|
64
|
+
* Stop the re-announce and retention intervals
|
|
43
65
|
*/
|
|
44
66
|
destroy() {
|
|
45
|
-
if (this.
|
|
46
|
-
this.
|
|
47
|
-
this.
|
|
67
|
+
if (this.reannounceTimer) {
|
|
68
|
+
clearInterval(this.reannounceTimer);
|
|
69
|
+
this.reannounceTimer = null;
|
|
48
70
|
}
|
|
49
|
-
if (this.
|
|
50
|
-
|
|
51
|
-
this.
|
|
71
|
+
if (this.cleanupTimer) {
|
|
72
|
+
clearInterval(this.cleanupTimer);
|
|
73
|
+
this.cleanupTimer = null;
|
|
52
74
|
}
|
|
53
75
|
}
|
|
54
76
|
/**
|
|
55
|
-
*
|
|
77
|
+
* Emit `job:queued` for a pending job, if an EventBus is wired and
|
|
78
|
+
* the job carries a `resourceId` (every current job type does).
|
|
56
79
|
*/
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
const content = await promises.readFile(path.join(pendingDir, file), "utf-8");
|
|
66
|
-
jobs.push(JSON.parse(content));
|
|
67
|
-
} catch {
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
this.pendingQueue = jobs;
|
|
71
|
-
} catch {
|
|
72
|
-
this.pendingQueue = [];
|
|
80
|
+
announce(job) {
|
|
81
|
+
if (this.eventBus && "params" in job && "resourceId" in job.params) {
|
|
82
|
+
this.eventBus.get("job:queued").next({
|
|
83
|
+
jobId: job.metadata.id,
|
|
84
|
+
jobType: job.metadata.type,
|
|
85
|
+
resourceId: job.params.resourceId,
|
|
86
|
+
userId: job.metadata.userId
|
|
87
|
+
});
|
|
73
88
|
}
|
|
74
89
|
}
|
|
75
90
|
/**
|
|
76
|
-
*
|
|
91
|
+
* Announce every job currently in `pending/`. Files that vanish or
|
|
92
|
+
* fail to parse mid-scan (claimed, cancelled, partially written)
|
|
93
|
+
* are skipped — they're either gone for a good reason or picked up
|
|
94
|
+
* on the next tick.
|
|
77
95
|
*/
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
await
|
|
83
|
-
}
|
|
96
|
+
async announcePendingJobs() {
|
|
97
|
+
const pendingDir = path.join(this.jobsDir, "pending");
|
|
98
|
+
let files;
|
|
99
|
+
try {
|
|
100
|
+
files = await promises.readdir(pendingDir);
|
|
101
|
+
} catch {
|
|
102
|
+
return;
|
|
103
|
+
}
|
|
104
|
+
files.sort();
|
|
105
|
+
for (const file of files) {
|
|
106
|
+
try {
|
|
107
|
+
const content = await promises.readFile(path.join(pendingDir, file), "utf-8");
|
|
108
|
+
this.announce(JSON.parse(content));
|
|
109
|
+
} catch {
|
|
110
|
+
}
|
|
111
|
+
}
|
|
84
112
|
}
|
|
85
113
|
/**
|
|
86
114
|
* Create a new job
|
|
@@ -90,16 +118,7 @@ var FsJobQueue = class {
|
|
|
90
118
|
await promises.writeFile(jobPath, JSON.stringify(job, null, 2), "utf-8");
|
|
91
119
|
this.logger.info("Job created", { jobId: job.metadata.id, status: job.status });
|
|
92
120
|
if (job.status === "pending") {
|
|
93
|
-
this.
|
|
94
|
-
this.pendingQueue.sort((a, b) => a.metadata.id.localeCompare(b.metadata.id));
|
|
95
|
-
}
|
|
96
|
-
if (this.eventBus && "params" in job && "resourceId" in job.params) {
|
|
97
|
-
this.eventBus.get("job:queued").next({
|
|
98
|
-
jobId: job.metadata.id,
|
|
99
|
-
jobType: job.metadata.type,
|
|
100
|
-
resourceId: job.params.resourceId,
|
|
101
|
-
userId: job.metadata.userId
|
|
102
|
-
});
|
|
121
|
+
this.announce(job);
|
|
103
122
|
}
|
|
104
123
|
}
|
|
105
124
|
/**
|
|
@@ -128,34 +147,92 @@ var FsJobQueue = class {
|
|
|
128
147
|
await promises.unlink(oldPath);
|
|
129
148
|
} catch (error) {
|
|
130
149
|
}
|
|
131
|
-
if (oldStatus === "pending") {
|
|
132
|
-
const idx = this.pendingQueue.findIndex((j) => j.metadata.id === job.metadata.id);
|
|
133
|
-
if (idx !== -1) this.pendingQueue.splice(idx, 1);
|
|
134
|
-
}
|
|
135
|
-
if (job.status === "pending") {
|
|
136
|
-
this.pendingQueue.push(job);
|
|
137
|
-
this.pendingQueue.sort((a, b) => a.metadata.id.localeCompare(b.metadata.id));
|
|
138
|
-
}
|
|
139
150
|
}
|
|
140
151
|
const newPath = this.getJobPath(job.metadata.id, job.status);
|
|
141
152
|
await promises.writeFile(newPath, JSON.stringify(job, null, 2), "utf-8");
|
|
142
153
|
if (oldStatus && oldStatus !== job.status) {
|
|
143
154
|
this.logger.info("Job moved", { jobId: job.metadata.id, oldStatus, newStatus: job.status });
|
|
155
|
+
if (job.status === "pending") {
|
|
156
|
+
this.announce(job);
|
|
157
|
+
}
|
|
144
158
|
} else {
|
|
145
159
|
this.logger.info("Job updated", { jobId: job.metadata.id, status: job.status });
|
|
146
160
|
}
|
|
147
161
|
}
|
|
148
162
|
/**
|
|
149
|
-
*
|
|
150
|
-
*
|
|
163
|
+
* Move a running job to `complete`. Returns false (and changes
|
|
164
|
+
* nothing) if the job is missing or not running — which also makes
|
|
165
|
+
* duplicate `job:complete` events harmless.
|
|
166
|
+
*/
|
|
167
|
+
async completeJob(jobId, result) {
|
|
168
|
+
const job = await this.getJob(jobId);
|
|
169
|
+
if (!job || job.status !== "running") {
|
|
170
|
+
return false;
|
|
171
|
+
}
|
|
172
|
+
this.lastProgressWrite.delete(jobId);
|
|
173
|
+
const completed = {
|
|
174
|
+
status: "complete",
|
|
175
|
+
metadata: job.metadata,
|
|
176
|
+
params: job.params,
|
|
177
|
+
startedAt: job.startedAt,
|
|
178
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
179
|
+
result
|
|
180
|
+
};
|
|
181
|
+
await this.updateJob(completed, "running");
|
|
182
|
+
return true;
|
|
183
|
+
}
|
|
184
|
+
/**
|
|
185
|
+
* Retry-or-fail a running job. While `retryCount < maxRetries` the
|
|
186
|
+
* job goes back to `pending` with the count bumped (and is
|
|
187
|
+
* re-announced); after that it lands in `failed` with the error.
|
|
188
|
+
* Returns null (and changes nothing) if the job isn't running.
|
|
189
|
+
*/
|
|
190
|
+
async failJob(jobId, error) {
|
|
191
|
+
const job = await this.getJob(jobId);
|
|
192
|
+
if (!job || job.status !== "running") {
|
|
193
|
+
return null;
|
|
194
|
+
}
|
|
195
|
+
this.lastProgressWrite.delete(jobId);
|
|
196
|
+
if (job.metadata.retryCount < job.metadata.maxRetries) {
|
|
197
|
+
const retried = {
|
|
198
|
+
status: "pending",
|
|
199
|
+
metadata: { ...job.metadata, retryCount: job.metadata.retryCount + 1 },
|
|
200
|
+
params: job.params
|
|
201
|
+
};
|
|
202
|
+
await this.updateJob(retried, "running");
|
|
203
|
+
return "retried";
|
|
204
|
+
}
|
|
205
|
+
const failed = {
|
|
206
|
+
status: "failed",
|
|
207
|
+
metadata: job.metadata,
|
|
208
|
+
params: job.params,
|
|
209
|
+
startedAt: job.startedAt,
|
|
210
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
211
|
+
error
|
|
212
|
+
};
|
|
213
|
+
await this.updateJob(failed, "running");
|
|
214
|
+
return "failed";
|
|
215
|
+
}
|
|
216
|
+
/**
|
|
217
|
+
* Write progress into a running job's file. Throttled per job, and
|
|
218
|
+
* a no-op for jobs that aren't running. Beyond surfacing live
|
|
219
|
+
* progress to `job:status-requested`, each write refreshes the
|
|
220
|
+
* file's mtime — the heartbeat `recoverStaleRunningJobs` watches.
|
|
151
221
|
*/
|
|
152
|
-
async
|
|
153
|
-
|
|
154
|
-
|
|
222
|
+
async recordProgress(jobId, progress) {
|
|
223
|
+
const now = Date.now();
|
|
224
|
+
const lastWrite = this.lastProgressWrite.get(jobId) ?? 0;
|
|
225
|
+
if (now - lastWrite < PROGRESS_WRITE_MIN_INTERVAL_MS) {
|
|
226
|
+
return;
|
|
155
227
|
}
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
228
|
+
this.lastProgressWrite.set(jobId, now);
|
|
229
|
+
const job = await this.getJob(jobId);
|
|
230
|
+
if (!job || job.status !== "running") {
|
|
231
|
+
this.lastProgressWrite.delete(jobId);
|
|
232
|
+
return;
|
|
233
|
+
}
|
|
234
|
+
const updated = { ...job, progress };
|
|
235
|
+
await promises.writeFile(this.getJobPath(jobId, "running"), JSON.stringify(updated, null, 2), "utf-8");
|
|
159
236
|
}
|
|
160
237
|
/**
|
|
161
238
|
* List jobs with filters
|
|
@@ -206,6 +283,63 @@ var FsJobQueue = class {
|
|
|
206
283
|
await this.updateJob(cancelledJob, oldStatus);
|
|
207
284
|
return true;
|
|
208
285
|
}
|
|
286
|
+
/**
|
|
287
|
+
* Cancel all pending jobs in a category — the granularity of the
|
|
288
|
+
* `job:cancel-requested` UI signal. Running jobs are left to finish:
|
|
289
|
+
* interrupting a worker mid-inference would need a worker-side kill
|
|
290
|
+
* channel that doesn't exist.
|
|
291
|
+
*/
|
|
292
|
+
async cancelPendingJobs(category) {
|
|
293
|
+
const matches = category === "generation" ? (type) => type === "generation" : (type) => type.endsWith("-annotation");
|
|
294
|
+
const pending = await this.listJobs({ status: "pending", limit: Number.MAX_SAFE_INTEGER });
|
|
295
|
+
let cancelled = 0;
|
|
296
|
+
for (const job of pending) {
|
|
297
|
+
if (!matches(job.metadata.type)) continue;
|
|
298
|
+
if (await this.cancelJob(job.metadata.id)) {
|
|
299
|
+
cancelled++;
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
if (cancelled > 0) {
|
|
303
|
+
this.logger.info("Cancelled pending jobs", { category, cancelled });
|
|
304
|
+
}
|
|
305
|
+
return cancelled;
|
|
306
|
+
}
|
|
307
|
+
/**
|
|
308
|
+
* Recover running jobs orphaned by a dead worker: any `running/`
|
|
309
|
+
* file whose mtime is older than the stale window is fed through
|
|
310
|
+
* the same retry-or-fail path as `job:fail`. Progress writes
|
|
311
|
+
* refresh the mtime, so a live worker is never recovered out from
|
|
312
|
+
* under itself as long as it reports within the window.
|
|
313
|
+
*/
|
|
314
|
+
async recoverStaleRunningJobs() {
|
|
315
|
+
const runningDir = path.join(this.jobsDir, "running");
|
|
316
|
+
let files;
|
|
317
|
+
try {
|
|
318
|
+
files = await promises.readdir(runningDir);
|
|
319
|
+
} catch {
|
|
320
|
+
return 0;
|
|
321
|
+
}
|
|
322
|
+
const now = Date.now();
|
|
323
|
+
let recovered = 0;
|
|
324
|
+
for (const file of files) {
|
|
325
|
+
if (!file.endsWith(".json")) continue;
|
|
326
|
+
try {
|
|
327
|
+
const stat = await promises.stat(path.join(runningDir, file));
|
|
328
|
+
if (now - stat.mtimeMs < STALE_RUNNING_MS) continue;
|
|
329
|
+
const staleId = jobId(file.slice(0, -".json".length));
|
|
330
|
+
const outcome = await this.failJob(
|
|
331
|
+
staleId,
|
|
332
|
+
`worker presumed dead \u2014 no progress within ${STALE_RUNNING_MS / 6e4} minutes`
|
|
333
|
+
);
|
|
334
|
+
if (outcome) {
|
|
335
|
+
this.logger.warn("Recovered stale running job", { jobId: staleId, outcome });
|
|
336
|
+
recovered++;
|
|
337
|
+
}
|
|
338
|
+
} catch {
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
return recovered;
|
|
342
|
+
}
|
|
209
343
|
/**
|
|
210
344
|
* Clean up old completed/failed jobs (older than retention period)
|
|
211
345
|
*/
|
|
@@ -269,156 +403,6 @@ var FsJobQueue = class {
|
|
|
269
403
|
}
|
|
270
404
|
};
|
|
271
405
|
|
|
272
|
-
// src/job-worker.ts
|
|
273
|
-
var JobWorker = class {
|
|
274
|
-
running = false;
|
|
275
|
-
currentJob = null;
|
|
276
|
-
pollIntervalMs;
|
|
277
|
-
errorBackoffMs;
|
|
278
|
-
jobQueue;
|
|
279
|
-
logger;
|
|
280
|
-
constructor(jobQueue, pollIntervalMs = 1e3, errorBackoffMs = 5e3, logger) {
|
|
281
|
-
this.jobQueue = jobQueue;
|
|
282
|
-
this.pollIntervalMs = pollIntervalMs;
|
|
283
|
-
this.errorBackoffMs = errorBackoffMs;
|
|
284
|
-
this.logger = logger;
|
|
285
|
-
}
|
|
286
|
-
/**
|
|
287
|
-
* Start the worker (polls queue in loop)
|
|
288
|
-
*/
|
|
289
|
-
async start() {
|
|
290
|
-
this.running = true;
|
|
291
|
-
this.logger.info("Worker started", { worker: this.getWorkerName() });
|
|
292
|
-
while (this.running) {
|
|
293
|
-
try {
|
|
294
|
-
const job = await this.pollNextJob();
|
|
295
|
-
if (job) {
|
|
296
|
-
await this.processJob(job);
|
|
297
|
-
} else {
|
|
298
|
-
await this.sleep(this.pollIntervalMs);
|
|
299
|
-
}
|
|
300
|
-
} catch (error) {
|
|
301
|
-
this.logger.error("Error in worker main loop", { worker: this.getWorkerName(), error: error instanceof Error ? error.message : String(error) });
|
|
302
|
-
await this.sleep(this.errorBackoffMs);
|
|
303
|
-
}
|
|
304
|
-
}
|
|
305
|
-
this.logger.info("Worker stopped", { worker: this.getWorkerName() });
|
|
306
|
-
}
|
|
307
|
-
/**
|
|
308
|
-
* Stop the worker (graceful shutdown)
|
|
309
|
-
*/
|
|
310
|
-
async stop() {
|
|
311
|
-
this.logger.info("Stopping worker", { worker: this.getWorkerName() });
|
|
312
|
-
this.running = false;
|
|
313
|
-
const timeout = 6e4;
|
|
314
|
-
const startTime = Date.now();
|
|
315
|
-
while (this.currentJob && Date.now() - startTime < timeout) {
|
|
316
|
-
await this.sleep(100);
|
|
317
|
-
}
|
|
318
|
-
if (this.currentJob) {
|
|
319
|
-
this.logger.warn("Forced worker shutdown", { worker: this.getWorkerName(), jobId: this.currentJob.metadata.id });
|
|
320
|
-
}
|
|
321
|
-
}
|
|
322
|
-
/**
|
|
323
|
-
* Poll for next job to process
|
|
324
|
-
*/
|
|
325
|
-
async pollNextJob() {
|
|
326
|
-
return this.jobQueue.pollNextPendingJob((job) => this.canProcessJob(job));
|
|
327
|
-
}
|
|
328
|
-
/**
|
|
329
|
-
* Process a job (handles state transitions and error handling)
|
|
330
|
-
*/
|
|
331
|
-
async processJob(job) {
|
|
332
|
-
this.currentJob = job;
|
|
333
|
-
try {
|
|
334
|
-
if (job.status !== "pending") {
|
|
335
|
-
this.logger.warn("Skipping non-pending job", { worker: this.getWorkerName(), jobId: job.metadata.id, status: job.status });
|
|
336
|
-
return;
|
|
337
|
-
}
|
|
338
|
-
const runningJob = {
|
|
339
|
-
status: "running",
|
|
340
|
-
metadata: job.metadata,
|
|
341
|
-
params: job.params,
|
|
342
|
-
startedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
343
|
-
progress: {}
|
|
344
|
-
// Initialize with empty progress
|
|
345
|
-
};
|
|
346
|
-
await this.jobQueue.updateJob(runningJob, "pending");
|
|
347
|
-
this.logger.info("Processing job", { worker: this.getWorkerName(), jobId: job.metadata.id, jobType: job.metadata.type });
|
|
348
|
-
const result = await this.executeJob(runningJob);
|
|
349
|
-
await this.emitCompletionEvent(runningJob, result);
|
|
350
|
-
const completeJob = {
|
|
351
|
-
status: "complete",
|
|
352
|
-
metadata: runningJob.metadata,
|
|
353
|
-
params: runningJob.params,
|
|
354
|
-
startedAt: runningJob.startedAt,
|
|
355
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
356
|
-
result: result ?? {}
|
|
357
|
-
// Use returned result or empty object
|
|
358
|
-
};
|
|
359
|
-
await this.jobQueue.updateJob(completeJob, "running");
|
|
360
|
-
this.logger.info("Job completed successfully", { worker: this.getWorkerName(), jobId: job.metadata.id });
|
|
361
|
-
} catch (error) {
|
|
362
|
-
await this.handleJobFailure(job, error);
|
|
363
|
-
} finally {
|
|
364
|
-
this.currentJob = null;
|
|
365
|
-
}
|
|
366
|
-
}
|
|
367
|
-
/**
|
|
368
|
-
* Handle job failure (retry or move to failed)
|
|
369
|
-
*/
|
|
370
|
-
async handleJobFailure(job, error) {
|
|
371
|
-
const updatedMetadata = {
|
|
372
|
-
...job.metadata,
|
|
373
|
-
retryCount: job.metadata.retryCount + 1
|
|
374
|
-
};
|
|
375
|
-
if (updatedMetadata.retryCount < updatedMetadata.maxRetries) {
|
|
376
|
-
this.logger.info("Job failed, will retry", { worker: this.getWorkerName(), jobId: job.metadata.id, retryCount: updatedMetadata.retryCount, maxRetries: updatedMetadata.maxRetries });
|
|
377
|
-
this.logger.debug("Job error details", { error: error instanceof Error ? error.message : String(error), stack: error instanceof Error ? error.stack : void 0 });
|
|
378
|
-
const retryJob = {
|
|
379
|
-
status: "pending",
|
|
380
|
-
metadata: updatedMetadata,
|
|
381
|
-
params: job.status === "pending" ? job.params : job.params
|
|
382
|
-
};
|
|
383
|
-
await this.jobQueue.updateJob(retryJob, job.status);
|
|
384
|
-
} else {
|
|
385
|
-
this.logger.error("Job failed permanently", { worker: this.getWorkerName(), jobId: job.metadata.id, retryCount: updatedMetadata.retryCount });
|
|
386
|
-
this.logger.error("Job error details", { error: error instanceof Error ? error.message : String(error), stack: error instanceof Error ? error.stack : void 0 });
|
|
387
|
-
const failedJob = {
|
|
388
|
-
status: "failed",
|
|
389
|
-
metadata: updatedMetadata,
|
|
390
|
-
params: job.status === "pending" ? job.params : job.params,
|
|
391
|
-
startedAt: job.status === "running" ? job.startedAt : void 0,
|
|
392
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
393
|
-
error: error instanceof Error ? error.message : String(error)
|
|
394
|
-
};
|
|
395
|
-
await this.jobQueue.updateJob(failedJob, job.status);
|
|
396
|
-
}
|
|
397
|
-
}
|
|
398
|
-
/**
|
|
399
|
-
* Update job progress (best-effort, doesn't throw)
|
|
400
|
-
*/
|
|
401
|
-
async updateJobProgress(job) {
|
|
402
|
-
try {
|
|
403
|
-
await this.jobQueue.updateJob(job);
|
|
404
|
-
} catch (error) {
|
|
405
|
-
this.logger.warn("Failed to update job progress", { worker: this.getWorkerName(), error: error instanceof Error ? error.message : String(error) });
|
|
406
|
-
}
|
|
407
|
-
}
|
|
408
|
-
/**
|
|
409
|
-
* Sleep utility
|
|
410
|
-
*/
|
|
411
|
-
sleep(ms) {
|
|
412
|
-
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
413
|
-
}
|
|
414
|
-
/**
|
|
415
|
-
* Emit completion event (optional hook for subclasses)
|
|
416
|
-
* Override this to emit job-specific completion events (e.g., job.completed)
|
|
417
|
-
*/
|
|
418
|
-
async emitCompletionEvent(_job, _result) {
|
|
419
|
-
}
|
|
420
|
-
};
|
|
421
|
-
|
|
422
406
|
// src/types.ts
|
|
423
407
|
function isPendingJob(job) {
|
|
424
408
|
return job.status === "pending";
|
|
@@ -480,17 +464,15 @@ ${content.substring(0, 8e3)}
|
|
|
480
464
|
|
|
481
465
|
Return a JSON array of comments. Each comment must have:
|
|
482
466
|
- "exact": the exact text passage being commented on (quoted verbatim from source)
|
|
483
|
-
- "
|
|
484
|
-
- "
|
|
485
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
486
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
467
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
468
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
487
469
|
- "comment": your comment following the instructions above
|
|
488
470
|
|
|
489
471
|
Respond with a valid JSON array.
|
|
490
472
|
|
|
491
473
|
Example:
|
|
492
474
|
[
|
|
493
|
-
{"exact": "the quarterly review meeting", "
|
|
475
|
+
{"exact": "the quarterly review meeting", "prefix": "We need to schedule ", "suffix": " for next month.", "comment": "Who will lead this? Should we invite the external auditors?"}
|
|
494
476
|
]`;
|
|
495
477
|
} else {
|
|
496
478
|
const toneGuidance = tone ? `
|
|
@@ -516,17 +498,15 @@ ${content.substring(0, 8e3)}
|
|
|
516
498
|
|
|
517
499
|
Return a JSON array of comments. Each comment should have:
|
|
518
500
|
- "exact": the exact text passage being commented on (quoted verbatim from source)
|
|
519
|
-
- "
|
|
520
|
-
- "
|
|
521
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
522
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
501
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
502
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
523
503
|
- "comment": your explanatory comment (1-3 sentences, provide context/background/clarification)
|
|
524
504
|
|
|
525
505
|
Respond with a valid JSON array.
|
|
526
506
|
|
|
527
507
|
Example format:
|
|
528
508
|
[
|
|
529
|
-
{"exact": "Ouranos", "
|
|
509
|
+
{"exact": "Ouranos", "prefix": "In the beginning, ", "suffix": " ruled the universe", "comment": "Ouranos (also spelled Uranus) is the primordial Greek deity personifying the sky. In Hesiod's Theogony, he is the son and husband of Gaia (Earth) and father of the Titans."}
|
|
530
510
|
]`;
|
|
531
511
|
}
|
|
532
512
|
return prompt;
|
|
@@ -557,16 +537,14 @@ ${content.substring(0, 8e3)}
|
|
|
557
537
|
|
|
558
538
|
Return a JSON array of highlights. Each highlight must have:
|
|
559
539
|
- "exact": the exact text passage to highlight (quoted verbatim from source)
|
|
560
|
-
- "
|
|
561
|
-
- "
|
|
562
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
563
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
540
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
541
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
564
542
|
|
|
565
543
|
Respond with a valid JSON array.
|
|
566
544
|
|
|
567
545
|
Example:
|
|
568
546
|
[
|
|
569
|
-
{"exact": "revenue grew 45% year-over-year", "
|
|
547
|
+
{"exact": "revenue grew 45% year-over-year", "prefix": "In Q3 2024, ", "suffix": ", exceeding all forecasts."}
|
|
570
548
|
]`;
|
|
571
549
|
} else {
|
|
572
550
|
const densityGuidance = density ? `
|
|
@@ -590,16 +568,14 @@ ${content.substring(0, 8e3)}
|
|
|
590
568
|
|
|
591
569
|
Return a JSON array of highlights. Each highlight should have:
|
|
592
570
|
- "exact": the exact text passage to highlight (quoted verbatim from source)
|
|
593
|
-
- "
|
|
594
|
-
- "
|
|
595
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
596
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
571
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
572
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
597
573
|
|
|
598
574
|
Respond with a valid JSON array.
|
|
599
575
|
|
|
600
576
|
Example format:
|
|
601
577
|
[
|
|
602
|
-
{"exact": "we will discontinue support for legacy systems by March 2025", "
|
|
578
|
+
{"exact": "we will discontinue support for legacy systems by March 2025", "prefix": "After careful consideration, ", "suffix": ". This decision affects"}
|
|
603
579
|
]`;
|
|
604
580
|
}
|
|
605
581
|
return prompt;
|
|
@@ -633,17 +609,15 @@ ${content.substring(0, 8e3)}
|
|
|
633
609
|
|
|
634
610
|
Return a JSON array of assessments. Each assessment must have:
|
|
635
611
|
- "exact": the exact text passage being assessed (quoted verbatim from source)
|
|
636
|
-
- "
|
|
637
|
-
- "
|
|
638
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
639
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
612
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
613
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
640
614
|
- "assessment": your assessment following the instructions above
|
|
641
615
|
|
|
642
616
|
Respond with a valid JSON array.
|
|
643
617
|
|
|
644
618
|
Example:
|
|
645
619
|
[
|
|
646
|
-
{"exact": "the quarterly revenue target", "
|
|
620
|
+
{"exact": "the quarterly revenue target", "prefix": "We established ", "suffix": " for Q4 2024.", "assessment": "This target seems ambitious given market conditions. Consider revising based on recent trends."}
|
|
647
621
|
]`;
|
|
648
622
|
} else {
|
|
649
623
|
const toneGuidance = tone ? `
|
|
@@ -669,17 +643,15 @@ ${content.substring(0, 8e3)}
|
|
|
669
643
|
|
|
670
644
|
Return a JSON array of assessments. Each assessment should have:
|
|
671
645
|
- "exact": the exact text passage being assessed (quoted verbatim from source)
|
|
672
|
-
- "
|
|
673
|
-
- "
|
|
674
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
675
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
646
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
647
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
676
648
|
- "assessment": your analytical assessment (1-3 sentences, evaluate validity/strength/implications)
|
|
677
649
|
|
|
678
650
|
Respond with a valid JSON array.
|
|
679
651
|
|
|
680
652
|
Example format:
|
|
681
653
|
[
|
|
682
|
-
{"exact": "AI will replace most jobs by 2030", "
|
|
654
|
+
{"exact": "AI will replace most jobs by 2030", "prefix": "Many experts predict that ", "suffix": ", fundamentally reshaping", "assessment": "This claim lacks nuance and supporting evidence. Employment patterns historically show job transformation rather than wholesale replacement. The timeline appears speculative without specific sector analysis."}
|
|
683
655
|
]`;
|
|
684
656
|
}
|
|
685
657
|
return prompt;
|
|
@@ -725,17 +697,15 @@ ${content}
|
|
|
725
697
|
|
|
726
698
|
Return a JSON array of tags. Each tag should have:
|
|
727
699
|
- "exact": the exact text passage (quoted verbatim from source)
|
|
728
|
-
- "
|
|
729
|
-
- "
|
|
730
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
731
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
700
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
701
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
732
702
|
|
|
733
703
|
Respond with a valid JSON array.
|
|
734
704
|
|
|
735
705
|
Example format:
|
|
736
706
|
[
|
|
737
|
-
{"exact": "What duty did the defendant owe?", "
|
|
738
|
-
{"exact": "In tort law, a duty of care is established when...", "
|
|
707
|
+
{"exact": "What duty did the defendant owe?", "prefix": "The central question is: ", "suffix": " This question must be"},
|
|
708
|
+
{"exact": "In tort law, a duty of care is established when...", "prefix": "Legal framework:\\n", "suffix": "\\n\\nApplying this standard"}
|
|
739
709
|
]`;
|
|
740
710
|
return prompt;
|
|
741
711
|
}
|
|
@@ -803,23 +773,29 @@ var MotivationParsers = class {
|
|
|
803
773
|
try {
|
|
804
774
|
const parsed = extractObjectsFromArray(response);
|
|
805
775
|
const valid = parsed.filter(
|
|
806
|
-
(c) => !!c && typeof c === "object" && typeof c.exact === "string" && typeof c.
|
|
776
|
+
(c) => !!c && typeof c === "object" && typeof c.exact === "string" && typeof c.comment === "string" && c.comment.trim().length > 0
|
|
807
777
|
);
|
|
808
778
|
console.log(`[MotivationParsers] Parsed ${valid.length} valid comments from ${parsed.length} total`);
|
|
809
779
|
const validatedComments = [];
|
|
810
780
|
for (const comment of valid) {
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
});
|
|
820
|
-
} catch (error) {
|
|
821
|
-
console.warn(`[MotivationParsers] Skipping invalid comment "${comment.exact}":`, error);
|
|
781
|
+
const reconciled = reconcileSelector(content, {
|
|
782
|
+
exact: comment.exact,
|
|
783
|
+
...typeof comment.prefix === "string" ? { prefix: comment.prefix } : {},
|
|
784
|
+
...typeof comment.suffix === "string" ? { suffix: comment.suffix } : {}
|
|
785
|
+
});
|
|
786
|
+
if (!reconciled) {
|
|
787
|
+
console.warn(`[MotivationParsers] Dropped hallucinated comment "${comment.exact}"`);
|
|
788
|
+
continue;
|
|
822
789
|
}
|
|
790
|
+
logAnchorMethod("comment", comment.exact, reconciled.anchorMethod);
|
|
791
|
+
validatedComments.push({
|
|
792
|
+
comment: comment.comment,
|
|
793
|
+
exact: reconciled.exact,
|
|
794
|
+
start: reconciled.start,
|
|
795
|
+
end: reconciled.end,
|
|
796
|
+
...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
|
|
797
|
+
...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
|
|
798
|
+
});
|
|
823
799
|
}
|
|
824
800
|
return validatedComments;
|
|
825
801
|
} catch (error) {
|
|
@@ -838,22 +814,27 @@ var MotivationParsers = class {
|
|
|
838
814
|
try {
|
|
839
815
|
const parsed = extractObjectsFromArray(response);
|
|
840
816
|
const highlights = parsed.filter(
|
|
841
|
-
(h) => !!h && typeof h === "object" && typeof h.exact === "string"
|
|
817
|
+
(h) => !!h && typeof h === "object" && typeof h.exact === "string"
|
|
842
818
|
);
|
|
843
819
|
const validatedHighlights = [];
|
|
844
820
|
for (const highlight of highlights) {
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
});
|
|
854
|
-
} catch (error) {
|
|
855
|
-
console.warn(`[MotivationParsers] Skipping invalid highlight "${highlight.exact}":`, error);
|
|
821
|
+
const reconciled = reconcileSelector(content, {
|
|
822
|
+
exact: highlight.exact,
|
|
823
|
+
...typeof highlight.prefix === "string" ? { prefix: highlight.prefix } : {},
|
|
824
|
+
...typeof highlight.suffix === "string" ? { suffix: highlight.suffix } : {}
|
|
825
|
+
});
|
|
826
|
+
if (!reconciled) {
|
|
827
|
+
console.warn(`[MotivationParsers] Dropped hallucinated highlight "${highlight.exact}"`);
|
|
828
|
+
continue;
|
|
856
829
|
}
|
|
830
|
+
logAnchorMethod("highlight", highlight.exact, reconciled.anchorMethod);
|
|
831
|
+
validatedHighlights.push({
|
|
832
|
+
exact: reconciled.exact,
|
|
833
|
+
start: reconciled.start,
|
|
834
|
+
end: reconciled.end,
|
|
835
|
+
...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
|
|
836
|
+
...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
|
|
837
|
+
});
|
|
857
838
|
}
|
|
858
839
|
return validatedHighlights;
|
|
859
840
|
} catch (error) {
|
|
@@ -873,22 +854,28 @@ var MotivationParsers = class {
|
|
|
873
854
|
try {
|
|
874
855
|
const parsed = extractObjectsFromArray(response);
|
|
875
856
|
const assessments = parsed.filter(
|
|
876
|
-
(a) => !!a && typeof a === "object" && typeof a.exact === "string" && typeof a.
|
|
857
|
+
(a) => !!a && typeof a === "object" && typeof a.exact === "string" && typeof a.assessment === "string"
|
|
877
858
|
);
|
|
878
859
|
const validatedAssessments = [];
|
|
879
860
|
for (const assessment of assessments) {
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
});
|
|
889
|
-
} catch (error) {
|
|
890
|
-
console.warn(`[MotivationParsers] Skipping invalid assessment "${assessment.exact}":`, error);
|
|
861
|
+
const reconciled = reconcileSelector(content, {
|
|
862
|
+
exact: assessment.exact,
|
|
863
|
+
...typeof assessment.prefix === "string" ? { prefix: assessment.prefix } : {},
|
|
864
|
+
...typeof assessment.suffix === "string" ? { suffix: assessment.suffix } : {}
|
|
865
|
+
});
|
|
866
|
+
if (!reconciled) {
|
|
867
|
+
console.warn(`[MotivationParsers] Dropped hallucinated assessment "${assessment.exact}"`);
|
|
868
|
+
continue;
|
|
891
869
|
}
|
|
870
|
+
logAnchorMethod("assessment", assessment.exact, reconciled.anchorMethod);
|
|
871
|
+
validatedAssessments.push({
|
|
872
|
+
assessment: assessment.assessment,
|
|
873
|
+
exact: reconciled.exact,
|
|
874
|
+
start: reconciled.start,
|
|
875
|
+
end: reconciled.end,
|
|
876
|
+
...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
|
|
877
|
+
...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
|
|
878
|
+
});
|
|
892
879
|
}
|
|
893
880
|
return validatedAssessments;
|
|
894
881
|
} catch (error) {
|
|
@@ -898,17 +885,15 @@ var MotivationParsers = class {
|
|
|
898
885
|
}
|
|
899
886
|
}
|
|
900
887
|
/**
|
|
901
|
-
* Parse
|
|
902
|
-
*
|
|
903
|
-
*
|
|
904
|
-
* @param response - Raw AI response string (may include markdown code fences)
|
|
905
|
-
* @returns Array of tag matches (offsets not yet validated)
|
|
888
|
+
* Parse the LLM's tag response into raw, pre-reconciliation tag inputs.
|
|
889
|
+
* Reconciliation happens in `validateTagOffsets`, which adds `start`/`end`
|
|
890
|
+
* by anchoring `exact` against the source content.
|
|
906
891
|
*/
|
|
907
892
|
static parseTags(response) {
|
|
908
893
|
try {
|
|
909
894
|
const parsed = extractObjectsFromArray(response);
|
|
910
895
|
const valid = parsed.filter(
|
|
911
|
-
(t) => !!t && typeof t === "object" && typeof t.exact === "string" &&
|
|
896
|
+
(t) => !!t && typeof t === "object" && typeof t.exact === "string" && t.exact.trim().length > 0
|
|
912
897
|
);
|
|
913
898
|
console.log(`[MotivationParsers] Parsed ${valid.length} valid tags from ${parsed.length} total`);
|
|
914
899
|
return valid;
|
|
@@ -918,52 +903,41 @@ var MotivationParsers = class {
|
|
|
918
903
|
}
|
|
919
904
|
}
|
|
920
905
|
/**
|
|
921
|
-
*
|
|
922
|
-
* Helper for tag detection after initial parsing
|
|
923
|
-
*
|
|
924
|
-
* @param tags - Parsed tags without validated offsets
|
|
925
|
-
* @param content - Original content to validate against
|
|
926
|
-
* @param category - Category to assign to validated tags
|
|
927
|
-
* @returns Array of validated tag matches
|
|
906
|
+
* Anchor raw tag inputs against source content and add category.
|
|
928
907
|
*/
|
|
929
908
|
static validateTagOffsets(tags, content, category) {
|
|
930
909
|
const validatedTags = [];
|
|
931
910
|
for (const tag of tags) {
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
suffix: validated.suffix
|
|
941
|
-
});
|
|
942
|
-
} catch (error) {
|
|
943
|
-
console.warn(`[MotivationParsers] Skipping invalid tag for category "${category}":`, error);
|
|
911
|
+
const reconciled = reconcileSelector(content, {
|
|
912
|
+
exact: tag.exact,
|
|
913
|
+
...typeof tag.prefix === "string" ? { prefix: tag.prefix } : {},
|
|
914
|
+
...typeof tag.suffix === "string" ? { suffix: tag.suffix } : {}
|
|
915
|
+
});
|
|
916
|
+
if (!reconciled) {
|
|
917
|
+
console.warn(`[MotivationParsers] Dropped hallucinated tag "${tag.exact}" for category "${category}"`);
|
|
918
|
+
continue;
|
|
944
919
|
}
|
|
920
|
+
logAnchorMethod("tag", tag.exact, reconciled.anchorMethod);
|
|
921
|
+
validatedTags.push({
|
|
922
|
+
category,
|
|
923
|
+
exact: reconciled.exact,
|
|
924
|
+
start: reconciled.start,
|
|
925
|
+
end: reconciled.end,
|
|
926
|
+
...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
|
|
927
|
+
...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
|
|
928
|
+
});
|
|
945
929
|
}
|
|
946
930
|
return validatedTags;
|
|
947
931
|
}
|
|
948
932
|
};
|
|
933
|
+
function logAnchorMethod(motivation, exact, anchorMethod) {
|
|
934
|
+
if (anchorMethod === "first-of-many" || anchorMethod === "fuzzy-match") {
|
|
935
|
+
console.warn(`[MotivationParsers] ${motivation} anchored via ${anchorMethod}: "${exact}"`);
|
|
936
|
+
}
|
|
937
|
+
}
|
|
949
938
|
|
|
950
939
|
// src/workers/annotation-detection.ts
|
|
951
940
|
var AnnotationDetection = class {
|
|
952
|
-
/**
|
|
953
|
-
* Fetch content from a ContentFetcher and read the stream to a string.
|
|
954
|
-
* Shared helper for all workers.
|
|
955
|
-
*/
|
|
956
|
-
static async fetchContent(contentFetcher, resourceId) {
|
|
957
|
-
const stream = await contentFetcher(resourceId);
|
|
958
|
-
if (!stream) {
|
|
959
|
-
throw new Error(`Could not load content for resource ${resourceId}`);
|
|
960
|
-
}
|
|
961
|
-
const chunks = [];
|
|
962
|
-
for await (const chunk of stream) {
|
|
963
|
-
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
|
|
964
|
-
}
|
|
965
|
-
return Buffer.concat(chunks).toString("utf-8");
|
|
966
|
-
}
|
|
967
941
|
/**
|
|
968
942
|
* Detect comments in content.
|
|
969
943
|
*
|
|
@@ -1073,17 +1047,15 @@ ${exact}
|
|
|
1073
1047
|
"""
|
|
1074
1048
|
|
|
1075
1049
|
Respond with a JSON array of entities found. Each entity should have:
|
|
1076
|
-
- exact: the exact text span from the input
|
|
1050
|
+
- exact: the exact text span from the input (quoted verbatim \u2014 character-for-character)
|
|
1077
1051
|
- entityType: one of the provided entity types
|
|
1078
|
-
-
|
|
1079
|
-
-
|
|
1080
|
-
- prefix: up to 32 characters of text immediately before the entity (helps identify correct occurrence)
|
|
1081
|
-
- suffix: up to 32 characters of text immediately after the entity (helps identify correct occurrence)
|
|
1052
|
+
- prefix: up to 64 characters of text immediately before the entity (used to disambiguate when the same text appears more than once)
|
|
1053
|
+
- suffix: up to 64 characters of text immediately after the entity (same purpose)
|
|
1082
1054
|
|
|
1083
1055
|
If no entities are found, respond with an empty array [].
|
|
1084
1056
|
|
|
1085
1057
|
Example output:
|
|
1086
|
-
[{"exact":"Alice","entityType":"Person","
|
|
1058
|
+
[{"exact":"Alice","entityType":"Person","prefix":"","suffix":" went to"},{"exact":"Paris","entityType":"Location","prefix":"went to ","suffix":" yesterday"}]`;
|
|
1087
1059
|
logger.debug("Sending entity extraction request", { entityTypes: entityTypesDescription });
|
|
1088
1060
|
const response = await client.generateTextWithMetadata(
|
|
1089
1061
|
prompt,
|
|
@@ -1112,151 +1084,18 @@ Example output:
|
|
|
1112
1084
|
logger.error(errorMsg);
|
|
1113
1085
|
throw new Error(errorMsg);
|
|
1114
1086
|
}
|
|
1115
|
-
return entities.
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
index: idx + 1,
|
|
1120
|
-
total: entities.length,
|
|
1121
|
-
type: entity.entityType,
|
|
1122
|
-
text: entity.exact,
|
|
1123
|
-
offsetsFromAI: `[${start}:${end}]`
|
|
1124
|
-
});
|
|
1125
|
-
const extractedText = exact.substring(start, end);
|
|
1126
|
-
let anchorMethod;
|
|
1127
|
-
if (extractedText === entity.exact) {
|
|
1128
|
-
anchorMethod = "llm-exact";
|
|
1129
|
-
logger.debug("Entity anchored", {
|
|
1130
|
-
text: entity.exact,
|
|
1131
|
-
entityType: entity.entityType,
|
|
1132
|
-
anchorMethod
|
|
1133
|
-
});
|
|
1134
|
-
} else {
|
|
1135
|
-
logger.debug("LLM offsets mismatch \u2014 attempting re-anchor", {
|
|
1136
|
-
expected: entity.exact,
|
|
1137
|
-
llmOffsets: `[${start}:${end}]`,
|
|
1138
|
-
foundAtLlmOffsets: extractedText
|
|
1139
|
-
});
|
|
1140
|
-
let occurrenceCount = 0;
|
|
1141
|
-
let firstOccurrence = -1;
|
|
1142
|
-
let searchPos = 0;
|
|
1143
|
-
while ((searchPos = exact.indexOf(entity.exact, searchPos)) !== -1) {
|
|
1144
|
-
if (firstOccurrence === -1) firstOccurrence = searchPos;
|
|
1145
|
-
occurrenceCount++;
|
|
1146
|
-
searchPos++;
|
|
1147
|
-
}
|
|
1148
|
-
if (occurrenceCount === 0) {
|
|
1149
|
-
anchorMethod = "dropped";
|
|
1150
|
-
logger.error("Entity text not found in resource \u2014 dropping", {
|
|
1151
|
-
text: entity.exact,
|
|
1152
|
-
entityType: entity.entityType,
|
|
1153
|
-
llmOffsets: `[${start}:${end}]`,
|
|
1154
|
-
anchorMethod,
|
|
1155
|
-
resourceStart: exact.substring(0, 200)
|
|
1156
|
-
});
|
|
1157
|
-
return null;
|
|
1158
|
-
}
|
|
1159
|
-
let recoveredOffset = -1;
|
|
1160
|
-
if (entity.prefix || entity.suffix) {
|
|
1161
|
-
let p = 0;
|
|
1162
|
-
while ((p = exact.indexOf(entity.exact, p)) !== -1) {
|
|
1163
|
-
const candidatePrefix = exact.substring(Math.max(0, p - 32), p);
|
|
1164
|
-
const candidateSuffix = exact.substring(
|
|
1165
|
-
p + entity.exact.length,
|
|
1166
|
-
Math.min(exact.length, p + entity.exact.length + 32)
|
|
1167
|
-
);
|
|
1168
|
-
const prefixMatch = !entity.prefix || candidatePrefix.endsWith(entity.prefix);
|
|
1169
|
-
const suffixMatch = !entity.suffix || candidateSuffix.startsWith(entity.suffix);
|
|
1170
|
-
if (prefixMatch && suffixMatch) {
|
|
1171
|
-
recoveredOffset = p;
|
|
1172
|
-
break;
|
|
1173
|
-
}
|
|
1174
|
-
p++;
|
|
1175
|
-
}
|
|
1176
|
-
}
|
|
1177
|
-
if (recoveredOffset !== -1) {
|
|
1178
|
-
anchorMethod = "context-recovered";
|
|
1179
|
-
start = recoveredOffset;
|
|
1180
|
-
end = recoveredOffset + entity.exact.length;
|
|
1181
|
-
logger.debug("Entity anchored", {
|
|
1182
|
-
text: entity.exact,
|
|
1183
|
-
entityType: entity.entityType,
|
|
1184
|
-
anchorMethod,
|
|
1185
|
-
offsetDiff: recoveredOffset - entity.startOffset
|
|
1186
|
-
});
|
|
1187
|
-
} else if (occurrenceCount === 1) {
|
|
1188
|
-
anchorMethod = "unique-match";
|
|
1189
|
-
start = firstOccurrence;
|
|
1190
|
-
end = firstOccurrence + entity.exact.length;
|
|
1191
|
-
logger.debug("Entity anchored", {
|
|
1192
|
-
text: entity.exact,
|
|
1193
|
-
entityType: entity.entityType,
|
|
1194
|
-
anchorMethod,
|
|
1195
|
-
offsetDiff: firstOccurrence - entity.startOffset
|
|
1196
|
-
});
|
|
1197
|
-
} else {
|
|
1198
|
-
anchorMethod = "first-of-many";
|
|
1199
|
-
start = firstOccurrence;
|
|
1200
|
-
end = firstOccurrence + entity.exact.length;
|
|
1201
|
-
logger.warn("Entity anchored at first of multiple occurrences \u2014 may be wrong", {
|
|
1202
|
-
text: entity.exact,
|
|
1203
|
-
entityType: entity.entityType,
|
|
1204
|
-
anchorMethod,
|
|
1205
|
-
occurrenceCount,
|
|
1206
|
-
chosenOffset: firstOccurrence,
|
|
1207
|
-
llmOffsets: `[${entity.startOffset}:${entity.endOffset}]`,
|
|
1208
|
-
hasPrefix: !!entity.prefix,
|
|
1209
|
-
hasSuffix: !!entity.suffix
|
|
1210
|
-
});
|
|
1211
|
-
}
|
|
1087
|
+
return entities.filter((e) => {
|
|
1088
|
+
const ok = e && typeof e === "object" && typeof e.exact === "string" && typeof e.entityType === "string";
|
|
1089
|
+
if (!ok) {
|
|
1090
|
+
logger.debug("Dropped malformed LLM entity", { entity: e });
|
|
1212
1091
|
}
|
|
1213
|
-
return
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
};
|
|
1221
|
-
}).filter((entity) => {
|
|
1222
|
-
if (entity === null) {
|
|
1223
|
-
logger.debug("Filtered entity: null");
|
|
1224
|
-
return false;
|
|
1225
|
-
}
|
|
1226
|
-
if (entity.start === void 0 || entity.end === void 0) {
|
|
1227
|
-
logger.warn("Filtered entity: missing offsets", { text: entity.exact });
|
|
1228
|
-
return false;
|
|
1229
|
-
}
|
|
1230
|
-
if (entity.start < 0) {
|
|
1231
|
-
logger.warn("Filtered entity: negative start", {
|
|
1232
|
-
text: entity.exact,
|
|
1233
|
-
start: entity.start
|
|
1234
|
-
});
|
|
1235
|
-
return false;
|
|
1236
|
-
}
|
|
1237
|
-
if (entity.end > exact.length) {
|
|
1238
|
-
logger.warn("Filtered entity: end exceeds text length", {
|
|
1239
|
-
text: entity.exact,
|
|
1240
|
-
end: entity.end,
|
|
1241
|
-
textLength: exact.length
|
|
1242
|
-
});
|
|
1243
|
-
return false;
|
|
1244
|
-
}
|
|
1245
|
-
const extractedText = exact.substring(entity.start, entity.end);
|
|
1246
|
-
if (extractedText !== entity.exact) {
|
|
1247
|
-
logger.warn("Filtered entity: offset mismatch", {
|
|
1248
|
-
expected: entity.exact,
|
|
1249
|
-
got: extractedText,
|
|
1250
|
-
offsets: `[${entity.start}:${entity.end}]`
|
|
1251
|
-
});
|
|
1252
|
-
return false;
|
|
1253
|
-
}
|
|
1254
|
-
logger.debug("Accepted entity", {
|
|
1255
|
-
text: entity.exact,
|
|
1256
|
-
offsets: `[${entity.start}:${entity.end}]`
|
|
1257
|
-
});
|
|
1258
|
-
return true;
|
|
1259
|
-
});
|
|
1092
|
+
return ok;
|
|
1093
|
+
}).map((entity) => ({
|
|
1094
|
+
exact: entity.exact,
|
|
1095
|
+
entityType: entity.entityType,
|
|
1096
|
+
...typeof entity.prefix === "string" ? { prefix: entity.prefix } : {},
|
|
1097
|
+
...typeof entity.suffix === "string" ? { suffix: entity.suffix } : {}
|
|
1098
|
+
}));
|
|
1260
1099
|
} catch (error) {
|
|
1261
1100
|
logger.error("Failed to parse entity extraction response", {
|
|
1262
1101
|
error: error instanceof Error ? error.message : String(error)
|
|
@@ -1392,7 +1231,59 @@ Requirements:
|
|
|
1392
1231
|
});
|
|
1393
1232
|
return result;
|
|
1394
1233
|
}
|
|
1395
|
-
function
|
|
1234
|
+
function toMatch(r) {
|
|
1235
|
+
return {
|
|
1236
|
+
exact: r.exact,
|
|
1237
|
+
start: r.start,
|
|
1238
|
+
end: r.end,
|
|
1239
|
+
...r.prefix !== void 0 ? { prefix: r.prefix } : {},
|
|
1240
|
+
...r.suffix !== void 0 ? { suffix: r.suffix } : {}
|
|
1241
|
+
};
|
|
1242
|
+
}
|
|
1243
|
+
function annotationDedupeKey(ann) {
|
|
1244
|
+
const target = ann.target;
|
|
1245
|
+
const selectors = Array.isArray(target?.selector) ? target.selector : [];
|
|
1246
|
+
const pos = selectors.find((s) => s.type === "TextPositionSelector");
|
|
1247
|
+
return [
|
|
1248
|
+
ann.motivation,
|
|
1249
|
+
pos?.start ?? "?",
|
|
1250
|
+
pos?.end ?? "?",
|
|
1251
|
+
JSON.stringify(ann.body ?? null)
|
|
1252
|
+
].join("|");
|
|
1253
|
+
}
|
|
1254
|
+
function dedupeAnnotations(annotations) {
|
|
1255
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1256
|
+
const out = [];
|
|
1257
|
+
for (const ann of annotations) {
|
|
1258
|
+
const key = annotationDedupeKey(ann);
|
|
1259
|
+
if (seen.has(key)) continue;
|
|
1260
|
+
seen.add(key);
|
|
1261
|
+
out.push(ann);
|
|
1262
|
+
}
|
|
1263
|
+
return out;
|
|
1264
|
+
}
|
|
1265
|
+
function buildTextAnnotation(content, resourceId, userId, generator, motivation, match, body) {
|
|
1266
|
+
if (content.substring(match.start, match.end) !== match.exact) {
|
|
1267
|
+
throw new Error(
|
|
1268
|
+
`buildTextAnnotation invariant: content.substring(${match.start}, ${match.end}) !== exact for resource ${resourceId}, motivation ${motivation}`
|
|
1269
|
+
);
|
|
1270
|
+
}
|
|
1271
|
+
if (match.prefix !== void 0) {
|
|
1272
|
+
const actualPrefix = content.substring(Math.max(0, match.start - match.prefix.length), match.start);
|
|
1273
|
+
if (actualPrefix !== match.prefix) {
|
|
1274
|
+
throw new Error(
|
|
1275
|
+
`buildTextAnnotation invariant: content prefix-slice !== prefix for resource ${resourceId}, motivation ${motivation}`
|
|
1276
|
+
);
|
|
1277
|
+
}
|
|
1278
|
+
}
|
|
1279
|
+
if (match.suffix !== void 0) {
|
|
1280
|
+
const actualSuffix = content.substring(match.end, Math.min(content.length, match.end + match.suffix.length));
|
|
1281
|
+
if (actualSuffix !== match.suffix) {
|
|
1282
|
+
throw new Error(
|
|
1283
|
+
`buildTextAnnotation invariant: content suffix-slice !== suffix for resource ${resourceId}, motivation ${motivation}`
|
|
1284
|
+
);
|
|
1285
|
+
}
|
|
1286
|
+
}
|
|
1396
1287
|
const creator = didToAgent(userId);
|
|
1397
1288
|
const wasAttributedTo = creator["@id"] === generator["@id"] ? [generator] : [creator, generator];
|
|
1398
1289
|
return {
|
|
@@ -1431,9 +1322,9 @@ async function processHighlightJob(content, inferenceClient, params, userId, gen
|
|
|
1431
1322
|
params.sourceLanguage
|
|
1432
1323
|
);
|
|
1433
1324
|
onProgress(60, `Creating ${highlights.length} annotations...`, "creating");
|
|
1434
|
-
const annotations = highlights.map(
|
|
1435
|
-
(h) => buildTextAnnotation(params.resourceId, userId, generator, "highlighting", h)
|
|
1436
|
-
);
|
|
1325
|
+
const annotations = dedupeAnnotations(highlights.map(
|
|
1326
|
+
(h) => buildTextAnnotation(content, params.resourceId, userId, generator, "highlighting", h)
|
|
1327
|
+
));
|
|
1437
1328
|
onProgress(100, `Complete! Created ${annotations.length} highlights`, "creating");
|
|
1438
1329
|
return {
|
|
1439
1330
|
annotations,
|
|
@@ -1454,16 +1345,16 @@ async function processCommentJob(content, inferenceClient, params, userId, gener
|
|
|
1454
1345
|
);
|
|
1455
1346
|
onProgress(60, `Creating ${comments.length} annotations...`, "creating");
|
|
1456
1347
|
const bodyLanguage = params.language ?? "en";
|
|
1457
|
-
const annotations = comments.map(
|
|
1348
|
+
const annotations = dedupeAnnotations(comments.map(
|
|
1458
1349
|
(c) => (
|
|
1459
1350
|
// Match the pre-#651 CommentAnnotationWorker: include format and
|
|
1460
1351
|
// language on the body TextualBody. Optional in the schema, but
|
|
1461
1352
|
// consumers that do language-aware rendering rely on them.
|
|
1462
|
-
buildTextAnnotation(params.resourceId, userId, generator, "commenting", c, [
|
|
1353
|
+
buildTextAnnotation(content, params.resourceId, userId, generator, "commenting", c, [
|
|
1463
1354
|
{ type: "TextualBody", value: c.comment, purpose: "commenting", format: "text/plain", language: bodyLanguage }
|
|
1464
1355
|
])
|
|
1465
1356
|
)
|
|
1466
|
-
);
|
|
1357
|
+
));
|
|
1467
1358
|
onProgress(100, `Complete! Created ${annotations.length} comments`, "creating");
|
|
1468
1359
|
return {
|
|
1469
1360
|
annotations,
|
|
@@ -1484,7 +1375,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
|
|
|
1484
1375
|
);
|
|
1485
1376
|
onProgress(60, `Creating ${assessments.length} annotations...`, "creating");
|
|
1486
1377
|
const bodyLanguage = params.language ?? "en";
|
|
1487
|
-
const annotations = assessments.map(
|
|
1378
|
+
const annotations = dedupeAnnotations(assessments.map(
|
|
1488
1379
|
(a) => (
|
|
1489
1380
|
// Single-object body with purpose aligned to motivation, matching the
|
|
1490
1381
|
// pre-#651 AssessmentAnnotationWorker's shape and the majority of
|
|
@@ -1492,7 +1383,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
|
|
|
1492
1383
|
// purpose='describing' — that loses the "this is an assessment, not
|
|
1493
1384
|
// a description" signal and breaks existing readers that access
|
|
1494
1385
|
// `body.value` directly on the object.
|
|
1495
|
-
buildTextAnnotation(params.resourceId, userId, generator, "assessing", a, {
|
|
1386
|
+
buildTextAnnotation(content, params.resourceId, userId, generator, "assessing", a, {
|
|
1496
1387
|
type: "TextualBody",
|
|
1497
1388
|
value: a.assessment,
|
|
1498
1389
|
purpose: "assessing",
|
|
@@ -1500,7 +1391,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
|
|
|
1500
1391
|
language: bodyLanguage
|
|
1501
1392
|
})
|
|
1502
1393
|
)
|
|
1503
|
-
);
|
|
1394
|
+
));
|
|
1504
1395
|
onProgress(100, `Complete! Created ${annotations.length} assessments`, "creating");
|
|
1505
1396
|
return {
|
|
1506
1397
|
annotations,
|
|
@@ -1544,27 +1435,44 @@ async function processReferenceJob(content, inferenceClient, params, userId, gen
|
|
|
1544
1435
|
{ type: "TextualBody", value: entityTypeName, purpose: "tagging", format: "text/plain", language: bodyLanguage }
|
|
1545
1436
|
];
|
|
1546
1437
|
for (const entity of extractedEntities) {
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
|
|
1554
|
-
|
|
1555
|
-
|
|
1556
|
-
);
|
|
1557
|
-
allAnnotations.push(ann);
|
|
1558
|
-
totalEmitted++;
|
|
1559
|
-
} catch {
|
|
1438
|
+
const reconciled = reconcileSelector(content, {
|
|
1439
|
+
exact: entity.exact,
|
|
1440
|
+
...entity.prefix !== void 0 ? { prefix: entity.prefix } : {},
|
|
1441
|
+
...entity.suffix !== void 0 ? { suffix: entity.suffix } : {}
|
|
1442
|
+
});
|
|
1443
|
+
if (!reconciled) {
|
|
1444
|
+
logger.error("Entity dropped \u2014 text not found in source", {
|
|
1445
|
+
text: entity.exact,
|
|
1446
|
+
entityType: entity.entityType
|
|
1447
|
+
});
|
|
1560
1448
|
errors++;
|
|
1449
|
+
continue;
|
|
1561
1450
|
}
|
|
1451
|
+
if (reconciled.anchorMethod === "first-of-many" || reconciled.anchorMethod === "fuzzy-match") {
|
|
1452
|
+
logger.warn("Entity anchored via degraded method", {
|
|
1453
|
+
text: entity.exact,
|
|
1454
|
+
entityType: entity.entityType,
|
|
1455
|
+
anchorMethod: reconciled.anchorMethod
|
|
1456
|
+
});
|
|
1457
|
+
}
|
|
1458
|
+
const ann = buildTextAnnotation(
|
|
1459
|
+
content,
|
|
1460
|
+
params.resourceId,
|
|
1461
|
+
userId,
|
|
1462
|
+
generator,
|
|
1463
|
+
"linking",
|
|
1464
|
+
toMatch(reconciled),
|
|
1465
|
+
unresolvedBody
|
|
1466
|
+
);
|
|
1467
|
+
allAnnotations.push(ann);
|
|
1468
|
+
totalEmitted++;
|
|
1562
1469
|
}
|
|
1563
1470
|
}
|
|
1564
|
-
|
|
1471
|
+
const annotations = dedupeAnnotations(allAnnotations);
|
|
1472
|
+
onProgress(100, `Complete! Created ${annotations.length} references`, "creating");
|
|
1565
1473
|
return {
|
|
1566
|
-
annotations
|
|
1567
|
-
result: { totalFound, totalEmitted, errors }
|
|
1474
|
+
annotations,
|
|
1475
|
+
result: { totalFound, totalEmitted: annotations.length, errors }
|
|
1568
1476
|
};
|
|
1569
1477
|
}
|
|
1570
1478
|
async function processTagJob(content, inferenceClient, params, userId, generator, onProgress) {
|
|
@@ -1584,15 +1492,19 @@ async function processTagJob(content, inferenceClient, params, userId, generator
|
|
|
1584
1492
|
const tags = allTags;
|
|
1585
1493
|
onProgress(60, `Creating ${tags.length} tag annotations...`, "creating");
|
|
1586
1494
|
const bodyLanguage = params.language ?? "en";
|
|
1587
|
-
const
|
|
1588
|
-
const annotations = tags.map((t) => {
|
|
1495
|
+
const annotations = dedupeAnnotations(tags.map((t) => {
|
|
1589
1496
|
const category = t.category ?? "unknown";
|
|
1590
|
-
|
|
1591
|
-
return buildTextAnnotation(params.resourceId, userId, generator, "tagging", t, [
|
|
1497
|
+
return buildTextAnnotation(content, params.resourceId, userId, generator, "tagging", t, [
|
|
1592
1498
|
{ type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language: bodyLanguage },
|
|
1593
1499
|
{ type: "TextualBody", value: params.schema.id, purpose: "classifying", format: "text/plain" }
|
|
1594
1500
|
]);
|
|
1595
|
-
});
|
|
1501
|
+
}));
|
|
1502
|
+
const byCategory = {};
|
|
1503
|
+
for (const ann of annotations) {
|
|
1504
|
+
const body = ann.body;
|
|
1505
|
+
const category = Array.isArray(body) && typeof body[0]?.value === "string" ? body[0].value : "unknown";
|
|
1506
|
+
byCategory[category] = (byCategory[category] ?? 0) + 1;
|
|
1507
|
+
}
|
|
1596
1508
|
onProgress(100, `Complete! Created ${annotations.length} tags`, "creating");
|
|
1597
1509
|
return {
|
|
1598
1510
|
annotations,
|
|
@@ -1628,6 +1540,6 @@ async function processGenerationJob(inferenceClient, params, onProgress, logger)
|
|
|
1628
1540
|
};
|
|
1629
1541
|
}
|
|
1630
1542
|
|
|
1631
|
-
export { AnnotationDetection, FsJobQueue,
|
|
1543
|
+
export { AnnotationDetection, FsJobQueue, generateResourceFromTopic, isCancelledJob, isCompleteJob, isFailedJob, isPendingJob, isRunningJob, processAssessmentJob, processCommentJob, processGenerationJob, processHighlightJob, processReferenceJob, processTagJob };
|
|
1632
1544
|
//# sourceMappingURL=index.js.map
|
|
1633
1545
|
//# sourceMappingURL=index.js.map
|