@forwardimpact/libsyntheticrender 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,384 @@
1
+ /**
2
+ * Enricher — Pass 2 LLM enrichment of prose blocks.
3
+ *
4
+ * Finds all elements with `data-enrich` attributes in Pass 1 HTML,
5
+ * calls ProseEngine to generate rich prose with inline microdata,
6
+ * and replaces placeholder content.
7
+ *
8
+ * @module libuniverse/render/enricher
9
+ */
10
+
11
/**
 * Global pattern locating enrichable blocks in Pass 1 HTML.
 *
 * A block is an opening `<div>` carrying a `data-enrich="key"` attribute,
 * followed by its content and the first `</div>` after it.
 *
 * Capture groups:
 *   1. the full opening tag
 *   2. the enrich key (value of `data-enrich`)
 *   3. the inner content (lazy, so it stops at the first `</div>`)
 *   4. the closing `</div>`
 *
 * Because the content group is lazy, a block that itself contains a nested
 * `<div>` ends at that inner element's closing tag.
 */
const ENRICH_PATTERN = /(<div[^>]+data-enrich="([^"]+)"[^>]*>)([\s\S]*?)(<\/div>)/g;
17
+
18
/**
 * Build entity context for enrichment from linked entities.
 *
 * The enrich key has the form `<type>_<id>`; everything after the first
 * underscore is the id (so ids may themselves contain underscores).
 * Projects, platforms, drugs and courses are looked up by `id`, articles by
 * `topic`, and events / blog posts by 1-based position in their list.
 *
 * Link collections on the resolved entity are optional in every branch: a
 * missing list (or a missing event organizer) simply contributes no mention
 * targets instead of throwing.
 *
 * @param {string} enrichKey
 * @param {object} linked - LinkedEntities from link-assigner
 * @returns {{ entityType: string, entityName: string, mentionTargets: object[] } | null}
 *   The context, or null when the type is unknown or the entity is not found.
 */
function buildEnrichContext(enrichKey, linked) {
  const [type, ...rest] = enrichKey.split("_");
  const id = rest.join("_");

  // Turn up to `limit` linked entities into mention targets of one schema type.
  const mentionsOf = (schemaType, entities, limit = Infinity) =>
    (entities || [])
      .slice(0, limit)
      .map((e) => ({ type: schemaType, name: e.name, iri: e.iri }));

  const context = (entityType, entityName, mentionTargets) => ({
    entityType,
    entityName,
    mentionTargets,
  });

  // Events and blog posts are addressed by 1-based position, not by id.
  const byPosition = (list) => list[Number.parseInt(id, 10) - 1];

  switch (type) {
    case "project": {
      const proj = linked.projects.find((p) => p.id === id);
      if (!proj) return null;
      return context("Project", proj.name, [
        ...mentionsOf("Drug", proj.drugLinks),
        ...mentionsOf("SoftwareApplication", proj.platformLinks),
        ...mentionsOf("Person", proj.members, 3),
      ]);
    }
    case "platform": {
      const plat = linked.platforms.find((p) => p.id === id);
      if (!plat) return null;
      // Dependencies may be plain ids or objects carrying an `id`;
      // unresolved ones are dropped.
      const depPlatforms = (plat.dependencies || [])
        .map((d) =>
          linked.platforms.find(
            (p) => p.id === (typeof d === "string" ? d : d.id),
          ),
        )
        .filter(Boolean);
      return context("SoftwareApplication", plat.name, [
        ...mentionsOf("SoftwareApplication", depPlatforms),
        ...mentionsOf("Project", plat.projectLinks, 2),
        ...mentionsOf("Drug", plat.drugLinks, 2),
      ]);
    }
    case "drug": {
      const drug = linked.drugs.find((d) => d.id === id);
      if (!drug) return null;
      const mentions = [
        ...mentionsOf("Project", drug.projectLinks, 2),
        ...mentionsOf("SoftwareApplication", drug.platformLinks, 2),
      ];
      if (drug.parentDrug) {
        const parent = linked.drugs.find((d) => d.id === drug.parentDrug);
        if (parent) mentions.push(...mentionsOf("Drug", [parent]));
      }
      return context("Drug", drug.name, mentions);
    }
    case "course": {
      const course = linked.courses.find((c) => c.id === id);
      if (!course) return null;
      return context("Course", course.title, [
        ...mentionsOf(
          "SoftwareApplication",
          course.platformLink ? [course.platformLink] : [],
        ),
        ...mentionsOf("Drug", course.drugLink ? [course.drugLink] : []),
        ...mentionsOf("Person", course.attendees, 2),
      ]);
    }
    case "event": {
      const event = byPosition(linked.events);
      if (!event) return null;
      return context("Event", event.title, [
        ...mentionsOf("Person", event.organizer ? [event.organizer] : []),
        ...mentionsOf("Project", event.aboutProjects, 2),
        ...mentionsOf("Drug", event.aboutDrugs, 2),
        ...mentionsOf("Person", event.attendees, 2),
      ]);
    }
    case "blog": {
      const post = byPosition(linked.blogPosts);
      if (!post) return null;
      return context("BlogPosting", post.headline, [
        ...mentionsOf("Drug", post.aboutDrugs, 2),
        ...mentionsOf("SoftwareApplication", post.aboutPlatforms, 2),
        ...mentionsOf("Person", post.mentionsPeople, 3),
      ]);
    }
    case "article": {
      // `linked.articles` is optional; articles are keyed by topic.
      const article = linked.articles?.find((a) => a.topic === id);
      if (!article) return null;
      return context("ScholarlyArticle", article.title, [
        ...mentionsOf("Drug", article.drugLinks, 2),
        ...mentionsOf("SoftwareApplication", article.platformLinks, 2),
        ...mentionsOf("Project", article.projectLinks, 2),
        ...mentionsOf("Person", article.authorLinks, 2),
      ]);
    }
    default:
      return null;
  }
}
217
+
218
/**
 * Build the LLM prompt for enriching a prose block.
 *
 * Produces a system message (role and output rules) and a user message
 * (the placeholder text plus the list of entities to mention, one per line
 * as `- <type>: <name> (<iri>)`).
 *
 * The "itemid values must start with ..." rule is only added when a domain
 * is supplied; without one the prompt would otherwise instruct the model to
 * use the bogus prefix `https://undefined/id/`.
 *
 * @param {object} ctx - Context from buildEnrichContext
 * @param {string} placeholder - Current placeholder text
 * @param {string} [domain] - Universe domain for IRI constraint
 * @returns {object[]} Messages array for ProseEngine.generateStructured
 */
function buildEnrichMessages(ctx, placeholder, domain) {
  const mentionList = ctx.mentionTargets
    .map((m) => `- ${m.type}: ${m.name} (${m.iri})`)
    .join("\n");

  const domainRule = domain
    ? ` All itemid values must start with "https://${domain}/id/".`
    : "";

  const system = `You are a technical writer producing HTML content with Schema.org microdata for a pharmaceutical company knowledge base.
Output only the inner HTML content — no wrapper tags, no markdown fences.
Write 300-500 words of detailed, rich prose across multiple paragraphs. Mention entities using inline Schema.org microdata spans.
Only use the exact IRIs provided. Do not invent new IRIs.${domainRule}`;

  // `{{type}}`, `{{iri}}` and `{{name}}` are literal placeholders shown to the
  // model, not template substitutions.
  const user = `Rewrite this text block for a ${ctx.entityType} document about "${ctx.entityName}".

Current text: "${placeholder}"

Write 300-500 words of detailed prose across 3-5 paragraphs. Naturally mention these entities using Schema.org microdata:

${mentionList}

Use this pattern for inline mentions:
<span itemprop="mentions" itemscope itemtype="https://schema.org/{{type}}" itemid="{{iri}}"><span itemprop="name">{{name}}</span></span>

Output only the HTML content for the block — no wrapper tags.`;

  return [
    { role: "system", content: system },
    { role: "user", content: user },
  ];
}
253
+
254
/**
 * Remove microdata identifiers that point outside the universe domain.
 *
 * An `itemid="..."` is kept only when its value starts with
 * `https://<domain>/id/`. Otherwise it is removed from the tag. When the
 * `itemid` sits directly next to an `itemscope itemtype="..."` pair (in
 * either order), that pair is removed together with it; a lone off-domain
 * `itemid` is removed on its own. Only double-quoted attribute values are
 * recognised.
 *
 * @param {string} html
 * @param {string} domain
 * @returns {string}
 */
function stripOffDomainIris(html, domain) {
  const allowedPrefix = `https://${domain}/id/`;

  // Keep the whole match for on-domain IRIs; otherwise keep only the part of
  // the tag that preceded the matched attributes.
  const keepOrTrim = (whole, tagHead, itemid) =>
    itemid.startsWith(allowedPrefix) ? whole : tagHead;

  // Order matters: the combined forms run first so the itemscope/itemtype
  // pair is dropped along with an adjacent off-domain itemid.
  const passes = [
    /(<[^>]*?)\s+itemscope\s+itemtype="[^"]*"\s+itemid="([^"]*)"/g,
    /(<[^>]*?)\s+itemid="([^"]*)"\s+itemscope\s+itemtype="[^"]*"/g,
    /(<[^>]*?)\s+itemid="([^"]*)"/g,
  ];

  let cleaned = html;
  for (const pattern of passes) {
    cleaned = cleaned.replace(pattern, keepOrTrim);
  }
  return cleaned;
}
282
+
283
/**
 * Append closing tags for elements the LLM opened but never closed.
 *
 * Scans every tag in order while keeping a list of currently open element
 * names (lower-cased). Void elements and self-closing tags (`.../>`) are
 * ignored. A closing tag removes the most recent open element of the same
 * name; a closing tag with no matching opener is left untouched. Whatever
 * is still open at the end is closed in reverse (innermost-first) order.
 *
 * @param {string} html
 * @returns {string} The input followed by any missing closing tags
 */
function balanceTags(html) {
  const voidElements = new Set([
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "source",
    "track",
    "wbr",
  ]);
  const scanner = /<\/?([a-zA-Z][a-zA-Z0-9]*)\b[^>]*\/?>/g;
  const openElements = [];

  for (let hit = scanner.exec(html); hit !== null; hit = scanner.exec(html)) {
    const [token, rawName] = hit;
    const name = rawName.toLowerCase();
    if (voidElements.has(name) || token.endsWith("/>")) continue;

    if (!token.startsWith("</")) {
      openElements.push(name);
      continue;
    }

    // Closing tag: pop the nearest matching opener, if there is one.
    const at = openElements.lastIndexOf(name);
    if (at !== -1) openElements.splice(at, 1);
  }

  const closers = [...openElements]
    .reverse()
    .map((name) => `</${name}>`)
    .join("");
  return html + closers;
}
325
+
326
/**
 * Enrich all prose blocks in HTML documents via LLM.
 *
 * For every `data-enrich` block found in a file, the enrich key is resolved
 * to an entity context; blocks whose key cannot be resolved are left as-is.
 * The block's text (tags stripped) is sent to the prose engine under the
 * cache/request key `enrich_<enrichKey>`. A truthy result is sanitised
 * (off-domain IRIs stripped when a domain is given, unclosed tags balanced)
 * and substituted for the block content, and the `data-enrich` attribute is
 * removed from the opening tag. Blocks are processed one after another.
 *
 * @param {Map<string, string>} htmlFiles - filename → HTML content from Pass 1
 * @param {object} linked - LinkedEntities from link-assigner
 * @param {import('../engine/prose.js').ProseEngine} proseEngine
 * @param {string} domain - Universe domain
 * @param {object} logger - Logger instance
 * @returns {Promise<Map<string, string>>} filename → enriched HTML content
 * @throws {Error} When no logger is supplied
 */
export async function enrichDocuments(
  htmlFiles,
  linked,
  proseEngine,
  domain,
  logger,
) {
  if (!logger) throw new Error("logger is required");
  const enriched = new Map();
  let totalBlocks = 0;
  let enrichedBlocks = 0;

  for (const [filename, html] of htmlFiles) {
    let result = html;
    const matches = [...html.matchAll(ENRICH_PATTERN)];
    totalBlocks += matches.length;

    for (const match of matches) {
      const [, openTag, enrichKey, content, closeTag] = match;
      const ctx = buildEnrichContext(enrichKey, linked);
      if (!ctx) continue;

      const placeholder = content.replace(/<[^>]+>/g, "").trim();
      const messages = buildEnrichMessages(ctx, placeholder, domain);
      let prose = await proseEngine.generateStructured(
        `enrich_${enrichKey}`,
        messages,
      );

      if (prose) {
        if (domain) prose = stripOffDomainIris(prose, domain);
        prose = balanceTags(prose);
        const cleanOpen = openTag.replace(/\s*data-enrich="[^"]*"/, "");
        const replacement = `${cleanOpen}\n ${prose}\n ${closeTag}`;
        // Use a replacer function: with a string replacement, sequences such
        // as "$&", "$$" or "$'" in the generated prose would be interpreted
        // as substitution patterns and corrupt the output.
        result = result.replace(match[0], () => replacement);
        enrichedBlocks++;
      }
    }

    enriched.set(filename, result);
  }

  logger.info(
    "enricher",
    `Enriched ${enrichedBlocks}/${totalBlocks} prose blocks`,
  );
  return enriched;
}