@uniweb/semantic-parser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -0
- package/.eslintrc.json +28 -0
- package/LICENSE +674 -0
- package/README.md +395 -0
- package/docs/api.md +352 -0
- package/docs/file-structure.md +50 -0
- package/docs/guide.md +206 -0
- package/docs/mapping-patterns.md +928 -0
- package/docs/text-component-reference.md +515 -0
- package/package.json +41 -0
- package/reference/README.md +195 -0
- package/reference/Text.js +188 -0
- package/src/index.js +35 -0
- package/src/mappers/accessor.js +312 -0
- package/src/mappers/extractors.js +397 -0
- package/src/mappers/helpers.js +234 -0
- package/src/mappers/index.js +28 -0
- package/src/mappers/types.js +495 -0
- package/src/processors/byType.js +129 -0
- package/src/processors/groups.js +330 -0
- package/src/processors/groups_backup.js +379 -0
- package/src/processors/groups_doc.md +179 -0
- package/src/processors/sequence.js +573 -0
- package/src/processors/sequence_backup.js +402 -0
- package/src/utils/role.js +53 -0
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
/**
 * Process a ProseMirror/TipTap document into a flat sequence.
 *
 * The document is handed to `processNode`, which appends the produced
 * elements to the array returned here.
 *
 * @param {Object|null|undefined} doc ProseMirror document. A missing
 *   document yields an empty sequence instead of a TypeError.
 * @param {Object} [options] Parsing options, forwarded unchanged to the
 *   node processors. `null` is treated like an omitted argument.
 * @returns {Array} Sequence of content elements
 */
function processSequence(doc, options = {}) {
  const sequence = [];

  // Nothing to walk: an absent document is simply an empty sequence.
  if (doc == null) {
    return sequence;
  }

  // Default parameters only cover `undefined`; normalise `null` as well so
  // downstream property reads on the options object cannot throw.
  processNode(doc, sequence, options ?? {});

  return sequence;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Append the sequence element(s) for `node` to `sequence`.
 *
 * A "doc" node contributes nothing itself; each of its children is
 * processed in order instead. Any other node is converted with
 * `createSequenceElement` and pushed only when that call returns a
 * truthy element.
 *
 * @param {Object} node ProseMirror node
 * @param {Array} sequence Output array, mutated in place
 * @param {Object} options Parsing options passed through to the converter
 * @returns {void}
 */
function processNode(node, sequence, options) {
  // Regular node: convert it and keep the result if there is one.
  if (node.type !== "doc") {
    const converted = createSequenceElement(node, options);
    if (converted) {
      sequence.push(converted);
    }
    return;
  }

  // Root doc node: walk the children (a doc may have no content at all).
  const children = node.content;
  if (children === undefined || children === null) {
    return;
  }
  children.forEach((child) => {
    processNode(child, sequence, options);
  });
}
|
|
28
|
+
|
|
29
|
+
/**
 * Convert a single ProseMirror/TipTap node into a sequence element.
 *
 * Paragraphs are inspected first so that "pure" content is surfaced as its
 * own element type, in this order of precedence:
 *   1. styledLink - several parts that all carry a link mark with the same href
 *   2. link       - exactly one child carrying a link mark
 *   3. image      - exactly one image child with role "image" or "banner"
 *   4. icon       - exactly one image child with role "icon"
 *   5. button     - exactly one text child carrying a button mark
 *   6. video      - exactly one image child with role "video"
 * Everything else is mapped by node type in the switch below.
 *
 * Paragraphs without a `content` array and nodes without `attrs` are
 * tolerated instead of raising a TypeError.
 *
 * @param {Object} node ProseMirror node
 * @param {Object} [options] Parsing options
 * @param {boolean} [options.parseCodeAsJson] When truthy, the text of a
 *   codeBlock is additionally passed through JSON.parse; the result (or
 *   null when parsing fails) is returned as `parsed`.
 * @returns {Object|null} Sequence element, or null for bare "text" nodes
 */
function createSequenceElement(node, options = {}) {
  const isLinkMark = (mark) => mark.type === "link";
  const isInlineMedia = (child) =>
    child.type === "UniwebIcon" || child.type === "image";

  // Only paragraphs get the special "pure content" treatment. A paragraph
  // may have no `content` property at all, so fall back to an empty list.
  const children =
    node.type === "paragraph" && node.content ? node.content : [];

  // Styled link: multi-part paragraph where every non-media part shares
  // one link target.
  if (children.length > 1) {
    // Icons and images are ignored when deciding whether this is a link.
    const textParts = children.filter((child) => !isInlineMedia(child));

    const sharedLink = textParts[0]?.marks?.find(
      (mark) => isLinkMark(mark) && mark.attrs
    );
    const allShareLink =
      Boolean(sharedLink) &&
      textParts.every((part) =>
        part?.marks?.some(
          (mark) =>
            isLinkMark(mark) && mark.attrs?.href === sharedLink.attrs.href
        )
      );

    if (allShareLink) {
      // Drop the link marks but keep every other styling mark.
      const unlinkedParts = textParts.map((part) => ({
        ...part,
        marks: part.marks?.filter((mark) => !isLinkMark(mark)) || [],
      }));

      return {
        type: "styledLink",
        href: sharedLink.attrs.href,
        target: sharedLink.attrs.target || "_self",
        content: getTextContent({ ...node, content: unlinkedParts }, options),
      };
    }
  }

  // Single-child paragraphs: link, image, icon, button or video.
  const onlyChild = children.length === 1 ? children[0] : null;
  if (onlyChild) {
    // Simple single-part link (checked before anything else, as before).
    const linkMark = onlyChild.marks?.find(isLinkMark);
    if (linkMark) {
      return {
        type: "link",
        content: {
          href: linkMark.attrs?.href,
          label: onlyChild.text,
        },
      };
    }

    if (onlyChild.type === "image") {
      // An image child without attrs simply matches none of the roles.
      const attrs = onlyChild.attrs ?? {};

      if (attrs.role === "image" || attrs.role === "banner") {
        return {
          type: "image",
          src: attrs.src,
          caption: attrs.title,
          alt: attrs.alt || attrs.title,
          role: attrs.role,
        };
      }

      if (attrs.role === "icon") {
        return {
          type: "icon",
          svg: attrs.svg,
        };
      }

      if (attrs.role === "video") {
        return {
          type: "video",
          src: attrs.src,
          caption: attrs.title,
          alt: attrs.alt || attrs.title,
        };
      }
    }

    if (onlyChild.type === "text") {
      const buttonMark = onlyChild.marks?.find(
        (mark) => mark.type === "button"
      );
      if (buttonMark) {
        return {
          type: "button",
          content: onlyChild.text,
          attrs: buttonMark.attrs,
        };
      }
    }
  }

  switch (node.type) {
    case "heading":
      return {
        type: "heading",
        level: node.attrs?.level,
        content: getTextContent(node, options),
        attrs: node.attrs, // Pass through all attributes (including textAlign)
      };

    case "paragraph":
      return {
        type: "paragraph",
        content: getTextContent(node, options),
      };

    case "blockquote":
      // Process blockquote content recursively
      return {
        type: "blockquote",
        content:
          node.content
            ?.map((child) => createSequenceElement(child, options))
            .filter(Boolean) || [],
      };

    case "codeBlock": {
      // Braces keep these declarations scoped to this case only.
      const textContent = getTextContent(node, options);
      let parsedJson = null;

      if (options.parseCodeAsJson) {
        try {
          parsedJson = JSON.parse(textContent);
        } catch (err) {
          // Deliberate best effort: invalid JSON leaves `parsed` as null and
          // the raw text remains available in `content`.
        }
      }

      return {
        type: "codeBlock",
        content: textContent,
        parsed: parsedJson,
      };
    }

    case "image":
      return {
        type: "image",
        src: node.attrs?.src,
        alt: node.attrs?.alt,
        role: node.attrs?.role,
      };

    case "bulletList":
    case "orderedList":
      return {
        type: "list",
        style: node.type === "bulletList" ? "bullet" : "ordered",
        items: processListItems(node, options),
      };

    case "listItem":
      return {
        type: "listItem",
        content: getTextContent(node, options),
      };

    case "horizontalRule":
      return {
        type: "divider",
      };

    // Custom TipTap elements
    case "card-group":
      return {
        type: "card-group",
        cards:
          node.content
            ?.filter((c) => c.type === "card" && !c.attrs?.hidden)
            .map((card) => ({
              ...card.attrs,
              type: "card",
            })) || [],
      };

    case "document-group":
      return {
        type: "document-group",
        documents:
          node.content
            ?.filter((c) => c.type === "document")
            .map((doc) => ({
              ...doc.attrs,
              type: "document",
            })) || [],
      };

    case "FormBlock": {
      // Form data can be a JSON string or an already-parsed object.
      let formData = node.attrs?.data;
      if (typeof formData === "string") {
        try {
          formData = JSON.parse(formData);
        } catch (err) {
          // Deliberate best effort: keep the original string when it is
          // not valid JSON.
        }
      }
      return {
        type: "form",
        data: formData,
        attrs: node.attrs,
      };
    }

    case "text":
      // Bare text nodes are only meaningful inside a parent's content.
      return null;

    default:
      return {
        type: node.type,
        content: getTextContent(node, options),
      };
  }
}
|
|
302
|
+
|
|
303
|
+
// Extensions (lower-case, without the dot) whose links get a `download`
// attribute. Built once instead of once per text node.
const DOWNLOADABLE_EXTENSIONS = new Set([
  "pdf",
  "doc",
  "docx",
  "xls",
  "xlsx",
  "ppt",
  "pptx",
  "jpg",
  "jpeg",
  "png",
  "webp",
  "gif",
  "svg",
  "mp4",
  "mp3",
  "wav",
  "mov",
  "zip",
]);

/**
 * Decide whether an href points at a downloadable file.
 *
 * The query string and fragment are ignored and only the last path segment
 * is inspected, so "report.pdf?v=2" is recognised while a dot-less href
 * such as "pdf" is not. A bare host such as "https://example.zip" is still
 * matched by its last label, exactly as before.
 *
 * @param {string} href Link target
 * @returns {boolean} True when the last path segment has a known extension
 */
function isDownloadableHref(href) {
  const path = String(href).split(/[?#]/, 1)[0];
  const fileName = path.slice(path.lastIndexOf("/") + 1);
  const dotIndex = fileName.lastIndexOf(".");
  if (dotIndex === -1) {
    return false;
  }
  return DOWNLOADABLE_EXTENSIONS.has(fileName.slice(dotIndex + 1).toLowerCase());
}

/**
 * Render the inline content of a node as an HTML string.
 *
 * Text children are wrapped according to their marks, innermost first:
 * textStyle (color), highlight, bold, italic, then link as the outermost
 * wrapper. "hardBreak" children become "<br>". Any other child type is
 * reported with console.warn and contributes nothing.
 *
 * NOTE(review): text, color and href values are interpolated into the
 * markup without HTML escaping, as in the original implementation. If the
 * document can contain untrusted content, the result should be sanitised
 * before being inserted into a page.
 *
 * @param {Object} node ProseMirror node whose `content` is rendered
 * @param {Object} [options] Parsing options (currently unused here)
 * @returns {string} HTML string; empty when the node has no content
 */
function getTextContent(node, options = {}) {
  if (!node.content) return "";

  return node.content.reduce((html, child) => {
    const { type, marks = [], text } = child;

    if (type === "hardBreak") {
      return `${html}<br>`;
    }

    if (type !== "text") {
      console.warn(`unhandled text content type: ${type}`, child);
      return html;
    }

    const findMark = (name) => marks.find((mark) => mark.type === name);
    let styledText = text || "";

    // Marks are applied in a fixed order so nesting is always the same,
    // regardless of the order in which they appear on the node.

    // textStyle (color)
    const color = findMark("textStyle")?.attrs?.color;
    if (color) {
      styledText = `<span style="color: var(--${color})">${styledText}</span>`;
    }

    // highlight
    if (findMark("highlight")) {
      styledText = `<span style="background-color: var(--highlight)">${styledText}</span>`;
    }

    // bold
    if (findMark("bold")) {
      styledText = `<strong>${styledText}</strong>`;
    }

    // italic
    if (findMark("italic")) {
      styledText = `<em>${styledText}</em>`;
    }

    // link (outermost). A link mark without an href cannot produce a usable
    // anchor, so the text is emitted without the wrapper instead of throwing.
    const linkMark = findMark("link");
    const href = linkMark?.attrs?.href;
    if (href !== undefined && href !== null) {
      const target = linkMark.attrs.target || "_self";
      const downloadAttr = isDownloadableHref(href) ? " download" : "";

      styledText = `<a href="${href}" target="${target}"${downloadAttr}>${styledText}</a>`;
    }

    return html + styledText;
  }, "");
}
|
|
383
|
+
|
|
384
|
+
/**
 * Build the item tree for a list node.
 *
 * Only "listItem" children of `node` are considered. For each of them the
 * children are split in two groups: nodes whose type ends in "List" are
 * treated as nested lists and processed recursively into `items`; all
 * other nodes are converted with `createSequenceElement` into `content`.
 * A list item without a `content` array yields `undefined` for both.
 *
 * @param {Object} node List node (its `content` holds the list items)
 * @param {Object} [options] Parsing options passed through to the converter
 * @returns {Array<{content: (Array|undefined), items: (Array|undefined)}>}
 *   One entry per list item, in document order
 */
function processListItems(node, options = {}) {
  const isNestedList = (child) => child.type.endsWith("List");

  const listItems =
    node.content?.filter((entry) => entry.type === "listItem") ?? [];

  return listItems.map((entry) => {
    const children = entry.content;
    let content;
    let items;

    if (children !== undefined && children !== null) {
      // Own blocks first, nested lists second (same order as evaluation
      // of the two properties in the result object).
      content = children
        .filter((child) => !isNestedList(child))
        .map((child) => createSequenceElement(child, options));
      items = children
        .filter((child) => isNestedList(child))
        .flatMap((nested) => processListItems(nested, options));
    }

    return { content, items };
  });
}
|
|
401
|
+
|
|
402
|
+
export { processSequence };
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
 * Resolve the role of a node.
 *
 * Sources are consulted in order and the first truthy value wins:
 *   1. the node's own `attrs.role`
 *   2. the `attrs.value` of the first mark of type "role"
 *   3. the type-based default from `getDefaultRole`
 *
 * @param {Object} node Node with potential role information
 * @returns {string|null} Role value or null
 */
function getRoleFromNode(node) {
  // Direct role attribute
  const attributeRole = node.attrs?.role;
  if (attributeRole) {
    return attributeRole;
  }

  // Role carried by a mark
  const roleMark = node.marks?.find((mark) => mark.type === "role");
  const markRole = roleMark?.attrs?.value;
  if (markRole) {
    return markRole;
  }

  // Fall back to the default for this node type
  return getDefaultRole(node);
}
|
|
17
|
+
|
|
18
|
+
/**
 * Get the default role for a node, based solely on its type.
 *
 * @param {Object} node Node whose `type` is inspected
 * @returns {string|null} "content" for images, "link" for links,
 *   null for every other type
 */
function getDefaultRole(node) {
  const nodeType = node.type;

  if (nodeType === "image") {
    return "content";
  }
  if (nodeType === "link") {
    return "link";
  }
  return null;
}
|
|
31
|
+
|
|
32
|
+
// Known roles per element type. A Map is used instead of a plain object so
// that type names such as "constructor" or "__proto__" cannot resolve to
// members inherited from Object.prototype.
const VALID_ROLES_BY_TYPE = new Map([
  ["image", ["background", "content", "gallery", "icon"]],
  [
    "link",
    ["button", "button-primary", "button-outline", "nav-link", "footer-link"],
  ],
]);

/**
 * Validate if a role is known for a given type.
 *
 * @param {string} type Element type ("image" or "link" are the known types)
 * @param {string} role Role to check
 * @returns {boolean} True when `role` is listed for `type`; false for an
 *   unknown type or an unlisted role
 */
function isValidRole(type, role) {
  const roles = VALID_ROLES_BY_TYPE.get(type);
  return roles ? roles.includes(role) : false;
}
|
|
49
|
+
|
|
50
|
+
export {
|
|
51
|
+
getRoleFromNode,
|
|
52
|
+
isValidRole,
|
|
53
|
+
};
|