@uniweb/semantic-parser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -0
- package/.eslintrc.json +28 -0
- package/LICENSE +674 -0
- package/README.md +395 -0
- package/docs/api.md +352 -0
- package/docs/file-structure.md +50 -0
- package/docs/guide.md +206 -0
- package/docs/mapping-patterns.md +928 -0
- package/docs/text-component-reference.md +515 -0
- package/package.json +41 -0
- package/reference/README.md +195 -0
- package/reference/Text.js +188 -0
- package/src/index.js +35 -0
- package/src/mappers/accessor.js +312 -0
- package/src/mappers/extractors.js +397 -0
- package/src/mappers/helpers.js +234 -0
- package/src/mappers/index.js +28 -0
- package/src/mappers/types.js +495 -0
- package/src/processors/byType.js +129 -0
- package/src/processors/groups.js +330 -0
- package/src/processors/groups_backup.js +379 -0
- package/src/processors/groups_doc.md +179 -0
- package/src/processors/sequence.js +573 -0
- package/src/processors/sequence_backup.js +402 -0
- package/src/utils/role.js +53 -0
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
/**
 * Transform a sequence into content groups with semantic structure.
 *
 * The sequence is cut into slices by `splitBySlices`, each slice is turned
 * into a `{ header, body, metadata }` record by `processGroupContent`, and
 * `identifyMainContent` decides whether the first record becomes `main`.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {Object} options Parsing options (currently not read by this function)
 * @returns {Object} `{ main, items, metadata }` — `main` is the leading group
 *   when it qualifies as main content (otherwise `null`), `items` holds the
 *   remaining groups
 */
function processGroups(sequence, options = {}) {
  const result = {
    main: null,
    items: [],
    metadata: {
      dividerMode: false,
      // NOTE(review): never updated here — confirm whether callers expect a count.
      groups: 0,
    },
  };

  // A missing or empty sequence yields the empty result instead of throwing.
  if (!sequence?.length) return result;

  const processedGroups = splitBySlices(sequence).map((group) =>
    processGroupContent(group)
  );

  // `dividerMode` is always false in this implementation, so the former
  // "starts with divider" special case could never run and has been removed.
  if (identifyMainContent(processedGroups)) {
    result.main = processedGroups[0];
    result.items = processedGroups.slice(1);
  } else {
    result.items = processedGroups;
  }

  return result;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Cut a flat sequence into slices (arrays of elements).
 *
 * - A divider closes the slice being built and is itself dropped.
 * - A heading closes the slice being built and opens a new one, except when
 *   it is the second element and the first element is a banner image (the
 *   image then stays in the same slice as the heading). The whole heading run
 *   returned by `readHeadingGroup` is added at once.
 * - Any other element is appended to the slice being built.
 *
 * @param {Array} sequence Flat sequence of elements
 * @returns {Array<Array>} Non-empty slices, in document order
 */
function splitBySlices(sequence) {
  const slices = [];
  let open = [];

  // Move the slice under construction into the result, if it holds anything.
  const flush = () => {
    if (open.length === 0) return;
    slices.push(open);
    open = [];
  };

  let index = 0;
  while (index < sequence.length) {
    const node = sequence[index];

    if (node.type === "divider") {
      // Explicit split: the divider itself never ends up in a slice.
      flush();
      index += 1;
      continue;
    }

    if (node.type !== "heading") {
      // Body content simply extends the current slice.
      open.push(node);
      index += 1;
      continue;
    }

    // Semantic split: a heading starts a new slice unless it merges with a
    // banner image sitting at the very start of the sequence.
    const mergesWithBanner = index === 1 && isBannerImage(sequence, 0);
    if (!mergesWithBanner) flush();

    const headingRun = readHeadingGroup(sequence, index);
    open.push(...headingRun);

    // Skip past every heading that was consumed as part of the run.
    index += headingRun.length;
  }

  flush();
  return slices;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Tell whether the element at index `i` is a pretitle: a heading that is
 * immediately followed by a more important heading (numerically lower level),
 * e.g. H3→H1, H2→H1, H6→H5.
 *
 * @param {Array} sequence Elements to inspect
 * @param {number} i Index of the candidate element
 * @returns {boolean} True when `sequence[i]` is a pretitle
 */
function isPreTitle(sequence, i) {
  const nextIndex = i + 1;

  // There must be a following element at all.
  if (!(nextIndex < sequence.length)) return false;

  const current = sequence[i];
  if (current.type !== "heading") return false;

  const following = sequence[nextIndex];
  if (following.type !== "heading") return false;

  // Smaller heading (higher level number) placed before a larger one.
  return current.level > following.level;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Tell whether the element at index `i` is a banner image.
 *
 * Only the very first element (index 0) can qualify, and only when another
 * element follows it. It must be an image that either carries the role
 * "banner" or is directly followed by a heading.
 *
 * @param {Array} sequence Elements to inspect
 * @param {number} i Index of the candidate element
 * @returns {boolean} True when `sequence[i]` is a banner image
 */
function isBannerImage(sequence, i) {
  if (i !== 0) return false;
  if (!(i + 1 < sequence.length)) return false;

  const candidate = sequence[i];
  if (candidate.type !== "image") return false;

  return candidate.role === "banner" || sequence[i + 1].type === "heading";
}
|
|
117
|
+
|
|
118
|
+
/**
 * Collect the run of consecutive headings that starts at `startIdx`.
 *
 * The run always contains `sequence[startIdx]`. A following heading joins it
 * when it is strictly deeper than the last collected heading (e.g. H1 → H2),
 * or — only as the second member of the run — when it is more important than
 * the first one (pretitle promotion, e.g. H2 → H1). The run ends at the first
 * non-heading, sibling-level heading, or any other heading.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} startIdx Index of the first heading of the run
 * @returns {Array} The headings that form the run, in order
 */
function readHeadingGroup(sequence, startIdx) {
  const run = [sequence[startIdx]];
  let cursor = startIdx + 1;

  while (cursor < sequence.length) {
    const candidate = sequence[cursor];
    if (candidate.type !== "heading") break;

    const last = run[run.length - 1];

    // Standard subtitle / deeper header.
    const goesDeeper = candidate.level > last.level;
    // Small heading followed by a bigger one; allowed only right at the start.
    const promotesPretitle = run.length === 1 && candidate.level < last.level;

    if (!goesDeeper && !promotesPretitle) break;

    run.push(candidate);
    cursor += 1;
  }

  return run;
}
|
|
150
|
+
|
|
151
|
+
/**
 * Process a group's content to identify its structure.
 *
 * Headings fill the header slots in order (title, subtitle, subtitle2); any
 * further heading text is collected in `body.headings`. The first heading
 * that `isPreTitle` recognises is stored as `header.pretitle`. Every other
 * element is routed into the matching `body` collection. Lists and
 * blockquotes are processed recursively and contribute only the nested `body`.
 *
 * @param {Array} elements Elements of one group (or the children of a nested node)
 * @returns {{header: Object, body: Object, metadata: Object}} Structured view
 *   of the group; all collections are empty when `elements` is missing
 */
function processGroupContent(elements) {
  const header = {
    pretitle: "",
    title: "",
    subtitle: "",
    subtitle2: "",
    alignment: null,
  };

  const body = {
    imgs: [],
    icons: [],
    videos: [],
    paragraphs: [],
    links: [],
    lists: [],
    buttons: [],
    properties: {},
    propertyBlocks: [],
    cards: [],
    documents: [],
    forms: [],
    quotes: [],
    headings: [],
  };

  const metadata = {
    level: null,
    // NOTE(review): nothing in this function adds to contentTypes.
    contentTypes: new Set(),
  };

  if (!elements) {
    return { header, body, metadata };
  }

  for (let i = 0; i < elements.length; i++) {
    // The pretitle is only set once.
    if (isPreTitle(elements, i) && !header.pretitle) {
      header.pretitle = elements[i].text;
      i++; // isPreTitle guarantees the next element is a heading
    }

    const element = elements[i];

    if (element.type === "heading") {
      if (Array.isArray(element.children)) {
        processInlineElements(element.children, body);
      }

      // The first heading processed (after any pretitle) fixes the group level.
      // TODO: arguably this should be the most important heading of the group
      // rather than the first one seen.
      metadata.level ??= element.level;

      // Alignment comes from the first heading that declares one.
      if (!header.alignment && element.attrs?.textAlign) {
        header.alignment = element.attrs.textAlign;
      }

      // Fill the header slots in order; overflow headings belong to the body.
      if (!header.title) {
        header.title = element.text;
      } else if (!header.subtitle) {
        header.subtitle = element.text;
      } else if (!header.subtitle2) {
        header.subtitle2 = element.text;
      } else {
        body.headings.push(element.text);
      }
      continue;
    }

    if (element.type === "list") {
      // A list without a children array contributes an empty list instead of
      // throwing.
      const listItems = Array.isArray(element.children) ? element.children : [];
      body.lists.push(
        listItems.map((listItem) => processGroupContent(listItem).body)
      );
      continue;
    }

    // Shallow copy so stored media/link records do not alias element.attrs.
    const preserveProps = { ...element.attrs };

    switch (element.type) {
      case "paragraph":
        if (Array.isArray(element.children)) {
          processInlineElements(element.children, body);
        }
        if (element.text) body.paragraphs.push(element.text);
        break;

      case "image":
        body.imgs.push(preserveProps);
        break;

      case "video":
        body.videos.push(preserveProps);
        break;

      case "link":
        if (Array.isArray(element.children)) {
          processInlineElements(element.children, body);
        }
        body.links.push(preserveProps);
        break;

      case "icon":
        // attrs might be a string, so it is stored as-is rather than copied.
        body.icons.push(element.attrs);
        break;

      case "button":
        body.buttons.push({
          attrs: element.attrs,
          content: element.text,
        });
        break;

      case "blockquote": {
        // Process blockquote content recursively; only its body is kept.
        const quoteContent = processGroupContent(element.children);
        body.quotes.push(quoteContent.body);
        break;
      }

      case "codeBlock": {
        const codeData = element.text;
        body.properties = codeData; // Last one
        body.propertyBlocks.push(codeData); // All of them
        break;
      }

      case "form":
        body.forms.push(element.data || element.attrs);
        break;

      case "card-group":
        body.cards.push(...(element.cards ?? []));
        break;

      case "document-group":
        body.documents.push(...(element.documents ?? []));
        break;
    }
  }

  return { header, body, metadata };
}
|
|
304
|
+
|
|
305
|
+
/**
 * Determine if the first group should be treated as main content.
 *
 * No groups means no main content; a single group is always main content.
 * With two or more groups, the first one must have a heading level, and it
 * must be more important (numerically lower) than the second group's level —
 * or the second group must have no level at all.
 *
 * @param {Array} groups Processed groups, each exposing `metadata.level`
 * @returns {boolean} True when `groups[0]` is the main content
 */
function identifyMainContent(groups) {
  if (groups.length === 0) return false;
  if (groups.length === 1) return true;

  const leadLevel = groups[0].metadata.level;
  const nextLevel = groups[1].metadata.level;

  // A first group without any heading level can never be main content.
  if (!leadLevel) return false;

  // A level-less second group cannot compete with the first one.
  if (!nextLevel) return true;

  return leadLevel < nextLevel;
}
|
|
320
|
+
|
|
321
|
+
/**
 * Collect inline children that matter for the group body.
 * Only icons are handled for now: the `attrs` of every child of type "icon"
 * is appended to `body.icons`.
 *
 * @param {Array} children Inline child nodes of a block element
 * @param {Object} body Body record being filled; its `icons` array is mutated
 */
function processInlineElements(children, body) {
  const iconNodes = children.filter((child) => child.type === "icon");

  for (const iconNode of iconNodes) {
    body.icons.push(iconNode.attrs);
  }
}
|
|
329
|
+
|
|
330
|
+
export { processGroups };
|
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
/**
 * Transform a sequence into content groups with semantic structure.
 *
 * When the sequence contains at least one divider it is split by
 * `splitByDividers` (divider mode); otherwise by `splitByHeadings`. Each raw
 * group is then structured by `processGroupContent`.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {Object} options Parsing options (forwarded to `splitByHeadings`)
 * @returns {Object} `{ main, items, metadata }` — `main` is the leading group
 *   when it qualifies as main content (otherwise `null`), `items` holds the
 *   remaining groups, `metadata.dividerMode` records which split was used
 */
function processGroups(sequence, options = {}) {
  const result = {
    main: null,
    items: [],
    metadata: {
      dividerMode: false,
      // NOTE(review): never updated here — confirm whether callers expect a count.
      groups: 0,
    },
  };

  // A missing or empty sequence yields the empty result instead of throwing.
  if (!sequence?.length) return result;

  // Check if using divider mode
  const dividerMode = sequence.some((el) => el.type === "divider");
  result.metadata.dividerMode = dividerMode;

  // Split sequence into raw groups
  const groups = dividerMode
    ? splitByDividers(sequence)
    : splitByHeadings(sequence, options);

  // Process each group's structure
  const processedGroups = groups.map((group) => processGroupContent(group));

  // In divider mode, content that opens with a divider has no main group:
  // everything is an item.
  const everythingIsItem = dividerMode && groups.startsWithDivider;

  if (!everythingIsItem && identifyMainContent(processedGroups)) {
    result.main = processedGroups[0];
    result.items = processedGroups.slice(1);
  } else {
    result.items = processedGroups;
  }

  return result;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Split sequence into groups using dividers.
 *
 * Dividers are dropped; the elements between them form the groups, and empty
 * groups are never emitted. The returned array additionally carries a
 * `startsWithDivider` property, which is true when a divider was met before
 * any other element.
 *
 * @param {Array} sequence Flat sequence of elements
 * @returns {Array<Array>} Groups in document order, with `startsWithDivider` attached
 */
function splitByDividers(sequence) {
  const sections = [];
  let pending = [];
  let leadingDivider = false;

  for (const node of sequence) {
    if (node.type !== "divider") {
      pending.push(node);
      continue;
    }

    if (pending.length > 0) {
      // The divider closes the section collected so far.
      sections.push(pending);
      pending = [];
    } else if (sections.length === 0) {
      // Nothing collected and nothing emitted yet: the content opens with a divider.
      leadingDivider = true;
    }
  }

  if (pending.length > 0) {
    sections.push(pending);
  }

  sections.startsWithDivider = leadingDivider;
  return sections;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Split sequence into groups using heading patterns.
 *
 * Each heading run (as returned by `readHeadingGroup`) starts a new group. A
 * banner image (only while no group has been emitted yet) or a pretitle
 * "pre-opens" the group, so that the heading run that follows joins it
 * instead of starting another one.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {Object} options Parsing options (not read by this function)
 * @returns {Array<Array>} Groups in document order
 */
function splitByHeadings(sequence, options = {}) {
  const result = [];
  let open = [];
  let preOpened = false;

  // Close the group under construction — unless it was only pre-opened by a
  // banner or pretitle — and remember whether the next group is pre-opened.
  const beginGroup = (markPreOpened) => {
    const mustClose = open.length && !preOpened;
    if (mustClose) {
      result.push(open);
      open = [];
    }
    preOpened = markPreOpened;
  };

  let index = 0;
  while (index < sequence.length) {
    // Only allow a banner for the first group.
    if (!result.length && isBannerImage(sequence, index)) {
      beginGroup(true);
      open.push(sequence[index]);
      index += 1; // isBannerImage guarantees a following element exists
    }

    // A pretitle is handled before the regular heading run is consumed.
    if (isPreTitle(sequence, index)) {
      beginGroup(true);
      open.push(sequence[index]);
      index += 1; // isPreTitle guarantees the next element is a heading
    }

    const node = sequence[index];

    if (node.type === "heading") {
      const headingRun = readHeadingGroup(sequence, index);
      beginGroup(false);
      open.push(...headingRun);
      index += headingRun.length; // skip all the added headings
    } else {
      open.push(node);
      index += 1;
    }
  }

  if (open.length > 0) {
    result.push(open);
  }

  return result;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Check whether the element at index `i` is a pretitle — a heading directly
 * followed by a more important heading (e.g. H3→H1, H2→H1, H6→H5).
 *
 * @param {Array} sequence Elements to inspect
 * @param {number} i Index of the candidate element
 * @returns {boolean} True when `sequence[i]` is a pretitle
 */
function isPreTitle(sequence, i) {
  // Without a following element there is nothing to be a pretitle of.
  if (!(i + 1 < sequence.length)) return false;

  const current = sequence[i];
  const next = sequence[i + 1];

  const bothHeadings = current.type === "heading" && next.type === "heading";

  // Smaller heading (higher level number) before a larger one.
  return bothHeadings && current.level > next.level;
}
|
|
147
|
+
|
|
148
|
+
/**
 * Tell whether the element at index `i` is a banner image: an image that has
 * a following element and either carries the role "banner" or is directly
 * followed by a heading.
 *
 * @param {Array} sequence Elements to inspect
 * @param {number} i Index of the candidate element
 * @returns {boolean} True when `sequence[i]` is a banner image
 */
function isBannerImage(sequence, i) {
  const hasFollower = i + 1 < sequence.length;
  if (!hasFollower) return false;

  const image = sequence[i];
  if (image.type !== "image") return false;

  if (image.role === "banner") return true;
  return sequence[i + 1].type === "heading";
}
|
|
155
|
+
|
|
156
|
+
/**
 * Eagerly consume consecutive headings with strictly increasing levels,
 * starting at index `i`, and return them as an array. The heading at `i` is
 * always part of the result.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} i Index of the first heading
 * @returns {Array} The collected headings, in order
 */
function readHeadingGroup(sequence, i) {
  const run = [sequence[i]];
  let next = i + 1;

  while (next < sequence.length) {
    const candidate = sequence[next];

    // Each member must be a heading deeper than the element right before it.
    const extendsRun =
      candidate.type === "heading" &&
      candidate.level > sequence[next - 1].level;
    if (!extendsRun) break;

    run.push(candidate);
    next += 1;
  }

  return run;
}
|
|
176
|
+
|
|
177
|
+
/**
 * Process a group's content to identify its structure.
 *
 * A banner image and a pretitle are picked up first (in that order, matching
 * how `splitByHeadings` assembles a group). Headings then fill the header
 * slots in order (title, subtitle, subtitle2); further headings go to
 * `body.headings`. Every other element is routed into the matching `body`
 * collection. Blockquotes are processed recursively and contribute only the
 * nested `body`; lists are delegated to `processListContent`.
 *
 * @param {Array} elements Elements of one group (or the content of a nested node)
 * @returns {{header: Object, body: Object, banner: (Object|null), metadata: Object}}
 *   Structured view of the group; all collections are empty when `elements`
 *   is missing
 */
function processGroupContent(elements) {
  const header = {
    pretitle: "",
    title: "",
    subtitle: "",
    subtitle2: "",
    alignment: null,
  };
  let banner = null;
  const body = {
    imgs: [],
    icons: [],
    videos: [],
    paragraphs: [],
    links: [],
    lists: [],
    buttons: [],
    properties: {},
    propertyBlocks: [],
    cards: [],
    documents: [],
    forms: [],
    quotes: [],
    headings: [],
  };

  const metadata = {
    level: null,
    // NOTE(review): nothing in this function adds to contentTypes.
    contentTypes: new Set(),
  };

  // Nested nodes (blockquotes, list items) may come without content.
  if (!elements) {
    return { header, body, banner, metadata };
  }

  for (let i = 0; i < elements.length; i++) {
    // Banner before pretitle: a group may open with a banner image followed
    // by a pretitle, and checking the pretitle first would miss it.
    if (isBannerImage(elements, i)) {
      banner = {
        url: elements[i].src,
        caption: elements[i].caption,
        alt: elements[i].alt,
      };
      i++; // isBannerImage guarantees a following element exists
    }

    if (isPreTitle(elements, i)) {
      header.pretitle = elements[i].content;
      i++; // isPreTitle guarantees the next element is a heading
    }

    const element = elements[i];

    if (element.type === "heading") {
      // The first heading processed (after any pretitle) fixes the group level.
      metadata.level ??= element.level;

      // Alignment comes from the first heading that declares one.
      if (!header.alignment && element.attrs?.textAlign) {
        header.alignment = element.attrs.textAlign;
      }

      // Fill the header slots in order; overflow headings belong to the body.
      if (!header.title) {
        header.title = element.content;
      } else if (!header.subtitle) {
        header.subtitle = element.content;
      } else if (!header.subtitle2) {
        header.subtitle2 = element.content;
      } else {
        body.headings.push(element.content);
      }
      continue;
    }

    if (element.type === "list") {
      body.lists.push(processListContent(element));
      continue;
    }

    switch (element.type) {
      case "paragraph":
        body.paragraphs.push(element.content);
        break;

      case "image":
        body.imgs.push({
          url: element.src,
          caption: element.caption,
          alt: element.alt,
        });
        break;

      case "link":
        body.links.push({
          href: element.content.href,
          label: element.content.label,
        });
        break;

      case "styledLink":
        // Styled link (multi-part with same href)
        body.links.push({
          href: element.href,
          label: element.content,
          target: element.target,
        });
        break;

      case "icon":
        body.icons.push(element.svg);
        break;

      case "button":
        body.buttons.push(element);
        break;

      case "video":
        body.videos.push({
          src: element.src,
          caption: element.caption,
          alt: element.alt,
        });
        break;

      case "blockquote": {
        // Process blockquote content recursively; only its body is kept.
        // (The previous code passed an undeclared `options` identifier here,
        // which threw a ReferenceError for every blockquote.)
        const quoteContent = processGroupContent(element.content);
        body.quotes.push(quoteContent.body);
        break;
      }

      case "codeBlock": {
        // Use parsed JSON if available (neither null nor missing), otherwise
        // use the text content.
        const codeData = element.parsed ?? element.content;
        body.properties = codeData; // Last one
        body.propertyBlocks.push(codeData); // All of them
        break;
      }

      case "card-group":
        body.cards.push(...element.cards);
        break;

      case "document-group":
        body.documents.push(...element.documents);
        break;

      case "form":
        body.forms.push(element.data || element.attrs);
        break;
    }
  }

  return {
    header,
    body,
    banner,
    metadata,
  };
}
|
|
342
|
+
|
|
343
|
+
/**
 * Turn a list element into an array of processed item bodies.
 *
 * Each item's `content` is run through `processGroupContent` and its `body`
 * is kept. When an item has nested `items`, their processed bodies replace
 * the item's `lists` with a single nested list.
 *
 * @param {Object} list List element; its `items` array holds the list items
 * @returns {Array<Object>} One processed body per list item (empty when the
 *   list has no `items`)
 */
function processListContent(list) {
  // Lists and leaf items without an `items` array are treated as having none.
  const { items = [] } = list;

  return items.map((item) => {
    const { items: nestedList = [], content: listContent } = item;

    const parsedContent = processGroupContent(listContent).body;

    if (nestedList.length) {
      const parsedNestedList = nestedList.map(
        (nestedItem) => processGroupContent(nestedItem.content).body
      );

      parsedContent.lists = [parsedNestedList];
    }

    return parsedContent;
  });
}
|
|
362
|
+
|
|
363
|
+
/**
 * Determine if the first group should be treated as main content.
 *
 * An empty list has no main content and a single group always is main
 * content. Otherwise the first group needs a heading level, and the second
 * group must either have none or have a less important (numerically higher)
 * one.
 *
 * @param {Array} groups Processed groups, each exposing `metadata.level`
 * @returns {boolean} True when `groups[0]` is the main content
 */
function identifyMainContent(groups) {
  const count = groups.length;
  if (count === 0) return false;
  if (count === 1) return true;

  const firstLevel = groups[0].metadata.level;
  const secondLevel = groups[1].metadata.level;

  // Without a level of its own the first group cannot lead.
  if (!firstLevel) return false;

  return !secondLevel || firstLevel < secondLevel;
}
|
|
378
|
+
|
|
379
|
+
export { processGroups };
|