@uniweb/semantic-parser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -0
- package/.eslintrc.json +28 -0
- package/LICENSE +674 -0
- package/README.md +395 -0
- package/docs/api.md +352 -0
- package/docs/file-structure.md +50 -0
- package/docs/guide.md +206 -0
- package/docs/mapping-patterns.md +928 -0
- package/docs/text-component-reference.md +515 -0
- package/package.json +41 -0
- package/reference/README.md +195 -0
- package/reference/Text.js +188 -0
- package/src/index.js +35 -0
- package/src/mappers/accessor.js +312 -0
- package/src/mappers/extractors.js +397 -0
- package/src/mappers/helpers.js +234 -0
- package/src/mappers/index.js +28 -0
- package/src/mappers/types.js +495 -0
- package/src/processors/byType.js +129 -0
- package/src/processors/groups.js +330 -0
- package/src/processors/groups_backup.js +379 -0
- package/src/processors/groups_doc.md +179 -0
- package/src/processors/sequence.js +573 -0
- package/src/processors/sequence_backup.js +402 -0
- package/src/utils/role.js +53 -0
|
@@ -0,0 +1,495 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Field type definitions for content transformation
|
|
3
|
+
* Handles automatic cleanup and transformation based on component requirements
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Strip all HTML/markup from text, preserving only text content.
 *
 * Tags are removed with a regular expression (this is not an HTML parser), then
 * a small fixed set of entities is decoded: `&nbsp;` (to a regular space),
 * `&lt;`, `&gt;`, `&quot;`, `&#39;` and `&amp;`. Other entities are left as-is.
 *
 * @param {string} text - Text with potential markup
 * @param {Object} options - Stripping options
 * @param {boolean} [options.preserveLineBreaks=false] - Turn `<br>` into `\n`
 *   and keep line breaks; spaces inside each line are still collapsed unless
 *   `preserveWhitespace` is also set
 * @param {boolean} [options.preserveWhitespace=false] - Skip all whitespace
 *   normalization
 * @returns {string} Plain text ('' when `text` is not a string)
 */
function stripMarkup(text, options = {}) {
  if (typeof text !== 'string') return '';

  const {
    preserveLineBreaks = false,
    preserveWhitespace = false
  } = options;

  let result = text;

  // Convert <br> to newlines before the generic tag removal would drop them
  if (preserveLineBreaks) {
    result = result.replace(/<br\s*\/?>/gi, '\n');
  }

  // Remove all HTML tags
  result = result.replace(/<[^>]*>/g, '');

  // Decode HTML entities. `&amp;` is decoded last so that an escaped entity
  // such as `&amp;lt;` becomes the literal text `&lt;` instead of being
  // decoded a second time into `<`.
  result = result
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, '&');

  // Normalize whitespace unless preserving
  if (!preserveWhitespace && !preserveLineBreaks) {
    result = result.replace(/\s+/g, ' ').trim();
  } else if (!preserveWhitespace && preserveLineBreaks) {
    // Preserve line breaks but normalize spaces within lines
    result = result
      .split('\n')
      .map((line) => line.replace(/\s+/g, ' ').trim())
      .join('\n');
  }

  return result;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Truncate text to a maximum length, preferring to cut at a natural boundary.
 *
 * Lengths are measured in UTF-16 code units (`String.prototype.length`). The
 * ellipsis is appended after the cut, so the returned string may be longer
 * than `maxLength` by the length of the ellipsis.
 *
 * @param {string} text - Text to truncate
 * @param {Object} options - Truncation options
 * @param {number} [options.maxLength] - Maximum length of the kept text; when
 *   falsy the text is returned without truncation
 * @param {string} [options.boundary='word'] - 'word', 'sentence' or 'character'
 * @param {string} [options.ellipsis='...'] - Suffix for word/character cuts
 * @param {boolean} [options.stripMarkup=false] - Strip markup before measuring
 * @returns {string} Truncated text ('' when `text` is not a string)
 */
function truncateText(text, options = {}) {
  if (typeof text !== 'string') return '';

  const {
    maxLength,
    boundary = 'word', // 'word', 'sentence', 'character'
    ellipsis = '...',
    stripMarkup: strip = false
  } = options;

  // Strip first so the option is honoured even when no maxLength is given
  const result = strip ? stripMarkup(text) : text;

  // Nothing to truncate
  if (!maxLength || result.length <= maxLength) return result;

  // Never cut between the two halves of a surrogate pair: if the last kept
  // code unit is a high surrogate, drop it as well.
  const lastUnit = result.charCodeAt(maxLength - 1);
  const cut = lastUnit >= 0xd800 && lastUnit <= 0xdbff ? maxLength - 1 : maxLength;
  const truncated = result.substring(0, cut);

  if (boundary === 'character') {
    return truncated + ellipsis;
  }

  if (boundary === 'sentence') {
    // Last sentence terminator (followed by a space) inside the kept text
    const lastSentenceEnd = Math.max(
      truncated.lastIndexOf('. '),
      truncated.lastIndexOf('! '),
      truncated.lastIndexOf('? ')
    );

    // Only accept it when at least half of the budget is used; the sentence
    // is complete, so no ellipsis is added. Otherwise fall back to word mode.
    if (lastSentenceEnd > maxLength * 0.5) {
      return result.substring(0, lastSentenceEnd + 1);
    }
  }

  // Word boundary (default): cut at the last space unless that would discard
  // more than ~30% of the budget, in which case cut mid-word.
  const lastSpace = truncated.lastIndexOf(' ');
  if (lastSpace > maxLength * 0.7) {
    return result.substring(0, lastSpace) + ellipsis;
  }

  return truncated + ellipsis;
}
|
|
103
|
+
|
|
104
|
+
/**
 * Sanitize HTML, removing dangerous tags while preserving safe formatting.
 *
 * Processing steps:
 *  1. Elements listed in `stripTags` are removed together with their content.
 *  2. Every other tag not listed in `allowedTags` is removed (content kept).
 *  3. Allowed `<a>` tags are rebuilt with only an `href`, and only when the
 *     URL is relative or uses one of `allowedSchemes`.
 *  4. Other allowed tags keep their attributes except `on*` event handlers.
 *
 * NOTE: this is a regex-based, best-effort filter, not an HTML parser. Prefer
 * a dedicated sanitizer library for fully untrusted input.
 *
 * @param {string} html - HTML to sanitize
 * @param {Object} options - Sanitization options
 * @param {string[]} [options.allowedTags] - Tag names to keep
 * @param {string[]} [options.stripTags] - Tag names removed with their content
 * @param {string[]} [options.allowedSchemes] - URL schemes permitted in hrefs
 * @returns {string} Sanitized HTML ('' when `html` is not a string)
 */
function sanitizeHtml(html, options = {}) {
  if (typeof html !== 'string') return '';

  const {
    allowedTags = ['strong', 'em', 'a', 'br'],
    stripTags = ['script', 'style', 'iframe', 'object', 'embed'],
    allowedSchemes = ['http', 'https', 'mailto', 'tel']
  } = options;

  const allowed = new Set(allowedTags.map((tag) => String(tag).toLowerCase()));
  const schemes = new Set(allowedSchemes.map((scheme) => String(scheme).toLowerCase()));

  // Escape characters that are special inside a regular expression
  const escapeRegExp = (source) => source.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Turn a numeric character reference into its character ('' if invalid)
  const decodeCharRef = (code) =>
    Number.isInteger(code) && code >= 0 && code <= 0x10ffff
      ? String.fromCodePoint(code)
      : '';

  // A URL is safe when it is relative or its scheme is explicitly allowed.
  // Character references, whitespace and control characters are resolved
  // first because browsers ignore them when reading the scheme.
  const isSafeHref = (href) => {
    const normalized = href
      .replace(/&#x([0-9a-f]+);?/gi, (_, hex) => decodeCharRef(parseInt(hex, 16)))
      .replace(/&#(\d+);?/g, (_, dec) => decodeCharRef(parseInt(dec, 10)))
      .replace(/&colon;/gi, ':')
      .replace(/&(?:tab|newline);/gi, '')
      .replace(/[\u0000-\u0020]/g, '')
      .toLowerCase();
    const scheme = /^([a-z][a-z0-9+.-]*):/.exec(normalized);
    return scheme === null || schemes.has(scheme[1]);
  };

  // Split the attribute portion of a tag into { raw, name, value } records
  const parseAttributes = (attrText) => {
    const pattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>=`]+)))?/g;
    return Array.from(attrText.matchAll(pattern), (m) => ({
      raw: m[0],
      name: m[1].toLowerCase(),
      value: m[2] ?? m[3] ?? m[4] ?? null
    }));
  };

  let result = html;

  // Remove explicitly forbidden tags and their content
  stripTags.forEach((tag) => {
    const name = escapeRegExp(String(tag));
    const element = new RegExp(`<${name}\\b[^>]*>.*?</${name}\\b[^>]*>`, 'gis');
    result = result.replace(element, '');
  });

  // Remove tags not in allowedTags and clean the attributes of the rest
  return result.replace(/<\/?([a-z][a-z0-9]*)\b[^>]*>/gi, (match, tagName) => {
    const name = tagName.toLowerCase();
    if (!allowed.has(name)) return '';

    const isClosing = match.startsWith('</');
    const attrText = isClosing ? '' : match.slice(1 + tagName.length, -1);

    if (name === 'a') {
      if (isClosing) return '</a>';

      // Anchors are rebuilt from scratch: only a vetted href survives
      const href = parseAttributes(attrText).find((attr) => attr.name === 'href');
      if (href && href.value && isSafeHref(href.value)) {
        return `<a href="${href.value.replace(/"/g, '&quot;')}">`;
      }
      return '<a>';
    }

    if (isClosing) return match;

    // Other allowed tags: drop inline event handlers, keep the rest. The tag
    // is only rewritten when something was actually removed.
    const attrs = parseAttributes(attrText);
    const kept = attrs.filter((attr) => !attr.name.startsWith('on'));
    if (kept.length === attrs.length) return match;

    const selfClosing = /\/\s*>$/.test(match) ? ' /' : '';
    return `<${tagName}${kept.map((attr) => ` ${attr.raw}`).join('')}${selfClosing}>`;
  });
}
|
|
145
|
+
|
|
146
|
+
/**
 * Create an excerpt from text content.
 *
 * Markup is always stripped. When `preferFirstSentence` is set and the first
 * sentence is no longer than 120% of `maxLength`, that sentence is returned
 * on its own; otherwise the text is truncated with `truncateText`.
 *
 * @param {string|Array} content - Text or array of paragraphs (joined by ' ')
 * @param {Object} options - Excerpt options
 * @param {number} [options.maxLength=150] - Target maximum length
 * @param {string} [options.boundary='word'] - Boundary passed to truncateText
 * @param {string} [options.ellipsis='...'] - Ellipsis passed to truncateText
 * @param {boolean} [options.preferFirstSentence=true] - Try the first sentence
 * @returns {string} Excerpt ('' when content is neither a string nor an array)
 */
function createExcerpt(content, options = {}) {
  const {
    maxLength = 150,
    boundary = 'word',
    ellipsis = '...',
    preferFirstSentence = true
  } = options;

  // Convert array to string
  let text = Array.isArray(content) ? content.join(' ') : content;
  if (typeof text !== 'string') return '';

  // Always strip markup for excerpts
  text = stripMarkup(text);

  // Try to get first sentence if preferred and not too long
  if (preferFirstSentence) {
    // A terminator only ends the sentence when followed by whitespace or the
    // end of the text, so "3.5" or "example.com" do not cut it short.
    const firstSentence = text.match(/^.+?[.!?](?=\s|$)/);
    if (firstSentence && firstSentence[0].length <= maxLength * 1.2) {
      return firstSentence[0].trim();
    }
  }

  return truncateText(text, { maxLength, boundary, ellipsis });
}
|
|
177
|
+
|
|
178
|
+
/**
 * Type handlers for field transformations.
 *
 * Each handler exposes:
 *  - `transform(value, options)` – returns the cleaned/normalized value
 *  - `validate(value, rules, context)` – returns an array of issue objects
 *    `{ field, type, message, severity, autoFix }` (empty when valid)
 */
const typeHandlers = {
  /**
   * Plain text - strips all markup
   */
  plaintext: {
    transform: (value, options = {}) => {
      if (value === null || value === undefined) return '';

      let result = stripMarkup(String(value), options);

      if (options.maxLength) {
        result = truncateText(result, {
          maxLength: options.maxLength,
          boundary: options.boundary || 'word',
          // `??` so that an explicit empty ellipsis is respected
          ellipsis: options.ellipsis ?? '...'
        });
      }

      // Optional caller-supplied post-processing step
      if (options.transform) {
        result = options.transform(result);
      }

      return result;
    },

    validate: (value, rules = {}, context = 'visual-editor') => {
      const errors = [];
      // Mirror transform(): only null/undefined are empty, so 0 and false
      // validate as the text "0" / "false" instead of as a missing value.
      const raw = value === null || value === undefined ? '' : String(value);
      const stripped = stripMarkup(raw);

      // Only warn in build mode
      if (context === 'build' && /<[^>]*>/.test(raw)) {
        errors.push({
          field: rules.fieldName,
          type: 'markup_detected',
          message: 'Field contains HTML markup but expects plain text',
          severity: 'warning',
          autoFix: true
        });
      }

      if (rules.required && !stripped) {
        errors.push({
          field: rules.fieldName,
          type: 'required',
          message: 'Required field is missing',
          severity: 'error',
          autoFix: false
        });
      }

      if (rules.maxLength && stripped.length > rules.maxLength) {
        errors.push({
          field: rules.fieldName,
          type: 'max_length',
          message: `Text is ${stripped.length} characters (max: ${rules.maxLength})`,
          severity: context === 'build' ? 'warning' : 'info',
          autoFix: true
        });
      }

      if (rules.minLength && stripped.length < rules.minLength) {
        errors.push({
          field: rules.fieldName,
          type: 'min_length',
          message: `Text is ${stripped.length} characters (min: ${rules.minLength})`,
          severity: 'warning',
          autoFix: false
        });
      }

      return errors;
    }
  },

  /**
   * Rich text - preserves safe HTML, removes dangerous tags
   */
  richtext: {
    transform: (value, options = {}) => {
      if (value === null || value === undefined) return '';

      let result = sanitizeHtml(String(value), {
        allowedTags: options.allowedTags || ['strong', 'em', 'a', 'br'],
        stripTags: options.stripTags || ['script', 'style', 'iframe']
      });

      if (options.maxLength) {
        // For richtext, truncate but preserve markup.
        // NOTE(review): truncation counts markup characters and is not
        // tag-aware, so a cut may land inside a tag or leave one unclosed.
        result = truncateText(result, {
          maxLength: options.maxLength,
          boundary: options.boundary || 'word',
          ellipsis: options.ellipsis ?? '...',
          stripMarkup: false
        });
      }

      return result;
    },

    validate: (value, rules = {}, context = 'visual-editor') => {
      const errors = [];
      const raw = value === null || value === undefined ? '' : String(value);

      // A value consisting only of markup counts as missing
      if (rules.required && !stripMarkup(raw)) {
        errors.push({
          field: rules.fieldName,
          type: 'required',
          message: 'Required field is missing',
          severity: 'error',
          autoFix: false
        });
      }

      return errors;
    }
  },

  /**
   * Excerpt - auto-generates excerpt from content
   */
  excerpt: {
    transform: (value, options = {}) => {
      return createExcerpt(value, {
        maxLength: options.maxLength || 150,
        boundary: options.boundary || 'word',
        ellipsis: options.ellipsis ?? '...',
        preferFirstSentence: options.preferFirstSentence !== false
      });
    },

    validate: () => [] // Excerpts are auto-generated, no validation needed
  },

  /**
   * Number - parses and formats numbers
   */
  number: {
    transform: (value, options = {}) => {
      const num = parseFloat(value);
      if (isNaN(num)) {
        return options.defaultValue !== undefined ? options.defaultValue : 0;
      }

      // Without a format the parsed number itself is returned
      if (!options.format) {
        return num;
      }

      // Simple number formatting (returns a string)
      const { decimals, thousands = ',', decimal = '.' } = options.format;

      const fixed = decimals !== undefined ? num.toFixed(decimals) : String(num);
      const [integerPart, ...fractionParts] = fixed.split('.');

      // Group the integer digits only when a thousands separator is set
      const grouped = thousands
        ? integerPart.replace(/\B(?=(\d{3})+(?!\d))/g, thousands)
        : integerPart;

      // The decimal separator applies whether or not grouping is enabled
      return [grouped, ...fractionParts].join(decimal);
    },

    validate: (value, rules = {}) => {
      const errors = [];
      const num = parseFloat(value);

      if (rules.required && isNaN(num)) {
        errors.push({
          field: rules.fieldName,
          type: 'invalid_number',
          message: 'Value is not a valid number',
          severity: 'error',
          autoFix: false
        });
      }

      return errors;
    }
  },

  /**
   * Image - processes image data
   */
  image: {
    transform: (value, options = {}) => {
      if (!value) {
        return options.defaultValue || null;
      }

      // Handle string (URL) or object (full image data)
      if (typeof value === 'string') {
        return {
          url: value,
          alt: options.defaultAlt || '',
          caption: null
        };
      }

      return {
        url: value.url || value.src,
        alt: value.alt || options.defaultAlt || '',
        caption: value.caption || value.title || null,
        width: value.width,
        height: value.height
      };
    },

    validate: (value, rules = {}) => {
      const errors = [];

      if (rules.required && !value) {
        errors.push({
          field: rules.fieldName,
          type: 'required',
          message: 'Required image is missing',
          severity: 'error',
          autoFix: false
        });
      }

      return errors;
    }
  },

  /**
   * Link - processes link data
   */
  link: {
    transform: (value, options = {}) => {
      if (!value) {
        return options.defaultValue || null;
      }

      // Handle string (URL) or object (full link data)
      if (typeof value === 'string') {
        return {
          href: value,
          label: options.defaultLabel || value,
          // URLs starting with "http" are opened in a new tab
          target: value.startsWith('http') ? '_blank' : '_self'
        };
      }

      const href = value.href || value.url;

      return {
        href,
        // Fall back to the resolved href so links given as `{ url }` get a label
        label: value.label || value.text || href,
        target: value.target || (value.external ? '_blank' : '_self')
      };
    },

    validate: (value, rules = {}) => {
      const errors = [];

      if (rules.required && !value) {
        errors.push({
          field: rules.fieldName,
          type: 'required',
          message: 'Required link is missing',
          severity: 'error',
          autoFix: false
        });
      }

      return errors;
    }
  }
};
|
|
451
|
+
|
|
452
|
+
/**
 * Apply type transformation to a value.
 *
 * Looks up the handler registered for `type` in `typeHandlers` and returns
 * the result of its `transform`. Unknown types log a warning and return the
 * value untouched.
 *
 * @param {*} value - Value to transform
 * @param {string} type - Field type
 * @param {Object} options - Type-specific options
 * @returns {*} Transformed value
 */
function applyType(value, type, options = {}) {
  // Own-property check: names inherited from Object.prototype (e.g.
  // "toString", "constructor") must not be mistaken for handlers.
  const handler = Object.prototype.hasOwnProperty.call(typeHandlers, type)
    ? typeHandlers[type]
    : undefined;

  if (!handler) {
    console.warn(`Unknown field type: ${type}`);
    return value;
  }

  return handler.transform(value, options);
}
|
|
468
|
+
|
|
469
|
+
/**
 * Validate value against type and rules.
 *
 * Delegates to the `validate` function of the handler registered for `type`
 * in `typeHandlers`. Unknown types produce no errors.
 *
 * @param {*} value - Value to validate
 * @param {string} type - Field type
 * @param {Object} rules - Validation rules
 * @param {string} context - Execution context (visual-editor or build)
 * @returns {Array} Array of validation errors/warnings
 */
function validateType(value, type, rules = {}, context = 'visual-editor') {
  // Own-property check: names inherited from Object.prototype (e.g.
  // "toString", "constructor") must not be mistaken for handlers.
  if (!Object.prototype.hasOwnProperty.call(typeHandlers, type)) {
    return [];
  }

  const handler = typeHandlers[type];
  if (!handler) {
    return [];
  }

  return handler.validate(value, rules, context);
}
|
|
485
|
+
|
|
486
|
+
export {
|
|
487
|
+
typeHandlers,
|
|
488
|
+
applyType,
|
|
489
|
+
validateType,
|
|
490
|
+
// Export utilities for direct use
|
|
491
|
+
stripMarkup,
|
|
492
|
+
truncateText,
|
|
493
|
+
sanitizeHtml,
|
|
494
|
+
createExcerpt
|
|
495
|
+
};
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
 * Organize content elements by their type while preserving context.
 *
 * Every element is copied with an added `context` property (from
 * `getElementContext`) and placed in the collection matching its `type`.
 * Images are further bucketed by `role` (default "content"); unknown roles
 * get their own bucket. Elements of other types are only counted.
 *
 * @param {Array} sequence Flat sequence of elements (null/undefined is
 *   treated as an empty sequence)
 * @returns {Object} Content organized by type, plus `metadata`
 *   ({ totalElements, dominantType, hasMedia }) and the helper methods added
 *   by `addCollectionHelpers`
 */
function processByType(sequence) {
  const elements = sequence ?? [];

  const collections = {
    headings: [],
    paragraphs: [],
    images: {
      background: [],
      content: [],
      gallery: [],
      icon: [],
    },
    lists: [],
    dividers: [],
    metadata: {
      totalElements: elements.length,
      dominantType: null,
      hasMedia: false,
    },
  };

  // Track type frequencies for metadata
  const typeFrequency = new Map();

  elements.forEach((element, index) => {
    // Track element type frequency
    typeFrequency.set(element.type, (typeFrequency.get(element.type) || 0) + 1);

    // Add context information
    const context = getElementContext(elements, index);
    const enrichedElement = { ...element, context };

    // Process element based on type
    switch (element.type) {
      case "heading":
        collections.headings.push(enrichedElement);
        break;

      case "paragraph":
        collections.paragraphs.push(enrichedElement);
        break;

      case "image": {
        const role = element.role || "content";
        // Own-property check so a role named like an inherited member
        // (e.g. "constructor") still gets a real bucket instead of crashing.
        if (!Object.prototype.hasOwnProperty.call(collections.images, role)) {
          collections.images[role] = [];
        }
        collections.images[role].push(enrichedElement);
        collections.metadata.hasMedia = true;
        break;
      }

      case "list":
        collections.lists.push(enrichedElement);
        break;

      case "divider":
        collections.dividers.push(enrichedElement);
        break;
    }
  });

  // Calculate dominant type (first type to reach the highest count wins ties)
  let maxFrequency = 0;
  typeFrequency.forEach((frequency, type) => {
    if (frequency > maxFrequency) {
      maxFrequency = frequency;
      collections.metadata.dominantType = type;
    }
  });

  // Add helper methods
  addCollectionHelpers(collections);

  return collections;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Describe where an element sits inside the sequence.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} position Index of the element being described
 * @returns {Object} `{ position, previousElement, nextElement, nearestHeading }`
 *   where the neighbours are `null` at the ends of the sequence and
 *   `nearestHeading` is the closest earlier element of type "heading"
 *   (or `null` when there is none)
 */
function getElementContext(sequence, position) {
  const hasPrevious = position > 0;
  const hasNext = position < sequence.length - 1;

  // Walk backwards from the element until a heading turns up
  let nearestHeading = null;
  let cursor = position - 1;
  while (cursor >= 0 && nearestHeading === null) {
    const candidate = sequence[cursor];
    if (candidate.type === "heading") {
      nearestHeading = candidate;
    }
    cursor -= 1;
  }

  return {
    position,
    previousElement: hasPrevious ? sequence[position - 1] : null,
    nextElement: hasNext ? sequence[position + 1] : null,
    nearestHeading,
  };
}
|
|
102
|
+
|
|
103
|
+
/**
 * Attach query helpers to a collections object (mutates it in place).
 *
 * Adds two methods that read from the object they are called on:
 *  - `getHeadingsByLevel(level)`: headings whose `level` strictly equals `level`
 *  - `getElementsByHeadingContext(headingFilter)`: paragraphs, images (all
 *    roles) and lists whose `context.nearestHeading` exists and satisfies
 *    `headingFilter`
 *
 * @param {Object} collections Object with `headings`, `paragraphs`, `images`
 *   and `lists` properties
 */
function addCollectionHelpers(collections) {
  // Headings filtered by level
  collections.getHeadingsByLevel = function (level) {
    const hasLevel = (heading) => heading.level === level;
    return this.headings.filter(hasLevel);
  };

  // Content elements filtered by the heading they appear under
  collections.getElementsByHeadingContext = function (headingFilter) {
    const imagesOfAllRoles = Object.values(this.images).flat();
    const candidates = this.paragraphs.concat(imagesOfAllRoles, this.lists);

    return candidates.filter((candidate) => {
      const heading = candidate.context?.nearestHeading;
      return heading && headingFilter(heading);
    });
  };
}
|
|
126
|
+
|
|
127
|
+
export {
|
|
128
|
+
processByType
|
|
129
|
+
};
|