@uniweb/semantic-parser 1.1.2 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/processors/groups.js +40 -6
package/package.json
CHANGED
package/src/processors/groups.js
CHANGED
|
@@ -173,6 +173,7 @@ function isBannerImage(sequence, i) {
|
|
|
173
173
|
|
|
174
174
|
function readHeadingGroup(sequence, startIdx) {
|
|
175
175
|
const elements = [sequence[startIdx]];
|
|
176
|
+
let hasGoneDeeper = false;
|
|
176
177
|
|
|
177
178
|
// Iterate starting from the next element
|
|
178
179
|
for (let i = startIdx + 1; i < sequence.length; i++) {
|
|
@@ -186,6 +187,7 @@ function readHeadingGroup(sequence, startIdx) {
|
|
|
186
187
|
// Case 1: Strictly Deeper (Standard Subtitle/Deep Header)
|
|
187
188
|
// e.g. H1 -> H2
|
|
188
189
|
if (element.level > previousElement.level) {
|
|
190
|
+
hasGoneDeeper = true;
|
|
189
191
|
elements.push(element);
|
|
190
192
|
continue;
|
|
191
193
|
}
|
|
@@ -198,7 +200,18 @@ function readHeadingGroup(sequence, startIdx) {
|
|
|
198
200
|
continue;
|
|
199
201
|
}
|
|
200
202
|
|
|
201
|
-
//
|
|
203
|
+
// Case 3: Same Level Continuation (multi-line heading)
|
|
204
|
+
// Only before going deeper — once a subtitle level is reached,
|
|
205
|
+
// same-level headings are new sections, not continuations.
|
|
206
|
+
// e.g. H1 -> H1 → merged into title array
|
|
207
|
+
// but H1 -> H2 -> H2 → second H2 starts a new group (items)
|
|
208
|
+
if (element.level === previousElement.level && !hasGoneDeeper) {
|
|
209
|
+
elements.push(element);
|
|
210
|
+
continue;
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
// Otherwise (New Section — went deeper then back up, or
|
|
214
|
+
// same-level after going deeper), stop.
|
|
202
215
|
break;
|
|
203
216
|
}
|
|
204
217
|
return elements;
|
|
@@ -240,6 +253,10 @@ function processGroupContent(elements) {
|
|
|
240
253
|
metadata,
|
|
241
254
|
};
|
|
242
255
|
|
|
256
|
+
// Track last assigned heading slot and level for same-level merging
|
|
257
|
+
let lastSlot = null;
|
|
258
|
+
let lastLevel = null;
|
|
259
|
+
|
|
243
260
|
for (let i = 0; i < elements.length; i++) {
|
|
244
261
|
// We should only set pretitle once
|
|
245
262
|
if (isPreTitle(elements, i) && !header.pretitle) {
|
|
@@ -256,19 +273,29 @@ function processGroupContent(elements) {
|
|
|
256
273
|
// We should set the group level to the highest one instead of the first one.
|
|
257
274
|
metadata.level ??= element.level;
|
|
258
275
|
|
|
259
|
-
//
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
276
|
+
// Same level as last assigned → merge into same slot as array
|
|
277
|
+
if (lastLevel !== null && element.level === lastLevel && lastSlot) {
|
|
278
|
+
const current = header[lastSlot];
|
|
279
|
+
if (Array.isArray(current)) {
|
|
280
|
+
current.push(element.text);
|
|
281
|
+
} else {
|
|
282
|
+
header[lastSlot] = [current, element.text];
|
|
283
|
+
}
|
|
284
|
+
} else if (!header.title) {
|
|
263
285
|
header.title = element.text;
|
|
286
|
+
lastSlot = 'title';
|
|
264
287
|
} else if (!header.subtitle) {
|
|
265
288
|
header.subtitle = element.text;
|
|
289
|
+
lastSlot = 'subtitle';
|
|
266
290
|
} else if (!header.subtitle2) {
|
|
267
291
|
header.subtitle2 = element.text;
|
|
292
|
+
lastSlot = 'subtitle2';
|
|
268
293
|
} else {
|
|
269
294
|
// After subtitle2, we're in body - collect heading
|
|
270
295
|
body.headings.push(element.text);
|
|
296
|
+
lastSlot = null;
|
|
271
297
|
}
|
|
298
|
+
lastLevel = element.level;
|
|
272
299
|
} else if (element.type === "list") {
|
|
273
300
|
const listItems = element.children;
|
|
274
301
|
|
|
@@ -405,7 +432,14 @@ function identifyMainContent(groups) {
|
|
|
405
432
|
const first = groups[0].metadata.level;
|
|
406
433
|
const second = groups[1].metadata.level;
|
|
407
434
|
|
|
408
|
-
|
|
435
|
+
// First group has a heading more important than second → main
|
|
436
|
+
if (first && (!second || first < second)) return true;
|
|
437
|
+
|
|
438
|
+
// First group has NO heading (just body content before first heading) → promote to main
|
|
439
|
+
// This prevents empty-titled first items when content precedes headings
|
|
440
|
+
if (!first && second) return true;
|
|
441
|
+
|
|
442
|
+
return false;
|
|
409
443
|
}
|
|
410
444
|
|
|
411
445
|
function processInlineElements(children, body) {
|