@uniweb/semantic-parser 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +3 -7
- package/README.md +2 -159
- package/package.json +2 -5
- package/src/index.js +1 -2
- package/src/processors/groups.js +1 -8
- package/docs/api.md +0 -350
- package/docs/entity-consolidation.md +0 -470
- package/docs/file-structure.md +0 -50
- package/docs/guide.md +0 -206
- package/docs/mapping-patterns.md +0 -928
- package/docs/text-component-reference.md +0 -515
- package/reference/README.md +0 -195
- package/reference/Text.js +0 -188
- package/src/mappers/accessor.js +0 -312
- package/src/mappers/extractors.js +0 -416
- package/src/mappers/helpers.js +0 -234
- package/src/mappers/index.js +0 -28
- package/src/mappers/types.js +0 -495
- package/src/processors/groups_backup.js +0 -379
- package/src/processors/groups_doc.md +0 -179
- package/src/processors/sequence_backup.js +0 -402
- package/src/processors_old/byType.js +0 -129
- package/src/processors_old/groups.js +0 -240
- package/src/processors_old/sequence.js +0 -140
|
@@ -1,240 +0,0 @@
|
|
|
1
|
-
/**
 * Organize a flat element sequence into content groups.
 *
 * The sequence is split on dividers when it contains at least one element
 * of type "divider"; otherwise it is split on heading patterns. Each raw
 * group is then analysed by `processGroupContent`.
 *
 * Note: `metadata.groups` is initialised to 0 and is not updated here.
 *
 * @param {Array} sequence Flat sequence of elements
 * @returns {Object} `{ main, items, metadata }` where `main` is the first
 *   processed group (or null) and `items` holds the remaining groups
 */
function processGroups(sequence) {
  const metadata = {
    dividerMode: false,
    groups: 0,
  };
  const result = {
    main: null,
    items: [],
    metadata,
  };

  if (!sequence.length) return result;

  // Divider mode is active as soon as one divider is present.
  metadata.dividerMode = sequence.some((el) => el.type === "divider");

  const rawGroups = metadata.dividerMode
    ? splitByDividers(sequence)
    : splitByHeadings(sequence);

  const processed = rawGroups.map(processGroupContent);

  // When divider-mode content opens with a divider, no group is promoted
  // to main: everything is an item.
  const opensWithDivider = metadata.dividerMode && rawGroups.startsWithDivider;

  if (!opensWithDivider && identifyMainContent(processed)) {
    const [main, ...items] = processed;
    result.main = main;
    result.items = items;
  } else {
    result.items = processed;
  }

  return result;
}
|
|
46
|
-
|
|
47
|
-
/**
 * Cut a sequence into groups at every "divider" element.
 *
 * Dividers themselves are dropped and empty groups are never emitted. The
 * returned array carries an extra `startsWithDivider` boolean property that
 * is true when a divider is seen before any other element.
 *
 * @param {Array} sequence Flat sequence of elements
 * @returns {Array<Array>} Groups of elements, plus `startsWithDivider`
 */
function splitByDividers(sequence) {
  const groups = [];
  let pending = [];
  let startsWithDivider = false;

  // Move the elements collected so far into `groups`, if there are any.
  const flush = () => {
    if (pending.length > 0) {
      groups.push(pending);
      pending = [];
    }
  };

  for (const element of sequence) {
    if (element.type !== "divider") {
      pending.push(element);
      continue;
    }

    const nothingSeenYet = pending.length === 0 && groups.length === 0;
    if (nothingSeenYet) {
      startsWithDivider = true;
    } else {
      flush();
    }
  }

  flush();

  groups.startsWithDivider = startsWithDivider;
  return groups;
}
|
|
78
|
-
|
|
79
|
-
/**
 * Cut a sequence into groups using heading patterns.
 *
 * A new group starts at each heading run returned by `readHeadingGroup`.
 * A banner image (image directly followed by a heading) or a pretitle
 * (heading directly followed by a more important heading) "pre-opens" the
 * group so that the heading that follows joins it instead of starting
 * another one.
 *
 * @param {Array} sequence Flat sequence of elements
 * @returns {Array<Array>} Groups of elements
 */
function splitByHeadings(sequence) {
  const groups = [];
  let current = [];
  let preOpened = false;

  // Close the group under construction unless it was pre-opened (it then
  // only holds a banner and/or pretitle), and remember the new state.
  const startGroup = (preOpen) => {
    const shouldClose = current.length > 0 && !preOpened;
    if (shouldClose) {
      groups.push(current);
      current = [];
    }
    preOpened = preOpen;
  };

  let index = 0;
  while (index < sequence.length) {
    // A banner is only considered while no group has been closed yet.
    if (groups.length === 0 && isBannerImage(sequence, index)) {
      startGroup(true);
      current.push(sequence[index]);
      index += 1; // the next element is known to be a heading
    }

    // Pretitles are handled before the heading run is consumed.
    if (isPreTitle(sequence, index)) {
      startGroup(true);
      current.push(sequence[index]);
      index += 1; // the next element is known to be a heading
    }

    const element = sequence[index];

    if (element.type !== "heading") {
      current.push(element);
      index += 1;
      continue;
    }

    const headings = readHeadingGroup(sequence, index);
    startGroup(false);
    current.push(...headings);
    index += headings.length; // step past every heading just added
  }

  if (current.length > 0) {
    groups.push(current);
  }

  return groups;
}
|
|
133
|
-
|
|
134
|
-
/**
 * Tell whether the element at index `i` is a pretitle: a heading that is
 * immediately followed by a heading with a smaller level number
 * (e.g. an H3 directly before an H1 or H2).
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} i Index of the candidate element
 * @returns {boolean} True when `sequence[i]` is a pretitle
 */
function isPreTitle(sequence, i) {
  const hasNext = i + 1 < sequence.length;
  if (!hasNext) return false;

  const candidate = sequence[i];
  if (candidate.type !== "heading") return false;

  const following = sequence[i + 1];
  if (following.type !== "heading") return false;

  return candidate.level > following.level;
}
|
|
153
|
-
|
|
154
|
-
/**
 * Tell whether the element at index `i` is a banner image: an image that
 * is immediately followed by a heading.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} i Index of the candidate element
 * @returns {boolean} True when `sequence[i]` is a banner image
 */
function isBannerImage(sequence, i) {
  const hasNext = i + 1 < sequence.length;
  if (!hasNext) return false;

  if (sequence[i].type !== "image") return false;

  return sequence[i + 1].type === "heading";
}
|
|
161
|
-
|
|
162
|
-
/**
 * Eagerly consume the run of consecutive headings that starts at index `i`,
 * where each heading has a strictly greater level number than the one
 * before it, and return that run as an array.
 *
 * Reading stops at the first element that is not a heading or whose level
 * does not increase. Without that stop, headings found later in the
 * sequence would be appended to the run, and callers that advance by the
 * run's length would skip the elements in between.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} i Index of the first heading of the run
 * @returns {Array} The element at `i` followed by the consecutive,
 *   increasingly nested headings that come right after it
 */
function readHeadingGroup(sequence, i) {
  const elements = [sequence[i]];

  for (let next = i + 1; next < sequence.length; next++) {
    const element = sequence[next];
    const previous = sequence[next - 1];

    const continuesRun =
      element.type === "heading" && element.level > previous.level;

    // The run must be contiguous: stop at the first element that breaks it.
    if (!continuesRun) break;

    elements.push(element);
  }

  return elements;
}
|
|
177
|
-
|
|
178
|
-
/**
 * Analyse one group of elements and describe its structure.
 *
 * Headings fill the `title`, `subtitle` and `subsubtitle` slots in the
 * order they appear; any further heading is ignored. A heading detected as
 * a pretitle by `isPreTitle` goes to the `pretitle` slot instead. Every
 * non-heading element is appended to `content` and its type recorded.
 *
 * @param {Array} elements Elements belonging to a single group
 * @returns {Object} `{ headings, content, metadata }` where `metadata.level`
 *   is the level of the first non-pretitle heading and
 *   `metadata.contentTypes` is a Set of the non-heading element types
 */
function processGroupContent(elements) {
  const headings = {
    pretitle: null,
    title: null,
    subtitle: null,
    subsubtitle: null,
  };
  const content = [];
  const metadata = {
    level: null,
    contentTypes: new Set(),
  };

  // Heading slots, in the order in which they get filled.
  const slotOrder = ["title", "subtitle", "subsubtitle"];

  let index = 0;
  while (index < elements.length) {
    if (isPreTitle(elements, index)) {
      headings.pretitle = elements[index];
      index += 1; // the next element is known to be a heading
    }

    const element = elements[index];
    index += 1;

    if (element.type !== "heading") {
      content.push(element);
      metadata.contentTypes.add(element.type);
      continue;
    }

    metadata.level ??= element.level;

    // Put the heading in the first slot that is still free, if any.
    for (const slot of slotOrder) {
      if (!headings[slot]) {
        headings[slot] = element;
        break;
      }
    }
  }

  return { headings, content, metadata };
}
|
|
221
|
-
|
|
222
|
-
/**
 * Decide whether the first processed group should become the main content.
 *
 * - No groups: never.
 * - Exactly one group: always.
 * - Otherwise: only when the first group has a heading level and the second
 *   group either has none or has a larger level number (less important).
 *
 * @param {Array<Object>} groups Processed groups exposing `metadata.level`
 * @returns {boolean} True when `groups[0]` is the main content
 */
function identifyMainContent(groups) {
  switch (groups.length) {
    case 0:
      return false;
    case 1:
      return true;
    default:
      break;
  }

  const first = groups[0].metadata.level;
  const second = groups[1].metadata.level;

  if (!first) return false;

  return !second || first < second;
}
|
|
237
|
-
|
|
238
|
-
module.exports = {
|
|
239
|
-
processGroups,
|
|
240
|
-
};
|
|
@@ -1,140 +0,0 @@
|
|
|
1
|
-
/**
 * Flatten a ProseMirror/TipTap document into a sequence of content elements.
 *
 * The work is delegated to `processNode`, which appends elements to the
 * array created here.
 *
 * @param {Object} doc ProseMirror document
 * @returns {Array} Sequence of content elements
 */
function processSequence(doc) {
  const collected = [];

  processNode(doc, collected);

  return collected;
}
|
|
11
|
-
|
|
12
|
-
/**
 * Convert one node into a sequence element, append it to `sequence`, and
 * then visit the node's children.
 *
 * The root "doc" node produces no element of its own. Children are not
 * visited when the created element has an `items` property, because the
 * element already holds them.
 *
 * @param {Object} node Node to process
 * @param {Array} sequence Output array, mutated in place
 * @returns {void}
 */
function processNode(node, sequence) {
  // The root only acts as a container for its children.
  if (node.type === "doc") {
    for (const child of node.content ?? []) {
      processNode(child, sequence);
    }
    return;
  }

  const element = createSequenceElement(node);

  if (element) {
    // Marks may sit on the node itself or anywhere in its content.
    const carriesMarks = node.marks?.length || hasMarkedContent(node);
    if (carriesMarks) {
      element.marks = collectMarks(node);
    }

    sequence.push(element);
  }

  // Nothing to descend into, or the children were already captured.
  if (!node.content || element?.items) return;

  for (const child of node.content) {
    processNode(child, sequence);
  }
}
|
|
36
|
-
|
|
37
|
-
/**
 * Build the sequence element that represents a node.
 *
 * - "heading", "paragraph", "listItem": typed element with the node's text
 * - "image": `src`, `alt` and `role` (defaulting to "content")
 * - "bulletList" / "orderedList": a "list" element with its processed items
 * - "horizontalRule": a "divider" element
 * - "text": no element (null)
 * - anything else: element keeping the node's own type, with its text
 *
 * @param {Object} node Node to convert
 * @returns {Object|null} The element, or null for text nodes
 */
function createSequenceElement(node) {
  const { type } = node;

  if (type === "text") {
    return null;
  }

  if (type === "heading") {
    return {
      type: "heading",
      level: node.attrs.level,
      content: getTextContent(node),
    };
  }

  if (type === "paragraph") {
    return {
      type: "paragraph",
      content: getTextContent(node),
    };
  }

  if (type === "image") {
    const { src, alt, role } = node.attrs;
    return {
      type: "image",
      src,
      alt,
      role: role || "content",
    };
  }

  if (type === "bulletList" || type === "orderedList") {
    const style = type === "bulletList" ? "bullet" : "ordered";
    return {
      type: "list",
      style,
      items: processListItems(node),
    };
  }

  if (type === "listItem") {
    return {
      type: "listItem",
      content: getTextContent(node),
    };
  }

  if (type === "horizontalRule") {
    return {
      type: "divider",
    };
  }

  // Unknown node types are passed through with their text content.
  return {
    type,
    content: getTextContent(node),
  };
}
|
|
89
|
-
|
|
90
|
-
/**
 * Concatenate the text of every text node found under `node`, depth first.
 *
 * A text node that has no `text` value contributes an empty string, so the
 * literal string "undefined" never ends up in the output.
 *
 * @param {Object} node Node whose `content` array is walked
 * @returns {string} The combined text, or "" when the node has no content
 */
function getTextContent(node) {
  if (!node.content) return "";

  return node.content.reduce((text, child) => {
    if (child.type === "text") {
      return text + (child.text ?? "");
    }
    return text + getTextContent(child);
  }, "");
}
|
|
99
|
-
|
|
100
|
-
/**
 * Tell whether any descendant of `node` carries at least one mark.
 *
 * Only the node's content is inspected, not the node's own marks.
 *
 * @param {Object} node Node whose content is searched
 * @returns {boolean} True when a marked descendant exists
 */
function hasMarkedContent(node) {
  const children = node.content;
  if (!children) return false;

  for (const child of children) {
    if (child.marks?.length || hasMarkedContent(child)) {
      return true;
    }
  }

  return false;
}
|
|
106
|
-
|
|
107
|
-
/**
 * Gather the distinct mark types used by `node` and all of its descendants.
 *
 * Types are listed in first-seen order: the node's own marks first, then
 * those of its children, depth first.
 *
 * @param {Object} node Node to inspect
 * @returns {Array<string>} Unique mark types
 */
function collectMarks(node) {
  const ownTypes = node.marks ? node.marks.map((mark) => mark.type) : [];
  const nestedTypes = node.content
    ? node.content.flatMap((child) => collectMarks(child))
    : [];

  // A Set keeps insertion order while dropping duplicates.
  return [...new Set([...ownTypes, ...nestedTypes])];
}
|
|
122
|
-
|
|
123
|
-
/**
 * Convert the "listItem" children of a list node into nested item records.
 *
 * Each record holds the item's text (as returned by `getTextContent`) and
 * the records of every list nested directly inside the item. Children whose
 * type is not "listItem" are skipped. An item without a `content` array
 * gets `items: undefined`.
 *
 * @param {Object} node List node
 * @returns {Array<Object>} Records shaped `{ content, items }`
 */
function processListItems(node) {
  const children = node.content ?? [];

  return children
    .filter((item) => item.type === "listItem")
    .map((item) => {
      const content = getTextContent(item);

      // Nested lists are the children whose type ends in "List".
      const nestedLists = item.content?.filter((child) =>
        child.type.endsWith("List")
      );

      return {
        content,
        items: nestedLists?.flatMap((list) => processListItems(list)),
      };
    });
}
|
|
137
|
-
|
|
138
|
-
module.exports = {
|
|
139
|
-
processSequence,
|
|
140
|
-
};
|