@uniweb/semantic-parser 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,41 +1,41 @@
1
1
  {
2
- "name": "@uniweb/semantic-parser",
3
- "version": "1.0.0",
4
- "description": "Semantic parser for ProseMirror/TipTap content structures",
5
- "type": "module",
6
- "main": "./src/index.js",
7
- "exports": {
8
- ".": "./src/index.js",
9
- "./mappers": "./src/mappers/index.js",
10
- "./mappers/*": "./src/mappers/*.js"
11
- },
12
- "scripts": {
13
- "test": "NODE_OPTIONS=--experimental-vm-modules jest",
14
- "test-report": "NODE_OPTIONS=--experimental-vm-modules jest --json > test-results.json 2>&1",
15
- "test:groups": "NODE_OPTIONS=--experimental-vm-modules jest tests/processors/groups.test.js"
16
- },
17
- "keywords": [
18
- "prosemirror",
19
- "tiptap",
20
- "parser",
21
- "semantic",
22
- "content"
23
- ],
24
- "author": "Proximify Inc.",
25
- "license": "GPL-3.0-or-later",
26
- "devDependencies": {
27
- "jest": "^29.7.0"
28
- },
29
- "repository": {
30
- "type": "git",
31
- "url": "git+https://github.com/uniweb/semantic-parser.git"
32
- },
33
- "bugs": {
34
- "url": "https://github.com/uniweb/semantic-parser/issues"
35
- },
36
- "homepage": "https://github.com/uniweb/semantic-parser#readme",
37
- "directories": {
38
- "doc": "docs",
39
- "test": "tests"
40
- }
41
- }
2
+ "name": "@uniweb/semantic-parser",
3
+ "version": "1.0.1",
4
+ "description": "Semantic parser for ProseMirror/TipTap content structures",
5
+ "type": "module",
6
+ "main": "./src/index.js",
7
+ "exports": {
8
+ ".": "./src/index.js",
9
+ "./mappers": "./src/mappers/index.js",
10
+ "./mappers/*": "./src/mappers/*.js"
11
+ },
12
+ "keywords": [
13
+ "prosemirror",
14
+ "tiptap",
15
+ "parser",
16
+ "semantic",
17
+ "content"
18
+ ],
19
+ "author": "Proximify Inc.",
20
+ "license": "GPL-3.0-or-later",
21
+ "devDependencies": {
22
+ "jest": "^29.7.0"
23
+ },
24
+ "repository": {
25
+ "type": "git",
26
+ "url": "git+https://github.com/uniweb/semantic-parser.git"
27
+ },
28
+ "bugs": {
29
+ "url": "https://github.com/uniweb/semantic-parser/issues"
30
+ },
31
+ "homepage": "https://github.com/uniweb/semantic-parser#readme",
32
+ "directories": {
33
+ "doc": "docs",
34
+ "test": "tests"
35
+ },
36
+ "scripts": {
37
+ "test": "NODE_OPTIONS=--experimental-vm-modules jest",
38
+ "test-report": "NODE_OPTIONS=--experimental-vm-modules jest --json > test-results.json 2>&1",
39
+ "test:groups": "NODE_OPTIONS=--experimental-vm-modules jest tests/processors/groups.test.js"
40
+ }
41
+ }
@@ -0,0 +1,129 @@
1
/**
 * Organize content elements by their type while preserving context.
 *
 * Each element is shallow-copied with an added `context` property (built by
 * getElementContext) and appended to the collection matching its `type`.
 * Images are bucketed by `element.role` (default "content"); a bucket is
 * created on demand for roles that are not predefined. Element types without
 * a dedicated collection are still counted towards `metadata.dominantType`
 * but are not stored anywhere.
 *
 * @param {Array} sequence Flat sequence of elements
 * @returns {Object} Content organized by type, with helper methods attached
 *   by addCollectionHelpers
 */
function processByType(sequence) {
  const collections = {
    headings: [],
    paragraphs: [],
    images: {
      background: [],
      content: [],
      gallery: [],
      icon: [],
    },
    lists: [],
    dividers: [],
    metadata: {
      totalElements: sequence.length,
      dominantType: null,
      hasMedia: false,
    },
  };

  // Track type frequencies for metadata
  const typeFrequency = new Map();

  sequence.forEach((element, index) => {
    // Track element type frequency
    typeFrequency.set(element.type, (typeFrequency.get(element.type) || 0) + 1);

    // Add context information
    const context = getElementContext(sequence, index);
    const enrichedElement = { ...element, context };

    // Process element based on type
    switch (element.type) {
      case "heading":
        collections.headings.push(enrichedElement);
        break;

      case "paragraph":
        collections.paragraphs.push(enrichedElement);
        break;

      case "image": {
        const role = element.role || "content";
        const { images } = collections;

        // Only own properties count as buckets. A plain `images[role]` lookup
        // would also find inherited members such as "constructor" or
        // "toString" and then fail on `.push`.
        if (!Object.prototype.hasOwnProperty.call(images, role)) {
          // defineProperty (rather than assignment) so that a role named
          // "__proto__" becomes a regular own bucket instead of replacing
          // the object's prototype.
          Object.defineProperty(images, role, {
            value: [],
            writable: true,
            enumerable: true,
            configurable: true,
          });
        }
        images[role].push(enrichedElement);
        collections.metadata.hasMedia = true;
        break;
      }

      case "list":
        collections.lists.push(enrichedElement);
        break;

      case "divider":
        collections.dividers.push(enrichedElement);
        break;
    }
  });

  // Calculate dominant type: the first type (in order of first appearance)
  // that reaches the highest frequency.
  let maxFrequency = 0;
  typeFrequency.forEach((frequency, type) => {
    if (frequency > maxFrequency) {
      maxFrequency = frequency;
      collections.metadata.dominantType = type;
    }
  });

  // Add helper methods
  addCollectionHelpers(collections);

  return collections;
}
80
+
81
/**
 * Build the positional context of one element within a sequence.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} position Index of the element inside `sequence`
 * @returns {Object} `{ position, previousElement, nextElement, nearestHeading }`
 *   where neighbours are `null` at the sequence boundaries and
 *   `nearestHeading` is the closest heading strictly before `position`
 *   (or `null` when there is none)
 */
function getElementContext(sequence, position) {
  const hasPrevious = position > 0;
  const hasNext = position < sequence.length - 1;

  // Walk backwards from the element just before `position` until a heading
  // is found.
  let nearestHeading = null;
  let cursor = position;
  while (--cursor >= 0) {
    const candidate = sequence[cursor];
    if (candidate.type === "heading") {
      nearestHeading = candidate;
      break;
    }
  }

  return {
    position,
    previousElement: hasPrevious ? sequence[position - 1] : null,
    nextElement: hasNext ? sequence[position + 1] : null,
    nearestHeading,
  };
}
102
+
103
/**
 * Attach query helpers to a collections object (mutates `collections`).
 *
 * Both helpers are regular functions that read from `this`, so they must be
 * invoked as methods of the collections object.
 *
 * @param {Object} collections Object with `headings`, `paragraphs`, `images`
 *   (an object of arrays) and `lists`
 */
function addCollectionHelpers(collections) {
  // Headings whose `level` strictly equals the requested level.
  collections.getHeadingsByLevel = function (level) {
    const matches = [];
    for (const heading of this.headings) {
      if (heading.level === level) {
        matches.push(heading);
      }
    }
    return matches;
  };

  // Paragraphs, images (all roles) and lists whose nearest preceding heading
  // satisfies `headingFilter`. Elements without such a heading are excluded.
  collections.getElementsByHeadingContext = function (headingFilter) {
    const imageElements = Object.values(this.images).flat();
    const candidates = [].concat(this.paragraphs, imageElements, this.lists);

    return candidates.filter((candidate) => {
      const heading = candidate.context?.nearestHeading;
      return heading ? headingFilter(heading) : false;
    });
  };
}
126
+
127
+ module.exports = {
128
+ processByType,
129
+ };
@@ -0,0 +1,240 @@
1
/**
 * Transform a sequence into content groups with semantic structure.
 *
 * When the sequence contains at least one divider, groups are delimited by
 * dividers; otherwise they are delimited by heading patterns. The first
 * group becomes `main` when identifyMainContent says so, except in divider
 * mode with a leading divider, where every group is an item.
 *
 * Note: `metadata.groups` is initialised to 0 and is not updated here.
 *
 * @param {Array} sequence Flat sequence of elements
 * @returns {Object} `{ main, items, metadata }` with the identified main
 *   content (or `null`) and the remaining groups
 */
function processGroups(sequence) {
  const metadata = {
    dividerMode: false,
    groups: 0,
  };
  const result = {
    main: null,
    items: [],
    metadata,
  };

  if (!sequence.length) return result;

  // Divider mode is active as soon as one divider is present.
  const usesDividers = sequence.some((el) => el.type === "divider");
  metadata.dividerMode = usesDividers;

  // Split into raw groups, then analyse the structure of each one.
  const rawGroups = usesDividers
    ? splitByDividers(sequence)
    : splitByHeadings(sequence);
  const processed = rawGroups.map(processGroupContent);

  // A leading divider means there is no main content; otherwise the decision
  // is delegated to identifyMainContent (only consulted in that case).
  const everythingIsItem =
    (usesDividers && rawGroups.startsWithDivider) ||
    !identifyMainContent(processed);

  if (everythingIsItem) {
    result.items = processed;
  } else {
    result.main = processed[0];
    result.items = processed.slice(1);
  }

  return result;
}
46
+
47
/**
 * Split a sequence into groups using divider elements as separators.
 *
 * Dividers themselves are dropped and empty groups are never emitted. The
 * returned array carries an extra `startsWithDivider` property that is true
 * when a divider was seen before any other element.
 *
 * @param {Array} sequence Flat sequence of elements
 * @returns {Array<Array>} Groups of elements, plus `startsWithDivider`
 */
function splitByDividers(sequence) {
  const groups = [];
  let pending = [];
  let leadingDivider = false;

  for (const element of sequence) {
    if (element.type !== "divider") {
      pending.push(element);
      continue;
    }

    if (pending.length > 0) {
      // Divider closes the group collected so far.
      groups.push(pending);
      pending = [];
    } else if (groups.length === 0) {
      // Nothing collected at all yet: the content starts with a divider.
      leadingDivider = true;
    }
  }

  // Flush the trailing group, if any.
  if (pending.length > 0) {
    groups.push(pending);
  }

  groups.startsWithDivider = leadingDivider;
  return groups;
}
78
+
79
/**
 * Split a sequence into groups using heading patterns.
 *
 * A new group starts at each heading run (as returned by readHeadingGroup).
 * A banner image (only before the first group is closed) or a pretitle
 * "pre-opens" the group so that the heading run that follows joins it
 * instead of starting yet another group.
 *
 * @param {Array} sequence Flat sequence of elements
 * @returns {Array<Array>} Groups of elements
 */
function splitByHeadings(sequence) {
  const groups = [];
  let pending = [];
  let pendingIsPreOpened = false;

  // Close the pending group unless it is empty or was only pre-opened by a
  // banner/pretitle, then record whether the next group is pre-opened.
  const openGroup = (preOpened) => {
    const mustFlush = pending.length && !pendingIsPreOpened;
    if (mustFlush) {
      groups.push(pending);
      pending = [];
    }
    pendingIsPreOpened = preOpened;
  };

  let index = 0;
  while (index < sequence.length) {
    // Only allow a banner for the first group.
    if (!groups.length && isBannerImage(sequence, index)) {
      openGroup(true);
      pending.push(sequence[index]);
      index += 1; // the next element is known to be a heading
    }

    // Pretitle is handled before consuming the heading run.
    if (isPreTitle(sequence, index)) {
      openGroup(true);
      pending.push(sequence[index]);
      index += 1; // the next element is known to be a heading
    }

    const element = sequence[index];

    if (element.type !== "heading") {
      pending.push(element);
      index += 1;
      continue;
    }

    const headings = readHeadingGroup(sequence, index);
    openGroup(false);
    pending.push(...headings);
    index += headings.length; // skip every heading that was just added
  }

  if (pending.length > 0) {
    groups.push(pending);
  }

  return groups;
}
133
+
134
/**
 * Check whether the element at index `i` is a pretitle: a heading that is
 * immediately followed by a heading of a lower level number (e.g. an H3
 * followed by an H1 or H2).
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} i Index of the candidate element
 * @returns {boolean} True when `sequence[i]` is a pretitle
 */
function isPreTitle(sequence, i) {
  // A pretitle needs a following element.
  if (!(i + 1 < sequence.length)) return false;

  const current = sequence[i];
  if (current.type !== "heading") return false;

  const following = sequence[i + 1];
  if (following.type !== "heading") return false;

  return current.level > following.level;
}
153
+
154
/**
 * Check whether the element at index `i` is a banner image: an image that
 * is immediately followed by a heading.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} i Index of the candidate element
 * @returns {boolean} True when `sequence[i]` is a banner image
 */
function isBannerImage(sequence, i) {
  // A banner needs a following element.
  if (!(i + 1 < sequence.length)) return false;

  if (sequence[i].type !== "image") return false;

  return sequence[i + 1].type === "heading";
}
161
+
162
/**
 * Eagerly consume all consecutive headings with increasing levels
 * and return them as an array.
 *
 * The run starts with `sequence[i]` (always included) and extends while each
 * following element is a heading whose level is greater than that of the
 * element right before it. Scanning stops at the first element that does not
 * continue the run, so the result is always a contiguous slice of `sequence`
 * starting at `i`. Callers can therefore advance by the returned length.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} i Index of the first heading of the run
 * @returns {Array} The contiguous heading run starting at `i`
 */
function readHeadingGroup(sequence, i) {
  const elements = [sequence[i]];

  for (let next = i + 1; next < sequence.length; next++) {
    const element = sequence[next];
    const continuesRun =
      element.type === "heading" && element.level > sequence[next - 1].level;

    // Stop at the first non-matching element; without this, headings found
    // later in the sequence would be pulled into the run even though other
    // content sits in between.
    if (!continuesRun) break;

    elements.push(element);
  }

  return elements;
}
177
+
178
/**
 * Process a group's elements to identify its structure.
 *
 * Headings fill the `pretitle`, `title`, `subtitle` and `subsubtitle` slots
 * (further headings are ignored); every other element goes to `content` and
 * its type is recorded in `metadata.contentTypes`. `metadata.level` is the
 * level of the first non-pretitle heading.
 *
 * @param {Array} elements Elements belonging to one group
 * @returns {Object} `{ headings, content, metadata }`
 */
function processGroupContent(elements) {
  const headingSlots = ["title", "subtitle", "subsubtitle"];
  const headings = {
    pretitle: null,
    title: null,
    subtitle: null,
    subsubtitle: null,
  };
  const content = [];
  const metadata = {
    level: null,
    contentTypes: new Set(),
  };

  let index = 0;
  while (index < elements.length) {
    if (isPreTitle(elements, index)) {
      headings.pretitle = elements[index];
      index += 1; // the next element is known to be a heading
    }

    const element = elements[index];
    index += 1;

    if (element.type !== "heading") {
      content.push(element);
      metadata.contentTypes.add(element.type);
      continue;
    }

    // Keep the level of the first heading only.
    if (metadata.level == null) {
      metadata.level = element.level;
    }

    // Fill the first free slot; headings beyond the third are dropped.
    const freeSlot = headingSlots.find((slot) => !headings[slot]);
    if (freeSlot) {
      headings[freeSlot] = element;
    }
  }

  return { headings, content, metadata };
}
221
+
222
/**
 * Determine if the first group should be treated as main content.
 *
 * @param {Array} groups Processed groups, each exposing `metadata.level`
 * @returns {boolean} False for no groups, true for a single group; otherwise
 *   true only when the first group has a level and the second group either
 *   has none or has a greater level number (i.e. is less important)
 */
function identifyMainContent(groups) {
  switch (groups.length) {
    case 0:
      return false;
    case 1:
      // A single group is always the main content.
      return true;
    default:
      break;
  }

  const firstLevel = groups[0].metadata.level;
  const secondLevel = groups[1].metadata.level;

  if (!firstLevel) return false;
  if (!secondLevel) return true;

  // Lower level number means more important.
  return firstLevel < secondLevel;
}
237
+
238
+ module.exports = {
239
+ processGroups,
240
+ };
@@ -0,0 +1,140 @@
1
/**
 * Flatten a ProseMirror/TipTap document into a sequence of content elements.
 *
 * The traversal itself is done by processNode, which appends to the array
 * created here.
 *
 * @param {Object} doc ProseMirror document
 * @returns {Array} Sequence of content elements
 */
function processSequence(doc) {
  const collected = [];

  processNode(doc, collected);

  return collected;
}
11
+
12
/**
 * Recursively convert a node (and its descendants) into sequence elements,
 * appending them to `sequence` in document order.
 *
 * @param {Object} node ProseMirror/TipTap node
 * @param {Array} sequence Output array that receives the created elements
 */
function processNode(node, sequence) {
  const descend = (child) => processNode(child, sequence);

  // The root doc node produces no element of its own.
  if (node.type === "doc") {
    if (node.content != null) {
      node.content.forEach(descend);
    }
    return;
  }

  const element = createSequenceElement(node);

  if (element) {
    // Attach marks found on the node itself or anywhere in its content.
    const carriesMarks = node.marks?.length || hasMarkedContent(node);
    if (carriesMarks) {
      element.marks = collectMarks(node);
    }

    sequence.push(element);
  }

  // Elements exposing `items` have already captured their children.
  const childrenAlreadyCaptured = Boolean(element?.items);
  if (node.content && !childrenAlreadyCaptured) {
    node.content.forEach(descend);
  }
}
36
+
37
/**
 * Create the sequence element that represents a single node.
 *
 * @param {Object} node ProseMirror/TipTap node
 * @returns {Object|null} The element for this node, or `null` for text
 *   nodes (their text is folded into the parent's `content`)
 */
function createSequenceElement(node) {
  const { type } = node;

  // Text nodes never become elements of their own.
  if (type === "text") {
    return null;
  }

  if (type === "heading") {
    return {
      type: "heading",
      level: node.attrs.level,
      content: getTextContent(node),
    };
  }

  if (type === "paragraph") {
    return {
      type: "paragraph",
      content: getTextContent(node),
    };
  }

  if (type === "image") {
    return {
      type: "image",
      src: node.attrs.src,
      alt: node.attrs.alt,
      role: node.attrs.role || "content",
    };
  }

  // Both list flavours map to a single "list" element with a style flag.
  if (type === "bulletList" || type === "orderedList") {
    const style = type === "bulletList" ? "bullet" : "ordered";
    return {
      type: "list",
      style,
      items: processListItems(node),
    };
  }

  if (type === "listItem") {
    return {
      type: "listItem",
      content: getTextContent(node),
    };
  }

  if (type === "horizontalRule") {
    return {
      type: "divider",
    };
  }

  // Any other node type is passed through with its text content.
  return {
    type,
    content: getTextContent(node),
  };
}
89
+
90
/**
 * Concatenate the text of every text node found under `node`, depth-first
 * and in document order.
 *
 * @param {Object} node ProseMirror/TipTap node
 * @returns {string} Combined text, or "" when the node has no content
 */
function getTextContent(node) {
  if (!node.content) return "";

  let text = "";
  for (const child of node.content) {
    // Text nodes contribute their own text; other nodes are searched.
    text += child.type === "text" ? child.text : getTextContent(child);
  }
  return text;
}
99
+
100
/**
 * Tell whether any descendant of `node` carries at least one mark.
 * Marks on `node` itself are not considered.
 *
 * @param {Object} node ProseMirror/TipTap node
 * @returns {boolean} True when a marked descendant exists
 */
function hasMarkedContent(node) {
  const children = node.content;
  if (!children) return false;

  for (const child of children) {
    if (child.marks?.length || hasMarkedContent(child)) {
      return true;
    }
  }
  return false;
}
106
+
107
/**
 * Collect the distinct mark types used by `node` and all of its descendants.
 *
 * @param {Object} node ProseMirror/TipTap node
 * @returns {Array} Unique mark types in order of first appearance (the
 *   node's own marks first, then its content depth-first)
 */
function collectMarks(node) {
  const ownTypes = (node.marks || []).map((mark) => mark.type);
  const nestedTypes = (node.content || []).flatMap((child) =>
    collectMarks(child)
  );

  // A Set removes duplicates while keeping first-seen order.
  return [...new Set([...ownTypes, ...nestedTypes])];
}
122
+
123
/**
 * Convert the `listItem` children of a list node into plain item objects.
 *
 * Children that are not list items are skipped. For each item, `content` is
 * the item's text (via getTextContent) and `items` holds the items of any
 * child node whose type ends with "List", processed recursively; `items` is
 * `undefined` when the list item has no content at all.
 *
 * @param {Object} node List node
 * @returns {Array} Item objects of the form `{ content, items }`
 */
function processListItems(node) {
  const entries = node.content ?? [];

  return entries
    .filter((entry) => entry.type === "listItem")
    .map((entry) => {
      const content = getTextContent(entry);
      const children = entry.content;

      let items;
      if (children != null) {
        items = children
          .filter((child) => child.type.endsWith("List"))
          .flatMap((list) => processListItems(list));
      }

      return { content, items };
    });
}
137
+
138
+ module.exports = {
139
+ processSequence,
140
+ };