@uniweb/semantic-parser 1.0.8 → 1.0.10
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/AGENTS.md +42 -25
- package/README.md +52 -104
- package/docs/api.md +38 -40
- package/docs/mapping-patterns.md +47 -47
- package/docs/text-component-reference.md +3 -3
- package/package.json +4 -1
- package/src/index.js +5 -7
- package/src/mappers/extractors.js +113 -120
- package/src/processors/groups.js +105 -30
- package/src/processors/sequence.js +59 -11
- package/src/processors/byType.js +0 -130
package/AGENTS.md
CHANGED
|
@@ -52,35 +52,26 @@ const result = parseContent(doc);
|
|
|
52
52
|
// }
|
|
53
53
|
```
|
|
54
54
|
|
|
55
|
-
### Content
|
|
55
|
+
### Content Output Structure
|
|
56
56
|
|
|
57
|
-
|
|
57
|
+
The parser returns a flat content structure:
|
|
58
58
|
|
|
59
59
|
```js
|
|
60
60
|
{
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
propertyBlocks: [],
|
|
76
|
-
cards: [],
|
|
77
|
-
headings: []
|
|
78
|
-
},
|
|
79
|
-
banner: null, // Image with banner role or image before heading
|
|
80
|
-
metadata: {
|
|
81
|
-
level: null, // Heading level that started this group
|
|
82
|
-
contentTypes: Set()
|
|
83
|
-
}
|
|
61
|
+
title: '', // Main heading
|
|
62
|
+
pretitle: '', // Heading before main title
|
|
63
|
+
subtitle: '', // Heading after main title
|
|
64
|
+
paragraphs: [],
|
|
65
|
+
links: [],
|
|
66
|
+
imgs: [],
|
|
67
|
+
icons: [],
|
|
68
|
+
videos: [],
|
|
69
|
+
lists: [],
|
|
70
|
+
buttons: [],
|
|
71
|
+
data: {}, // Tagged code blocks (keyed by tag name)
|
|
72
|
+
cards: [],
|
|
73
|
+
headings: [],
|
|
74
|
+
items: [], // Child content groups
|
|
84
75
|
}
|
|
85
76
|
```
|
|
86
77
|
|
|
@@ -102,6 +93,32 @@ The sequence processor identifies several special element types by inspecting pa
|
|
|
102
93
|
|
|
103
94
|
These are extracted into dedicated element types for easier downstream processing.
|
|
104
95
|
|
|
96
|
+
### Tagged Code Blocks
|
|
97
|
+
|
|
98
|
+
Code blocks with tags route parsed data to the `data` object:
|
|
99
|
+
|
|
100
|
+
```markdown
|
|
101
|
+
```json:nav-links
|
|
102
|
+
[{ "label": "Home", "href": "/" }]
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
```yaml:config
|
|
106
|
+
title: My Site
|
|
107
|
+
theme: dark
|
|
108
|
+
```
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Results in:
|
|
112
|
+
```js
|
|
113
|
+
content.data['nav-links'] = [{ label: "Home", href: "/" }]
|
|
114
|
+
content.data['config'] = { title: "My Site", theme: "dark" }
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
**Parsing rules:**
|
|
118
|
+
- Tagged blocks with `json` language: parsed as JSON
|
|
119
|
+
- Tagged blocks with `yaml`/`yml` language: parsed as YAML
|
|
120
|
+
- Untagged blocks: not parsed (they stay as raw text in the sequence for display)
|
|
121
|
+
|
|
105
122
|
### List Processing
|
|
106
123
|
|
|
107
124
|
Lists maintain hierarchy through nested structure. The `processListItems()` function in sequence.js handles nested lists, while `processListContent()` in groups.js applies full group content processing to each list item, allowing lists to contain rich content (images, paragraphs, nested lists, etc.).
|
package/README.md
CHANGED
|
@@ -4,11 +4,10 @@ A semantic parser for ProseMirror/TipTap content structures that helps bridge th
|
|
|
4
4
|
|
|
5
5
|
## What it Does
|
|
6
6
|
|
|
7
|
-
The parser transforms rich text editor content (ProseMirror/TipTap) into structured, semantic groups that web components can easily consume. It provides
|
|
7
|
+
The parser transforms rich text editor content (ProseMirror/TipTap) into structured, semantic groups that web components can easily consume. It provides two complementary views of your content:
|
|
8
8
|
|
|
9
|
-
1. **Sequence**:
|
|
10
|
-
2. **Groups**: Content organized into semantic sections
|
|
11
|
-
3. **ByType**: Elements categorized by type for easy filtering and queries
|
|
9
|
+
1. **Sequence**: An ordered list of all content elements (for rendering in document order)
|
|
10
|
+
2. **Groups**: Content organized into semantic sections (main content + items)
|
|
12
11
|
|
|
13
12
|
## Installation
|
|
14
13
|
|
|
@@ -41,16 +40,16 @@ const doc = {
|
|
|
41
40
|
const result = parseContent(doc);
|
|
42
41
|
|
|
43
42
|
// Access different views
|
|
44
|
-
console.log(result.sequence); //
|
|
45
|
-
console.log(result.
|
|
46
|
-
console.log(result.
|
|
43
|
+
console.log(result.sequence); // Ordered array of elements
|
|
44
|
+
console.log(result.title); // Main content fields at top level
|
|
45
|
+
console.log(result.items); // Additional content groups
|
|
47
46
|
```
|
|
48
47
|
|
|
49
48
|
## Output Structure
|
|
50
49
|
|
|
51
50
|
### Sequence View
|
|
52
51
|
|
|
53
|
-
|
|
52
|
+
An ordered array of semantic elements preserving document order:
|
|
54
53
|
|
|
55
54
|
```js
|
|
56
55
|
result.sequence = [
|
|
@@ -59,72 +58,37 @@ result.sequence = [
|
|
|
59
58
|
]
|
|
60
59
|
```
|
|
61
60
|
|
|
62
|
-
###
|
|
61
|
+
### Content Structure
|
|
63
62
|
|
|
64
|
-
|
|
63
|
+
Main content fields are at the top level. The `items` array contains additional content groups (created when headings appear after content), each with the same field structure:
|
|
65
64
|
|
|
66
65
|
```js
|
|
67
|
-
result
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
dividerMode: false, // Using dividers vs headings
|
|
88
|
-
groups: 0
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
### ByType View
|
|
66
|
+
result = {
|
|
67
|
+
// Main content fields
|
|
68
|
+
pretitle: "", // Heading before main title
|
|
69
|
+
title: "Welcome", // Main heading
|
|
70
|
+
subtitle: "", // Heading after main title
|
|
71
|
+
paragraphs: ["Get started today."],
|
|
72
|
+
imgs: [],
|
|
73
|
+
videos: [],
|
|
74
|
+
links: [],
|
|
75
|
+
lists: [],
|
|
76
|
+
icons: [],
|
|
77
|
+
buttons: [],
|
|
78
|
+
banner: null, // Optional banner image
|
|
79
|
+
// ... more content types
|
|
80
|
+
|
|
81
|
+
// Additional content groups (from headings after content)
|
|
82
|
+
items: [
|
|
83
|
+
{ title: "Feature 1", paragraphs: [...], links: [...] },
|
|
84
|
+
{ title: "Feature 2", paragraphs: [...], links: [...] }
|
|
85
|
+
],
|
|
94
86
|
|
|
95
|
-
|
|
87
|
+
// Ordered sequence for document-order rendering
|
|
88
|
+
sequence: [...],
|
|
96
89
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
headings: [
|
|
100
|
-
{
|
|
101
|
-
type: "heading",
|
|
102
|
-
level: 1,
|
|
103
|
-
content: "Welcome",
|
|
104
|
-
context: {
|
|
105
|
-
position: 0,
|
|
106
|
-
previousElement: null,
|
|
107
|
-
nextElement: { type: "paragraph", ... },
|
|
108
|
-
nearestHeading: null
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
],
|
|
112
|
-
paragraphs: [ /* ... */ ],
|
|
113
|
-
images: {
|
|
114
|
-
background: [],
|
|
115
|
-
content: [],
|
|
116
|
-
gallery: [],
|
|
117
|
-
icon: []
|
|
118
|
-
},
|
|
119
|
-
lists: [],
|
|
120
|
-
metadata: {
|
|
121
|
-
totalElements: 2,
|
|
122
|
-
dominantType: "paragraph",
|
|
123
|
-
hasMedia: false
|
|
124
|
-
},
|
|
125
|
-
// Helper methods
|
|
126
|
-
getHeadingsByLevel(level),
|
|
127
|
-
getElementsByHeadingContext(filter)
|
|
90
|
+
// Original document
|
|
91
|
+
raw: { type: "doc", content: [...] }
|
|
128
92
|
}
|
|
129
93
|
```
|
|
130
94
|
|
|
@@ -133,45 +97,29 @@ result.byType = {
|
|
|
133
97
|
### Extracting Main Content
|
|
134
98
|
|
|
135
99
|
```js
|
|
136
|
-
const
|
|
100
|
+
const content = parseContent(doc);
|
|
137
101
|
|
|
138
|
-
const title =
|
|
139
|
-
const description =
|
|
140
|
-
const image =
|
|
102
|
+
const title = content.title;
|
|
103
|
+
const description = content.paragraphs.join(" ");
|
|
104
|
+
const image = content.banner?.url;
|
|
141
105
|
```
|
|
142
106
|
|
|
143
107
|
### Processing Content Sections
|
|
144
108
|
|
|
145
109
|
```js
|
|
146
|
-
const
|
|
110
|
+
const content = parseContent(doc);
|
|
147
111
|
|
|
148
112
|
// Main content
|
|
149
|
-
console.log("
|
|
113
|
+
console.log("Title:", content.title);
|
|
114
|
+
console.log("Description:", content.paragraphs);
|
|
150
115
|
|
|
151
|
-
// Additional
|
|
152
|
-
|
|
153
|
-
console.log("Section:", item.
|
|
154
|
-
console.log("Content:", item.
|
|
116
|
+
// Additional content groups
|
|
117
|
+
content.items.forEach(item => {
|
|
118
|
+
console.log("Section:", item.title);
|
|
119
|
+
console.log("Content:", item.paragraphs);
|
|
155
120
|
});
|
|
156
121
|
```
|
|
157
122
|
|
|
158
|
-
### Finding Specific Elements
|
|
159
|
-
|
|
160
|
-
```js
|
|
161
|
-
const { byType } = parseContent(doc);
|
|
162
|
-
|
|
163
|
-
// Get all H2 headings
|
|
164
|
-
const subheadings = byType.getHeadingsByLevel(2);
|
|
165
|
-
|
|
166
|
-
// Get all background images
|
|
167
|
-
const backgrounds = byType.images.background;
|
|
168
|
-
|
|
169
|
-
// Get content under specific headings
|
|
170
|
-
const features = byType.getElementsByHeadingContext(
|
|
171
|
-
h => h.content.includes("Features")
|
|
172
|
-
);
|
|
173
|
-
```
|
|
174
|
-
|
|
175
123
|
### Sequential Processing
|
|
176
124
|
|
|
177
125
|
```js
|
|
@@ -203,17 +151,17 @@ Automatically transform content based on field types with context-aware behavior
|
|
|
203
151
|
```js
|
|
204
152
|
const schema = {
|
|
205
153
|
title: {
|
|
206
|
-
path: "
|
|
154
|
+
path: "title",
|
|
207
155
|
type: "plaintext", // Auto-strips <strong>, <em>, etc.
|
|
208
156
|
maxLength: 60 // Auto-truncates intelligently
|
|
209
157
|
},
|
|
210
158
|
excerpt: {
|
|
211
|
-
path: "
|
|
159
|
+
path: "paragraphs",
|
|
212
160
|
type: "excerpt", // Auto-creates excerpt from paragraphs
|
|
213
161
|
maxLength: 150
|
|
214
162
|
},
|
|
215
163
|
image: {
|
|
216
|
-
path: "
|
|
164
|
+
path: "imgs[0].url",
|
|
217
165
|
type: "image",
|
|
218
166
|
defaultValue: "/placeholder.jpg"
|
|
219
167
|
}
|
|
@@ -259,15 +207,15 @@ Define custom mappings using schemas:
|
|
|
259
207
|
|
|
260
208
|
```js
|
|
261
209
|
const schema = {
|
|
262
|
-
brand: "
|
|
263
|
-
title: "
|
|
264
|
-
subtitle: "
|
|
210
|
+
brand: "pretitle",
|
|
211
|
+
title: "title",
|
|
212
|
+
subtitle: "subtitle",
|
|
265
213
|
image: {
|
|
266
|
-
path: "
|
|
214
|
+
path: "imgs[0].url",
|
|
267
215
|
defaultValue: "/placeholder.jpg"
|
|
268
216
|
},
|
|
269
217
|
actions: {
|
|
270
|
-
path: "
|
|
218
|
+
path: "links",
|
|
271
219
|
transform: links => links.map(l => ({ label: l.label, type: "primary" }))
|
|
272
220
|
}
|
|
273
221
|
};
|
package/docs/api.md
CHANGED
|
@@ -118,51 +118,49 @@ A flat array of semantic elements extracted from the document tree.
|
|
|
118
118
|
|
|
119
119
|
### `groups`
|
|
120
120
|
|
|
121
|
-
Content organized into semantic groups with identified main content and items.
|
|
121
|
+
Content organized into semantic groups with identified main content and items. The structure is flat — header and body fields are merged at the top level.
|
|
122
122
|
|
|
123
123
|
```js
|
|
124
124
|
{
|
|
125
125
|
main: {
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
126
|
+
// Header fields (flat)
|
|
127
|
+
pretitle: "PRETITLE TEXT", // H3 before main title
|
|
128
|
+
title: "Main Title", // First heading in group
|
|
129
|
+
subtitle: "Subtitle", // Second heading in group
|
|
130
|
+
|
|
131
|
+
// Body fields (flat)
|
|
132
|
+
paragraphs: ["paragraph text", ...],
|
|
133
|
+
imgs: [
|
|
134
|
+
{ url: "...", caption: "...", alt: "..." }
|
|
135
|
+
],
|
|
136
|
+
icons: ["<svg>...</svg>", ...],
|
|
137
|
+
videos: [
|
|
138
|
+
{ src: "...", caption: "...", alt: "..." }
|
|
139
|
+
],
|
|
140
|
+
links: [
|
|
141
|
+
{ href: "...", label: "..." }
|
|
142
|
+
],
|
|
143
|
+
lists: [
|
|
144
|
+
[/* processed list items */]
|
|
145
|
+
],
|
|
146
|
+
buttons: [
|
|
147
|
+
{ content: "...", attrs: {...} }
|
|
148
|
+
],
|
|
149
|
+
properties: [], // Code block content
|
|
150
|
+
propertyBlocks: [], // Array of code blocks
|
|
151
|
+
cards: [], // Not yet implemented
|
|
152
|
+
headings: [], // Used in list items
|
|
153
|
+
|
|
154
|
+
// Banner (flat)
|
|
154
155
|
banner: {
|
|
155
156
|
url: "path/to/banner.jpg",
|
|
156
157
|
caption: "Banner caption",
|
|
157
158
|
alt: "Banner alt text"
|
|
158
|
-
} | null
|
|
159
|
-
metadata: {
|
|
160
|
-
level: 1, // Heading level that started this group
|
|
161
|
-
contentTypes: {} // Set of content types in group
|
|
162
|
-
}
|
|
159
|
+
} | null
|
|
163
160
|
},
|
|
164
161
|
items: [
|
|
165
|
-
// Array of groups with same structure as main
|
|
162
|
+
// Array of groups with same flat structure as main
|
|
163
|
+
// { title, pretitle, subtitle, paragraphs, imgs, ... }
|
|
166
164
|
],
|
|
167
165
|
metadata: {
|
|
168
166
|
dividerMode: false, // Whether dividers were used for grouping
|
|
@@ -268,14 +266,14 @@ const result = parseContent(doc);
|
|
|
268
266
|
```js
|
|
269
267
|
const { groups } = parseContent(doc);
|
|
270
268
|
|
|
271
|
-
// Access main content
|
|
272
|
-
console.log(groups.main.
|
|
273
|
-
console.log(groups.main.
|
|
269
|
+
// Access main content (flat structure)
|
|
270
|
+
console.log(groups.main.title);
|
|
271
|
+
console.log(groups.main.paragraphs);
|
|
274
272
|
|
|
275
273
|
// Iterate through content items
|
|
276
274
|
groups.items.forEach(item => {
|
|
277
|
-
console.log(item.
|
|
278
|
-
console.log(item.
|
|
275
|
+
console.log(item.title);
|
|
276
|
+
console.log(item.paragraphs);
|
|
279
277
|
});
|
|
280
278
|
```
|
|
281
279
|
|