@uniweb/semantic-parser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -0
- package/.eslintrc.json +28 -0
- package/LICENSE +674 -0
- package/README.md +395 -0
- package/docs/api.md +352 -0
- package/docs/file-structure.md +50 -0
- package/docs/guide.md +206 -0
- package/docs/mapping-patterns.md +928 -0
- package/docs/text-component-reference.md +515 -0
- package/package.json +41 -0
- package/reference/README.md +195 -0
- package/reference/Text.js +188 -0
- package/src/index.js +35 -0
- package/src/mappers/accessor.js +312 -0
- package/src/mappers/extractors.js +397 -0
- package/src/mappers/helpers.js +234 -0
- package/src/mappers/index.js +28 -0
- package/src/mappers/types.js +495 -0
- package/src/processors/byType.js +129 -0
- package/src/processors/groups.js +330 -0
- package/src/processors/groups_backup.js +379 -0
- package/src/processors/groups_doc.md +179 -0
- package/src/processors/sequence.js +573 -0
- package/src/processors/sequence_backup.js +402 -0
- package/src/utils/role.js +53 -0
package/README.md
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
# @uniwebcms/semantic-parser
|
|
2
|
+
|
|
3
|
+
A semantic parser for ProseMirror/TipTap content structures that helps bridge the gap between natural content writing and component-based web development.
|
|
4
|
+
|
|
5
|
+
## What it Does
|
|
6
|
+
|
|
7
|
+
The parser transforms rich text editor content (ProseMirror/TipTap) into structured, semantic groups that web components can easily consume. It provides three complementary views of your content:
|
|
8
|
+
|
|
9
|
+
1. **Sequence**: A flat, ordered list of all content elements
|
|
10
|
+
2. **Groups**: Content organized into semantic sections with identified main content
|
|
11
|
+
3. **ByType**: Elements categorized by type for easy filtering and queries
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npm install @uniwebcms/semantic-parser
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
```js
|
|
22
|
+
import { parseContent } from "@uniwebcms/semantic-parser";
|
|
23
|
+
|
|
24
|
+
// Your ProseMirror/TipTap document
|
|
25
|
+
const doc = {
|
|
26
|
+
type: "doc",
|
|
27
|
+
content: [
|
|
28
|
+
{
|
|
29
|
+
type: "heading",
|
|
30
|
+
attrs: { level: 1 },
|
|
31
|
+
content: [{ type: "text", text: "Welcome" }],
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
type: "paragraph",
|
|
35
|
+
content: [{ type: "text", text: "Get started today." }],
|
|
36
|
+
},
|
|
37
|
+
],
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
// Parse the content
|
|
41
|
+
const result = parseContent(doc);
|
|
42
|
+
|
|
43
|
+
// Access different views
|
|
44
|
+
console.log(result.sequence); // Flat array of elements
|
|
45
|
+
console.log(result.groups); // Semantic groups with main/items
|
|
46
|
+
console.log(result.byType); // Elements organized by type
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Output Structure
|
|
50
|
+
|
|
51
|
+
### Sequence View
|
|
52
|
+
|
|
53
|
+
A flat array of semantic elements preserving document order:
|
|
54
|
+
|
|
55
|
+
```js
|
|
56
|
+
result.sequence = [
|
|
57
|
+
{ type: "heading", level: 1, content: "Welcome" },
|
|
58
|
+
{ type: "paragraph", content: "Get started today." }
|
|
59
|
+
]
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Groups View
|
|
63
|
+
|
|
64
|
+
Content organized into semantic groups:
|
|
65
|
+
|
|
66
|
+
```js
|
|
67
|
+
result.groups = {
|
|
68
|
+
main: {
|
|
69
|
+
header: {
|
|
70
|
+
pretitle: "", // Lower-level heading before main title (e.g., H3 before H1)
|
|
71
|
+
title: "Welcome", // Main heading
|
|
72
|
+
subtitle: "" // Heading after main title
|
|
73
|
+
},
|
|
74
|
+
body: {
|
|
75
|
+
paragraphs: ["Get started today."],
|
|
76
|
+
imgs: [],
|
|
77
|
+
videos: [],
|
|
78
|
+
links: [],
|
|
79
|
+
lists: [],
|
|
80
|
+
// ... more content types
|
|
81
|
+
},
|
|
82
|
+
banner: null, // Optional banner image
|
|
83
|
+
metadata: { level: 1 }
|
|
84
|
+
},
|
|
85
|
+
items: [], // Additional content groups
|
|
86
|
+
metadata: {
|
|
87
|
+
dividerMode: false, // Using dividers vs headings
|
|
88
|
+
groups: 0
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### ByType View
|
|
94
|
+
|
|
95
|
+
Elements organized by type with context:
|
|
96
|
+
|
|
97
|
+
```js
|
|
98
|
+
result.byType = {
|
|
99
|
+
headings: [
|
|
100
|
+
{
|
|
101
|
+
type: "heading",
|
|
102
|
+
level: 1,
|
|
103
|
+
content: "Welcome",
|
|
104
|
+
context: {
|
|
105
|
+
position: 0,
|
|
106
|
+
previousElement: null,
|
|
107
|
+
nextElement: { type: "paragraph", ... },
|
|
108
|
+
nearestHeading: null
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
],
|
|
112
|
+
paragraphs: [ /* ... */ ],
|
|
113
|
+
images: {
|
|
114
|
+
background: [],
|
|
115
|
+
content: [],
|
|
116
|
+
gallery: [],
|
|
117
|
+
icon: []
|
|
118
|
+
},
|
|
119
|
+
lists: [],
|
|
120
|
+
metadata: {
|
|
121
|
+
totalElements: 2,
|
|
122
|
+
dominantType: "paragraph",
|
|
123
|
+
hasMedia: false
|
|
124
|
+
},
|
|
125
|
+
// Helper methods
|
|
126
|
+
getHeadingsByLevel(level),
|
|
127
|
+
getElementsByHeadingContext(filter)
|
|
128
|
+
}
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Common Use Cases
|
|
132
|
+
|
|
133
|
+
### Extracting Main Content
|
|
134
|
+
|
|
135
|
+
```js
|
|
136
|
+
const { groups } = parseContent(doc);
|
|
137
|
+
|
|
138
|
+
const title = groups.main.header.title;
|
|
139
|
+
const description = groups.main.body.paragraphs.join(" ");
|
|
140
|
+
const image = groups.main.banner?.url;
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Processing Content Sections
|
|
144
|
+
|
|
145
|
+
```js
|
|
146
|
+
const { groups } = parseContent(doc);
|
|
147
|
+
|
|
148
|
+
// Main content
|
|
149
|
+
console.log("Main:", groups.main.header.title);
|
|
150
|
+
|
|
151
|
+
// Additional sections
|
|
152
|
+
groups.items.forEach(item => {
|
|
153
|
+
console.log("Section:", item.header.title);
|
|
154
|
+
console.log("Content:", item.body.paragraphs);
|
|
155
|
+
});
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Finding Specific Elements
|
|
159
|
+
|
|
160
|
+
```js
|
|
161
|
+
const { byType } = parseContent(doc);
|
|
162
|
+
|
|
163
|
+
// Get all H2 headings
|
|
164
|
+
const subheadings = byType.getHeadingsByLevel(2);
|
|
165
|
+
|
|
166
|
+
// Get all background images
|
|
167
|
+
const backgrounds = byType.images.background;
|
|
168
|
+
|
|
169
|
+
// Get content under specific headings
|
|
170
|
+
const features = byType.getElementsByHeadingContext(
|
|
171
|
+
h => h.content.includes("Features")
|
|
172
|
+
);
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Sequential Processing
|
|
176
|
+
|
|
177
|
+
```js
|
|
178
|
+
const { sequence } = parseContent(doc);
|
|
179
|
+
|
|
180
|
+
sequence.forEach(element => {
|
|
181
|
+
switch(element.type) {
|
|
182
|
+
case 'heading':
|
|
183
|
+
renderHeading(element);
|
|
184
|
+
break;
|
|
185
|
+
case 'paragraph':
|
|
186
|
+
renderParagraph(element);
|
|
187
|
+
break;
|
|
188
|
+
case 'image':
|
|
189
|
+
renderImage(element);
|
|
190
|
+
break;
|
|
191
|
+
}
|
|
192
|
+
});
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Content Mapping Utilities
|
|
196
|
+
|
|
197
|
+
The parser includes optional mapping utilities that transform parsed content into component-specific formats. They are well suited to visual editors and component-based systems.
|
|
198
|
+
|
|
199
|
+
### Type System (Recommended)
|
|
200
|
+
|
|
201
|
+
Automatically transform content based on field types with context-aware behavior:
|
|
202
|
+
|
|
203
|
+
```js
|
|
204
|
+
const schema = {
|
|
205
|
+
title: {
|
|
206
|
+
path: "groups.main.header.title",
|
|
207
|
+
type: "plaintext", // Auto-strips <strong>, <em>, etc.
|
|
208
|
+
maxLength: 60 // Auto-truncates intelligently
|
|
209
|
+
},
|
|
210
|
+
excerpt: {
|
|
211
|
+
path: "groups.main.body.paragraphs",
|
|
212
|
+
type: "excerpt", // Auto-creates excerpt from paragraphs
|
|
213
|
+
maxLength: 150
|
|
214
|
+
},
|
|
215
|
+
image: {
|
|
216
|
+
path: "groups.main.body.imgs[0].url",
|
|
217
|
+
type: "image",
|
|
218
|
+
defaultValue: "/placeholder.jpg"
|
|
219
|
+
}
|
|
220
|
+
};
|
|
221
|
+
|
|
222
|
+
// Visual editor mode (default) - silent, graceful cleanup
|
|
223
|
+
const data = mappers.extractBySchema(parsed, schema);
|
|
224
|
+
|
|
225
|
+
// Build mode - validates and warns
|
|
226
|
+
const buildData = mappers.extractBySchema(parsed, schema, { mode: 'build' });
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
**Field Types:** `plaintext`, `richtext`, `excerpt`, `number`, `image`, `link`
|
|
230
|
+
|
|
231
|
+
### Using Pre-Built Extractors
|
|
232
|
+
|
|
233
|
+
```js
|
|
234
|
+
import { parseContent, mappers } from "@uniwebcms/semantic-parser";
|
|
235
|
+
|
|
236
|
+
const parsed = parseContent(doc);
|
|
237
|
+
|
|
238
|
+
// Extract hero component data
|
|
239
|
+
const heroData = mappers.extractors.hero(parsed);
|
|
240
|
+
// { title, subtitle, kicker, description, image, cta, ... }
|
|
241
|
+
|
|
242
|
+
// Extract card data
|
|
243
|
+
const cards = mappers.extractors.card(parsed, { useItems: true });
|
|
244
|
+
|
|
245
|
+
// Extract statistics
|
|
246
|
+
const stats = mappers.extractors.stats(parsed);
|
|
247
|
+
// [{ value: "12", label: "Partner Labs" }, ...]
|
|
248
|
+
|
|
249
|
+
// Extract navigation menu
|
|
250
|
+
const nav = mappers.extractors.navigation(parsed);
|
|
251
|
+
|
|
252
|
+
// Extract features list
|
|
253
|
+
const features = mappers.extractors.features(parsed);
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Schema-Based Mapping
|
|
257
|
+
|
|
258
|
+
Define custom mappings using schemas:
|
|
259
|
+
|
|
260
|
+
```js
|
|
261
|
+
const schema = {
|
|
262
|
+
brand: "groups.main.header.pretitle",
|
|
263
|
+
title: "groups.main.header.title",
|
|
264
|
+
subtitle: "groups.main.header.subtitle",
|
|
265
|
+
image: {
|
|
266
|
+
path: "groups.main.body.imgs[0].url",
|
|
267
|
+
defaultValue: "/placeholder.jpg"
|
|
268
|
+
},
|
|
269
|
+
actions: {
|
|
270
|
+
path: "groups.main.body.links",
|
|
271
|
+
transform: links => links.map(l => ({ label: l.label, type: "primary" }))
|
|
272
|
+
}
|
|
273
|
+
};
|
|
274
|
+
|
|
275
|
+
const componentData = mappers.accessor.extractBySchema(parsed, schema);
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### Available Extractors
|
|
279
|
+
|
|
280
|
+
- `hero` - Hero/banner sections
|
|
281
|
+
- `card` - Card components
|
|
282
|
+
- `article` - Article/blog content
|
|
283
|
+
- `stats` - Statistics/metrics
|
|
284
|
+
- `navigation` - Navigation menus
|
|
285
|
+
- `features` - Feature lists
|
|
286
|
+
- `testimonial` - Testimonials
|
|
287
|
+
- `faq` - FAQ sections
|
|
288
|
+
- `pricing` - Pricing tiers
|
|
289
|
+
- `team` - Team members
|
|
290
|
+
- `gallery` - Image galleries
|
|
291
|
+
|
|
292
|
+
See **[Mapping Patterns Guide](./docs/mapping-patterns.md)** for complete documentation.
|
|
293
|
+
|
|
294
|
+
## Rendering Content
|
|
295
|
+
|
|
296
|
+
After extracting content, render it using a Text component that handles paragraph arrays, rich HTML, and formatting marks.
|
|
297
|
+
|
|
298
|
+
### Text Component Pattern
|
|
299
|
+
|
|
300
|
+
```jsx
|
|
301
|
+
import { parseContent, mappers } from '@uniwebcms/semantic-parser';
|
|
302
|
+
import { H1, P } from './components/Text';
|
|
303
|
+
|
|
304
|
+
const parsed = parseContent(doc);
|
|
305
|
+
const hero = mappers.extractors.hero(parsed);
|
|
306
|
+
|
|
307
|
+
// Render extracted content
|
|
308
|
+
<>
|
|
309
|
+
<H1 text={hero.title} />
|
|
310
|
+
<P text={hero.description} /> {/* Handles arrays automatically */}
|
|
311
|
+
</>
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
The Text component:
|
|
315
|
+
- **Handles arrays** - Renders `["Para 1", "Para 2"]` as separate paragraphs
|
|
316
|
+
- **Supports rich HTML** - Preserves formatting marks
|
|
317
|
+
- **Multi-line headings** - Wraps multiple lines in semantic heading tags
|
|
318
|
+
- **Color marks** - Supports `<mark>` and `<span>` for visual emphasis
|
|
319
|
+
|
|
320
|
+
See **[Text Component Reference](./docs/text-component-reference.md)** for implementation guide.
|
|
321
|
+
|
|
322
|
+
### Sanitization
|
|
323
|
+
|
|
324
|
+
Sanitize content at the engine level (during data preparation), not in components:
|
|
325
|
+
|
|
326
|
+
```javascript
|
|
327
|
+
import { parseContent, mappers } from '@uniwebcms/semantic-parser';
|
|
328
|
+
|
|
329
|
+
function prepareData(parsed) {
|
|
330
|
+
const hero = mappers.extractors.hero(parsed);
|
|
331
|
+
return {
|
|
332
|
+
...hero,
|
|
333
|
+
title: mappers.types.sanitizeHtml(hero.title, {
|
|
334
|
+
allowedTags: ['strong', 'em', 'mark', 'span'],
|
|
335
|
+
allowedAttr: ['class', 'data-variant']
|
|
336
|
+
})
|
|
337
|
+
};
|
|
338
|
+
}
|
|
339
|
+
```
|
|
340
|
+
|
|
341
|
+
The parser provides sanitization utilities but doesn't enforce their use. Your engine decides when to sanitize based on security requirements.
|
|
342
|
+
|
|
343
|
+
## Content Grouping
|
|
344
|
+
|
|
345
|
+
The parser supports two grouping modes:
|
|
346
|
+
|
|
347
|
+
### Heading-Based Grouping (Default)
|
|
348
|
+
|
|
349
|
+
Groups are created based on heading patterns. A new group starts when:
|
|
350
|
+
- A heading follows content
|
|
351
|
+
- Multiple H1s appear (in which case no main content is created)
|
|
352
|
+
- The heading level indicates a new section
|
|
353
|
+
|
|
354
|
+
**Pretitle Detection:** Any heading followed by a more important (lower-numbered) heading is automatically detected as a pretitle:
|
|
355
|
+
- H3 before H1 → pretitle ✅
|
|
356
|
+
- H2 before H1 → pretitle ✅
|
|
357
|
+
- H6 before H5 → pretitle ✅
|
|
358
|
+
- H4 before H2 → pretitle ✅
|
|
359
|
+
|
|
360
|
+
No configuration needed - it just works naturally!
|
|
361
|
+
|
|
362
|
+
### Divider-Based Grouping
|
|
363
|
+
|
|
364
|
+
When any horizontal rule (`---`) is present, the entire document uses divider-based grouping. Groups are split explicitly by dividers.
|
|
365
|
+
|
|
366
|
+
## Text Formatting
|
|
367
|
+
|
|
368
|
+
Inline formatting is preserved as HTML tags:
|
|
369
|
+
|
|
370
|
+
```js
|
|
371
|
+
// Input: Text with bold mark
|
|
372
|
+
// Output: "Text with <strong>bold</strong>"
|
|
373
|
+
|
|
374
|
+
// Input: Link mark
|
|
375
|
+
// Output: "Click <a href=\"/docs\">here</a>"
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
## Documentation
|
|
379
|
+
|
|
380
|
+
- **[Content Writing Guide](./docs/guide.md)**: Learn how to structure content for optimal parsing
|
|
381
|
+
- **[API Reference](./docs/api.md)**: Complete API documentation with all element types
|
|
382
|
+
- **[Mapping Patterns Guide](./docs/mapping-patterns.md)**: Transform content to component-specific formats
|
|
383
|
+
- **[Text Component Reference](./docs/text-component-reference.md)**: Reference implementation for rendering parsed content
|
|
384
|
+
- **[File Structure](./docs/file-structure.md)**: Codebase organization
|
|
385
|
+
|
|
386
|
+
## Use Cases
|
|
387
|
+
|
|
388
|
+
- **Component-based websites**: Extract structured data for React/Vue components
|
|
389
|
+
- **Content management**: Parse editor content into database-friendly structures
|
|
390
|
+
- **Static site generation**: Transform rich content into template-ready data
|
|
391
|
+
- **Content analysis**: Analyze document structure and content types
|
|
392
|
+
|
|
393
|
+
## License
|
|
394
|
+
|
|
395
|
+
GPL-3.0-or-later
|