@uniweb/semantic-parser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,395 @@
1
+ # @uniweb/semantic-parser
2
+
3
+ A semantic parser for ProseMirror/TipTap content structures that helps bridge the gap between natural content writing and component-based web development.
4
+
5
+ ## What it Does
6
+
7
+ The parser transforms rich text editor content (ProseMirror/TipTap) into structured, semantic groups that web components can easily consume. It provides three complementary views of your content:
8
+
9
+ 1. **Sequence**: A flat, ordered list of all content elements
10
+ 2. **Groups**: Content organized into semantic sections with identified main content
11
+ 3. **ByType**: Elements categorized by type for easy filtering and queries
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ npm install @uniweb/semantic-parser
17
+ ```
18
+
19
+ ## Quick Start
20
+
21
+ ```js
22
+ import { parseContent } from "@uniweb/semantic-parser";
23
+
24
+ // Your ProseMirror/TipTap document
25
+ const doc = {
26
+ type: "doc",
27
+ content: [
28
+ {
29
+ type: "heading",
30
+ attrs: { level: 1 },
31
+ content: [{ type: "text", text: "Welcome" }],
32
+ },
33
+ {
34
+ type: "paragraph",
35
+ content: [{ type: "text", text: "Get started today." }],
36
+ },
37
+ ],
38
+ };
39
+
40
+ // Parse the content
41
+ const result = parseContent(doc);
42
+
43
+ // Access different views
44
+ console.log(result.sequence); // Flat array of elements
45
+ console.log(result.groups); // Semantic groups with main/items
46
+ console.log(result.byType); // Elements organized by type
47
+ ```
48
+
49
+ ## Output Structure
50
+
51
+ ### Sequence View
52
+
53
+ A flat array of semantic elements preserving document order:
54
+
55
+ ```js
56
+ result.sequence = [
57
+ { type: "heading", level: 1, content: "Welcome" },
58
+ { type: "paragraph", content: "Get started today." }
59
+ ]
60
+ ```
61
+
62
+ ### Groups View
63
+
64
+ Content organized into semantic groups:
65
+
66
+ ```js
67
+ result.groups = {
68
+ main: {
69
+ header: {
70
+ pretitle: "", // Heading placed before a more important main title (e.g. H3 before H1)
71
+ title: "Welcome", // Main heading
72
+ subtitle: "" // Heading after main title
73
+ },
74
+ body: {
75
+ paragraphs: ["Get started today."],
76
+ imgs: [],
77
+ videos: [],
78
+ links: [],
79
+ lists: [],
80
+ // ... more content types
81
+ },
82
+ banner: null, // Optional banner image
83
+ metadata: { level: 1 }
84
+ },
85
+ items: [], // Additional content groups
86
+ metadata: {
87
+ dividerMode: false, // Using dividers vs headings
88
+ groups: 0
89
+ }
90
+ }
91
+ ```
92
+
93
+ ### ByType View
94
+
95
+ Elements organized by type with context:
96
+
97
+ ```js
98
+ result.byType = {
99
+ headings: [
100
+ {
101
+ type: "heading",
102
+ level: 1,
103
+ content: "Welcome",
104
+ context: {
105
+ position: 0,
106
+ previousElement: null,
107
+ nextElement: { type: "paragraph", ... },
108
+ nearestHeading: null
109
+ }
110
+ }
111
+ ],
112
+ paragraphs: [ /* ... */ ],
113
+ images: {
114
+ background: [],
115
+ content: [],
116
+ gallery: [],
117
+ icon: []
118
+ },
119
+ lists: [],
120
+ metadata: {
121
+ totalElements: 2,
122
+ dominantType: "paragraph",
123
+ hasMedia: false
124
+ },
125
+ // Helper methods
126
+ getHeadingsByLevel(level),
127
+ getElementsByHeadingContext(filter)
128
+ }
129
+ ```
130
+
131
+ ## Common Use Cases
132
+
133
+ ### Extracting Main Content
134
+
135
+ ```js
136
+ const { groups } = parseContent(doc);
137
+
138
+ const title = groups.main.header.title;
139
+ const description = groups.main.body.paragraphs.join(" ");
140
+ const image = groups.main.banner?.url;
141
+ ```
142
+
143
+ ### Processing Content Sections
144
+
145
+ ```js
146
+ const { groups } = parseContent(doc);
147
+
148
+ // Main content
149
+ console.log("Main:", groups.main.header.title);
150
+
151
+ // Additional sections
152
+ groups.items.forEach(item => {
153
+ console.log("Section:", item.header.title);
154
+ console.log("Content:", item.body.paragraphs);
155
+ });
156
+ ```
157
+
158
+ ### Finding Specific Elements
159
+
160
+ ```js
161
+ const { byType } = parseContent(doc);
162
+
163
+ // Get all H2 headings
164
+ const subheadings = byType.getHeadingsByLevel(2);
165
+
166
+ // Get all background images
167
+ const backgrounds = byType.images.background;
168
+
169
+ // Get content under specific headings
170
+ const features = byType.getElementsByHeadingContext(
171
+ h => h.content.includes("Features")
172
+ );
173
+ ```
174
+
175
+ ### Sequential Processing
176
+
177
+ ```js
178
+ const { sequence } = parseContent(doc);
179
+
180
+ sequence.forEach(element => {
181
+ switch(element.type) {
182
+ case 'heading':
183
+ renderHeading(element);
184
+ break;
185
+ case 'paragraph':
186
+ renderParagraph(element);
187
+ break;
188
+ case 'image':
189
+ renderImage(element);
190
+ break;
191
+ }
192
+ });
193
+ ```
194
+
195
+ ## Content Mapping Utilities
196
+
197
+ The parser includes optional mapping utilities to transform parsed content into component-specific formats. These utilities are well suited to visual editors and component-based systems.
198
+
199
+ ### Type System (Recommended)
200
+
201
+ Automatically transform content based on field types with context-aware behavior:
202
+
203
+ ```js
204
+ const schema = {
205
+ title: {
206
+ path: "groups.main.header.title",
207
+ type: "plaintext", // Auto-strips <strong>, <em>, etc.
208
+ maxLength: 60 // Auto-truncates intelligently
209
+ },
210
+ excerpt: {
211
+ path: "groups.main.body.paragraphs",
212
+ type: "excerpt", // Auto-creates excerpt from paragraphs
213
+ maxLength: 150
214
+ },
215
+ image: {
216
+ path: "groups.main.body.imgs[0].url",
217
+ type: "image",
218
+ defaultValue: "/placeholder.jpg"
219
+ }
220
+ };
221
+
222
+ // Visual editor mode (default) - silent, graceful cleanup
223
+ const data = mappers.extractBySchema(parsed, schema);
224
+
225
+ // Build mode - validates and warns
226
+ const buildData = mappers.extractBySchema(parsed, schema, { mode: 'build' });
227
+ ```
228
+
229
+ **Field Types:** `plaintext`, `richtext`, `excerpt`, `number`, `image`, `link`
230
+
231
+ ### Using Pre-Built Extractors
232
+
233
+ ```js
234
+ import { parseContent, mappers } from "@uniweb/semantic-parser";
235
+
236
+ const parsed = parseContent(doc);
237
+
238
+ // Extract hero component data
239
+ const heroData = mappers.extractors.hero(parsed);
240
+ // { title, subtitle, kicker, description, image, cta, ... }
241
+
242
+ // Extract card data
243
+ const cards = mappers.extractors.card(parsed, { useItems: true });
244
+
245
+ // Extract statistics
246
+ const stats = mappers.extractors.stats(parsed);
247
+ // [{ value: "12", label: "Partner Labs" }, ...]
248
+
249
+ // Extract navigation menu
250
+ const nav = mappers.extractors.navigation(parsed);
251
+
252
+ // Extract features list
253
+ const features = mappers.extractors.features(parsed);
254
+ ```
255
+
256
+ ### Schema-Based Mapping
257
+
258
+ Define custom mappings using schemas:
259
+
260
+ ```js
261
+ const schema = {
262
+ brand: "groups.main.header.pretitle",
263
+ title: "groups.main.header.title",
264
+ subtitle: "groups.main.header.subtitle",
265
+ image: {
266
+ path: "groups.main.body.imgs[0].url",
267
+ defaultValue: "/placeholder.jpg"
268
+ },
269
+ actions: {
270
+ path: "groups.main.body.links",
271
+ transform: links => links.map(l => ({ label: l.label, type: "primary" }))
272
+ }
273
+ };
274
+
275
+ const componentData = mappers.accessor.extractBySchema(parsed, schema);
276
+ ```
277
+
278
+ ### Available Extractors
279
+
280
+ - `hero` - Hero/banner sections
281
+ - `card` - Card components
282
+ - `article` - Article/blog content
283
+ - `stats` - Statistics/metrics
284
+ - `navigation` - Navigation menus
285
+ - `features` - Feature lists
286
+ - `testimonial` - Testimonials
287
+ - `faq` - FAQ sections
288
+ - `pricing` - Pricing tiers
289
+ - `team` - Team members
290
+ - `gallery` - Image galleries
291
+
292
+ See **[Mapping Patterns Guide](./docs/mapping-patterns.md)** for complete documentation.
293
+
294
+ ## Rendering Content
295
+
296
+ After extracting content, render it using a Text component that handles paragraph arrays, rich HTML, and formatting marks.
297
+
298
+ ### Text Component Pattern
299
+
300
+ ```jsx
301
+ import { parseContent, mappers } from '@uniweb/semantic-parser';
302
+ import { H1, P } from './components/Text';
303
+
304
+ const parsed = parseContent(doc);
305
+ const hero = mappers.extractors.hero(parsed);
306
+
307
+ // Render extracted content
308
+ <>
309
+ <H1 text={hero.title} />
310
+ <P text={hero.description} /> {/* Handles arrays automatically */}
311
+ </>
312
+ ```
313
+
314
+ The Text component:
315
+ - **Handles arrays** - Renders `["Para 1", "Para 2"]` as separate paragraphs
316
+ - **Supports rich HTML** - Preserves formatting marks
317
+ - **Multi-line headings** - Wraps multiple lines in semantic heading tags
318
+ - **Color marks** - Supports `<mark>` and `<span>` for visual emphasis
319
+
320
+ See **[Text Component Reference](./docs/text-component-reference.md)** for implementation guide.
321
+
322
+ ### Sanitization
323
+
324
+ Sanitize content at the engine level (during data preparation), not in components:
325
+
326
+ ```javascript
327
+ import { parseContent, mappers } from '@uniweb/semantic-parser';
328
+
329
+ function prepareData(parsed) {
330
+ const hero = mappers.extractors.hero(parsed);
331
+ return {
332
+ ...hero,
333
+ title: mappers.types.sanitizeHtml(hero.title, {
334
+ allowedTags: ['strong', 'em', 'mark', 'span'],
335
+ allowedAttr: ['class', 'data-variant']
336
+ })
337
+ };
338
+ }
339
+ ```
340
+
341
+ The parser provides sanitization utilities but doesn't enforce their use. Your engine decides when to sanitize based on security requirements.
342
+
343
+ ## Content Grouping
344
+
345
+ The parser supports two grouping modes:
346
+
347
+ ### Heading-Based Grouping (Default)
348
+
349
+ Groups are created based on heading patterns. A new group starts when:
350
+ - A heading follows content
351
+ - Multiple H1s appear (in this case no main content group is created)
352
+ - The heading level indicates a new section
353
+
354
+ **Pretitle Detection:** Any heading immediately followed by a more important heading (one with a lower level number) is automatically detected as a pretitle:
355
+ - H3 before H1 → pretitle ✅
356
+ - H2 before H1 → pretitle ✅
357
+ - H6 before H5 → pretitle ✅
358
+ - H4 before H2 → pretitle ✅
359
+
360
+ No configuration needed - it just works naturally!
361
+
362
+ ### Divider-Based Grouping
363
+
364
+ When any horizontal rule (`---`) is present, the entire document uses divider-based grouping. Groups are split explicitly by dividers.
365
+
366
+ ## Text Formatting
367
+
368
+ Inline formatting is preserved as HTML tags:
369
+
370
+ ```js
371
+ // Input: Text with bold mark
372
+ // Output: "Text with <strong>bold</strong>"
373
+
374
+ // Input: Link mark
375
+ // Output: "Click <a href=\"/docs\">here</a>"
376
+ ```
377
+
378
+ ## Documentation
379
+
380
+ - **[Content Writing Guide](./docs/guide.md)**: Learn how to structure content for optimal parsing
381
+ - **[API Reference](./docs/api.md)**: Complete API documentation with all element types
382
+ - **[Mapping Patterns Guide](./docs/mapping-patterns.md)**: Transform content to component-specific formats
383
+ - **[Text Component Reference](./docs/text-component-reference.md)**: Reference implementation for rendering parsed content
384
+ - **[File Structure](./docs/file-structure.md)**: Codebase organization
385
+
386
+ ## Use Cases
387
+
388
+ - **Component-based websites**: Extract structured data for React/Vue components
389
+ - **Content management**: Parse editor content into database-friendly structures
390
+ - **Static site generation**: Transform rich content into template-ready data
391
+ - **Content analysis**: Analyze document structure and content types
392
+
393
+ ## License
394
+
395
+ GPL-3.0-or-later