@uniweb/semantic-parser 1.1.4 → 1.1.6

This diff shows the changes between two publicly available versions of a package, as those versions appear in the supported public registry they were released to. It is provided for informational purposes only.
package/AGENTS.md CHANGED
@@ -61,16 +61,16 @@ The parser returns a flat content structure:
61
61
  title: '', // Main heading
62
62
  pretitle: '', // Heading before main title
63
63
  subtitle: '', // Heading after main title
64
- subtitle2: '', // Third heading level
65
64
  paragraphs: [],
66
65
  links: [], // All link-like entities (including buttons, documents)
67
- imgs: [],
66
+ images: [],
68
67
  icons: [],
69
68
  videos: [],
70
69
  lists: [],
71
70
  quotes: [],
72
- data: {}, // Structured data (tagged code blocks, forms, cards)
73
- headings: [], // Overflow headings after title/subtitle/subtitle2
71
+ snippets: [], // Fenced code [{ language, code }]
72
+ data: {}, // Structured data (tagged data blocks, forms, cards)
73
+ headings: [], // Headings after subtitle, in document order
74
74
  items: [], // Child content groups (same structure recursively)
75
75
  }
76
76
  ```
@@ -125,11 +125,9 @@ Editor-specific nodes are mapped to standard entities:
125
125
  - `card-group` → `data[cardType]` arrays (e.g., `data.person`, `data.event`)
126
126
  - `document-group` → `links[]` with `role: "document"` and `download: true`
127
127
 
128
- See `docs/entity-consolidation.md` for complete mapping documentation.
128
+ ### Tagged Data Blocks
129
129
 
130
- ### Tagged Code Blocks
131
-
132
- Code blocks with tags route parsed data to the `data` object:
130
+ Data blocks with tags route parsed data to the `data` object:
133
131
 
134
132
  ```markdown
135
133
  ```yaml:nav-links
@@ -164,14 +162,14 @@ Lists maintain hierarchy through nested structure. The `processListItems()` func
164
162
 
165
163
  ## Content Writing Conventions
166
164
 
167
- The parser implements the semantic conventions documented in `docs/guide.md`. Key patterns:
165
+ Key patterns:
168
166
 
169
167
  - **Pretitle Pattern**: Any heading followed by a more important heading (e.g., H3→H1, H2→H1, H6→H5)
170
168
  - **Banner Pattern**: Image (with banner role or followed by heading) at start of first group
171
169
  - **Divider Mode**: Presence of any `horizontalRule` switches entire document to divider-based grouping
172
170
  - **Heading Groups**: Consecutive headings with increasing levels are consumed together
173
171
  - **Main Content**: The first group is main if it's the only group OR has a lower heading level than the second group
174
- - **Body Headings**: Headings that overflow the header slots (title, subtitle, subtitle2) are automatically collected in `body.headings`
172
+ - **Body Headings**: Headings after the title and subtitle slots are collected in `body.headings` in document order
175
173
 
176
174
  ## Testing Structure
177
175
 
@@ -187,6 +185,5 @@ Tests are organized by processor:
187
185
 
188
186
  - The parser never modifies the original ProseMirror document
189
187
  - Text content can include inline HTML for formatting (bold → `<strong>`, italic → `<em>`, links → `<a>`)
190
- - The `processors_old/` directory contains legacy implementations - do not modify
191
188
  - Context information in byType includes position, previous/next elements, and nearest heading
192
189
  - Group splitting logic differs significantly between heading mode and divider mode
package/README.md CHANGED
@@ -68,18 +68,17 @@ result = {
68
68
  pretitle: "", // Heading before main title
69
69
  title: "Welcome", // Main heading
70
70
  subtitle: "", // Heading after main title
71
- subtitle2: "", // Third heading level
72
71
 
73
72
  // Body fields
74
73
  paragraphs: ["Get started today."],
75
74
  links: [], // All links (including buttons, documents)
76
- imgs: [],
75
+ images: [],
77
76
  videos: [],
78
77
  icons: [],
79
78
  lists: [],
80
79
  quotes: [],
81
- data: {}, // Structured data (tagged code blocks, forms, cards)
82
- headings: [], // Overflow headings after title/subtitle/subtitle2
80
+ data: {}, // Structured data (tagged data blocks, forms, cards)
81
+ headings: [], // Headings after subtitle, in document order
83
82
 
84
83
  // Additional content groups (from headings after content)
85
84
  items: [
@@ -143,154 +142,6 @@ sequence.forEach(element => {
143
142
  });
144
143
  ```
145
144
 
146
- ## Content Mapping Utilities
147
-
148
- The parser includes optional mapping utilities to transform parsed content into component-specific formats. Perfect for visual editors and component-based systems.
149
-
150
- ### Type System (Recommended)
151
-
152
- Automatically transform content based on field types with context-aware behavior:
153
-
154
- ```js
155
- const schema = {
156
- title: {
157
- path: "title",
158
- type: "plaintext", // Auto-strips <strong>, <em>, etc.
159
- maxLength: 60 // Auto-truncates intelligently
160
- },
161
- excerpt: {
162
- path: "paragraphs",
163
- type: "excerpt", // Auto-creates excerpt from paragraphs
164
- maxLength: 150
165
- },
166
- image: {
167
- path: "imgs[0].url",
168
- type: "image",
169
- defaultValue: "/placeholder.jpg"
170
- }
171
- };
172
-
173
- // Visual editor mode (default) - silent, graceful cleanup
174
- const data = mappers.extractBySchema(parsed, schema);
175
-
176
- // Build mode - validates and warns
177
- const data = mappers.extractBySchema(parsed, schema, { mode: 'build' });
178
- ```
179
-
180
- **Field Types:** `plaintext`, `richtext`, `excerpt`, `number`, `image`, `link`
181
-
182
- ### Using Pre-Built Extractors
183
-
184
- ```js
185
- import { parseContent, mappers } from "@uniweb/semantic-parser";
186
-
187
- const parsed = parseContent(doc);
188
-
189
- // Extract hero component data
190
- const heroData = mappers.extractors.hero(parsed);
191
- // { title, subtitle, kicker, description, image, cta, ... }
192
-
193
- // Extract card data
194
- const cards = mappers.extractors.card(parsed, { useItems: true });
195
-
196
- // Extract statistics
197
- const stats = mappers.extractors.stats(parsed);
198
- // [{ value: "12", label: "Partner Labs" }, ...]
199
-
200
- // Extract navigation menu
201
- const nav = mappers.extractors.navigation(parsed);
202
-
203
- // Extract features list
204
- const features = mappers.extractors.features(parsed);
205
- ```
206
-
207
- ### Schema-Based Mapping
208
-
209
- Define custom mappings using schemas:
210
-
211
- ```js
212
- const schema = {
213
- brand: "pretitle",
214
- title: "title",
215
- subtitle: "subtitle",
216
- image: {
217
- path: "imgs[0].url",
218
- defaultValue: "/placeholder.jpg"
219
- },
220
- actions: {
221
- path: "links",
222
- transform: links => links.map(l => ({ label: l.label, type: "primary" }))
223
- }
224
- };
225
-
226
- const componentData = mappers.accessor.extractBySchema(parsed, schema);
227
- ```
228
-
229
- ### Available Extractors
230
-
231
- - `hero` - Hero/banner sections
232
- - `card` - Card components
233
- - `article` - Article/blog content
234
- - `stats` - Statistics/metrics
235
- - `navigation` - Navigation menus
236
- - `features` - Feature lists
237
- - `testimonial` - Testimonials
238
- - `faq` - FAQ sections
239
- - `pricing` - Pricing tiers
240
- - `team` - Team members
241
- - `gallery` - Image galleries
242
-
243
- See **[Mapping Patterns Guide](./docs/mapping-patterns.md)** for complete documentation.
244
-
245
- ## Rendering Content
246
-
247
- After extracting content, render it using a Text component that handles paragraph arrays, rich HTML, and formatting marks.
248
-
249
- ### Text Component Pattern
250
-
251
- ```jsx
252
- import { parseContent, mappers } from '@uniweb/semantic-parser';
253
- import { H1, P } from './components/Text';
254
-
255
- const parsed = parseContent(doc);
256
- const hero = mappers.extractors.hero(parsed);
257
-
258
- // Render extracted content
259
- <>
260
- <H1 text={hero.title} />
261
- <P text={hero.description} /> {/* Handles arrays automatically */}
262
- </>
263
- ```
264
-
265
- The Text component:
266
- - **Handles arrays** - Renders `["Para 1", "Para 2"]` as separate paragraphs
267
- - **Supports rich HTML** - Preserves formatting marks
268
- - **Multi-line headings** - Wraps multiple lines in semantic heading tags
269
- - **Color marks** - Supports `<mark>` and `<span>` for visual emphasis
270
-
271
- See **[Text Component Reference](./docs/text-component-reference.md)** for implementation guide.
272
-
273
- ### Sanitization
274
-
275
- Sanitize content at the engine level (during data preparation), not in components:
276
-
277
- ```javascript
278
- import { parseContent, mappers } from '@uniweb/semantic-parser';
279
-
280
- function prepareData(parsed) {
281
- const hero = mappers.extractors.hero(parsed);
282
- return {
283
- ...hero,
284
- title: mappers.types.sanitizeHtml(hero.title, {
285
- allowedTags: ['strong', 'em', 'mark', 'span'],
286
- allowedAttr: ['class', 'data-variant']
287
- })
288
- };
289
- }
290
- ```
291
-
292
- The parser provides sanitization utilities but doesn't enforce their use. Your engine decides when to sanitize based on security requirements.
293
-
294
145
  ## Content Grouping
295
146
 
296
147
  The parser supports two grouping modes:
@@ -346,14 +197,6 @@ Bracketed spans (`[text]{.class}`) are converted to `<span>` elements with their
346
197
 
347
198
  Spans can have classes, IDs, and custom attributes. They combine with other marks—a span with bold becomes `<strong><span class="...">text</span></strong>`.
348
199
 
349
- ## Documentation
350
-
351
- - **[Content Writing Guide](./docs/guide.md)**: Learn how to structure content for optimal parsing
352
- - **[API Reference](./docs/api.md)**: Complete API documentation with all element types
353
- - **[Mapping Patterns Guide](./docs/mapping-patterns.md)**: Transform content to component-specific formats
354
- - **[Text Component Reference](./docs/text-component-reference.md)**: Reference implementation for rendering parsed content
355
- - **[File Structure](./docs/file-structure.md)**: Codebase organization
356
-
357
200
  ## Use Cases
358
201
 
359
202
  - **Component-based websites**: Extract structured data for React/Vue components
package/package.json CHANGED
@@ -1,13 +1,11 @@
1
1
  {
2
2
  "name": "@uniweb/semantic-parser",
3
- "version": "1.1.4",
3
+ "version": "1.1.6",
4
4
  "description": "Semantic parser for ProseMirror/TipTap content structures",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
7
7
  "exports": {
8
- ".": "./src/index.js",
9
- "./mappers": "./src/mappers/index.js",
10
- "./mappers/*": "./src/mappers/*.js"
8
+ ".": "./src/index.js"
11
9
  },
12
10
  "keywords": [
13
11
  "prosemirror",
@@ -30,7 +28,6 @@
30
28
  },
31
29
  "homepage": "https://github.com/uniweb/semantic-parser#readme",
32
30
  "directories": {
33
- "doc": "docs",
34
31
  "test": "tests"
35
32
  },
36
33
  "dependencies": {
package/src/index.js CHANGED
@@ -1,6 +1,5 @@
1
1
  import { processSequence } from "./processors/sequence.js";
2
2
  import { processGroups } from "./processors/groups.js";
3
- import * as mappers from "./mappers/index.js";
4
3
 
5
4
  /**
6
5
  * Parse ProseMirror/TipTap content into semantic structure
@@ -30,4 +29,4 @@ function parseContent(doc, options = {}) {
30
29
  };
31
30
  }
32
31
 
33
- export { parseContent, mappers };
32
+ export { parseContent };
@@ -9,14 +9,14 @@ function flattenGroup(group) {
9
9
  title: group.header.title || '',
10
10
  pretitle: group.header.pretitle || '',
11
11
  subtitle: group.header.subtitle || '',
12
- subtitle2: group.header.subtitle2 || '',
13
12
  paragraphs: group.body.paragraphs || [],
14
13
  links: group.body.links || [],
15
- imgs: group.body.imgs || [],
14
+ images: group.body.images || [],
16
15
  icons: group.body.icons || [],
17
16
  lists: group.body.lists || [],
18
17
  videos: group.body.videos || [],
19
18
  insets: group.body.insets || [],
19
+ snippets: group.body.snippets || [],
20
20
  data: group.body.data || {},
21
21
  quotes: group.body.quotes || [],
22
22
  headings: group.body.headings || [],
@@ -36,14 +36,14 @@ function processGroups(sequence, options = {}) {
36
36
  title: '',
37
37
  pretitle: '',
38
38
  subtitle: '',
39
- subtitle2: '',
40
39
  paragraphs: [],
41
40
  links: [],
42
- imgs: [],
41
+ images: [],
43
42
  icons: [],
44
43
  lists: [],
45
44
  videos: [],
46
45
  insets: [],
46
+ snippets: [],
47
47
  data: {},
48
48
  quotes: [],
49
49
  headings: [],
@@ -73,10 +73,9 @@ function processGroups(sequence, options = {}) {
73
73
  title: '',
74
74
  pretitle: '',
75
75
  subtitle: '',
76
- subtitle2: '',
77
76
  paragraphs: [],
78
77
  links: [],
79
- imgs: [],
78
+ images: [],
80
79
  icons: [],
81
80
  lists: [],
82
81
  videos: [],
@@ -225,14 +224,14 @@ function processGroupContent(elements) {
225
224
  pretitle: "",
226
225
  title: "",
227
226
  subtitle: "",
228
- subtitle2: "",
229
227
  };
230
228
 
231
229
  const body = {
232
- imgs: [],
230
+ images: [],
233
231
  icons: [],
234
232
  videos: [],
235
233
  insets: [],
234
+ snippets: [],
236
235
  paragraphs: [],
237
236
  links: [],
238
237
  lists: [],
@@ -287,11 +286,8 @@ function processGroupContent(elements) {
287
286
  } else if (!header.subtitle) {
288
287
  header.subtitle = element.text;
289
288
  lastSlot = 'subtitle';
290
- } else if (!header.subtitle2) {
291
- header.subtitle2 = element.text;
292
- lastSlot = 'subtitle2';
293
289
  } else {
294
- // After subtitle2, we're in body - collect heading
290
+ // After subtitle, remaining headings go to body
295
291
  body.headings.push(element.text);
296
292
  lastSlot = null;
297
293
  }
@@ -320,7 +316,7 @@ function processGroupContent(elements) {
320
316
  if (element.attrs?.role === "icon") {
321
317
  body.icons.push(element.attrs);
322
318
  } else {
323
- body.imgs.push(preserveProps);
319
+ body.images.push(preserveProps);
324
320
  }
325
321
  break;
326
322
 
@@ -366,11 +362,16 @@ function processGroupContent(elements) {
366
362
  break;
367
363
 
368
364
  case "codeBlock":
369
- // Fallback: tagged code blocks where parsing failed at build time
370
- // Untagged blocks stay in sequence for display
371
365
  const tag = element.attrs?.tag;
372
366
  if (tag) {
367
+ // Tagged block where parsing failed at build time — store as data
373
368
  body.data[tag] = element.text;
369
+ } else {
370
+ // Untagged code block — collect as a snippet
371
+ body.snippets.push({
372
+ language: element.attrs?.language || '',
373
+ code: typeof element.text === 'string' ? element.text : '',
374
+ });
374
375
  }
375
376
  break;
376
377