@uniweb/semantic-parser 1.1.5 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md CHANGED
@@ -61,7 +61,6 @@ The parser returns a flat content structure:
61
61
  title: '', // Main heading
62
62
  pretitle: '', // Heading before main title
63
63
  subtitle: '', // Heading after main title
64
- subtitle2: '', // Third heading level
65
64
  paragraphs: [],
66
65
  links: [], // All link-like entities (including buttons, documents)
67
66
  images: [],
@@ -71,7 +70,7 @@ The parser returns a flat content structure:
71
70
  quotes: [],
72
71
  snippets: [], // Fenced code — [{ language, code }]
73
72
  data: {}, // Structured data (tagged data blocks, forms, cards)
74
- headings: [], // Overflow headings after title/subtitle/subtitle2
73
+ headings: [], // Headings after subtitle, in document order
75
74
  items: [], // Child content groups (same structure recursively)
76
75
  }
77
76
  ```
@@ -126,8 +125,6 @@ Editor-specific nodes are mapped to standard entities:
126
125
  - `card-group` → `data[cardType]` arrays (e.g., `data.person`, `data.event`)
127
126
  - `document-group` → `links[]` with `role: "document"` and `download: true`
128
127
 
129
- See `docs/entity-consolidation.md` for complete mapping documentation.
130
-
131
128
  ### Tagged Data Blocks
132
129
 
133
130
  Data blocks with tags route parsed data to the `data` object:
@@ -165,14 +162,14 @@ Lists maintain hierarchy through nested structure. The `processListItems()` func
165
162
 
166
163
  ## Content Writing Conventions
167
164
 
168
- The parser implements the semantic conventions documented in `docs/guide.md`. Key patterns:
165
+ Key patterns:
169
166
 
170
167
  - **Pretitle Pattern**: Any heading followed by a more important heading (e.g., H3→H1, H2→H1, H6→H5)
171
168
  - **Banner Pattern**: Image (with banner role or followed by heading) at start of first group
172
169
  - **Divider Mode**: Presence of any `horizontalRule` switches entire document to divider-based grouping
173
170
  - **Heading Groups**: Consecutive headings with increasing levels are consumed together
174
171
  - **Main Content**: First group is main if it's the only group OR has lower heading level than second group
175
- - **Body Headings**: Headings that overflow the header slots (title, subtitle, subtitle2) are automatically collected in `body.headings`
172
+ - **Body Headings**: Headings after the title and subtitle slots are collected in `body.headings` in document order
176
173
 
177
174
  ## Testing Structure
178
175
 
@@ -188,6 +185,5 @@ Tests are organized by processor:
188
185
 
189
186
  - The parser never modifies the original ProseMirror document
190
187
  - Text content can include inline HTML for formatting (bold → `<strong>`, italic → `<em>`, links → `<a>`)
191
- - The `processors_old/` directory contains legacy implementations - do not modify
192
188
  - Context information in byType includes position, previous/next elements, and nearest heading
193
189
  - Group splitting logic differs significantly between heading mode and divider mode
package/README.md CHANGED
@@ -68,7 +68,6 @@ result = {
68
68
  pretitle: "", // Heading before main title
69
69
  title: "Welcome", // Main heading
70
70
  subtitle: "", // Heading after main title
71
- subtitle2: "", // Third heading level
72
71
 
73
72
  // Body fields
74
73
  paragraphs: ["Get started today."],
@@ -78,8 +77,8 @@ result = {
78
77
  icons: [],
79
78
  lists: [],
80
79
  quotes: [],
81
- data: {}, // Structured data (tagged code blocks, forms, cards)
82
- headings: [], // Overflow headings after title/subtitle/subtitle2
80
+ data: {}, // Structured data (tagged data blocks, forms, cards)
81
+ headings: [], // Headings after subtitle, in document order
83
82
 
84
83
  // Additional content groups (from headings after content)
85
84
  items: [
@@ -143,154 +142,6 @@ sequence.forEach(element => {
143
142
  });
144
143
  ```
145
144
 
146
- ## Content Mapping Utilities
147
-
148
- The parser includes optional mapping utilities to transform parsed content into component-specific formats. Perfect for visual editors and component-based systems.
149
-
150
- ### Type System (Recommended)
151
-
152
- Automatically transform content based on field types with context-aware behavior:
153
-
154
- ```js
155
- const schema = {
156
- title: {
157
- path: "title",
158
- type: "plaintext", // Auto-strips <strong>, <em>, etc.
159
- maxLength: 60 // Auto-truncates intelligently
160
- },
161
- excerpt: {
162
- path: "paragraphs",
163
- type: "excerpt", // Auto-creates excerpt from paragraphs
164
- maxLength: 150
165
- },
166
- image: {
167
- path: "images[0].url",
168
- type: "image",
169
- defaultValue: "/placeholder.jpg"
170
- }
171
- };
172
-
173
- // Visual editor mode (default) - silent, graceful cleanup
174
- const data = mappers.extractBySchema(parsed, schema);
175
-
176
- // Build mode - validates and warns
177
- const data = mappers.extractBySchema(parsed, schema, { mode: 'build' });
178
- ```
179
-
180
- **Field Types:** `plaintext`, `richtext`, `excerpt`, `number`, `image`, `link`
181
-
182
- ### Using Pre-Built Extractors
183
-
184
- ```js
185
- import { parseContent, mappers } from "@uniweb/semantic-parser";
186
-
187
- const parsed = parseContent(doc);
188
-
189
- // Extract hero component data
190
- const heroData = mappers.extractors.hero(parsed);
191
- // { title, subtitle, kicker, description, image, cta, ... }
192
-
193
- // Extract card data
194
- const cards = mappers.extractors.card(parsed, { useItems: true });
195
-
196
- // Extract statistics
197
- const stats = mappers.extractors.stats(parsed);
198
- // [{ value: "12", label: "Partner Labs" }, ...]
199
-
200
- // Extract navigation menu
201
- const nav = mappers.extractors.navigation(parsed);
202
-
203
- // Extract features list
204
- const features = mappers.extractors.features(parsed);
205
- ```
206
-
207
- ### Schema-Based Mapping
208
-
209
- Define custom mappings using schemas:
210
-
211
- ```js
212
- const schema = {
213
- brand: "pretitle",
214
- title: "title",
215
- subtitle: "subtitle",
216
- image: {
217
- path: "images[0].url",
218
- defaultValue: "/placeholder.jpg"
219
- },
220
- actions: {
221
- path: "links",
222
- transform: links => links.map(l => ({ label: l.label, type: "primary" }))
223
- }
224
- };
225
-
226
- const componentData = mappers.accessor.extractBySchema(parsed, schema);
227
- ```
228
-
229
- ### Available Extractors
230
-
231
- - `hero` - Hero/banner sections
232
- - `card` - Card components
233
- - `article` - Article/blog content
234
- - `stats` - Statistics/metrics
235
- - `navigation` - Navigation menus
236
- - `features` - Feature lists
237
- - `testimonial` - Testimonials
238
- - `faq` - FAQ sections
239
- - `pricing` - Pricing tiers
240
- - `team` - Team members
241
- - `gallery` - Image galleries
242
-
243
- See **[Mapping Patterns Guide](./docs/mapping-patterns.md)** for complete documentation.
244
-
245
- ## Rendering Content
246
-
247
- After extracting content, render it using a Text component that handles paragraph arrays, rich HTML, and formatting marks.
248
-
249
- ### Text Component Pattern
250
-
251
- ```jsx
252
- import { parseContent, mappers } from '@uniweb/semantic-parser';
253
- import { H1, P } from './components/Text';
254
-
255
- const parsed = parseContent(doc);
256
- const hero = mappers.extractors.hero(parsed);
257
-
258
- // Render extracted content
259
- <>
260
- <H1 text={hero.title} />
261
- <P text={hero.description} /> {/* Handles arrays automatically */}
262
- </>
263
- ```
264
-
265
- The Text component:
266
- - **Handles arrays** - Renders `["Para 1", "Para 2"]` as separate paragraphs
267
- - **Supports rich HTML** - Preserves formatting marks
268
- - **Multi-line headings** - Wraps multiple lines in semantic heading tags
269
- - **Color marks** - Supports `<mark>` and `<span>` for visual emphasis
270
-
271
- See **[Text Component Reference](./docs/text-component-reference.md)** for implementation guide.
272
-
273
- ### Sanitization
274
-
275
- Sanitize content at the engine level (during data preparation), not in components:
276
-
277
- ```javascript
278
- import { parseContent, mappers } from '@uniweb/semantic-parser';
279
-
280
- function prepareData(parsed) {
281
- const hero = mappers.extractors.hero(parsed);
282
- return {
283
- ...hero,
284
- title: mappers.types.sanitizeHtml(hero.title, {
285
- allowedTags: ['strong', 'em', 'mark', 'span'],
286
- allowedAttr: ['class', 'data-variant']
287
- })
288
- };
289
- }
290
- ```
291
-
292
- The parser provides sanitization utilities but doesn't enforce their use. Your engine decides when to sanitize based on security requirements.
293
-
294
145
  ## Content Grouping
295
146
 
296
147
  The parser supports two grouping modes:
@@ -346,14 +197,6 @@ Bracketed spans (`[text]{.class}`) are converted to `<span>` elements with their
346
197
 
347
198
  Spans can have classes, IDs, and custom attributes. They combine with other marks—a span with bold becomes `<strong><span class="...">text</span></strong>`.
348
199
 
349
- ## Documentation
350
-
351
- - **[Content Writing Guide](./docs/guide.md)**: Learn how to structure content for optimal parsing
352
- - **[API Reference](./docs/api.md)**: Complete API documentation with all element types
353
- - **[Mapping Patterns Guide](./docs/mapping-patterns.md)**: Transform content to component-specific formats
354
- - **[Text Component Reference](./docs/text-component-reference.md)**: Reference implementation for rendering parsed content
355
- - **[File Structure](./docs/file-structure.md)**: Codebase organization
356
-
357
200
  ## Use Cases
358
201
 
359
202
  - **Component-based websites**: Extract structured data for React/Vue components
package/package.json CHANGED
@@ -1,13 +1,11 @@
1
1
  {
2
2
  "name": "@uniweb/semantic-parser",
3
- "version": "1.1.5",
3
+ "version": "1.1.7",
4
4
  "description": "Semantic parser for ProseMirror/TipTap content structures",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
7
7
  "exports": {
8
- ".": "./src/index.js",
9
- "./mappers": "./src/mappers/index.js",
10
- "./mappers/*": "./src/mappers/*.js"
8
+ ".": "./src/index.js"
11
9
  },
12
10
  "keywords": [
13
11
  "prosemirror",
@@ -30,7 +28,6 @@
30
28
  },
31
29
  "homepage": "https://github.com/uniweb/semantic-parser#readme",
32
30
  "directories": {
33
- "doc": "docs",
34
31
  "test": "tests"
35
32
  },
36
33
  "dependencies": {
@@ -0,0 +1,204 @@
1
+ /**
2
+ * Reverse conversion: content structure → TipTap document.
3
+ *
4
+ * Mirrors the forward parser (processors/sequence.js + processors/groups.js)
5
+ * so that parseContent(buildDoc(content)) roundtrips cleanly.
6
+ *
7
+ * Starter content uses plain strings (no HTML marks), so the conversion
8
+ * is straightforward — no need to reverse inline HTML formatting.
9
+ */
10
+
11
+ // --- TipTap node builders ---
12
+
13
/**
 * Create a TipTap text node.
 *
 * @param {string} text - Text carried by the node
 * @returns {{ type: string, text: string }} Text node
 */
function textNode(text) {
  const node = { type: 'text' }
  node.text = text
  return node
}
16
+
17
/**
 * Build heading node(s) at the given level.
 *
 * @param {number} level - Heading level stored in attrs.level
 * @param {string|string[]} text - Heading text; an array yields one heading per entry
 * @returns {Object|Object[]|null} A heading node, an array of heading nodes
 *   (array input, falsy results dropped), or null when text is falsy
 */
function heading(level, text) {
  if (Array.isArray(text)) {
    // Multi-line title: every line becomes its own heading at the same level
    const lines = []
    for (const line of text) {
      const node = heading(level, line)
      if (node) lines.push(node)
    }
    return lines
  }

  return text
    ? { type: 'heading', attrs: { level }, content: [textNode(text)] }
    : null
}
29
+
30
/**
 * Build a paragraph node holding a single text node.
 *
 * @param {string} text - Paragraph text
 * @returns {Object|null} Paragraph node, or null when text is falsy
 */
function paragraph(text) {
  return text ? { type: 'paragraph', content: [textNode(text)] } : null
}
37
+
38
/**
 * Build a paragraph whose only child is a text node carrying a link mark.
 *
 * @param {Object} link - Link description
 * @param {string} link.text - Visible link text
 * @param {string} link.href - Link target URL
 * @param {string} [link.target] - Optional target attribute, copied to the mark when set
 * @returns {Object|null} Paragraph node, or null when the link is missing or
 *   lacks text or href
 */
function linkParagraph(link) {
  // A null/undefined entry yields null (like the other builders) instead of
  // throwing while destructuring.
  const { text, href, target } = link ?? {}
  if (!text || !href) return null

  const mark = { type: 'link', attrs: { href } }
  if (target) mark.attrs.target = target

  return {
    type: 'paragraph',
    content: [{ type: 'text', text, marks: [mark] }],
  }
}
47
+
48
/**
 * Build an ImageBlock node.
 *
 * @param {Object} image - Image description
 * @param {string} [image.src] - Image URL
 * @param {string} [image.url] - Image URL, used when `src` is not set
 * @param {string} [image.alt=''] - Alternative text
 * @param {string} [image.caption=''] - Caption; only stored when non-empty
 * @param {string} [image.direction] - Stored as attrs.direction when set
 * @param {string} [image.role] - Stored as attrs.role when set
 * @param {number} [image.width] - Pixel width; needs `height` to produce aspect_ratio
 * @param {number} [image.height] - Pixel height; needs `width` to produce aspect_ratio
 * @returns {Object} ImageBlock node
 */
function imageBlock({ src, url, alt = '', caption = '', direction, role, width, height }) {
  // Accept `url` as well as `src`: image objects elsewhere in this package are
  // read as `src || url`, so either key should survive the conversion.
  const attrs = { url: src || url, alt }
  if (caption) attrs.caption = caption
  if (direction) attrs.direction = direction
  if (role) attrs.role = role
  if (width && height) {
    // ratio is the height expressed as a percentage of the width
    attrs.aspect_ratio = { width, height, ratio: (height / width) * 100 }
  }
  return { type: 'ImageBlock', attrs }
}
58
+
59
/**
 * Build a UniwebIcon node, copying only the attributes that are set.
 *
 * @param {Object} icon - Icon description
 * @param {string} [icon.src] - Used as attrs.svg when `svg` is not set
 * @param {string} [icon.svg] - Stored as attrs.svg
 * @param {string} [icon.library] - Stored as attrs.library
 * @param {string} [icon.name] - Stored as attrs.name
 * @param {*} [icon.size] - Stored as attrs.size
 * @param {string} [icon.color] - Stored as attrs.color
 * @returns {Object} UniwebIcon node
 */
function iconNode({ src, svg, library, name, size, color }) {
  // UniwebIcon supports multiple source types; `svg` wins over `src`
  const candidates = { svg: svg || src, library, name, size, color }

  const attrs = {}
  for (const [key, value] of Object.entries(candidates)) {
    if (value) attrs[key] = value
  }

  return { type: 'UniwebIcon', attrs }
}
69
+
70
/**
 * Build a Video node. `src` is always stored; the other attributes are
 * copied only when they are set.
 *
 * @param {Object} video - Video description
 * @param {string} video.src - Stored as attrs.src
 * @param {string} [video.caption] - Stored as attrs.caption
 * @param {string} [video.direction] - Stored as attrs.direction
 * @param {string} [video.coverImg] - Stored as attrs.coverImg
 * @returns {Object} Video node
 */
function videoNode({ src, caption, direction, coverImg }) {
  return {
    type: 'Video',
    attrs: {
      src,
      // Spreading a falsy value adds no keys, so unset options are omitted
      ...(caption && { caption }),
      ...(direction && { direction }),
      ...(coverImg && { coverImg }),
    },
  }
}
77
+
78
/**
 * Build a DividerBlock node.
 *
 * @returns {{ type: string }} A fresh DividerBlock node
 */
function dividerBlock() {
  const divider = { type: 'DividerBlock' }
  return divider
}
81
+
82
/**
 * Build a bulletList node with one listItem per non-empty entry.
 *
 * @param {string[]} items - List item texts
 * @returns {Object|null} bulletList node, or null when `items` is missing,
 *   empty, or contains no entry that produces a paragraph
 */
function bulletList(items) {
  if (!items || !items.length) return null

  const listItems = []
  for (const item of items) {
    const body = paragraph(item)
    // Skip entries without a paragraph: a listItem with empty content is not
    // a valid list item.
    if (body) listItems.push({ type: 'listItem', content: [body] })
  }

  if (listItems.length === 0) return null
  return { type: 'bulletList', content: listItems }
}
92
+
93
+ // --- Group builder ---
94
+
95
/**
 * Convert one content group (the main group or an item) into TipTap nodes.
 *
 * Output order: pretitle, title, subtitle headings, then paragraphs, images,
 * links, icons, videos and lists.
 *
 * @param {Object} group - Content structure: { pretitle, title, subtitle, paragraphs, images, ... }
 * @param {number} titleLevel - Heading level for title (1 for main, 2 for items)
 * @returns {Array} Array of TipTap nodes
 */
function buildGroupNodes(group, titleLevel = 1) {
  const nodes = []

  // heading() yields a single node, an array of nodes, or null
  const appendHeading = (built) => {
    if (Array.isArray(built)) nodes.push(...built)
    else if (built) nodes.push(built)
  }

  // 1. Header slots in document order.
  //    Pretitle uses a higher level number (less important) than the title,
  //    e.g. H3 before H1; subtitle sits one level below the title.
  const headerSlots = [
    ['pretitle', titleLevel + 2],
    ['title', titleLevel],
    ['subtitle', titleLevel + 1],
  ]
  for (const [slot, level] of headerSlots) {
    const value = group[slot]
    if (value) appendHeading(heading(level, value))
  }

  // 2. Body fields in document order. The flag marks builders that may
  //    return null, whose null results are skipped.
  const bodyFields = [
    ['paragraphs', paragraph, true],
    ['images', imageBlock, false],
    ['links', linkParagraph, true],
    ['icons', iconNode, false],
    ['videos', videoNode, false],
    ['lists', bulletList, true],
  ]
  for (const [field, build, skipEmpty] of bodyFields) {
    const entries = group[field]
    if (!entries) continue
    for (const entry of entries) {
      const node = build(entry)
      if (node || !skipEmpty) nodes.push(node)
    }
  }

  return nodes
}
169
+
170
+ // --- Main export ---
171
+
172
/**
 * Build a TipTap document from a content structure.
 *
 * Intended as the inverse of parseContent(): a flat content object
 * (title, paragraphs, items, etc.) becomes a TipTap document that is meant
 * to parse back into the same structure.
 *
 * @param {Object} content - Content structure (same shape as parseContent output / starter)
 * @returns {Object|null} TipTap document { type: 'doc', content: [...] }, or null if empty
 */
function buildDoc(content) {
  if (!content) return null

  // Main group first, with its title at level 1
  const nodes = [...buildGroupNodes(content, 1)]

  // Each item is preceded by a DividerBlock (divider-based grouping) and
  // uses level 2 for its title, one below the main H1
  const items = content.items
  if (items && items.length > 0) {
    for (const item of items) {
      nodes.push(dividerBlock(), ...buildGroupNodes(item, 2))
    }
  }

  return nodes.length > 0 ? { type: 'doc', content: nodes } : null
}
203
+
204
+ export { buildDoc }
package/src/index.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { processSequence } from "./processors/sequence.js";
2
2
  import { processGroups } from "./processors/groups.js";
3
- import * as mappers from "./mappers/index.js";
3
+ import { buildDoc } from "./builders/doc.js";
4
4
 
5
5
  /**
6
6
  * Parse ProseMirror/TipTap content into semantic structure
@@ -30,4 +30,4 @@ function parseContent(doc, options = {}) {
30
30
  };
31
31
  }
32
32
 
33
- export { parseContent, mappers };
33
+ export { parseContent, buildDoc };
@@ -9,7 +9,6 @@ function flattenGroup(group) {
9
9
  title: group.header.title || '',
10
10
  pretitle: group.header.pretitle || '',
11
11
  subtitle: group.header.subtitle || '',
12
- subtitle2: group.header.subtitle2 || '',
13
12
  paragraphs: group.body.paragraphs || [],
14
13
  links: group.body.links || [],
15
14
  images: group.body.images || [],
@@ -37,7 +36,6 @@ function processGroups(sequence, options = {}) {
37
36
  title: '',
38
37
  pretitle: '',
39
38
  subtitle: '',
40
- subtitle2: '',
41
39
  paragraphs: [],
42
40
  links: [],
43
41
  images: [],
@@ -75,7 +73,6 @@ function processGroups(sequence, options = {}) {
75
73
  title: '',
76
74
  pretitle: '',
77
75
  subtitle: '',
78
- subtitle2: '',
79
76
  paragraphs: [],
80
77
  links: [],
81
78
  images: [],
@@ -227,7 +224,6 @@ function processGroupContent(elements) {
227
224
  pretitle: "",
228
225
  title: "",
229
226
  subtitle: "",
230
- subtitle2: "",
231
227
  };
232
228
 
233
229
  const body = {
@@ -290,11 +286,8 @@ function processGroupContent(elements) {
290
286
  } else if (!header.subtitle) {
291
287
  header.subtitle = element.text;
292
288
  lastSlot = 'subtitle';
293
- } else if (!header.subtitle2) {
294
- header.subtitle2 = element.text;
295
- lastSlot = 'subtitle2';
296
289
  } else {
297
- // After subtitle2, we're in body - collect heading
290
+ // After subtitle, remaining headings go to body
298
291
  body.headings.push(element.text);
299
292
  lastSlot = null;
300
293
  }
@@ -412,21 +412,25 @@ function processInlineElements(content) {
412
412
  return items;
413
413
  }
414
414
 
415
- function makeAssetUrl(info) {
416
- let url = "";
417
-
418
- let src = info?.src || info?.url || "";
415
+ const ASSET_BASE_URL = "https://assets.uniweb.app/";
419
416
 
420
- if (src) {
421
- url = src;
422
- } else if (info?.identifier) {
423
- url =
424
- new uniweb.Profile(`docufolio/profile`, "_template").getAssetInfo(
425
- info.identifier
426
- )?.src || "";
427
- }
417
/**
 * Resolve an asset identifier ({version}/{filename}) to a direct URL.
 * The URL is ASSET_BASE_URL followed by dist/{version}/base.{ext}, where
 * {ext} is the extension of {filename}.
 *
 * @param {string} identifier - Asset identifier in the form {version}/{filename}
 * @returns {string} Direct asset URL, or "" when the identifier is not a
 *   non-empty string or lacks a version, a filename, or a file extension
 */
function resolveAssetIdentifier(identifier) {
  if (!identifier || typeof identifier !== "string") return "";

  const [version, filename] = identifier.split("/");
  if (!version || !filename) return "";

  // Without a "." lastIndexOf() returns -1 and substring(0) would treat the
  // whole filename as the extension, so bail out instead of building a bad URL.
  const dotIndex = filename.lastIndexOf(".");
  if (dotIndex === -1) return "";

  const ext = filename.substring(dotIndex + 1);
  if (!ext) return "";

  return `${ASSET_BASE_URL}dist/${version}/base.${ext}`;
}
428
428
 
429
- return url;
429
/**
 * Pick the URL for an asset description.
 *
 * Precedence: `info.src`, then `info.url`, then the URL resolved from
 * `info.identifier`.
 *
 * @param {Object} [info] - Asset description; may be null or undefined
 * @returns {string} The asset URL, or "" when none can be determined
 */
function makeAssetUrl(info) {
  const direct = info?.src || info?.url;
  if (direct) return direct;

  return info?.identifier ? resolveAssetIdentifier(info.identifier) : "";
}
431
435
 
432
436
  function parseCardBlock(itemAttrs) {
@@ -467,10 +471,7 @@ function parseDocumentBlock(itemAttrs) {
467
471
  const { identifier = "" } = info;
468
472
 
469
473
  if (identifier) {
470
- ele.downloadUrl = new uniweb.Profile(
471
- `docufolio/profile`,
472
- "_template"
473
- ).getAssetInfo(identifier)?.href;
474
+ ele.downloadUrl = resolveAssetIdentifier(identifier);
474
475
  }
475
476
  }
476
477