@uniweb/semantic-parser 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +3 -7
- package/README.md +2 -159
- package/package.json +2 -5
- package/src/index.js +1 -2
- package/src/processors/groups.js +1 -8
- package/docs/api.md +0 -350
- package/docs/entity-consolidation.md +0 -470
- package/docs/file-structure.md +0 -50
- package/docs/guide.md +0 -206
- package/docs/mapping-patterns.md +0 -928
- package/docs/text-component-reference.md +0 -515
- package/reference/README.md +0 -195
- package/reference/Text.js +0 -188
- package/src/mappers/accessor.js +0 -312
- package/src/mappers/extractors.js +0 -416
- package/src/mappers/helpers.js +0 -234
- package/src/mappers/index.js +0 -28
- package/src/mappers/types.js +0 -495
- package/src/processors/groups_backup.js +0 -379
- package/src/processors/groups_doc.md +0 -179
- package/src/processors/sequence_backup.js +0 -402
- package/src/processors_old/byType.js +0 -129
- package/src/processors_old/groups.js +0 -240
- package/src/processors_old/sequence.js +0 -140
package/AGENTS.md
CHANGED
|
@@ -61,7 +61,6 @@ The parser returns a flat content structure:
|
|
|
61
61
|
title: '', // Main heading
|
|
62
62
|
pretitle: '', // Heading before main title
|
|
63
63
|
subtitle: '', // Heading after main title
|
|
64
|
-
subtitle2: '', // Third heading level
|
|
65
64
|
paragraphs: [],
|
|
66
65
|
links: [], // All link-like entities (including buttons, documents)
|
|
67
66
|
images: [],
|
|
@@ -71,7 +70,7 @@ The parser returns a flat content structure:
|
|
|
71
70
|
quotes: [],
|
|
72
71
|
snippets: [], // Fenced code — [{ language, code }]
|
|
73
72
|
data: {}, // Structured data (tagged data blocks, forms, cards)
|
|
74
|
-
headings: [], //
|
|
73
|
+
headings: [], // Headings after subtitle, in document order
|
|
75
74
|
items: [], // Child content groups (same structure recursively)
|
|
76
75
|
}
|
|
77
76
|
```
|
|
@@ -126,8 +125,6 @@ Editor-specific nodes are mapped to standard entities:
|
|
|
126
125
|
- `card-group` → `data[cardType]` arrays (e.g., `data.person`, `data.event`)
|
|
127
126
|
- `document-group` → `links[]` with `role: "document"` and `download: true`
|
|
128
127
|
|
|
129
|
-
See `docs/entity-consolidation.md` for complete mapping documentation.
|
|
130
|
-
|
|
131
128
|
### Tagged Data Blocks
|
|
132
129
|
|
|
133
130
|
Data blocks with tags route parsed data to the `data` object:
|
|
@@ -165,14 +162,14 @@ Lists maintain hierarchy through nested structure. The `processListItems()` func
|
|
|
165
162
|
|
|
166
163
|
## Content Writing Conventions
|
|
167
164
|
|
|
168
|
-
|
|
165
|
+
Key patterns:
|
|
169
166
|
|
|
170
167
|
- **Pretitle Pattern**: Any heading followed by a more important heading (e.g., H3→H1, H2→H1, H6→H5, etc.)
|
|
171
168
|
- **Banner Pattern**: Image (with banner role or followed by heading) at start of first group
|
|
172
169
|
- **Divider Mode**: Presence of any `horizontalRule` switches entire document to divider-based grouping
|
|
173
170
|
- **Heading Groups**: Consecutive headings with increasing levels are consumed together
|
|
174
171
|
- **Main Content**: First group is main if it's the only group OR has lower heading level than second group
|
|
175
|
-
- **Body Headings**: Headings
|
|
172
|
+
- **Body Headings**: Headings after the title and subtitle slots are collected in `body.headings` in document order
|
|
176
173
|
|
|
177
174
|
## Testing Structure
|
|
178
175
|
|
|
@@ -188,6 +185,5 @@ Tests are organized by processor:
|
|
|
188
185
|
|
|
189
186
|
- The parser never modifies the original ProseMirror document
|
|
190
187
|
- Text content can include inline HTML for formatting (bold → `<strong>`, italic → `<em>`, links → `<a>`)
|
|
191
|
-
- The `processors_old/` directory contains legacy implementations - do not modify
|
|
192
188
|
- Context information in byType includes position, previous/next elements, and nearest heading
|
|
193
189
|
- Group splitting logic differs significantly between heading mode and divider mode
|
package/README.md
CHANGED
|
@@ -68,7 +68,6 @@ result = {
|
|
|
68
68
|
pretitle: "", // Heading before main title
|
|
69
69
|
title: "Welcome", // Main heading
|
|
70
70
|
subtitle: "", // Heading after main title
|
|
71
|
-
subtitle2: "", // Third heading level
|
|
72
71
|
|
|
73
72
|
// Body fields
|
|
74
73
|
paragraphs: ["Get started today."],
|
|
@@ -78,8 +77,8 @@ result = {
|
|
|
78
77
|
icons: [],
|
|
79
78
|
lists: [],
|
|
80
79
|
quotes: [],
|
|
81
|
-
data: {}, // Structured data (tagged
|
|
82
|
-
headings: [], //
|
|
80
|
+
data: {}, // Structured data (tagged data blocks, forms, cards)
|
|
81
|
+
headings: [], // Headings after subtitle, in document order
|
|
83
82
|
|
|
84
83
|
// Additional content groups (from headings after content)
|
|
85
84
|
items: [
|
|
@@ -143,154 +142,6 @@ sequence.forEach(element => {
|
|
|
143
142
|
});
|
|
144
143
|
```
|
|
145
144
|
|
|
146
|
-
## Content Mapping Utilities
|
|
147
|
-
|
|
148
|
-
The parser includes optional mapping utilities to transform parsed content into component-specific formats. Perfect for visual editors and component-based systems.
|
|
149
|
-
|
|
150
|
-
### Type System (Recommended)
|
|
151
|
-
|
|
152
|
-
Automatically transform content based on field types with context-aware behavior:
|
|
153
|
-
|
|
154
|
-
```js
|
|
155
|
-
const schema = {
|
|
156
|
-
title: {
|
|
157
|
-
path: "title",
|
|
158
|
-
type: "plaintext", // Auto-strips <strong>, <em>, etc.
|
|
159
|
-
maxLength: 60 // Auto-truncates intelligently
|
|
160
|
-
},
|
|
161
|
-
excerpt: {
|
|
162
|
-
path: "paragraphs",
|
|
163
|
-
type: "excerpt", // Auto-creates excerpt from paragraphs
|
|
164
|
-
maxLength: 150
|
|
165
|
-
},
|
|
166
|
-
image: {
|
|
167
|
-
path: "images[0].url",
|
|
168
|
-
type: "image",
|
|
169
|
-
defaultValue: "/placeholder.jpg"
|
|
170
|
-
}
|
|
171
|
-
};
|
|
172
|
-
|
|
173
|
-
// Visual editor mode (default) - silent, graceful cleanup
|
|
174
|
-
const data = mappers.extractBySchema(parsed, schema);
|
|
175
|
-
|
|
176
|
-
// Build mode - validates and warns
|
|
177
|
-
const data = mappers.extractBySchema(parsed, schema, { mode: 'build' });
|
|
178
|
-
```
|
|
179
|
-
|
|
180
|
-
**Field Types:** `plaintext`, `richtext`, `excerpt`, `number`, `image`, `link`
|
|
181
|
-
|
|
182
|
-
### Using Pre-Built Extractors
|
|
183
|
-
|
|
184
|
-
```js
|
|
185
|
-
import { parseContent, mappers } from "@uniweb/semantic-parser";
|
|
186
|
-
|
|
187
|
-
const parsed = parseContent(doc);
|
|
188
|
-
|
|
189
|
-
// Extract hero component data
|
|
190
|
-
const heroData = mappers.extractors.hero(parsed);
|
|
191
|
-
// { title, subtitle, kicker, description, image, cta, ... }
|
|
192
|
-
|
|
193
|
-
// Extract card data
|
|
194
|
-
const cards = mappers.extractors.card(parsed, { useItems: true });
|
|
195
|
-
|
|
196
|
-
// Extract statistics
|
|
197
|
-
const stats = mappers.extractors.stats(parsed);
|
|
198
|
-
// [{ value: "12", label: "Partner Labs" }, ...]
|
|
199
|
-
|
|
200
|
-
// Extract navigation menu
|
|
201
|
-
const nav = mappers.extractors.navigation(parsed);
|
|
202
|
-
|
|
203
|
-
// Extract features list
|
|
204
|
-
const features = mappers.extractors.features(parsed);
|
|
205
|
-
```
|
|
206
|
-
|
|
207
|
-
### Schema-Based Mapping
|
|
208
|
-
|
|
209
|
-
Define custom mappings using schemas:
|
|
210
|
-
|
|
211
|
-
```js
|
|
212
|
-
const schema = {
|
|
213
|
-
brand: "pretitle",
|
|
214
|
-
title: "title",
|
|
215
|
-
subtitle: "subtitle",
|
|
216
|
-
image: {
|
|
217
|
-
path: "images[0].url",
|
|
218
|
-
defaultValue: "/placeholder.jpg"
|
|
219
|
-
},
|
|
220
|
-
actions: {
|
|
221
|
-
path: "links",
|
|
222
|
-
transform: links => links.map(l => ({ label: l.label, type: "primary" }))
|
|
223
|
-
}
|
|
224
|
-
};
|
|
225
|
-
|
|
226
|
-
const componentData = mappers.accessor.extractBySchema(parsed, schema);
|
|
227
|
-
```
|
|
228
|
-
|
|
229
|
-
### Available Extractors
|
|
230
|
-
|
|
231
|
-
- `hero` - Hero/banner sections
|
|
232
|
-
- `card` - Card components
|
|
233
|
-
- `article` - Article/blog content
|
|
234
|
-
- `stats` - Statistics/metrics
|
|
235
|
-
- `navigation` - Navigation menus
|
|
236
|
-
- `features` - Feature lists
|
|
237
|
-
- `testimonial` - Testimonials
|
|
238
|
-
- `faq` - FAQ sections
|
|
239
|
-
- `pricing` - Pricing tiers
|
|
240
|
-
- `team` - Team members
|
|
241
|
-
- `gallery` - Image galleries
|
|
242
|
-
|
|
243
|
-
See **[Mapping Patterns Guide](./docs/mapping-patterns.md)** for complete documentation.
|
|
244
|
-
|
|
245
|
-
## Rendering Content
|
|
246
|
-
|
|
247
|
-
After extracting content, render it using a Text component that handles paragraph arrays, rich HTML, and formatting marks.
|
|
248
|
-
|
|
249
|
-
### Text Component Pattern
|
|
250
|
-
|
|
251
|
-
```jsx
|
|
252
|
-
import { parseContent, mappers } from '@uniweb/semantic-parser';
|
|
253
|
-
import { H1, P } from './components/Text';
|
|
254
|
-
|
|
255
|
-
const parsed = parseContent(doc);
|
|
256
|
-
const hero = mappers.extractors.hero(parsed);
|
|
257
|
-
|
|
258
|
-
// Render extracted content
|
|
259
|
-
<>
|
|
260
|
-
<H1 text={hero.title} />
|
|
261
|
-
<P text={hero.description} /> {/* Handles arrays automatically */}
|
|
262
|
-
</>
|
|
263
|
-
```
|
|
264
|
-
|
|
265
|
-
The Text component:
|
|
266
|
-
- **Handles arrays** - Renders `["Para 1", "Para 2"]` as separate paragraphs
|
|
267
|
-
- **Supports rich HTML** - Preserves formatting marks
|
|
268
|
-
- **Multi-line headings** - Wraps multiple lines in semantic heading tags
|
|
269
|
-
- **Color marks** - Supports `<mark>` and `<span>` for visual emphasis
|
|
270
|
-
|
|
271
|
-
See **[Text Component Reference](./docs/text-component-reference.md)** for implementation guide.
|
|
272
|
-
|
|
273
|
-
### Sanitization
|
|
274
|
-
|
|
275
|
-
Sanitize content at the engine level (during data preparation), not in components:
|
|
276
|
-
|
|
277
|
-
```javascript
|
|
278
|
-
import { parseContent, mappers } from '@uniweb/semantic-parser';
|
|
279
|
-
|
|
280
|
-
function prepareData(parsed) {
|
|
281
|
-
const hero = mappers.extractors.hero(parsed);
|
|
282
|
-
return {
|
|
283
|
-
...hero,
|
|
284
|
-
title: mappers.types.sanitizeHtml(hero.title, {
|
|
285
|
-
allowedTags: ['strong', 'em', 'mark', 'span'],
|
|
286
|
-
allowedAttr: ['class', 'data-variant']
|
|
287
|
-
})
|
|
288
|
-
};
|
|
289
|
-
}
|
|
290
|
-
```
|
|
291
|
-
|
|
292
|
-
The parser provides sanitization utilities but doesn't enforce their use. Your engine decides when to sanitize based on security requirements.
|
|
293
|
-
|
|
294
145
|
## Content Grouping
|
|
295
146
|
|
|
296
147
|
The parser supports two grouping modes:
|
|
@@ -346,14 +197,6 @@ Bracketed spans (`[text]{.class}`) are converted to `<span>` elements with their
|
|
|
346
197
|
|
|
347
198
|
Spans can have classes, IDs, and custom attributes. They combine with other marks—a span with bold becomes `<strong><span class="...">text</span></strong>`.
|
|
348
199
|
|
|
349
|
-
## Documentation
|
|
350
|
-
|
|
351
|
-
- **[Content Writing Guide](./docs/guide.md)**: Learn how to structure content for optimal parsing
|
|
352
|
-
- **[API Reference](./docs/api.md)**: Complete API documentation with all element types
|
|
353
|
-
- **[Mapping Patterns Guide](./docs/mapping-patterns.md)**: Transform content to component-specific formats
|
|
354
|
-
- **[Text Component Reference](./docs/text-component-reference.md)**: Reference implementation for rendering parsed content
|
|
355
|
-
- **[File Structure](./docs/file-structure.md)**: Codebase organization
|
|
356
|
-
|
|
357
200
|
## Use Cases
|
|
358
201
|
|
|
359
202
|
- **Component-based websites**: Extract structured data for React/Vue components
|
package/package.json
CHANGED
|
@@ -1,13 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@uniweb/semantic-parser",
|
|
3
|
-
"version": "1.1.5",
|
|
3
|
+
"version": "1.1.6",
|
|
4
4
|
"description": "Semantic parser for ProseMirror/TipTap content structures",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./src/index.js",
|
|
7
7
|
"exports": {
|
|
8
|
-
".": "./src/index.js",
|
|
9
|
-
"./mappers": "./src/mappers/index.js",
|
|
10
|
-
"./mappers/*": "./src/mappers/*.js"
|
|
8
|
+
".": "./src/index.js"
|
|
11
9
|
},
|
|
12
10
|
"keywords": [
|
|
13
11
|
"prosemirror",
|
|
@@ -30,7 +28,6 @@
|
|
|
30
28
|
},
|
|
31
29
|
"homepage": "https://github.com/uniweb/semantic-parser#readme",
|
|
32
30
|
"directories": {
|
|
33
|
-
"doc": "docs",
|
|
34
31
|
"test": "tests"
|
|
35
32
|
},
|
|
36
33
|
"dependencies": {
|
package/src/index.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { processSequence } from "./processors/sequence.js";
|
|
2
2
|
import { processGroups } from "./processors/groups.js";
|
|
3
|
-
import * as mappers from "./mappers/index.js";
|
|
4
3
|
|
|
5
4
|
/**
|
|
6
5
|
* Parse ProseMirror/TipTap content into semantic structure
|
|
@@ -30,4 +29,4 @@ function parseContent(doc, options = {}) {
|
|
|
30
29
|
};
|
|
31
30
|
}
|
|
32
31
|
|
|
33
|
-
export { parseContent, mappers };
|
|
32
|
+
export { parseContent };
|
package/src/processors/groups.js
CHANGED
|
@@ -9,7 +9,6 @@ function flattenGroup(group) {
|
|
|
9
9
|
title: group.header.title || '',
|
|
10
10
|
pretitle: group.header.pretitle || '',
|
|
11
11
|
subtitle: group.header.subtitle || '',
|
|
12
|
-
subtitle2: group.header.subtitle2 || '',
|
|
13
12
|
paragraphs: group.body.paragraphs || [],
|
|
14
13
|
links: group.body.links || [],
|
|
15
14
|
images: group.body.images || [],
|
|
@@ -37,7 +36,6 @@ function processGroups(sequence, options = {}) {
|
|
|
37
36
|
title: '',
|
|
38
37
|
pretitle: '',
|
|
39
38
|
subtitle: '',
|
|
40
|
-
subtitle2: '',
|
|
41
39
|
paragraphs: [],
|
|
42
40
|
links: [],
|
|
43
41
|
images: [],
|
|
@@ -75,7 +73,6 @@ function processGroups(sequence, options = {}) {
|
|
|
75
73
|
title: '',
|
|
76
74
|
pretitle: '',
|
|
77
75
|
subtitle: '',
|
|
78
|
-
subtitle2: '',
|
|
79
76
|
paragraphs: [],
|
|
80
77
|
links: [],
|
|
81
78
|
images: [],
|
|
@@ -227,7 +224,6 @@ function processGroupContent(elements) {
|
|
|
227
224
|
pretitle: "",
|
|
228
225
|
title: "",
|
|
229
226
|
subtitle: "",
|
|
230
|
-
subtitle2: "",
|
|
231
227
|
};
|
|
232
228
|
|
|
233
229
|
const body = {
|
|
@@ -290,11 +286,8 @@ function processGroupContent(elements) {
|
|
|
290
286
|
} else if (!header.subtitle) {
|
|
291
287
|
header.subtitle = element.text;
|
|
292
288
|
lastSlot = 'subtitle';
|
|
293
|
-
} else if (!header.subtitle2) {
|
|
294
|
-
header.subtitle2 = element.text;
|
|
295
|
-
lastSlot = 'subtitle2';
|
|
296
289
|
} else {
|
|
297
|
-
// After subtitle2, remaining headings go to body
|
|
290
|
+
// After subtitle, remaining headings go to body
|
|
298
291
|
body.headings.push(element.text);
|
|
299
292
|
lastSlot = null;
|
|
300
293
|
}
|
package/docs/api.md
DELETED
|
@@ -1,350 +0,0 @@
|
|
|
1
|
-
# API Reference
|
|
2
|
-
|
|
3
|
-
## parseContent(doc, options)
|
|
4
|
-
|
|
5
|
-
Parses a ProseMirror/TipTap document into three semantic views.
|
|
6
|
-
|
|
7
|
-
### Import
|
|
8
|
-
|
|
9
|
-
```js
|
|
10
|
-
import { parseContent } from '@uniweb/semantic-parser';
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
### Parameters
|
|
14
|
-
|
|
15
|
-
- `doc` (Object): A ProseMirror/TipTap document object with `type: "doc"` and `content` array
|
|
16
|
-
- `options` (Object, optional): Parsing options
|
|
17
|
-
- `parseCodeAsJson` (boolean): Parse code blocks as JSON for properties. Default: false
|
|
18
|
-
|
|
19
|
-
**Note:** Body headings are always collected automatically - no configuration needed.
|
|
20
|
-
|
|
21
|
-
### Returns
|
|
22
|
-
|
|
23
|
-
An object with four properties providing different views of the content:
|
|
24
|
-
|
|
25
|
-
```js
|
|
26
|
-
{
|
|
27
|
-
raw: Object, // Original ProseMirror document
|
|
28
|
-
sequence: Array, // Flat sequence of elements
|
|
29
|
-
groups: Object, // Semantic content groups
|
|
30
|
-
byType: Object // Elements organized by type
|
|
31
|
-
}
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
## Return Value Structure
|
|
35
|
-
|
|
36
|
-
### `raw`
|
|
37
|
-
|
|
38
|
-
The original ProseMirror document passed as input, unchanged.
|
|
39
|
-
|
|
40
|
-
### `sequence`
|
|
41
|
-
|
|
42
|
-
A flat array of semantic elements extracted from the document tree.
|
|
43
|
-
|
|
44
|
-
**Element Types:**
|
|
45
|
-
|
|
46
|
-
```js
|
|
47
|
-
// Heading
|
|
48
|
-
{
|
|
49
|
-
type: "heading",
|
|
50
|
-
level: 1, // 1-6
|
|
51
|
-
content: "Text content with <strong>HTML</strong> formatting"
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
// Paragraph
|
|
55
|
-
{
|
|
56
|
-
type: "paragraph",
|
|
57
|
-
content: "Text with <em>inline</em> <a href=\"...\">formatting</a>"
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
// List
|
|
61
|
-
{
|
|
62
|
-
type: "list",
|
|
63
|
-
style: "bullet" | "ordered",
|
|
64
|
-
items: [
|
|
65
|
-
{
|
|
66
|
-
content: [/* array of elements */],
|
|
67
|
-
items: [/* nested list items */]
|
|
68
|
-
}
|
|
69
|
-
]
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
// Image
|
|
73
|
-
{
|
|
74
|
-
type: "image",
|
|
75
|
-
src: "path/to/image.jpg",
|
|
76
|
-
alt: "Alt text",
|
|
77
|
-
caption: "Caption text",
|
|
78
|
-
role: "background" | "content" | "banner" | "icon"
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
// Icon (SVG)
|
|
82
|
-
{
|
|
83
|
-
type: "icon",
|
|
84
|
-
svg: "<svg>...</svg>"
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
// Video
|
|
88
|
-
{
|
|
89
|
-
type: "video",
|
|
90
|
-
src: "path/to/video.mp4",
|
|
91
|
-
alt: "Alt text",
|
|
92
|
-
caption: "Caption text"
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
// Link (paragraph containing only a link)
|
|
96
|
-
{
|
|
97
|
-
type: "link",
|
|
98
|
-
content: {
|
|
99
|
-
href: "https://example.com",
|
|
100
|
-
label: "Link text"
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
// Button
|
|
105
|
-
{
|
|
106
|
-
type: "button",
|
|
107
|
-
content: "Button text",
|
|
108
|
-
attrs: {
|
|
109
|
-
// Button-specific attributes
|
|
110
|
-
}
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
// Divider (horizontal rule)
|
|
114
|
-
{
|
|
115
|
-
type: "divider"
|
|
116
|
-
}
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
### `groups`
|
|
120
|
-
|
|
121
|
-
Content organized into semantic groups with identified main content and items. The structure is flat - header and body fields are merged at the top level.
|
|
122
|
-
|
|
123
|
-
```js
|
|
124
|
-
{
|
|
125
|
-
main: {
|
|
126
|
-
// Header fields (flat)
|
|
127
|
-
pretitle: "PRETITLE TEXT", // H3 before main title
|
|
128
|
-
title: "Main Title", // First heading in group
|
|
129
|
-
subtitle: "Subtitle", // Second heading in group
|
|
130
|
-
|
|
131
|
-
// Body fields (flat)
|
|
132
|
-
paragraphs: ["paragraph text", ...],
|
|
133
|
-
images: [
|
|
134
|
-
{ url: "...", caption: "...", alt: "..." }
|
|
135
|
-
],
|
|
136
|
-
icons: ["<svg>...</svg>", ...],
|
|
137
|
-
videos: [
|
|
138
|
-
{ src: "...", caption: "...", alt: "..." }
|
|
139
|
-
],
|
|
140
|
-
links: [
|
|
141
|
-
{ href: "...", label: "..." }
|
|
142
|
-
],
|
|
143
|
-
lists: [
|
|
144
|
-
[/* processed list items */]
|
|
145
|
-
],
|
|
146
|
-
buttons: [
|
|
147
|
-
{ content: "...", attrs: {...} }
|
|
148
|
-
],
|
|
149
|
-
properties: [], // Code block content
|
|
150
|
-
propertyBlocks: [], // Array of code blocks
|
|
151
|
-
cards: [], // Not yet implemented
|
|
152
|
-
headings: [], // Used in list items
|
|
153
|
-
|
|
154
|
-
// Banner (flat)
|
|
155
|
-
banner: {
|
|
156
|
-
url: "path/to/banner.jpg",
|
|
157
|
-
caption: "Banner caption",
|
|
158
|
-
alt: "Banner alt text"
|
|
159
|
-
} | null
|
|
160
|
-
},
|
|
161
|
-
items: [
|
|
162
|
-
// Array of groups with same flat structure as main
|
|
163
|
-
// { title, pretitle, subtitle, paragraphs, images, ... }
|
|
164
|
-
],
|
|
165
|
-
metadata: {
|
|
166
|
-
dividerMode: false, // Whether dividers were used for grouping
|
|
167
|
-
groups: 0 // Total number of groups
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
```
|
|
171
|
-
|
|
172
|
-
**Grouping Modes:**
|
|
173
|
-
|
|
174
|
-
1. **Heading-based grouping** (default): Groups start with heading patterns
|
|
175
|
-
2. **Divider-based grouping**: When any `horizontalRule` is present, groups are split by dividers
|
|
176
|
-
|
|
177
|
-
**Main Content Identification:**
|
|
178
|
-
|
|
179
|
-
- Single group → always main content
|
|
180
|
-
- Multiple groups → first group is main if it has lower heading level than second group
|
|
181
|
-
- Divider mode starting with divider → no main content, all items
|
|
182
|
-
|
|
183
|
-
### `byType`
|
|
184
|
-
|
|
185
|
-
Elements organized by type with positional context.
|
|
186
|
-
|
|
187
|
-
```js
|
|
188
|
-
{
|
|
189
|
-
headings: [
|
|
190
|
-
{
|
|
191
|
-
type: "heading",
|
|
192
|
-
level: 1,
|
|
193
|
-
content: "Title",
|
|
194
|
-
context: {
|
|
195
|
-
position: 0,
|
|
196
|
-
previousElement: null,
|
|
197
|
-
nextElement: { type: "paragraph", ... },
|
|
198
|
-
nearestHeading: null
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
],
|
|
202
|
-
paragraphs: [
|
|
203
|
-
{
|
|
204
|
-
type: "paragraph",
|
|
205
|
-
content: "Text",
|
|
206
|
-
context: { ... }
|
|
207
|
-
}
|
|
208
|
-
],
|
|
209
|
-
images: {
|
|
210
|
-
background: [/* images with role="background" */],
|
|
211
|
-
content: [/* images with role="content" */],
|
|
212
|
-
gallery: [/* images with role="gallery" */],
|
|
213
|
-
icon: [/* images with role="icon" */]
|
|
214
|
-
},
|
|
215
|
-
lists: [/* list elements with context */],
|
|
216
|
-
dividers: [/* divider elements with context */],
|
|
217
|
-
metadata: {
|
|
218
|
-
totalElements: 10,
|
|
219
|
-
dominantType: "paragraph",
|
|
220
|
-
hasMedia: true
|
|
221
|
-
},
|
|
222
|
-
|
|
223
|
-
// Helper methods
|
|
224
|
-
getHeadingsByLevel(level),
|
|
225
|
-
getElementsByHeadingContext(headingFilter)
|
|
226
|
-
}
|
|
227
|
-
```
|
|
228
|
-
|
|
229
|
-
**Helper Methods:**
|
|
230
|
-
|
|
231
|
-
```js
|
|
232
|
-
// Get all H1 headings
|
|
233
|
-
byType.getHeadingsByLevel(1)
|
|
234
|
-
|
|
235
|
-
// Get all elements under headings matching a filter
|
|
236
|
-
byType.getElementsByHeadingContext((heading) => heading.level === 2)
|
|
237
|
-
```
|
|
238
|
-
|
|
239
|
-
## Usage Examples
|
|
240
|
-
|
|
241
|
-
### Basic Usage
|
|
242
|
-
|
|
243
|
-
```js
|
|
244
|
-
import { parseContent } from "@uniweb/semantic-parser";
|
|
245
|
-
|
|
246
|
-
const doc = {
|
|
247
|
-
type: "doc",
|
|
248
|
-
content: [
|
|
249
|
-
{
|
|
250
|
-
type: "heading",
|
|
251
|
-
attrs: { level: 1 },
|
|
252
|
-
content: [{ type: "text", text: "Welcome" }]
|
|
253
|
-
},
|
|
254
|
-
{
|
|
255
|
-
type: "paragraph",
|
|
256
|
-
content: [{ type: "text", text: "Get started today." }]
|
|
257
|
-
}
|
|
258
|
-
]
|
|
259
|
-
};
|
|
260
|
-
|
|
261
|
-
const result = parseContent(doc);
|
|
262
|
-
```
|
|
263
|
-
|
|
264
|
-
### Working with Groups
|
|
265
|
-
|
|
266
|
-
```js
|
|
267
|
-
const { groups } = parseContent(doc);
|
|
268
|
-
|
|
269
|
-
// Access main content (flat structure)
|
|
270
|
-
console.log(groups.main.title);
|
|
271
|
-
console.log(groups.main.paragraphs);
|
|
272
|
-
|
|
273
|
-
// Iterate through content items
|
|
274
|
-
groups.items.forEach(item => {
|
|
275
|
-
console.log(item.title);
|
|
276
|
-
console.log(item.paragraphs);
|
|
277
|
-
});
|
|
278
|
-
```
|
|
279
|
-
|
|
280
|
-
### Working with byType
|
|
281
|
-
|
|
282
|
-
```js
|
|
283
|
-
const { byType } = parseContent(doc);
|
|
284
|
-
|
|
285
|
-
// Get all images
|
|
286
|
-
const allImages = Object.values(byType.images).flat();
|
|
287
|
-
|
|
288
|
-
// Get all H2 headings
|
|
289
|
-
const h2Headings = byType.getHeadingsByLevel(2);
|
|
290
|
-
|
|
291
|
-
// Get content under specific headings
|
|
292
|
-
const featuresContent = byType.getElementsByHeadingContext(
|
|
293
|
-
h => h.content.includes("Features")
|
|
294
|
-
);
|
|
295
|
-
```
|
|
296
|
-
|
|
297
|
-
### Working with Sequence
|
|
298
|
-
|
|
299
|
-
```js
|
|
300
|
-
const { sequence } = parseContent(doc);
|
|
301
|
-
|
|
302
|
-
// Process elements in order
|
|
303
|
-
sequence.forEach(element => {
|
|
304
|
-
switch(element.type) {
|
|
305
|
-
case 'heading':
|
|
306
|
-
console.log(`H${element.level}: ${element.content}`);
|
|
307
|
-
break;
|
|
308
|
-
case 'paragraph':
|
|
309
|
-
console.log(`P: ${element.content}`);
|
|
310
|
-
break;
|
|
311
|
-
}
|
|
312
|
-
});
|
|
313
|
-
```
|
|
314
|
-
|
|
315
|
-
## Text Formatting
|
|
316
|
-
|
|
317
|
-
The parser preserves inline formatting as HTML tags within text content:
|
|
318
|
-
|
|
319
|
-
- **Bold**: `<strong>text</strong>`
|
|
320
|
-
- **Italic**: `<em>text</em>`
|
|
321
|
-
- **Links**: `<a href="url">text</a>`
|
|
322
|
-
|
|
323
|
-
```js
|
|
324
|
-
// Input
|
|
325
|
-
{
|
|
326
|
-
type: "paragraph",
|
|
327
|
-
content: [
|
|
328
|
-
{ type: "text", text: "Normal " },
|
|
329
|
-
{ type: "text", marks: [{ type: "bold" }], text: "bold" }
|
|
330
|
-
]
|
|
331
|
-
}
|
|
332
|
-
|
|
333
|
-
// Output
|
|
334
|
-
{
|
|
335
|
-
type: "paragraph",
|
|
336
|
-
content: "Normal <strong>bold</strong>"
|
|
337
|
-
}
|
|
338
|
-
```
|
|
339
|
-
|
|
340
|
-
## Special Element Detection
|
|
341
|
-
|
|
342
|
-
The parser detects special patterns and extracts them as dedicated element types:
|
|
343
|
-
|
|
344
|
-
- **Paragraph with only a link** → `type: "link"`
|
|
345
|
-
- **Paragraph with only an image** (role: image/banner) → `type: "image"`
|
|
346
|
-
- **Paragraph with only an icon** (role: icon) → `type: "icon"`
|
|
347
|
-
- **Paragraph with only a button mark** → `type: "button"`
|
|
348
|
-
- **Paragraph with only a video** (role: video) → `type: "video"`
|
|
349
|
-
|
|
350
|
-
This makes it easier to identify and handle these special cases in downstream processing.
|