@uniweb/semantic-parser 1.0.7 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +136 -0
- package/README.md +52 -104
- package/docs/api.md +38 -40
- package/docs/mapping-patterns.md +47 -47
- package/docs/text-component-reference.md +3 -3
- package/package.json +1 -1
- package/src/index.js +5 -7
- package/src/mappers/extractors.js +113 -120
- package/src/processors/groups.js +96 -25
- package/src/processors/byType.js +0 -130
package/AGENTS.md
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# AGENTS.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance for AI assistants working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
This is a semantic parser for ProseMirror/TipTap content structures. It transforms rich text editor content into structured, semantic groups that web components can consume. The parser bridges the gap between natural content writing and component-based web development.
|
|
8
|
+
|
|
9
|
+
## Development Commands
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Run all tests
|
|
13
|
+
npm test
|
|
14
|
+
|
|
15
|
+
# Run tests with JSON report output
|
|
16
|
+
npm run test-report
|
|
17
|
+
|
|
18
|
+
# Run a specific test file
|
|
19
|
+
npx jest tests/parser.test.js
|
|
20
|
+
|
|
21
|
+
# Run tests in watch mode
|
|
22
|
+
npx jest --watch
|
|
23
|
+
|
|
24
|
+
# Run a specific test by name
|
|
25
|
+
npx jest -t "handles simple document structure"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Architecture
|
|
29
|
+
|
|
30
|
+
### Three-Stage Processing Pipeline
|
|
31
|
+
|
|
32
|
+
The parser processes content through three distinct stages, each building on the previous:
|
|
33
|
+
|
|
34
|
+
1. **Sequence Processing** (`src/processors/sequence.js`): Flattens the ProseMirror document tree into a linear sequence of semantic elements (headings, paragraphs, images, lists, etc.)
|
|
35
|
+
|
|
36
|
+
2. **Groups Processing** (`src/processors/groups.js`): Transforms the sequence into semantic groups with identified main content and items. Supports two grouping modes:
|
|
37
|
+
- Heading-based grouping (default)
|
|
38
|
+
- Divider-based grouping (when horizontal rules are present)
|
|
39
|
+
|
|
40
|
+
3. **ByType Processing** (`src/processors/byType.js`): Organizes elements by type with positional context, enabling type-specific queries
|
|
41
|
+
|
|
42
|
+
The main entry point (`src/index.js`) returns all three views:
|
|
43
|
+
```js
|
|
44
|
+
import { parseContent } from './src/index.js';
|
|
45
|
+
|
|
46
|
+
const result = parseContent(doc);
|
|
47
|
+
// {
|
|
48
|
+
// raw: doc, // Original ProseMirror document
|
|
49
|
+
// sequence: [...], // Flat sequence of elements
|
|
50
|
+
// groups: {...}, // Semantic groups with main/items
|
|
51
|
+
// byType: {...} // Elements organized by type
|
|
52
|
+
// }
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Content Group Structure
|
|
56
|
+
|
|
57
|
+
Groups follow a specific structure defined in `processGroupContent()`:
|
|
58
|
+
|
|
59
|
+
```js
|
|
60
|
+
{
|
|
61
|
+
header: {
|
|
62
|
+
pretitle: '', // Heading before a more important main title (e.g., H3 before H1)
|
|
63
|
+
title: '', // Main heading (H1 or H2)
|
|
64
|
+
subtitle: '' // Heading after main title
|
|
65
|
+
},
|
|
66
|
+
body: {
|
|
67
|
+
imgs: [],
|
|
68
|
+
icons: [],
|
|
69
|
+
videos: [],
|
|
70
|
+
paragraphs: [],
|
|
71
|
+
links: [],
|
|
72
|
+
lists: [],
|
|
73
|
+
buttons: [],
|
|
74
|
+
properties: [],
|
|
75
|
+
propertyBlocks: [],
|
|
76
|
+
cards: [],
|
|
77
|
+
headings: []
|
|
78
|
+
},
|
|
79
|
+
banner: null, // Image with banner role or image before heading
|
|
80
|
+
metadata: {
|
|
81
|
+
level: null, // Heading level that started this group
|
|
82
|
+
contentTypes: new Set() // Content types present in this group
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Main Content Identification
|
|
88
|
+
|
|
89
|
+
The `identifyMainContent()` function (src/processors/groups.js:282) determines if the first group should be treated as main content:
|
|
90
|
+
- Single group is always main content
|
|
91
|
+
- The first group must have a lower heading level than the second group (e.g., H1 followed by H2)
|
|
92
|
+
- Divider mode affects main content identification
|
|
93
|
+
|
|
94
|
+
### Special Element Detection
|
|
95
|
+
|
|
96
|
+
The sequence processor identifies several special element types by inspecting paragraph content:
|
|
97
|
+
- **Links**: Paragraphs containing only a single link mark
|
|
98
|
+
- **Images**: Paragraphs with single image (role: 'image' or 'banner')
|
|
99
|
+
- **Icons**: Paragraphs with single image (role: 'icon')
|
|
100
|
+
- **Buttons**: Paragraphs with single text node having button mark
|
|
101
|
+
- **Videos**: Paragraphs with single image (role: 'video')
|
|
102
|
+
|
|
103
|
+
These are extracted into dedicated element types for easier downstream processing.
|
|
104
|
+
|
|
105
|
+
### List Processing
|
|
106
|
+
|
|
107
|
+
Lists maintain hierarchy through nested structure. The `processListItems()` function in sequence.js handles nested lists, while `processListContent()` in groups.js applies full group content processing to each list item, allowing lists to contain rich content (images, paragraphs, nested lists, etc.).
|
|
108
|
+
|
|
109
|
+
## Content Writing Conventions
|
|
110
|
+
|
|
111
|
+
The parser implements the semantic conventions documented in `docs/guide.md`. Key patterns:
|
|
112
|
+
|
|
113
|
+
- **Pretitle Pattern**: Any heading followed by a more important heading (e.g., H3→H1, H2→H1, H6→H5, etc.)
|
|
114
|
+
- **Banner Pattern**: Image (with banner role or followed by heading) at start of first group
|
|
115
|
+
- **Divider Mode**: Presence of any `horizontalRule` switches entire document to divider-based grouping
|
|
116
|
+
- **Heading Groups**: Consecutive headings with increasing levels are consumed together
|
|
117
|
+
- **Main Content**: The first group is main if it's the only group OR has a lower heading level than the second group
|
|
118
|
+
- **Body Headings**: Headings that overflow the header slots (title, subtitle, subtitle2) are automatically collected in `body.headings`
|
|
119
|
+
|
|
120
|
+
## Testing Structure
|
|
121
|
+
|
|
122
|
+
Tests are organized by processor:
|
|
123
|
+
- `tests/parser.test.js` - Integration tests
|
|
124
|
+
- `tests/processors/sequence.test.js` - Sequence processing
|
|
125
|
+
- `tests/processors/groups.test.js` - Groups processing
|
|
126
|
+
- `tests/processors/byType.test.js` - ByType processing
|
|
127
|
+
- `tests/utils/role.test.js` - Role utilities
|
|
128
|
+
- `tests/fixtures/` - Shared test documents
|
|
129
|
+
|
|
130
|
+
## Important Implementation Notes
|
|
131
|
+
|
|
132
|
+
- The parser never modifies the original ProseMirror document
|
|
133
|
+
- Text content can include inline HTML for formatting (bold → `<strong>`, italic → `<em>`, links → `<a>`)
|
|
134
|
+
- The `processors_old/` directory contains legacy implementations - do not modify
|
|
135
|
+
- Context information in byType includes position, previous/next elements, and nearest heading
|
|
136
|
+
- Group splitting logic differs significantly between heading mode and divider mode
|
package/README.md
CHANGED
|
@@ -4,11 +4,10 @@ A semantic parser for ProseMirror/TipTap content structures that helps bridge th
|
|
|
4
4
|
|
|
5
5
|
## What it Does
|
|
6
6
|
|
|
7
|
-
The parser transforms rich text editor content (ProseMirror/TipTap) into structured, semantic groups that web components can easily consume. It provides
|
|
7
|
+
The parser transforms rich text editor content (ProseMirror/TipTap) into structured, semantic groups that web components can easily consume. It provides two complementary views of your content:
|
|
8
8
|
|
|
9
|
-
1. **Sequence**:
|
|
10
|
-
2. **Groups**: Content organized into semantic sections
|
|
11
|
-
3. **ByType**: Elements categorized by type for easy filtering and queries
|
|
9
|
+
1. **Sequence**: An ordered list of all content elements (for rendering in document order)
|
|
10
|
+
2. **Groups**: Content organized into semantic sections (main content + items)
|
|
12
11
|
|
|
13
12
|
## Installation
|
|
14
13
|
|
|
@@ -41,16 +40,16 @@ const doc = {
|
|
|
41
40
|
const result = parseContent(doc);
|
|
42
41
|
|
|
43
42
|
// Access different views
|
|
44
|
-
console.log(result.sequence); //
|
|
45
|
-
console.log(result.
|
|
46
|
-
console.log(result.
|
|
43
|
+
console.log(result.sequence); // Ordered array of elements
|
|
44
|
+
console.log(result.title); // Main content fields at top level
|
|
45
|
+
console.log(result.items); // Additional content groups
|
|
47
46
|
```
|
|
48
47
|
|
|
49
48
|
## Output Structure
|
|
50
49
|
|
|
51
50
|
### Sequence View
|
|
52
51
|
|
|
53
|
-
|
|
52
|
+
An ordered array of semantic elements preserving document order:
|
|
54
53
|
|
|
55
54
|
```js
|
|
56
55
|
result.sequence = [
|
|
@@ -59,72 +58,37 @@ result.sequence = [
|
|
|
59
58
|
]
|
|
60
59
|
```
|
|
61
60
|
|
|
62
|
-
###
|
|
61
|
+
### Content Structure
|
|
63
62
|
|
|
64
|
-
|
|
63
|
+
Main content fields are at the top level. The `items` array contains additional content groups (e.g., H3 sections), each with the same field structure:
|
|
65
64
|
|
|
66
65
|
```js
|
|
67
|
-
result
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
dividerMode: false, // Using dividers vs headings
|
|
88
|
-
groups: 0
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
### ByType View
|
|
66
|
+
result = {
|
|
67
|
+
// Main content fields
|
|
68
|
+
pretitle: "", // H3 before main title
|
|
69
|
+
title: "Welcome", // Main heading (H1)
|
|
70
|
+
subtitle: "", // H2 after main title
|
|
71
|
+
paragraphs: ["Get started today."],
|
|
72
|
+
imgs: [],
|
|
73
|
+
videos: [],
|
|
74
|
+
links: [],
|
|
75
|
+
lists: [],
|
|
76
|
+
icons: [],
|
|
77
|
+
buttons: [],
|
|
78
|
+
banner: null, // Optional banner image
|
|
79
|
+
// ... more content types
|
|
80
|
+
|
|
81
|
+
// Additional content groups (H3 sections)
|
|
82
|
+
items: [
|
|
83
|
+
{ title: "Feature 1", paragraphs: [...], links: [...] },
|
|
84
|
+
{ title: "Feature 2", paragraphs: [...], links: [...] }
|
|
85
|
+
],
|
|
94
86
|
|
|
95
|
-
|
|
87
|
+
// Ordered sequence for document-order rendering
|
|
88
|
+
sequence: [...],
|
|
96
89
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
headings: [
|
|
100
|
-
{
|
|
101
|
-
type: "heading",
|
|
102
|
-
level: 1,
|
|
103
|
-
content: "Welcome",
|
|
104
|
-
context: {
|
|
105
|
-
position: 0,
|
|
106
|
-
previousElement: null,
|
|
107
|
-
nextElement: { type: "paragraph", ... },
|
|
108
|
-
nearestHeading: null
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
],
|
|
112
|
-
paragraphs: [ /* ... */ ],
|
|
113
|
-
images: {
|
|
114
|
-
background: [],
|
|
115
|
-
content: [],
|
|
116
|
-
gallery: [],
|
|
117
|
-
icon: []
|
|
118
|
-
},
|
|
119
|
-
lists: [],
|
|
120
|
-
metadata: {
|
|
121
|
-
totalElements: 2,
|
|
122
|
-
dominantType: "paragraph",
|
|
123
|
-
hasMedia: false
|
|
124
|
-
},
|
|
125
|
-
// Helper methods
|
|
126
|
-
getHeadingsByLevel(level),
|
|
127
|
-
getElementsByHeadingContext(filter)
|
|
90
|
+
// Original document
|
|
91
|
+
raw: { type: "doc", content: [...] }
|
|
128
92
|
}
|
|
129
93
|
```
|
|
130
94
|
|
|
@@ -133,45 +97,29 @@ result.byType = {
|
|
|
133
97
|
### Extracting Main Content
|
|
134
98
|
|
|
135
99
|
```js
|
|
136
|
-
const
|
|
100
|
+
const content = parseContent(doc);
|
|
137
101
|
|
|
138
|
-
const title =
|
|
139
|
-
const description =
|
|
140
|
-
const image =
|
|
102
|
+
const title = content.title;
|
|
103
|
+
const description = content.paragraphs.join(" ");
|
|
104
|
+
const image = content.banner?.url;
|
|
141
105
|
```
|
|
142
106
|
|
|
143
107
|
### Processing Content Sections
|
|
144
108
|
|
|
145
109
|
```js
|
|
146
|
-
const
|
|
110
|
+
const content = parseContent(doc);
|
|
147
111
|
|
|
148
112
|
// Main content
|
|
149
|
-
console.log("
|
|
113
|
+
console.log("Title:", content.title);
|
|
114
|
+
console.log("Description:", content.paragraphs);
|
|
150
115
|
|
|
151
|
-
// Additional sections
|
|
152
|
-
|
|
153
|
-
console.log("Section:", item.
|
|
154
|
-
console.log("Content:", item.
|
|
116
|
+
// Additional sections (H3 groups)
|
|
117
|
+
content.items.forEach(item => {
|
|
118
|
+
console.log("Section:", item.title);
|
|
119
|
+
console.log("Content:", item.paragraphs);
|
|
155
120
|
});
|
|
156
121
|
```
|
|
157
122
|
|
|
158
|
-
### Finding Specific Elements
|
|
159
|
-
|
|
160
|
-
```js
|
|
161
|
-
const { byType } = parseContent(doc);
|
|
162
|
-
|
|
163
|
-
// Get all H2 headings
|
|
164
|
-
const subheadings = byType.getHeadingsByLevel(2);
|
|
165
|
-
|
|
166
|
-
// Get all background images
|
|
167
|
-
const backgrounds = byType.images.background;
|
|
168
|
-
|
|
169
|
-
// Get content under specific headings
|
|
170
|
-
const features = byType.getElementsByHeadingContext(
|
|
171
|
-
h => h.content.includes("Features")
|
|
172
|
-
);
|
|
173
|
-
```
|
|
174
|
-
|
|
175
123
|
### Sequential Processing
|
|
176
124
|
|
|
177
125
|
```js
|
|
@@ -203,17 +151,17 @@ Automatically transform content based on field types with context-aware behavior
|
|
|
203
151
|
```js
|
|
204
152
|
const schema = {
|
|
205
153
|
title: {
|
|
206
|
-
path: "
|
|
154
|
+
path: "title",
|
|
207
155
|
type: "plaintext", // Auto-strips <strong>, <em>, etc.
|
|
208
156
|
maxLength: 60 // Auto-truncates intelligently
|
|
209
157
|
},
|
|
210
158
|
excerpt: {
|
|
211
|
-
path: "
|
|
159
|
+
path: "paragraphs",
|
|
212
160
|
type: "excerpt", // Auto-creates excerpt from paragraphs
|
|
213
161
|
maxLength: 150
|
|
214
162
|
},
|
|
215
163
|
image: {
|
|
216
|
-
path: "
|
|
164
|
+
path: "imgs[0].url",
|
|
217
165
|
type: "image",
|
|
218
166
|
defaultValue: "/placeholder.jpg"
|
|
219
167
|
}
|
|
@@ -259,15 +207,15 @@ Define custom mappings using schemas:
|
|
|
259
207
|
|
|
260
208
|
```js
|
|
261
209
|
const schema = {
|
|
262
|
-
brand: "
|
|
263
|
-
title: "
|
|
264
|
-
subtitle: "
|
|
210
|
+
brand: "pretitle",
|
|
211
|
+
title: "title",
|
|
212
|
+
subtitle: "subtitle",
|
|
265
213
|
image: {
|
|
266
|
-
path: "
|
|
214
|
+
path: "imgs[0].url",
|
|
267
215
|
defaultValue: "/placeholder.jpg"
|
|
268
216
|
},
|
|
269
217
|
actions: {
|
|
270
|
-
path: "
|
|
218
|
+
path: "links",
|
|
271
219
|
transform: links => links.map(l => ({ label: l.label, type: "primary" }))
|
|
272
220
|
}
|
|
273
221
|
};
|
package/docs/api.md
CHANGED
|
@@ -118,51 +118,49 @@ A flat array of semantic elements extracted from the document tree.
|
|
|
118
118
|
|
|
119
119
|
### `groups`
|
|
120
120
|
|
|
121
|
-
Content organized into semantic groups with identified main content and items.
|
|
121
|
+
Content organized into semantic groups with identified main content and items. Each group is flat: header and body fields are merged at the top level of the group object (no nested `header`/`body` objects).
|
|
122
122
|
|
|
123
123
|
```js
|
|
124
124
|
{
|
|
125
125
|
main: {
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
126
|
+
// Header fields (flat)
|
|
127
|
+
pretitle: "PRETITLE TEXT", // H3 before main title
|
|
128
|
+
title: "Main Title", // First heading in group
|
|
129
|
+
subtitle: "Subtitle", // Second heading in group
|
|
130
|
+
|
|
131
|
+
// Body fields (flat)
|
|
132
|
+
paragraphs: ["paragraph text", ...],
|
|
133
|
+
imgs: [
|
|
134
|
+
{ url: "...", caption: "...", alt: "..." }
|
|
135
|
+
],
|
|
136
|
+
icons: ["<svg>...</svg>", ...],
|
|
137
|
+
videos: [
|
|
138
|
+
{ src: "...", caption: "...", alt: "..." }
|
|
139
|
+
],
|
|
140
|
+
links: [
|
|
141
|
+
{ href: "...", label: "..." }
|
|
142
|
+
],
|
|
143
|
+
lists: [
|
|
144
|
+
[/* processed list items */]
|
|
145
|
+
],
|
|
146
|
+
buttons: [
|
|
147
|
+
{ content: "...", attrs: {...} }
|
|
148
|
+
],
|
|
149
|
+
properties: [], // Code block content
|
|
150
|
+
propertyBlocks: [], // Array of code blocks
|
|
151
|
+
cards: [], // Not yet implemented
|
|
152
|
+
headings: [], // Used in list items
|
|
153
|
+
|
|
154
|
+
// Banner (flat)
|
|
154
155
|
banner: {
|
|
155
156
|
url: "path/to/banner.jpg",
|
|
156
157
|
caption: "Banner caption",
|
|
157
158
|
alt: "Banner alt text"
|
|
158
|
-
} | null
|
|
159
|
-
metadata: {
|
|
160
|
-
level: 1, // Heading level that started this group
|
|
161
|
-
contentTypes: {} // Set of content types in group
|
|
162
|
-
}
|
|
159
|
+
} | null
|
|
163
160
|
},
|
|
164
161
|
items: [
|
|
165
|
-
// Array of groups with same structure as main
|
|
162
|
+
// Array of groups with same flat structure as main
|
|
163
|
+
// { title, pretitle, subtitle, paragraphs, imgs, ... }
|
|
166
164
|
],
|
|
167
165
|
metadata: {
|
|
168
166
|
dividerMode: false, // Whether dividers were used for grouping
|
|
@@ -268,14 +266,14 @@ const result = parseContent(doc);
|
|
|
268
266
|
```js
|
|
269
267
|
const { groups } = parseContent(doc);
|
|
270
268
|
|
|
271
|
-
// Access main content
|
|
272
|
-
console.log(groups.main.
|
|
273
|
-
console.log(groups.main.
|
|
269
|
+
// Access main content (flat structure)
|
|
270
|
+
console.log(groups.main.title);
|
|
271
|
+
console.log(groups.main.paragraphs);
|
|
274
272
|
|
|
275
273
|
// Iterate through content items
|
|
276
274
|
groups.items.forEach(item => {
|
|
277
|
-
console.log(item.
|
|
278
|
-
console.log(item.
|
|
275
|
+
console.log(item.title);
|
|
276
|
+
console.log(item.paragraphs);
|
|
279
277
|
});
|
|
280
278
|
```
|
|
281
279
|
|