@uniweb/semantic-parser 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +8 -11
- package/README.md +3 -160
- package/package.json +2 -5
- package/src/index.js +1 -2
- package/src/processors/groups.js +16 -15
- package/docs/api.md +0 -350
- package/docs/entity-consolidation.md +0 -470
- package/docs/file-structure.md +0 -50
- package/docs/guide.md +0 -206
- package/docs/mapping-patterns.md +0 -928
- package/docs/text-component-reference.md +0 -515
- package/reference/README.md +0 -195
- package/reference/Text.js +0 -188
- package/src/mappers/accessor.js +0 -312
- package/src/mappers/extractors.js +0 -416
- package/src/mappers/helpers.js +0 -234
- package/src/mappers/index.js +0 -28
- package/src/mappers/types.js +0 -495
- package/src/processors/groups_backup.js +0 -379
- package/src/processors/groups_doc.md +0 -179
- package/src/processors/sequence_backup.js +0 -402
- package/src/processors_old/byType.js +0 -129
- package/src/processors_old/groups.js +0 -240
- package/src/processors_old/sequence.js +0 -140
package/docs/api.md
DELETED
|
@@ -1,350 +0,0 @@
|
|
|
1
|
-
# API Reference
|
|
2
|
-
|
|
3
|
-
## parseContent(doc, options)
|
|
4
|
-
|
|
5
|
-
Parses a ProseMirror/TipTap document into three semantic views (`sequence`, `groups`, and `byType`), returned alongside the original document (`raw`).
|
|
6
|
-
|
|
7
|
-
### Import
|
|
8
|
-
|
|
9
|
-
```js
|
|
10
|
-
import { parseContent } from '@uniweb/semantic-parser';
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
### Parameters
|
|
14
|
-
|
|
15
|
-
- `doc` (Object): A ProseMirror/TipTap document object with `type: "doc"` and `content` array
|
|
16
|
-
- `options` (Object, optional): Parsing options
|
|
17
|
-
- `parseCodeAsJson` (boolean): Parse code blocks as JSON for properties. Default: false
|
|
18
|
-
|
|
19
|
-
**Note:** Body headings are always collected automatically - no configuration needed.
|
|
20
|
-
|
|
21
|
-
### Returns
|
|
22
|
-
|
|
23
|
-
An object with four properties providing different views of the content:
|
|
24
|
-
|
|
25
|
-
```js
|
|
26
|
-
{
|
|
27
|
-
raw: Object, // Original ProseMirror document
|
|
28
|
-
sequence: Array, // Flat sequence of elements
|
|
29
|
-
groups: Object, // Semantic content groups
|
|
30
|
-
byType: Object // Elements organized by type
|
|
31
|
-
}
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
## Return Value Structure
|
|
35
|
-
|
|
36
|
-
### `raw`
|
|
37
|
-
|
|
38
|
-
The original ProseMirror document passed as input, unchanged.
|
|
39
|
-
|
|
40
|
-
### `sequence`
|
|
41
|
-
|
|
42
|
-
A flat array of semantic elements extracted from the document tree.
|
|
43
|
-
|
|
44
|
-
**Element Types:**
|
|
45
|
-
|
|
46
|
-
```js
|
|
47
|
-
// Heading
|
|
48
|
-
{
|
|
49
|
-
type: "heading",
|
|
50
|
-
level: 1, // 1-6
|
|
51
|
-
content: "Text content with <strong>HTML</strong> formatting"
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
// Paragraph
|
|
55
|
-
{
|
|
56
|
-
type: "paragraph",
|
|
57
|
-
content: "Text with <em>inline</em> <a href=\"...\">formatting</a>"
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
// List
|
|
61
|
-
{
|
|
62
|
-
type: "list",
|
|
63
|
-
style: "bullet" | "ordered",
|
|
64
|
-
items: [
|
|
65
|
-
{
|
|
66
|
-
content: [/* array of elements */],
|
|
67
|
-
items: [/* nested list items */]
|
|
68
|
-
}
|
|
69
|
-
]
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
// Image
|
|
73
|
-
{
|
|
74
|
-
type: "image",
|
|
75
|
-
src: "path/to/image.jpg",
|
|
76
|
-
alt: "Alt text",
|
|
77
|
-
caption: "Caption text",
|
|
78
|
-
role: "background" | "content" | "banner" | "icon"
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
// Icon (SVG)
|
|
82
|
-
{
|
|
83
|
-
type: "icon",
|
|
84
|
-
svg: "<svg>...</svg>"
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
// Video
|
|
88
|
-
{
|
|
89
|
-
type: "video",
|
|
90
|
-
src: "path/to/video.mp4",
|
|
91
|
-
alt: "Alt text",
|
|
92
|
-
caption: "Caption text"
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
// Link (paragraph containing only a link)
|
|
96
|
-
{
|
|
97
|
-
type: "link",
|
|
98
|
-
content: {
|
|
99
|
-
href: "https://example.com",
|
|
100
|
-
label: "Link text"
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
// Button
|
|
105
|
-
{
|
|
106
|
-
type: "button",
|
|
107
|
-
content: "Button text",
|
|
108
|
-
attrs: {
|
|
109
|
-
// Button-specific attributes
|
|
110
|
-
}
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
// Divider (horizontal rule)
|
|
114
|
-
{
|
|
115
|
-
type: "divider"
|
|
116
|
-
}
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
### `groups`
|
|
120
|
-
|
|
121
|
-
Content organized into semantic groups with identified main content and items. The structure is flat - header and body fields are merged at the top level.
|
|
122
|
-
|
|
123
|
-
```js
|
|
124
|
-
{
|
|
125
|
-
main: {
|
|
126
|
-
// Header fields (flat)
|
|
127
|
-
pretitle: "PRETITLE TEXT", // H3 before main title
|
|
128
|
-
title: "Main Title", // First heading in group
|
|
129
|
-
subtitle: "Subtitle", // Second heading in group
|
|
130
|
-
|
|
131
|
-
// Body fields (flat)
|
|
132
|
-
paragraphs: ["paragraph text", ...],
|
|
133
|
-
imgs: [
|
|
134
|
-
{ url: "...", caption: "...", alt: "..." }
|
|
135
|
-
],
|
|
136
|
-
icons: ["<svg>...</svg>", ...],
|
|
137
|
-
videos: [
|
|
138
|
-
{ src: "...", caption: "...", alt: "..." }
|
|
139
|
-
],
|
|
140
|
-
links: [
|
|
141
|
-
{ href: "...", label: "..." }
|
|
142
|
-
],
|
|
143
|
-
lists: [
|
|
144
|
-
[/* processed list items */]
|
|
145
|
-
],
|
|
146
|
-
buttons: [
|
|
147
|
-
{ content: "...", attrs: {...} }
|
|
148
|
-
],
|
|
149
|
-
properties: [], // Code block content
|
|
150
|
-
propertyBlocks: [], // Array of code blocks
|
|
151
|
-
cards: [], // Not yet implemented
|
|
152
|
-
headings: [], // Used in list items
|
|
153
|
-
|
|
154
|
-
// Banner (flat)
|
|
155
|
-
banner: {
|
|
156
|
-
url: "path/to/banner.jpg",
|
|
157
|
-
caption: "Banner caption",
|
|
158
|
-
alt: "Banner alt text"
|
|
159
|
-
} | null
|
|
160
|
-
},
|
|
161
|
-
items: [
|
|
162
|
-
// Array of groups with same flat structure as main
|
|
163
|
-
// { title, pretitle, subtitle, paragraphs, imgs, ... }
|
|
164
|
-
],
|
|
165
|
-
metadata: {
|
|
166
|
-
dividerMode: false, // Whether dividers were used for grouping
|
|
167
|
-
groups: 0 // Total number of groups
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
```
|
|
171
|
-
|
|
172
|
-
**Grouping Modes:**
|
|
173
|
-
|
|
174
|
-
1. **Heading-based grouping** (default): Groups start with heading patterns
|
|
175
|
-
2. **Divider-based grouping**: When any `horizontalRule` is present, groups are split by dividers
|
|
176
|
-
|
|
177
|
-
**Main Content Identification:**
|
|
178
|
-
|
|
179
|
-
- Single group → always main content
|
|
180
|
-
- Multiple groups → the first group is main if it has a lower heading level than the second group
|
|
181
|
-
- Divider mode where the content starts with a divider → no main content; all groups become items
|
|
182
|
-
|
|
183
|
-
### `byType`
|
|
184
|
-
|
|
185
|
-
Elements organized by type with positional context.
|
|
186
|
-
|
|
187
|
-
```js
|
|
188
|
-
{
|
|
189
|
-
headings: [
|
|
190
|
-
{
|
|
191
|
-
type: "heading",
|
|
192
|
-
level: 1,
|
|
193
|
-
content: "Title",
|
|
194
|
-
context: {
|
|
195
|
-
position: 0,
|
|
196
|
-
previousElement: null,
|
|
197
|
-
nextElement: { type: "paragraph", ... },
|
|
198
|
-
nearestHeading: null
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
],
|
|
202
|
-
paragraphs: [
|
|
203
|
-
{
|
|
204
|
-
type: "paragraph",
|
|
205
|
-
content: "Text",
|
|
206
|
-
context: { ... }
|
|
207
|
-
}
|
|
208
|
-
],
|
|
209
|
-
images: {
|
|
210
|
-
background: [/* images with role="background" */],
|
|
211
|
-
content: [/* images with role="content" */],
|
|
212
|
-
gallery: [/* images with role="gallery" */],
|
|
213
|
-
icon: [/* images with role="icon" */]
|
|
214
|
-
},
|
|
215
|
-
lists: [/* list elements with context */],
|
|
216
|
-
dividers: [/* divider elements with context */],
|
|
217
|
-
metadata: {
|
|
218
|
-
totalElements: 10,
|
|
219
|
-
dominantType: "paragraph",
|
|
220
|
-
hasMedia: true
|
|
221
|
-
},
|
|
222
|
-
|
|
223
|
-
// Helper methods
|
|
224
|
-
getHeadingsByLevel(level),
|
|
225
|
-
getElementsByHeadingContext(headingFilter)
|
|
226
|
-
}
|
|
227
|
-
```
|
|
228
|
-
|
|
229
|
-
**Helper Methods:**
|
|
230
|
-
|
|
231
|
-
```js
|
|
232
|
-
// Get all H1 headings
|
|
233
|
-
byType.getHeadingsByLevel(1)
|
|
234
|
-
|
|
235
|
-
// Get all elements under headings matching a filter
|
|
236
|
-
byType.getElementsByHeadingContext((heading) => heading.level === 2)
|
|
237
|
-
```
|
|
238
|
-
|
|
239
|
-
## Usage Examples
|
|
240
|
-
|
|
241
|
-
### Basic Usage
|
|
242
|
-
|
|
243
|
-
```js
|
|
244
|
-
import { parseContent } from "@uniweb/semantic-parser";
|
|
245
|
-
|
|
246
|
-
const doc = {
|
|
247
|
-
type: "doc",
|
|
248
|
-
content: [
|
|
249
|
-
{
|
|
250
|
-
type: "heading",
|
|
251
|
-
attrs: { level: 1 },
|
|
252
|
-
content: [{ type: "text", text: "Welcome" }]
|
|
253
|
-
},
|
|
254
|
-
{
|
|
255
|
-
type: "paragraph",
|
|
256
|
-
content: [{ type: "text", text: "Get started today." }]
|
|
257
|
-
}
|
|
258
|
-
]
|
|
259
|
-
};
|
|
260
|
-
|
|
261
|
-
const result = parseContent(doc);
|
|
262
|
-
```
|
|
263
|
-
|
|
264
|
-
### Working with Groups
|
|
265
|
-
|
|
266
|
-
```js
|
|
267
|
-
const { groups } = parseContent(doc);
|
|
268
|
-
|
|
269
|
-
// Access main content (flat structure)
|
|
270
|
-
console.log(groups.main.title);
|
|
271
|
-
console.log(groups.main.paragraphs);
|
|
272
|
-
|
|
273
|
-
// Iterate through content items
|
|
274
|
-
groups.items.forEach(item => {
|
|
275
|
-
console.log(item.title);
|
|
276
|
-
console.log(item.paragraphs);
|
|
277
|
-
});
|
|
278
|
-
```
|
|
279
|
-
|
|
280
|
-
### Working with byType
|
|
281
|
-
|
|
282
|
-
```js
|
|
283
|
-
const { byType } = parseContent(doc);
|
|
284
|
-
|
|
285
|
-
// Get all images
|
|
286
|
-
const allImages = Object.values(byType.images).flat();
|
|
287
|
-
|
|
288
|
-
// Get all H2 headings
|
|
289
|
-
const h2Headings = byType.getHeadingsByLevel(2);
|
|
290
|
-
|
|
291
|
-
// Get content under specific headings
|
|
292
|
-
const featuresContent = byType.getElementsByHeadingContext(
|
|
293
|
-
h => h.content.includes("Features")
|
|
294
|
-
);
|
|
295
|
-
```
|
|
296
|
-
|
|
297
|
-
### Working with Sequence
|
|
298
|
-
|
|
299
|
-
```js
|
|
300
|
-
const { sequence } = parseContent(doc);
|
|
301
|
-
|
|
302
|
-
// Process elements in order
|
|
303
|
-
sequence.forEach(element => {
|
|
304
|
-
switch(element.type) {
|
|
305
|
-
case 'heading':
|
|
306
|
-
console.log(`H${element.level}: ${element.content}`);
|
|
307
|
-
break;
|
|
308
|
-
case 'paragraph':
|
|
309
|
-
console.log(`P: ${element.content}`);
|
|
310
|
-
break;
|
|
311
|
-
}
|
|
312
|
-
});
|
|
313
|
-
```
|
|
314
|
-
|
|
315
|
-
## Text Formatting
|
|
316
|
-
|
|
317
|
-
The parser preserves inline formatting as HTML tags within text content:
|
|
318
|
-
|
|
319
|
-
- **Bold**: `<strong>text</strong>`
|
|
320
|
-
- **Italic**: `<em>text</em>`
|
|
321
|
-
- **Links**: `<a href="url">text</a>`
|
|
322
|
-
|
|
323
|
-
```js
|
|
324
|
-
// Input
|
|
325
|
-
{
|
|
326
|
-
type: "paragraph",
|
|
327
|
-
content: [
|
|
328
|
-
{ type: "text", text: "Normal " },
|
|
329
|
-
{ type: "text", marks: [{ type: "bold" }], text: "bold" }
|
|
330
|
-
]
|
|
331
|
-
}
|
|
332
|
-
|
|
333
|
-
// Output
|
|
334
|
-
{
|
|
335
|
-
type: "paragraph",
|
|
336
|
-
content: "Normal <strong>bold</strong>"
|
|
337
|
-
}
|
|
338
|
-
```
|
|
339
|
-
|
|
340
|
-
## Special Element Detection
|
|
341
|
-
|
|
342
|
-
The parser detects special patterns and extracts them as dedicated element types:
|
|
343
|
-
|
|
344
|
-
- **Paragraph with only a link** → `type: "link"`
|
|
345
|
-
- **Paragraph with only an image** (role: image/banner) → `type: "image"`
|
|
346
|
-
- **Paragraph with only an icon** (role: icon) → `type: "icon"`
|
|
347
|
-
- **Paragraph with only a button mark** → `type: "button"`
|
|
348
|
-
- **Paragraph with only a video** (role: video) → `type: "video"`
|
|
349
|
-
|
|
350
|
-
This makes it easier to identify and handle these special cases in downstream processing.
|