@uniweb/semantic-parser 1.0.9 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +42 -25
- package/README.md +6 -6
- package/package.json +4 -1
- package/src/processors/groups.js +15 -11
- package/src/processors/sequence.js +59 -11
package/AGENTS.md
CHANGED
|
@@ -52,35 +52,26 @@ const result = parseContent(doc);
|
|
|
52
52
|
// }
|
|
53
53
|
```
|
|
54
54
|
|
|
55
|
-
### Content
|
|
55
|
+
### Content Output Structure
|
|
56
56
|
|
|
57
|
-
|
|
57
|
+
The parser returns a flat content structure:
|
|
58
58
|
|
|
59
59
|
```js
|
|
60
60
|
{
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
propertyBlocks: [],
|
|
76
|
-
cards: [],
|
|
77
|
-
headings: []
|
|
78
|
-
},
|
|
79
|
-
banner: null, // Image with banner role or image before heading
|
|
80
|
-
metadata: {
|
|
81
|
-
level: null, // Heading level that started this group
|
|
82
|
-
contentTypes: Set()
|
|
83
|
-
}
|
|
61
|
+
title: '', // Main heading
|
|
62
|
+
pretitle: '', // Heading before main title
|
|
63
|
+
subtitle: '', // Heading after main title
|
|
64
|
+
paragraphs: [],
|
|
65
|
+
links: [],
|
|
66
|
+
imgs: [],
|
|
67
|
+
icons: [],
|
|
68
|
+
videos: [],
|
|
69
|
+
lists: [],
|
|
70
|
+
buttons: [],
|
|
71
|
+
data: {}, // Tagged code blocks (keyed by tag name)
|
|
72
|
+
cards: [],
|
|
73
|
+
headings: [],
|
|
74
|
+
items: [], // Child content groups
|
|
84
75
|
}
|
|
85
76
|
```
|
|
86
77
|
|
|
@@ -102,6 +93,32 @@ The sequence processor identifies several special element types by inspecting pa
|
|
|
102
93
|
|
|
103
94
|
These are extracted into dedicated element types for easier downstream processing.
|
|
104
95
|
|
|
96
|
+
### Tagged Code Blocks
|
|
97
|
+
|
|
98
|
+
Code blocks with tags route parsed data to the `data` object:
|
|
99
|
+
|
|
100
|
+
```markdown
|
|
101
|
+
```json:nav-links
|
|
102
|
+
[{ "label": "Home", "href": "/" }]
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
```yaml:config
|
|
106
|
+
title: My Site
|
|
107
|
+
theme: dark
|
|
108
|
+
```
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Results in:
|
|
112
|
+
```js
|
|
113
|
+
content.data['nav-links'] = [{ label: "Home", href: "/" }]
|
|
114
|
+
content.data['config'] = { title: "My Site", theme: "dark" }
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
**Parsing rules:**
|
|
118
|
+
- Tagged blocks with `json` language: parsed as JSON
|
|
119
|
+
- Tagged blocks with `yaml`/`yml` language: parsed as YAML
|
|
120
|
+
- Untagged blocks: not parsed (stay as raw text in sequence for display)
|
|
121
|
+
|
|
105
122
|
### List Processing
|
|
106
123
|
|
|
107
124
|
Lists maintain hierarchy through nested structure. The `processListItems()` function in sequence.js handles nested lists, while `processListContent()` in groups.js applies full group content processing to each list item, allowing lists to contain rich content (images, paragraphs, nested lists, etc.).
|
package/README.md
CHANGED
|
@@ -60,14 +60,14 @@ result.sequence = [
|
|
|
60
60
|
|
|
61
61
|
### Content Structure
|
|
62
62
|
|
|
63
|
-
Main content fields are at the top level. The `items` array contains additional content groups (
|
|
63
|
+
Main content fields are at the top level. The `items` array contains additional content groups (created when headings appear after content), each with the same field structure:
|
|
64
64
|
|
|
65
65
|
```js
|
|
66
66
|
result = {
|
|
67
67
|
// Main content fields
|
|
68
|
-
pretitle: "", //
|
|
69
|
-
title: "Welcome", // Main heading
|
|
70
|
-
subtitle: "", //
|
|
68
|
+
pretitle: "", // Heading before main title
|
|
69
|
+
title: "Welcome", // Main heading
|
|
70
|
+
subtitle: "", // Heading after main title
|
|
71
71
|
paragraphs: ["Get started today."],
|
|
72
72
|
imgs: [],
|
|
73
73
|
videos: [],
|
|
@@ -78,7 +78,7 @@ result = {
|
|
|
78
78
|
banner: null, // Optional banner image
|
|
79
79
|
// ... more content types
|
|
80
80
|
|
|
81
|
-
// Additional content groups (
|
|
81
|
+
// Additional content groups (from headings after content)
|
|
82
82
|
items: [
|
|
83
83
|
{ title: "Feature 1", paragraphs: [...], links: [...] },
|
|
84
84
|
{ title: "Feature 2", paragraphs: [...], links: [...] }
|
|
@@ -113,7 +113,7 @@ const content = parseContent(doc);
|
|
|
113
113
|
console.log("Title:", content.title);
|
|
114
114
|
console.log("Description:", content.paragraphs);
|
|
115
115
|
|
|
116
|
-
// Additional
|
|
116
|
+
// Additional content groups
|
|
117
117
|
content.items.forEach(item => {
|
|
118
118
|
console.log("Section:", item.title);
|
|
119
119
|
console.log("Content:", item.paragraphs);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@uniweb/semantic-parser",
|
|
3
|
-
"version": "1.0.9",
|
|
3
|
+
"version": "1.0.10",
|
|
4
4
|
"description": "Semantic parser for ProseMirror/TipTap content structures",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./src/index.js",
|
|
@@ -33,6 +33,9 @@
|
|
|
33
33
|
"doc": "docs",
|
|
34
34
|
"test": "tests"
|
|
35
35
|
},
|
|
36
|
+
"dependencies": {
|
|
37
|
+
"yaml": "^2.8.2"
|
|
38
|
+
},
|
|
36
39
|
"scripts": {
|
|
37
40
|
"test": "NODE_OPTIONS=--experimental-vm-modules jest",
|
|
38
41
|
"test-report": "NODE_OPTIONS=--experimental-vm-modules jest --json > test-results.json 2>&1",
|
package/src/processors/groups.js
CHANGED
|
@@ -18,8 +18,7 @@ function flattenGroup(group) {
|
|
|
18
18
|
lists: group.body.lists || [],
|
|
19
19
|
videos: group.body.videos || [],
|
|
20
20
|
buttons: group.body.buttons || [],
|
|
21
|
-
|
|
22
|
-
propertyBlocks: group.body.propertyBlocks || [],
|
|
21
|
+
data: group.body.data || {},
|
|
23
22
|
cards: group.body.cards || [],
|
|
24
23
|
documents: group.body.documents || [],
|
|
25
24
|
forms: group.body.forms || [],
|
|
@@ -50,8 +49,7 @@ function processGroups(sequence, options = {}) {
|
|
|
50
49
|
lists: [],
|
|
51
50
|
videos: [],
|
|
52
51
|
buttons: [],
|
|
53
|
-
|
|
54
|
-
propertyBlocks: [],
|
|
52
|
+
data: {},
|
|
55
53
|
cards: [],
|
|
56
54
|
documents: [],
|
|
57
55
|
forms: [],
|
|
@@ -92,8 +90,7 @@ function processGroups(sequence, options = {}) {
|
|
|
92
90
|
lists: [],
|
|
93
91
|
videos: [],
|
|
94
92
|
buttons: [],
|
|
95
|
-
|
|
96
|
-
propertyBlocks: [],
|
|
93
|
+
data: {},
|
|
97
94
|
cards: [],
|
|
98
95
|
documents: [],
|
|
99
96
|
forms: [],
|
|
@@ -239,8 +236,7 @@ function processGroupContent(elements) {
|
|
|
239
236
|
links: [],
|
|
240
237
|
lists: [],
|
|
241
238
|
buttons: [],
|
|
242
|
-
|
|
243
|
-
propertyBlocks: [],
|
|
239
|
+
data: {},
|
|
244
240
|
cards: [],
|
|
245
241
|
documents: [],
|
|
246
242
|
forms: [],
|
|
@@ -345,10 +341,18 @@ function processGroupContent(elements) {
|
|
|
345
341
|
body.quotes.push(quoteContent.body);
|
|
346
342
|
break;
|
|
347
343
|
|
|
344
|
+
case "dataBlock":
|
|
345
|
+
// Pre-parsed structured data from content-reader
|
|
346
|
+
body.data[element.tag] = element.data;
|
|
347
|
+
break;
|
|
348
|
+
|
|
348
349
|
case "codeBlock":
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
350
|
+
// Fallback: tagged code blocks where parsing failed at build time
|
|
351
|
+
// Untagged blocks stay in sequence for display
|
|
352
|
+
const tag = element.attrs?.tag;
|
|
353
|
+
if (tag) {
|
|
354
|
+
body.data[tag] = element.text;
|
|
355
|
+
}
|
|
352
356
|
break;
|
|
353
357
|
|
|
354
358
|
case "form":
|
package/src/processors/sequence.js
CHANGED
|
@@ -1,3 +1,52 @@
|
|
|
1
|
+
import { parse as parseYaml } from "yaml";
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Get code block data - prefers pre-parsed attrs.data, falls back to parsing text
|
|
5
|
+
*
|
|
6
|
+
* Content can come from two sources:
|
|
7
|
+
* 1. Pre-parsed at build time: attrs.data contains parsed JS object
|
|
8
|
+
* 2. Legacy/runtime: text needs to be parsed based on language
|
|
9
|
+
*
|
|
10
|
+
* @param {string} text - Raw code block text
|
|
11
|
+
* @param {Object} attrs - Code block attributes (language, tag, data)
|
|
12
|
+
* @returns {*} Parsed data or raw text
|
|
13
|
+
*/
|
|
14
|
+
function getCodeBlockData(text, attrs) {
|
|
15
|
+
const { language, tag, data } = attrs || {};
|
|
16
|
+
|
|
17
|
+
// Only process tagged blocks
|
|
18
|
+
if (!tag) {
|
|
19
|
+
return text;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
// Prefer pre-parsed data from build time (attrs.data)
|
|
23
|
+
if (data !== undefined) {
|
|
24
|
+
return data;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// Fallback: parse text at runtime (for backwards compatibility)
|
|
28
|
+
const lang = (language || "").toLowerCase();
|
|
29
|
+
|
|
30
|
+
if (lang === "json") {
|
|
31
|
+
try {
|
|
32
|
+
return JSON.parse(text);
|
|
33
|
+
} catch {
|
|
34
|
+
return text;
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
if (lang === "yaml" || lang === "yml") {
|
|
39
|
+
try {
|
|
40
|
+
return parseYaml(text);
|
|
41
|
+
} catch {
|
|
42
|
+
return text;
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// Unknown language - return raw text
|
|
47
|
+
return text;
|
|
48
|
+
}
|
|
49
|
+
|
|
1
50
|
/**
|
|
2
51
|
* Process a ProseMirror/TipTap document into a flat sequence
|
|
3
52
|
* @param {Object} doc ProseMirror document
|
|
@@ -79,20 +128,19 @@ function createSequenceElement(node, options = {}) {
|
|
|
79
128
|
attrs,
|
|
80
129
|
};
|
|
81
130
|
|
|
82
|
-
case "
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
} catch (err) {
|
|
90
|
-
parsed = textContent;
|
|
91
|
-
}
|
|
131
|
+
case "dataBlock":
|
|
132
|
+
// Pre-parsed structured data from content-reader
|
|
133
|
+
return {
|
|
134
|
+
type: "dataBlock",
|
|
135
|
+
data: attrs.data,
|
|
136
|
+
tag: attrs.tag,
|
|
137
|
+
};
|
|
92
138
|
|
|
139
|
+
case "codeBlock":
|
|
140
|
+
const codeText = getTextContent(content, options);
|
|
93
141
|
return {
|
|
94
142
|
type: "codeBlock",
|
|
95
|
-
text:
|
|
143
|
+
text: getCodeBlockData(codeText, attrs),
|
|
96
144
|
attrs,
|
|
97
145
|
};
|
|
98
146
|
|