@uniweb/semantic-parser 1.0.9 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md CHANGED
@@ -52,35 +52,26 @@ const result = parseContent(doc);
52
52
  // }
53
53
  ```
54
54
 
55
- ### Content Group Structure
55
+ ### Content Output Structure
56
56
 
57
- Groups follow a specific structure defined in `processGroupContent()`:
57
+ The parser returns a flat content structure:
58
58
 
59
59
  ```js
60
60
  {
61
- header: {
62
- pretitle: '', // H3 before main title
63
- title: '', // Main heading (H1 or H2)
64
- subtitle: '' // Heading after main title
65
- },
66
- body: {
67
- imgs: [],
68
- icons: [],
69
- videos: [],
70
- paragraphs: [],
71
- links: [],
72
- lists: [],
73
- buttons: [],
74
- properties: [],
75
- propertyBlocks: [],
76
- cards: [],
77
- headings: []
78
- },
79
- banner: null, // Image with banner role or image before heading
80
- metadata: {
81
- level: null, // Heading level that started this group
82
- contentTypes: Set()
83
- }
61
+ title: '', // Main heading
62
+ pretitle: '', // Heading before main title
63
+ subtitle: '', // Heading after main title
64
+ paragraphs: [],
65
+ links: [],
66
+ imgs: [],
67
+ icons: [],
68
+ videos: [],
69
+ lists: [],
70
+ buttons: [],
71
+ data: {}, // Tagged code blocks (keyed by tag name)
72
+ cards: [],
73
+ headings: [],
74
+ items: [], // Child content groups
84
75
  }
85
76
  ```
86
77
 
@@ -102,6 +93,32 @@ The sequence processor identifies several special element types by inspecting pa
102
93
 
103
94
  These are extracted into dedicated element types for easier downstream processing.
104
95
 
96
+ ### Tagged Code Blocks
97
+
98
+ Tagged code blocks are parsed, and the result is stored in the `data` object under the tag name:
99
+
100
+ ```markdown
101
+ ```json:nav-links
102
+ [{ "label": "Home", "href": "/" }]
103
+ ```
104
+
105
+ ```yaml:config
106
+ title: My Site
107
+ theme: dark
108
+ ```
109
+ ```
110
+
111
+ Results in:
112
+ ```js
113
+ content.data['nav-links'] = [{ label: "Home", href: "/" }]
114
+ content.data['config'] = { title: "My Site", theme: "dark" }
115
+ ```
116
+
117
+ **Parsing rules:**
118
+ - Tagged blocks with `json` language: parsed as JSON
119
+ - Tagged blocks with `yaml`/`yml` language: parsed as YAML
120
+ - Untagged blocks: not parsed (they stay as raw text in the sequence for display)
121
+
105
122
  ### List Processing
106
123
 
107
124
  Lists maintain hierarchy through nested structure. The `processListItems()` function in sequence.js handles nested lists, while `processListContent()` in groups.js applies full group content processing to each list item, allowing lists to contain rich content (images, paragraphs, nested lists, etc.).
package/README.md CHANGED
@@ -60,14 +60,14 @@ result.sequence = [
60
60
 
61
61
  ### Content Structure
62
62
 
63
- Main content fields are at the top level. The `items` array contains additional content groups (e.g., H3 sections), each with the same field structure:
63
+ Main content fields are at the top level. The `items` array contains additional content groups (created when headings appear after content), each with the same field structure:
64
64
 
65
65
  ```js
66
66
  result = {
67
67
  // Main content fields
68
- pretitle: "", // H3 before main title
69
- title: "Welcome", // Main heading (H1)
70
- subtitle: "", // H2 after main title
68
+ pretitle: "", // Heading before main title
69
+ title: "Welcome", // Main heading
70
+ subtitle: "", // Heading after main title
71
71
  paragraphs: ["Get started today."],
72
72
  imgs: [],
73
73
  videos: [],
@@ -78,7 +78,7 @@ result = {
78
78
  banner: null, // Optional banner image
79
79
  // ... more content types
80
80
 
81
- // Additional content groups (H3 sections)
81
+ // Additional content groups (from headings after content)
82
82
  items: [
83
83
  { title: "Feature 1", paragraphs: [...], links: [...] },
84
84
  { title: "Feature 2", paragraphs: [...], links: [...] }
@@ -113,7 +113,7 @@ const content = parseContent(doc);
113
113
  console.log("Title:", content.title);
114
114
  console.log("Description:", content.paragraphs);
115
115
 
116
- // Additional sections (H3 groups)
116
+ // Additional content groups
117
117
  content.items.forEach(item => {
118
118
  console.log("Section:", item.title);
119
119
  console.log("Content:", item.paragraphs);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@uniweb/semantic-parser",
3
- "version": "1.0.9",
3
+ "version": "1.0.10",
4
4
  "description": "Semantic parser for ProseMirror/TipTap content structures",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
@@ -33,6 +33,9 @@
33
33
  "doc": "docs",
34
34
  "test": "tests"
35
35
  },
36
+ "dependencies": {
37
+ "yaml": "^2.8.2"
38
+ },
36
39
  "scripts": {
37
40
  "test": "NODE_OPTIONS=--experimental-vm-modules jest",
38
41
  "test-report": "NODE_OPTIONS=--experimental-vm-modules jest --json > test-results.json 2>&1",
@@ -18,8 +18,7 @@ function flattenGroup(group) {
18
18
  lists: group.body.lists || [],
19
19
  videos: group.body.videos || [],
20
20
  buttons: group.body.buttons || [],
21
- properties: group.body.properties || {},
22
- propertyBlocks: group.body.propertyBlocks || [],
21
+ data: group.body.data || {},
23
22
  cards: group.body.cards || [],
24
23
  documents: group.body.documents || [],
25
24
  forms: group.body.forms || [],
@@ -50,8 +49,7 @@ function processGroups(sequence, options = {}) {
50
49
  lists: [],
51
50
  videos: [],
52
51
  buttons: [],
53
- properties: {},
54
- propertyBlocks: [],
52
+ data: {},
55
53
  cards: [],
56
54
  documents: [],
57
55
  forms: [],
@@ -92,8 +90,7 @@ function processGroups(sequence, options = {}) {
92
90
  lists: [],
93
91
  videos: [],
94
92
  buttons: [],
95
- properties: {},
96
- propertyBlocks: [],
93
+ data: {},
97
94
  cards: [],
98
95
  documents: [],
99
96
  forms: [],
@@ -239,8 +236,7 @@ function processGroupContent(elements) {
239
236
  links: [],
240
237
  lists: [],
241
238
  buttons: [],
242
- properties: {},
243
- propertyBlocks: [],
239
+ data: {},
244
240
  cards: [],
245
241
  documents: [],
246
242
  forms: [],
@@ -345,10 +341,18 @@ function processGroupContent(elements) {
345
341
  body.quotes.push(quoteContent.body);
346
342
  break;
347
343
 
344
+ case "dataBlock":
345
+ // Pre-parsed structured data from content-reader
346
+ body.data[element.tag] = element.data;
347
+ break;
348
+
348
349
  case "codeBlock":
349
- const codeData = element.text;
350
- body.properties = codeData; // Last one
351
- body.propertyBlocks.push(codeData); // All of them
350
+ // Fallback: tagged code blocks where parsing failed at build time
351
+ // Untagged blocks stay in sequence for display
352
+ const tag = element.attrs?.tag;
353
+ if (tag) {
354
+ body.data[tag] = element.text;
355
+ }
352
356
  break;
353
357
 
354
358
  case "form":
@@ -1,3 +1,52 @@
1
+ import { parse as parseYaml } from "yaml";
2
+
3
+ /**
4
+ * Get code block data - prefers pre-parsed attrs.data, falls back to parsing text
5
+ *
6
+ * Content can come from two sources:
7
+ * 1. Pre-parsed at build time: attrs.data contains parsed JS object
8
+ * 2. Legacy/runtime: text needs to be parsed based on language
9
+ *
10
+ * @param {string} text - Raw code block text
11
+ * @param {Object} attrs - Code block attributes (language, tag, data)
12
+ * @returns {*} Parsed data or raw text
13
+ */
14
+ function getCodeBlockData(text, attrs) {
15
+ const { language, tag, data } = attrs || {};
16
+
17
+ // Only process tagged blocks
18
+ if (!tag) {
19
+ return text;
20
+ }
21
+
22
+ // Prefer pre-parsed data from build time (attrs.data)
23
+ if (data !== undefined) {
24
+ return data;
25
+ }
26
+
27
+ // Fallback: parse text at runtime (for backwards compatibility)
28
+ const lang = (language || "").toLowerCase();
29
+
30
+ if (lang === "json") {
31
+ try {
32
+ return JSON.parse(text);
33
+ } catch {
34
+ return text;
35
+ }
36
+ }
37
+
38
+ if (lang === "yaml" || lang === "yml") {
39
+ try {
40
+ return parseYaml(text);
41
+ } catch {
42
+ return text;
43
+ }
44
+ }
45
+
46
+ // Unknown language - return raw text
47
+ return text;
48
+ }
49
+
1
50
  /**
2
51
  * Process a ProseMirror/TipTap document into a flat sequence
3
52
  * @param {Object} doc ProseMirror document
@@ -79,20 +128,19 @@ function createSequenceElement(node, options = {}) {
79
128
  attrs,
80
129
  };
81
130
 
82
- case "codeBlock":
83
- let textContent = getTextContent(content, options);
84
- let parsed = "";
85
-
86
- //Try pasre json if possible
87
- try {
88
- parsed = JSON.parse(`${textContent}`);
89
- } catch (err) {
90
- parsed = textContent;
91
- }
131
+ case "dataBlock":
132
+ // Pre-parsed structured data from content-reader
133
+ return {
134
+ type: "dataBlock",
135
+ data: attrs.data,
136
+ tag: attrs.tag,
137
+ };
92
138
 
139
+ case "codeBlock":
140
+ const codeText = getTextContent(content, options);
93
141
  return {
94
142
  type: "codeBlock",
95
- text: parsed,
143
+ text: getCodeBlockData(codeText, attrs),
96
144
  attrs,
97
145
  };
98
146