@uniweb/content-reader 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,174 @@
1
+ # Content Reader
2
+
3
+ A JavaScript library for converting Markdown content into ProseMirror-compatible document structures. This library is designed to work seamlessly with TipTap v2 and provides enhanced Markdown parsing capabilities with support for extended syntax.
4
+
5
+ ## Features
6
+
7
+ ### Basic Markdown Support
8
+
9
+ - Paragraphs and basic text formatting (bold, italic)
10
+ - Headings with automatic ID generation
11
+ - Links and images
12
+ - Ordered and unordered lists with nesting support
13
+ - Code blocks with language and filename support
14
+ - Tables with alignment and formatting
15
+ - Block quotes
16
+ - Horizontal rules
17
+
18
+ ### Extended Syntax
19
+
20
+ - **Enhanced Images**: Support for image roles (content, background, icon, gallery) using prefix syntax:
21
+
22
+ ```markdown
23
+ ![Alt text](icon:path/to/icon.svg)
24
+ ![Alt text](background:path/to/bg.jpg)
25
+ ```
26
+
27
+ - **Enhanced Links**: Button variants with predefined styles:
28
+
29
+ ```markdown
30
+ [Button Text](button:https://example.com)
31
+ ```
32
+
33
+ - **Tables with Alignment**: Full support for aligned columns:
34
+ ```markdown
35
+ | Left | Center | Right |
36
+ | :--- | :----: | ----: |
37
+ | Text | Text | Text |
38
+ ```
39
+
40
+ ### Developer-Friendly Features
41
+
42
+ - Clean, well-documented code
43
+ - Comprehensive test suite
44
+ - Modular architecture for easy extension
45
+ - Compatible with TipTap v2 document structure
46
+ - Full TypeScript type definitions
47
+
48
+ ## Installation
49
+
50
+ ```bash
51
+ npm install @uniweb/content-reader
52
+ ```
53
+
54
+ ## Usage
55
+
56
+ Basic usage:
57
+
58
+ ```javascript
59
+ import { markdownToProseMirror } from "@uniweb/content-reader";
60
+
61
+ const markdown = `
62
+ # Hello World
63
+
64
+ This is a **bold** statement with a [link](https://example.com).
65
+
66
+ - List item 1
67
+ - List item 2
68
+ - Nested item
69
+ `;
70
+
71
+ const doc = markdownToProseMirror(markdown);
72
+ ```
73
+
74
+ ### Using with TipTap
75
+
76
+ The library is designed to work seamlessly with TipTap editors:
77
+
78
+ ```javascript
79
+ import { Editor } from "@tiptap/core";
80
+ import { markdownToProseMirror } from "@uniweb/content-reader";
81
+
82
+ const editor = new Editor({
83
+ content: markdownToProseMirror(markdown),
84
+ // ... other TipTap configuration
85
+ });
86
+ ```
87
+
88
+ ### Advanced Features
89
+
90
+ #### Working with Image Roles
91
+
92
+ The library supports extended image syntax for different display contexts:
93
+
94
+ ```javascript
95
+ const markdown = `
96
+ ![Header image](background:header.jpg)
97
+ ![Profile photo](gallery:profile.jpg)
98
+ ![Settings](icon:settings.svg)
99
+ `;
100
+
101
+ const doc = markdownToProseMirror(markdown);
102
+ // Each image will have a 'role' attribute in its output structure
103
+ ```
104
+
105
+ #### Handling Tables with Alignment
106
+
107
+ Tables support column alignment and formatted content:
108
+
109
+ ```javascript
110
+ const markdown = `
111
+ | Name | Status | Actions |
112
+ |:-----|:------:|--------:|
113
+ | John | Active | **Edit** |
114
+ | Jane | Away | *View* |
115
+ `;
116
+
117
+ const doc = markdownToProseMirror(markdown);
118
+ // Table cells will have appropriate alignment attributes
119
+ ```
120
+
121
+ ## Architecture
122
+
123
+ The library is organized into several modules:
124
+
125
+ - **Parser Core**: Handles the main parsing logic and orchestration
126
+ - **Block Parser**: Processes block-level elements
127
+ - **Inline Parser**: Handles inline formatting and text
128
+ - **Extensions**: Manages extended syntax features
129
+ - **Schema**: Defines the document structure
130
+
131
+ ## Contributing
132
+
133
+ We welcome contributions! Please see our contributing guidelines for details.
134
+
135
+ ### Development Setup
136
+
137
+ 1. Clone the repository:
138
+
139
+ ```bash
140
+ git clone https://github.com/uniweb/content-reader.git
141
+ ```
142
+
143
+ 2. Install dependencies:
144
+
145
+ ```bash
146
+ npm install
147
+ ```
148
+
149
+ 3. Run tests:
150
+ ```bash
151
+ npm test
152
+ ```
153
+
154
+ ### Testing
155
+
156
+ The project uses Jest for testing. Run the test suite:
157
+
158
+ ```bash
159
+ npm test
160
+ ```
161
+
162
+ Or in watch mode:
163
+
164
+ ```bash
165
+ npm run test:watch
166
+ ```
167
+
168
+ ## License
169
+
170
+ GPL-3.0-or-later - See [LICENSE](LICENSE) for details.
171
+
172
+ ## Credits
173
+
174
+ Developed and maintained by UniWeb CMS. Special thanks to all contributors.
package/package.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "name": "@uniweb/content-reader",
3
+ "version": "1.0.0",
4
+ "description": "Markdown to ProseMirror document structure converter",
5
+ "type": "module",
6
+ "main": "src/index.js",
7
+ "exports": {
8
+ ".": "./src/index.js"
9
+ },
10
+ "scripts": {
11
+ "test": "NODE_OPTIONS=--experimental-vm-modules jest",
12
+ "test:watch": "NODE_OPTIONS=--experimental-vm-modules jest --watch",
13
+ "test-report": "NODE_OPTIONS=--experimental-vm-modules jest --json > test-results.json 2>&1"
14
+ },
15
+ "keywords": [
16
+ "markdown",
17
+ "prosemirror",
18
+ "tiptap",
19
+ "parser"
20
+ ],
21
+ "author": "Proximify Inc.",
22
+ "license": "GPL-3.0-or-later",
23
+ "dependencies": {
24
+ "marked": "^11.1.0"
25
+ },
26
+ "devDependencies": {
27
+ "jest": "^29.7.0"
28
+ },
29
+ "jest": {
30
+ "testEnvironment": "node",
31
+ "verbose": true
32
+ },
33
+ "directories": {
34
+ "test": "tests"
35
+ },
36
+ "repository": {
37
+ "type": "git",
38
+ "url": "git+https://github.com/uniweb/content-reader.git"
39
+ },
40
+ "bugs": {
41
+ "url": "https://github.com/uniweb/content-reader/issues"
42
+ },
43
+ "homepage": "https://github.com/uniweb/content-reader#readme"
44
+ }
package/src/index.js ADDED
@@ -0,0 +1,22 @@
1
+ /**
2
+ * @fileoverview Main entry point for the content-reader package.
3
+ * Exports the main function to convert markdown to ProseMirror structure.
4
+ */
5
+
6
+ import { marked } from "marked";
7
+ import { parseMarkdownContent } from "./parser/index.js";
8
+ import { getBaseSchema } from "./schema/index.js";
9
+ import { isValidUniwebMarkdown } from "./utils.js";
10
+
11
/**
 * Convert markdown text into a ProseMirror document structure.
 * @param {string} markdown - The markdown content to parse
 * @returns {Object} ProseMirror document structure
 */
function markdownToProseMirror(markdown) {
    // Build the schema first, then tokenize, preserving the original
    // evaluation order in case either call has side effects.
    const documentSchema = getBaseSchema();
    const blockTokens = marked.lexer(markdown);
    return parseMarkdownContent(blockTokens, documentSchema);
}
21
+
22
+ export { markdownToProseMirror, isValidUniwebMarkdown };
@@ -0,0 +1,170 @@
1
+ /**
2
+ * @fileoverview Parse block-level markdown elements
3
+ */
4
+
5
+ import { marked } from "marked";
6
+ import { parseInline } from "./inline.js";
7
+ import { parseList } from "./lists.js";
8
+ import { parseTable } from "./tables.js";
9
+
10
/**
 * Process a fenced code block's info string (e.g. "javascript:example.js").
 * The text before the first colon is the language; everything after it is
 * the filename, so filenames that themselves contain colons are preserved.
 * @param {string} info - Code block info string (token.lang from marked)
 * @returns {{language: ?string, filename: ?string}} Language and filename
 */
function processCodeInfo(info) {
    if (!info) return { language: null, filename: null };

    // Split at the first colon only: "js:src:main.js" must yield
    // language "js" and filename "src:main.js", not a truncated filename.
    const colonIndex = info.indexOf(":");
    if (colonIndex === -1) {
        return { language: info, filename: null };
    }

    return {
        language: info.slice(0, colonIndex) || null,
        filename: info.slice(colonIndex + 1) || null,
    };
}
24
+
25
+ /**
26
+ * Clean code block text
27
+ * @param {string} text - Raw code block text
28
+ * @returns {string} Cleaned text
29
+ */
30
+ function cleanCodeText(text) {
31
+ // Remove common indent (for indented code blocks)
32
+ const lines = text.split("\n");
33
+ const indent = lines[0].match(/^\s*/)[0];
34
+ return lines
35
+ .map((line) =>
36
+ line.startsWith(indent) ? line.slice(indent.length) : line
37
+ )
38
+ .join("\n")
39
+ .trim();
40
+ }
41
+
42
/**
 * Parse a paragraph-like token's inline content.
 * Uses the inline tokens marked has already attached to the token (rather
 * than re-lexing token.text), so inline constructs such as code spans are
 * only processed once.
 * @param {Object} token - Marked token carrying a .tokens array (paragraph/heading)
 * @param {Object} schema - ProseMirror schema
 * @returns {Array} Array of ProseMirror inline nodes
 */
function parseParagraph(token, schema) {
    // Defensive: tolerate tokens that arrive without pre-parsed inline
    // tokens instead of throwing on .flatMap of undefined.
    const inlineTokens = token.tokens || [];
    return inlineTokens.flatMap((t) => parseInline(t, schema));
}
56
+
57
/**
 * Split a paragraph's inline nodes so that images become block-level:
 * runs of non-image nodes are wrapped in paragraphs, and each image node
 * is emitted directly between them.
 * @param {Array} content - Inline nodes parsed from a paragraph token
 * @returns {Array} Block-level nodes (paragraphs and images)
 */
function hoistImagesFromParagraph(content) {
    const blocks = [];
    let run = null; // current open run of non-image inline nodes

    for (const node of content) {
        if (node.type === "image") {
            // Close the open paragraph before emitting the image.
            if (run) {
                blocks.push({ type: "paragraph", content: run });
                run = null;
            }
            blocks.push(node);
        } else {
            if (!run) run = [];
            run.push(node);
        }
    }

    // Flush any trailing paragraph content.
    if (run) blocks.push({ type: "paragraph", content: run });
    return blocks;
}

/**
 * Parse a block-level marked token into ProseMirror node(s).
 * @param {Object} token - Marked block token
 * @param {Object} schema - ProseMirror schema
 * @returns {Object|Array|null} A ProseMirror block node, an array of nodes
 *     (paragraphs may split around hoisted images), or null for skipped
 *     tokens (HTML comments, unknown types)
 */
function parseBlock(token, schema) {
    // Skip HTML comments entirely.
    if (token.type === "html" && token.text.startsWith("<!--")) {
        return null;
    }

    if (token.type === "paragraph") {
        const content = parseParagraph(token, schema);
        if (!content.length) {
            return null;
        }
        // Images are block-level in this schema: lift them out of the paragraph.
        return hoistImagesFromParagraph(content);
    }

    if (token.type === "heading") {
        return {
            type: "heading",
            attrs: {
                level: token.depth,
                id: null, // heading IDs are assigned downstream, not here
            },
            content: parseParagraph(token, schema),
        };
    }

    if (token.type === "blockquote") {
        // Child tokens may parse to null (HTML comments, unknown types);
        // drop them so the blockquote content never contains null entries.
        const content = token.tokens
            .flatMap((t) => parseBlock(t, schema))
            .filter((node) => node != null);
        return {
            type: "blockquote",
            content,
        };
    }

    if (token.type === "hr") {
        return {
            type: "divider",
            attrs: { style: "dot", size: "normal" },
        };
    }

    if (token.type === "code") {
        // token.lang may carry "language:filename" (extended syntax).
        const { language, filename } = processCodeInfo(token.lang);
        return {
            type: "codeBlock",
            attrs: { language, filename },
            content: [
                {
                    type: "text",
                    text: cleanCodeText(token.text),
                },
            ],
        };
    }

    if (token.type === "list") {
        return parseList(token, schema);
    }

    if (token.type === "table") {
        return parseTable(token, schema);
    }

    // Unknown block types (e.g. "space") are skipped.
    return null;
}
169
+
170
+ export { parseBlock, parseParagraph };
@@ -0,0 +1,54 @@
1
+ /**
2
+ * @fileoverview Main parser orchestration
3
+ */
4
+
5
+ import { parseBlock } from "./block.js";
6
+ import { isEyebrowPattern, parseEyebrowPattern } from "./patterns.js";
7
+ import { isEmptyContent } from "./utils.js";
8
+
9
/**
 * Parse an array of marked block tokens into a ProseMirror document.
 * @param {Array} tokens - Array of marked block tokens
 * @param {Object} schema - ProseMirror schema
 * @returns {Object} ProseMirror document node ({ type: "doc", content })
 */
function parseMarkdownContent(tokens, schema) {
    const content = [];

    // Note: the previous implementation carried a `skipNext` flag that was
    // only ever set by commented-out eyebrow-pattern handling; both the
    // flag and the dead code have been removed.
    for (const token of tokens) {
        const node = parseBlock(token, schema);
        if (!node) continue;

        // Paragraphs may split into several nodes (images are hoisted out).
        if (Array.isArray(node)) {
            content.push(...node);
        } else {
            content.push(node);
        }
    }

    // Filter out any remaining null nodes and empty paragraphs.
    return {
        type: "doc",
        content: content.filter((node) => {
            if (!node) return false;
            if (node.type === "paragraph" && isEmptyContent(node.content))
                return false;
            return true;
        }),
    };
}
53
+
54
+ export { parseMarkdownContent };
@@ -0,0 +1,117 @@
1
+ /**
2
+ * @fileoverview Parse inline markdown elements
3
+ */
4
+
5
/**
 * Parse an inline markdown token into ProseMirror/Tiptap inline nodes.
 * @param {Object} token - Marked inline token
 * @param {Object} schema - ProseMirror schema (passed through to recursive
 *     calls; not otherwise used here)
 * @param {boolean} [removeNewLine=false] - Strip newlines from plain text
 *     (used for list items, where marked keeps soft line breaks in .raw)
 * @returns {Array} Array of ProseMirror inline nodes
 *
 * Implementation notes:
 * - Plain text uses token.raw to avoid marked's HTML-entity encoding.
 * - Bold/italic are marks on text nodes, not nested structures, so nested
 *   formatting like **_text_** flattens to one text node with both marks.
 * - HTML entities are decoded only for token types (codespan, link, image)
 *   where the processed .text content is needed.
 */
function parseInline(token, schema, removeNewLine = false) {
    if (token.type === "text") {
        // Use .raw for unencoded characters (', ", &, ...): marked's .text
        // encodes them as HTML entities. Work on a local copy so the
        // caller's token is never mutated (the old code reassigned
        // token.raw in place).
        let raw = token.raw;
        if (removeNewLine && raw) {
            raw = raw.replace(/\n/g, "");
        }
        return raw ? [{ type: "text", text: raw }] : [];
    }

    if (token.type === "strong" || token.type === "em") {
        // Apply the mark to every node produced by the nested tokens.
        const mark = { type: token.type === "strong" ? "bold" : "italic" };

        return token.tokens.flatMap((t) =>
            parseInline(t, schema, removeNewLine).map((node) => ({
                ...node,
                marks: [...(node.marks || []), mark],
            }))
        );
    }

    if (token.type === "html") {
        // Raw inline HTML is passed through as plain text.
        return [{ type: "text", text: token.raw }];
    }

    if (token.type === "br") {
        return [{ type: "text", text: "\n" }];
    }

    // Decode the HTML entities marked introduces into .text. Guard against
    // token types that carry no .text at all, which previously crashed
    // before the unknown-token fallback could run.
    const text = (token.text || "")
        .replace(/&#39;/g, "'")
        .replace(/&quot;/g, '"')
        .replace(/&amp;/g, "&");

    if (token.type === "codespan") {
        return [
            {
                type: "text",
                marks: [{ type: "code" }],
                text,
            },
        ];
    }

    if (token.type === "link") {
        // Extended syntax: [Text](button:URL) renders as a button mark.
        const isButton = token.href.startsWith("button:");
        const href = isButton ? token.href.substring(7) : token.href;

        return [
            {
                type: "text",
                marks: [
                    {
                        type: isButton ? "button" : "link",
                        attrs: {
                            href,
                            title: token.title || null,
                            ...(isButton && { variant: "primary" }),
                        },
                    },
                ],
                text,
            },
        ];
    }

    if (token.type === "image") {
        // Extended syntax: ![Alt](role:src) assigns a display role
        // (icon, background, gallery, ...). http(s) URLs keep role "image".
        let role, src;

        if (token.href.includes(":") && !token.href.startsWith("http")) {
            // Split at the first colon so the src may itself contain colons.
            const colonIndex = token.href.indexOf(":");
            role = token.href.substring(0, colonIndex);
            src = token.href.substring(colonIndex + 1);
        } else {
            role = "image";
            src = token.href;
        }

        return [
            {
                type: "image",
                attrs: {
                    src,
                    caption: token.title || null,
                    alt: text || null,
                    role,
                },
            },
        ];
    }

    // Unknown token types fall back to their raw text.
    return token.raw ? [{ type: "text", text: token.raw }] : [];
}
116
+
117
+ export { parseInline };
@@ -0,0 +1,106 @@
1
+ /**
2
+ * @fileoverview Parse markdown lists
3
+ */
4
+
5
+ import { marked } from "marked";
6
+ import { parseInline } from "./inline.js";
7
+
8
/**
 * Extract a list item's own text, dropping lines that belong to a nested
 * list (lines beginning with "-" or an ordered marker such as "1.").
 * @param {Object} item - List item token
 * @returns {string} The item's main content text
 */
function extractMainContent(item) {
    // A line starts a nested list entry if, once trimmed, it begins with a
    // bullet dash or an ordered-list number marker.
    const isNestedListLine = (line) => {
        const trimmed = line.trim();
        return trimmed.startsWith("-") || /^\d+\./.test(trimmed);
    };

    return (item.text || "")
        .split("\n")
        .filter((line) => !isNestedListLine(line))
        .join("\n");
}
24
+
25
/**
 * Parse a single list item's content into ProseMirror nodes.
 *
 * Two-phase approach:
 *   1. The item's own text (with nested-list lines stripped out by
 *      extractMainContent) is re-lexed inline and wrapped in a paragraph.
 *   2. Lines that look like nested list entries are collected verbatim and
 *      re-run through marked.lexer as a fresh markdown snippet; any list
 *      tokens that come back are appended after the paragraph.
 *
 * @param {Object} item - List item token
 * @param {Object} schema - ProseMirror schema
 * @returns {Array} Array of ProseMirror nodes for the item content
 */
function parseListItemContent(item, schema) {
    const mainContent = extractMainContent(item);
    // Re-lex just the main text; removeNewLine=true strips the soft line
    // breaks marked keeps inside list-item raw text.
    const inlineTokens = marked.Lexer.lexInline(mainContent);

    const content = [
        {
            type: "paragraph",
            content: inlineTokens.flatMap((t) => parseInline(t, schema, true)),
        },
    ];

    // Handle nested lists by parsing them as new markdown
    if (item.text) {
        const lines = item.text.split("\n");
        let currentNested = [];
        let isNested = false;

        for (const line of lines) {
            const trimmed = line.trim();
            if (trimmed.startsWith("-") || trimmed.match(/^\d+\./)) {
                // Keep the original (indented) line so marked can infer
                // nesting depth when the snippet is re-parsed.
                currentNested.push(line);
                isNested = true;
            } else if (isNested && trimmed === "") {
                // Blank lines after a nested entry are preserved so the
                // re-parsed snippet keeps its list grouping.
                // NOTE(review): non-blank continuation lines of a nested
                // item are dropped here — confirm this is intended.
                currentNested.push(line);
            }
        }

        if (currentNested.length > 0) {
            const nestedMarkdown = currentNested.join("\n");
            const nestedTokens = marked.lexer(nestedMarkdown);

            for (const token of nestedTokens) {
                if (token.type === "list") {
                    content.push({
                        type: token.ordered ? "orderedList" : "bulletList",
                        // Ordered lists carry their starting number; the
                        // attrs key is omitted entirely for bullet lists.
                        ...(token.ordered && {
                            attrs: { start: token.start || 1 },
                        }),
                        content: parseListItems(token.items, schema),
                    });
                }
            }
        }
    }

    return content;
}
78
+
79
/**
 * Build ProseMirror listItem nodes from marked list-item tokens,
 * recursing through parseListItemContent for nested lists.
 * @param {Array} items - Array of list item tokens
 * @param {Object} schema - ProseMirror schema
 * @returns {Array} Array of ProseMirror listItem nodes
 */
function parseListItems(items, schema) {
    const listItems = [];
    for (const item of items) {
        listItems.push({
            type: "listItem",
            content: parseListItemContent(item, schema),
        });
    }
    return listItems;
}
91
+
92
/**
 * Parse a marked list token into a ProseMirror list node.
 * @param {Object} token - List token from marked
 * @param {Object} schema - ProseMirror schema
 * @returns {Object} ProseMirror orderedList/bulletList node; ordered lists
 *     carry an attrs.start, bullet lists have no attrs key at all
 */
function parseList(token, schema) {
    // Preserve an explicit start of 0 ("0. item"): the old `start || 1`
    // collapsed it to 1. Fall back to 1 only when marked supplies no
    // numeric start.
    const attrs = token.ordered
        ? { attrs: { start: typeof token.start === "number" ? token.start : 1 } }
        : null;

    return {
        type: token.ordered ? "orderedList" : "bulletList",
        ...attrs,
        content: parseListItems(token.items, schema),
    };
}
105
+
106
+ export { parseList, parseListItems };