@fileverse/content-processor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -0
- package/dist/dom-setup.d.ts +4 -0
- package/dist/dom-setup.d.ts.map +1 -0
- package/dist/dom-setup.js +53 -0
- package/dist/dom-setup.js.map +1 -0
- package/dist/extensions.d.ts +13 -0
- package/dist/extensions.d.ts.map +1 -0
- package/dist/extensions.js +261 -0
- package/dist/extensions.js.map +1 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +94 -0
- package/dist/index.js.map +1 -0
- package/dist/markdown-parser.d.ts +5 -0
- package/dist/markdown-parser.d.ts.map +1 -0
- package/dist/markdown-parser.js +114 -0
- package/dist/markdown-parser.js.map +1 -0
- package/package.json +68 -0
- package/src/dom-setup.ts +60 -0
- package/src/extensions.ts +294 -0
- package/src/index.ts +117 -0
- package/src/markdown-parser.ts +148 -0
- package/test/sample.md +80 -0
- package/test/test.ts +30 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import MarkdownIt from 'markdown-it';
|
|
2
|
+
import markdownItFootnote from 'markdown-it-footnote';
|
|
3
|
+
import { createDOMPurifyInstance, parseHTMLString } from './dom-setup.js';
|
|
4
|
+
|
|
5
|
+
// Module-level markdown-it renderer shared by every conversion call;
// the footnote plugin is registered once, at import time.
const markdownIt = new MarkdownIt();
markdownIt.use(markdownItFootnote);
|
|
6
|
+
|
|
7
|
+
/**
 * Removes a leading `---` ... `---` frontmatter block from a markdown string,
 * along with the blank lines that directly follow it.
 *
 * Both LF and CRLF line endings are accepted, so files saved on Windows have
 * their frontmatter stripped as well. When no frontmatter is present, only a
 * leading run of blank/whitespace-only lines is removed.
 *
 * @param markdown - Raw markdown source, optionally starting with frontmatter.
 * @returns The markdown with the frontmatter block and leading blank lines removed.
 */
export function stripFrontmatter(markdown: string): string {
  // `\r?\n` instead of `\n` so that "---\r\n" delimiters are recognised too.
  const fmRegex = /^---\r?\n[\s\S]*?\r?\n---(?:\r?\n)*/;
  // The second replace trims any whitespace-only prefix ending in a newline.
  return markdown.replace(fmRegex, '').replace(/^\s*\n/, '');
}
|
|
11
|
+
|
|
12
|
+
/**
 * Renders markdown to HTML with the module's markdown-it instance.
 *
 * Frontmatter is stripped first. Asterisks that sit directly between two
 * digits, between `]` and `[`, or between `)` and `(` are backslash-escaped
 * before rendering so they are emitted literally instead of being parsed as
 * emphasis markers.
 *
 * @param markdown - Raw markdown source (may include frontmatter).
 * @returns The HTML string produced by markdown-it.
 */
export function parseMarkdownToHTML(markdown: string): string {
  const cleanMarkdown = stripFrontmatter(markdown)
    // The trailing digit is matched with a lookahead so it is not consumed:
    // in chains such as `1*2*3*4` every asterisk gets escaped, not every
    // other one.
    .replace(/(\d)\*(?=\d)/g, '$1\\*')
    .replace(/(\])\*(\[)/g, '$1\\*$2')
    .replace(/(\))\*(\()/g, '$1\\*$2');

  return markdownIt.render(cleanMarkdown);
}
|
|
23
|
+
|
|
24
|
+
/**
 * Rewrites rendered HTML into the markup shape the editor expects and
 * sanitizes the result.
 *
 * Steps, in order:
 *  1. remove completely empty top-level `<p>` elements;
 *  2. turn `<aside class="callout">` into `<aside data-type="callout">` and
 *     drop paragraphs inside callouts that hold only `<br>`/whitespace;
 *  3. mark `<ul>` lists whose items start with `[ ]` / `[x]` as task lists;
 *  4. unwrap paragraphs that contain a single `<img>` and replace paragraphs
 *     whose only content is `===` with a page-break `<div>`;
 *  5. on the serialized HTML, convert `^text^` to `<sup>`, `~text~` to `<sub>`
 *     and a line-ending `===` to a page-break `<div>`;
 *  6. sanitize with DOMPurify, allowing the extra tags/attributes used above.
 *
 * @param html - HTML string (for example the output of `parseMarkdownToHTML`).
 * @returns The sanitized, editor-ready HTML string.
 */
export function processHTMLForEditor(html: string): string {
  const doc = parseHTMLString(html);
  const body = doc.body;

  // 1. Empty top-level paragraphs.
  const topLevelPs = body.querySelectorAll(':scope > p');
  topLevelPs.forEach((p) => {
    if (p.childNodes.length === 0 && p.textContent === '') {
      p.remove();
    }
  });

  // 2a. Normalise callouts from class-based to data-type-based markup.
  const calloutAsides = body.querySelectorAll('aside.callout');
  calloutAsides.forEach((el) => {
    el.setAttribute('data-type', 'callout');
    el.removeAttribute('class');
  });

  // 2b. Remove paragraphs inside callouts that contain nothing but <br>
  //     elements and whitespace-only text.
  const callouts = body.querySelectorAll('aside[data-type="callout"]');
  callouts.forEach((aside) => {
    const ps = aside.querySelectorAll('p');
    ps.forEach((p) => {
      const isEmpty = Array.from(p.childNodes).every((node) => {
        if (node.nodeType === 1) {
          return (node as Element).nodeName === 'BR';
        }
        if (node.nodeType === 3) {
          return node.textContent?.trim() === '';
        }
        return false;
      });
      if (isEmpty && p.parentNode) {
        p.parentNode.removeChild(p);
      }
    });
  });

  // 3. Task lists.
  const todoRegex = /^\[([ x])\]\s*(.*)/i;
  const leadingMarkerRegex = /^\[[ x]\]\s*/i;
  const lists = body.getElementsByTagName('ul');
  for (let i = 0; i < lists.length; i++) {
    const list = lists[i];
    const items = list.getElementsByTagName('li');
    let isTodoList = false;

    for (let j = 0; j < items.length; j++) {
      const item = items[j];
      // getElementsByTagName also returns items of nested lists; those are
      // handled when the nested <ul> itself is visited by the outer loop.
      if (item.parentNode !== list) {
        continue;
      }

      const text = item.textContent || '';
      const todoMatch = text.match(todoRegex);
      if (!todoMatch) {
        continue;
      }

      isTodoList = true;
      const isChecked = todoMatch[1].toLowerCase() === 'x';
      item.setAttribute('data-type', 'taskItem');
      item.setAttribute('data-checked', isChecked.toString());

      // Strip only the "[ ]" / "[x]" marker from the leading text node so that
      // inline markup and nested lists inside the item survive. Overwriting
      // item.textContent would discard every child element of the <li>.
      const lead = item.firstChild;
      if (
        lead &&
        lead.nodeType === 3 &&
        leadingMarkerRegex.test(lead.textContent ?? '')
      ) {
        lead.textContent = (lead.textContent ?? '').replace(
          leadingMarkerRegex,
          ''
        );
      } else {
        // Marker is not in a plain leading text node: fall back to replacing
        // the whole item content with the matched text.
        item.textContent = todoMatch[2];
      }
    }

    if (isTodoList) {
      list.setAttribute('data-type', 'taskList');
    }
  }

  // 4. Single-child paragraphs: unwrap lone images, convert "===" to a page
  //    break. Iterating backwards because `paragraphs` is a live collection
  //    that shrinks as paragraphs are replaced.
  const paragraphs = body.getElementsByTagName('p');
  for (let i = paragraphs.length - 1; i >= 0; i--) {
    const p = paragraphs[i];
    if (p.childNodes.length !== 1) {
      continue;
    }

    const onlyChild = p.firstChild;
    if (onlyChild?.nodeName === 'IMG') {
      p.parentNode?.replaceChild(onlyChild, p);
    } else if (onlyChild?.textContent?.trim() === '===') {
      const pageBreakDiv = doc.createElement('div');
      pageBreakDiv.setAttribute('data-type', 'page-break');
      pageBreakDiv.setAttribute('data-page-break', 'true');
      p.parentNode?.replaceChild(pageBreakDiv, p);
    }
  }

  // 5. String-level replacements on the serialized markup.
  // NOTE(review): these patterns run over the whole HTML string, so they also
  // see attribute values and code content (e.g. a URL containing two `~`).
  // Confirm whether they should be limited to text nodes.
  let processedHtml = body.innerHTML;

  const superscriptRegex = /\^([^\s^]+)\^/g;
  const subscriptRegex = /~([^\s~](?:[^~]*[^\s~])?)~/g;
  const pageBreakRegex = /===\s*$/gm;

  processedHtml = processedHtml.replace(
    superscriptRegex,
    '<sup data-type="sup">$1</sup>'
  );

  processedHtml = processedHtml.replace(
    subscriptRegex,
    '<sub data-type="sub">$1</sub>'
  );

  processedHtml = processedHtml.replace(
    pageBreakRegex,
    '<div data-type="page-break" data-page-break="true"></div>'
  );

  // 6. Sanitize last, so markup injected by the replacements above is
  //    covered as well.
  const DOMPurify = createDOMPurifyInstance();
  processedHtml = DOMPurify.sanitize(processedHtml, {
    ADD_TAGS: ['div', 'aside'],
    ADD_ATTR: [
      'data-type',
      'data-page-break',
      'data-checked',
      'url',
      'src',
      'media-type',
      'alt',
      'title',
      'width',
      'height',
    ],
  });

  return processedHtml;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Full markdown pipeline: renders the markdown to HTML, then post-processes
 * that HTML for the editor.
 *
 * @param markdown - Raw markdown source.
 * @returns The HTML returned by `processHTMLForEditor`.
 */
export function convertMarkdownToHTML(markdown: string): string {
  return processHTMLForEditor(parseMarkdownToHTML(markdown));
}
|
package/test/sample.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Test Document
|
|
3
|
+
date: 2024-01-01
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Heading 1
|
|
7
|
+
|
|
8
|
+
This is a paragraph with **bold** and *italic* text.
|
|
9
|
+
|
|
10
|
+
## Heading 2
|
|
11
|
+
|
|
12
|
+
Here's a list:
|
|
13
|
+
|
|
14
|
+
- Item 1
|
|
15
|
+
- Item 2
|
|
16
|
+
- Item 3
|
|
17
|
+
|
|
18
|
+
### Ordered List
|
|
19
|
+
|
|
20
|
+
1. First item
|
|
21
|
+
2. Second item
|
|
22
|
+
3. Third item
|
|
23
|
+
|
|
24
|
+
### Task List
|
|
25
|
+
|
|
26
|
+
- [ ] Unchecked task
|
|
27
|
+
- [x] Checked task
|
|
28
|
+
- [ ] Another task
|
|
29
|
+
|
|
30
|
+
## Code
|
|
31
|
+
|
|
32
|
+
Inline `code` example.
|
|
33
|
+
|
|
34
|
+
```javascript
|
|
35
|
+
function hello() {
|
|
36
|
+
console.log("Hello, World!");
|
|
37
|
+
}
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Links and Images
|
|
41
|
+
|
|
42
|
+
[Link text](https://example.com)
|
|
43
|
+
|
|
44
|
+

|
|
45
|
+
|
|
46
|
+
## Table
|
|
47
|
+
|
|
48
|
+
| Header 1 | Header 2 |
|
|
49
|
+
|----------|----------|
|
|
50
|
+
| Cell 1 | Cell 2 |
|
|
51
|
+
| Cell 3 | Cell 4 |
|
|
52
|
+
|
|
53
|
+
## Blockquote
|
|
54
|
+
|
|
55
|
+
> This is a blockquote.
|
|
56
|
+
> It can span multiple lines.
|
|
57
|
+
|
|
58
|
+
## Page Break
|
|
59
|
+
|
|
60
|
+
===
|
|
61
|
+
|
|
62
|
+
## More Content
|
|
63
|
+
|
|
64
|
+
Content after page break.
|
|
65
|
+
|
|
66
|
+
<aside class="callout">
|
|
67
|
+
This is a callout block.
|
|
68
|
+
</aside>
|
|
69
|
+
|
|
70
|
+
## Superscript and Subscript
|
|
71
|
+
|
|
72
|
+
E = mc^2^
|
|
73
|
+
|
|
74
|
+
H~2~O
|
|
75
|
+
|
|
76
|
+
## Footnote
|
|
77
|
+
|
|
78
|
+
This has a footnote[^1].
|
|
79
|
+
|
|
80
|
+
[^1]: This is the footnote content.
|
package/test/test.ts
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
import { markdownToYjs, isYjsEncoded } from '../dist/index.js';
|
|
5
|
+
|
|
6
|
+
// Manual smoke test: converts test/sample.md with the built package and prints
// a short summary. Exits with status 1 if the conversion throws.

// ES modules have no __filename/__dirname; derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const sampleMarkdown = fs.readFileSync(
  path.join(__dirname, 'sample.md'),
  'utf-8'
);

console.log('Testing markdown-to-yjs converter...\n');
console.log('Input markdown length:', sampleMarkdown.length, 'characters');
console.log('---\n');

try {
  // Convert the sample document that was just read (and whose length is
  // reported above) rather than an unrelated hard-coded string.
  const yjsContent = markdownToYjs(sampleMarkdown);

  console.log('Output Yjs content length:', yjsContent.length, 'characters');
  console.log('Is valid Yjs encoded:', isYjsEncoded(yjsContent));
  console.log('\nFirst 100 characters of Yjs content:');
  // Print only the advertised prefix, not the whole payload.
  console.log(yjsContent.slice(0, 100));
  console.log('\n---');
  console.log('Conversion successful!');
} catch (error) {
  console.error('Conversion failed:', error);
  process.exit(1);
}
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"target": "ES2020",
|
|
4
|
+
"module": "ESNext",
|
|
5
|
+
"moduleResolution": "bundler",
|
|
6
|
+
"lib": ["ES2020", "DOM"],
|
|
7
|
+
"declaration": true,
|
|
8
|
+
"declarationMap": true,
|
|
9
|
+
"sourceMap": true,
|
|
10
|
+
"outDir": "./dist",
|
|
11
|
+
"rootDir": "./src",
|
|
12
|
+
"strict": true,
|
|
13
|
+
"esModuleInterop": true,
|
|
14
|
+
"skipLibCheck": true,
|
|
15
|
+
"forceConsistentCasingInFileNames": true,
|
|
16
|
+
"resolveJsonModule": true
|
|
17
|
+
},
|
|
18
|
+
"include": ["src/**/*"],
|
|
19
|
+
"exclude": ["node_modules", "dist"]
|
|
20
|
+
}
|