hast-latex 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +11 -0
- package/dist/index.mjs +241 -0
- package/package.json +61 -0
- package/readme.md +99 -0
package/dist/index.d.mts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import * as Latex from "@unified-latex/unified-latex-types";
|
|
2
|
+
import { Root } from "hast";
|
|
3
|
+
import { Plugin } from "unified";
|
|
4
|
+
|
|
5
|
+
//#region lib/unified-hast-to-latex/index.d.ts
|
|
6
|
+
/**
 * Options accepted by the `rehypeUnifiedLatex` plugin.
 */
interface RehypeUnifiedLatexOptions {
  /** Optional document class; one of the three listed literals. */
  documentClass?: 'article' | 'report' | 'book';
}
|
|
9
|
+
/**
 * unified plugin declaration: takes an optional `RehypeUnifiedLatexOptions`
 * (or `null`/`undefined`), a hast `Root` as input tree and a unified-latex
 * `Root` as output tree.
 */
declare const rehypeUnifiedLatex: Plugin<[(RehypeUnifiedLatexOptions | null | undefined)?], Root, Latex.Root>;
|
|
10
|
+
//#endregion
|
|
11
|
+
export { type RehypeUnifiedLatexOptions, rehypeUnifiedLatex };
|
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import { args, m } from "@unified-latex/unified-latex-builder";
|
|
2
|
+
import { visit } from "unist-util-visit";
|
|
3
|
+
|
|
4
|
+
//#region lib/utils/getClassList.ts
|
|
5
|
+
/**
 * Return the CSS class names of a node as an array of strings.
 *
 * `properties.className` may be an array (each entry is stringified) or a
 * whitespace-separated string. Any other value, or a node without
 * `properties`, yields an empty array.
 *
 * @param {{ properties?: { className?: unknown } }} node
 * @returns {string[]} class names, never containing empty strings for string input
 */
function getClassList(node) {
  const className = node.properties?.className;
  if (Array.isArray(className)) return className.map(String);
  if (typeof className === "string") {
    // Leading/trailing whitespace makes split() emit "" tokens; drop them.
    return className.split(/\s+/).filter(Boolean);
  }
  return [];
}
|
|
12
|
+
|
|
13
|
+
//#endregion
|
|
14
|
+
//#region lib/unified-hast-to-latex/collect-body.ts
|
|
15
|
+
/**
 * Locate the `<body>` element of a hast tree.
 *
 * Looks for an `html` element among the root's direct children and then for
 * a `body` element among that element's direct children.
 *
 * @param {{ children: object[] }} tree
 * @returns {object | undefined} the body element, or `undefined` when either level is missing
 */
function getBody(tree) {
  const isElementNamed = (tagName) => (candidate) =>
    candidate.type === "element" && candidate.tagName === tagName;
  const htmlElement = tree.children.find(isElementNamed("html"));
  return htmlElement ? htmlElement.children.find(isElementNamed("body")) : undefined;
}
|
|
20
|
+
/**
 * Convert a single hast node into LaTeX AST nodes.
 *
 * Text nodes are tokenised; elements are dispatched in this order: heading,
 * page number (dropped), chapter block, then by tag name (`p`, `span`,
 * `i`/`em`/`b`/`strong`). Anything else produces no output.
 *
 * @param {object} node hast node
 * @returns {object[] | object} whatever the selected converter returns; `[]` for unhandled nodes
 */
function hastNodeToLatex(node) {
  switch (node.type) {
    case "text":
      return textToLatexNodes(node.value);
    case "element":
      break;
    default:
      return [];
  }

  // Class/heading based checks take precedence over the plain tag-name dispatch.
  if (isHeading(node)) return [convertHeading(node)];
  if (isPageNumber(node)) return [];
  if (isChapterBlock(node)) return convertChapterBlock(node);

  switch (node.tagName) {
    case "p":
      return convertParagraph(node);
    case "span":
      return convertSpan(node);
    case "i":
    case "em":
    case "b":
    case "strong":
      return maybeApplyMacro(node, flattenText(node.children));
    default:
      return [];
  }
}
|
|
35
|
+
/**
 * Tell whether a node carries the `page-number` or `pagenum` class.
 *
 * @param {object} node hast node
 * @returns {boolean}
 */
function isPageNumber(node) {
  return getClassList(node).some((name) => name === "page-number" || name === "pagenum");
}
|
|
39
|
+
/**
 * Tell whether an element's tag name is one of the keys of
 * `HEADING_TAG_TO_MACRO`.
 *
 * Uses an own-property check: the `in` operator would also match inherited
 * names such as `constructor` or `toString`.
 *
 * @param {{ tagName?: string }} node
 * @returns {boolean}
 */
function isHeading(node) {
  return Object.hasOwn(HEADING_TAG_TO_MACRO, node.tagName);
}
|
|
42
|
+
/**
 * Build the LaTeX macro node for a heading element.
 *
 * The macro name comes from `HEADING_TAG_TO_MACRO` (falling back to
 * `"section"`). Four arguments are emitted, matching the four entries of
 * `HEADING_RENDER_INFO.namedArguments`: an unbraced star slot (filled with
 * `*` when the element has the `starred` class), two empty unbraced slots,
 * and the braced title built from the element's flattened text.
 *
 * @param {{ tagName: string, children: object[] }} node heading element
 * @returns {object} a `macro` node
 */
function convertHeading(node) {
  const macroName = HEADING_TAG_TO_MACRO[node.tagName] ?? "section";
  const isStarred = getClassList(node).includes("starred");
  const titleContent = flattenText(node.children);

  // Each slot gets its own object: sharing one instance between two slots
  // would make any later in-place edit of one argument affect the other.
  const unbracedArg = (content) => ({
    type: "argument",
    content,
    openMark: "",
    closeMark: ""
  });

  return {
    type: "macro",
    content: macroName,
    _renderInfo: HEADING_RENDER_INFO,
    args: [
      unbracedArg(isStarred ? [{ type: "string", content: "*" }] : []),
      unbracedArg([]),
      unbracedArg([]),
      {
        type: "argument",
        content: titleContent,
        openMark: "{",
        closeMark: "}"
      }
    ]
  };
}
|
|
78
|
+
/**
 * Convert a paragraph element by converting each child and flattening the
 * per-child results one level into a single list.
 *
 * @param {{ children: object[] }} node paragraph element
 * @returns {object[]}
 */
function convertParagraph(node) {
  const perChild = node.children.map((child) => hastNodeToLatex(child));
  return perChild.flat();
}
|
|
81
|
+
/**
 * Convert a `span` element: flatten its text and let `maybeApplyMacro`
 * decide whether to wrap it.
 *
 * @param {{ children: object[] }} node span element
 * @returns {object[] | object} the result of `maybeApplyMacro`
 */
function convertSpan(node) {
  const flattened = flattenText(node.children);
  return maybeApplyMacro(node, flattened);
}
|
|
84
|
+
/**
 * Report whether the first element found at or after `startIndex` is a `p`.
 *
 * Non-element nodes are passed over; the first element encountered decides
 * the answer. Returns `false` when no element follows.
 *
 * @param {object[]} nodes sibling list
 * @param {number} startIndex index to start scanning from
 * @returns {boolean}
 */
function hasFollowingParagraph(nodes, startIndex) {
  let index = startIndex;
  while (index < nodes.length) {
    const candidate = nodes[index];
    index += 1;
    const isBlankText = candidate.type === "text" && candidate.value.trim() === "";
    if (isBlankText) continue;
    if (candidate.type === "element") return candidate.tagName === "p";
  }
  return false;
}
|
|
93
|
+
/**
 * Tell whether a node is a `p` element.
 *
 * @param {{ type: string, tagName?: string }} node
 * @returns {boolean}
 */
function isParagraph(node) {
  if (node.type !== "element") return false;
  return node.tagName === "p";
}
|
|
96
|
+
/**
 * Tokenise a text value into LaTeX AST nodes.
 *
 * - Whitespace-only input yields `[]`.
 * - Typographic apostrophes (U+2019) are normalised to `'`.
 * - Runs of whitespace become `{ type: "whitespace" }`.
 * - Apostrophes and the punctuation `. , ! ? ; :` become their own string nodes.
 * - The LaTeX special characters `& % $ # _ { }` become single-character
 *   macro nodes (`\&`, `\%`, ...) so they are not emitted as raw syntax.
 * - Everything else becomes `{ type: "string", content }`.
 *
 * NOTE(review): `\`, `~` and `^` are still emitted verbatim; escaping them
 * needs word-like macros (`\textbackslash` etc.) and is left untouched here.
 *
 * @param {string} value raw text
 * @returns {object[]} list of `whitespace`, `string` and `macro` nodes
 */
function textToLatexNodes(value) {
  if (!value.trim()) return [];
  const escapable = "&%$#_{}";
  return value
    .replace(/\u2019/g, "'")
    // After the replace above only the ASCII apostrophe can occur.
    .split(/(\s+|'|[.,!?;:]|[&%$#_{}])/)
    .filter(Boolean)
    .map((part) => {
      if (/^\s+$/.test(part)) return { type: "whitespace" };
      if (part.length === 1 && escapable.includes(part)) {
        return { type: "macro", content: part };
      }
      return { type: "string", content: part };
    });
}
|
|
110
|
+
/**
 * Tell whether a node is a `div` carrying the `chapter` class.
 *
 * @param {{ tagName?: string }} node
 * @returns {boolean}
 */
function isChapterBlock(node) {
  const classNames = getClassList(node);
  if (!classNames.includes("chapter")) return false;
  return node.tagName === "div";
}
|
|
113
|
+
/**
 * Convert a chapter block into `\chapter{...}` and, optionally, `\section*{...}`.
 *
 * Every heading element inside the block is visited and its text content
 * collected. The first non-empty heading text becomes the chapter title, the
 * second (if any) becomes a `section*` macro. Nothing else inside the block
 * is emitted. Without any non-empty heading the result is `[]`.
 *
 * Heading text is whitespace-normalised (runs collapsed to one space, ends
 * trimmed) so that source line breaks or blank lines inside the HTML heading
 * do not end up inside the macro argument.
 *
 * @param {object} node chapter block element
 * @returns {object[]} zero, one or two `macro` nodes
 */
function convertChapterBlock(node) {
  // Builds `\name{text}` with the text as a single string node.
  const bracedMacro = (name, text) => ({
    type: "macro",
    content: name,
    args: [{
      closeMark: "}",
      openMark: "{",
      type: "argument",
      content: [{
        type: "string",
        content: text
      }]
    }]
  });

  const headingTexts = [];
  visit(node, (descendant) => {
    if (descendant.type !== "element" || !isHeading(descendant)) return;
    let text = "";
    visit(descendant, (inner) => {
      if (inner.type === "text") text += inner.value;
    });
    const normalized = text.replace(/\s+/g, " ").trim();
    if (normalized !== "") headingTexts.push(normalized);
  });

  const [chapterTitle, chapterSubtitle] = headingTexts;
  if (!chapterTitle) return [];

  const result = [bracedMacro("chapter", chapterTitle)];
  if (chapterSubtitle) result.push(bracedMacro("section*", chapterSubtitle));
  return result;
}
|
|
153
|
+
/**
 * Wrap already-converted children in an inline formatting macro when the
 * element calls for one.
 *
 * Selection (later rules override earlier ones):
 *   - class `smcap`        -> `textsc`
 *   - tag `i` / `em`       -> `textit`
 *   - tag `b` / `strong`   -> `textbf`
 *
 * The result is always an array so callers can spread or flatten it
 * uniformly: either `children` unchanged, or a one-element array holding the
 * macro. The children are placed inside one explicit braced argument so the
 * whole run of nodes forms a single `{...}` group.
 *
 * @param {{ tagName?: string }} element source hast element
 * @param {object[]} children LaTeX nodes produced from the element's content
 * @returns {object[]}
 */
function maybeApplyMacro(element, children) {
  const { tagName } = element;
  let macroName = null;
  if (getClassList(element).includes("smcap")) macroName = "textsc";
  if (tagName === "i" || tagName === "em") macroName = "textit";
  if (tagName === "b" || tagName === "strong") macroName = "textbf";
  if (!macroName) return children;

  const bracedChildren = {
    type: "argument",
    content: children,
    openMark: "{",
    closeMark: "}"
  };
  return [m(macroName, [bracedChildren])];
}
|
|
162
|
+
/**
 * Collect the text of a list of hast nodes as LaTeX nodes, descending into
 * elements and discarding the element wrappers themselves.
 *
 * @param {object[]} children hast nodes
 * @returns {object[]} tokenised text of all text descendants, in document order
 */
function flattenText(children) {
  const collected = [];
  for (const child of children) {
    if (child.type === "text") {
      collected.push(...textToLatexNodes(child.value));
    } else if (child.type === "element") {
      collected.push(...flattenText(child.children));
    }
  }
  return collected;
}
|
|
169
|
+
/**
 * Tell whether a node has a non-empty `className` property: a non-empty
 * array, or a string containing something other than whitespace.
 *
 * @param {{ properties?: { className?: unknown } }} node
 * @returns {boolean}
 */
function hasClassList(node) {
  const className = node.properties?.className;
  if (Array.isArray(className)) return className.length > 0;
  if (typeof className !== "string") return false;
  return className.trim() !== "";
}
|
|
173
|
+
// Render info attached (as `_renderInfo`) to every heading macro.
// `namedArguments` has one entry per argument slot of the heading macro.
const HEADING_RENDER_INFO = {
  breakAround: true,
  namedArguments: ["starred", null, "tocTitle", "title"]
};
|
|
182
|
+
// Heading tag name -> LaTeX macro name. Every level currently maps to "section".
const HEADING_TAG_TO_MACRO = Object.fromEntries(
  ["h1", "h2", "h3", "h4", "h5", "h6"].map((tagName) => [tagName, "section"])
);
|
|
190
|
+
|
|
191
|
+
//#endregion
|
|
192
|
+
//#region lib/unified-hast-to-latex/collect-meta.ts
|
|
193
|
+
/**
 * Locate the `<head>` element of a hast tree: the `head` child of the `html`
 * element found among the root's direct children.
 *
 * @param {{ children: object[] }} tree
 * @returns {object | undefined} the head element, or `undefined` when `html` or `head` is absent
 */
function getHead(tree) {
  for (const rootChild of tree.children) {
    if (rootChild.type !== "element" || rootChild.tagName !== "html") continue;
    // Only the first `html` element is considered.
    for (const htmlChild of rootChild.children) {
      if (htmlChild.type === "element" && htmlChild.tagName === "head") return htmlChild;
    }
    return undefined;
  }
  return undefined;
}
|
|
198
|
+
/**
 * Insert preamble macros derived from the HTML `<head>` into a LaTeX AST.
 *
 * When the tree has a head, `\usepackage[T1]{fontenc}` is always added,
 * followed by `\author{...}` / `\title{...}` for each `<meta>` whose `name`
 * (case-insensitive) is `author`/`dc.creator` or `title`/`dc.title` and whose
 * `content` is a string. The macros are spliced in directly before the
 * `\begin{document}` macro, or prepended when none is found.
 *
 * `latexAst.content` is modified in place and the same object is returned.
 *
 * @param {object} tree hast root
 * @param {{ content: object[] }} latexAst LaTeX root to extend
 * @returns {{ content: object[] }} `latexAst`
 */
function applyMetaToLatex(tree, latexAst) {
  const head = getHead(tree);
  if (!head) return latexAst;

  const authorNames = ["author", "dc.creator"];
  const titleNames = ["title", "dc.title"];
  const metaNodes = [m("usepackage", args(["T1", "fontenc"], { braces: "[]{}" }))];

  head.children
    .filter((child) => child.type === "element" && child.tagName === "meta")
    .forEach((meta) => {
      const name = meta.properties?.name;
      const content = meta.properties?.content;
      if (typeof name !== "string" || typeof content !== "string") return;
      const key = name.toLowerCase();
      if (authorNames.includes(key)) metaNodes.push(m("author", content));
      else if (titleNames.includes(key)) metaNodes.push(m("title", content));
    });

  const isBeginDocument = (node) => {
    if (node.type !== "macro" || node.content !== "begin") return false;
    const firstArgNode = node.args?.[0]?.content?.[0];
    return firstArgNode?.type === "string" && firstArgNode?.content === "document";
  };

  const insertAt = latexAst.content.findIndex(isBeginDocument);
  if (insertAt === -1) latexAst.content.unshift(...metaNodes);
  else latexAst.content.splice(insertAt, 0, ...metaNodes);
  return latexAst;
}
|
|
215
|
+
|
|
216
|
+
//#endregion
|
|
217
|
+
//#region lib/unified-hast-to-latex/index.ts
|
|
218
|
+
/**
 * unified plugin: turn a hast tree into a LaTeX AST root.
 *
 * The produced content is, in order: `\documentclass{...}`,
 * `\begin{document}`, the converted children of `<body>` (with a `parbreak`
 * between a paragraph and a following paragraph), and `\end{document}`.
 * Head metadata is then applied via `applyMetaToLatex`.
 *
 * @param {{ documentClass?: string } | null} [options] `documentClass` defaults to `"book"`
 * @returns {(tree: object) => object} transformer returning a `{ type: "root", content }` node
 */
const rehypeUnifiedLatex = (options = { documentClass: "book" }) => {
  return (tree) => {
    const bodyChildren = getBody(tree)?.children ?? [];
    const content = [
      m("documentclass", options?.documentClass ?? "book"),
      m("begin", "document")
    ];

    bodyChildren.forEach((child, index) => {
      const converted = hastNodeToLatex(child);
      // A converter may hand back a single node instead of a list; spreading
      // a non-iterable would throw, so normalise before appending.
      if (Array.isArray(converted)) content.push(...converted);
      else content.push(converted);

      if (isParagraph(child) && hasFollowingParagraph(bodyChildren, index + 1)) {
        content.push({ type: "parbreak" });
      }
    });

    content.push(m("end", "document"));
    return applyMetaToLatex(tree, {
      type: "root",
      content
    });
  };
};
|
|
239
|
+
|
|
240
|
+
//#endregion
|
|
241
|
+
export { rehypeUnifiedLatex };
|
package/package.json
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "hast-latex",
|
|
3
|
+
"description": "Converts HAST to Unified LaTeX AST",
|
|
4
|
+
"version": "0.0.0",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.mjs",
|
|
7
|
+
"types": "dist/index.d.mts",
|
|
8
|
+
"author": {
|
|
9
|
+
"name": "Evan Hennessy",
|
|
10
|
+
"url": "https://www.hennessyevan.com"
|
|
11
|
+
},
|
|
12
|
+
"files": [
|
|
13
|
+
"dist"
|
|
14
|
+
],
|
|
15
|
+
"keywords": [
|
|
16
|
+
"unist",
|
|
17
|
+
"ast",
|
|
18
|
+
"latex",
|
|
19
|
+
"rehype",
|
|
20
|
+
"unified",
|
|
21
|
+
"hast",
|
|
22
|
+
"transformer"
|
|
23
|
+
],
|
|
24
|
+
"engines": {
|
|
25
|
+
"node": ">=22.18.0"
|
|
26
|
+
},
|
|
27
|
+
"license": "MIT",
|
|
28
|
+
"dependencies": {
|
|
29
|
+
"@unified-latex/unified-latex": "^1.8.3",
|
|
30
|
+
"@unified-latex/unified-latex-builder": "^1.8.3",
|
|
31
|
+
"@unified-latex/unified-latex-lint": "^1.8.3",
|
|
32
|
+
"@unified-latex/unified-latex-to-hast": "^1.8.3",
|
|
33
|
+
"@unified-latex/unified-latex-types": "^1.8.0",
|
|
34
|
+
"@unified-latex/unified-latex-util-parse": "^1.8.3",
|
|
35
|
+
"@unified-latex/unified-latex-util-print-raw": "^1.8.0",
|
|
36
|
+
"@unified-latex/unified-latex-util-render-info": "^1.8.3",
|
|
37
|
+
"@unified-latex/unified-latex-util-to-string": "^1.8.3",
|
|
38
|
+
"hast-util-class-list": "^2.0.1",
|
|
39
|
+
"hast-util-from-html": "^2.0.3",
|
|
40
|
+
"hast-util-select": "^6.0.4",
|
|
41
|
+
"hast-util-to-string": "^3.0.1",
|
|
42
|
+
"prettier-plugin-latex": "^2.0.1",
|
|
43
|
+
"rehype-parse": "^9.0.1",
|
|
44
|
+
"tiny-invariant": "^1.3.3",
|
|
45
|
+
"unified": "^11.0.5",
|
|
46
|
+
"unist-util-inspect": "^8.1.0",
|
|
47
|
+
"unist-util-remove": "^4.0.0",
|
|
48
|
+
"unist-util-select": "^5.1.0",
|
|
49
|
+
"unist-util-visit": "^5.0.0"
|
|
50
|
+
},
|
|
51
|
+
"devDependencies": {
|
|
52
|
+
"@types/hast": "^3.0.4",
|
|
53
|
+
"tsdown": "0.19.0-beta.3",
|
|
54
|
+
"typescript": "^5.9.3"
|
|
55
|
+
},
|
|
56
|
+
"scripts": {
|
|
57
|
+
"example": "node ./example.ts",
|
|
58
|
+
"build": "tsdown ./index.ts",
|
|
59
|
+
"release": "pnpm version patch && pnpm build && git add . && git commit -m 'chore: release' && git push && git push --tags && pnpm publish"
|
|
60
|
+
}
|
|
61
|
+
}
|
package/readme.md
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# hast-latex
|
|
2
|
+
|
|
3
|
+
A small unifiedJS plugin that turns an HTML/HAST tree into a LaTeX AST (see [unified-latex](https://github.com/siefkenj/unified-latex)) so you can render, lint, or emit `.tex` from HTML sources. Designed for book-like documents (ebooks, etc.) where you want headings, chapters, metadata, and inline formatting to map cleanly into LaTeX.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install hast-latex rehype-parse unified @unified-latex/unified-latex-util-to-string
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick start
|
|
12
|
+
|
|
13
|
+
The plugin consumes a HAST tree (e.g., from `rehype-parse`) and returns a unified-latex AST you can stringify with `@unified-latex/unified-latex-util-to-string`.
|
|
14
|
+
|
|
15
|
+
```ts
|
|
16
|
+
import { unified } from 'unified'
|
|
17
|
+
import rehypeParse from 'rehype-parse'
|
|
18
|
+
import { rehypeUnifiedLatex as hastLatex } from 'hast-latex'
|
|
19
|
+
import { unifiedLatexStringCompiler } from '@unified-latex/unified-latex-util-to-string'
|
|
20
|
+
|
|
21
|
+
const html = `<html><head><meta name="title" content="Example" /></head><body><h1>Hello</h1><p>World.</p></body></html>`
|
|
22
|
+
|
|
23
|
+
// Build HAST from HTML
|
|
24
|
+
const rehypeProcessor = unified()
|
|
25
|
+
.use(rehypeParse)
|
|
26
|
+
.use(hastLatex, { documentClass: 'book' })
|
|
27
|
+
|
|
28
|
+
const hast = rehypeProcessor.parse(html)
|
|
29
|
+
const latexAst = rehypeProcessor.runSync(hast)
|
|
30
|
+
|
|
31
|
+
// Turn LaTeX AST into a .tex string
|
|
32
|
+
const latex = unified().use(unifiedLatexStringCompiler).stringify(latexAst)
|
|
33
|
+
|
|
34
|
+
console.log(latex)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## API
|
|
38
|
+
|
|
39
|
+
### `rehypeUnifiedLatex(options?)`
|
|
40
|
+
|
|
41
|
+
Unified plugin that converts HAST → unified-latex AST.
|
|
42
|
+
|
|
43
|
+
Options:
|
|
44
|
+
|
|
45
|
+
- `documentClass`: `'article' | 'report' | 'book'` (default: `'book'`). Used for the emitted `\documentclass{...}` macro.
|
|
46
|
+
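- `makeTitle`: `boolean` (default: `false`). When `true`, inserts `\maketitle` and uses metadata from the HTML `<head>` (e.g., `<meta name="title">`, `<meta name="author">`, `dc.title`, `dc.creator`) to populate `\title{}` and `\author{}`. Note: the published `0.0.0` build does not read this option; `\title{}` and `\author{}` are emitted whenever matching `<meta>` tags exist, and `\maketitle` is never inserted.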
|
|
47
|
+
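- `macroReplacements`: `Record<string, string>` mapping CSS selectors to LaTeX macro names for inline styling (e.g., `{ 'b,strong': 'textbf', 'i,em': 'textit' }`). This lets you customize how inline HTML is converted to LaTeX commands. Note: the published `0.0.0` build does not read this option; only the built-in mappings (`b`/`strong` → `textbf`, `i`/`em` → `textit`, `.smcap` → `textsc`) are applied.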
|
|
48
|
+
|
|
49
|
+
Default `macroReplacements`:
|
|
50
|
+
|
|
51
|
+
```ts
|
|
52
|
+
{
|
|
53
|
+
'b,strong': 'textbf',
|
|
54
|
+
'i,em': 'textit',
|
|
55
|
+
u: 'underline',
|
|
56
|
+
's,strike,del': 'sout',
|
|
57
|
+
}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
#### Advanced usage
|
|
61
|
+
|
|
62
|
+
```ts
|
|
63
|
+
import { unified } from 'unified'
|
|
64
|
+
import rehypeParse from 'rehype-parse'
|
|
65
|
+
import { rehypeUnifiedLatex as hastLatex } from 'hast-latex'
|
|
66
|
+
import { unifiedLatexStringCompiler } from '@unified-latex/unified-latex-util-to-string'
|
|
67
|
+
|
|
68
|
+
const html = `<!doctype html><html><head>
|
|
69
|
+
<meta name="title" content="My Book" />
|
|
70
|
+
<meta name="author" content="Jane Doe" />
|
|
71
|
+
</head><body>
|
|
72
|
+
<h1 class="starred">Intro</h1>
|
|
73
|
+
<p><span class="smcap">Small Caps</span> and <u>underline</u>.</p>
|
|
74
|
+
</body></html>`
|
|
75
|
+
|
|
76
|
+
const processor = unified()
|
|
77
|
+
.use(rehypeParse)
|
|
78
|
+
.use(hastLatex, {
|
|
79
|
+
documentClass: 'book',
|
|
80
|
+
makeTitle: true,
|
|
81
|
+
macroReplacements: {
|
|
82
|
+
'span.smcap': 'textsc',
|
|
83
|
+
u: 'underline',
|
|
84
|
+
},
|
|
85
|
+
})
|
|
86
|
+
|
|
87
|
+
const hast = processor.parse(html)
|
|
88
|
+
const latexAst = processor.runSync(hast as any)
|
|
89
|
+
const latex = unified().use(unifiedLatexStringCompiler).stringify(latexAst)
|
|
90
|
+
|
|
91
|
+
console.log(latex)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Notes & limitations
|
|
95
|
+
|
|
96
|
+
- Currently focused on book-like prose from ebook sources. Lists, images, tables, math, footnotes, etc., are not yet mapped.
|
|
97
|
+
- HTML must include a `<html><head>...</head><body>...</body></html>` structure for metadata extraction.
|
|
98
|
+
- Output is a LaTeX AST; you choose when/how to stringify or compile it.
|
|
99
|
+
- API and output shape will change; pinned at `0.x` while iterating.
|