hast-latex 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ import * as Latex from "@unified-latex/unified-latex-types";
2
+ import { Root } from "hast";
3
+ import { Plugin } from "unified";
4
+
5
+ //#region lib/unified-hast-to-latex/index.d.ts
6
/** Options accepted by the `rehypeUnifiedLatex` plugin. */
interface RehypeUnifiedLatexOptions {
  /** Document class passed to the emitted `\documentclass{...}` macro. */
  documentClass?: "article" | "report" | "book";
}
9
/**
 * Unified plugin whose transformer takes a HAST root and produces a
 * unified-latex root. Options may be omitted, `null` or `undefined`.
 */
declare const rehypeUnifiedLatex: Plugin<
  [options?: RehypeUnifiedLatexOptions | null | undefined],
  Root,
  Latex.Root
>;
10
+ //#endregion
11
+ export { type RehypeUnifiedLatexOptions, rehypeUnifiedLatex };
package/dist/index.mjs ADDED
@@ -0,0 +1,241 @@
1
+ import { args, m } from "@unified-latex/unified-latex-builder";
2
+ import { visit } from "unist-util-visit";
3
+
4
+ //#region lib/utils/getClassList.ts
5
/**
 * Returns the CSS class names carried by a HAST node.
 *
 * `properties.className` may be an array (each entry is stringified) or a
 * whitespace-separated string. A node without `properties`, without a
 * `className`, or with a blank `className` yields an empty array.
 *
 * @param node HAST node to inspect.
 * @returns The class names, never containing empty strings for string input.
 */
function getClassList(node) {
  const className = node.properties?.className;
  if (Array.isArray(className)) return className.map(String);
  // Match runs of non-whitespace rather than splitting on whitespace, so
  // leading, trailing or repeated whitespace never produces "" entries.
  if (typeof className === "string") return className.match(/\S+/g) ?? [];
  return [];
}
12
+
13
+ //#endregion
14
+ //#region lib/unified-hast-to-latex/collect-body.ts
15
/**
 * Finds the `<body>` element of a HAST document.
 *
 * Only the first top-level `<html>` element is searched.
 *
 * @param tree HAST root whose `children` are scanned for `<html>`.
 * @returns The first `<body>` child of `<html>`, or `undefined` when either
 *   element is missing.
 */
function getBody(tree) {
  const isElementNamed = (candidate, tagName) =>
    candidate.type === "element" && candidate.tagName === tagName;
  const htmlElement = tree.children.find((candidate) => isElementNamed(candidate, "html"));
  return htmlElement
    ? htmlElement.children.find((candidate) => isElementNamed(candidate, "body"))
    : undefined;
}
20
/**
 * Converts a single HAST node into a list of LaTeX AST nodes.
 *
 * Text nodes are tokenised. Elements are checked in this order: heading,
 * page number (dropped), chapter block, then a dispatch on tag name. Any
 * other node or element produces an empty list.
 *
 * @param node HAST node to convert.
 * @returns The LaTeX nodes for `node` (possibly empty).
 */
function hastNodeToLatex(node) {
  if (node.type === "text") return textToLatexNodes(node.value);
  if (node.type !== "element") return [];

  // These checks take priority over the plain tag-name dispatch below.
  if (isHeading(node)) return [convertHeading(node)];
  if (isPageNumber(node)) return [];
  if (isChapterBlock(node)) return convertChapterBlock(node);

  switch (node.tagName) {
    case "p":
      return convertParagraph(node);
    case "span":
      return convertSpan(node);
    case "i":
    case "em":
    case "b":
    case "strong":
      return maybeApplyMacro(node, flattenText(node.children));
    default:
      return [];
  }
}
35
/**
 * Tells whether a node is marked as a page number, i.e. carries the class
 * `page-number` or `pagenum`.
 *
 * @param node HAST node to test.
 * @returns `true` when one of the page-number classes is present.
 */
function isPageNumber(node) {
  const pageNumberClasses = ["page-number", "pagenum"];
  return getClassList(node).some((name) => pageNumberClasses.includes(name));
}
39
/**
 * Tells whether an element is a heading, i.e. its tag name is one of the
 * keys of `HEADING_TAG_TO_MACRO`.
 *
 * An own-property check is used instead of the `in` operator. `in` also
 * matches names inherited from `Object.prototype`, so an unknown element
 * such as `<constructor>` would be misclassified as a heading.
 *
 * @param node HAST element to test.
 * @returns `true` when `node.tagName` is a mapped heading tag.
 */
function isHeading(node) {
  return Object.prototype.hasOwnProperty.call(HEADING_TAG_TO_MACRO, node.tagName);
}
42
/**
 * Builds the LaTeX sectioning macro for a heading element.
 *
 * The macro name comes from `HEADING_TAG_TO_MACRO` (falling back to
 * `section`). Four arguments are emitted: a mark-less "starred" argument
 * that holds `*` when the element has the class `starred`, two empty
 * mark-less placeholders, and the braced title built from the flattened
 * text of the heading.
 *
 * @param node Heading HAST element.
 * @returns A macro node carrying `HEADING_RENDER_INFO` as `_renderInfo`.
 */
function convertHeading(node) {
  // An argument with empty open/close marks.
  const invisibleArg = (content) => ({
    type: "argument",
    content,
    openMark: "",
    closeMark: ""
  });

  const starContent = getClassList(node).includes("starred")
    ? [{ type: "string", content: "*" }]
    : [];
  // A single placeholder object fills both middle slots.
  const placeholderArg = invisibleArg([]);
  const titleArg = {
    type: "argument",
    content: flattenText(node.children),
    openMark: "{",
    closeMark: "}"
  };

  return {
    type: "macro",
    content: HEADING_TAG_TO_MACRO[node.tagName] ?? "section",
    _renderInfo: HEADING_RENDER_INFO,
    args: [invisibleArg(starContent), placeholderArg, placeholderArg, titleArg]
  };
}
78
/**
 * Converts a `<p>` element by converting each child in order and
 * concatenating the results.
 *
 * @param node Paragraph HAST element.
 * @returns The LaTeX nodes of all children, flattened into one list.
 */
function convertParagraph(node) {
  const converted = [];
  for (const child of node.children) {
    converted.push(...hastNodeToLatex(child));
  }
  return converted;
}
81
/**
 * Converts a `<span>`: its descendants are flattened to text nodes, then
 * wrapped in a macro when `maybeApplyMacro` finds one that applies.
 *
 * @param node Span HAST element.
 * @returns The flattened content, possibly wrapped in a macro.
 */
function convertSpan(node) {
  const inlineContent = flattenText(node.children);
  return maybeApplyMacro(node, inlineContent);
}
84
/**
 * Tells whether the next element at or after `startIndex` is a `<p>`.
 *
 * Non-element nodes (whitespace, other text, comments) are skipped; the
 * first element encountered decides the answer.
 *
 * @param nodes Sibling HAST nodes.
 * @param startIndex Index at which the search starts.
 * @returns `true` only when the first element found is a paragraph.
 */
function hasFollowingParagraph(nodes, startIndex) {
  let index = startIndex;
  while (index < nodes.length) {
    const candidate = nodes[index];
    index += 1;
    if (candidate.type !== "element") continue;
    return candidate.tagName === "p";
  }
  return false;
}
93
/**
 * Tells whether a HAST node is a `<p>` element.
 *
 * @param node HAST node to test.
 * @returns `true` for an element whose tag name is `p`.
 */
function isParagraph(node) {
  if (node.type !== "element") return false;
  return node.tagName === "p";
}
96
/**
 * Tokenises plain text into LaTeX AST nodes.
 *
 * - Whitespace-only input yields an empty list.
 * - Typographic apostrophes (’) are normalised to `'`.
 * - Runs of whitespace become `whitespace` nodes.
 * - Apostrophes and the punctuation `. , ! ? ; :` become their own `string`
 *   nodes; everything else is grouped into `string` nodes.
 * - Characters that are special to LaTeX are escaped so that literal text
 *   cannot change the meaning of the generated document (an unescaped `%`
 *   would start a comment). `# $ % & _ { }` become single-character macro
 *   nodes (printed as `\#`, `\%`, ...). `\`, `~` and `^` become
 *   `\textbackslash`, `\textasciitilde` and `\textasciicircum`, each followed
 *   by an empty group so the macro name cannot merge with following letters.
 *
 * @param value Raw text content.
 * @returns The LaTeX nodes representing `value`.
 */
function textToLatexNodes(value) {
  if (!value.trim()) return [];

  // Characters escaped by simply prefixing a backslash.
  const selfEscaping = "#$%&_{}";
  // Characters that need a named text macro.
  const textMacroFor = {
    "\\": "textbackslash",
    "~": "textasciitilde",
    "^": "textasciicircum"
  };

  const parts = value
    .replace(/’/g, "'")
    .split(/(\s+|['.,!?;:]|[\\#$%&_{}~^])/)
    .filter(Boolean);

  const nodes = [];
  for (const part of parts) {
    if (/^\s+$/.test(part)) {
      nodes.push({ type: "whitespace" });
    } else if (part.length === 1 && selfEscaping.includes(part)) {
      nodes.push({ type: "macro", content: part });
    } else if (Object.prototype.hasOwnProperty.call(textMacroFor, part)) {
      nodes.push({ type: "macro", content: textMacroFor[part] }, { type: "group", content: [] });
    } else {
      nodes.push({ type: "string", content: part });
    }
  }
  return nodes;
}
110
/**
 * Tells whether an element is a chapter container: a `<div>` carrying the
 * class `chapter`.
 *
 * @param node HAST element to test.
 * @returns `true` for `div.chapter`.
 */
function isChapterBlock(node) {
  if (node.tagName !== "div") return false;
  return getClassList(node).includes("chapter");
}
113
/**
 * Converts a chapter container into `\chapter{...}` and, when present,
 * `\section*{...}`.
 *
 * The subtree is walked for heading elements. The concatenated text of the
 * first non-blank heading becomes the chapter title, and that of the second
 * becomes the subtitle. Nothing else in the container is emitted.
 *
 * @param node Chapter container element.
 * @returns One or two macro nodes, or an empty list when no heading with
 *   text is found.
 */
function convertChapterBlock(node) {
  const headingTexts = [];
  visit(node, (descendant) => {
    if (descendant.type !== "element" || !isHeading(descendant)) return;
    let collected = "";
    visit(descendant, (inner) => {
      if (inner.type === "text") collected += inner.value;
    });
    if (collected.trim() !== "") headingTexts.push(collected);
  });

  const [chapterTitle, chapterSubtitle] = headingTexts;
  if (!chapterTitle) return [];

  // Builds `\name{text}` with a single braced string argument.
  const bracedMacro = (name, text) => ({
    type: "macro",
    content: name,
    args: [{
      closeMark: "}",
      openMark: "{",
      type: "argument",
      content: [{
        type: "string",
        content: text
      }]
    }]
  });

  const result = [bracedMacro("chapter", chapterTitle)];
  if (chapterSubtitle) result.push(bracedMacro("section*", chapterSubtitle));
  return result;
}
153
/**
 * Wraps `children` in the inline-formatting macro that applies to `element`.
 *
 * Precedence, highest first: `b`/`strong` give `textbf`, `i`/`em` give
 * `textit`, class `smcap` gives `textsc`. Only one macro is applied.
 *
 * @param element HAST element that decides the macro.
 * @param children LaTeX nodes to wrap.
 * @returns The macro built by `m`, or `children` unchanged when no rule
 *   matches.
 */
function maybeApplyMacro(element, children) {
  const { tagName } = element;
  const classList = getClassList(element);

  let macroName = null;
  if (tagName === "b" || tagName === "strong") macroName = "textbf";
  else if (tagName === "i" || tagName === "em") macroName = "textit";
  else if (classList.includes("smcap")) macroName = "textsc";

  return macroName ? m(macroName, children) : children;
}
162
/**
 * Collects the text of a list of HAST nodes, descending through elements
 * and discarding all element structure.
 *
 * @param children HAST nodes to flatten.
 * @returns Tokenised LaTeX nodes for every text node, in document order.
 */
function flattenText(children) {
  const collected = [];
  for (const child of children) {
    if (child.type === "text") {
      collected.push(...textToLatexNodes(child.value));
    } else if (child.type === "element") {
      collected.push(...flattenText(child.children));
    }
  }
  return collected;
}
169
/**
 * Tells whether a node carries at least one class name.
 *
 * @param node HAST node to inspect.
 * @returns `true` for a non-empty `className` array or a non-blank
 *   `className` string; `false` otherwise.
 */
function hasClassList(node) {
  const className = node.properties?.className;
  if (Array.isArray(className)) return className.length > 0;
  if (typeof className === "string") return className.trim() !== "";
  return false;
}
173
/**
 * Render info attached as `_renderInfo` to every heading macro: the macro is
 * broken onto its own line, and its four arguments are named positionally.
 */
const HEADING_RENDER_INFO = {
  breakAround: true,
  namedArguments: ["starred", null, "tocTitle", "title"]
};
182
/**
 * Maps heading tag names to the LaTeX sectioning macro emitted for them.
 * Every level from `h1` to `h6` currently maps to `section`.
 */
const HEADING_TAG_TO_MACRO = Object.fromEntries(
  ["h1", "h2", "h3", "h4", "h5", "h6"].map((tagName) => [tagName, "section"])
);
190
+
191
+ //#endregion
192
+ //#region lib/unified-hast-to-latex/collect-meta.ts
193
/**
 * Finds the `<head>` element of a HAST document.
 *
 * Only the first top-level `<html>` element is searched.
 *
 * @param tree HAST root whose `children` are scanned for `<html>`.
 * @returns The first `<head>` child of `<html>`, or `undefined` when either
 *   element is missing.
 */
function getHead(tree) {
  for (const topLevel of tree.children) {
    if (topLevel.type !== "element" || topLevel.tagName !== "html") continue;
    for (const inner of topLevel.children) {
      if (inner.type === "element" && inner.tagName === "head") return inner;
    }
    // The first <html> has no <head>; later <html> siblings are not examined.
    return undefined;
  }
  return undefined;
}
198
/**
 * Adds preamble macros derived from the HTML `<head>` to a LaTeX AST.
 *
 * When a `<head>` exists, `\usepackage[T1]{fontenc}` is emitted, followed by
 * `\author{...}` for each `<meta>` named `author`/`dc.creator` and
 * `\title{...}` for each one named `title`/`dc.title` (names are compared
 * case-insensitively). The macros are inserted immediately before
 * `\begin{document}`, or at the very start when that macro is absent.
 *
 * @param tree HAST root providing the `<head>`.
 * @param latexAst LaTeX root; its `content` array is modified in place.
 * @returns `latexAst` itself.
 */
function applyMetaToLatex(tree, latexAst) {
  const head = getHead(tree);
  if (!head) return latexAst;

  const authorNames = ["author", "dc.creator"];
  const titleNames = ["title", "dc.title"];
  const preamble = [m("usepackage", args(["T1", "fontenc"], { braces: "[]{}" }))];

  const metaElements = head.children.filter(
    (child) => child.type === "element" && child.tagName === "meta"
  );
  for (const meta of metaElements) {
    const name = meta.properties?.name;
    const value = meta.properties?.content;
    if (typeof name !== "string" || typeof value !== "string") continue;
    const key = name.toLowerCase();
    if (authorNames.includes(key)) preamble.push(m("author", value));
    else if (titleNames.includes(key)) preamble.push(m("title", value));
  }

  // Matches a `begin` macro whose first argument starts with the string "document".
  const isBeginDocument = (node) => {
    if (node.type !== "macro" || node.content !== "begin") return false;
    const firstArgNode = node.args?.[0]?.content?.[0];
    return firstArgNode?.type === "string" && firstArgNode?.content === "document";
  };

  const insertAt = latexAst.content.findIndex(isBeginDocument);
  if (insertAt === -1) latexAst.content.unshift(...preamble);
  else latexAst.content.splice(insertAt, 0, ...preamble);
  return latexAst;
}
215
+
216
+ //#endregion
217
+ //#region lib/unified-hast-to-latex/index.ts
218
/**
 * Unified plugin that converts a HAST document into a unified-latex AST.
 *
 * The transformer converts every child of `<body>`, inserts a `parbreak`
 * after a paragraph when the next element is also a paragraph, wraps the
 * result in `\documentclass{...}`, `\begin{document}` and `\end{document}`,
 * and finally adds the `<head>` metadata via `applyMetaToLatex`.
 *
 * @param options Plugin options. `documentClass` defaults to `"book"`, also
 *   when `options` is `null`.
 * @returns A transformer mapping a HAST root to a LaTeX root.
 */
const rehypeUnifiedLatex = (options = { documentClass: "book" }) => {
  const documentClass = options?.documentClass ?? "book";
  return (tree) => {
    // A missing <html>/<body> produces an empty document body.
    const bodyChildren = getBody(tree)?.children ?? [];
    const bodyContent = [];
    bodyChildren.forEach((child, index) => {
      bodyContent.push(...hastNodeToLatex(child));
      if (isParagraph(child) && hasFollowingParagraph(bodyChildren, index + 1)) {
        bodyContent.push({ type: "parbreak" });
      }
    });
    return applyMetaToLatex(tree, {
      type: "root",
      content: [
        m("documentclass", documentClass),
        m("begin", "document"),
        ...bodyContent,
        m("end", "document")
      ]
    });
  };
};
239
+
240
+ //#endregion
241
+ export { rehypeUnifiedLatex };
package/package.json ADDED
@@ -0,0 +1,61 @@
1
+ {
2
+ "name": "hast-latex",
3
+ "description": "Converts HAST to Unified LaTeX AST",
4
+ "version": "0.0.0",
5
+ "type": "module",
6
+ "main": "dist/index.mjs",
7
+ "types": "dist/index.d.mts",
8
+ "author": {
9
+ "name": "Evan Hennessy",
10
+ "url": "https://www.hennessyevan.com"
11
+ },
12
+ "files": [
13
+ "dist"
14
+ ],
15
+ "keywords": [
16
+ "unist",
17
+ "ast",
18
+ "latex",
19
+ "rehype",
20
+ "unified",
21
+ "hast",
22
+ "transformer"
23
+ ],
24
+ "engines": {
25
+ "node": ">=22.18.0"
26
+ },
27
+ "license": "MIT",
28
+ "dependencies": {
29
+ "@unified-latex/unified-latex": "^1.8.3",
30
+ "@unified-latex/unified-latex-builder": "^1.8.3",
31
+ "@unified-latex/unified-latex-lint": "^1.8.3",
32
+ "@unified-latex/unified-latex-to-hast": "^1.8.3",
33
+ "@unified-latex/unified-latex-types": "^1.8.0",
34
+ "@unified-latex/unified-latex-util-parse": "^1.8.3",
35
+ "@unified-latex/unified-latex-util-print-raw": "^1.8.0",
36
+ "@unified-latex/unified-latex-util-render-info": "^1.8.3",
37
+ "@unified-latex/unified-latex-util-to-string": "^1.8.3",
38
+ "hast-util-class-list": "^2.0.1",
39
+ "hast-util-from-html": "^2.0.3",
40
+ "hast-util-select": "^6.0.4",
41
+ "hast-util-to-string": "^3.0.1",
42
+ "prettier-plugin-latex": "^2.0.1",
43
+ "rehype-parse": "^9.0.1",
44
+ "tiny-invariant": "^1.3.3",
45
+ "unified": "^11.0.5",
46
+ "unist-util-inspect": "^8.1.0",
47
+ "unist-util-remove": "^4.0.0",
48
+ "unist-util-select": "^5.1.0",
49
+ "unist-util-visit": "^5.0.0"
50
+ },
51
+ "devDependencies": {
52
+ "@types/hast": "^3.0.4",
53
+ "tsdown": "0.19.0-beta.3",
54
+ "typescript": "^5.9.3"
55
+ },
56
+ "scripts": {
57
+ "example": "node ./example.ts",
58
+ "build": "tsdown ./index.ts",
59
+ "release": "pnpm version patch && pnpm build && git add . && git commit -m 'chore: release' && git push && git push --tags && pnpm publish"
60
+ }
61
+ }
package/readme.md ADDED
@@ -0,0 +1,99 @@
1
+ # hast-latex
2
+
3
+ A small unifiedJS plugin that turns an HTML/HAST tree into a LaTeX AST (see [unified-latex](https://github.com/siefkenj/unified-latex)) so you can render, lint, or emit `.tex` from HTML sources. Designed for book-like documents (ebooks, etc.) where you want headings, chapters, metadata, and inline formatting to map cleanly into LaTeX.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install hast-latex rehype-parse unified @unified-latex/unified-latex-util-to-string
9
+ ```
10
+
11
+ ## Quick start
12
+
13
+ The plugin consumes a HAST tree (e.g., from `rehype-parse`) and returns a unified-latex AST you can stringify with `@unified-latex/unified-latex-util-to-string`.
14
+
15
+ ```ts
16
+ import { unified } from 'unified'
17
+ import rehypeParse from 'rehype-parse'
18
+ import { rehypeUnifiedLatex } from 'hast-latex'
19
+ import { unifiedLatexStringCompiler } from '@unified-latex/unified-latex-util-to-string'
20
+
21
+ const html = `<html><head><meta name="title" content="Example" /></head><body><h1>Hello</h1><p>World.</p></body></html>`
22
+
23
+ // Build HAST from HTML
24
+ const rehypeProcessor = unified()
25
+ .use(rehypeParse)
26
+ .use(rehypeUnifiedLatex, { documentClass: 'book' })
27
+
28
+ const hast = rehypeProcessor.parse(html)
29
+ const latexAst = rehypeProcessor.runSync(hast)
30
+
31
+ // Turn LaTeX AST into a .tex string
32
+ const latex = unified().use(unifiedLatexStringCompiler).stringify(latexAst)
33
+
34
+ console.log(latex)
35
+ ```
36
+
37
+ ## API
38
+
39
+ ### `rehypeUnifiedLatex(options?)`
40
+
41
+ Unified plugin that converts HAST → unified-latex AST.
42
+
43
+ Options:
44
+
45
+ - `documentClass`: `'article' | 'report' | 'book'` (default: `'book'`). Used for the emitted `\documentclass{...}` macro.
46
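+ - `makeTitle`: `boolean` (default: `false`). When `true`, inserts `\maketitle` and uses metadata from the HTML `<head>` (e.g., `<meta name="title">`, `<meta name="author">`, `dc.title`, `dc.creator`) to populate `\title{}` and `\author{}`. Note: version 0.0.0 does not read this option yet; `\title{}` and `\author{}` are always emitted from the metadata and `\maketitle` is not inserted.
47
+ - `macroReplacements`: `Record<string, string>` mapping CSS selectors to LaTeX macro names for inline styling (e.g., `{ 'b,strong': 'textbf', 'i,em': 'textit' }`). This lets you customize how inline HTML is converted to LaTeX commands. Note: version 0.0.0 does not read this option yet; the built-in mapping is `b`/`strong` → `textbf`, `i`/`em` → `textit`, and the class `smcap` → `textsc`.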
48
+
49
+ Default `macroReplacements`:
50
+
51
+ ```ts
52
+ {
53
+ 'b,strong': 'textbf',
54
+ 'i,em': 'textit',
55
+ u: 'underline',
56
+ 's,strike,del': 'sout',
57
+ }
58
+ ```
59
+
60
+ #### Advanced usage
61
+
62
+ ```ts
63
+ import { unified } from 'unified'
64
+ import rehypeParse from 'rehype-parse'
65
+ import { rehypeUnifiedLatex } from 'hast-latex'
66
+ import { unifiedLatexStringCompiler } from '@unified-latex/unified-latex-util-to-string'
67
+
68
+ const html = `<!doctype html><html><head>
69
+ <meta name="title" content="My Book" />
70
+ <meta name="author" content="Jane Doe" />
71
+ </head><body>
72
+ <h1 class="starred">Intro</h1>
73
+ <p><span class="smcap">Small Caps</span> and <u>underline</u>.</p>
74
+ </body></html>`
75
+
76
+ const processor = unified()
77
+ .use(rehypeParse)
78
+ .use(rehypeUnifiedLatex, {
79
+ documentClass: 'book',
80
+ makeTitle: true,
81
+ macroReplacements: {
82
+ 'span.smcap': 'textsc',
83
+ u: 'underline',
84
+ },
85
+ })
86
+
87
+ const hast = processor.parse(html)
88
+ const latexAst = processor.runSync(hast as any)
89
+ const latex = unified().use(unifiedLatexStringCompiler).stringify(latexAst)
90
+
91
+ console.log(latex)
92
+ ```
93
+
94
+ ## Notes & limitations
95
+
96
+ - Currently focused on book-like prose from ebook sources. Lists, images, tables, math, footnotes, etc., are not yet mapped.
97
+ - HTML must include an `<html><head>...</head><body>...</body></html>` structure: metadata is read from `<head>`, and only the children of `<body>` are converted.
98
+ - Output is a LaTeX AST; you choose when/how to stringify or compile it.
99
+ - API and output shape will change; pinned at `0.x` while iterating.