mark-epub-down 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +151 -0
- package/dist/application/convert-epub-document.d.ts +10 -0
- package/dist/application/convert-epub-document.js +80 -0
- package/dist/application/convert-epub-document.js.map +1 -0
- package/dist/application/convert-epub.d.ts +13 -0
- package/dist/application/convert-epub.js +35 -0
- package/dist/application/convert-epub.js.map +1 -0
- package/dist/cli/confirm-overwrite.d.ts +1 -0
- package/dist/cli/confirm-overwrite.js +29 -0
- package/dist/cli/confirm-overwrite.js.map +1 -0
- package/dist/cli/reporting.d.ts +3 -0
- package/dist/cli/reporting.js +24 -0
- package/dist/cli/reporting.js.map +1 -0
- package/dist/cli/run-convert-command.d.ts +10 -0
- package/dist/cli/run-convert-command.js +61 -0
- package/dist/cli/run-convert-command.js.map +1 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +28 -0
- package/dist/cli.js.map +1 -0
- package/dist/domain/errors.d.ts +7 -0
- package/dist/domain/errors.js +20 -0
- package/dist/domain/errors.js.map +1 -0
- package/dist/domain/spec.d.ts +8 -0
- package/dist/domain/spec.js +49 -0
- package/dist/domain/spec.js.map +1 -0
- package/dist/domain/types.d.ts +48 -0
- package/dist/domain/types.js +3 -0
- package/dist/domain/types.js.map +1 -0
- package/dist/domain/warnings.d.ts +8 -0
- package/dist/domain/warnings.js +49 -0
- package/dist/domain/warnings.js.map +1 -0
- package/dist/epub/archive.d.ts +3 -0
- package/dist/epub/archive.js +22 -0
- package/dist/epub/archive.js.map +1 -0
- package/dist/epub/container.d.ts +2 -0
- package/dist/epub/container.js +46 -0
- package/dist/epub/container.js.map +1 -0
- package/dist/epub/content.d.ts +7 -0
- package/dist/epub/content.js +34 -0
- package/dist/epub/content.js.map +1 -0
- package/dist/epub/opf.d.ts +2 -0
- package/dist/epub/opf.js +122 -0
- package/dist/epub/opf.js.map +1 -0
- package/dist/epub/spine.d.ts +2 -0
- package/dist/epub/spine.js +25 -0
- package/dist/epub/spine.js.map +1 -0
- package/dist/epub/toc.d.ts +3 -0
- package/dist/epub/toc.js +165 -0
- package/dist/epub/toc.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -0
- package/dist/output/render.d.ts +9 -0
- package/dist/output/render.js +54 -0
- package/dist/output/render.js.map +1 -0
- package/dist/transform/anchors.d.ts +2 -0
- package/dist/transform/anchors.js +18 -0
- package/dist/transform/anchors.js.map +1 -0
- package/dist/transform/cleanup.d.ts +4 -0
- package/dist/transform/cleanup.js +37 -0
- package/dist/transform/cleanup.js.map +1 -0
- package/dist/transform/links.d.ts +14 -0
- package/dist/transform/links.js +174 -0
- package/dist/transform/links.js.map +1 -0
- package/dist/transform/markdown.d.ts +2 -0
- package/dist/transform/markdown.js +102 -0
- package/dist/transform/markdown.js.map +1 -0
- package/dist/transform/tables.d.ts +5 -0
- package/dist/transform/tables.js +130 -0
- package/dist/transform/tables.js.map +1 -0
- package/dist/utils/epub-path.d.ts +14 -0
- package/dist/utils/epub-path.js +56 -0
- package/dist/utils/epub-path.js.map +1 -0
- package/dist/utils/path.d.ts +2 -0
- package/dist/utils/path.js +33 -0
- package/dist/utils/path.js.map +1 -0
- package/docs/epub-to-md-v1-public-spec.md +176 -0
- package/docs/v1-technical-selection.md +106 -0
- package/package.json +67 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.normalizeEpubPath = normalizeEpubPath;
|
|
7
|
+
exports.splitEpubHref = splitEpubHref;
|
|
8
|
+
exports.buildTargetKey = buildTargetKey;
|
|
9
|
+
exports.resolveEpubHref = resolveEpubHref;
|
|
10
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
11
|
+
/**
 * Canonicalizes a path that points inside an EPUB archive.
 *
 * Backslashes are converted to forward slashes, the result is collapsed with
 * POSIX normalization (resolving "." / ".." segments and duplicate slashes),
 * and a leading "./" left over after normalization is stripped.
 *
 * @param {string} value - Archive-relative path, possibly using "\" separators.
 * @returns {string} The normalized, forward-slash path.
 */
function normalizeEpubPath(value) {
    const { posix } = node_path_1.default;
    const forwardSlashed = value.split("\\").join("/");
    const collapsed = posix.normalize(forwardSlashed);
    // posix.normalize only leaves a leading "./" for inputs that reduce to the
    // current directory with a trailing slash; drop that prefix.
    return collapsed.startsWith("./") ? collapsed.slice(2) : collapsed;
}
|
|
15
|
+
/**
 * Splits an href into its path part and its optional fragment.
 *
 * The href is divided at the FIRST "#" only. Everything after that "#" is
 * returned as the fragment, so a fragment that itself contains "#" is kept
 * intact instead of being silently truncated.
 *
 * @param {string} href - Href to split (for example "ch1.xhtml#note-1").
 * @returns {{ pathPart: string, fragment: (string|undefined) }} The text before
 *   the first "#", and the text after it; `fragment` is `undefined` when the
 *   href has no "#" or nothing follows it.
 */
function splitEpubHref(href) {
    const hashIndex = href.indexOf("#");
    if (hashIndex === -1) {
        return {
            pathPart: href,
            fragment: undefined,
        };
    }
    const fragment = href.slice(hashIndex + 1);
    return {
        pathPart: href.slice(0, hashIndex),
        // An empty fragment ("ch1.xhtml#") is reported as "no fragment".
        fragment: fragment.length > 0 ? fragment : undefined,
    };
}
|
|
22
|
+
/**
 * Builds the lookup key for a link target inside the EPUB.
 *
 * @param {string} resourcePath - Path of the target resource; it is normalized
 *   with normalizeEpubPath before use.
 * @param {string} [fragment] - Optional fragment identifier within the resource.
 * @returns {string} "<normalized path>#<fragment>" when a non-empty fragment is
 *   given, otherwise just the normalized path.
 */
function buildTargetKey(resourcePath, fragment) {
    const normalizedPath = normalizeEpubPath(resourcePath);
    if (!fragment) {
        return normalizedPath;
    }
    return `${normalizedPath}#${fragment}`;
}
|
|
25
|
+
/**
 * Classifies an href found in an EPUB document and, for internal links,
 * resolves it relative to the document that contains it.
 *
 * - Hrefs accepted by isExternalHref are returned as `{ kind: "external", href }`.
 * - Hrefs starting with "#" point into the base document itself.
 * - Any other href is split into path and fragment, and the path is joined onto
 *   the directory of the base document.
 *
 * @param {string} baseDocumentPath - Archive path of the document holding the link.
 * @param {string} href - Raw href attribute value; surrounding whitespace is ignored.
 * @returns {{ kind: "external", href: string } | { kind: "internal", href: string,
 *   resourcePath: string, fragment: (string|undefined), targetKey: string }}
 *   The classified link. `fragment` is `undefined` whenever the href carries no
 *   non-empty fragment.
 */
function resolveEpubHref(baseDocumentPath, href) {
    const trimmedHref = href.trim();
    if (isExternalHref(trimmedHref)) {
        return {
            kind: "external",
            href: trimmedHref,
        };
    }
    if (trimmedHref.startsWith("#")) {
        // Same-document reference. A bare "#" has no fragment, so report it as
        // undefined — the same way splitEpubHref reports an empty fragment —
        // rather than leaking an empty string to callers.
        const fragment = trimmedHref.length > 1 ? trimmedHref.slice(1) : undefined;
        return {
            kind: "internal",
            href: trimmedHref,
            resourcePath: normalizeEpubPath(baseDocumentPath),
            fragment,
            targetKey: buildTargetKey(baseDocumentPath, fragment),
        };
    }
    const { pathPart, fragment } = splitEpubHref(trimmedHref);
    // NOTE(review): pathPart is used as written; percent-encoded hrefs
    // (e.g. "chapter%201.xhtml") are not decoded here — confirm whether the
    // paths this is compared against are encoded the same way.
    const baseDirectory = node_path_1.default.posix.dirname(normalizeEpubPath(baseDocumentPath));
    const resourcePath = normalizeEpubPath(node_path_1.default.posix.join(baseDirectory, pathPart));
    return {
        kind: "internal",
        href: trimmedHref,
        resourcePath,
        fragment,
        targetKey: buildTargetKey(resourcePath, fragment),
    };
}
|
|
53
|
+
/**
 * Reports whether an href points outside the EPUB package.
 *
 * An href counts as external when it is protocol-relative ("//host/...") or
 * begins with a URI scheme followed by ":" (for example "https:" or "mailto:").
 *
 * @param {string} href - Href to inspect.
 * @returns {boolean} `true` for scheme-prefixed or protocol-relative hrefs.
 */
function isExternalHref(href) {
    if (href.startsWith("//")) {
        return true;
    }
    const schemePrefix = /^[a-z][a-z0-9+.-]*:/i;
    return schemePrefix.test(href);
}
|
|
56
|
+
//# sourceMappingURL=epub-path.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"epub-path.js","sourceRoot":"","sources":["../../src/utils/epub-path.ts"],"names":[],"mappings":";;;;;AAUA,8CAGC;AAED,sCASC;AAED,wCAEC;AAED,0CAiCC;AA/DD,0DAA6B;AAU7B,SAAgB,iBAAiB,CAAC,KAAa;IAC7C,MAAM,UAAU,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;IAC7C,OAAO,mBAAI,CAAC,KAAK,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;AAC/D,CAAC;AAED,SAAgB,aAAa,CAAC,IAAY;IAIxC,MAAM,CAAC,QAAQ,EAAE,QAAQ,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;IAChD,OAAO;QACL,QAAQ;QACR,QAAQ,EAAE,QAAQ,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS;KACjE,CAAC;AACJ,CAAC;AAED,SAAgB,cAAc,CAAC,YAAoB,EAAE,QAAiB;IACpE,OAAO,QAAQ,CAAC,CAAC,CAAC,GAAG,iBAAiB,CAAC,YAAY,CAAC,IAAI,QAAQ,EAAE,CAAC,CAAC,CAAC,iBAAiB,CAAC,YAAY,CAAC,CAAC;AACvG,CAAC;AAED,SAAgB,eAAe,CAAC,gBAAwB,EAAE,IAAY;IACpE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAEhC,IAAI,cAAc,CAAC,WAAW,CAAC,EAAE,CAAC;QAChC,OAAO;YACL,IAAI,EAAE,UAAU;YAChB,IAAI,EAAE,WAAW;SAClB,CAAC;IACJ,CAAC;IAED,IAAI,WAAW,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QAChC,MAAM,QAAQ,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACtC,OAAO;YACL,IAAI,EAAE,UAAU;YAChB,IAAI,EAAE,WAAW;YACjB,YAAY,EAAE,iBAAiB,CAAC,gBAAgB,CAAC;YACjD,QAAQ;YACR,SAAS,EAAE,cAAc,CAAC,gBAAgB,EAAE,QAAQ,CAAC;SACtD,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,aAAa,CAAC,WAAW,CAAC,CAAC;IAC1D,MAAM,YAAY,GAAG,iBAAiB,CACpC,mBAAI,CAAC,KAAK,CAAC,IAAI,CAAC,mBAAI,CAAC,KAAK,CAAC,OAAO,CAAC,iBAAiB,CAAC,gBAAgB,CAAC,CAAC,EAAE,QAAQ,CAAC,CACnF,CAAC;IAEF,OAAO;QACL,IAAI,EAAE,UAAU;QAChB,IAAI,EAAE,WAAW;QACjB,YAAY;QACZ,QAAQ;QACR,SAAS,EAAE,cAAc,CAAC,YAAY,EAAE,QAAQ,CAAC;KAClD,CAAC;AACJ,CAAC;AAED,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,sBAAsB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;AACpE,CAAC"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.deriveOutputPath = deriveOutputPath;
|
|
7
|
+
exports.ensureOutputPathAvailable = ensureOutputPathAvailable;
|
|
8
|
+
const promises_1 = require("node:fs/promises");
|
|
9
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
10
|
+
const errors_1 = require("../domain/errors");
|
|
11
|
+
/**
 * Determines where the generated Markdown file should be written.
 *
 * - With an explicit `outputPath`, that path is resolved against `cwd`.
 * - Otherwise the path is derived from `inputPath` by swapping its extension
 *   for ".md". When `cwd` is supplied, the derived path is also resolved
 *   against it, so a relative `inputPath` is anchored at the same directory as
 *   an explicit `outputPath` would be. An absolute `inputPath` is unaffected.
 *
 * @param {string} inputPath - Path of the source EPUB file.
 * @param {string} [outputPath] - Explicit output path, if the caller chose one.
 * @param {string} [cwd] - Directory that relative paths are resolved against.
 * @returns {string} The output file path.
 */
function deriveOutputPath(inputPath, outputPath, cwd) {
    if (outputPath) {
        return node_path_1.default.resolve(cwd, outputPath);
    }
    const { dir, name } = node_path_1.default.parse(inputPath);
    const derivedPath = node_path_1.default.join(dir, `${name}.md`);
    // Without a cwd there is nothing to anchor to; keep the derived path as-is.
    return cwd ? node_path_1.default.resolve(cwd, derivedPath) : derivedPath;
}
|
|
18
|
+
/**
 * Guards against silently overwriting an existing output file.
 *
 * Resolves when `outputPath` cannot be accessed, or when it can be accessed
 * and `overwrite` is true. Otherwise throws the fatal "OUTPUT_EXISTS"
 * ConversionError.
 *
 * NOTE(review): any failure of `access` (not only "file does not exist") is
 * treated as "path is free"; the `instanceof ConversionError` guard looks
 * unreachable for errors raised by `access` — confirm whether other failures
 * should be surfaced here.
 *
 * @param {string} outputPath - Path the Markdown file will be written to.
 * @param {boolean} [overwrite=false] - Allow replacing an existing file.
 * @returns {Promise<void>}
 * @throws {ConversionError} When the target exists and `overwrite` is false.
 */
async function ensureOutputPathAvailable(outputPath, overwrite = false) {
    let targetExists = true;
    try {
        await promises_1.access(outputPath);
    }
    catch (error) {
        if (error instanceof errors_1.ConversionError) {
            throw error;
        }
        targetExists = false;
    }
    if (!targetExists || overwrite) {
        return;
    }
    throw errors_1.ConversionError.fatal("OUTPUT_EXISTS", `output file already exists and overwrite is disabled: ${outputPath}`);
}
|
|
33
|
+
//# sourceMappingURL=path.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"path.js","sourceRoot":"","sources":["../../src/utils/path.ts"],"names":[],"mappings":";;;;;AAKA,4CAOC;AAED,8DAsBC;AApCD,+CAA0C;AAC1C,0DAA6B;AAE7B,6CAAmD;AAEnD,SAAgB,gBAAgB,CAAC,SAAiB,EAAE,UAA8B,EAAE,GAAW;IAC7F,IAAI,UAAU,EAAE,CAAC;QACf,OAAO,mBAAI,CAAC,OAAO,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;IACvC,CAAC;IAED,MAAM,MAAM,GAAG,mBAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IACrC,OAAO,mBAAI,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,GAAG,MAAM,CAAC,IAAI,KAAK,CAAC,CAAC;AACpD,CAAC;AAEM,KAAK,UAAU,yBAAyB,CAC7C,UAAkB,EAClB,SAAS,GAAG,KAAK;IAEjB,IAAI,CAAC;QACH,MAAM,IAAA,iBAAM,EAAC,UAAU,CAAC,CAAC;IAC3B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,wBAAe,EAAE,CAAC;YACrC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,OAAO;IACT,CAAC;IAED,IAAI,SAAS,EAAE,CAAC;QACd,OAAO;IACT,CAAC;IAED,MAAM,wBAAe,CAAC,KAAK,CACzB,eAAe,EACf,yDAAyD,UAAU,EAAE,CACtE,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# EPUB to Markdown v1 Public Spec
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
`mark-epub-down` is a Node.js CLI and package that converts a single EPUB into a single Markdown document.
|
|
6
|
+
|
|
7
|
+
The v1 output is intended as source material for LLM knowledge bases, wikis, and related ingestion pipelines. The project prioritizes semantic preservation, source correctness, and low-risk transformation over reader-oriented Markdown polish.
|
|
8
|
+
|
|
9
|
+
## Scope
|
|
10
|
+
|
|
11
|
+
- Input: one `.epub` file
|
|
12
|
+
- Output: one `.md` file
|
|
13
|
+
- Supported runtime targets: Node.js `20`, `22`, and `24`
|
|
14
|
+
- Implementation language: TypeScript
|
|
15
|
+
- Distribution shape: npm package with CLI and programmatic Node API
|
|
16
|
+
|
|
17
|
+
## Goals
|
|
18
|
+
|
|
19
|
+
- Preserve meaningful document structure and content semantics.
|
|
20
|
+
- Keep source order aligned with the EPUB spine.
|
|
21
|
+
- Include EPUB-native table-of-contents information in the output.
|
|
22
|
+
- Prefer conservative transformations over aggressive normalization.
|
|
23
|
+
- Produce Markdown that works well as downstream ingestion source.
|
|
24
|
+
|
|
25
|
+
## Non-goals
|
|
26
|
+
|
|
27
|
+
- Perfect visual reproduction of EPUB layout or CSS presentation
|
|
28
|
+
- Viewer-specific Markdown tuning as the primary goal
|
|
29
|
+
- Heuristic reconstruction of missing structure
|
|
30
|
+
- Chapter-splitting output in the v1 baseline
|
|
31
|
+
|
|
32
|
+
## CLI
|
|
33
|
+
|
|
34
|
+
The v1 CLI keeps a small surface:
|
|
35
|
+
|
|
36
|
+
```text
|
|
37
|
+
epub2llm <input.epub>
|
|
38
|
+
epub2llm <input.epub> -o <output.md>
|
|
39
|
+
epub2llm -h
|
|
40
|
+
epub2llm --help
|
|
41
|
+
epub2llm -V
|
|
42
|
+
epub2llm --version
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
- The input EPUB is a positional argument.
|
|
46
|
+
- `-o` and `--output` select an explicit output path.
|
|
47
|
+
- If no output path is provided, the tool derives one from the input filename with a `.md` extension.
|
|
48
|
+
- Existing output files are not overwritten silently.
|
|
49
|
+
- In interactive terminal use, the CLI may prompt for explicit overwrite confirmation with a default `No` answer.
|
|
50
|
+
|
|
51
|
+
## Node API
|
|
52
|
+
|
|
53
|
+
The package exposes a programmatic `convertEpub()` function for Node.js use.
|
|
54
|
+
|
|
55
|
+
The stable v1 options are:
|
|
56
|
+
|
|
57
|
+
- `inputPath`
|
|
58
|
+
- `outputPath`
|
|
59
|
+
- `cwd`
|
|
60
|
+
- `overwrite`
|
|
61
|
+
|
|
62
|
+
`convertEpub()` writes the Markdown file and returns structured conversion results, including warnings. It does not prompt for overwrite confirmation or print terminal output. If the output target already exists and `overwrite` is not enabled, the function fails conservatively with `OUTPUT_EXISTS`.
|
|
63
|
+
|
|
64
|
+
## Output Structure
|
|
65
|
+
|
|
66
|
+
The generated Markdown document uses this high-level structure:
|
|
67
|
+
|
|
68
|
+
1. minimal YAML front matter
|
|
69
|
+
2. top-level book title
|
|
70
|
+
3. dedicated `## TOC` section
|
|
71
|
+
4. merged body content in spine order
|
|
72
|
+
|
|
73
|
+
The front matter stays minimal and only includes values available from EPUB package metadata:
|
|
74
|
+
|
|
75
|
+
- `title`
|
|
76
|
+
- `creator`
|
|
77
|
+
- `language`
|
|
78
|
+
- `identifier`
|
|
79
|
+
- `publisher`
|
|
80
|
+
- `published`
|
|
81
|
+
|
|
82
|
+
Missing metadata fields are omitted rather than guessed.
|
|
83
|
+
|
|
84
|
+
`published` is mapped from EPUB `dc:date`. Full dates and date-times are normalized to `YYYY-MM-DD`. Partial dates such as `YYYY` or `YYYY-MM` are preserved as-is rather than padded with guessed precision.
|
|
85
|
+
|
|
86
|
+
## Conversion Rules
|
|
87
|
+
|
|
88
|
+
### Table of contents
|
|
89
|
+
|
|
90
|
+
- The EPUB-native TOC is the authoritative TOC source.
|
|
91
|
+
- The TOC is rendered as a hierarchical Markdown list under `## TOC`.
|
|
92
|
+
- Entries become Markdown links only when the target can be mapped confidently.
|
|
93
|
+
- Unresolved TOC items remain plain text.
|
|
94
|
+
|
|
95
|
+
### Document structure
|
|
96
|
+
|
|
97
|
+
- Source heading levels are preserved.
|
|
98
|
+
- Source headings are not globally shifted to compensate for the inserted book title.
|
|
99
|
+
- The merged body follows the source document's own heading structure.
|
|
100
|
+
|
|
101
|
+
### Links, anchors, and notes
|
|
102
|
+
|
|
103
|
+
- Internal targets are rewritten into collision-safe identifiers for merged single-file output.
|
|
104
|
+
- TOC targets, internal links, and note-related links are rewritten conservatively.
|
|
105
|
+
- When a link target cannot be rewritten safely, the output degrades conservatively instead of guessing.
|
|
106
|
+
- Footnote and note structure is preserved as closely to the original topology as possible.
|
|
107
|
+
|
|
108
|
+
### Content cleanup
|
|
109
|
+
|
|
110
|
+
- Cleanup is based on DOM/XHTML elements, not page-type inference.
|
|
111
|
+
- The strategy is conservative blacklist removal.
|
|
112
|
+
- Only high-confidence non-text elements are removed by default.
|
|
113
|
+
- Empty containers may be removed only when they carry no visible text, no preserved children, and no necessary structure.
|
|
114
|
+
|
|
115
|
+
The default removable set includes:
|
|
116
|
+
|
|
117
|
+
- `script`
|
|
118
|
+
- `style`
|
|
119
|
+
- `img`
|
|
120
|
+
- `svg`
|
|
121
|
+
- `canvas`
|
|
122
|
+
- `audio`
|
|
123
|
+
- `video`
|
|
124
|
+
- `source`
|
|
125
|
+
- `track`
|
|
126
|
+
- `iframe`
|
|
127
|
+
- `object`
|
|
128
|
+
- `embed`
|
|
129
|
+
- `form`
|
|
130
|
+
- `input`
|
|
131
|
+
- `button`
|
|
132
|
+
- `select`
|
|
133
|
+
- `option`
|
|
134
|
+
- `textarea`
|
|
135
|
+
|
|
136
|
+
Containers such as `figure`, `figcaption`, `aside`, `section`, `nav`, `div`, and `span` are not removed purely by tag name.
|
|
137
|
+
|
|
138
|
+
### Core element mapping
|
|
139
|
+
|
|
140
|
+
- `h1` to `h6` map to Markdown headings
|
|
141
|
+
- `p` maps to paragraphs
|
|
142
|
+
- `blockquote` maps to Markdown blockquotes
|
|
143
|
+
- `hr` maps to `---`
|
|
144
|
+
- `em` and `i` map to emphasis
|
|
145
|
+
- `strong` and `b` map to strong emphasis
|
|
146
|
+
- `code` maps to inline code
|
|
147
|
+
- safe `a[href]` targets map to Markdown links
|
|
148
|
+
|
|
149
|
+
Definition lists are degraded into Markdown list structures instead of being dropped.
|
|
150
|
+
|
|
151
|
+
## Errors and Warnings
|
|
152
|
+
|
|
153
|
+
Fatal errors stop conversion and return a non-zero exit code. Typical fatal cases include:
|
|
154
|
+
|
|
155
|
+
- missing input file
|
|
156
|
+
- invalid or unreadable EPUB container
|
|
157
|
+
- missing or unreadable OPF/package document
|
|
158
|
+
- unreadable spine content required for conversion
|
|
159
|
+
- unwritable output path
|
|
160
|
+
- output target already exists and overwrite is not confirmed
|
|
161
|
+
|
|
162
|
+
Warnings still allow output generation and keep a success exit code. Typical warning cases include:
|
|
163
|
+
|
|
164
|
+
- missing TOC
|
|
165
|
+
- unresolved TOC targets
|
|
166
|
+
- links that cannot be safely rewritten
|
|
167
|
+
- dropped elements caused by cleanup rules
|
|
168
|
+
- incomplete metadata
|
|
169
|
+
- source structures that cannot be represented perfectly in Markdown
|
|
170
|
+
|
|
171
|
+
Warnings may be summarized in CLI output, and some low-signal warnings may be retained only in structured results rather than shown in the terminal.
|
|
172
|
+
|
|
173
|
+
## Validation Boundary
|
|
174
|
+
|
|
175
|
+
- Fixed Layout EPUB (FXL) is out of scope for v1.
|
|
176
|
+
- Validation should cover nested TOCs, footnotes, CJK ruby content, tables, image-heavy EPUBs, degraded TOC metadata, incomplete metadata, and RTL samples.
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# EPUB to Markdown v1 Technical Selection
|
|
2
|
+
|
|
3
|
+
This document records the implementation choices made strictly from `epub-to-md-v1-spec.md`.
|
|
4
|
+
|
|
5
|
+
## Runtime and language
|
|
6
|
+
|
|
7
|
+
- Supported runtime targets: Node.js `20`, `22`, and `24`
|
|
8
|
+
- Language: TypeScript
|
|
9
|
+
- Module output: CommonJS for a simple Node CLI and library distribution path
|
|
10
|
+
|
|
11
|
+
## Package choices
|
|
12
|
+
|
|
13
|
+
| Concern | Package | Why this fits the spec |
|
|
14
|
+
| --- | --- | --- |
|
|
15
|
+
| CLI parsing | `commander` | Mature, minimal, standard `-h/-V/-o` surface |
|
|
16
|
+
| EPUB unzip to temp dir | `extract-zip` | Small, established, matches the "unpack into temporary working area" pipeline |
|
|
17
|
+
| XML parsing | `fast-xml-parser` | Mature, fast, good fit for `container.xml`, OPF, and NCX |
|
|
18
|
+
| XHTML/DOM handling | `jsdom@26.1.0` | Stable DOM API and compatible with the supported Node.js runtimes |
|
|
19
|
+
| HTML to Markdown | `turndown` | Widely used baseline converter for conservative Markdown generation |
|
|
20
|
+
| GFM table support | `turndown-plugin-gfm` | Provides a starting point for simple-table conversion without inventing a custom renderer too early |
|
|
21
|
+
|
|
22
|
+
## Project skeleton
|
|
23
|
+
|
|
24
|
+
The codebase is split by the pipeline described in the spec:
|
|
25
|
+
|
|
26
|
+
- `src/cli.ts`
|
|
27
|
+
- CLI surface and exit handling
|
|
28
|
+
- `src/cli/`
|
|
29
|
+
- CLI-only overwrite confirmation, reporting, and command orchestration
|
|
30
|
+
- `src/application/convert-epub.ts`
|
|
31
|
+
- public file-writing API for Node consumers
|
|
32
|
+
- `src/application/convert-epub-document.ts`
|
|
33
|
+
- internal conversion core that returns Markdown plus warnings
|
|
34
|
+
- `src/epub/`
|
|
35
|
+
- archive extraction, `container.xml`, OPF parsing, TOC parsing, spine indexing, spine content loading
|
|
36
|
+
- `src/transform/`
|
|
37
|
+
- DOM cleanup, anchor rewriting, internal-link rewriting, Markdown conversion primitives
|
|
38
|
+
- `src/output/`
|
|
39
|
+
- front matter, title, and TOC rendering
|
|
40
|
+
- `src/domain/`
|
|
41
|
+
- spec constants, shared types, warnings, and fatal error model
|
|
42
|
+
- `src/utils/`
|
|
43
|
+
- path derivation and conservative output handling
|
|
44
|
+
|
|
45
|
+
## Current MVP coverage
|
|
46
|
+
|
|
47
|
+
The current implementation covers the minimum viable pipeline:
|
|
48
|
+
|
|
49
|
+
1. input/output path validation
|
|
50
|
+
2. temp-dir creation and EPUB extraction
|
|
51
|
+
3. `container.xml` parsing
|
|
52
|
+
4. OPF metadata/manifest/spine parsing
|
|
53
|
+
5. TOC source detection and parsing
|
|
54
|
+
6. spine index construction
|
|
55
|
+
7. spine XHTML loading
|
|
56
|
+
8. conservative DOM cleanup
|
|
57
|
+
9. internal target collection from `id` / `name` / `xml:id`, with merged-document anchor generation
|
|
58
|
+
10. low-risk internal link, TOC target, and explicit footnote/backlink rewriting
|
|
59
|
+
11. XHTML-to-Markdown conversion
|
|
60
|
+
12. front matter, book title, TOC, and merged body rendering
|
|
61
|
+
13. final Markdown file emission
|
|
62
|
+
14. stderr warning emission
|
|
63
|
+
|
|
64
|
+
## Still intentionally deferred
|
|
65
|
+
|
|
66
|
+
The following spec areas are still intentionally partial rather than fully complete:
|
|
67
|
+
|
|
68
|
+
1. deeper footnote edge cases beyond explicit source anchors and note/backlink semantics
|
|
69
|
+
2. richer table strategy, especially complex-table HTML fallback detection
|
|
70
|
+
3. broader malformed-EPUB tolerance and regression coverage
|
|
71
|
+
|
|
72
|
+
## Regression Harness
|
|
73
|
+
|
|
74
|
+
The repo now includes a small regression suite using Node's built-in `node:test` runner:
|
|
75
|
+
|
|
76
|
+
- run with `npm test`
|
|
77
|
+
- tests generate temporary EPUB fixtures on the fly
|
|
78
|
+
- current coverage includes:
|
|
79
|
+
- output skeleton generation
|
|
80
|
+
- warning suppression for expected dropped elements
|
|
81
|
+
- `<br>` rendered as plain newline for downstream Markdown tool compatibility
|
|
82
|
+
- ruby converted to explicit text fallback
|
|
83
|
+
- RTL text preserved in TOC and body content
|
|
84
|
+
- degraded-but-readable TOC with unresolved targets downgraded to plain text plus warning
|
|
85
|
+
- explicit footnote/backlink preservation
|
|
86
|
+
- role-based endnotes stored as list items without losing ordered-list semantics
|
|
87
|
+
- image-heavy low-text content preserving surviving text while dropping media
|
|
88
|
+
- row-header tables preserved as Markdown tables without unnecessary HTML fallback
|
|
89
|
+
- simple vs complex table handling
|
|
90
|
+
- inconsistent package navigation metadata downgraded to warning instead of fatal failure
|
|
91
|
+
- invalid nav downgraded to warning instead of fatal failure
|
|
92
|
+
- NCX fallback when nav parsing fails
|
|
93
|
+
- unreadable NCX downgraded to warning instead of fatal failure
|
|
94
|
+
- invalid NCX downgraded to warning instead of fatal failure
|
|
95
|
+
- missing-TOC warning behavior
|
|
96
|
+
- impact-oriented warning wording and explicit CLI visibility policy
|
|
97
|
+
- conservative output-file overwrite failure
|
|
98
|
+
- interactive overwrite confirmation accept path
|
|
99
|
+
- interactive overwrite confirmation decline / EOF path
|
|
100
|
+
|
|
101
|
+
## Known Divergences
|
|
102
|
+
|
|
103
|
+
The current implementation intentionally diverges from one point in the draft spec:
|
|
104
|
+
|
|
105
|
+
- `<br>` currently renders as a plain newline, not trailing `\`
|
|
106
|
+
- reason: downstream tools in actual use, including Obsidian/MarkEdit in this workflow, do not reliably interpret the trailing-backslash hard-break form
|
package/package.json
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "mark-epub-down",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "EPUB to Markdown source generator for LLM knowledge bases, wikis, and related ingestion pipelines",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"keywords": [
|
|
7
|
+
"epub",
|
|
8
|
+
"markdown",
|
|
9
|
+
"cli",
|
|
10
|
+
"node",
|
|
11
|
+
"llm",
|
|
12
|
+
"ingestion"
|
|
13
|
+
],
|
|
14
|
+
"homepage": "https://github.com/thomson1973/mark-epub-down#readme",
|
|
15
|
+
"repository": {
|
|
16
|
+
"type": "git",
|
|
17
|
+
"url": "git+https://github.com/thomson1973/mark-epub-down.git"
|
|
18
|
+
},
|
|
19
|
+
"bugs": {
|
|
20
|
+
"url": "https://github.com/thomson1973/mark-epub-down/issues"
|
|
21
|
+
},
|
|
22
|
+
"bin": {
|
|
23
|
+
"epub2llm": "dist/cli.js"
|
|
24
|
+
},
|
|
25
|
+
"main": "dist/index.js",
|
|
26
|
+
"types": "dist/index.d.ts",
|
|
27
|
+
"exports": {
|
|
28
|
+
".": {
|
|
29
|
+
"types": "./dist/index.d.ts",
|
|
30
|
+
"require": "./dist/index.js",
|
|
31
|
+
"default": "./dist/index.js"
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"files": [
|
|
35
|
+
"dist",
|
|
36
|
+
"README.md",
|
|
37
|
+
"LICENSE",
|
|
38
|
+
"docs"
|
|
39
|
+
],
|
|
40
|
+
"engines": {
|
|
41
|
+
"node": "^20.14.0 || ^22.0.0 || ^24.0.0"
|
|
42
|
+
},
|
|
43
|
+
"scripts": {
|
|
44
|
+
"build": "tsc -p tsconfig.json",
|
|
45
|
+
"prepack": "npm run build",
|
|
46
|
+
"test": "npm run build && node --test test/*.test.mjs",
|
|
47
|
+
"typecheck": "tsc -p tsconfig.json --noEmit",
|
|
48
|
+
"start": "node dist/cli.js"
|
|
49
|
+
},
|
|
50
|
+
"publishConfig": {
|
|
51
|
+
"access": "public"
|
|
52
|
+
},
|
|
53
|
+
"dependencies": {
|
|
54
|
+
"commander": "^14.0.3",
|
|
55
|
+
"extract-zip": "^2.0.1",
|
|
56
|
+
"fast-xml-parser": "^5.5.11",
|
|
57
|
+
"jsdom": "^26.1.0",
|
|
58
|
+
"turndown": "^7.2.4",
|
|
59
|
+
"turndown-plugin-gfm": "^1.0.2"
|
|
60
|
+
},
|
|
61
|
+
"devDependencies": {
|
|
62
|
+
"@types/jsdom": "^28.0.1",
|
|
63
|
+
"@types/node": "^20.19.39",
|
|
64
|
+
"@types/turndown": "^5.0.6",
|
|
65
|
+
"typescript": "^5.5.3"
|
|
66
|
+
}
|
|
67
|
+
}
|