mark-epub-down 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +151 -0
  3. package/dist/application/convert-epub-document.d.ts +10 -0
  4. package/dist/application/convert-epub-document.js +80 -0
  5. package/dist/application/convert-epub-document.js.map +1 -0
  6. package/dist/application/convert-epub.d.ts +13 -0
  7. package/dist/application/convert-epub.js +35 -0
  8. package/dist/application/convert-epub.js.map +1 -0
  9. package/dist/cli/confirm-overwrite.d.ts +1 -0
  10. package/dist/cli/confirm-overwrite.js +29 -0
  11. package/dist/cli/confirm-overwrite.js.map +1 -0
  12. package/dist/cli/reporting.d.ts +3 -0
  13. package/dist/cli/reporting.js +24 -0
  14. package/dist/cli/reporting.js.map +1 -0
  15. package/dist/cli/run-convert-command.d.ts +10 -0
  16. package/dist/cli/run-convert-command.js +61 -0
  17. package/dist/cli/run-convert-command.js.map +1 -0
  18. package/dist/cli.d.ts +2 -0
  19. package/dist/cli.js +28 -0
  20. package/dist/cli.js.map +1 -0
  21. package/dist/domain/errors.d.ts +7 -0
  22. package/dist/domain/errors.js +20 -0
  23. package/dist/domain/errors.js.map +1 -0
  24. package/dist/domain/spec.d.ts +8 -0
  25. package/dist/domain/spec.js +49 -0
  26. package/dist/domain/spec.js.map +1 -0
  27. package/dist/domain/types.d.ts +48 -0
  28. package/dist/domain/types.js +3 -0
  29. package/dist/domain/types.js.map +1 -0
  30. package/dist/domain/warnings.d.ts +8 -0
  31. package/dist/domain/warnings.js +49 -0
  32. package/dist/domain/warnings.js.map +1 -0
  33. package/dist/epub/archive.d.ts +3 -0
  34. package/dist/epub/archive.js +22 -0
  35. package/dist/epub/archive.js.map +1 -0
  36. package/dist/epub/container.d.ts +2 -0
  37. package/dist/epub/container.js +46 -0
  38. package/dist/epub/container.js.map +1 -0
  39. package/dist/epub/content.d.ts +7 -0
  40. package/dist/epub/content.js +34 -0
  41. package/dist/epub/content.js.map +1 -0
  42. package/dist/epub/opf.d.ts +2 -0
  43. package/dist/epub/opf.js +122 -0
  44. package/dist/epub/opf.js.map +1 -0
  45. package/dist/epub/spine.d.ts +2 -0
  46. package/dist/epub/spine.js +25 -0
  47. package/dist/epub/spine.js.map +1 -0
  48. package/dist/epub/toc.d.ts +3 -0
  49. package/dist/epub/toc.js +165 -0
  50. package/dist/epub/toc.js.map +1 -0
  51. package/dist/index.d.ts +4 -0
  52. package/dist/index.js +8 -0
  53. package/dist/index.js.map +1 -0
  54. package/dist/output/render.d.ts +9 -0
  55. package/dist/output/render.js +54 -0
  56. package/dist/output/render.js.map +1 -0
  57. package/dist/transform/anchors.d.ts +2 -0
  58. package/dist/transform/anchors.js +18 -0
  59. package/dist/transform/anchors.js.map +1 -0
  60. package/dist/transform/cleanup.d.ts +4 -0
  61. package/dist/transform/cleanup.js +37 -0
  62. package/dist/transform/cleanup.js.map +1 -0
  63. package/dist/transform/links.d.ts +14 -0
  64. package/dist/transform/links.js +174 -0
  65. package/dist/transform/links.js.map +1 -0
  66. package/dist/transform/markdown.d.ts +2 -0
  67. package/dist/transform/markdown.js +102 -0
  68. package/dist/transform/markdown.js.map +1 -0
  69. package/dist/transform/tables.d.ts +5 -0
  70. package/dist/transform/tables.js +130 -0
  71. package/dist/transform/tables.js.map +1 -0
  72. package/dist/utils/epub-path.d.ts +14 -0
  73. package/dist/utils/epub-path.js +56 -0
  74. package/dist/utils/epub-path.js.map +1 -0
  75. package/dist/utils/path.d.ts +2 -0
  76. package/dist/utils/path.js +33 -0
  77. package/dist/utils/path.js.map +1 -0
  78. package/docs/epub-to-md-v1-public-spec.md +176 -0
  79. package/docs/v1-technical-selection.md +106 -0
  80. package/package.json +67 -0
@@ -0,0 +1,56 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.normalizeEpubPath = normalizeEpubPath;
7
+ exports.splitEpubHref = splitEpubHref;
8
+ exports.buildTargetKey = buildTargetKey;
9
+ exports.resolveEpubHref = resolveEpubHref;
10
+ const node_path_1 = __importDefault(require("node:path"));
11
/**
 * Canonicalises a path that points inside an EPUB archive.
 *
 * Backslashes are turned into forward slashes, the result is passed through
 * `path.posix.normalize`, and a leading "./" that survives normalisation is
 * removed.
 *
 * @param {string} value - Archive path to canonicalise.
 * @returns {string} The normalised, forward-slash path.
 */
function normalizeEpubPath(value) {
    const withForwardSlashes = value.split("\\").join("/");
    const collapsed = node_path_1.default.posix.normalize(withForwardSlashes);
    if (collapsed.startsWith("./")) {
        return collapsed.slice(2);
    }
    return collapsed;
}
15
/**
 * Splits an href into its path part and its optional fragment.
 *
 * Only the first "#" is treated as the delimiter; everything after it is kept
 * as the fragment, so a malformed href such as "ch1.xhtml#a#b" is not
 * silently truncated to a different, shorter fragment.
 *
 * @param {string} href - Href to split (not trimmed here).
 * @returns {{ pathPart: string, fragment: (string|undefined) }} The text
 *   before the first "#", and the text after it, or `undefined` when there is
 *   no "#" or nothing follows it.
 */
function splitEpubHref(href) {
    const hashIndex = href.indexOf("#");
    if (hashIndex === -1) {
        return { pathPart: href, fragment: undefined };
    }
    const fragment = href.slice(hashIndex + 1);
    return {
        pathPart: href.slice(0, hashIndex),
        // An empty fragment ("ch1.xhtml#") is reported as absent.
        fragment: fragment.length > 0 ? fragment : undefined,
    };
}
22
/**
 * Builds the lookup key for an internal link target.
 *
 * @param {string} resourcePath - Path of the target resource; it is passed
 *   through `normalizeEpubPath` before use.
 * @param {string} [fragment] - Optional fragment identifier.
 * @returns {string} `<normalized path>#<fragment>` when `fragment` is truthy,
 *   otherwise just the normalized path.
 */
function buildTargetKey(resourcePath, fragment) {
    const normalizedPath = normalizeEpubPath(resourcePath);
    if (!fragment) {
        return normalizedPath;
    }
    return `${normalizedPath}#${fragment}`;
}
25
/**
 * Resolves an href found in an EPUB content document against the path of the
 * document that contains it.
 *
 * @param {string} baseDocumentPath - Archive path of the containing document.
 * @param {string} href - Raw href value; surrounding whitespace is trimmed.
 * @returns {{ kind: "external", href: string } |
 *   { kind: "internal", href: string, resourcePath: string,
 *     fragment: (string|undefined), targetKey: string }}
 *   An "external" result for hrefs with a URI scheme or a leading "//";
 *   otherwise an "internal" result carrying the normalized resource path,
 *   the fragment, and the key produced by `buildTargetKey`.
 */
function resolveEpubHref(baseDocumentPath, href) {
    const trimmedHref = href.trim();
    if (isExternalHref(trimmedHref)) {
        return {
            kind: "external",
            href: trimmedHref,
        };
    }
    // Fragment-only reference: the target lives in the containing document.
    if (trimmedHref.startsWith("#")) {
        const fragment = trimmedHref.slice(1);
        return {
            kind: "internal",
            href: trimmedHref,
            resourcePath: normalizeEpubPath(baseDocumentPath),
            fragment,
            targetKey: buildTargetKey(baseDocumentPath, fragment),
        };
    }
    const { pathPart, fragment } = splitEpubHref(trimmedHref);
    const basePath = normalizeEpubPath(baseDocumentPath);
    // An empty path part (empty or whitespace-only href) refers to the
    // containing document itself. Joining "" onto the base directory would
    // instead resolve to that directory (or "." for a root-level document).
    const resourcePath = pathPart === ""
        ? basePath
        : normalizeEpubPath(node_path_1.default.posix.join(node_path_1.default.posix.dirname(basePath), pathPart));
    return {
        kind: "internal",
        href: trimmedHref,
        resourcePath,
        fragment,
        targetKey: buildTargetKey(resourcePath, fragment),
    };
}
53
/**
 * Reports whether an href points outside the EPUB package.
 *
 * @param {string} href - Href to classify (expected to be trimmed already).
 * @returns {boolean} `true` when the href starts with "//" or with a URI
 *   scheme (a letter followed by letters, digits, "+", "." or "-", then ":").
 */
function isExternalHref(href) {
    if (href.startsWith("//")) {
        return true;
    }
    const schemePrefix = /^[a-z][a-z0-9+.-]*:/i;
    return schemePrefix.test(href);
}
56
+ //# sourceMappingURL=epub-path.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"epub-path.js","sourceRoot":"","sources":["../../src/utils/epub-path.ts"],"names":[],"mappings":";;;;;AAUA,8CAGC;AAED,sCASC;AAED,wCAEC;AAED,0CAiCC;AA/DD,0DAA6B;AAU7B,SAAgB,iBAAiB,CAAC,KAAa;IAC7C,MAAM,UAAU,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;IAC7C,OAAO,mBAAI,CAAC,KAAK,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;AAC/D,CAAC;AAED,SAAgB,aAAa,CAAC,IAAY;IAIxC,MAAM,CAAC,QAAQ,EAAE,QAAQ,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;IAChD,OAAO;QACL,QAAQ;QACR,QAAQ,EAAE,QAAQ,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS;KACjE,CAAC;AACJ,CAAC;AAED,SAAgB,cAAc,CAAC,YAAoB,EAAE,QAAiB;IACpE,OAAO,QAAQ,CAAC,CAAC,CAAC,GAAG,iBAAiB,CAAC,YAAY,CAAC,IAAI,QAAQ,EAAE,CAAC,CAAC,CAAC,iBAAiB,CAAC,YAAY,CAAC,CAAC;AACvG,CAAC;AAED,SAAgB,eAAe,CAAC,gBAAwB,EAAE,IAAY;IACpE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAEhC,IAAI,cAAc,CAAC,WAAW,CAAC,EAAE,CAAC;QAChC,OAAO;YACL,IAAI,EAAE,UAAU;YAChB,IAAI,EAAE,WAAW;SAClB,CAAC;IACJ,CAAC;IAED,IAAI,WAAW,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QAChC,MAAM,QAAQ,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACtC,OAAO;YACL,IAAI,EAAE,UAAU;YAChB,IAAI,EAAE,WAAW;YACjB,YAAY,EAAE,iBAAiB,CAAC,gBAAgB,CAAC;YACjD,QAAQ;YACR,SAAS,EAAE,cAAc,CAAC,gBAAgB,EAAE,QAAQ,CAAC;SACtD,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,aAAa,CAAC,WAAW,CAAC,CAAC;IAC1D,MAAM,YAAY,GAAG,iBAAiB,CACpC,mBAAI,CAAC,KAAK,CAAC,IAAI,CAAC,mBAAI,CAAC,KAAK,CAAC,OAAO,CAAC,iBAAiB,CAAC,gBAAgB,CAAC,CAAC,EAAE,QAAQ,CAAC,CACnF,CAAC;IAEF,OAAO;QACL,IAAI,EAAE,UAAU;QAChB,IAAI,EAAE,WAAW;QACjB,YAAY;QACZ,QAAQ;QACR,SAAS,EAAE,cAAc,CAAC,YAAY,EAAE,QAAQ,CAAC;KAClD,CAAC;AACJ,CAAC;AAED,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,sBAAsB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;AACpE,CAAC"}
@@ -0,0 +1,2 @@
1
/**
 * Chooses the Markdown output path for a conversion.
 *
 * When `outputPath` is a non-empty string it is resolved against `cwd`;
 * otherwise the path is built from the directory and base name of
 * `inputPath` with a `.md` extension.
 */
+ export declare function deriveOutputPath(inputPath: string, outputPath: string | undefined, cwd: string): string;
2
/**
 * Checks whether `outputPath` may be written.
 *
 * Resolves when the path cannot be accessed or when `overwrite` is true;
 * otherwise rejects with a fatal `ConversionError` whose code is
 * `OUTPUT_EXISTS`. `overwrite` defaults to `false`.
 */
+ export declare function ensureOutputPathAvailable(outputPath: string, overwrite?: boolean): Promise<void>;
@@ -0,0 +1,33 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.deriveOutputPath = deriveOutputPath;
7
+ exports.ensureOutputPathAvailable = ensureOutputPathAvailable;
8
+ const promises_1 = require("node:fs/promises");
9
+ const node_path_1 = __importDefault(require("node:path"));
10
+ const errors_1 = require("../domain/errors");
11
/**
 * Chooses the Markdown output path for a conversion.
 *
 * Both branches return an absolute path anchored at `cwd`, so an explicit
 * output path and a derived one behave the same way for relative inputs.
 * An absolute `inputPath` is unaffected by `cwd`.
 *
 * @param {string} inputPath - Path of the source EPUB.
 * @param {string|undefined} outputPath - Explicit output path, if any. An
 *   empty string is treated as "not provided".
 * @param {string} cwd - Directory that relative paths are resolved against.
 * @returns {string} The explicit path resolved against `cwd`, or the input
 *   path with its extension replaced by `.md`, resolved against `cwd`.
 */
function deriveOutputPath(inputPath, outputPath, cwd) {
    if (outputPath) {
        return node_path_1.default.resolve(cwd, outputPath);
    }
    const { dir, name } = node_path_1.default.parse(inputPath);
    // resolve() leaves an absolute `dir` untouched and anchors a relative
    // (or empty) one at `cwd` instead of the process working directory.
    return node_path_1.default.resolve(cwd, dir, `${name}.md`);
}
18
/**
 * Verifies that the output path may be written without clobbering a file.
 *
 * @param {string} outputPath - Path the Markdown file will be written to.
 * @param {boolean} [overwrite=false] - Allow an existing file to be replaced.
 * @returns {Promise<void>} Resolves when the path cannot be accessed or when
 *   `overwrite` is true.
 * @throws {ConversionError} Fatal `OUTPUT_EXISTS` when the path is accessible
 *   and `overwrite` is false.
 */
async function ensureOutputPathAvailable(outputPath, overwrite = false) {
    let targetExists = true;
    try {
        await promises_1.access(outputPath);
    }
    catch (accessError) {
        if (accessError instanceof errors_1.ConversionError) {
            throw accessError;
        }
        // NOTE(review): every other access failure, not only a missing file,
        // is treated as "path is free" here.
        targetExists = false;
    }
    if (!targetExists || overwrite) {
        return;
    }
    throw errors_1.ConversionError.fatal("OUTPUT_EXISTS", `output file already exists and overwrite is disabled: ${outputPath}`);
}
33
+ //# sourceMappingURL=path.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"path.js","sourceRoot":"","sources":["../../src/utils/path.ts"],"names":[],"mappings":";;;;;AAKA,4CAOC;AAED,8DAsBC;AApCD,+CAA0C;AAC1C,0DAA6B;AAE7B,6CAAmD;AAEnD,SAAgB,gBAAgB,CAAC,SAAiB,EAAE,UAA8B,EAAE,GAAW;IAC7F,IAAI,UAAU,EAAE,CAAC;QACf,OAAO,mBAAI,CAAC,OAAO,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;IACvC,CAAC;IAED,MAAM,MAAM,GAAG,mBAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IACrC,OAAO,mBAAI,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,GAAG,MAAM,CAAC,IAAI,KAAK,CAAC,CAAC;AACpD,CAAC;AAEM,KAAK,UAAU,yBAAyB,CAC7C,UAAkB,EAClB,SAAS,GAAG,KAAK;IAEjB,IAAI,CAAC;QACH,MAAM,IAAA,iBAAM,EAAC,UAAU,CAAC,CAAC;IAC3B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,wBAAe,EAAE,CAAC;YACrC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,OAAO;IACT,CAAC;IAED,IAAI,SAAS,EAAE,CAAC;QACd,OAAO;IACT,CAAC;IAED,MAAM,wBAAe,CAAC,KAAK,CACzB,eAAe,EACf,yDAAyD,UAAU,EAAE,CACtE,CAAC;AACJ,CAAC"}
@@ -0,0 +1,176 @@
1
+ # EPUB to Markdown v1 Public Spec
2
+
3
+ ## Overview
4
+
5
+ `mark-epub-down` is a Node.js CLI and package that converts a single EPUB into a single Markdown document.
6
+
7
+ The v1 output is intended as source material for LLM knowledge bases, wikis, and related ingestion pipelines. The project prioritizes semantic preservation, source correctness, and low-risk transformation over reader-oriented Markdown polish.
8
+
9
+ ## Scope
10
+
11
+ - Input: one `.epub` file
12
+ - Output: one `.md` file
13
+ - Supported runtime targets: Node.js `20`, `22`, and `24`
14
+ - Implementation language: TypeScript
15
+ - Distribution shape: npm package with CLI and programmatic Node API
16
+
17
+ ## Goals
18
+
19
+ - Preserve meaningful document structure and content semantics.
20
+ - Keep source order aligned with the EPUB spine.
21
+ - Include EPUB-native table-of-contents information in the output.
22
+ - Prefer conservative transformations over aggressive normalization.
23
+ - Produce Markdown that works well as a downstream ingestion source.
24
+
25
+ ## Non-goals
26
+
27
+ - Perfect visual reproduction of EPUB layout or CSS presentation
28
+ - Viewer-specific Markdown tuning as the primary goal
29
+ - Heuristic reconstruction of missing structure
30
+ - Chapter-splitting output in the v1 baseline
31
+
32
+ ## CLI
33
+
34
+ The v1 CLI keeps a small surface:
35
+
36
+ ```text
37
+ epub2llm <input.epub>
38
+ epub2llm <input.epub> -o <output.md>
39
+ epub2llm -h
40
+ epub2llm --help
41
+ epub2llm -V
42
+ epub2llm --version
43
+ ```
44
+
45
+ - The input EPUB is a positional argument.
46
+ - `-o` and `--output` select an explicit output path.
47
+ - If no output path is provided, the tool derives one from the input filename with a `.md` extension.
48
+ - Existing output files are not overwritten silently.
49
+ - In interactive terminal use, the CLI may prompt for explicit overwrite confirmation with a default `No` answer.
50
+
51
+ ## Node API
52
+
53
+ The package exposes a programmatic `convertEpub()` function for Node.js use.
54
+
55
+ The stable v1 options are:
56
+
57
+ - `inputPath`
58
+ - `outputPath`
59
+ - `cwd`
60
+ - `overwrite`
61
+
62
+ `convertEpub()` writes the Markdown file and returns structured conversion results, including warnings. It does not prompt for overwrite confirmation or print terminal output. If the output target already exists and `overwrite` is not enabled, the function fails conservatively with `OUTPUT_EXISTS`.
63
+
64
+ ## Output Structure
65
+
66
+ The generated Markdown document uses this high-level structure:
67
+
68
+ 1. minimal YAML front matter
69
+ 2. top-level book title
70
+ 3. dedicated `## TOC` section
71
+ 4. merged body content in spine order
72
+
73
+ The front matter stays minimal and only includes values available from EPUB package metadata:
74
+
75
+ - `title`
76
+ - `creator`
77
+ - `language`
78
+ - `identifier`
79
+ - `publisher`
80
+ - `published`
81
+
82
+ Missing metadata fields are omitted rather than guessed.
83
+
84
+ `published` is mapped from EPUB `dc:date`. Full dates and date-times are normalized to `YYYY-MM-DD`. Partial dates such as `YYYY` or `YYYY-MM` are preserved as-is rather than padded with guessed precision.
85
+
86
+ ## Conversion Rules
87
+
88
+ ### Table of contents
89
+
90
+ - The EPUB-native TOC is the authoritative TOC source.
91
+ - The TOC is rendered as a hierarchical Markdown list under `## TOC`.
92
+ - Entries become Markdown links only when the target can be mapped confidently.
93
+ - Unresolved TOC items remain plain text.
94
+
95
+ ### Document structure
96
+
97
+ - Source heading levels are preserved.
98
+ - Source headings are not globally shifted to compensate for the inserted book title.
99
+ - The merged body follows the source document's own heading structure.
100
+
101
+ ### Links, anchors, and notes
102
+
103
+ - Internal targets are rewritten into collision-safe identifiers for merged single-file output.
104
+ - TOC targets, internal links, and note-related links are rewritten conservatively.
105
+ - When a link target cannot be rewritten safely, the output degrades conservatively instead of guessing.
106
+ - Footnote and note structure is preserved as closely to the original topology as possible.
107
+
108
+ ### Content cleanup
109
+
110
+ - Cleanup is based on DOM/XHTML elements, not page-type inference.
111
+ - The strategy is conservative blacklist removal.
112
+ - Only high-confidence non-text elements are removed by default.
113
+ - Empty containers may be removed only when they carry no visible text, no preserved children, and no necessary structure.
114
+
115
+ The default removable set includes:
116
+
117
+ - `script`
118
+ - `style`
119
+ - `img`
120
+ - `svg`
121
+ - `canvas`
122
+ - `audio`
123
+ - `video`
124
+ - `source`
125
+ - `track`
126
+ - `iframe`
127
+ - `object`
128
+ - `embed`
129
+ - `form`
130
+ - `input`
131
+ - `button`
132
+ - `select`
133
+ - `option`
134
+ - `textarea`
135
+
136
+ Containers such as `figure`, `figcaption`, `aside`, `section`, `nav`, `div`, and `span` are not removed purely by tag name.
137
+
138
+ ### Core element mapping
139
+
140
+ - `h1` to `h6` map to Markdown headings
141
+ - `p` maps to paragraphs
142
+ - `blockquote` maps to Markdown blockquotes
143
+ - `hr` maps to `---`
144
+ - `em` and `i` map to emphasis
145
+ - `strong` and `b` map to strong emphasis
146
+ - `code` maps to inline code
147
+ - safe `a[href]` targets map to Markdown links
148
+
149
+ Definition lists are degraded into Markdown list structures instead of being dropped.
150
+
151
+ ## Errors and Warnings
152
+
153
+ Fatal errors stop conversion and return a non-zero exit code. Typical fatal cases include:
154
+
155
+ - missing input file
156
+ - invalid or unreadable EPUB container
157
+ - missing or unreadable OPF/package document
158
+ - unreadable spine content required for conversion
159
+ - unwritable output path
160
+ - output target already exists and overwrite is not confirmed
161
+
162
+ Warnings still allow output generation and keep a success exit code. Typical warning cases include:
163
+
164
+ - missing TOC
165
+ - unresolved TOC targets
166
+ - links that cannot be safely rewritten
167
+ - dropped elements caused by cleanup rules
168
+ - incomplete metadata
169
+ - source structures that cannot be represented perfectly in Markdown
170
+
171
+ Warnings may be summarized in CLI output, and some low-signal warnings may be retained only in structured results rather than shown in the terminal.
172
+
173
+ ## Validation Boundary
174
+
175
+ - Fixed Layout EPUB (FXL) is out of scope for v1.
176
+ - Validation should cover nested TOCs, footnotes, CJK ruby content, tables, image-heavy EPUBs, degraded TOC metadata, incomplete metadata, and RTL samples.
@@ -0,0 +1,106 @@
1
+ # EPUB to Markdown v1 Technical Selection
2
+
3
+ This document records the implementation choices made strictly from `epub-to-md-v1-spec.md`.
4
+
5
+ ## Runtime and language
6
+
7
+ - Supported runtime targets: Node.js `20`, `22`, and `24`
8
+ - Language: TypeScript
9
+ - Module output: CommonJS for a simple Node CLI and library distribution path
10
+
11
+ ## Package choices
12
+
13
+ | Concern | Package | Why this fits the spec |
14
+ | --- | --- | --- |
15
+ | CLI parsing | `commander` | Mature, minimal, standard `-h/-V/-o` surface |
16
+ | EPUB unzip to temp dir | `extract-zip` | Small, established, matches the "unpack into temporary working area" pipeline |
17
+ | XML parsing | `fast-xml-parser` | Mature, fast, good fit for `container.xml`, OPF, and NCX |
18
+ | XHTML/DOM handling | `jsdom@26.1.0` | Stable DOM API and compatible with the supported Node.js runtimes |
19
+ | HTML to Markdown | `turndown` | Widely used baseline converter for conservative Markdown generation |
20
+ | GFM table support | `turndown-plugin-gfm` | Provides a starting point for simple-table conversion without inventing a custom renderer too early |
21
+
22
+ ## Project skeleton
23
+
24
+ The codebase is split by the pipeline described in the spec:
25
+
26
+ - `src/cli.ts`
27
+ - CLI surface and exit handling
28
+ - `src/cli/`
29
+ - CLI-only overwrite confirmation, reporting, and command orchestration
30
+ - `src/application/convert-epub.ts`
31
+ - public file-writing API for Node consumers
32
+ - `src/application/convert-epub-document.ts`
33
+ - internal conversion core that returns Markdown plus warnings
34
+ - `src/epub/`
35
+ - archive extraction, `container.xml`, OPF parsing, TOC parsing, spine indexing, spine content loading
36
+ - `src/transform/`
37
+ - DOM cleanup, anchor rewriting, internal-link rewriting, Markdown conversion primitives
38
+ - `src/output/`
39
+ - front matter, title, and TOC rendering
40
+ - `src/domain/`
41
+ - spec constants, shared types, warnings, and fatal error model
42
+ - `src/utils/`
43
+ - path derivation and conservative output handling
44
+
45
+ ## Current MVP coverage
46
+
47
+ The current implementation now covers the minimum viable pipeline:
48
+
49
+ 1. input/output path validation
50
+ 2. temp-dir creation and EPUB extraction
51
+ 3. `container.xml` parsing
52
+ 4. OPF metadata/manifest/spine parsing
53
+ 5. TOC source detection and parsing
54
+ 6. spine index construction
55
+ 7. spine XHTML loading
56
+ 8. conservative DOM cleanup
57
+ 9. internal target collection from `id` / `name` / `xml:id`, with merged-document anchor generation
58
+ 10. low-risk internal link, TOC target, and explicit footnote/backlink rewriting
59
+ 11. XHTML-to-Markdown conversion
60
+ 12. front matter, book title, TOC, and merged body rendering
61
+ 13. final Markdown file emission
62
+ 14. stderr warning emission
63
+
64
+ ## Still intentionally deferred
65
+
66
+ The following spec areas are still intentionally partial rather than fully complete:
67
+
68
+ 1. deeper footnote edge cases beyond explicit source anchors and note/backlink semantics
69
+ 2. richer table strategy, especially complex-table HTML fallback detection
70
+ 3. broader malformed-EPUB tolerance and regression coverage
71
+
72
+ ## Regression Harness
73
+
74
+ The repo now includes a small regression suite using Node's built-in `node:test` runner:
75
+
76
+ - run with `npm test`
77
+ - tests generate temporary EPUB fixtures on the fly
78
+ - current coverage includes:
79
+ - output skeleton generation
80
+ - warning suppression for expected dropped elements
81
+ - `<br>` rendered as plain newline for downstream Markdown tool compatibility
82
+ - ruby converted to explicit text fallback
83
+ - RTL text preserved in TOC and body content
84
+ - degraded-but-readable TOC with unresolved targets downgraded to plain text plus warning
85
+ - explicit footnote/backlink preservation
86
+ - role-based endnotes stored as list items without losing ordered-list semantics
87
+ - image-heavy low-text content preserving surviving text while dropping media
88
+ - row-header tables preserved as Markdown tables without unnecessary HTML fallback
89
+ - simple vs complex table handling
90
+ - inconsistent package navigation metadata downgraded to warning instead of fatal failure
91
+ - invalid nav downgraded to warning instead of fatal failure
92
+ - NCX fallback when nav parsing fails
93
+ - unreadable NCX downgraded to warning instead of fatal failure
94
+ - invalid NCX downgraded to warning instead of fatal failure
95
+ - missing-TOC warning behavior
96
+ - impact-oriented warning wording and explicit CLI visibility policy
97
+ - conservative output-file overwrite failure
98
+ - interactive overwrite confirmation accept path
99
+ - interactive overwrite confirmation decline / EOF path
100
+
101
+ ## Known Divergences
102
+
103
+ The current implementation intentionally diverges from the draft spec on one point:
104
+
105
+ - `<br>` currently renders as a plain newline, not trailing `\`
106
+ - reason: downstream tools in actual use, including Obsidian/MarkEdit in this workflow, do not reliably interpret the trailing-backslash hard-break form
package/package.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "name": "mark-epub-down",
3
+ "version": "0.1.0",
4
+ "description": "EPUB to Markdown source generator for LLM knowledge bases, wikis, and related ingestion pipelines",
5
+ "license": "MIT",
6
+ "keywords": [
7
+ "epub",
8
+ "markdown",
9
+ "cli",
10
+ "node",
11
+ "llm",
12
+ "ingestion"
13
+ ],
14
+ "homepage": "https://github.com/thomson1973/mark-epub-down#readme",
15
+ "repository": {
16
+ "type": "git",
17
+ "url": "git+https://github.com/thomson1973/mark-epub-down.git"
18
+ },
19
+ "bugs": {
20
+ "url": "https://github.com/thomson1973/mark-epub-down/issues"
21
+ },
22
+ "bin": {
23
+ "epub2llm": "dist/cli.js"
24
+ },
25
+ "main": "dist/index.js",
26
+ "types": "dist/index.d.ts",
27
+ "exports": {
28
+ ".": {
29
+ "types": "./dist/index.d.ts",
30
+ "require": "./dist/index.js",
31
+ "default": "./dist/index.js"
32
+ }
33
+ },
34
+ "files": [
35
+ "dist",
36
+ "README.md",
37
+ "LICENSE",
38
+ "docs"
39
+ ],
40
+ "engines": {
41
+ "node": "^20.14.0 || ^22.0.0 || ^24.0.0"
42
+ },
43
+ "scripts": {
44
+ "build": "tsc -p tsconfig.json",
45
+ "prepack": "npm run build",
46
+ "test": "npm run build && node --test test/*.test.mjs",
47
+ "typecheck": "tsc -p tsconfig.json --noEmit",
48
+ "start": "node dist/cli.js"
49
+ },
50
+ "publishConfig": {
51
+ "access": "public"
52
+ },
53
+ "dependencies": {
54
+ "commander": "^14.0.3",
55
+ "extract-zip": "^2.0.1",
56
+ "fast-xml-parser": "^5.5.11",
57
+ "jsdom": "^26.1.0",
58
+ "turndown": "^7.2.4",
59
+ "turndown-plugin-gfm": "^1.0.2"
60
+ },
61
+ "devDependencies": {
62
+ "@types/jsdom": "^28.0.1",
63
+ "@types/node": "^20.19.39",
64
+ "@types/turndown": "^5.0.6",
65
+ "typescript": "^5.5.3"
66
+ }
67
+ }