epub-wasm 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -104
- package/epub_wasm_bg.js +120 -0
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -1,141 +1,111 @@
|
|
|
1
1
|
# epub-wasm
|
|
2
2
|
|
|
3
|
-
A
|
|
4
|
-
|
|
5
|
-
## Overview
|
|
6
|
-
|
|
7
|
-
This crate leverages the [`rbook`](https://crates.io/crates/rbook) and [`scraper`](https://crates.io/crates/scraper) crates to parse EPUB files and extract their content into a clean JSON structure with chapters, headings, and paragraphs. The parsed data is then exposed via WebAssembly bindings for use in web applications.
|
|
3
|
+
A WebAssembly module for parsing EPUB files into a structured JSON format. This package provides a fast, browser-compatible way to extract and process EPUB content without server-side processing.
|
|
8
4
|
|
|
9
5
|
## Features
|
|
10
6
|
|
|
11
|
-
- **EPUB parsing**:
|
|
12
|
-
- **
|
|
13
|
-
- **
|
|
14
|
-
- **
|
|
15
|
-
|
|
16
|
-
## Building
|
|
17
|
-
|
|
18
|
-
### Prerequisites
|
|
19
|
-
|
|
20
|
-
- [Rust](https://rustup.rs/) (latest stable)
|
|
21
|
-
- [wasm-pack](https://rustwasm.github.io/wasm-pack/)
|
|
22
|
-
|
|
23
|
-
### Build for WebAssembly
|
|
24
|
-
|
|
25
|
-
```bash
|
|
26
|
-
# Install wasm-pack if not installed
|
|
27
|
-
cargo install wasm-pack
|
|
28
|
-
|
|
29
|
-
# Build for bundler (recommended for Vite/SvelteKit)
|
|
30
|
-
wasm-pack build --release --target bundler --out-dir pkg
|
|
31
|
-
|
|
32
|
-
# Or build for web (serves WASM from URL)
|
|
33
|
-
wasm-pack build --release --target web --out-dir pkg
|
|
34
|
-
```
|
|
7
|
+
- **Fast EPUB parsing**: Implemented in Rust and compiled to WebAssembly for performance
|
|
8
|
+
- **Browser-ready**: Works directly in web browsers and modern JavaScript environments
|
|
9
|
+
- **Structured output**: Converts EPUB content into clean JSON with chapters, headings, and paragraphs
|
|
10
|
+
- **Zero dependencies**: Self-contained WebAssembly module
|
|
35
11
|
|
|
36
|
-
|
|
12
|
+
## Installation
|
|
37
13
|
|
|
38
14
|
```bash
|
|
39
|
-
|
|
15
|
+
npm install epub-wasm
|
|
16
|
+
# or
|
|
17
|
+
yarn add epub-wasm
|
|
18
|
+
# or
|
|
19
|
+
bun add epub-wasm
|
|
40
20
|
```
|
|
41
21
|
|
|
42
22
|
## Usage
|
|
43
23
|
|
|
44
|
-
|
|
24
|
+
```javascript
|
|
25
|
+
import * as wasm from "epub-wasm";
|
|
45
26
|
|
|
46
|
-
|
|
47
|
-
|
|
27
|
+
async function parseEpub() {
|
|
28
|
+
// Load the EPUB file
|
|
29
|
+
const response = await fetch("./book.epub");
|
|
30
|
+
const arrayBuffer = await response.arrayBuffer();
|
|
31
|
+
const uint8Array = new Uint8Array(arrayBuffer);
|
|
48
32
|
|
|
49
|
-
// Parse
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
33
|
+
// Parse with WebAssembly
|
|
34
|
+
const jsonString = wasm.parse_epub(uint8Array);
|
|
35
|
+
const bookData = JSON.parse(jsonString);
|
|
36
|
+
|
|
37
|
+
console.log("Book:", bookData);
|
|
38
|
+
// Access chapters: bookData.chapters
|
|
39
|
+
// Access first chapter blocks: bookData.chapters[0].blocks
|
|
40
|
+
}
|
|
53
41
|
```
|
|
54
42
|
|
|
55
|
-
## API
|
|
43
|
+
## API Reference
|
|
56
44
|
|
|
57
|
-
### `parse_epub(data:
|
|
45
|
+
### `parse_epub(data: Uint8Array): string`
|
|
58
46
|
|
|
59
|
-
Parses an EPUB file from
|
|
47
|
+
Parses an EPUB file from a byte array and returns a JSON string.
|
|
60
48
|
|
|
61
49
|
**Parameters:**
|
|
62
50
|
|
|
63
|
-
- `data`:
|
|
64
|
-
|
|
65
|
-
**Returns:** JSON string with the following structure:
|
|
66
|
-
|
|
67
|
-
```json
|
|
68
|
-
{
|
|
69
|
-
"id": "book_001",
|
|
70
|
-
"title": "Book Title",
|
|
71
|
-
"chapters": [
|
|
72
|
-
{
|
|
73
|
-
"title": "Chapter Title",
|
|
74
|
-
"id": "chapter_001",
|
|
75
|
-
"blocks": [
|
|
76
|
-
{
|
|
77
|
-
"type": "heading",
|
|
78
|
-
"text": "Heading Text"
|
|
79
|
-
},
|
|
80
|
-
{
|
|
81
|
-
"type": "paragraph",
|
|
82
|
-
"text": "Paragraph text..."
|
|
83
|
-
}
|
|
84
|
-
]
|
|
85
|
-
}
|
|
86
|
-
]
|
|
87
|
-
}
|
|
88
|
-
```
|
|
51
|
+
- `data`: `Uint8Array` - The raw bytes of the EPUB file
|
|
89
52
|
|
|
90
|
-
|
|
53
|
+
**Returns:** `string` - JSON representation of the parsed book
|
|
91
54
|
|
|
92
|
-
|
|
93
|
-
- [`scraper`](https://crates.io/crates/scraper): HTML parsing and CSS selector engine
|
|
94
|
-
- [`serde`](https://crates.io/crates/serde): Serialization framework
|
|
95
|
-
- [`wasm-bindgen`](https://crates.io/crates/wasm-bindgen): WebAssembly bindings
|
|
55
|
+
**Error handling:** Returns a fallback empty book JSON if parsing fails
|
|
96
56
|
|
|
97
|
-
|
|
57
|
+
### Extraction Process
|
|
98
58
|
|
|
99
|
-
|
|
100
|
-
epub-wasm/
|
|
101
|
-
├── src/
|
|
102
|
-
│ └── lib.rs # Main library code with WASM bindings
|
|
103
|
-
├── pkg/ # Generated WebAssembly package (after build)
|
|
104
|
-
│ ├── epub_wasm.js
|
|
105
|
-
│ ├── epub_wasm_bg.wasm
|
|
106
|
-
│ └── ...
|
|
107
|
-
├── Cargo.toml # Package configuration
|
|
108
|
-
└── README.md # This file
|
|
109
|
-
```
|
|
59
|
+
The parser extracts EPUB content into a structure described by the following TypeScript type definitions:
|
|
110
60
|
|
|
111
|
-
|
|
61
|
+
```typescript
|
|
62
|
+
type Book = {
|
|
63
|
+
id: string;
|
|
64
|
+
title: string;
|
|
65
|
+
chapters: Chapter[];
|
|
66
|
+
};
|
|
112
67
|
|
|
113
|
-
|
|
68
|
+
type Chapter = {
|
|
69
|
+
title: string;
|
|
70
|
+
id: string;
|
|
71
|
+
blocks: Block[];
|
|
72
|
+
};
|
|
114
73
|
|
|
115
|
-
|
|
116
|
-
|
|
74
|
+
type Block = {
|
|
75
|
+
text: string;
|
|
76
|
+
type: "heading" | "paragraph";
|
|
77
|
+
};
|
|
117
78
|
```
|
|
118
79
|
|
|
119
|
-
|
|
80
|
+
**How extraction works:**
|
|
120
81
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
python3 -m http.server 8000
|
|
127
|
-
# Then open index.html in browser
|
|
128
|
-
```
|
|
82
|
+
- **Book**: Represents the entire EPUB with a unique ID, title, and array of chapters
|
|
83
|
+
- **Chapter**: Each chapter contains a title, unique ID, and content broken into blocks
|
|
84
|
+
- **Block**: Individual content units that are either headings (`h1`-`h6`) or paragraphs
|
|
85
|
+
- Content is extracted from HTML/XHTML files within the EPUB, preserving the document structure
|
|
86
|
+
- Headings and paragraphs are identified and categorized automatically from the EPUB's markup
|
|
129
87
|
|
|
130
|
-
|
|
88
|
+
### JSON Output Structure
|
|
131
89
|
|
|
132
|
-
|
|
90
|
+
```typescript
|
|
91
|
+
interface Book {
|
|
92
|
+
id: string;
|
|
93
|
+
title: string;
|
|
94
|
+
chapters: Chapter[];
|
|
95
|
+
}
|
|
133
96
|
|
|
134
|
-
|
|
97
|
+
interface Chapter {
|
|
98
|
+
title: string;
|
|
99
|
+
id: string;
|
|
100
|
+
blocks: Block[];
|
|
101
|
+
}
|
|
135
102
|
|
|
136
|
-
|
|
103
|
+
interface Block {
|
|
104
|
+
type: "heading" | "paragraph";
|
|
105
|
+
text: string;
|
|
106
|
+
}
|
|
107
|
+
```
|
|
137
108
|
|
|
138
|
-
##
|
|
109
|
+
## License
|
|
139
110
|
|
|
140
|
-
-
|
|
141
|
-
- [`epub-wasm` npm package](pkg/README.md) - The WebAssembly package published to npm
|
|
111
|
+
MIT/Apache-2.0
|
package/epub_wasm_bg.js
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
let wasm;
|
|
2
|
+
export function __wbg_set_wasm(val) {
|
|
3
|
+
wasm = val;
|
|
4
|
+
}
|
|
5
|
+
|
|
6
|
+
function addHeapObject(obj) {
|
|
7
|
+
if (heap_next === heap.length) heap.push(heap.length + 1);
|
|
8
|
+
const idx = heap_next;
|
|
9
|
+
heap_next = heap[idx];
|
|
10
|
+
|
|
11
|
+
heap[idx] = obj;
|
|
12
|
+
return idx;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
function dropObject(idx) {
|
|
16
|
+
if (idx < 132) return;
|
|
17
|
+
heap[idx] = heap_next;
|
|
18
|
+
heap_next = idx;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
function getArrayU8FromWasm0(ptr, len) {
|
|
22
|
+
ptr = ptr >>> 0;
|
|
23
|
+
return getUint8ArrayMemory0().subarray(ptr / 1, ptr / 1 + len);
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
let cachedDataViewMemory0 = null;
|
|
27
|
+
function getDataViewMemory0() {
|
|
28
|
+
if (cachedDataViewMemory0 === null || cachedDataViewMemory0.buffer.detached === true || (cachedDataViewMemory0.buffer.detached === undefined && cachedDataViewMemory0.buffer !== wasm.memory.buffer)) {
|
|
29
|
+
cachedDataViewMemory0 = new DataView(wasm.memory.buffer);
|
|
30
|
+
}
|
|
31
|
+
return cachedDataViewMemory0;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
function getStringFromWasm0(ptr, len) {
|
|
35
|
+
ptr = ptr >>> 0;
|
|
36
|
+
return decodeText(ptr, len);
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
let cachedUint8ArrayMemory0 = null;
|
|
40
|
+
function getUint8ArrayMemory0() {
|
|
41
|
+
if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) {
|
|
42
|
+
cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
|
|
43
|
+
}
|
|
44
|
+
return cachedUint8ArrayMemory0;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
function getObject(idx) { return heap[idx]; }
|
|
48
|
+
|
|
49
|
+
function handleError(f, args) {
|
|
50
|
+
try {
|
|
51
|
+
return f.apply(this, args);
|
|
52
|
+
} catch (e) {
|
|
53
|
+
wasm.__wbindgen_export(addHeapObject(e));
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
let heap = new Array(128).fill(undefined);
|
|
58
|
+
heap.push(undefined, null, true, false);
|
|
59
|
+
|
|
60
|
+
let heap_next = heap.length;
|
|
61
|
+
|
|
62
|
+
function passArray8ToWasm0(arg, malloc) {
|
|
63
|
+
const ptr = malloc(arg.length * 1, 1) >>> 0;
|
|
64
|
+
getUint8ArrayMemory0().set(arg, ptr / 1);
|
|
65
|
+
WASM_VECTOR_LEN = arg.length;
|
|
66
|
+
return ptr;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
function takeObject(idx) {
|
|
70
|
+
const ret = getObject(idx);
|
|
71
|
+
dropObject(idx);
|
|
72
|
+
return ret;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
let cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
|
|
76
|
+
cachedTextDecoder.decode();
|
|
77
|
+
const MAX_SAFARI_DECODE_BYTES = 2146435072;
|
|
78
|
+
let numBytesDecoded = 0;
|
|
79
|
+
function decodeText(ptr, len) {
|
|
80
|
+
numBytesDecoded += len;
|
|
81
|
+
if (numBytesDecoded >= MAX_SAFARI_DECODE_BYTES) {
|
|
82
|
+
cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
|
|
83
|
+
cachedTextDecoder.decode();
|
|
84
|
+
numBytesDecoded = len;
|
|
85
|
+
}
|
|
86
|
+
return cachedTextDecoder.decode(getUint8ArrayMemory0().subarray(ptr, ptr + len));
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
let WASM_VECTOR_LEN = 0;
|
|
90
|
+
|
|
91
|
+
/**
|
|
92
|
+
* @param {Uint8Array} data
|
|
93
|
+
* @returns {string}
|
|
94
|
+
*/
|
|
95
|
+
export function parse_epub(data) {
|
|
96
|
+
let deferred2_0;
|
|
97
|
+
let deferred2_1;
|
|
98
|
+
try {
|
|
99
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
100
|
+
const ptr0 = passArray8ToWasm0(data, wasm.__wbindgen_export2);
|
|
101
|
+
const len0 = WASM_VECTOR_LEN;
|
|
102
|
+
wasm.parse_epub(retptr, ptr0, len0);
|
|
103
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
104
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
105
|
+
deferred2_0 = r0;
|
|
106
|
+
deferred2_1 = r1;
|
|
107
|
+
return getStringFromWasm0(r0, r1);
|
|
108
|
+
} finally {
|
|
109
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
110
|
+
wasm.__wbindgen_export3(deferred2_0, deferred2_1, 1);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
export function __wbg_getRandomValues_9b655bdd369112f2() { return handleError(function (arg0, arg1) {
|
|
115
|
+
globalThis.crypto.getRandomValues(getArrayU8FromWasm0(arg0, arg1));
|
|
116
|
+
}, arguments) };
|
|
117
|
+
|
|
118
|
+
export function __wbindgen_object_drop_ref(arg0) {
|
|
119
|
+
takeObject(arg0);
|
|
120
|
+
};
|
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"Mikiyas <0xmik@proton.me>"
|
|
6
6
|
],
|
|
7
7
|
"description": "EPUB utilities compiled to WebAssembly",
|
|
8
|
-
"version": "0.1.
|
|
8
|
+
"version": "0.1.2",
|
|
9
9
|
"license": "MIT/Apache-2.0",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
"files": [
|
|
15
15
|
"epub_wasm.js",
|
|
16
16
|
"epub_wasm_bg.wasm",
|
|
17
|
+
"epub_wasm_bg.js",
|
|
17
18
|
"epub_wasm.d.ts",
|
|
18
19
|
"epub_wasm_bg.wasm.d.ts",
|
|
19
20
|
"package.json",
|