epub_tools 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ci.yml +21 -0
- data/.gitignore +7 -0
- data/.nova/Configuration.json +4 -0
- data/.ruby-version +1 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +53 -0
- data/README.md +118 -0
- data/Rakefile +9 -0
- data/bin/epub-tools +107 -0
- data/epub_tools.gemspec +21 -0
- data/lib/epub_tools/add_chapters_to_epub.rb +87 -0
- data/lib/epub_tools/cli_helper.rb +31 -0
- data/lib/epub_tools/compile_book.rb +121 -0
- data/lib/epub_tools/epub_initializer.rb +197 -0
- data/lib/epub_tools/pack_ebook.rb +60 -0
- data/lib/epub_tools/split_chapters.rb +105 -0
- data/lib/epub_tools/text_style_class_finder.rb +47 -0
- data/lib/epub_tools/unpack_ebook.rb +46 -0
- data/lib/epub_tools/version.rb +3 -0
- data/lib/epub_tools/xhtml_cleaner.rb +75 -0
- data/lib/epub_tools/xhtml_extractor.rb +46 -0
- data/lib/epub_tools.rb +12 -0
- data/style.css +99 -0
- data/test/add_chapters_to_epub_test.rb +92 -0
- data/test/compile_book_test.rb +193 -0
- data/test/epub_initializer_test.rb +55 -0
- data/test/pack_ebook_test.rb +68 -0
- data/test/split_chapters_test.rb +53 -0
- data/test/test_helper.rb +9 -0
- data/test/text_style_class_finder_test.rb +40 -0
- data/test/unpack_ebook_test.rb +58 -0
- data/test/xhtml_cleaner_test.rb +39 -0
- data/test/xhtml_extractor_test.rb +31 -0
- metadata +142 -0
@@ -0,0 +1,197 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'fileutils'
|
3
|
+
require 'time'
|
4
|
+
require 'securerandom'
|
5
|
+
|
6
|
+
module EpubTools
  # Creates the on-disk skeleton of an EPUB 3 book: the mimetype file,
  # META-INF/container.xml and, under OEBPS/, a title page, an optional cover
  # page, package.opf, nav.xhtml and a copy of style.css.
  class EpubInitializer
    # Entity replacements applied to user-supplied text (title, author) before
    # it is interpolated into generated XML/XHTML.
    XML_ESCAPES = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;' }.freeze

    # Supported cover image extensions and the media type declared for each.
    COVER_MEDIA_TYPES = {
      '.jpg' => 'image/jpeg',
      '.jpeg' => 'image/jpeg',
      '.png' => 'image/png',
      '.gif' => 'image/gif',
      '.svg' => 'image/svg+xml'
    }.freeze

    private_constant :XML_ESCAPES, :COVER_MEDIA_TYPES

    # title: book title; author: author name; destination: output EPUB directory
    # cover_image: optional path to cover image file
    def initialize(title, author, destination, cover_image = nil)
      @title = title
      @author = author
      @destination = File.expand_path(destination)
      @uuid = "urn:uuid:#{SecureRandom.uuid}"
      @modified = Time.now.utc.iso8601
      @cover_image_path = cover_image
      # Both stay nil unless write_cover accepts the image; write_package_opf
      # only emits cover entries when @cover_image_fname is set.
      @cover_image_fname = nil
      @cover_image_media_type = nil
    end

    # Writes every file of the skeleton. The cover is processed before
    # package.opf so the manifest can reference it.
    def run
      create_structure
      write_mimetype
      write_title_page
      write_container
      write_cover if @cover_image_path
      write_package_opf
      write_nav
      write_style
    end

    private

    # Escapes the characters that are significant in XML so arbitrary text can
    # be embedded in element content without breaking well-formedness.
    def xml_escape(text)
      text.to_s.gsub(/[&<>"]/, XML_ESCAPES)
    end

    def create_structure
      FileUtils.mkdir_p("#{@destination}/META-INF")
      FileUtils.mkdir_p("#{@destination}/OEBPS")
    end

    # The mimetype file is written without a trailing newline.
    def write_mimetype
      File.write("#{@destination}/mimetype", "application/epub+zip")
    end

    def write_title_page
      title = xml_escape(@title)
      author = xml_escape(@author)
      content = <<~XHTML
        <?xml version="1.0" encoding="UTF-8"?>
        <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
          <head>
            <meta charset="UTF-8" />
            <title>#{title}</title>
            <link rel="stylesheet" type="text/css" href="style.css"/>
          </head>
          <body>
            <h1 class="title">#{title}</h1>
            <p class="author">by #{author}</p>
          </body>
        </html>
      XHTML

      File.write("#{@destination}/OEBPS/title.xhtml", content)
    end

    def write_container
      content = <<~XML
        <?xml version="1.0" encoding="UTF-8"?>
        <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
          <rootfiles>
            <rootfile full-path="OEBPS/package.opf" media-type="application/oebps-package+xml"/>
          </rootfiles>
        </container>
      XML
      File.write("#{@destination}/META-INF/container.xml", content)
    end

    # Copies the cover image into the EPUB structure and creates a cover.xhtml page.
    # A missing file or an unsupported extension only produces a warning and
    # leaves the book without a cover.
    def write_cover
      path = @cover_image_path
      unless File.exist?(path)
        warn "Warning: cover image '#{path}' not found; skipping cover support."
        return
      end

      ext = File.extname(path).downcase
      media_type = COVER_MEDIA_TYPES[ext]
      unless media_type
        warn "Warning: unsupported cover image type '#{ext}'; skipping cover support."
        return
      end

      @cover_image_media_type = media_type
      @cover_image_fname = "cover#{ext}"
      FileUtils.cp(path, File.join(@destination, 'OEBPS', @cover_image_fname))
      write_cover_page
    end

    # Generates a cover.xhtml file displaying the cover image
    def write_cover_page
      content = <<~XHTML
        <?xml version="1.0" encoding="UTF-8"?>
        <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
          <head>
            <meta charset="UTF-8" />
            <title>Cover</title>
            <link rel="stylesheet" type="text/css" href="style.css"/>
          </head>
          <body>
            <div class="cover-image">
              <img src="#{@cover_image_fname}" alt="Cover"/>
            </div>
          </body>
        </html>
      XHTML
      File.write(File.join(@destination, 'OEBPS', 'cover.xhtml'), content)
    end

    # Generates the package.opf with optional cover image entries
    def write_package_opf
      manifest_items = []
      spine_items = []

      manifest_items << '<item id="style" href="style.css" media-type="text/css"/>'
      manifest_items << '<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>'

      if @cover_image_fname
        manifest_items << %Q{<item id="cover-image" href="#{@cover_image_fname}" media-type="#{@cover_image_media_type}" properties="cover-image"/>}
        manifest_items << '<item id="cover-page" href="cover.xhtml" media-type="application/xhtml+xml"/>'
        spine_items << '<itemref idref="cover-page"/>'
      end

      manifest_items << '<item id="title" href="title.xhtml" media-type="application/xhtml+xml"/>'
      spine_items << '<itemref idref="title"/>'

      metadata = []
      metadata << %Q{<dc:identifier id="pub-id">#{@uuid}</dc:identifier>}
      metadata << %Q{<dc:title>#{xml_escape(@title)}</dc:title>}
      metadata << %Q{<dc:creator>#{xml_escape(@author)}</dc:creator>}
      metadata << "<dc:language>en</dc:language>"
      metadata << %Q{<meta property="dcterms:modified">#{@modified}</meta>}
      metadata << %Q{<meta property="schema:accessMode">textual</meta>}
      metadata << %Q{<meta property="schema:accessibilityFeature">unknown</meta>}
      metadata << %Q{<meta property="schema:accessibilityHazard">none</meta>}
      metadata << %Q{<meta property="schema:accessModeSufficient">textual</meta>}
      metadata << %Q{<meta name="cover" content="cover-image"/>} if @cover_image_fname

      content = <<~XML
        <?xml version="1.0" encoding="utf-8"?>
        <package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="pub-id" xml:lang="en">
          <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
        #{metadata.map { |line| "    #{line}" }.join("\n")}
          </metadata>
          <manifest>
        #{manifest_items.map { |line| "    #{line}" }.join("\n")}
          </manifest>
          <spine>
        #{spine_items.map { |line| "    #{line}" }.join("\n")}
          </spine>
        </package>
      XML

      File.write(File.join(@destination, 'OEBPS', 'package.opf'), content)
    end

    # Generates the initial navigation document (Table of Contents)
    def write_nav
      content = <<~XHTML
        <?xml version="1.0" encoding="utf-8"?>
        <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en">
          <head>
            <title>Table of Contents</title>
          </head>
          <body>
            <nav epub:type="toc" id="toc">
              <h1>Table of Contents</h1>
              <ol>
                <li><a href="title.xhtml">Title Page</a></li>
              </ol>
            </nav>
          </body>
        </html>
      XHTML
      File.write(File.join(@destination, 'OEBPS', 'nav.xhtml'), content)
    end

    # Copies style.css from the current working directory into OEBPS/.
    # When no such file exists a warning is printed and nothing is copied.
    def write_style
      src = File.join(Dir.pwd, 'style.css')
      dest = File.join(@destination, 'OEBPS', 'style.css')
      unless File.exist?(src)
        warn "Warning: style.css not found in project root (#{src}), skipping copy."
        return
      end
      FileUtils.cp(src, dest)
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'zip'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'pathname'
|
4
|
+
|
5
|
+
module EpubTools
  # Zips an unpacked EPUB directory into a single .epub archive.
  class PackEbook
    # input_dir: directory holding the unpacked book (mimetype, META-INF, OEBPS)
    # output_file: archive to create; nil or "" means "<basename of input_dir>.epub"
    # verbose: print a confirmation line once the archive has been written
    def initialize(input_dir, output_file = nil, verbose: false)
      @input_dir = File.expand_path(input_dir)
      use_default = output_file.nil? || output_file.empty?
      @output_file = use_default ? "#{File.basename(@input_dir)}.epub" : output_file
      @verbose = verbose
    end

    # Validates the input directory, then builds the archive from inside it so
    # that entry names are relative to the book root.
    def run
      validate_input!
      Dir.chdir(@input_dir) { write_archive(archive_path) }
      puts "EPUB created: #{@output_file}" if @verbose
    end

    private

    # Raises ArgumentError when the directory or its mimetype file is missing.
    def validate_input!
      raise ArgumentError, "Directory '#{@input_dir}' does not exist." unless Dir.exist?(@input_dir)

      return if File.file?(File.join(@input_dir, 'mimetype'))

      raise ArgumentError, "Error: 'mimetype' file missing in #{@input_dir}"
    end

    # Absolute output paths are used unchanged; relative ones are placed next
    # to the input directory (the caller has already chdir'ed into it).
    def archive_path
      return @output_file if Pathname.new(@output_file).absolute?

      File.join('..', @output_file)
    end

    # Replaces any existing archive at +target+: mimetype goes in first, then
    # every other file in sorted order.
    def write_archive(target)
      FileUtils.rm_f(target)
      Zip::File.open(target, Zip::File::CREATE) do |zip|
        add_mimetype(zip)
        payload_entries.each { |entry| zip.add(entry, entry) }
      end
    end

    # All files below the current directory (dotfiles included) except the
    # mimetype file, sorted by path; directories are left out.
    def payload_entries
      Dir.glob('**/*', File::FNM_DOTMATCH).sort.reject do |entry|
        %w[. .. mimetype].include?(entry) || File.directory?(entry)
      end
    end

    # The mimetype entry is added via add_stored, i.e. without compression.
    def add_mimetype(zip)
      zip.add_stored('mimetype', 'mimetype')
    end
  end
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'fileutils'
require 'nokogiri'
require 'yaml'
require_relative 'text_style_class_finder'
require_relative 'xhtml_cleaner'
|
6
|
+
|
7
|
+
module EpubTools
  # Splits one XHTML document into a separate file per chapter.
  #
  # Top-level <body> children are scanned for "Chapter N" / "Prologue" marker
  # nodes; everything between two markers becomes one chapter file, which is
  # then passed through XHTMLCleaner.
  class SplitChapters
    # input_file: path to the source XHTML
    # book_title: title to use in HTML <title> tags
    # output_dir: where to write chapter files (created if missing, including parents)
    # output_prefix: filename prefix (e.g. "chapter")
    # verbose: print each written file name
    def initialize(input_file, book_title, output_dir = './chapters', output_prefix = 'chapter', verbose = false)
      @input_file = input_file
      @book_title = book_title
      @output_dir = output_dir
      @output_prefix = output_prefix
      @verbose = verbose
    end

    def run
      # mkdir_p so nested output paths such as "build/chapters" work
      FileUtils.mkdir_p(@output_dir)

      doc = Nokogiri::HTML(read_and_strip_problematic_hr)

      # Writes the style-class YAML that XHTMLCleaner loads by default
      TextStyleClassFinder.new(@input_file, verbose: @verbose).call

      chapters = extract_chapters(doc)
      write_chapter_files(chapters)
    end

    private

    # Returns the input file's text with every <hr> and <br> tag removed.
    def read_and_strip_problematic_hr
      File.read(@input_file).gsub(/<hr\b[^>]*\/?>/i, '').gsub(/<br\b[^>]*\/?>/i, '')
    end

    # Returns a Hash mapping chapter number (0 for the prologue) to the HTML of
    # the nodes that follow its marker. Nodes before the first marker are
    # dropped, and the marker nodes themselves are not copied.
    def extract_chapters(doc)
      chapters = {}
      body = doc.at('body')
      return chapters unless body # nothing to split (e.g. empty input)

      current_number = nil
      current_fragment = nil

      body.children.each do |node|
        marker = marker_number(node)
        if marker
          # flush the chapter collected so far, then start a new one
          chapters[current_number] = current_fragment.to_html if current_number
          current_number = marker
          current_fragment = Nokogiri::HTML::DocumentFragment.parse('')
        else
          current_fragment&.add_child(node.dup)
        end
      end

      chapters[current_number] = current_fragment.to_html if current_number
      chapters
    end

    # Chapter number announced by +node+, 0 for a prologue marker, or nil when
    # the node is ordinary content.
    # NOTE(review): the chapter pattern is unanchored, so any p/span/h2-h4 whose
    # text contains "Chapter <digits>" counts as a marker — confirm intended.
    def marker_number(node)
      match = node.text.match(/Chapter\s+(\d+)/i)
      return match[1].to_i if match && %w[p span h2 h3 h4].include?(node.name)
      return 0 if prologue_marker?(node)

      nil
    end

    def write_chapter_files(chapters)
      chapters.each do |number, content|
        write_chapter_file(number, content)
      end
    end

    # Writes "<prefix>_<label>.xhtml" into the output directory and cleans it
    # in place with XHTMLCleaner.
    def write_chapter_file(label, content)
      heading = display_label(label)
      filename = File.join(@output_dir, "#{@output_prefix}_#{label}.xhtml")
      File.write(filename, <<~HTML)
        <?xml version="1.0" encoding="UTF-8"?>
        <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
          <head>
            <title>#{escape_xml(@book_title)} - #{heading}</title>
            <link rel="stylesheet" type="text/css" href="style.css"/>
          </head>
          <body>
            <h1>#{heading}</h1>
            #{content}
          </body>
        </html>
      HTML
      XHTMLCleaner.new(filename).call
      puts "Extracted: #{filename}" if @verbose
    end

    def display_label(label)
      label > 0 ? "Chapter #{label}" : "Prologue"
    end

    # Escapes XML-significant characters so the book title cannot break the
    # generated file, which XHTMLCleaner re-parses as XML.
    def escape_xml(text)
      text.to_s.gsub(/[&<>]/, '&' => '&amp;', '<' => '&lt;', '>' => '&gt;')
    end

    # Detect a Prologue marker: an h3/h4 whose stripped text is exactly "Prologue"
    def prologue_marker?(node)
      return false unless %w[h3 h4].include?(node.name)
      return false unless node.text.strip =~ /\APrologue\z/i
      true
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
module EpubTools
  # Reads the <style> blocks of an (X)HTML file and records which CSS classes
  # declare italic text or a font weight of 700, saving the result as YAML.
  class TextStyleClassFinder
    # file_path: document to inspect; must exist, otherwise ArgumentError is raised
    # output_path: YAML file that receives the "italics" and "bolds" lists
    # verbose: also print the class names that were found
    def initialize(file_path, output_path = 'text_style_classes.yaml', verbose: false)
      raise ArgumentError, "File does not exist: #{file_path}" unless File.exist?(file_path)

      @file_path = file_path
      @output_path = output_path
      @verbose = verbose
    end

    # Collects the class names and writes them to the output file.
    def call
      css = Nokogiri::HTML(File.read(@file_path)).xpath('//style').map(&:text).join("\n")

      italic_classes = extract_classes(css, /font-style\s*:\s*italic/)
      bold_classes = extract_classes(css, /font-weight\s*:\s*700/)

      print_summary(italic_classes, bold_classes) if @verbose

      File.write(@output_path, { "italics" => italic_classes, "bolds" => bold_classes }.to_yaml)
    end

    private

    # Names (without the leading dot) of the class selectors whose rule body
    # matches +pattern+, de-duplicated in order of first appearance.
    def extract_classes(style_text, pattern)
      rule = /\.([\w-]+)\s*{[^}]*#{pattern.source}[^}]*}/i
      style_text.scan(rule).map(&:first).uniq
    end

    # Prints one line per non-empty list.
    def print_summary(italics, bolds)
      puts "Classes with font-style: italic: #{italics.join(", ")}" unless italics.empty?
      puts "Classes with font-weight: 700: #{bolds.join(", ")}" unless bolds.empty?
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'zip'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
module EpubTools
  # Unpacks an EPUB (.epub file) into a directory
  class UnpackEbook
    # epub_file: path to the .epub file
    # output_dir: directory to extract into; nil or "" means a directory next
    #             to the archive named after it without the .epub extension
    # verbose: print a confirmation line when done
    def initialize(epub_file, output_dir = nil, verbose: false)
      @epub_file = File.expand_path(epub_file)
      default_dir = File.join(File.dirname(@epub_file), File.basename(@epub_file, '.epub'))
      @output_dir = output_dir.nil? || output_dir.empty? ? default_dir : output_dir
      @verbose = verbose
    end

    # Extracts all entries from the EPUB into the output directory, overwriting
    # files that already exist. Entries whose name would resolve outside the
    # output directory (e.g. "../x") are skipped with a warning.
    #
    # Raises ArgumentError when the archive does not exist.
    def run
      validate!
      FileUtils.mkdir_p(@output_dir)
      Zip::File.open(@epub_file) do |zip|
        zip.each do |entry|
          dest_path = File.join(@output_dir, entry.name)
          unless inside_output_dir?(dest_path)
            warn "Warning: skipping entry '#{entry.name}': path escapes #{@output_dir}"
            next
          end

          if entry.directory?
            FileUtils.mkdir_p(dest_path)
          else
            FileUtils.mkdir_p(File.dirname(dest_path))
            # the block answers "overwrite?" for files that already exist
            entry.extract(dest_path) { true }
          end
        end
      end
      puts "Unpacked #{File.basename(@epub_file)} to #{@output_dir}" if @verbose
    end

    private

    def validate!
      return if File.file?(@epub_file)

      raise ArgumentError, "EPUB file '#{@epub_file}' does not exist"
    end

    # True when +path+, after resolving "." and ".." segments, is the output
    # directory itself or lies below it.
    def inside_output_dir?(path)
      root = File.expand_path(@output_dir)
      target = File.expand_path(path)
      prefix = root.end_with?(File::SEPARATOR) ? root : root + File::SEPARATOR
      target == root || target.start_with?(prefix)
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
module EpubTools
  # Rewrites an XHTML file in place: drops empty paragraphs, removes the
  # parents of "bold" spans, turns "italic" spans into <i> elements and unwraps
  # every remaining <span>. Which classes count as bold or italic comes from a
  # YAML file with "bolds" and "italics" lists.
  class XHTMLCleaner
    # filename: XHTML file to clean (overwritten by #call)
    # class_config: YAML file listing the bold/italic class names
    def initialize(filename, class_config = 'text_style_classes.yaml')
      @filename = filename
      # An empty YAML file loads as a falsy value; treat it as "no classes".
      @classes = (YAML.load_file(class_config) || {}).transform_keys(&:to_sym)
    end

    def call
      raw_content = read_and_strip_problematic_hr
      doc = parse_xml(raw_content)
      remove_empty_paragraphs(doc)
      remove_bold_spans(doc)
      replace_italic_spans(doc)
      unwrap_remaining_spans(doc)
      write_pretty_output(doc)
    end

    private

    # Returns the file's text with every <hr> and <br> tag removed.
    def read_and_strip_problematic_hr
      File.read(@filename).gsub(/<hr\b[^>]*\/?>/i, '').gsub(/<br\b[^>]*\/?>/i, '')
    end

    # Parses +content+ as XML; any error raised while parsing aborts the process.
    def parse_xml(content)
      Nokogiri::XML(content) { |config| config.default_xml.noblanks }
    rescue => e
      abort "Error parsing XML: #{e.message}"
    end

    # Removes paragraphs that are empty or contain only empty spans; every
    # other paragraph loses its class attribute.
    def remove_empty_paragraphs(doc)
      doc.css('p').each do |p|
        content = p.inner_html.strip
        if content.empty? || content =~ /\A(<span[^>]*>\s*<\/span>\s*)+\z/
          p.remove
        else
          p.remove_attribute('class')
        end
      end
    end

    # Removes the parent element of every span carrying a "bold" class.
    def remove_bold_spans(doc)
      Array(@classes[:bolds]).each do |class_name|
        doc.css("span.#{class_name}").each do |node|
          node.parent&.remove
        end
      end
    end

    # Renames spans carrying an "italic" class to <i> and drops the class.
    def replace_italic_spans(doc)
      Array(@classes[:italics]).each do |class_name|
        doc.css("span.#{class_name}").each do |node|
          node.name = "i"
          node.remove_attribute('class')
        end
      end
    end

    # Replaces each remaining span with its own child nodes. Moving the nodes
    # (rather than re-inserting the span's text) keeps nested elements such as
    # the <i> tags created above and avoids re-parsing text as markup.
    def unwrap_remaining_spans(doc)
      doc.css("span").each do |span|
        next unless span.parent

        span.replace(span.children)
      end
    end

    def write_pretty_output(doc)
      formatted_xml = doc.to_xml(indent: 2)
      File.write(@filename, formatted_xml)
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'zip'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
module EpubTools
  # Pulls the .xhtml entries (nav.xhtml excepted) out of every EPUB archive in
  # a directory and writes them, flattened, into a target directory.
  class XHTMLExtractor
    # source_dir: directory searched (non-recursively) for *.epub files
    # target_dir: directory receiving the extracted files; created right away
    # verbose: print each archive and each written path
    def initialize(source_dir:, target_dir:, verbose: false)
      @source_dir = File.expand_path(source_dir)
      @target_dir = File.expand_path(target_dir)
      @verbose = verbose
      FileUtils.mkdir_p(@target_dir)
    end

    # Processes every archive found in the source directory.
    def extract_all
      epub_files.each { |epub_path| extract_xhtmls_from(epub_path) }
    end

    private

    def epub_files
      Dir.glob(File.join(@source_dir, '*.epub'))
    end

    # Extracts the wanted entries of one archive as
    # "<archive name>_<entry basename>" and returns the written paths.
    # A Zip::Error is reported as a warning instead of being raised.
    def extract_xhtmls_from(epub_path)
      epub_name = File.basename(epub_path, '.epub')
      puts "Extracting from #{epub_name}.epub" if @verbose
      written = []
      Zip::File.open(epub_path) do |archive|
        archive.each do |entry|
          next unless wanted?(entry.name)

          destination = File.join(@target_dir, "#{epub_name}_#{File.basename(entry.name)}")
          FileUtils.mkdir_p(File.dirname(destination))
          # the block answers "overwrite?" for files that already exist
          entry.extract(destination) { true }
          puts destination if @verbose
          written << destination
        end
      end
      written
    rescue Zip::Error => e
      warn "⚠️ Failed to process #{epub_path}: #{e.message}"
    end

    # True for entry names ending in .xhtml (case-insensitive) other than nav.xhtml.
    def wanted?(entry_name)
      return false unless entry_name.downcase.end_with?('.xhtml')

      File.basename(entry_name).downcase != 'nav.xhtml'
    end
  end
end
|
data/lib/epub_tools.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require_relative 'epub_tools/version'
|
2
|
+
require_relative 'epub_tools/add_chapters_to_epub'
|
3
|
+
require_relative 'epub_tools/cli_helper'
|
4
|
+
require_relative 'epub_tools/epub_initializer'
|
5
|
+
require_relative 'epub_tools/split_chapters'
|
6
|
+
require_relative 'epub_tools/xhtml_cleaner'
|
7
|
+
require_relative 'epub_tools/xhtml_extractor'
|
8
|
+
require_relative 'epub_tools/pack_ebook'
|
9
|
+
require_relative 'epub_tools/unpack_ebook'
|
10
|
+
|
11
|
+
# Root namespace of the gem. It is intentionally empty here: the classes that
# live under it are defined in the files loaded by the require_relative lines above.
module EpubTools
|
12
|
+
end
|
data/style.css
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
/* Lists and tables: no default spacing */
ol {
  margin: 0;
  padding: 0
}

table td,
table th {
  padding: 0;
}

li {
  color: #000000;
  font-size: 11pt;
}

/* Body text */
p {
  margin: 0;
  margin-bottom: 1em;
  color: #000000;
  font-size: 11pt;
  line-height: 1.4;
  orphans: 2;
  widows: 2;
  text-align: left;
}

p:first-of-type {
  text-indent: 1em;
}

/* Chapter heading */
h1 {
  text-align: center;
  padding-top: 16pt;
  color: #000000;
  font-size: 16pt;
  padding-bottom: 16pt;
  line-height: 1.15;
  page-break-after: avoid;
  orphans: 2;
  widows: 2;
}

/* Title page */
h1.title {
  font-size: 42pt;
  margin-bottom: 20pt;
}

.author {
  text-align: center;
  font-size: 14pt;
}

/* Sub-headings */
h2 {
  padding-top: 18pt;
  color: #000000;
  font-size: 16pt;
  padding-bottom: 6pt;
  line-height: 1.15;
  page-break-after: avoid;
  orphans: 2;
  widows: 2;
  text-align: left
}

h3 {
  padding-top: 16pt;
  color: #434343;
  font-size: 14pt;
  padding-bottom: 4pt;
  line-height: 1.15;
  page-break-after: avoid;
  orphans: 2;
  widows: 2;
  text-align: left
}

h4 {
  padding-top: 14pt;
  color: #666666;
  font-size: 12pt;
  padding-bottom: 4pt;
  line-height: 1.15;
  page-break-after: avoid;
  orphans: 2;
  widows: 2;
  text-align: left
}

h5 {
  padding-top: 12pt;
  color: #666666;
  font-size: 11pt;
  padding-bottom: 4pt;
  line-height: 1.15;
  page-break-after: avoid;
  orphans: 2;
  widows: 2;
  text-align: left
}

h6 {
  padding-top: 12pt;
  color: #666666;
  font-size: 11pt;
  padding-bottom: 4pt;
  line-height: 1.15;
  page-break-after: avoid;
  font-style: italic;
  orphans: 2;
  widows: 2;
  text-align: left
}
|