epub-parser 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/.gitignore +6 -0
- data/Gemfile +4 -0
- data/Rakefile +12 -0
- data/epub-parser.gemspec +33 -0
- data/lib/epub/book.rb +37 -0
- data/lib/epub/content_document.rb +6 -0
- data/lib/epub/content_document/navigation.rb +80 -0
- data/lib/epub/ocf.rb +8 -0
- data/lib/epub/ocf/container.rb +22 -0
- data/lib/epub/ocf/encryption.rb +6 -0
- data/lib/epub/ocf/manifest.rb +6 -0
- data/lib/epub/ocf/metadata.rb +6 -0
- data/lib/epub/ocf/rights.rb +6 -0
- data/lib/epub/ocf/signatures.rb +6 -0
- data/lib/epub/parser.rb +46 -0
- data/lib/epub/parser/content_document.rb +58 -0
- data/lib/epub/parser/ocf.rb +59 -0
- data/lib/epub/parser/publication.rb +87 -0
- data/lib/epub/parser/version.rb +5 -0
- data/lib/epub/publication.rb +6 -0
- data/lib/epub/publication/package.rb +25 -0
- data/lib/epub/publication/package/bindings.rb +18 -0
- data/lib/epub/publication/package/guide.rb +11 -0
- data/lib/epub/publication/package/manifest.rb +44 -0
- data/lib/epub/publication/package/metadata.rb +17 -0
- data/lib/epub/publication/package/spine.rb +46 -0
- data/lib/epub/type.rb +7 -0
- data/schemas/epub-nav-30.rnc +10 -0
- data/schemas/epub-nav-30.sch +72 -0
- data/schemas/epub-xhtml-30.sch +377 -0
- data/schemas/ocf-container-30.rnc +16 -0
- data/test/fixtures/book.epub +0 -0
- data/test/fixtures/book/META-INF/container.xml +6 -0
- data/test/fixtures/book/OPS/nav.xhtml +26 -0
- data/test/fixtures/book/OPS//343/203/253/343/203/274/343/203/210/343/203/225/343/202/241/343/202/244/343/203/253.opf +28 -0
- data/test/fixtures/book/mimetype +1 -0
- data/test/helper.rb +9 -0
- data/test/publication/package/test_bindings.rb +15 -0
- data/test/publication/package/test_guide.rb +14 -0
- data/test/publication/package/test_manifest.rb +19 -0
- data/test/publication/package/test_metadata.rb +14 -0
- data/test/publication/package/test_spine.rb +15 -0
- data/test/test_parser.rb +52 -0
- data/test/test_parser_ocf.rb +22 -0
- data/test/test_parser_publication.rb +15 -0
- metadata +208 -0
data/.gemtest
ADDED
File without changes
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'bundler/gem_tasks'
require 'rake/testtask'
require 'yard'

# Running `rake` with no task name runs the test suite.
task default: :test

# Test task covering every test/**/test_*.rb file.
Rake::TestTask.new do |test_task|
  # Options for the test runner, set while the task is being defined.
  ENV['TESTOPTS'] = '--no-show-detail-immediately --verbose'
  test_task.test_files = FileList['test/**/test_*.rb']
end

# Documentation task provided by YARD.
YARD::Rake::YardocTask.new
|
data/epub-parser.gemspec
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
$LOAD_PATH.push(File.expand_path("../lib", __FILE__))
require "epub/parser/version"

# Gem specification for epub-parser.
Gem::Specification.new do |spec|
  spec.name        = "epub-parser"
  spec.version     = EPUB::Parser::VERSION
  spec.authors     = ["KITAITI Makoto"]
  spec.email       = ["KitaitiMakoto@gmail.com"]
  spec.homepage    = "https://gitorious.org/epub/parser"
  spec.summary     = %q{EPUB 3 Parser}
  spec.description = %q{Parse EPUB 3 book loosely}

  # spec.rubyforge_project = "epub-parser"

  # The fixture with a non-ASCII name is appended under its real UTF-8 name and
  # the quoted, octal-escaped entry (presumably how `git ls-files` prints that
  # path) is removed from the list.
  spec.files = `git ls-files`.split("\n").push('test/fixtures/book/OPS/ルートファイル.opf')
  spec.files.delete('"test/fixtures/book/OPS/\343\203\253\343\203\274\343\203\210\343\203\225\343\202\241\343\202\244\343\203\253.opf"')
  spec.test_files    = `git ls-files -- {test,spec,features}/**/*.rb`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Development dependencies, declared in the same order as before.
  [
    ['rubygems-test'],
    ['rake'],
    ['pry'],
    ['test-unit', '~> 2'],
    ['simplecov'],
    ['thin'],
    ['yard']
  ].each { |requirement| spec.add_development_dependency(*requirement) }

  # Runtime dependencies.
  %w[enumerabler nokogiri addressable].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end
end
|
data/lib/epub/book.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'epub/ocf'
|
2
|
+
require 'epub/publication'
|
3
|
+
require 'epub/content_document'
|
4
|
+
|
5
|
+
module EPUB
  # A parsed book: holds the OCF data, the publication package and the
  # content document, and offers a few iteration helpers over the package.
  class Book
    attr_accessor :ocf, :package, :content_document

    # Walks the items of the package spine.
    #
    # With a block, each spine item is passed to it and the result of
    # +items.each+ is returned. Without a block, the spine's +items+
    # collection itself is returned (not wrapped in an Enumerator).
    def each_page_by_spine(&blk)
      spine_items = @package.spine.items
      return spine_items unless block_given?

      spine_items.each(&blk)
    end

    # Not implemented yet; always returns nil.
    def each_page_by_toc(&blk)
    end

    # Walks the items of the package manifest.
    #
    # With a block, each manifest item is passed to it. Without a block,
    # +items.to_enum+ is returned.
    def each_content(&blk)
      manifest_items = @package.manifest.items
      block_given? ? manifest_items.each(&blk) : manifest_items.to_enum
    end

    # Not implemented yet; always returns nil.
    def other_navigation
    end

    # Syntax sugar for the full path of the first rootfile of the container.
    def rootfile_path
      container = ocf.container
      container.rootfile.full_path
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module EPUB
  module ContentDocument
    # Model of a navigation document: a collection of Nav objects.
    class Navigation
      attr_accessor :navs
      alias_method :navigations, :navs
      alias_method :navigations=, :navs=

      # First nav whose type equals Type::TOC, or nil when there is none.
      def toc
        matching = navs.selector { |candidate| candidate.type == Type::TOC }
        matching.first
      end

      # First nav whose type equals Type::PAGE_LIST, or nil when there is none.
      def page_list
        matching = navs.selector { |candidate| candidate.type == Type::PAGE_LIST }
        matching.first
      end

      # First nav whose type equals Type::LANDMARKS, or nil when there is none.
      def landmarks
        matching = navs.selector { |candidate| candidate.type == Type::LANDMARKS }
        matching.first
      end

      # Enumerator version of toc (not implemented yet; returns nil).
      # Usage: navigation.enum_for(:contents)
      def contents; end

      # Enumerator version of page_list (not implemented yet; returns nil).
      # Usage: navigation.enum_for(:pages)
      def pages; end

      # Iterator for #toc (not implemented yet; returns nil).
      def each_content; end

      # Iterator for #page_list (not implemented yet; returns nil).
      def each_page; end

      # Iterator for #landmarks (not implemented yet; returns nil).
      def each_landmark; end

      # A single nav element.
      class Nav
        attr_accessor :heading
        attr_accessor :ol
        attr_accessor :items  # children of ol, thus li
        attr_accessor :type   # toc, page-list, landmarks or other
        attr_accessor :hidden

        # #show and #hide methods are unnecessary
        # because this is for a parser, not for a builder nor a manipulator.
        # Not implemented yet; returns nil.
        def hidden?; end

        # An ordered list inside a nav.
        class Ol
          # list-style :none
          attr_accessor :hidden

          # Not implemented yet; returns nil.
          def hidden?; end

          # May be followed by an ol or be a leaf node.
          class A
            attr_accessor :ol # optional
            attr_accessor :hidden

            # Not implemented yet; returns nil.
            def hidden?; end
          end

          # Must be followed by an ol, i.e. must not be a leaf node.
          class Span
            attr_accessor :ol # required
            attr_accessor :hidden

            # Not implemented yet; returns nil.
            def hidden?; end
          end
        end
      end
    end
  end
end
|
data/lib/epub/ocf.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module EPUB
  class OCF
    # Model of the OCF container document: a list of rootfile entries.
    class Container
      # One rootfile entry: its full path and its media type.
      class Rootfile
        attr_accessor :full_path
        attr_accessor :media_type
      end

      # Name of the container document.
      FILE = 'container.xml'

      attr_accessor :rootfiles

      # Starts with an empty rootfile list.
      def initialize
        @rootfiles = []
      end

      # Syntax sugar: the first rootfile, or nil when the list is empty.
      def rootfile
        rootfiles.first
      end
    end
  end
end
|
data/lib/epub/parser.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'epub/book'
|
2
|
+
require 'epub/parser/version'
|
3
|
+
require 'epub/parser/ocf'
|
4
|
+
require 'epub/parser/publication'
|
5
|
+
require 'epub/parser/content_document'
|
6
|
+
require 'nokogiri'
|
7
|
+
|
8
|
+
require 'shellwords'

module EPUB
  # Extracts an EPUB file into a directory and parses the extracted files
  # into an EPUB::Book.
  class Parser
    # Validates the paths, creates the extraction directory when missing and
    # unzips the EPUB file into it.
    #
    # @param filepath [String] path of the EPUB file; must be readable
    # @param root_directory [String] directory to extract into; created when
    #   it does not exist, must not be an existing regular file
    # @param options [Hash] 'unzip-command' => command (optionally with
    #   arguments) used for extraction; defaults to 'unzip'
    # @raise [RuntimeError] when +filepath+ is not readable or
    #   +root_directory+ is an existing regular file
    def initialize(filepath, root_directory, options = {})
      # Double quotes are required so the offending path appears in the message.
      raise "File #{filepath} not readable" unless File.readable_real? filepath
      raise "File #{root_directory} already exists" if File.file? root_directory

      @filepath = File.realpath filepath
      Dir.mkdir(root_directory) unless File.directory? root_directory
      @dir = File.realpath root_directory

      @book = Book.new

      # Split the configured command into words instead of appending to it:
      # this leaves the caller's (possibly frozen) string untouched, and the
      # multi-argument form of system bypasses the shell, so paths with
      # spaces or shell metacharacters are passed through verbatim.
      unzip_cmd = Shellwords.split(options['unzip-command'].to_s)
      unzip_cmd = ['unzip'] if unzip_cmd.empty?
      system(*unzip_cmd, @filepath, '-d', @dir)
    end

    # Parses the extracted book and fills in the OCF data, the package and
    # the content document.
    #
    # @return [EPUB::Book] the populated book
    def parse
      @book.ocf = parse_ocf
      @book.package = parse_publication
      @book.content_document = parse_content_document
      # ...

      @book
    end

    # Parses the OCF data found under the extraction directory.
    def parse_ocf
      OCF.parse @dir
    end

    # Parses the package document located at the book's rootfile path,
    # resolved against the extraction directory. Requires the OCF data to be
    # set on the book already.
    def parse_publication
      Publication.parse File.join(@dir, @book.rootfile_path)
    end

    # Content document parsing is not wired in yet; returns nil.
    def parse_content_document
      # ContentDocument.parse @dir
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'epub/content_document'
|
2
|
+
require 'epub/parser'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
module EPUB
  class Parser
    # Parser for content documents; currently only the navigation helpers
    # are implemented.
    class ContentDocument
      # Heading element names, highest rank first.
      HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze

      class << self
        # Shortcut for new(root_directory).parse.
        def parse(root_directory)
          new(root_directory).parse
        end
      end

      # @param [String] root_directory directory the book was extracted into
      def initialize(root_directory)
        @dir = root_directory
      end

      # @raise [RuntimeError] always; parsing is not implemented yet
      def parse
        raise 'Not implemented yet'
      end

      # @param [Nokogiri::HTML::Document] document HTML document or element including nav
      # @return [Array<EPUB::ContentDocument::Navigation::Nav>] navs array of Nav object
      def parse_navigations(document)
        # NOTE(review): Parser::NAMESPACES is expected to be defined elsewhere
        # and to map the xhtml prefix - confirm.
        document.search('/xhtml:html/xhtml:body//xhtml:nav', Parser::NAMESPACES).map do |elem|
          parse_navigation elem
        end
      end

      # @param [Nokogiri::XML::Element] element nav element
      # @return [EPUB::ContentDocument::Navigation::Nav] nav Nav object
      def parse_navigation(element)
        nav = EPUB::ContentDocument::Navigation::Nav.new
        nav.heading = find_heading element
        # The previous debug output (p ...) and the assignment into the
        # temporary hash returned by element.namespaces were dropped: a
        # library should not print to stdout, and the assignment did not
        # change the element.
        nav.type = element['epub:type']

        nav
      end

      private

      # Finds the heading text of a nav element.
      #
      # For an hgroup, the text of its highest-ranked heading (h1 before h2,
      # and so on) is used.
      #
      # @param [Nokogiri::XML::Element] element nav element
      # @return [String, nil] heading text, or nil when no heading is found
      def find_heading(element)
        heading = element.xpath('./h1|h2|h3|h4|h5|h6|hgroup').first
        return nil unless heading
        return heading.text unless heading.name == 'hgroup'

        # An empty node set is truthy, so the ranks are probed one by one
        # instead of being chained with ||.
        HEADING_TAGS.each do |tag|
          ranked = (heading / tag).first
          return ranked.text if ranked
        end
        nil
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'epub/ocf'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module EPUB
  class Parser
    # Parser for the OCF files stored under the META-INF directory of an
    # extracted book.
    class OCF
      DIRECTORY = 'META-INF'
      # Defines one <MODULE>_FILE constant ("<module>.xml") per OCF module.
      # NOTE(review): parse_container relies on CONTAINER_FILE, so
      # EPUB::OCF::MODULES presumably includes "container" - confirm.
      EPUB::OCF::MODULES.each {|m| self.const_set "#{m.upcase}_FILE", "#{m}.xml"}

      class << self
        # Shortcut for new(root_directory).parse.
        def parse(root_directory)
          new(root_directory).parse
        end
      end

      # @param [String] root_directory directory the book was extracted into
      def initialize(root_directory)
        @dir = root_directory
        @ocf = EPUB::OCF.new
      end

      # Runs parse_<module> for every OCF module and stores each result on
      # the OCF object through the matching writer.
      #
      # @return [EPUB::OCF]
      def parse
        EPUB::OCF::MODULES.each do |m|
          @ocf.send "#{m}=", send("parse_#{m}")
        end
        @ocf
      end

      # Reads the container document and collects its rootfile entries.
      #
      # @return [EPUB::OCF::Container]
      def parse_container
        container = EPUB::OCF::Container.new
        path = File.join(@dir, DIRECTORY, CONTAINER_FILE)
        # Block form closes the file handle once the document is read.
        doc = File.open(path) { |io| Nokogiri.XML(io) }

        doc.xpath('/xmlns:container/xmlns:rootfiles/xmlns:rootfile', doc.namespaces).each do |elem|
          rootfile = EPUB::OCF::Container::Rootfile.new
          %w[full-path media-type].each do |attr|
            rootfile.send("#{attr.tr('-', '_')}=", elem[attr])
          end
          # Append once per rootfile element, after all attributes are set
          # (previously done inside the attribute loop, i.e. twice).
          container.rootfiles << rootfile
        end

        container
      end

      # Not implemented yet; returns nil.
      def parse_encryption
      end

      # Not implemented yet; returns nil.
      def parse_manifest
      end

      # Not implemented yet; returns nil.
      def parse_metadata
      end

      # Not implemented yet; returns nil.
      def parse_rights
      end

      # Not implemented yet; returns nil.
      def parse_signatures
      end
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'addressable/uri'
|
3
|
+
require 'epub/publication'
|
4
|
+
|
5
|
+
module EPUB
  class Parser
    # Parser for the package document (rootfile) of a publication.
    class Publication
      class << self
        # Shortcut for new(file).parse.
        def parse(file)
          new(file).parse
        end
      end

      # @param [String] file path of the package document
      def initialize(file)
        @package = EPUB::Publication::Package.new
        @rootfile = Addressable::URI.parse File.realpath(file)
        # Block form closes the file handle once the document is read.
        @doc = File.open(file) { |io| Nokogiri.XML(io) }
      end

      # Parses the manifest and the spine; metadata, guide and bindings are
      # not parsed yet.
      #
      # @return [EPUB::Publication::Package]
      def parse
        # parse_metadata
        parse_manifest
        parse_spine
        # parse_guide
        # parse_bindings

        @package
      end

      # @raise [RuntimeError] always; not implemented yet
      def parse_metadata
        raise 'still not implemented'
      end

      # Builds the manifest from the manifest element and its item children.
      # Each item's href is resolved against the package document's location.
      #
      # @return [EPUB::Publication::Package::Manifest]
      def parse_manifest
        manifest = @package.manifest = EPUB::Publication::Package::Manifest.new
        elem = @doc.xpath('/xmlns:package/xmlns:manifest', @doc.namespaces).first
        manifest.id = elem['id']

        # [item, fallback id] pairs, resolved once every item is registered.
        # A list is used rather than a hash keyed by fallback id so that
        # several items may share the same fallback.
        pending_fallbacks = []
        elem.xpath('./xmlns:item').each do |elm|
          item = EPUB::Publication::Package::Manifest::Item.new
          %w[ id media-type media-overlay ].each do |attr|
            item.send "#{attr.tr('-', '_')}=", elm[attr]
          end
          # item.href = Addressable::URI.parse elm['href']
          item.href = @rootfile.join Addressable::URI.parse(elm['href'])
          pending_fallbacks << [item, elm['fallback']] if elm['fallback']
          item.properties = elm['properties'] ? elm['properties'].split(' ') : []
          manifest << item
        end
        pending_fallbacks.each do |item, fallback_id|
          item.fallback = manifest[fallback_id]
        end

        manifest
      end

      # Builds the spine from the spine element and its itemref children.
      # An itemref is linear unless its linear attribute is exactly "no".
      #
      # @return [EPUB::Publication::Package::Spine]
      def parse_spine
        spine = @package.spine = EPUB::Publication::Package::Spine.new
        elem = @doc.xpath('/xmlns:package/xmlns:spine', @doc.namespaces).first
        %w[ id toc page-progression-direction ].each do |attr|
          spine.send("#{attr.tr('-', '_')}=", elem[attr])
        end

        elem.xpath('./xmlns:itemref', @doc.namespaces).each do |elm|
          itemref = EPUB::Publication::Package::Spine::Itemref.new
          %w[ idref id ].each do |attr|
            itemref.send "#{attr}=", elm[attr]
          end
          itemref.linear = elm['linear'] != 'no'
          itemref.properties = elm['properties'] ? elm['properties'].split(' ') : []
          spine << itemref
        end

        spine
      end

      # @raise [RuntimeError] always; not implemented yet
      def parse_guide
        raise 'still not implemented'
      end

      # @raise [RuntimeError] always; not implemented yet
      def parse_bindings
        raise 'still not implemented'
      end
    end
  end
end
|