epub-parser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/.gitignore +6 -0
- data/Gemfile +4 -0
- data/Rakefile +12 -0
- data/epub-parser.gemspec +33 -0
- data/lib/epub/book.rb +37 -0
- data/lib/epub/content_document.rb +6 -0
- data/lib/epub/content_document/navigation.rb +80 -0
- data/lib/epub/ocf.rb +8 -0
- data/lib/epub/ocf/container.rb +22 -0
- data/lib/epub/ocf/encryption.rb +6 -0
- data/lib/epub/ocf/manifest.rb +6 -0
- data/lib/epub/ocf/metadata.rb +6 -0
- data/lib/epub/ocf/rights.rb +6 -0
- data/lib/epub/ocf/signatures.rb +6 -0
- data/lib/epub/parser.rb +46 -0
- data/lib/epub/parser/content_document.rb +58 -0
- data/lib/epub/parser/ocf.rb +59 -0
- data/lib/epub/parser/publication.rb +87 -0
- data/lib/epub/parser/version.rb +5 -0
- data/lib/epub/publication.rb +6 -0
- data/lib/epub/publication/package.rb +25 -0
- data/lib/epub/publication/package/bindings.rb +18 -0
- data/lib/epub/publication/package/guide.rb +11 -0
- data/lib/epub/publication/package/manifest.rb +44 -0
- data/lib/epub/publication/package/metadata.rb +17 -0
- data/lib/epub/publication/package/spine.rb +46 -0
- data/lib/epub/type.rb +7 -0
- data/schemas/epub-nav-30.rnc +10 -0
- data/schemas/epub-nav-30.sch +72 -0
- data/schemas/epub-xhtml-30.sch +377 -0
- data/schemas/ocf-container-30.rnc +16 -0
- data/test/fixtures/book.epub +0 -0
- data/test/fixtures/book/META-INF/container.xml +6 -0
- data/test/fixtures/book/OPS/nav.xhtml +26 -0
- data/test/fixtures/book/OPS//343/203/253/343/203/274/343/203/210/343/203/225/343/202/241/343/202/244/343/203/253.opf +28 -0
- data/test/fixtures/book/mimetype +1 -0
- data/test/helper.rb +9 -0
- data/test/publication/package/test_bindings.rb +15 -0
- data/test/publication/package/test_guide.rb +14 -0
- data/test/publication/package/test_manifest.rb +19 -0
- data/test/publication/package/test_metadata.rb +14 -0
- data/test/publication/package/test_spine.rb +15 -0
- data/test/test_parser.rb +52 -0
- data/test/test_parser_ocf.rb +22 -0
- data/test/test_parser_publication.rb +15 -0
- metadata +208 -0
data/.gemtest
ADDED
File without changes
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Rake tasks for the gem: the bundler gem tasks, a test task and a YARD
# documentation task.
require 'bundler/gem_tasks'
require 'rake/testtask'
require 'yard'

# Running plain `rake` runs the tests.
task default: :test

Rake::TestTask.new do |test_task|
  # Runner options are exported through the TESTOPTS environment variable.
  # The assignment happens while the task is being configured, i.e. as soon
  # as this Rakefile is loaded.
  ENV['TESTOPTS'] = '--no-show-detail-immediately --verbose'
  test_task.test_files = FileList['test/**/test_*.rb']
end

YARD::Rake::YardocTask.new
|
data/epub-parser.gemspec
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for epub-parser.
$:.push File.expand_path("../lib", __FILE__)
require "epub/parser/version"

Gem::Specification.new do |spec|
  spec.name        = "epub-parser"
  spec.version     = EPUB::Parser::VERSION
  spec.authors     = ["KITAITI Makoto"]
  spec.email       = ["KitaitiMakoto@gmail.com"]
  spec.homepage    = "https://gitorious.org/epub/parser"
  spec.summary     = %q{EPUB 3 Parser}
  spec.description = %q{Parse EPUB 3 book loosely}

  # spec.rubyforge_project = "epub-parser"

  # The fixture with a non-ASCII file name is appended under its real name and
  # the quoted, octal-escaped spelling is removed (presumably the form in which
  # `git ls-files` reports it -- confirm against the local git configuration).
  spec.files = `git ls-files`.split("\n").push('test/fixtures/book/OPS/ルートファイル.opf')
  spec.files.delete('"test/fixtures/book/OPS/\343\203\253\343\203\274\343\203\210\343\203\225\343\202\241\343\202\244\343\203\253.opf"')
  spec.test_files    = `git ls-files -- {test,spec,features}/**/*.rb`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Development-only dependencies, in their original declaration order.
  [
    ['rubygems-test'],
    ['rake'],
    ['pry'],
    ['test-unit', '~> 2'],
    ['simplecov'],
    ['thin'],
    ['yard']
  ].each do |name_and_requirement|
    spec.add_development_dependency(*name_and_requirement)
  end

  # Libraries needed at runtime.
  %w[enumerabler nokogiri addressable].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end
end
|
data/lib/epub/book.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'epub/ocf'
|
2
|
+
require 'epub/publication'
|
3
|
+
require 'epub/content_document'
|
4
|
+
|
5
|
+
module EPUB
  # Aggregate of the parsed parts of a book: the OCF data, the publication
  # package and the content document.
  class Book
    attr_accessor :ocf, :package, :content_document

    # Walks the spine items in order.
    #
    # With a block, yields each spine item and returns whatever the
    # collection's #each returns. Without a block, returns the spine item
    # collection itself.
    def each_page_by_spine(&blk)
      spine_items = @package.spine.items
      return spine_items unless block_given?

      spine_items.each(&blk)
    end

    # Placeholder: iteration in table-of-contents order is not implemented
    # and currently returns nil.
    def each_page_by_toc(&blk)
    end

    # Walks every manifest item.
    #
    # With a block, yields each manifest item; without one, returns an
    # enumerator over the manifest items.
    def each_content(&blk)
      manifest_items = @package.manifest.items
      return manifest_items.to_enum unless block_given?

      manifest_items.each(&blk)
    end

    # Placeholder: not implemented, currently returns nil.
    def other_navigation
    end

    # Syntax sugar: full path of the first rootfile declared in the container.
    def rootfile_path
      container = ocf.container
      container.rootfile.full_path
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module EPUB
  module ContentDocument
    # Model of a navigation document: a collection of Nav objects, each
    # carrying a type (toc, page-list, landmarks or other).
    class Navigation
      # Collection of Nav objects; also reachable as #navigations.
      attr_accessor :navs
      alias navigations navs
      alias navigations= navs=

      # @return [Nav, nil] the first nav whose type equals Type::TOC
      def toc
        # Enumerable#find gives the first match (or nil) using only the
        # standard library.
        navs.find { |nav| nav.type == Type::TOC }
      end

      # @return [Nav, nil] the first nav whose type equals Type::PAGE_LIST
      def page_list
        navs.find { |nav| nav.type == Type::PAGE_LIST }
      end

      # @return [Nav, nil] the first nav whose type equals Type::LANDMARKS
      def landmarks
        navs.find { |nav| nav.type == Type::LANDMARKS }
      end

      # Enumerator version of toc
      # Usage: navigation.enum_for(:contents)
      # Not implemented yet; returns nil.
      def contents
      end

      # Enumerator version of page_list
      # Usage: navigation.enum_for(:pages)
      # Not implemented yet; returns nil.
      def pages
      end

      # iterator for #toc (not implemented yet; returns nil)
      def each_content
      end

      # iterator for #page_list (not implemented yet; returns nil)
      def each_page
      end

      # iterator for #landmarks (not implemented yet; returns nil)
      def each_landmark
      end

      # A single nav element.
      class Nav
        attr_accessor :heading, :ol,
                      :items, # children of ol, thus li
                      :type, # toc, page-list, landmarks or other
                      :hidden

        # #show and #hide methods are unnecessary
        # because this is for parser, not for builder nor manipulator
        #
        # @return [Boolean] true when the hidden attribute is set to a truthy
        #   value. Only this object's own flag is consulted.
        def hidden?
          !!hidden
        end

        # Ordered list inside a nav.
        class Ol
          # list-style :none
          attr_accessor :hidden

          # @return [Boolean] true when the hidden attribute is set
          def hidden?
            !!hidden
          end

          # may be followed by ol or be a leaf node
          class A
            attr_accessor :ol, # optional
                          :hidden

            # @return [Boolean] true when the hidden attribute is set
            def hidden?
              !!hidden
            end
          end

          # must be followed by ol, or must not be a leaf node
          class Span
            attr_accessor :ol, # required
                          :hidden

            # @return [Boolean] true when the hidden attribute is set
            def hidden?
              !!hidden
            end
          end
        end
      end
    end
  end
end
|
data/lib/epub/ocf.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module EPUB
  class OCF
    # Holds the rootfile entries read from the container file.
    class Container
      # Name of the container file.
      FILE = 'container.xml'

      # One rootfile entry: its path and its media type.
      class Rootfile
        attr_accessor :full_path, :media_type
      end

      # All rootfile entries, in the order they were added.
      attr_accessor :rootfiles

      # Starts out with no rootfile entries.
      def initialize
        @rootfiles = []
      end

      # syntax sugar
      # @return [Rootfile, nil] the first rootfile entry, nil when there is none
      def rootfile
        rootfiles.first
      end
    end
  end
end
|
data/lib/epub/parser.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'epub/book'
|
2
|
+
require 'epub/parser/version'
|
3
|
+
require 'epub/parser/ocf'
|
4
|
+
require 'epub/parser/publication'
|
5
|
+
require 'epub/parser/content_document'
|
6
|
+
require 'nokogiri'
|
7
|
+
|
8
|
+
require 'shellwords'

module EPUB
  # Extracts an EPUB archive into a directory and builds a Book from the
  # extracted files.
  class Parser
    # Validates the paths, creates the extraction directory when needed and
    # unpacks the archive into it.
    #
    # @param filepath [String] path to the EPUB file; must be readable
    # @param root_directory [String] directory to extract into; created when
    #   it does not exist, must not be an existing regular file
    # @param options [Hash] 'unzip-command' overrides the extraction command
    #   (default 'unzip'); it may carry extra arguments, e.g. 'unzip -q'
    # @raise [RuntimeError] when filepath is not readable or root_directory
    #   is an existing file
    def initialize(filepath, root_directory, options = {})
      # Double quotes so the offending path is actually interpolated.
      raise "File #{filepath} not readable" unless File.readable_real? filepath
      raise "File #{root_directory} already exists" if File.file? root_directory

      @filepath = File.realpath filepath
      Dir.mkdir(root_directory) unless File.directory? root_directory
      @dir = File.realpath root_directory

      @book = Book.new

      # The command is split into words and run without a shell, so paths
      # containing spaces or shell metacharacters are passed through intact
      # and the caller's option string is never modified.
      # NOTE(review): the exit status of the command is not checked.
      unzip_cmd = Shellwords.split(options['unzip-command'] || 'unzip')
      system(*unzip_cmd, @filepath, '-d', @dir)
    end

    # Parses each part in turn and stores it on the book.
    #
    # @return [EPUB::Book] the populated book
    def parse
      @book.ocf = parse_ocf
      @book.package = parse_publication
      @book.content_document = parse_content_document
      # ...

      @book
    end

    # Parses the OCF data found under the extraction directory.
    def parse_ocf
      OCF.parse @dir
    end

    # Parses the package document named by the book's rootfile path.
    # Relies on @book.ocf having been set (see #parse).
    def parse_publication
      Publication.parse File.join(@dir, @book.rootfile_path)
    end

    # Content document parsing is disabled for now; returns nil.
    def parse_content_document
      # ContentDocument.parse @dir
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'epub/content_document'
|
2
|
+
require 'epub/parser'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
module EPUB
  class Parser
    # Parser for content documents. Only the navigation helpers are
    # implemented; #parse itself still raises.
    class ContentDocument
      # Namespace prefixes used in the XPath queries of this class.
      NAMESPACES = {
        'xhtml' => 'http://www.w3.org/1999/xhtml',
        'epub'  => 'http://www.idpf.org/2007/ops'
      }.freeze

      # Heading element names, highest rank first.
      HEADING_NAMES = %w[h1 h2 h3 h4 h5 h6].freeze

      class << self
        # Shortcut for new(root_directory).parse
        def parse(root_directory)
          new(root_directory).parse
        end
      end

      # @param [String] root_directory directory the book was extracted into
      def initialize(root_directory)
        @dir = root_directory
      end

      # @raise [RuntimeError] always; parsing is not implemented yet
      def parse
        raise 'Not implemented yet'
      end

      # @param [Nokogiri::HTML::Document] document HTML document or element including nav
      # @return [Array<EPUB::ContentDocument::Navigation::Nav>] navs array of Nav object
      def parse_navigations(document)
        document.search('/xhtml:html/xhtml:body//xhtml:nav', NAMESPACES).map { |elem| parse_navigation elem }
      end

      # @param [Nokogiri::XML::Element] element nav element
      # @return [EPUB::ContentDocument::Navigation::Nav] nav Nav object
      def parse_navigation(element)
        nav = EPUB::ContentDocument::Navigation::Nav.new
        nav.heading = find_heading element

        # Look the attribute up by namespace URI so the result does not depend
        # on the prefix the document chose; fall back to the literal
        # "epub:type" lookup when no namespaced attribute is present.
        type_attr = element.attribute_with_ns('type', NAMESPACES['epub'])
        nav.type = type_attr ? type_attr.value : element['epub:type']

        nav
      end

      private

      # Finds the heading text of a nav element.
      #
      # Element names are matched with local-name() so the lookup works both
      # for namespaced XHTML and for documents without a namespace.
      #
      # @param [Nokogiri::XML::Element] element nav element
      # @return [String, nil] heading text; nil when the element has no
      #   heading child (or an hgroup without any heading inside)
      def find_heading(element)
        names = HEADING_NAMES + ['hgroup']
        predicate = names.map { |name| "local-name()='#{name}'" }.join(' or ')
        heading = element.xpath("./*[#{predicate}]").first

        return nil unless heading
        return heading.text unless heading.name == 'hgroup'

        # Inside an hgroup, take the highest-ranked heading that exists.
        HEADING_NAMES.each do |name|
          ranked = heading.xpath(".//*[local-name()='#{name}']").first
          return ranked.text if ranked
        end
        nil
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'epub/ocf'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module EPUB
  class Parser
    # Reads the OCF files of an extracted book and builds an EPUB::OCF
    # object from them.
    class OCF
      # Directory, relative to the book root, that holds the OCF files.
      DIRECTORY = 'META-INF'
      # Defines one <NAME>_FILE constant per OCF module, valued "<name>.xml".
      # CONTAINER_FILE below presumably comes from a "container" entry in
      # EPUB::OCF::MODULES -- confirm against that constant.
      EPUB::OCF::MODULES.each { |m| const_set "#{m.upcase}_FILE", "#{m}.xml" }

      class << self
        # Shortcut for new(root_directory).parse
        def parse(root_directory)
          new(root_directory).parse
        end
      end

      # @param root_directory [String] directory the book was extracted into
      def initialize(root_directory)
        @dir = root_directory
        @ocf = EPUB::OCF.new
      end

      # Calls parse_<module> for every entry of EPUB::OCF::MODULES and assigns
      # the result to the attribute of the same name.
      #
      # @return [EPUB::OCF] the populated object
      def parse
        EPUB::OCF::MODULES.each do |m|
          @ocf.send "#{m}=", send("parse_#{m}")
        end
        @ocf
      end

      # Reads the container file and collects its rootfile entries.
      #
      # @return [EPUB::OCF::Container]
      def parse_container
        container = EPUB::OCF::Container.new
        # Block form so the file handle is closed once parsing is done.
        doc = File.open(File.join(@dir, DIRECTORY, CONTAINER_FILE)) { |io| Nokogiri.XML(io) }

        doc.xpath('/xmlns:container/xmlns:rootfiles/xmlns:rootfile', doc.namespaces).each do |elem|
          rootfile = EPUB::OCF::Container::Rootfile.new
          %w[full-path media-type].each do |attr|
            rootfile.send("#{attr.tr('-', '_')}=", elem[attr])
          end
          # Appended once per rootfile element, after all attributes are set.
          container.rootfiles << rootfile
        end

        container
      end

      # Not implemented yet; returns nil.
      def parse_encryption
      end

      # Not implemented yet; returns nil.
      def parse_manifest
      end

      # Not implemented yet; returns nil.
      def parse_metadata
      end

      # Not implemented yet; returns nil.
      def parse_rights
      end

      # Not implemented yet; returns nil.
      def parse_signatures
      end
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'addressable/uri'
|
3
|
+
require 'epub/publication'
|
4
|
+
|
5
|
+
module EPUB
  class Parser
    # Reads a package document and builds an EPUB::Publication::Package
    # from its manifest and spine.
    class Publication
      class << self
        # Shortcut for new(file).parse
        def parse(file)
          new(file).parse
        end
      end

      # @param file [String] path to the package document
      def initialize(file)
        @package = EPUB::Publication::Package.new
        @rootfile = Addressable::URI.parse File.realpath(file)
        # Block form so the file handle is closed once parsing is done.
        @doc = File.open(file) { |io| Nokogiri.XML(io) }
      end

      # Parses the implemented sections (manifest and spine).
      #
      # @return [EPUB::Publication::Package]
      def parse
        # parse_metadata
        parse_manifest
        parse_spine
        # parse_guide
        # parse_bindings

        @package
      end

      # @raise [RuntimeError] always; not implemented yet
      def parse_metadata
        raise 'still not implemented'
      end

      # Builds the manifest: one Item per item element, with href resolved
      # against the package document's location and fallbacks linked to the
      # items they name.
      #
      # @return [EPUB::Publication::Package::Manifest]
      def parse_manifest
        manifest = @package.manifest = EPUB::Publication::Package::Manifest.new
        elem = @doc.xpath('/xmlns:package/xmlns:manifest', @doc.namespaces).first
        manifest.id = elem['id']

        # [item, fallback id] pairs. Kept per item, not keyed by fallback id,
        # so several items may name the same fallback without one overwriting
        # another.
        pending_fallbacks = []
        elem.xpath('./xmlns:item', @doc.namespaces).each do |elm|
          item = EPUB::Publication::Package::Manifest::Item.new
          %w[id media-type media-overlay].each do |attr|
            item.send "#{attr.tr('-', '_')}=", elm[attr]
          end
          # item.href = Addressable::URI.parse elm['href']
          item.href = @rootfile.join Addressable::URI.parse(elm['href'])
          pending_fallbacks << [item, elm['fallback']] if elm['fallback']
          item.properties = elm['properties'] ? elm['properties'].split(' ') : []
          manifest << item
        end
        # Resolved only after every item is registered, so a fallback may
        # refer to an item declared later in the manifest.
        pending_fallbacks.each do |from, fallback_id|
          from.fallback = manifest[fallback_id]
        end

        manifest
      end

      # Builds the spine: its own attributes plus one Itemref per itemref
      # element.
      #
      # @return [EPUB::Publication::Package::Spine]
      def parse_spine
        spine = @package.spine = EPUB::Publication::Package::Spine.new
        elem = @doc.xpath('/xmlns:package/xmlns:spine', @doc.namespaces).first
        %w[id toc page-progression-direction].each do |attr|
          spine.send("#{attr.tr('-', '_')}=", elem[attr])
        end

        elem.xpath('./xmlns:itemref', @doc.namespaces).each do |elm|
          itemref = EPUB::Publication::Package::Spine::Itemref.new
          %w[idref id].each do |attr|
            itemref.send "#{attr}=", elm[attr]
          end
          # Linear unless the attribute is explicitly "no".
          itemref.linear = elm['linear'] != 'no'
          itemref.properties = elm['properties'] ? elm['properties'].split(' ') : []
          spine << itemref
        end

        spine
      end

      # @raise [RuntimeError] always; not implemented yet
      def parse_guide
        raise 'still not implemented'
      end

      # @raise [RuntimeError] always; not implemented yet
      def parse_bindings
        raise 'still not implemented'
      end
    end
  end
end
|