epub-parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/.gemtest +0 -0
  2. data/.gitignore +6 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +12 -0
  5. data/epub-parser.gemspec +33 -0
  6. data/lib/epub/book.rb +37 -0
  7. data/lib/epub/content_document.rb +6 -0
  8. data/lib/epub/content_document/navigation.rb +80 -0
  9. data/lib/epub/ocf.rb +8 -0
  10. data/lib/epub/ocf/container.rb +22 -0
  11. data/lib/epub/ocf/encryption.rb +6 -0
  12. data/lib/epub/ocf/manifest.rb +6 -0
  13. data/lib/epub/ocf/metadata.rb +6 -0
  14. data/lib/epub/ocf/rights.rb +6 -0
  15. data/lib/epub/ocf/signatures.rb +6 -0
  16. data/lib/epub/parser.rb +46 -0
  17. data/lib/epub/parser/content_document.rb +58 -0
  18. data/lib/epub/parser/ocf.rb +59 -0
  19. data/lib/epub/parser/publication.rb +87 -0
  20. data/lib/epub/parser/version.rb +5 -0
  21. data/lib/epub/publication.rb +6 -0
  22. data/lib/epub/publication/package.rb +25 -0
  23. data/lib/epub/publication/package/bindings.rb +18 -0
  24. data/lib/epub/publication/package/guide.rb +11 -0
  25. data/lib/epub/publication/package/manifest.rb +44 -0
  26. data/lib/epub/publication/package/metadata.rb +17 -0
  27. data/lib/epub/publication/package/spine.rb +46 -0
  28. data/lib/epub/type.rb +7 -0
  29. data/schemas/epub-nav-30.rnc +10 -0
  30. data/schemas/epub-nav-30.sch +72 -0
  31. data/schemas/epub-xhtml-30.sch +377 -0
  32. data/schemas/ocf-container-30.rnc +16 -0
  33. data/test/fixtures/book.epub +0 -0
  34. data/test/fixtures/book/META-INF/container.xml +6 -0
  35. data/test/fixtures/book/OPS/nav.xhtml +26 -0
  36. data/test/fixtures/book/OPS//343/203/253/343/203/274/343/203/210/343/203/225/343/202/241/343/202/244/343/203/253.opf +28 -0
  37. data/test/fixtures/book/mimetype +1 -0
  38. data/test/helper.rb +9 -0
  39. data/test/publication/package/test_bindings.rb +15 -0
  40. data/test/publication/package/test_guide.rb +14 -0
  41. data/test/publication/package/test_manifest.rb +19 -0
  42. data/test/publication/package/test_metadata.rb +14 -0
  43. data/test/publication/package/test_spine.rb +15 -0
  44. data/test/test_parser.rb +52 -0
  45. data/test/test_parser_ocf.rb +22 -0
  46. data/test/test_parser_publication.rb +15 -0
  47. metadata +208 -0
data/.gemtest ADDED
File without changes
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ vendor/*
6
+ *~
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in epub-parser.gemspec
4
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+ require 'yard'
4
+
5
# Running bare `rake` runs the test suite.
task default: :test

# Test task covering every test/**/test_*.rb file.
Rake::TestTask.new do |test_task|
  # Set through the environment rather than on the task object, so the
  # options are visible to whatever reads TESTOPTS in this process.
  ENV['TESTOPTS'] = '--no-show-detail-immediately --verbose'
  test_task.test_files = FileList['test/**/test_*.rb']
end

# API documentation task provided by YARD.
YARD::Rake::YardocTask.new
@@ -0,0 +1,33 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "epub/parser/version"
4
+
5
# Gem specification for epub-parser.
Gem::Specification.new do |spec|
  spec.name        = 'epub-parser'
  spec.version     = EPUB::Parser::VERSION
  spec.authors     = ['KITAITI Makoto']
  spec.email       = ['KitaitiMakoto@gmail.com']
  spec.homepage    = 'https://gitorious.org/epub/parser'
  spec.summary     = 'EPUB 3 Parser'
  spec.description = 'Parse EPUB 3 book loosely'

  # spec.rubyforge_project = "epub-parser"

  # The fixture with a non-ASCII file name is listed explicitly under its real
  # name, and the quoted, octal-escaped spelling of the same path (presumably
  # what `git ls-files` prints for it) is removed from the list.
  tracked_files = `git ls-files`.split("\n")
  tracked_files.push('test/fixtures/book/OPS/ルートファイル.opf')
  spec.files = tracked_files
  spec.files.delete('"test/fixtures/book/OPS/\343\203\253\343\203\274\343\203\210\343\203\225\343\202\241\343\202\244\343\203\253.opf"')

  spec.test_files    = `git ls-files -- {test,spec,features}/**/*.rb`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Development-only dependencies, each given as [name, *version requirements].
  [
    ['rubygems-test'],
    ['rake'],
    ['pry'],
    ['test-unit', '~> 2'],
    ['simplecov'],
    ['thin'],
    ['yard']
  ].each { |dependency| spec.add_development_dependency(*dependency) }

  # Dependencies needed at run time.
  %w[enumerabler nokogiri addressable].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end
end
data/lib/epub/book.rb ADDED
@@ -0,0 +1,37 @@
1
+ require 'epub/ocf'
2
+ require 'epub/publication'
3
+ require 'epub/content_document'
4
+
5
module EPUB
  # A parsed EPUB book: the OCF data, the publication package and the content
  # document, each stored in a plain accessor.
  class Book
    attr_accessor :ocf, :package, :content_document

    # Iterates over the items of the package's spine.
    #
    # With a block, yields each spine item and returns the result of +each+.
    # Without a block, returns the spine's items collection itself.
    def each_page_by_spine(&blk)
      spine_items = @package.spine.items
      return spine_items unless block_given?

      spine_items.each(&blk)
    end

    # Not implemented yet; always returns nil.
    def each_page_by_toc(&blk)
    end

    # Iterates over the items of the package's manifest.
    #
    # With a block, yields each manifest item and returns the result of +each+.
    # Without a block, returns +to_enum+ of the manifest's items.
    def each_content(&blk)
      manifest_items = @package.manifest.items
      return manifest_items.to_enum unless block_given?

      manifest_items.each(&blk)
    end

    # Not implemented yet; always returns nil.
    def other_navigation
    end

    # Syntax sugar for the full path of the container's first rootfile.
    def rootfile_path
      first_rootfile = ocf.container.rootfile
      first_rootfile.full_path
    end
  end
end
@@ -0,0 +1,6 @@
1
+ require 'epub/content_document/navigation'
2
+
3
# Top-level namespace of the library.
module EPUB
  # Namespace for content document models. It defines nothing itself; the
  # Navigation model is loaded by the require at the top of this file.
  module ContentDocument
  end
end
@@ -0,0 +1,80 @@
1
module EPUB
  module ContentDocument
    # Model of a navigation document: a collection of Nav objects with
    # shortcuts to the ones typed as toc, page-list and landmarks.
    class Navigation
      # Collection of Nav objects; also reachable as #navigations.
      attr_accessor :navs
      alias navigations navs
      alias navigations= navs=

      # @return [Nav, nil] the first nav whose type equals Type::TOC
      def toc
        nav_of_type Type::TOC
      end

      # @return [Nav, nil] the first nav whose type equals Type::PAGE_LIST
      def page_list
        nav_of_type Type::PAGE_LIST
      end

      # @return [Nav, nil] the first nav whose type equals Type::LANDMARKS
      def landmarks
        nav_of_type Type::LANDMARKS
      end

      # Enumerator version of toc
      # Usage: navigation.enum_for(:contents)
      # Not implemented yet; returns nil.
      def contents
      end

      # Enumerator version of page_list
      # Usage: navigation.enum_for(:pages)
      # Not implemented yet; returns nil.
      def pages
      end

      # iterator for #toc
      # Not implemented yet; returns nil.
      def each_content
      end

      # iterator for #page_list
      # Not implemented yet; returns nil.
      def each_page
      end

      # iterator for #landmarks
      # Not implemented yet; returns nil.
      def each_landmark
      end

      # A single nav element.
      class Nav
        attr_accessor :heading, :ol,
                      :items, # children of ol, thus li
                      :type, # toc, page-list, landmarks or other
                      :hidden

        # #show and #hide methods are unnecessary
        # because this is for parser, not for builder nor manipulator

        # @return [Boolean] true when +hidden+ holds a truthy value
        def hidden?
          !!hidden
        end

        # An ol element inside a nav.
        class Ol
          # list-style :none
          attr_accessor :hidden

          # @return [Boolean] true when +hidden+ holds a truthy value
          def hidden?
            !!hidden
          end

          # may be followed by ol or be a leaf node
          class A
            attr_accessor :ol, # optional
                          :hidden

            # @return [Boolean] true when +hidden+ holds a truthy value
            def hidden?
              !!hidden
            end
          end

          # must be followed by ol, or must not be a leaf node
          class Span
            attr_accessor :ol, # required
                          :hidden

            # @return [Boolean] true when +hidden+ holds a truthy value
            def hidden?
              !!hidden
            end
          end
        end
      end

      private

      # Finds the first nav of the given type using the standard library only,
      # so this file does not depend on an extension gem being loaded.
      #
      # @param type the value compared against each nav's #type
      # @return [Nav, nil]
      def nav_of_type(type)
        navs.find { |nav| nav.type == type }
      end
    end
  end
end
data/lib/epub/ocf.rb ADDED
@@ -0,0 +1,8 @@
1
module EPUB
  # Model of a book's OCF data, with one accessor per entry of MODULES.
  class OCF
    # Names of the OCF parts. Each name has a model file under epub/ocf/
    # (required below) and a same-named accessor on this class. Frozen so the
    # list cannot drift away from the accessors defined from it.
    MODULES = %w[container encryption manifest metadata rights signatures].freeze
    MODULES.each { |name| require "epub/ocf/#{name}" }

    # Parenthesised so the splat is not parsed ambiguously.
    attr_accessor(*MODULES)
  end
end
@@ -0,0 +1,22 @@
1
module EPUB
  class OCF
    # Model of the container document: a list of rootfiles.
    class Container
      # File name of the container document.
      FILE = 'container.xml'

      # Rootfile objects, in the order they were added.
      attr_accessor :rootfiles

      # Starts with an empty rootfile list.
      def initialize
        @rootfiles = []
      end

      # Syntax sugar for the first rootfile.
      #
      # @return [Rootfile, nil] nil when no rootfile has been added
      def rootfile
        rootfiles.first
      end

      # One rootfile entry: its full path and media type.
      class Rootfile
        attr_accessor :full_path, :media_type
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder model for the OCF "encryption" part.
    # It has no attributes or behaviour yet.
    class Encryption
    end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder model for the OCF "manifest" part.
    # It has no attributes or behaviour yet.
    class Manifest
    end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder model for the OCF "metadata" part.
    # It has no attributes or behaviour yet.
    class Metadata
    end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder model for the OCF "rights" part.
    # It has no attributes or behaviour yet.
    class Rights
    end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder model for the OCF "signatures" part.
    # It has no attributes or behaviour yet.
    class Signatures
    end
  end
end
@@ -0,0 +1,46 @@
1
+ require 'epub/book'
2
+ require 'epub/parser/version'
3
+ require 'epub/parser/ocf'
4
+ require 'epub/parser/publication'
5
+ require 'epub/parser/content_document'
6
+ require 'nokogiri'
7
+
8
require 'shellwords'

module EPUB
  # Unpacks an EPUB archive into a directory and builds an EPUB::Book from
  # the extracted files.
  class Parser
    # Validates the arguments, creates the extraction directory if needed and
    # extracts the archive into it.
    #
    # @param filepath [String] path of the EPUB archive
    # @param root_directory [String] directory to extract into; created when missing
    # @param options [Hash] 'unzip-command' => command (optionally with
    #   arguments) used instead of +unzip+
    # @raise [RuntimeError] when +filepath+ is not readable, when
    #   +root_directory+ is an existing regular file, or when the unzip
    #   command cannot be executed
    def initialize(filepath, root_directory, options = {})
      # Double quotes are required here: the paths must be interpolated.
      raise "File #{filepath} not readable" unless File.readable_real? filepath
      raise "File #{root_directory} already exists" if File.file? root_directory

      @filepath = File.realpath filepath
      Dir.mkdir(root_directory) unless File.directory? root_directory
      @dir = File.realpath root_directory

      @book = Book.new

      extract_archive options['unzip-command']
    end

    # Parses every supported part of the book.
    #
    # @return [EPUB::Book] the populated book
    def parse
      @book.ocf = parse_ocf
      @book.package = parse_publication
      @book.content_document = parse_content_document
      # ...

      @book
    end

    # Parses the OCF data found in the extraction directory.
    def parse_ocf
      OCF.parse @dir
    end

    # Parses the package document named by the book's rootfile path.
    # Relies on @book.ocf having been set (see #parse).
    def parse_publication
      Publication.parse File.join(@dir, @book.rootfile_path)
    end

    # Content document parsing is not wired in yet; returns nil.
    def parse_content_document
      # ContentDocument.parse @dir
    end

    private

    # Runs the unzip command on the archive.
    #
    # The command is split into words and executed without a shell, so paths
    # containing spaces or shell metacharacters are passed through verbatim
    # and the caller's option string is never modified.
    #
    # @param custom_command [String, nil] replacement for +unzip+
    # @raise [RuntimeError] when the command cannot be executed at all
    def extract_archive(custom_command)
      command = Shellwords.split(custom_command || 'unzip')
      command = ['unzip'] if command.empty?

      # Kernel#system returns nil only when the command could not be run;
      # a non-zero exit status (false) is tolerated as before.
      executed = system(*command, @filepath, '-d', @dir)
      raise "Cannot execute #{command.first}" if executed.nil?
    end
  end
end
@@ -0,0 +1,58 @@
1
+ require 'epub/content_document'
2
+ require 'epub/parser'
3
+ require 'nokogiri'
4
+
5
module EPUB
  class Parser
    # Parser for content documents. Only navigation (nav element) parsing is
    # implemented; #parse itself still raises.
    class ContentDocument
      # Heading element names, highest rank first.
      HEADING_NAMES = %w[h1 h2 h3 h4 h5 h6].freeze

      class << self
        # Shortcut for new(root_directory).parse
        def parse(root_directory)
          new(root_directory).parse
        end
      end

      # @param [String] root_directory directory the book was extracted into
      def initialize(root_directory)
        @dir = root_directory
      end

      # @raise [RuntimeError] always; not implemented yet
      def parse
        raise 'Not implemented yet'
      end

      # NOTE(review): Parser::NAMESPACES is not defined in the parser source
      # visible here - confirm it is defined before this method is used.
      #
      # @param [Nokogiri::HTML::Document] document HTML document or element including nav
      # @return [Array<EPUB::ContentDocument::Navigation::Nav>] navs array of Nav object
      def parse_navigations(document)
        nav_elements = document.search('/xhtml:html/xhtml:body//xhtml:nav', Parser::NAMESPACES)
        nav_elements.collect { |elem| parse_navigation elem }
      end

      # Builds a Nav from a nav element: its heading text and its epub:type.
      #
      # @param [Nokogiri::XML::Element] element nav element
      # @return [EPUB::ContentDocument::Navigation::Nav] nav Nav object
      def parse_navigation(element)
        nav = EPUB::ContentDocument::Navigation::Nav.new
        nav.heading = find_heading element

        # to find type, need to use strict xpath for handling namespaces?
        # And if so, where should the namespaces be defined?
        nav.type = element['epub:type']

        nav
      end

      private

      # Finds the heading text of a nav element.
      #
      # The first child that is a heading (h1-h6) or an hgroup is used. For
      # an hgroup, the text of its highest-ranked heading is returned.
      # Elements are matched by local name, so the lookup does not depend on
      # how the document declares its namespace.
      #
      # @param [Nokogiri::XML::Element] element nav element
      # @return [String, nil] heading heading text, nil when there is none
      def find_heading(element)
        heading = element.elements.find do |child|
          child.name == 'hgroup' || HEADING_NAMES.include?(child.name)
        end
        return nil unless heading
        return heading.text unless heading.name == 'hgroup'

        # A node set is truthy even when empty, so each rank is checked
        # explicitly instead of chaining lookups with ||.
        HEADING_NAMES.each do |name|
          ranked = heading.xpath(".//*[local-name()='#{name}']").first
          return ranked.text if ranked
        end
        nil
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ require 'epub/ocf'
2
+ require 'nokogiri'
3
+
4
module EPUB
  class Parser
    # Parses the OCF files of an extracted book into an EPUB::OCF object.
    class OCF
      # Directory, relative to the book root, holding the OCF files.
      DIRECTORY = 'META-INF'
      # Defines CONTAINER_FILE = 'container.xml', ENCRYPTION_FILE = ... and so
      # on, one constant per entry of EPUB::OCF::MODULES.
      EPUB::OCF::MODULES.each { |name| const_set "#{name.upcase}_FILE", "#{name}.xml" }

      class << self
        # Shortcut for new(root_directory).parse
        def parse(root_directory)
          new(root_directory).parse
        end
      end

      # @param root_directory [String] directory the book was extracted into
      def initialize(root_directory)
        @dir = root_directory
        @ocf = EPUB::OCF.new
      end

      # Runs parse_<module> for every entry of EPUB::OCF::MODULES and stores
      # each result in the matching accessor.
      #
      # @return [EPUB::OCF]
      def parse
        EPUB::OCF::MODULES.each do |m|
          @ocf.send "#{m}=", send("parse_#{m}")
        end
        @ocf
      end

      # Reads META-INF/container.xml and collects its rootfile elements.
      #
      # @return [EPUB::OCF::Container] container with one Rootfile per
      #   rootfile element, in document order
      def parse_container
        container = EPUB::OCF::Container.new
        container_path = File.join(@dir, DIRECTORY, CONTAINER_FILE)
        # Block form closes the file handle as soon as parsing is done.
        doc = File.open(container_path) { |file| Nokogiri.XML(file) }

        doc.xpath('/xmlns:container/xmlns:rootfiles/xmlns:rootfile', doc.namespaces).each do |elem|
          rootfile = EPUB::OCF::Container::Rootfile.new
          %w[full-path media-type].each do |attr|
            rootfile.send("#{attr.tr('-', '_')}=", elem[attr])
          end
          # Appended once per rootfile element, after all attributes are set.
          container.rootfiles << rootfile
        end

        container
      end

      # Not implemented yet; returns nil.
      def parse_encryption
      end

      # Not implemented yet; returns nil.
      def parse_manifest
      end

      # Not implemented yet; returns nil.
      def parse_metadata
      end

      # Not implemented yet; returns nil.
      def parse_rights
      end

      # Not implemented yet; returns nil.
      def parse_signatures
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
+ require 'nokogiri'
2
+ require 'addressable/uri'
3
+ require 'epub/publication'
4
+
5
module EPUB
  class Parser
    # Parses a package document into an EPUB::Publication::Package.
    # Only the manifest and the spine are parsed so far.
    class Publication
      class << self
        # Shortcut for new(file).parse
        def parse(file)
          new(file).parse
        end
      end

      # @param file [String] path of the package document
      def initialize(file)
        @package = EPUB::Publication::Package.new
        # Real path of the package document; item hrefs are resolved against it.
        @rootfile = Addressable::URI.parse File.realpath(file)
        # Block form closes the file handle as soon as parsing is done.
        @doc = File.open(file) { |io| Nokogiri.XML(io) }
      end

      # Parses the implemented parts (manifest, then spine).
      #
      # @return [EPUB::Publication::Package]
      def parse
        # parse_metadata
        parse_manifest
        parse_spine
        # parse_guide
        # parse_bindings

        @package
      end

      # @raise [RuntimeError] always; not implemented yet
      def parse_metadata
        raise 'still not implemented'
      end

      # Builds the manifest from the manifest element and stores it on the
      # package. Fallback references are resolved after all items are read,
      # because an item may refer to one that appears later.
      #
      # @return [EPUB::Publication::Package::Manifest]
      def parse_manifest
        manifest = @package.manifest = EPUB::Publication::Package::Manifest.new
        elem = @doc.xpath('/xmlns:package/xmlns:manifest', @doc.namespaces).first
        manifest.id = elem['id']

        # One [item, fallback id] pair per item that declares a fallback, so
        # several items may share the same fallback target.
        pending_fallbacks = []
        elem.xpath('./xmlns:item', @doc.namespaces).each do |elm|
          item = EPUB::Publication::Package::Manifest::Item.new
          %w[ id media-type media-overlay ].each do |attr|
            item.send "#{attr.tr('-', '_')}=", elm[attr]
          end
          # item.href = Addressable::URI.parse elm['href']
          item.href = @rootfile.join Addressable::URI.parse(elm['href'])
          pending_fallbacks << [item, elm['fallback']] if elm['fallback']
          item.properties = elm['properties'] ? elm['properties'].split(' ') : []
          manifest << item
        end
        pending_fallbacks.each do |item, fallback_id|
          item.fallback = manifest[fallback_id]
        end

        manifest
      end

      # Builds the spine from the spine element and stores it on the package.
      #
      # @return [EPUB::Publication::Package::Spine]
      def parse_spine
        spine = @package.spine = EPUB::Publication::Package::Spine.new
        elem = @doc.xpath('/xmlns:package/xmlns:spine', @doc.namespaces).first
        %w[ id toc page-progression-direction ].each do |attr|
          spine.send("#{attr.tr('-', '_')}=", elem[attr])
        end

        elem.xpath('./xmlns:itemref', @doc.namespaces).each do |elm|
          itemref = EPUB::Publication::Package::Spine::Itemref.new
          %w[ idref id ].each do |attr|
            itemref.send "#{attr}=", elm[attr]
          end
          # linear unless the attribute is exactly "no" (a missing attribute counts as linear)
          itemref.linear = elm['linear'] != 'no'
          itemref.properties = elm['properties'] ? elm['properties'].split(' ') : []
          spine << itemref
        end

        spine
      end

      # @raise [RuntimeError] always; not implemented yet
      def parse_guide
        raise 'still not implemented'
      end

      # @raise [RuntimeError] always; not implemented yet
      def parse_bindings
        raise 'still not implemented'
      end
    end
  end
end