epub-parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/.gemtest +0 -0
  2. data/.gitignore +6 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +12 -0
  5. data/epub-parser.gemspec +33 -0
  6. data/lib/epub/book.rb +37 -0
  7. data/lib/epub/content_document.rb +6 -0
  8. data/lib/epub/content_document/navigation.rb +80 -0
  9. data/lib/epub/ocf.rb +8 -0
  10. data/lib/epub/ocf/container.rb +22 -0
  11. data/lib/epub/ocf/encryption.rb +6 -0
  12. data/lib/epub/ocf/manifest.rb +6 -0
  13. data/lib/epub/ocf/metadata.rb +6 -0
  14. data/lib/epub/ocf/rights.rb +6 -0
  15. data/lib/epub/ocf/signatures.rb +6 -0
  16. data/lib/epub/parser.rb +46 -0
  17. data/lib/epub/parser/content_document.rb +58 -0
  18. data/lib/epub/parser/ocf.rb +59 -0
  19. data/lib/epub/parser/publication.rb +87 -0
  20. data/lib/epub/parser/version.rb +5 -0
  21. data/lib/epub/publication.rb +6 -0
  22. data/lib/epub/publication/package.rb +25 -0
  23. data/lib/epub/publication/package/bindings.rb +18 -0
  24. data/lib/epub/publication/package/guide.rb +11 -0
  25. data/lib/epub/publication/package/manifest.rb +44 -0
  26. data/lib/epub/publication/package/metadata.rb +17 -0
  27. data/lib/epub/publication/package/spine.rb +46 -0
  28. data/lib/epub/type.rb +7 -0
  29. data/schemas/epub-nav-30.rnc +10 -0
  30. data/schemas/epub-nav-30.sch +72 -0
  31. data/schemas/epub-xhtml-30.sch +377 -0
  32. data/schemas/ocf-container-30.rnc +16 -0
  33. data/test/fixtures/book.epub +0 -0
  34. data/test/fixtures/book/META-INF/container.xml +6 -0
  35. data/test/fixtures/book/OPS/nav.xhtml +26 -0
  36. data/test/fixtures/book/OPS//343/203/253/343/203/274/343/203/210/343/203/225/343/202/241/343/202/244/343/203/253.opf +28 -0
  37. data/test/fixtures/book/mimetype +1 -0
  38. data/test/helper.rb +9 -0
  39. data/test/publication/package/test_bindings.rb +15 -0
  40. data/test/publication/package/test_guide.rb +14 -0
  41. data/test/publication/package/test_manifest.rb +19 -0
  42. data/test/publication/package/test_metadata.rb +14 -0
  43. data/test/publication/package/test_spine.rb +15 -0
  44. data/test/test_parser.rb +52 -0
  45. data/test/test_parser_ocf.rb +22 -0
  46. data/test/test_parser_publication.rb +15 -0
  47. metadata +208 -0
data/.gemtest ADDED
File without changes
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ vendor/*
6
+ *~
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in epub-parser.gemspec
4
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+ require 'yard'
4
+
5
# Running `rake` without arguments runs the test task.
task default: :test

# Test task: picks up every test/**/test_*.rb file and sets the
# test runner options through the TESTOPTS environment variable.
Rake::TestTask.new do |test_task|
  ENV['TESTOPTS'] = '--no-show-detail-immediately --verbose'
  test_task.test_files = FileList['test/**/test_*.rb']
end

# Registers the YARD documentation task with its default settings.
YARD::Rake::YardocTask.new
@@ -0,0 +1,33 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "epub/parser/version"
4
+
5
# Gem specification for epub-parser.
Gem::Specification.new do |spec|
  spec.name        = "epub-parser"
  spec.version     = EPUB::Parser::VERSION
  spec.authors     = ["KITAITI Makoto"]
  spec.email       = ["KitaitiMakoto@gmail.com"]
  spec.homepage    = "https://gitorious.org/epub/parser"
  spec.summary     = %q{EPUB 3 Parser}
  spec.description = %q{Parse EPUB 3 book loosely}

  # spec.rubyforge_project = "epub-parser"

  # Package everything git tracks. The fixture with the non-ASCII name is
  # listed under its UTF-8 spelling, and the double-quoted, octal-escaped
  # spelling of the same path is removed (presumably the form `git ls-files`
  # prints for it -- confirm against the git configuration in use).
  tracked_files = `git ls-files`.split("\n")
  spec.files = tracked_files.push('test/fixtures/book/OPS/ルートファイル.opf')
  spec.files.delete('"test/fixtures/book/OPS/\343\203\253\343\203\274\343\203\210\343\203\225\343\202\241\343\202\244\343\203\253.opf"')
  spec.test_files    = `git ls-files -- {test,spec,features}/**/*.rb`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Development dependencies, in declaration order: name plus optional requirement.
  [
    ['rubygems-test'],
    ['rake'],
    ['pry'],
    ['test-unit', '~> 2'],
    ['simplecov'],
    ['thin'],
    ['yard']
  ].each do |dependency|
    spec.add_development_dependency(*dependency)
  end

  # Runtime dependencies.
  %w[enumerabler nokogiri addressable].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end
end
data/lib/epub/book.rb ADDED
@@ -0,0 +1,37 @@
1
+ require 'epub/ocf'
2
+ require 'epub/publication'
3
+ require 'epub/content_document'
4
+
5
+ module EPUB
6
+ class Book
7
+ attr_accessor :ocf, :package, :content_document
8
+
9
+ def each_page_by_spine(&blk)
10
+ enum = @package.spine.items
11
+ if block_given?
12
+ enum.each &blk
13
+ else
14
+ enum
15
+ end
16
+ end
17
+ def each_page_by_toc(&blk)
18
+ end
19
+
20
+ def each_content(&blk)
21
+ enum = @package.manifest.items
22
+ if block_given?
23
+ enum.each &blk
24
+ else
25
+ enum.to_enum
26
+ end
27
+ end
28
+
29
+ def other_navigation
30
+ end
31
+
32
+ # Syntax suger
33
+ def rootfile_path
34
+ ocf.container.rootfile.full_path
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,6 @@
1
+ require 'epub/content_document/navigation'
2
+
3
module EPUB
  # Namespace for content document classes; defines nothing itself here.
  module ContentDocument; end
end
@@ -0,0 +1,80 @@
1
module EPUB
  module ContentDocument
    # Holds the list of Nav objects of a navigation document and offers
    # lookups by nav type.
    class Navigation
      # List of Nav objects; also reachable as #navigations / #navigations=.
      attr_accessor :navs
      alias navigations navs
      alias navigations= navs=

      # @return [Nav, nil] first nav whose type equals Type::TOC
      def toc
        find_nav Type::TOC
      end

      # @return [Nav, nil] first nav whose type equals Type::PAGE_LIST
      def page_list
        find_nav Type::PAGE_LIST
      end

      # @return [Nav, nil] first nav whose type equals Type::LANDMARKS
      def landmarks
        find_nav Type::LANDMARKS
      end

      # Enumerator version of toc (not implemented yet, returns nil)
      # Usage: navigation.enum_for(:contents)
      def contents
      end

      # Enumerator version of page_list (not implemented yet, returns nil)
      # Usage: navigation.enum_for(:pages)
      def pages
      end

      # iterator for #toc (not implemented yet, returns nil)
      def each_content
      end

      # iterator for #page_list (not implemented yet, returns nil)
      def each_page
      end

      # iterator for #landmarks (not implemented yet, returns nil)
      def each_landmark
      end

      # One nav element of the navigation document.
      class Nav
        attr_accessor :heading, :ol,
                      :items, # children of ol, thus li
                      :type, # toc, page-list, landmarks or other
                      :hidden

        # #show and #hide methods are unnecessary
        # because this is for parser, not for builder nor manipulator.
        # Not implemented yet; always returns nil.
        def hidden?
        end

        class Ol
          # list-style :none
          attr_accessor :hidden

          # Not implemented yet; always returns nil.
          def hidden?
          end

          # may be followed by ol or be a leaf node
          class A
            attr_accessor :ol, # optional
                          :hidden

            # Not implemented yet; always returns nil.
            def hidden?
            end
          end

          # must be followed by ol, or must not be a leaf node
          class Span
            attr_accessor :ol, # required
                          :hidden

            # Not implemented yet; always returns nil.
            def hidden?
            end
          end
        end
      end

      private

      # Returns the first nav of the given type, or nil when none matches.
      # Uses core Enumerable#find so no extension of Array has to be loaded.
      def find_nav(type)
        navs.find { |nav| nav.type == type }
      end
    end
  end
end
data/lib/epub/ocf.rb ADDED
@@ -0,0 +1,8 @@
1
module EPUB
  # Container object with one read/write attribute per entry of MODULES.
  class OCF
    # Names of the OCF parts; each one has a file under epub/ocf/ that is
    # required here, and each one becomes an accessor on this class.
    MODULES = %w[container encryption manifest metadata rights signatures]
    MODULES.each do |module_name|
      require "epub/ocf/#{module_name}"
    end

    attr_accessor(*MODULES)
  end
end
@@ -0,0 +1,22 @@
1
module EPUB
  class OCF
    # Holds the rootfile entries collected for a container.
    class Container
      # File name of the container document.
      FILE = 'container.xml'

      # List of Rootfile objects; starts out empty.
      attr_accessor :rootfiles

      def initialize
        @rootfiles = []
      end

      # Syntax sugar: the first rootfile (nil when the list is empty).
      def rootfile
        rootfiles.first
      end

      # A single rootfile entry: its full path and media type.
      class Rootfile
        attr_accessor :full_path, :media_type
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Empty placeholder class; defines no attributes or behavior yet.
    class Encryption; end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Empty placeholder class; defines no attributes or behavior yet.
    class Manifest; end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Empty placeholder class; defines no attributes or behavior yet.
    class Metadata; end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Empty placeholder class; defines no attributes or behavior yet.
    class Rights; end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Empty placeholder class; defines no attributes or behavior yet.
    class Signatures; end
  end
end
@@ -0,0 +1,46 @@
1
+ require 'epub/book'
2
+ require 'epub/parser/version'
3
+ require 'epub/parser/ocf'
4
+ require 'epub/parser/publication'
5
+ require 'epub/parser/content_document'
6
+ require 'nokogiri'
7
+
8
require 'shellwords'

module EPUB
  # Unpacks an EPUB archive into a directory with an external unzip command
  # and parses the unpacked files into an EPUB::Book.
  class Parser
    # Validates the arguments, creates the target directory when needed and
    # extracts the archive into it.
    #
    # @param filepath [String] path of the EPUB file; must be readable
    # @param root_directory [String] directory to extract into; created when
    #   missing, must not be an existing regular file
    # @param options [Hash] 'unzip-command' => command line used instead of
    #   plain `unzip` (may carry its own arguments)
    # @raise [RuntimeError] when filepath is not readable or root_directory
    #   is an existing regular file
    def initialize(filepath, root_directory, options = {})
      # Double quotes so the offending path really appears in the message.
      raise "File #{filepath} not readable" unless File.readable_real? filepath
      raise "File #{root_directory} already exists" if File.file? root_directory

      @filepath = File.realpath filepath
      Dir.mkdir(root_directory) unless File.directory? root_directory
      @dir = File.realpath root_directory

      @book = Book.new

      unzip_cmd = options['unzip-command'] || 'unzip'
      # Build a new string rather than appending to the caller's option
      # value, and escape both paths so spaces or shell metacharacters in
      # them cannot break (or alter) the command line.
      command = "#{unzip_cmd} #{Shellwords.escape(@filepath)} -d #{Shellwords.escape(@dir)}"
      system command
    end

    # Parses the OCF data, then the package document, then the content
    # document, storing each result on the book.
    #
    # @return [EPUB::Book] the populated book
    def parse
      @book.ocf = parse_ocf
      @book.package = parse_publication
      @book.content_document = parse_content_document
      # ...

      @book
    end

    # Parses the OCF files found under the extraction directory.
    def parse_ocf
      OCF.parse @dir
    end

    # Parses the package document named by the book's rootfile path,
    # resolved against the extraction directory. Requires #parse_ocf to have
    # populated the book first.
    def parse_publication
      Publication.parse File.join(@dir, @book.rootfile_path)
    end

    # Not implemented yet; returns nil.
    def parse_content_document
      # ContentDocument.parse @dir
    end
  end
end
@@ -0,0 +1,58 @@
1
+ require 'epub/content_document'
2
+ require 'epub/parser'
3
+ require 'nokogiri'
4
+
5
module EPUB
  class Parser
    # Parser for navigation content documents. Only the navigation helpers
    # are implemented; #parse itself still raises.
    class ContentDocument
      # Namespace prefixes used by the XPath queries and attribute lookups
      # in this class.
      NAMESPACES = {
        'xhtml' => 'http://www.w3.org/1999/xhtml',
        'epub'  => 'http://www.idpf.org/2007/ops'
      }.freeze

      # Heading element names, highest rank first.
      HEADING_NAMES = %w[h1 h2 h3 h4 h5 h6].freeze

      class << self
        # Shortcut for new(root_directory).parse
        def parse(root_directory)
          new(root_directory).parse
        end
      end

      # @param [String] root_directory directory the book was extracted into
      def initialize(root_directory)
        @dir = root_directory
      end

      # @raise [RuntimeError] always; parsing is not implemented yet
      def parse
        raise 'Not implemented yet'
      end

      # @param [Nokogiri::HTML::Document] document HTML document or element including nav
      # @return [Array<EPUB::ContentDocument::Navigation::Nav>] navs array of Nav object
      def parse_navigations(document)
        document.xpath('/xhtml:html/xhtml:body//xhtml:nav', NAMESPACES).map { |elem| parse_navigation elem }
      end

      # @param [Nokogiri::XML::Element] element nav element
      # @return [EPUB::ContentDocument::Navigation::Nav] nav Nav object
      def parse_navigation(element)
        nav = EPUB::ContentDocument::Navigation::Nav.new
        nav.heading = find_heading element

        # Look the type attribute up by namespace URI so it is found whatever
        # prefix the document binds to it; fall back to the literal
        # "epub:type" attribute name otherwise.
        type_attr = element.attribute_with_ns('type', NAMESPACES['epub'])
        nav.type = type_attr ? type_attr.value : element['epub:type']

        nav
      end

      private

      # Finds the heading text of a nav element.
      #
      # Element names are matched with local-name() so that the lookup works
      # whether or not the elements live in the XHTML namespace.
      #
      # @param [Nokogiri::XML::Element] element nav element
      # @return [String, nil] heading text, or nil when the nav has no heading
      def find_heading(element)
        candidates = (HEADING_NAMES + ['hgroup']).map { |name| %(local-name()="#{name}") }
        heading = element.xpath("./*[#{candidates.join(' or ')}]").first
        return nil unless heading
        return heading.text unless heading.name == 'hgroup'

        # Inside an hgroup take the highest-ranked heading that is present.
        # (An empty node set is truthy, so chaining lookups with || never
        # reaches the lower ranks.)
        HEADING_NAMES.each do |name|
          ranked = heading.xpath(%(.//*[local-name()="#{name}"])).first
          return ranked.text if ranked
        end
        nil
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ require 'epub/ocf'
2
+ require 'nokogiri'
3
+
4
module EPUB
  class Parser
    # Reads the files of the META-INF directory of an extracted book into an
    # EPUB::OCF object. Only container.xml is actually parsed so far.
    class OCF
      DIRECTORY = 'META-INF'
      # Defines CONTAINER_FILE = "container.xml", ENCRYPTION_FILE = ... etc.,
      # one constant per entry of EPUB::OCF::MODULES.
      EPUB::OCF::MODULES.each {|m| self.const_set "#{m.upcase}_FILE", "#{m}.xml"}

      class << self
        # Shortcut for new(root_directory).parse
        def parse(root_directory)
          new(root_directory).parse
        end
      end

      # @param root_directory [String] directory the book was extracted into
      def initialize(root_directory)
        @dir = root_directory
        @ocf = EPUB::OCF.new
      end

      # Calls parse_<module> for every entry of EPUB::OCF::MODULES and
      # assigns the result to the attribute of the same name.
      #
      # @return [EPUB::OCF]
      def parse
        EPUB::OCF::MODULES.each do |m|
          @ocf.send "#{m}=", send("parse_#{m}")
        end
        @ocf
      end

      # Parses META-INF/container.xml and collects one Rootfile per
      # rootfile element.
      #
      # @return [EPUB::OCF::Container]
      def parse_container
        container = EPUB::OCF::Container.new
        path = File.join(@dir, DIRECTORY, CONTAINER_FILE)
        # Block form so the file handle is closed once the document is read.
        doc = File.open(path) { |file| Nokogiri.XML(file) }

        doc.xpath('/xmlns:container/xmlns:rootfiles/xmlns:rootfile', doc.namespaces).each do |elem|
          rootfile = EPUB::OCF::Container::Rootfile.new
          %w[full-path media-type].each do |attr|
            rootfile.send(attr.gsub(/-/, '_') + '=', elem[attr])
          end
          # Append once per rootfile element, after all attributes are set
          # (appending inside the attribute loop listed each rootfile twice).
          container.rootfiles << rootfile
        end

        container
      end

      # Not implemented yet; returns nil.
      def parse_encryption
      end

      # Not implemented yet; returns nil.
      def parse_manifest
      end

      # Not implemented yet; returns nil.
      def parse_metadata
      end

      # Not implemented yet; returns nil.
      def parse_rights
      end

      # Not implemented yet; returns nil.
      def parse_signatures
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
+ require 'nokogiri'
2
+ require 'addressable/uri'
3
+ require 'epub/publication'
4
+
5
module EPUB
  class Parser
    # Parses a package document into an EPUB::Publication::Package.
    # Only the manifest and the spine are handled so far.
    class Publication
      class << self
        # Shortcut for new(file).parse
        def parse(file)
          new(file).parse
        end
      end

      # @param file [String] path of the package document
      def initialize(file)
        @package = EPUB::Publication::Package.new
        # Item hrefs are resolved against the real path of the package document.
        @rootfile = Addressable::URI.parse File.realpath(file)
        # Block form so the file handle is closed once the document is read.
        @doc = File.open(file) { |io| Nokogiri.XML(io) }
      end

      # @return [EPUB::Publication::Package] package with manifest and spine set
      def parse
        # parse_metadata
        parse_manifest
        parse_spine
        # parse_guide
        # parse_bindings

        @package
      end

      def parse_metadata
        raise 'still not implemented'
      end

      # Builds the manifest from the manifest element and its item children,
      # then links each item to its fallback item.
      #
      # @return [EPUB::Publication::Package::Manifest]
      def parse_manifest
        manifest = @package.manifest = EPUB::Publication::Package::Manifest.new
        elem = @doc.xpath('/xmlns:package/xmlns:manifest', @doc.namespaces).first
        manifest.id = elem['id']

        # [item, fallback id] pairs. Kept as a list rather than a hash keyed
        # by fallback id, so several items may name the same fallback.
        fallbacks = []
        elem.xpath('./xmlns:item', @doc.namespaces).each do |elm|
          item = EPUB::Publication::Package::Manifest::Item.new
          %w[ id media-type media-overlay ].each do |attr|
            item.send "#{attr.gsub(/-/, '_')}=", elm[attr]
          end
          # item.href = Addressable::URI.parse elm['href']
          item.href = @rootfile.join Addressable::URI.parse(elm['href'])
          fallbacks << [item, elm['fallback']] if elm['fallback']
          item.properties = elm['properties'] ? elm['properties'].split(' ') : []
          manifest << item
        end
        # Resolve fallbacks only after every item is in the manifest, since a
        # fallback may refer to an item declared later in the document.
        fallbacks.each do |item, fallback_id|
          item.fallback = manifest[fallback_id]
        end

        manifest
      end

      # Builds the spine from the spine element and its itemref children.
      #
      # @return [EPUB::Publication::Package::Spine]
      def parse_spine
        spine = @package.spine = EPUB::Publication::Package::Spine.new
        elem = @doc.xpath('/xmlns:package/xmlns:spine', @doc.namespaces).first
        %w[ id toc page-progression-direction ].each do |attr|
          spine.send("#{attr.gsub(/-/, '_')}=", elem[attr])
        end

        elem.xpath('./xmlns:itemref', @doc.namespaces).each do |elm|
          itemref = EPUB::Publication::Package::Spine::Itemref.new
          %w[ idref id ].each do |attr|
            itemref.send "#{attr}=", elm[attr]
          end
          # linear unless the attribute is explicitly "no"
          itemref.linear = elm['linear'] != 'no'
          itemref.properties = elm['properties'] ? elm['properties'].split(' ') : []
          spine << itemref
        end

        spine
      end

      def parse_guide
        raise 'still not implemented'
      end

      def parse_bindings
        raise 'still not implemented'
      end
    end
  end
end