epub-parser 0.3.6 → 0.3.7

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -41,8 +41,10 @@ namespace :doc do
41
41
  YARD::Rake::YardocTask.new
42
42
  Rake::RDocTask.new do |rdoc|
43
43
  rdoc.rdoc_files = FileList['lib/**/*.rb']
44
- rdoc.rdoc_files.include 'README.markdown'
44
+ rdoc.rdoc_files.include 'README.adoc'
45
+ rdoc.rdoc_files.include 'CHANGELOG.adoc'
45
46
  rdoc.rdoc_files.include 'MIT-LICENSE'
47
+ rdoc.rdoc_files.include 'docs/**/*.adoc'
46
48
  rdoc.rdoc_files.include 'docs/**/*.md'
47
49
  end
48
50
  end
@@ -0,0 +1,51 @@
1
+ require "optparse"
2
+ require "uri"
3
+ require "epub/parser"
4
+
5
# Entry point of the `epub-cover` command-line tool.
#
# Parses the command line, decides which physical-container adapter to use
# for EPUBFILE (a regular file keeps the default adapter, a directory uses
# :UnpackedDirectory, anything else is treated as a URI and uses
# :UnpackedURI), extracts the cover image and writes it either to the
# current directory or to the directory given by `--output`.
#
# @param argv [Array<String>] command-line arguments; the first argument
#   left after option parsing is taken (shifted off) as EPUBFILE
# @return [void]
def main(argv)
  option_parser = OptionParser.new {|opt|
    opt.banner = <<EOB
Extract cover image.
Image is put to current directory with the same name in EPUB.
It is put to specified directory when `--output' option is given.

Usage: #{opt.program_name} [options] EPUBFILE

EOB
    opt.separator "Options:"
    opt.on "-o", "--output=DIR", "Directory to put image file"
  }
  options = option_parser.getopts(argv)
  path = argv.shift
  error "EPUBFILE not given", option_parser unless path
  unless File.file? path
    if File.directory? path
      EPUB::OCF::PhysicalContainer.adapter = :UnpackedDirectory
    else
      # Only a malformed URI is an expected failure here; do not hide
      # unrelated errors behind a blanket inline rescue.
      path = begin
        URI.parse(path)
      rescue URI::InvalidURIError
        nil
      end
      if path
        EPUB::OCF::PhysicalContainer.adapter = :UnpackedURI
      else
        error "EPUBFILE not a file", option_parser
      end
    end
  end
  error "output not a directory", option_parser if options["output"] && !File.directory?(options["output"])
  cover_image = EPUB::Parser.parse(path).cover_image
  error "cover image not found", option_parser unless cover_image
  path = File.basename(cover_image.href.to_s)
  path = File.join(options["output"], path) if options["output"]
  # Image data is binary; binwrite avoids newline/encoding translation.
  File.binwrite path, cover_image.read
  # Only the path itself goes to stdout; the surrounding message goes to
  # stderr, so stdout carries nothing but the written file's path.
  $stderr.print "Cover image output to "
  print path
  $stderr.puts ""
end

# Reports a fatal usage error on stderr and terminates the process.
#
# The parser must be passed in explicitly: it is a local variable of `main`
# and is not visible from this method.
#
# @param message [String] description of what went wrong
# @param option_parser [OptionParser, nil] parser whose help text is printed
#   after the message; the help text is omitted when nil
# @return [void] never returns; `abort` exits with a failure status
def error(message, option_parser = nil)
  $stderr.puts "Error: #{message}"
  $stderr.puts ""
  $stderr.puts option_parser.help if option_parser
  abort
end
50
+
51
+ main(ARGV)
@@ -0,0 +1,46 @@
1
+ {file:docs/Home} > *{file:docs/EpubCover.adoc}*
2
+
3
+ = `epub-cover` command-line tool
4
+
5
+ The `epub-cover` tool extracts the cover image from an EPUB book.
6
+
7
+ == Usage
8
+
9
+ ----
10
+ % epub-cover --help
11
+ Extract cover image.
12
+ Image is put to current directory with the same name in EPUB.
13
+ It is put to specified directory when `--output' option is given.
14
+
15
+ Usage: epub-cover [options] EPUBFILE
16
+
17
+ Options:
18
+ -o, --output=DIR Directory to put image file
19
+ ----
20
+
21
+ Example:
22
+
23
+ ----
24
+ % epub-cover childrens-literature.epub
25
+ Cover image output to cover.png
26
+ ----
27
+
28
+ As the output indicates, the cover image file is written to the current directory. The file name is the same as the one in the EPUB file.
29
+
30
+ === Output directory
31
+
32
+ You can specify the directory to write the cover file to with the `--output` option.
33
+
34
+ ----
35
+ % epub-cover --output=/tmp childrens-literature.epub
36
+ Cover image output to /tmp/cover.png
37
+ ----
38
+
39
+ === Extract from the web
40
+
41
+ `epub-cover` accepts a URI instead of a file path.
42
+
43
+ ----
44
+ % epub-cover https://raw.githubusercontent.com/IDPF/epub3-samples/master/30/page-blanche/
45
+ Cover image output to cover.jpg
46
+ ----
@@ -0,0 +1,9 @@
1
+ = Examples
2
+
3
+ = {doctitle}
4
+
5
+ Here are examples to help you learn how to use the EPUB Parser gem.
6
+
7
+ * {file:docs/AggregateContentsFromWeb.markdown Aggregate Contents From the Web}
8
+ * {file:examples/exctract-content-using-cfi.rb Extract contents from EPUB files using EPUB CFI(identifier for EPUB)}
9
+ * {file:examples/find-elements-and-cfis.rb Find elements and CFIs}
@@ -0,0 +1,224 @@
1
+ = EPUB Parser
2
+
3
+ = {doctitle}
4
+
5
+ EPUB Parser gem parses EPUB 3 books loosely.
6
+
7
+ image:https://gitlab.com/KitaitiMakoto/epub-parser/badges/master/build.svg[link="https://gitlab.com/KitaitiMakoto/epub-parser/commits/master", title="pipeline status"]
8
+ image:https://gemnasium.com/KitaitiMakoto/epub-parser.png[link="https://gemnasium.com/KitaitiMakoto/epub-parser",title="Dependency Status"]
9
+ image:https://badge.fury.io/rb/epub-parser.svg[link="https://badge.fury.io/rb/epub-parser",title="Gem Version"]
10
+ image:https://gitlab.com/KitaitiMakoto/epub-parser/badges/master/coverage.svg[link="https://kitaitimakoto.gitlab.io/epub-parser/coverage/",title="coverage report"]
11
+
12
+ * https://kitaitimakoto.gitlab.io/epub-parser/file.Home.html[Homepage]
13
+ * https://kitaitimakoto.gitlab.io/epub-parser/[Documentation]
14
+ * https://gitlab.com/KitaitiMakoto/epub-parser[Source Code]
15
+ * https://kitaitimakoto.gitlab.io/epub-parser/coverage/[Test Coverage]
16
+
17
+ == Installation
18
+
19
+ gem install epub-parser
20
+
21
+ == Usage
22
+
23
+ === As command-line tools
24
+
25
+ ==== epubinfo
26
+
27
+ `epubinfo` tool extracts and shows the metadata of specified EPUB book.
28
+
29
+ See {file:docs/Epubinfo.markdown}.
30
+
31
+ ==== epub-open
32
+
33
+ `epub-open` tool provides an interactive shell (IRB) which helps you explore an EPUB book.
34
+
35
+ See {file:docs/EpubOpen.markdown}.
36
+
37
+ ==== epub-cover
38
+
39
+ `epub-cover` tool extracts the cover image from an EPUB book.
40
+
41
+ See {file:docs/EpubCover.adoc}.
42
+
43
+ === As a library
44
+
45
+ Use `EPUB::Parser.parse` at first:
46
+
47
+ ----
48
+ require 'epub/parser'
49
+
50
+ book = EPUB::Parser.parse('/path/to/book.epub')
51
+ ----
52
+
53
+ This book object can yield pages in spine order (the spine defines the reading order determined by the author):
54
+
55
+ ----
56
+ book.each_page_on_spine do |page|
57
+ # do something...
58
+ end
59
+ ----
60
+
61
+ `page` above is an {EPUB::Publication::Package::Manifest::Item} object and you can call {EPUB::Publication::Package::Manifest::Item#href #href} to see where the page file is:
62
+
63
+ ----
64
+ book.each_page_on_spine do |page|
65
+ file = page.href # => path/to/page/in/zip/archive
66
+ html = Zip::Archive.open('/path/to/book.epub') {|zip|
67
+ zip.fopen(file.to_s) {|file| file.read}
68
+ }
69
+ end
70
+ ----
71
+
72
+ And {EPUB::Publication::Package::Manifest::Item Item} provides syntactic sugar {EPUB::Publication::Package::Manifest::Item#read #read} for the above:
73
+
74
+ ----
75
+ html = page.read
76
+ doc = Nokogiri.HTML(html)
77
+ # do something with Nokogiri as always
78
+ ----
79
+
80
+ For several utilities of Item, see {file:docs/Item.markdown} page.
81
+
82
+ By the way, although `book` above is a {EPUB::Book} object, all features are provided by {EPUB::Book::Features} module. Therefore YourBook class can include the features of {EPUB::Book::Features}:
83
+
84
+ ----
85
+ require 'epub'
86
+
87
+ class YourBook < ActiveRecord::Base
88
+ include EPUB::Book::Features
89
+ end
90
+
91
+ book = EPUB::Parser.parse(
92
+ 'uploaded-book.epub',
93
+ :class => YourBook # *************** pass YourBook class
94
+ )
95
+ book.instance_of? YourBook # => true
96
+ book.required = 'value for required field'
97
+ book.save!
98
+ book.each_page_on_spine do |epage|
99
+ page = YourBookPage.create(
100
+ :some_attr => 'some attr',
101
+ :content => epage.read,
102
+ :another_attr => 'another attr'
103
+ )
104
+ book.pages << page
105
+ end
106
+ ----
107
+
108
+ You can also find a YourBook object first and pass it to the parser:
109
+
110
+ ----
111
+ book = YourBook.find params[:id]
112
+ ret = EPUB::Parser.parse(
113
+ 'uploaded-book.epub',
114
+ :book => book # ******************* pass your book instance
115
+ ) # => book
116
+ ret == book # => true; this API is not good I feel... Welcome suggestion!
117
+ # do something with your book
118
+ ----
119
+
120
+ ==== Switching ZIP library
121
+
122
+ EPUB Parser uses https://github.com/javanthropus/archive-zip[Archive::Zip], a pure Ruby ZIP library, by default. You can use https://bitbucket.org/winebarrel/zip-ruby/wiki/Home[Zip/Ruby], Ruby bindings for https://libzip.org/[libzip], if you have already installed the Zip/Ruby gem by RubyGems or Bundler.
123
+
124
+ Globally:
125
+
126
+ ----
127
+ EPUB::OCF::PhysicalContainer.adapter = :Zipruby
128
+ book = EPUB::Parser.parse("path/to/book.epub")
129
+ ----
130
+
131
+ For each EPUB book:
132
+
133
+ ----
134
+ book = EPUB::Parser.parse("path/to/book.epub", container_adapter: :Zipruby)
135
+ ----
136
+
137
+ == Documentation
138
+
139
+ === APIs
140
+
141
+ More documentation is available in:
142
+
143
+ * {file:docs/Publication.markdown} includes document's meta data, file list and so on.
144
+ * {file:docs/Item.markdown} represents a file in EPUB package.
145
+ * {file:docs/FixedLayout.markdown} provides APIs to declare how EPUB readers should render content, such as in reflowable or fixed layout.
146
+ * {file:docs/Navigation.markdown} describes how to use Navigation Document.
147
+ * {file:docs/Searcher.markdown} introduces APIs to search words and elements, and search by EPUB CFIs(a position pointer for EPUB) from EPUB documents.
148
+ * {file:docs/UnpackedArchive.markdown} describes how to handle directories which were generated by unzipping EPUB files instead of EPUB files themselves.
149
+ * {file:docs/MultipleRenditions.markdown} describes EPUB Multiple-Rendition Publications and the APIs for them.
150
+
151
+ === Examples
152
+
153
+ Example usages are listed in {file:Examples} page.
154
+
155
+ * {file:docs/AggregateContentsFromWeb.markdown Aggregate Contents From the Web}
156
+ * {file:examples/exctract-content-using-cfi.rb Extract contents from EPUB files using EPUB CFI(identifier for EPUB)}
157
+ * {file:examples/find-elements-and-cfis.rb Find elements and CFIs}
158
+
159
+ === Building documentation
160
+
161
+ If you installed EPUB Parser via the gem command, you can also generate the documentation on your own (the https://gitlab.com/KitaitiMakoto/rubygems-yardoc[rubygems-yardoc] gem is needed):
162
+
163
+ ----
164
+ $ gem install epub-parser
165
+ $ gem yardoc epub-parser
166
+ ...
167
+ Files: 33
168
+ Modules: 20 ( 20 undocumented)
169
+ Classes: 45 ( 44 undocumented)
170
+ Constants: 31 ( 31 undocumented)
171
+ Methods: 292 ( 88 undocumented)
172
+ 52.84% documented
173
+ YARD documentation is generated to:
174
+ /path/to/gempath/ruby/2.2.0/doc/epub-parser-0.2.0/yardoc
175
+ ----
176
+
177
+ It will show you the path to the generated documentation (`/path/to/gempath/ruby/2.2.0/doc/epub-parser-0.2.0/yardoc` here) at the end.
178
+
179
+ Or, you can generate the documentation from the source repository with the Rake task, too:
180
+
181
+ ----
182
+ $ git clone https://gitlab.com/KitaitiMakoto/epub-parser.git
183
+ $ cd epub-parser
184
+ $ bundle install --path=deps
185
+ $ bundle exec rake doc:yard
186
+ ...
187
+ Files: 33
188
+ Modules: 20 ( 20 undocumented)
189
+ Classes: 45 ( 44 undocumented)
190
+ Constants: 31 ( 31 undocumented)
191
+ Methods: 292 ( 88 undocumented)
192
+ 52.84% documented
193
+ ----
194
+
195
+ Then documentation will be available in `doc` directory.
196
+
197
+ == Requirements
198
+
199
+ * Ruby 2.2.0 or later
200
+ * `patch` command to install Nokogiri
201
+ * C compiler to compile Zip/Ruby and Nokogiri
202
+
203
+ == History
204
+
205
+ See {file:CHANGELOG.adoc}.
206
+
207
+ == Note
208
+
209
+ This library is still a work in progress.
210
+ Only a few features are implemented and APIs might be changed in the future.
211
+ Note that.
212
+
213
+ Currently implemented:
214
+
215
+ * container.xml of http://idpf.org/epub/30/spec/epub30-ocf.html#sec-container-metainf-container.xml[EPUB Open Container Format (OCF) 3.0]
216
+ * http://idpf.org/epub/30/spec/epub30-publications.html[EPUB Publications 3.0]
217
+ * EPUB Navigation Documents of http://www.idpf.org/epub/30/spec/epub30-contentdocs.html[EPUB Content Documents 3.0]
218
+ * http://www.idpf.org/epub/fxl/[EPUB 3 Fixed-Layout Documents]
219
+ * metadata.xml of http://www.idpf.org/epub/renditions/multiple/[EPUB Multiple-Rendition Publications]
220
+
221
+ == License
222
+
223
+ This library is distributed under the terms of the MIT License.
224
+ See {file:MIT-LICENSE} file for more info.
@@ -0,0 +1,132 @@
1
+ {file:docs/Home.markdown} > **{file:docs/Searcher.markdown}**
2
+
3
+ = Searcher
4
+
5
+ *Searcher is experimental now. Note that none of its interfaces are stable yet.*
6
+
7
+ == Example
8
+
9
+ ----
10
+ epub = EPUB::Parser.parse('childrens-literature.epub')
11
+ search_word = 'INTRODUCTORY'
12
+ results = EPUB::Searcher.search_text(epub, search_word)
13
+ # => [#<EPUB::Searcher::Result:0x007f80ccde9528
14
+ # @end_steps=[#<EPUB::Searcher::Result::Step:0x007f80ccde9730 @index=12, @info={}, @type=:character>],
15
+ # @parent_steps=
16
+ # [#<EPUB::Searcher::Result::Step:0x007f80ccf571d0 @index=2, @info={:name=>"spine", :id=>nil}, @type=:element>,
17
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccf3d3e8 @index=1, @info={:id=>nil}, @type=:itemref>,
18
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccde9e88 @index=1, @info={:name=>"body", :id=>nil}, @type=:element>,
19
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccde9e38 @index=0, @info={:name=>"nav", :id=>"toc"}, @type=:element>,
20
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccde9de8 @index=1, @info={:name=>"ol", :id=>"tocList"}, @type=:element>,
21
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccde9d98 @index=0, @info={:name=>"li", :id=>"np-313"}, @type=:element>,
22
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccde9d48 @index=1, @info={:name=>"ol", :id=>nil}, @type=:element>,
23
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccde9ca8 @index=1, @info={:name=>"li", :id=>"np-317"}, @type=:element>,
24
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccde9c08 @index=0, @info={:name=>"a", :id=>nil}, @type=:element>,
25
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccde9bb8 @index=0, @info={}, @type=:text>],
26
+ # @start_steps=[#<EPUB::Searcher::Result::Step:0x007f80ccde9af0 @index=0, @info={}, @type=:character>]>,
27
+ # #<EPUB::Searcher::Result:0x007f80ccebcb30
28
+ # @end_steps=[#<EPUB::Searcher::Result::Step:0x007f80ccebcdb0 @index=12, @info={}, @type=:character>],
29
+ # @parent_steps=
30
+ # [#<EPUB::Searcher::Result::Step:0x007f80ccf571d0 @index=2, @info={:name=>"spine", :id=>nil}, @type=:element>,
31
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccde94b0 @index=2, @info={:id=>nil}, @type=:itemref>,
32
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccebd328 @index=1, @info={:name=>"body", :id=>nil}, @type=:element>,
33
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccebd2d8 @index=0, @info={:name=>"section", :id=>"pgepubid00492"}, @type=:element>,
34
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccebd260 @index=3, @info={:name=>"section", :id=>"pgepubid00498"}, @type=:element>,
35
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccebd210 @index=1, @info={:name=>"h3", :id=>nil}, @type=:element>,
36
+ # ##<EPUB::Searcher::Result::Step:0x007f80ccebd198 @index=0, @info={}, @type=:text>],
37
+ # @start_steps=[#<EPUB::Searcher::Result::Step:0x007f80ccebd0d0 @index=0, @info={}, @type=:character>]>]
38
+ puts results.collect(&:to_cfi).collect(&:to_fragment)
39
+ # epubcfi(/6/4!/4/2[toc]/4[tocList]/2[np-313]/4/4[np-317]/2/1,:0,:12)
40
+ # epubcfi(/6/6!/4/2[pgepubid00492]/8[pgepubid00498]/4/1,:0,:12)
41
+ # => nil
42
+ ----
43
+
44
+ == Search result
45
+
46
+ Search result is an array of {EPUB::Searcher::Result} and it may be converted to an EPUBCFI string by {EPUB::Searcher::Result#to_cfi_s}.
47
+
48
+ == Seamless XHTML Searcher
49
+
50
+ Now default searcher for XHTML is *seamless* searcher, which ignores tags when searching.
51
+
52
+ You can find the words 'search word' in the XHTML document below:
53
+
54
+ ----
55
+ <html>
56
+ <head>
57
+ <title>Sample document</title>
58
+ </head>
59
+ <body>
60
+ <p><em>search</em> word</p>
61
+ </body>
62
+ </html>
63
+ ----
64
+
65
+ == Restricted XHTML Searcher
66
+
67
+ You can also use the *restricted* searcher, which searches only within single elements. For instance, it can find 'search word' in the XHTML document below:
68
+
69
+ ----
70
+ <html>
71
+ <head>
72
+ <title>Sample document</title>
73
+ </head>
74
+ <body>
75
+ <p>search word</p>
76
+ </body>
77
+ </html>
78
+ ----
79
+
80
+ But it cannot find them in the document below:
81
+
82
+ ----
83
+ <html>
84
+ <head>
85
+ <title>Sample document</title>
86
+ </head>
87
+ <body>
88
+ <p><em>search</em> word</p>
89
+ </body>
90
+ </html>
91
+ ----
92
+
93
+ because the words 'search' and 'word' are not in the same element.
94
+
95
+ To use the restricted searcher, specify the `algorithm` option for the `search_text` method:
96
+
97
+ results = EPUB::Searcher.search_text(epub, search_word, algorithm: :restricted)
98
+
99
+ == Element Searcher
100
+
101
+ You can search XHTML elements by CSS selector or XPath.
102
+
103
+ ----
104
+ EPUB::Searcher::Publication.search_element(@package, css: 'ol > li').collect {|result| result[:location]}.map(&:to_fragment)
105
+ # => ["epubcfi(/4/4!/4/2[toc]/4[tocList]/2[np-313])",
106
+ # "epubcfi(/4/4!/4/2[toc]/4[tocList]/2[np-313]/4/2[np-315])",
107
+ # "epubcfi(/4/4!/4/2[toc]/4[tocList]/2[np-313]/4/4[np-317])",
108
+ # "epubcfi(/4/4!/4/2[toc]/4[tocList]/2[np-313]/4/6)",
109
+ # "epubcfi(/4/4!/4/2[toc]/4[tocList]/2[np-313]/4/6/4/2[np-319])",
110
+ # "epubcfi(/4/4!/4/2[toc]/4[tocList]/2[np-313]/4/6/4/2[np-319]/4/2)",
111
+ # :
112
+ # :
113
+ ----
114
+
115
+ == Search by EPUB CFI
116
+
117
+ You can fetch an XML node from an EPUB document by EPUB CFI.
118
+
119
+ ----
120
+ require "epub/parser"
121
+ require "epub/searcher"
122
+
123
+ epub = EPUB::Parser.parse("childrens-literature.epub")
124
+ cfi = EPUB::CFI("/6/4!/4/2[toc]/4[tocList]/2[np-313]/4/4[np-317]")
125
+ itemref, node = EPUB::Searcher.search_by_cfi(epub, cfi)
126
+ puts itemref.item.full_path
127
+ puts node
128
+ # EPUB/nav.xhtml
129
+ # <li id="np-317" class="front">
130
+ # <a href="s04.xhtml#pgepubid00498">INTRODUCTORY</a>
131
+ # </li>
132
+ ----