epub-parser 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -0
- data/CHANGELOG.markdown +7 -0
- data/Gemfile +1 -1
- data/README.markdown +7 -8
- data/docs/Searcher.markdown +74 -0
- data/epub-parser.gemspec +1 -0
- data/lib/epub/parser/version.rb +1 -1
- data/lib/epub/publication/package/manifest.rb +14 -1
- data/lib/epub/searcher.rb +3 -0
- data/lib/epub/searcher/publication.rb +32 -0
- data/lib/epub/searcher/result.rb +73 -0
- data/lib/epub/searcher/xhtml.rb +57 -0
- data/test/fixtures/book/OPS/japanese.eucjp.xhtml +10 -0
- data/test/fixtures/book/OPS/japanese.sjis.xhtml +10 -0
- data/test/fixtures/book/OPS/japanese.utf8.xhtml +10 -0
- data/test/fixtures/book/OPS//343/203/253/343/203/274/343/203/210/343/203/225/343/202/241/343/202/244/343/203/253.opf +10 -1
- data/test/test_parser_publication.rb +2 -2
- data/test/test_publication.rb +10 -0
- data/test/test_searcher.rb +117 -0
- metadata +27 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 02abe0846123f8f3d218581a102f69c2972a80e1
|
4
|
+
data.tar.gz: 510bddcca86f789add0ecc4193b424063a5d23d6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8c46c39e8031ab857e968dc3826e4ce0fb6d65588ef8d8e61f05672628feaaa58d25a829c1b39826cf1a04b458dcdcd2f0898578f92768bd9db866b4a5f1f6aa
|
7
|
+
data.tar.gz: feea20e63185013eea0c9d5bdd0b8c0e80a4e06646b8bd106095a9eb967a0c067113915ad70458d9332d3efab120875447c90032fa4f4dedf2465d72532f1d3e
|
data/.yardopts
CHANGED
data/CHANGELOG.markdown
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
CHANGELOG
|
2
2
|
=========
|
3
|
+
|
4
|
+
0.1.7
|
5
|
+
-----
|
6
|
+
|
7
|
+
* [Experimental] Add `EPUB::Searcher` module. See {file:Searcher.markdown} for details
|
8
|
+
* Detect and set character encoding in `EPUB::Publication::Package::Item#read`
|
9
|
+
|
3
10
|
0.1.6
|
4
11
|
-----
|
5
12
|
* Remove `EPUB.parse` method
|
data/Gemfile
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
source
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
gemspec
|
data/README.markdown
CHANGED
@@ -93,6 +93,7 @@ See {file:docs/EpubOpen} for more info.
|
|
93
93
|
REQUIREMENTS
|
94
94
|
------------
|
95
95
|
* Ruby 1.9.3 or later
|
96
|
+
* `patch` command to install Nokogiri
|
96
97
|
* C compiler to compile Zip/Ruby and Nokogiri
|
97
98
|
|
98
99
|
Related Gems
|
@@ -108,6 +109,12 @@ If you find other gems, please tell me or request a pull request.
|
|
108
109
|
|
109
110
|
RECENT CHANGES
|
110
111
|
--------------
|
112
|
+
|
113
|
+
### 0.1.7
|
114
|
+
|
115
|
+
* [Experimental] Add `EPUB::Searcher` module. See {file:Searcher.markdown} for details
|
116
|
+
* Detect and set character encoding in `EPUB::Publication::Package::Item#read`
|
117
|
+
|
111
118
|
### 0.1.6
|
112
119
|
* Remove `EPUB.parse` method
|
113
120
|
* Remove `EPUB::Publication::Package::Metadata#to_hash`
|
@@ -134,14 +141,6 @@ RECENT CHANGES
|
|
134
141
|
* Add `ContentDocument::XHTML#rexml` and `#nokogiri`
|
135
142
|
* Inspect more readably
|
136
143
|
|
137
|
-
### 0.1.4
|
138
|
-
* [Fixed-Layout Documents][fixed-layout] support
|
139
|
-
* Define `ContentDocument::XHTML#top_level?`
|
140
|
-
* Define `Spine::Itemref#page_spread` and `#page_spread=`
|
141
|
-
* Define some utility methods around `Manifest::Item` and `Spine::Itemref`
|
142
|
-
|
143
|
-
[fixed-layout]: http://www.idpf.org/epub/fxl/
|
144
|
-
|
145
144
|
See {file:CHANGELOG.markdown} for older changelogs and details.
|
146
145
|
|
147
146
|
TODOS
|
@@ -0,0 +1,74 @@
|
|
1
|
+
{file:docs/Home.markdown} > **{file:docs/Searcher.markdown}**
|
2
|
+
|
3
|
+
Searcher
|
4
|
+
========
|
5
|
+
|
6
|
+
*Searcher is currently experimental. Note that none of its interfaces are stable yet.*
|
7
|
+
|
8
|
+
Example
|
9
|
+
-------
|
10
|
+
|
11
|
+
epub = EPUB::Parser.parse('childrens-literature-20130206.epub')
|
12
|
+
search_word = 'INTRODUCTORY'
|
13
|
+
results = EPUB::Searcher.search(epub.package, search_word)
|
14
|
+
# => [#<EPUB::Searcher::Result:0x007f74d2b31548
|
15
|
+
# @end_steps=[#<EPUB::Searcher::Result::Step:0x007f74d2b7baa8 @index=12, @type=:character>],
|
16
|
+
# @parent_steps=
|
17
|
+
# [#<EPUB::Searcher::Result::Step:0x007f74d2b81318 @index=2, @name="spine", @type=:element>,
|
18
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7f4c8 @index=1, @type=:itemref>,
|
19
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7d560 @index=1, @name="body", @type=:element>,
|
20
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7d308 @index=0, @name="nav", @type=:element>,
|
21
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7cdb8 @index=1, @name="ol", @type=:element>,
|
22
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7cb38 @index=0, @name="li", @type=:element>,
|
23
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7c5e8 @index=1, @name="ol", @type=:element>,
|
24
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7bf80 @index=1, @name="li", @type=:element>,
|
25
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7bd28 @index=0, @name="a", @type=:element>,
|
26
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7bb70 @index=0, @type=:text>],
|
27
|
+
# @start_steps=[#<EPUB::Searcher::Result::Step:0x007f74d2b7baf8 @index=0, @type=:character>]>,
|
28
|
+
# #<EPUB::Searcher::Result:0x007f74d294e258
|
29
|
+
# @end_steps=[#<EPUB::Searcher::Result::Step:0x007f74d2b0f8d0 @index=12, @type=:character>],
|
30
|
+
# @parent_steps=
|
31
|
+
# [#<EPUB::Searcher::Result::Step:0x007f74d2b81318 @index=2, @name="spine", @type=:element>,
|
32
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b314f8 @index=2, @type=:itemref>,
|
33
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b2fb80 @index=1, @name="body", @type=:element>,
|
34
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b2f900 @index=0, @name="section", @type=:element>,
|
35
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b10578 @index=3, @name="section", @type=:element>,
|
36
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b0fb50 @index=1, @name="h3", @type=:element>,
|
37
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b0f998 @index=0, @type=:text>],
|
38
|
+
# @start_steps=[#<EPUB::Searcher::Result::Step:0x007f74d2b0f920 @index=0, @type=:character>]>]
|
39
|
+
puts results.collect(&:to_cfi_s)
|
40
|
+
# /6/4!/4/2/4/2/4/4/2/1,:0,:12
|
41
|
+
# /6/6!/4/2/8/4/1,:0,:12
|
42
|
+
# => nil
|
43
|
+
|
44
|
+
Search result
|
45
|
+
-------------
|
46
|
+
|
47
|
+
A search result is an array of {EPUB::Searcher::Result} objects, each of which can be converted to an EPUB CFI string with {EPUB::Searcher::Result#to_cfi_s}.
|
48
|
+
|
49
|
+
Restricted XHTML Searcher
|
50
|
+
-------------------------
|
51
|
+
|
52
|
+
Currently the searcher for XHTML documents is *restricted*, which means that it can only match text contained within a single element. For instance, it can find 'search word' in the XHTML document below:
|
53
|
+
|
54
|
+
<html>
|
55
|
+
<head>
|
56
|
+
<title>Sample document</title>
|
57
|
+
</head>
|
58
|
+
<body>
|
59
|
+
<p>search word</p>
|
60
|
+
</body>
|
61
|
+
</html>
|
62
|
+
|
63
|
+
But it cannot find it in the document below:
|
64
|
+
|
65
|
+
<html>
|
66
|
+
<head>
|
67
|
+
<title>Sample document</title>
|
68
|
+
</head>
|
69
|
+
<body>
|
70
|
+
<p><em>search</em> word</p>
|
71
|
+
</body>
|
72
|
+
</html>
|
73
|
+
|
74
|
+
because the words 'search' and 'word' are not in the same element.
|
data/epub-parser.gemspec
CHANGED
data/lib/epub/parser/version.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'set'
|
2
2
|
require 'enumerabler'
|
3
|
+
require 'rchardet'
|
3
4
|
require 'epub/constants'
|
4
5
|
require 'epub/parser/content_document'
|
5
6
|
|
@@ -91,9 +92,21 @@ module EPUB
|
|
91
92
|
end
|
92
93
|
|
93
94
|
# Reads this item's content from the EPUB archive and tags it with the
# character encoding detected by CharDet.
#
# The string is re-tagged in place with String#force_encoding; no
# transcoding takes place. When CharDet detects no encoding, or reports an
# encoding name Ruby does not know, the content is returned with the
# encoding it was read with and a warning is printed if $DEBUG or $VERBOSE
# is set.
#
# @return [String] content of the item
def read
  raw_content = Zip::Archive.open(manifest.package.book.epub_file) {|zip|
    zip.fopen(entry_name).read
  }

  # The content read from the archive is tagged ASCII-8BIT, so matching it
  # against CharDet's internal ASCII-8BIT regexps cannot raise
  # Encoding::CompatibilityError. No need to rescue that error here.
  encoding = CharDet.detect(raw_content)['encoding']
  unless encoding
    warn "No encoding detected for #{entry_name}. Set to ASCII-8BIT" if $DEBUG || $VERBOSE
    return raw_content
  end

  begin
    raw_content.force_encoding(encoding)
  rescue ArgumentError
    # force_encoding raises ArgumentError for an encoding name Ruby does not
    # know; the detector may report such a name, so fall back instead of
    # failing the whole read.
    warn "Unknown encoding #{encoding} detected for #{entry_name}. Set to ASCII-8BIT" if $DEBUG || $VERBOSE
    raw_content
  end
end
|
98
111
|
|
99
112
|
def xhtml?
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'epub/publication'
|
2
|
+
|
3
|
+
module EPUB
  module Searcher
    # Searches the documents referenced from a package's spine for a word.
    class Publication
      # Shortcut for +new(word).search(package)+.
      #
      # @param package [Object] package whose spine is searched
      # @param word [String] word to look for
      # @return [Array<Result>]
      def self.search(package, word)
        new(word).search(package)
      end

      # @param word [String] word to look for
      def initialize(word)
        @word = word
      end

      # Reads every spine item, parses it as XML and collects the hits found
      # by {XHTML::Restricted}. Each hit is prefixed with a step for the
      # spine element and a step for the itemref it was found in.
      #
      # @param package [Object] package whose spine is searched
      # @return [Array<Result>]
      def search(package)
        spine = package.spine
        # Element index 2 is hard-coded for the spine step.
        # NOTE(review): presumably spine is the third child of the package
        # element; confirm against the OPF layout.
        spine_step = Result::Step.new(:element, 2, {:name => 'spine', :id => spine.id})

        spine.each_itemref.with_index.flat_map {|itemref, position|
          leading_steps = [spine_step, Result::Step.new(:itemref, position, {:id => itemref.id})]
          document = Nokogiri.XML(itemref.item.read)

          XHTML::Restricted.search(document, @word).map {|hit|
            Result.new(leading_steps + hit.parent_steps, hit.start_steps, hit.end_steps)
          }
        }
      end
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module EPUB
  module Searcher
    # A single search hit, described as a path of {Step}s from the search
    # root down to the matched range.
    class Result
      attr_reader :parent_steps, :start_steps, :end_steps

      # @param parent_steps [Array<Step>] common steps between start and end
      # @param start_steps [Array<Step>, nil] steps to start from +parent_steps+;
      #   nil when the hit is a whole element rather than a character range
      # @param end_steps [Array<Step>, nil] steps to end from +parent_steps+
      def initialize(parent_steps, start_steps, end_steps)
        @parent_steps, @start_steps, @end_steps = parent_steps, start_steps, end_steps
      end

      # Builds a relative XPath to the start of the hit and the offset of
      # its last start step.
      #
      # @param with_xmlns [Boolean] prefix element steps with +xhtml:+
      # @return [Array(String, Integer)] XPath and offset; the offset is nil
      #   when the result has no start steps
      def to_xpath_and_offset(with_xmlns=false)
        prefix = with_xmlns ? 'xhtml:' : nil
        # start_steps is nil for hits without a character range.
        start = @start_steps.to_a
        xpath = (@parent_steps + start).reduce('.') {|path, step|
          case step.type
          when :element
            path + '/%s*[%d]' % [prefix, step.index + 1]
          when :text
            # NOTE(review): text()[n] counts text nodes while the step index
            # is shared with the CFI form - confirm they agree when a text
            # node follows an element without leading text.
            path + '/text()[%s]' % [step.index + 1]
          else
            # Other step types (e.g. :character, :itemref) add nothing to the path.
            path
          end
        }

        [xpath, start.empty? ? nil : start.last.index]
      end

      # @return [String] the parent, start and end steps as CFI fragments
      #   joined with commas; nil step lists are left out
      def to_cfi_s
        [@parent_steps, @start_steps, @end_steps].compact.map {|steps|
          steps.map(&:to_cfi_s).join
        }.join(',')
      end

      # Two results are equal when their full start paths and full end paths
      # (parent steps followed by start/end steps) are equal.
      #
      # @param other [Object]
      # @return [Boolean] false for objects that do not expose the step readers
      def ==(other)
        return false unless [:parent_steps, :start_steps, :end_steps].all? {|reader| other.respond_to?(reader)}

        @parent_steps + @start_steps.to_a == other.parent_steps + other.start_steps.to_a &&
          @parent_steps + @end_steps.to_a == other.parent_steps + other.end_steps.to_a
      end

      # One step of a path: an element, text, character or itemref position.
      class Step
        attr_reader :type, :index, :info

        # @param type [Symbol] :element, :text, :character or :itemref
        # @param index [Integer] zero-based position; see {#to_cfi_s} for how
        #   each type interprets it
        # @param info [Hash] extra data; +:id+ is used for the CFI ID assertion
        def initialize(type, index, info={})
          @type, @index, @info = type, index, info
        end

        # @param other [Object]
        # @return [Boolean] true when type, index and info all match
        def ==(other)
          return false unless [:type, :index, :info].all? {|reader| other.respond_to?(reader)}

          type == other.type && index == other.index && info == other.info
        end

        # @return [String, nil] CFI fragment for this step; nil for an
        #   unknown type
        def to_cfi_s
          case type
          when :element
            # Elements take even CFI indices: 2, 4, 6, ...
            '/%d%s' % [(index + 1) * 2, id_assertion]
          when :text
            # Text takes odd CFI indices. The index counts the elements
            # preceding the text, so text after k elements is 2k + 1.
            '/%d' % [index * 2 + 1]
          when :character
            ':%d' % [index]
          when :itemref
            # "!" marks the indirection from the itemref into the document.
            '/%d%s!' % [(index + 1) * 2, id_assertion]
          end
        end

        private

        # @return [String, nil] "[id]" when an id is known, otherwise nil
        def id_assertion
          info[:id] ? "[#{info[:id]}]" : nil
        end
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'epub'
|
2
|
+
require 'epub/parser/utils'
|
3
|
+
|
4
|
+
module EPUB
  module Searcher
    class XHTML
      # Searcher that only finds a word lying entirely inside one text node
      # or inside the +alt+ attribute of an +img+ element.
      class Restricted
        class << self
          # @param element [Nokogiri::XML::Element, Nokogiri::XML::Document]
          # @param word [String]
          # @return [Array<Result>]
          def search(element, word)
            new(word).search(element.respond_to?(:root) ? element.root : element)
          end
        end

        # @param word [String]
        def initialize(word)
          @word = word
        end

        # Walks the children of +element+ recursively and collects every hit.
        #
        # @param element [Nokogiri::XML::Element, nil] nil (e.g. a document
        #   without a root element) yields no results
        # @return [Array<Result>]
        def search(element)
          results = []
          return results if element.nil?

          elem_index = 0
          element.children.each do |child|
            if child.element?
              child_step = Result::Step.new(:element, elem_index, {:name => child.name, :id => Parser::Utils.extract_attribute(child, 'id')})
              if child.name == 'img'
                # An img without an alt attribute has nothing to search.
                alt = Parser::Utils.extract_attribute(child, 'alt')
                results << Result.new([child_step], nil, nil) if alt && alt.index(@word)
              else
                search(child).each do |sub_result|
                  results << Result.new([child_step] + sub_result.parent_steps, sub_result.start_steps, sub_result.end_steps)
                end
              end
              elem_index += 1
            elsif child.text?
              # A text step is indexed by the number of elements seen so far.
              text_step = Result::Step.new(:text, elem_index)
              text = child.text
              char_index = 0
              # Advance one character after each hit so overlapping
              # occurrences are reported too.
              while (char_index = text.index(@word, char_index))
                results << Result.new([text_step], [Result::Step.new(:character, char_index)], [Result::Step.new(:character, char_index + @word.length)])
                char_index += 1
              end
            end
          end

          results
        end
      end
    end
  end
end
|
@@ -101,6 +101,15 @@
|
|
101
101
|
<item id="encoded-japanese-filename"
|
102
102
|
href="%E6%97%A5%E6%9C%AC%E8%AA%9E.xhtml"
|
103
103
|
media-type="application/xhtml+xml"/>
|
104
|
+
<item id="utf-8-encoded"
|
105
|
+
href="japanese.utf8.xhtml"
|
106
|
+
media-type="application/xhtml+xml"/>
|
107
|
+
<item id="euc-jp-encoded"
|
108
|
+
href="japanese.eucjp.xhtml"
|
109
|
+
media-type="application/xhtml+xml"/>
|
110
|
+
<item id="shift_jis-encoded"
|
111
|
+
href="japanese.sjis.xhtml"
|
112
|
+
media-type="application/xhtml+xml"/>
|
104
113
|
</manifest>
|
105
114
|
<spine>
|
106
115
|
<itemref idref="nav"/>
|
@@ -116,4 +125,4 @@
|
|
116
125
|
<mediaType handler="impl"
|
117
126
|
media-type="application/x-demo-slideshow"/>
|
118
127
|
</bindings>
|
119
|
-
</package>
|
128
|
+
</package>
|
@@ -81,8 +81,8 @@ class TestParserPublication < Test::Unit::TestCase
|
|
81
81
|
@manifest = @parser.parse_manifest
|
82
82
|
end
|
83
83
|
|
84
|
-
def
|
85
|
-
assert_equal
|
84
|
+
# The manifest parsed in setup is expected to list exactly 19 items.
def test_manifest_has_19_items
  item_count = @manifest.items.length
  assert_equal(19, item_count)
end
|
87
87
|
|
88
88
|
def test_item_has_relative_path_as_href_attribute
|
data/test/test_publication.rb
CHANGED
@@ -239,6 +239,16 @@ class TestPublication < Test::Unit::TestCase
|
|
239
239
|
|
240
240
|
assert_nil xhtml_item.find_item_by_relative_iri(Addressable::URI.parse('../image/01.png'))
|
241
241
|
end
|
242
|
+
|
243
|
+
# Each data set pairs the encoding expected from Item#read with the id of
# the manifest item to read.
data('UTF-8'     => [Encoding::UTF_8,     'utf-8-encoded'],
     'EUC-JP'    => [Encoding::EUC_JP,    'euc-jp-encoded'],
     'Shift-JIS' => [Encoding::Shift_JIS, 'shift_jis-encoded'])
def test_read_detects_encoding_automatically(data)
  expected_encoding, item_id = data
  book = EPUB::Parser.parse('test/fixtures/book.epub')
  content = book.package.manifest[item_id].read

  assert_equal expected_encoding, content.encoding
end
|
242
252
|
end
|
243
253
|
end
|
244
254
|
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require_relative 'helper'
|
3
|
+
require 'epub/searcher'
|
4
|
+
|
5
|
+
# Tests for the experimental EPUB::Searcher classes.
class TestSearcher < Test::Unit::TestCase
  # Searching a whole package through its spine.
  class TestPublication < self
    def setup
      super
      opf_path = File.expand_path('../fixtures/book/OPS/ルートファイル.opf', __FILE__)
      nav_path = File.expand_path('../fixtures/book/OPS/nav.xhtml', __FILE__)
      # Block form so the fixture file handle is closed after parsing.
      @package = File.open(opf_path) {|opf|
        EPUB::Parser::Publication.new(opf, 'OPS/ルートファイル.opf').parse
      }
      # Only the nav document has real content; every other spine item is
      # stubbed with an empty document.
      @package.spine.each_itemref do |itemref|
        stub(itemref.item).read {
          itemref.idref == 'nav' ? File.read(nav_path) : '<html></html>'
        }
      end
    end

    def test_no_result
      assert_empty EPUB::Searcher::Publication.search(@package, 'no result')
    end

    def test_simple
      assert_equal(
        results([
          [[[:element, 2, {:name => 'spine', :id => nil}], [:itemref, 0, {:id => nil}], [:element, 0, {:name => 'head', :id => nil}], [:element, 0, {:name => 'title', :id => nil}], [:text, 0]], [[:character, 9]], [[:character, 16]]],
          [[[:element, 2, {:name => 'spine', :id => nil}], [:itemref, 0, {:id => nil}], [:element, 1, {:name => 'body', :id => nil}], [:element, 0, {:name => 'div', :id => nil}], [:element, 0, {:name => 'nav', :id => 'idid'}], [:element, 0, {:name => 'hgroup', :id => nil}], [:element, 1, {:name => 'h1', :id => nil}], [:text, 0]], [[:character, 9]], [[:character, 16]]]
        ]),
        EPUB::Searcher::Publication.search(@package, 'Content')
      )
    end

    class TestResult < self
      def test_to_cfi_s
        assert_equal '/6/2!/4/2/2[idid]/2/4/1,:9,:16', EPUB::Searcher::Publication.search(@package, 'Content').last.to_cfi_s
      end
    end
  end

  # Searching a single XHTML document or element.
  class TestXHTML < self
    def setup
      super
      nav_path = File.expand_path('../fixtures/book/OPS/nav.xhtml', __FILE__)
      # Block form so the fixture file handle is closed after parsing.
      @doc = File.open(nav_path) {|nav| Nokogiri.XML(nav)}
      @h1 = @doc.search('h1').first
      @nav = @doc.search('nav').first
    end

    def test_no_result
      assert_empty EPUB::Searcher::XHTML::Restricted.search(@h1, 'no result')
    end

    def test_simple
      assert_equal results([[[[:text, 0]], [[:character, 9]], [[:character, 16]]]]), EPUB::Searcher::XHTML::Restricted.search(@h1, 'Content')
    end

    def test_multiple_text_result
      assert_equal results([[[[:text, 0]], [[:character, 6]], [[:character, 7]]], [[[:text, 0]], [[:character, 10]], [[:character, 11]]]]), EPUB::Searcher::XHTML::Restricted.search(@h1, 'o')
    end

    def test_text_after_element
      elem = Nokogiri.XML('<root><elem>inner</elem>after</root>')

      assert_equal results([[[[:text, 1]], [[:character, 0]], [[:character, 5]]]]), EPUB::Searcher::XHTML::Restricted.search(elem, 'after')
    end

    def test_entity_reference
      # The "<" must be written as an entity reference to keep the fixture
      # well-formed; the search word is the decoded character.
      elem = Nokogiri.XML('<root>before&lt;after</root>')

      assert_equal results([[[[:text, 0]], [[:character, 6]], [[:character, 7]]]]), EPUB::Searcher::XHTML::Restricted.search(elem, '<')
    end

    def test_nested_result
      assert_equal results([[[[:element, 1, {:name => 'ol', :id => nil}], [:element, 1, {:name => 'li', :id => nil}], [:element, 1, {:name => 'ol', :id => nil}], [:element, 1, {:name => 'li', :id => nil}], [:element, 0, {:name => 'a', :id => nil}], [:text, 0]], [[:character, 0]], [[:character, 3]]]]), EPUB::Searcher::XHTML::Restricted.search(@nav, '第二節')
    end

    def test_img
      assert_equal [result([[[:element, 1, {:name => 'ol', :id => nil}], [:element, 1, {:name => 'li', :id => nil}], [:element, 1, {:name => 'ol', :id => nil}], [:element, 2, {:name => 'li', :id => nil}], [:element, 0, {:name => 'a', :id => nil}], [:element, 0, {:name => 'img', :id => nil}]], nil, nil])], EPUB::Searcher::XHTML::Restricted.search(@nav, '第三節')
    end

    class TestResult < self
      def setup
        super
        @result = EPUB::Searcher::XHTML::Restricted.search(@doc, '第二節').first
      end

      def test_to_xpath_and_offset
        assert_equal ['./*[2]/*[1]/*[1]/*[2]/*[2]/*[2]/*[2]/*[1]/text()[1]', 0], @result.to_xpath_and_offset
        assert_equal ['./xhtml:*[2]/xhtml:*[1]/xhtml:*[1]/xhtml:*[2]/xhtml:*[2]/xhtml:*[2]/xhtml:*[2]/xhtml:*[1]/text()[1]', 0], @result.to_xpath_and_offset(true)
      end

      def test_to_cfi_s
        assert_equal '/4/2/2[idid]/4/4/4/4/2/1,:0,:3', @result.to_cfi_s
      end

      def test_to_cfi_s_img
        assert_equal '/4/2/2[idid]/4/4/4/6/2/2', EPUB::Searcher::XHTML::Restricted.search(@doc, '第三節').first.to_cfi_s
      end
    end
  end

  private

  # Builds an array of results from an array of step triples.
  def results(results)
    results.collect {|res| result(res)}
  end

  # Builds a Result from [parent_steps, start_steps, end_steps], where each
  # entry is nil or an array of Step.new argument lists.
  def result(steps_triple)
    EPUB::Searcher::Result.new(*steps_triple.collect {|steps|
      steps ? steps.collect {|s| step(s)} : steps
    })
  end

  # Builds a Step from its argument list: [type, index] or [type, index, info].
  def step(step)
    EPUB::Searcher::Result::Step.new(*step)
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: epub-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- KITAITI Makoto
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-09-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -276,6 +276,20 @@ dependencies:
|
|
276
276
|
- - ">="
|
277
277
|
- !ruby/object:Gem::Version
|
278
278
|
version: 2.3.5
|
279
|
+
- !ruby/object:Gem::Dependency
|
280
|
+
name: rchardet
|
281
|
+
requirement: !ruby/object:Gem::Requirement
|
282
|
+
requirements:
|
283
|
+
- - ">="
|
284
|
+
- !ruby/object:Gem::Version
|
285
|
+
version: '0'
|
286
|
+
type: :runtime
|
287
|
+
prerelease: false
|
288
|
+
version_requirements: !ruby/object:Gem::Requirement
|
289
|
+
requirements:
|
290
|
+
- - ">="
|
291
|
+
- !ruby/object:Gem::Version
|
292
|
+
version: '0'
|
279
293
|
description: Parse EPUB 3 book loosely
|
280
294
|
email:
|
281
295
|
- KitaitiMakoto@gmail.com
|
@@ -304,6 +318,7 @@ files:
|
|
304
318
|
- docs/Item.markdown
|
305
319
|
- docs/Navigation.markdown
|
306
320
|
- docs/Publication.markdown
|
321
|
+
- docs/Searcher.markdown
|
307
322
|
- epub-parser.gemspec
|
308
323
|
- features/epubinfo.feature
|
309
324
|
- features/step_definitions/epubinfo_steps.rb
|
@@ -337,6 +352,10 @@ files:
|
|
337
352
|
- lib/epub/publication/package/manifest.rb
|
338
353
|
- lib/epub/publication/package/metadata.rb
|
339
354
|
- lib/epub/publication/package/spine.rb
|
355
|
+
- lib/epub/searcher.rb
|
356
|
+
- lib/epub/searcher/publication.rb
|
357
|
+
- lib/epub/searcher/result.rb
|
358
|
+
- lib/epub/searcher/xhtml.rb
|
340
359
|
- man/epubinfo.1.ronn
|
341
360
|
- schemas/epub-nav-30.rnc
|
342
361
|
- schemas/epub-nav-30.sch
|
@@ -347,6 +366,9 @@ files:
|
|
347
366
|
- test/fixtures/book/OPS/case-sensitive.xhtml
|
348
367
|
- test/fixtures/book/OPS/containing space.xhtml
|
349
368
|
- test/fixtures/book/OPS/containing%20space.xhtml
|
369
|
+
- test/fixtures/book/OPS/japanese.eucjp.xhtml
|
370
|
+
- test/fixtures/book/OPS/japanese.sjis.xhtml
|
371
|
+
- test/fixtures/book/OPS/japanese.utf8.xhtml
|
350
372
|
- test/fixtures/book/OPS/nav.xhtml
|
351
373
|
- test/fixtures/book/OPS/ルートファイル.opf
|
352
374
|
- test/fixtures/book/OPS/日本語.xhtml
|
@@ -362,6 +384,7 @@ files:
|
|
362
384
|
- test/test_parser_ocf.rb
|
363
385
|
- test/test_parser_publication.rb
|
364
386
|
- test/test_publication.rb
|
387
|
+
- test/test_searcher.rb
|
365
388
|
homepage: https://github.com/KitaitiMakoto/epub-parser
|
366
389
|
licenses:
|
367
390
|
- MIT
|
@@ -382,7 +405,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
382
405
|
version: '0'
|
383
406
|
requirements: []
|
384
407
|
rubyforge_project:
|
385
|
-
rubygems_version: 2.2.
|
408
|
+
rubygems_version: 2.2.2
|
386
409
|
signing_key:
|
387
410
|
specification_version: 4
|
388
411
|
summary: EPUB 3 Parser
|
@@ -401,4 +424,5 @@ test_files:
|
|
401
424
|
- test/test_parser_ocf.rb
|
402
425
|
- test/test_parser_publication.rb
|
403
426
|
- test/test_publication.rb
|
427
|
+
- test/test_searcher.rb
|
404
428
|
has_rdoc: yard
|