epub-parser 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +1 -0
- data/CHANGELOG.markdown +7 -0
- data/Gemfile +1 -1
- data/README.markdown +7 -8
- data/docs/Searcher.markdown +74 -0
- data/epub-parser.gemspec +1 -0
- data/lib/epub/parser/version.rb +1 -1
- data/lib/epub/publication/package/manifest.rb +14 -1
- data/lib/epub/searcher.rb +3 -0
- data/lib/epub/searcher/publication.rb +32 -0
- data/lib/epub/searcher/result.rb +73 -0
- data/lib/epub/searcher/xhtml.rb +57 -0
- data/test/fixtures/book/OPS/japanese.eucjp.xhtml +10 -0
- data/test/fixtures/book/OPS/japanese.sjis.xhtml +10 -0
- data/test/fixtures/book/OPS/japanese.utf8.xhtml +10 -0
- data/test/fixtures/book/OPS//343/203/253/343/203/274/343/203/210/343/203/225/343/202/241/343/202/244/343/203/253.opf +10 -1
- data/test/test_parser_publication.rb +2 -2
- data/test/test_publication.rb +10 -0
- data/test/test_searcher.rb +117 -0
- metadata +27 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 02abe0846123f8f3d218581a102f69c2972a80e1
|
4
|
+
data.tar.gz: 510bddcca86f789add0ecc4193b424063a5d23d6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8c46c39e8031ab857e968dc3826e4ce0fb6d65588ef8d8e61f05672628feaaa58d25a829c1b39826cf1a04b458dcdcd2f0898578f92768bd9db866b4a5f1f6aa
|
7
|
+
data.tar.gz: feea20e63185013eea0c9d5bdd0b8c0e80a4e06646b8bd106095a9eb967a0c067113915ad70458d9332d3efab120875447c90032fa4f4dedf2465d72532f1d3e
|
data/.yardopts
CHANGED
data/CHANGELOG.markdown
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
CHANGELOG
|
2
2
|
=========
|
3
|
+
|
4
|
+
0.1.7
|
5
|
+
-----
|
6
|
+
|
7
|
+
* [Experimental] Add `EPUB::Searcher` module. See {file:Searcher.markdown} for details
|
8
|
+
* Detect and set character encoding in `EPUB::Publication::Package::Item#read`
|
9
|
+
|
3
10
|
0.1.6
|
4
11
|
-----
|
5
12
|
* Remove `EPUB.parse` method
|
data/Gemfile
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
source
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
gemspec
|
data/README.markdown
CHANGED
@@ -93,6 +93,7 @@ See {file:docs/EpubOpen} for more info.
|
|
93
93
|
REQUIREMENTS
|
94
94
|
------------
|
95
95
|
* Ruby 1.9.3 or later
|
96
|
+
* `patch` command to install Nokogiri
|
96
97
|
* C compiler to compile Zip/Ruby and Nokogiri
|
97
98
|
|
98
99
|
Related Gems
|
@@ -108,6 +109,12 @@ If you find other gems, please tell me or request a pull request.
|
|
108
109
|
|
109
110
|
RECENT CHANGES
|
110
111
|
--------------
|
112
|
+
|
113
|
+
### 0.1.7
|
114
|
+
|
115
|
+
* [Experimental] Add `EPUB::Searcher` module. See {file:Searcher.markdown} for details
|
116
|
+
* Detect and set character encoding in `EPUB::Publication::Package::Item#read`
|
117
|
+
|
111
118
|
### 0.1.6
|
112
119
|
* Remove `EPUB.parse` method
|
113
120
|
* Remove `EPUB::Publication::Package::Metadata#to_hash`
|
@@ -134,14 +141,6 @@ RECENT CHANGES
|
|
134
141
|
* Add `ContentDocument::XHTML#rexml` and `#nokogiri`
|
135
142
|
* Inspect more readably
|
136
143
|
|
137
|
-
### 0.1.4
|
138
|
-
* [Fixed-Layout Documents][fixed-layout] support
|
139
|
-
* Define `ContentDocument::XHTML#top_level?`
|
140
|
-
* Define `Spine::Itemref#page_spread` and `#page_spread=`
|
141
|
-
* Define some utility methods around `Manifest::Item` and `Spine::Itemref`
|
142
|
-
|
143
|
-
[fixed-layout]: http://www.idpf.org/epub/fxl/
|
144
|
-
|
145
144
|
See {file:CHANGELOG.markdown} for older changelogs and details.
|
146
145
|
|
147
146
|
TODOS
|
@@ -0,0 +1,74 @@
|
|
1
|
+
{file:docs/Home.markdown} > **{file:docs/Searcher.markdown}**
|
2
|
+
|
3
|
+
Searcher
|
4
|
+
========
|
5
|
+
|
6
|
+
*Searcher is currently experimental. Note that none of its interfaces are stable yet.*
|
7
|
+
|
8
|
+
Example
|
9
|
+
-------
|
10
|
+
|
11
|
+
epub = EPUB::Parser.parse('childrens-literature-20130206.epub')
|
12
|
+
search_word = 'INTRODUCTORY'
|
13
|
+
results = EPUB::Searcher.search(epub.package, search_word)
|
14
|
+
# => [#<EPUB::Searcher::Result:0x007f74d2b31548
|
15
|
+
# @end_steps=[#<EPUB::Searcher::Result::Step:0x007f74d2b7baa8 @index=12, @type=:character>],
|
16
|
+
# @parent_steps=
|
17
|
+
# [#<EPUB::Searcher::Result::Step:0x007f74d2b81318 @index=2, @name="spine", @type=:element>,
|
18
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7f4c8 @index=1, @type=:itemref>,
|
19
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7d560 @index=1, @name="body", @type=:element>,
|
20
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7d308 @index=0, @name="nav", @type=:element>,
|
21
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7cdb8 @index=1, @name="ol", @type=:element>,
|
22
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7cb38 @index=0, @name="li", @type=:element>,
|
23
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7c5e8 @index=1, @name="ol", @type=:element>,
|
24
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7bf80 @index=1, @name="li", @type=:element>,
|
25
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7bd28 @index=0, @name="a", @type=:element>,
|
26
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b7bb70 @index=0, @type=:text>],
|
27
|
+
# @start_steps=[#<EPUB::Searcher::Result::Step:0x007f74d2b7baf8 @index=0, @type=:character>]>,
|
28
|
+
# #<EPUB::Searcher::Result:0x007f74d294e258
|
29
|
+
# @end_steps=[#<EPUB::Searcher::Result::Step:0x007f74d2b0f8d0 @index=12, @type=:character>],
|
30
|
+
# @parent_steps=
|
31
|
+
# [#<EPUB::Searcher::Result::Step:0x007f74d2b81318 @index=2, @name="spine", @type=:element>,
|
32
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b314f8 @index=2, @type=:itemref>,
|
33
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b2fb80 @index=1, @name="body", @type=:element>,
|
34
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b2f900 @index=0, @name="section", @type=:element>,
|
35
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b10578 @index=3, @name="section", @type=:element>,
|
36
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b0fb50 @index=1, @name="h3", @type=:element>,
|
37
|
+
# # #<EPUB::Searcher::Result::Step:0x007f74d2b0f998 @index=0, @type=:text>],
|
38
|
+
# @start_steps=[#<EPUB::Searcher::Result::Step:0x007f74d2b0f920 @index=0, @type=:character>]>]
|
39
|
+
puts results.collect(&:to_cfi_s)
|
40
|
+
# /6/4!/4/2/4/2/4/4/2/1,:0,:12
|
41
|
+
# /6/6!/4/2/8/4/1,:0,:12
|
42
|
+
# => nil
|
43
|
+
|
44
|
+
Search result
|
45
|
+
-------------
|
46
|
+
|
47
|
+
The search result is an array of {EPUB::Searcher::Result} objects, each of which can be converted to an EPUB CFI string with {EPUB::Searcher::Result#to_cfi_s}.
|
48
|
+
|
49
|
+
Restricted XHTML Searcher
|
50
|
+
-------------------------
|
51
|
+
|
52
|
+
Currently the searcher for XHTML documents is *restricted*, which means that it can only find a search word that lies entirely within a single element. For instance, it can find 'search word' in the XHTML document below:
|
53
|
+
|
54
|
+
<html>
|
55
|
+
<head>
|
56
|
+
<title>Sample document</title>
|
57
|
+
</head>
|
58
|
+
<body>
|
59
|
+
<p>search word</p>
|
60
|
+
</body>
|
61
|
+
</html>
|
62
|
+
|
63
|
+
But it cannot find it in the document below:
|
64
|
+
|
65
|
+
<html>
|
66
|
+
<head>
|
67
|
+
<title>Sample document</title>
|
68
|
+
</head>
|
69
|
+
<body>
|
70
|
+
<p><em>search</em> word</p>
|
71
|
+
</body>
|
72
|
+
</html>
|
73
|
+
|
74
|
+
because the words 'search' and 'word' are not in the same element.
|
data/epub-parser.gemspec
CHANGED
data/lib/epub/parser/version.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'set'
|
2
2
|
require 'enumerabler'
|
3
|
+
require 'rchardet'
|
3
4
|
require 'epub/constants'
|
4
5
|
require 'epub/parser/content_document'
|
5
6
|
|
@@ -91,9 +92,21 @@ module EPUB
|
|
91
92
|
end
|
92
93
|
|
93
94
|
def read
|
94
|
-
Zip::Archive.open(manifest.package.book.epub_file) {|zip|
|
95
|
+
raw_content = Zip::Archive.open(manifest.package.book.epub_file) {|zip|
|
95
96
|
zip.fopen(entry_name).read
|
96
97
|
}
|
98
|
+
# CharDet.detect doesn't raise Encoding::CompatibilityError
|
99
|
+
# that is caused when trying to compare CharDet's internal
|
100
|
+
# ASCII-8BIT RegExp with a String with other encoding
|
101
|
+
# because Zip::File#read returns a String with encoding ASCII-8BIT.
|
102
|
+
# So, no need to rescue the error here.
|
103
|
+
encoding = CharDet.detect(raw_content)['encoding']
|
104
|
+
if encoding
|
105
|
+
raw_content.force_encoding(encoding)
|
106
|
+
else
|
107
|
+
warn "No encoding detected for #{entry_name}. Set to ASCII-8BIT" if $DEBUG || $VERBOSE
|
108
|
+
raw_content
|
109
|
+
end
|
97
110
|
end
|
98
111
|
|
99
112
|
def xhtml?
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'epub/publication'
|
2
|
+
|
3
|
+
module EPUB
|
4
|
+
module Searcher
|
5
|
+
class Publication
|
6
|
+
class << self
|
7
|
+
def search(package, word)
|
8
|
+
new(word).search(package)
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
def initialize(word)
|
13
|
+
@word = word
|
14
|
+
end
|
15
|
+
|
16
|
+
def search(package)
|
17
|
+
results = []
|
18
|
+
|
19
|
+
spine = package.spine
|
20
|
+
spine_step = Result::Step.new(:element, 2, {:name => 'spine', :id => spine.id})
|
21
|
+
spine.each_itemref.with_index do |itemref, index|
|
22
|
+
itemref_step = Result::Step.new(:itemref, index, {:id => itemref.id})
|
23
|
+
XHTML::Restricted.search(Nokogiri.XML(itemref.item.read), @word).each do |sub_result|
|
24
|
+
results << Result.new([spine_step, itemref_step] + sub_result.parent_steps, sub_result.start_steps, sub_result.end_steps)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
results
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module EPUB
|
2
|
+
module Searcher
|
3
|
+
class Result
|
4
|
+
attr_reader :parent_steps, :start_steps, :end_steps
|
5
|
+
|
6
|
+
# @param parent_steps [Array<Step>] common steps between start and end
|
7
|
+
# @param start_steps [Array<Step>] steps to start from +parent_steps+
|
8
|
+
# @param end_steps [Array<Step>] steps to end from +parent_steps+
|
9
|
+
def initialize(parent_steps, start_steps, end_steps)
|
10
|
+
@parent_steps, @start_steps, @end_steps = parent_steps, start_steps, end_steps
|
11
|
+
end
|
12
|
+
|
13
|
+
def to_xpath_and_offset(with_xmlns=false)
|
14
|
+
xpath = (@parent_steps + @start_steps).reduce('.') {|path, step|
|
15
|
+
case step.type
|
16
|
+
when :element
|
17
|
+
path + '/%s*[%d]' % [with_xmlns ? 'xhtml:' : nil, step.index + 1]
|
18
|
+
when :text
|
19
|
+
path + '/text()[%s]' % [step.index + 1]
|
20
|
+
else
|
21
|
+
path
|
22
|
+
end
|
23
|
+
}
|
24
|
+
|
25
|
+
[xpath, @start_steps.last.index]
|
26
|
+
end
|
27
|
+
|
28
|
+
def to_cfi_s
|
29
|
+
[@parent_steps, @start_steps, @end_steps].collect {|steps|
|
30
|
+
steps ? steps.collect(&:to_cfi_s).join : nil
|
31
|
+
}.compact.join(',')
|
32
|
+
end
|
33
|
+
|
34
|
+
def ==(other)
|
35
|
+
[@parent_steps + @start_steps.to_a] == [other.parent_steps + other.start_steps.to_a] and
|
36
|
+
[@parent_steps + @end_steps.to_a] == [other.parent_steps + other.end_steps.to_a]
|
37
|
+
end
|
38
|
+
|
39
|
+
class Step
|
40
|
+
attr_reader :type, :index, :info
|
41
|
+
|
42
|
+
def initialize(type, index, info={})
|
43
|
+
@type, @index, @info = type, index, info
|
44
|
+
end
|
45
|
+
|
46
|
+
def ==(other)
|
47
|
+
self.type == other.type and
|
48
|
+
self.index == other.index and
|
49
|
+
self.info == other.info
|
50
|
+
end
|
51
|
+
|
52
|
+
def to_cfi_s
|
53
|
+
case type
|
54
|
+
when :element
|
55
|
+
'/%d%s' % [(index + 1) * 2, id_assertion]
|
56
|
+
when :text
|
57
|
+
'/%d' % [(index + 1)]
|
58
|
+
when :character
|
59
|
+
':%d' % [index]
|
60
|
+
when :itemref
|
61
|
+
'/%d%s!' % [(index + 1) * 2, id_assertion]
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
private
|
66
|
+
|
67
|
+
def id_assertion
|
68
|
+
info[:id] ? "[#{info[:id]}]" : nil
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'epub'
|
2
|
+
require 'epub/parser/utils'
|
3
|
+
|
4
|
+
module EPUB
|
5
|
+
module Searcher
|
6
|
+
class XHTML
|
7
|
+
class Restricted
|
8
|
+
class << self
|
9
|
+
# @param element [Nokogiri::XML::Element, Nokogiri::XML::Document]
|
10
|
+
# @param word [String]
|
11
|
+
# @return [Array<Result>]
|
12
|
+
def search(element, word)
|
13
|
+
new(word).search(element.respond_to?(:root) ? element.root : element)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
# @param word [String]
|
18
|
+
def initialize(word)
|
19
|
+
@word = word
|
20
|
+
end
|
21
|
+
|
22
|
+
# @param element [Nokogiri::XML::Element]
|
23
|
+
# @return [Array<Result>]
|
24
|
+
def search(element)
|
25
|
+
results = []
|
26
|
+
|
27
|
+
elem_index = 0
|
28
|
+
element.children.each do |child|
|
29
|
+
if child.element?
|
30
|
+
child_step = Result::Step.new(:element, elem_index, {:name => child.name, :id => Parser::Utils.extract_attribute(child, 'id')})
|
31
|
+
if child.name == 'img'
|
32
|
+
if Parser::Utils.extract_attribute(child, 'alt').index(@word)
|
33
|
+
results << Result.new([child_step], nil, nil)
|
34
|
+
end
|
35
|
+
else
|
36
|
+
search(child).each do |sub_result|
|
37
|
+
results << Result.new([child_step] + sub_result.parent_steps, sub_result.start_steps, sub_result.end_steps)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
elem_index += 1
|
41
|
+
elsif child.text?
|
42
|
+
text_index = elem_index
|
43
|
+
char_index = 0
|
44
|
+
text_step = Result::Step.new(:text, text_index)
|
45
|
+
while char_index = child.text.index(@word, char_index)
|
46
|
+
results << Result.new([text_step], [Result::Step.new(:character, char_index)], [Result::Step.new(:character, char_index + @word.length)])
|
47
|
+
char_index += 1
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
results
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
@@ -101,6 +101,15 @@
|
|
101
101
|
<item id="encoded-japanese-filename"
|
102
102
|
href="%E6%97%A5%E6%9C%AC%E8%AA%9E.xhtml"
|
103
103
|
media-type="application/xhtml+xml"/>
|
104
|
+
<item id="utf-8-encoded"
|
105
|
+
href="japanese.utf8.xhtml"
|
106
|
+
media-type="application/xhtml+xml"/>
|
107
|
+
<item id="euc-jp-encoded"
|
108
|
+
href="japanese.eucjp.xhtml"
|
109
|
+
media-type="application/xhtml+xml"/>
|
110
|
+
<item id="shift_jis-encoded"
|
111
|
+
href="japanese.sjis.xhtml"
|
112
|
+
media-type="application/xhtml+xml"/>
|
104
113
|
</manifest>
|
105
114
|
<spine>
|
106
115
|
<itemref idref="nav"/>
|
@@ -116,4 +125,4 @@
|
|
116
125
|
<mediaType handler="impl"
|
117
126
|
media-type="application/x-demo-slideshow"/>
|
118
127
|
</bindings>
|
119
|
-
</package>
|
128
|
+
</package>
|
@@ -81,8 +81,8 @@ class TestParserPublication < Test::Unit::TestCase
|
|
81
81
|
@manifest = @parser.parse_manifest
|
82
82
|
end
|
83
83
|
|
84
|
-
def
|
85
|
-
assert_equal
|
84
|
+
def test_manifest_has_19_items
|
85
|
+
assert_equal 19, @manifest.items.length
|
86
86
|
end
|
87
87
|
|
88
88
|
def test_item_has_relative_path_as_href_attribute
|
data/test/test_publication.rb
CHANGED
@@ -239,6 +239,16 @@ class TestPublication < Test::Unit::TestCase
|
|
239
239
|
|
240
240
|
assert_nil xhtml_item.find_item_by_relative_iri(Addressable::URI.parse('../image/01.png'))
|
241
241
|
end
|
242
|
+
|
243
|
+
data('UTF-8' => [Encoding::UTF_8, 'utf-8-encoded'],
|
244
|
+
'EUC-JP' => [Encoding::EUC_JP, 'euc-jp-encoded'],
|
245
|
+
'Shift-JIS' => [Encoding::Shift_JIS, 'shift_jis-encoded'])
|
246
|
+
def test_read_detects_encoding_automatically(data)
|
247
|
+
encoding, id = data
|
248
|
+
epub = EPUB::Parser.parse('test/fixtures/book.epub')
|
249
|
+
item = epub.package.manifest[id]
|
250
|
+
assert_equal encoding, item.read.encoding
|
251
|
+
end
|
242
252
|
end
|
243
253
|
end
|
244
254
|
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require_relative 'helper'
|
3
|
+
require 'epub/searcher'
|
4
|
+
|
5
|
+
class TestSearcher < Test::Unit::TestCase
|
6
|
+
class TestPublication < self
|
7
|
+
def setup
|
8
|
+
super
|
9
|
+
opf_path = File.expand_path('../fixtures/book/OPS/ルートファイル.opf', __FILE__)
|
10
|
+
nav_path = File.expand_path('../fixtures/book/OPS/nav.xhtml', __FILE__)
|
11
|
+
@package = EPUB::Parser::Publication.new(open(opf_path), 'OPS/ルートファイル.opf').parse
|
12
|
+
@package.spine.each_itemref do |itemref|
|
13
|
+
stub(itemref.item).read {
|
14
|
+
itemref.idref == 'nav' ? File.read(nav_path) : '<html></html>'
|
15
|
+
}
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_no_result
|
20
|
+
assert_empty EPUB::Searcher::Publication.search(@package, 'no result')
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_simple
|
24
|
+
assert_equal(
|
25
|
+
results([
|
26
|
+
[[[:element, 2, {:name => 'spine', :id => nil}], [:itemref, 0, {:id => nil}], [:element, 0, {:name => 'head', :id => nil}], [:element, 0, {:name => 'title', :id => nil}], [:text, 0]], [[:character, 9]], [[:character, 16]]],
|
27
|
+
[[[:element, 2, {:name => 'spine', :id => nil}], [:itemref, 0, {:id => nil}], [:element, 1, {:name => 'body', :id => nil}], [:element, 0, {:name => 'div', :id => nil}], [:element, 0, {:name => 'nav', :id => 'idid'}], [:element, 0, {:name => 'hgroup', :id => nil}], [:element, 1, {:name => 'h1', :id => nil}], [:text, 0]], [[:character, 9]], [[:character, 16]]]
|
28
|
+
]),
|
29
|
+
EPUB::Searcher::Publication.search(@package, 'Content')
|
30
|
+
)
|
31
|
+
end
|
32
|
+
|
33
|
+
class TesetResult < self
|
34
|
+
def test_to_cfi_s
|
35
|
+
assert_equal '/6/2!/4/2/2[idid]/2/4/1,:9,:16', EPUB::Searcher::Publication.search(@package, 'Content').last.to_cfi_s
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
class TestXHTML < self
|
41
|
+
def setup
|
42
|
+
super
|
43
|
+
nav_path = File.expand_path('../fixtures/book/OPS/nav.xhtml', __FILE__)
|
44
|
+
@doc = Nokogiri.XML(open(nav_path))
|
45
|
+
@h1 = @doc.search('h1').first
|
46
|
+
@nav = @doc.search('nav').first
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_no_result
|
50
|
+
assert_empty EPUB::Searcher::XHTML::Restricted.search(@h1, 'no result')
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_simple
|
54
|
+
assert_equal results([[[[:text, 0]], [[:character, 9]], [[:character, 16]]]]), EPUB::Searcher::XHTML::Restricted.search(@h1, 'Content')
|
55
|
+
end
|
56
|
+
|
57
|
+
def test_multiple_text_result
|
58
|
+
assert_equal results([[[[:text, 0]], [[:character, 6]], [[:character, 7]]], [[[:text, 0]], [[:character, 10]], [[:character, 11]]]]), EPUB::Searcher::XHTML::Restricted.search(@h1, 'o')
|
59
|
+
end
|
60
|
+
|
61
|
+
def test_text_after_element
|
62
|
+
elem = Nokogiri.XML('<root><elem>inner</elem>after</root>')
|
63
|
+
|
64
|
+
assert_equal results([[[[:text, 1]], [[:character, 0]], [[:character, 5]]]]), EPUB::Searcher::XHTML::Restricted.search(elem, 'after')
|
65
|
+
end
|
66
|
+
|
67
|
+
def test_entity_reference
|
68
|
+
elem = Nokogiri.XML('<root>before<after</root>')
|
69
|
+
|
70
|
+
assert_equal results([[[[:text, 0]], [[:character, 6]], [[:character, 7]]]]), EPUB::Searcher::XHTML::Restricted.search(elem, '<')
|
71
|
+
end
|
72
|
+
|
73
|
+
def test_nested_result
|
74
|
+
assert_equal results([[[[:element, 1, {:name => 'ol', :id => nil}], [:element, 1, {:name => 'li', :id => nil}], [:element, 1, {:name => 'ol', :id => nil}], [:element, 1, {:name => 'li', :id => nil}], [:element, 0, {:name => 'a', :id => nil}], [:text, 0]], [[:character, 0]], [[:character, 3]]]]), EPUB::Searcher::XHTML::Restricted.search(@nav, '第二節')
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_img
|
78
|
+
assert_equal [result([[[:element, 1, {:name => 'ol', :id => nil}], [:element, 1, {:name => 'li', :id => nil}], [:element, 1, {:name => 'ol', :id => nil}], [:element, 2, {:name => 'li', :id => nil}], [:element, 0, {:name => 'a', :id => nil}], [:element, 0, {:name => 'img', :id => nil}]], nil, nil])], EPUB::Searcher::XHTML::Restricted.search(@nav, '第三節')
|
79
|
+
end
|
80
|
+
|
81
|
+
class TestResult < self
|
82
|
+
def setup
|
83
|
+
super
|
84
|
+
@result = EPUB::Searcher::XHTML::Restricted.search(@doc, '第二節').first
|
85
|
+
end
|
86
|
+
|
87
|
+
def test_to_xpath_and_offset
|
88
|
+
assert_equal ['./*[2]/*[1]/*[1]/*[2]/*[2]/*[2]/*[2]/*[1]/text()[1]', 0], @result.to_xpath_and_offset
|
89
|
+
assert_equal ['./xhtml:*[2]/xhtml:*[1]/xhtml:*[1]/xhtml:*[2]/xhtml:*[2]/xhtml:*[2]/xhtml:*[2]/xhtml:*[1]/text()[1]', 0], @result.to_xpath_and_offset(true)
|
90
|
+
end
|
91
|
+
|
92
|
+
def test_to_cfi_s
|
93
|
+
assert_equal '/4/2/2[idid]/4/4/4/4/2/1,:0,:3', @result.to_cfi_s
|
94
|
+
end
|
95
|
+
|
96
|
+
def test_to_cfi_s_img
|
97
|
+
assert_equal '/4/2/2[idid]/4/4/4/6/2/2', EPUB::Searcher::XHTML::Restricted.search(@doc, '第三節').first.to_cfi_s
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
private
|
103
|
+
|
104
|
+
def results(results)
|
105
|
+
results.collect {|res| result(res)}
|
106
|
+
end
|
107
|
+
|
108
|
+
def result(steps_triple)
|
109
|
+
EPUB::Searcher::Result.new(*steps_triple.collect {|steps|
|
110
|
+
steps ? steps.collect {|s| step(s)} : steps
|
111
|
+
})
|
112
|
+
end
|
113
|
+
|
114
|
+
def step(step)
|
115
|
+
EPUB::Searcher::Result::Step.new(*step)
|
116
|
+
end
|
117
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: epub-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- KITAITI Makoto
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-09-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -276,6 +276,20 @@ dependencies:
|
|
276
276
|
- - ">="
|
277
277
|
- !ruby/object:Gem::Version
|
278
278
|
version: 2.3.5
|
279
|
+
- !ruby/object:Gem::Dependency
|
280
|
+
name: rchardet
|
281
|
+
requirement: !ruby/object:Gem::Requirement
|
282
|
+
requirements:
|
283
|
+
- - ">="
|
284
|
+
- !ruby/object:Gem::Version
|
285
|
+
version: '0'
|
286
|
+
type: :runtime
|
287
|
+
prerelease: false
|
288
|
+
version_requirements: !ruby/object:Gem::Requirement
|
289
|
+
requirements:
|
290
|
+
- - ">="
|
291
|
+
- !ruby/object:Gem::Version
|
292
|
+
version: '0'
|
279
293
|
description: Parse EPUB 3 book loosely
|
280
294
|
email:
|
281
295
|
- KitaitiMakoto@gmail.com
|
@@ -304,6 +318,7 @@ files:
|
|
304
318
|
- docs/Item.markdown
|
305
319
|
- docs/Navigation.markdown
|
306
320
|
- docs/Publication.markdown
|
321
|
+
- docs/Searcher.markdown
|
307
322
|
- epub-parser.gemspec
|
308
323
|
- features/epubinfo.feature
|
309
324
|
- features/step_definitions/epubinfo_steps.rb
|
@@ -337,6 +352,10 @@ files:
|
|
337
352
|
- lib/epub/publication/package/manifest.rb
|
338
353
|
- lib/epub/publication/package/metadata.rb
|
339
354
|
- lib/epub/publication/package/spine.rb
|
355
|
+
- lib/epub/searcher.rb
|
356
|
+
- lib/epub/searcher/publication.rb
|
357
|
+
- lib/epub/searcher/result.rb
|
358
|
+
- lib/epub/searcher/xhtml.rb
|
340
359
|
- man/epubinfo.1.ronn
|
341
360
|
- schemas/epub-nav-30.rnc
|
342
361
|
- schemas/epub-nav-30.sch
|
@@ -347,6 +366,9 @@ files:
|
|
347
366
|
- test/fixtures/book/OPS/case-sensitive.xhtml
|
348
367
|
- test/fixtures/book/OPS/containing space.xhtml
|
349
368
|
- test/fixtures/book/OPS/containing%20space.xhtml
|
369
|
+
- test/fixtures/book/OPS/japanese.eucjp.xhtml
|
370
|
+
- test/fixtures/book/OPS/japanese.sjis.xhtml
|
371
|
+
- test/fixtures/book/OPS/japanese.utf8.xhtml
|
350
372
|
- test/fixtures/book/OPS/nav.xhtml
|
351
373
|
- test/fixtures/book/OPS/ルートファイル.opf
|
352
374
|
- test/fixtures/book/OPS/日本語.xhtml
|
@@ -362,6 +384,7 @@ files:
|
|
362
384
|
- test/test_parser_ocf.rb
|
363
385
|
- test/test_parser_publication.rb
|
364
386
|
- test/test_publication.rb
|
387
|
+
- test/test_searcher.rb
|
365
388
|
homepage: https://github.com/KitaitiMakoto/epub-parser
|
366
389
|
licenses:
|
367
390
|
- MIT
|
@@ -382,7 +405,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
382
405
|
version: '0'
|
383
406
|
requirements: []
|
384
407
|
rubyforge_project:
|
385
|
-
rubygems_version: 2.2.
|
408
|
+
rubygems_version: 2.2.2
|
386
409
|
signing_key:
|
387
410
|
specification_version: 4
|
388
411
|
summary: EPUB 3 Parser
|
@@ -401,4 +424,5 @@ test_files:
|
|
401
424
|
- test/test_parser_ocf.rb
|
402
425
|
- test/test_parser_publication.rb
|
403
426
|
- test/test_publication.rb
|
427
|
+
- test/test_searcher.rb
|
404
428
|
has_rdoc: yard
|