epub-parser 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -70,13 +70,18 @@ module EPUB
70
70
  include Hidable
71
71
 
72
72
  attr_accessor :items, :text,
73
- :content_document, :href, :item
73
+ :content_document, :item
74
+ attr_reader :href
74
75
 
75
76
  def initialize
76
77
  @items = ItemList.new
77
78
  @items.parent = self
78
79
  end
79
80
 
81
+ def href=(iri)
82
+ @href = iri.kind_of?(Addressable::URI) ? iri : Addressable::URI.parse(iri)
83
+ end
84
+
80
85
  def traverse(depth=0, &block)
81
86
  block.call self, depth
82
87
  items.each do |item|
@@ -34,8 +34,7 @@ module EPUB
34
34
 
35
35
  # @return [Nokogiri::XML::Document] content as Nokogiri::XML::Document object
36
36
  def nokogiri
37
- require 'nokogumbo' unless Nokogiri.respond_to? :HTML5
38
- @nokogiri ||= Nokogiri.HTML5(raw_document)
37
+ @nokogiri ||= Nokogiri.XML(raw_document)
39
38
  end
40
39
  end
41
40
  end
@@ -1,9 +1,10 @@
1
1
  require 'epub/ocf/physical_container/zipruby'
2
- require 'epub/ocf/physical_container/file'
2
+ require 'epub/ocf/physical_container/unpacked_directory'
3
3
  require 'epub/ocf/physical_container/unpacked_uri'
4
4
 
5
5
  module EPUB
6
6
  class OCF
7
+ # @todo: Make thread safe
7
8
  class PhysicalContainer
8
9
  @adapter = Zipruby
9
10
 
@@ -1,7 +1,7 @@
1
1
  module EPUB
2
2
  class OCF
3
3
  class PhysicalContainer
4
- class File < self
4
+ class UnpackedDirectory < self
5
5
  def open
6
6
  yield self
7
7
  end
@@ -0,0 +1,84 @@
1
+ require 'strscan'
2
+ require 'epub/parser/cfi.tab'
3
+ require 'epub/cfi'
4
+
5
+ EPUB::Parser::CFI = EPUB::CFIParser
6
+
7
+ class EPUB::Parser::CFI
8
+ include Comparable
9
+
10
+ UNICODE_CHARACTER_EXCLUDING_SPECIAL_CHARS_AND_SPACE_AND_DOT_AND_COLON_AND_TILDE_AND_ATMARK_AND_SOLIDUS_AND_EXCLAMATION_MARK_PATTERN = /\u0009|\u000A|\u000D|[\u0022-\u0027]|[\u002A-\u002B]|\u002D|[\u0030-\u0039]|\u003C|[\u003E-\u0040]|[\u0041-\u005A]|\u005C|[\u005F-\u007D]|[\u007F-\uD7FF]|[\uE000-\uFFFD]|[\u10000-\u10FFFF]/ # excluding special chars and space(\u0020) and dot(\u002E) and colon(\u003A) and tilde(\u007E) and atmark(\u0040) and solidus(\u002F) and exclamation mark(\u0021)
11
+ UNICODE_CHARACTER_PATTERN = Regexp.union(UNICODE_CHARACTER_EXCLUDING_SPECIAL_CHARS_AND_SPACE_AND_DOT_AND_COLON_AND_TILDE_AND_ATMARK_AND_SOLIDUS_AND_EXCLAMATION_MARK_PATTERN, Regexp.new(Regexp.escape(EPUB::CFI::SPECIAL_CHARS), / \.:~@!/))
12
+
13
+ class << self
14
+ def parse(string, debug: false)
15
+ new(debug: debug).parse(string)
16
+ end
17
+ end
18
+
19
+ def initialize(debug: false)
20
+ @yydebug = debug
21
+ super()
22
+ end
23
+
24
+ def parse(string)
25
+ if string.start_with? 'epubcfi('
26
+ string = string['epubcfi('.length .. -2]
27
+ end
28
+ @scanner = StringScanner.new(string, true)
29
+ @q = []
30
+ until @scanner.eos?
31
+ case
32
+ when @scanner.scan(/[1-9]/)
33
+ @q << [:DIGIT_NON_ZERO, @scanner[0]]
34
+ when @scanner.scan(/0/)
35
+ @q << [:ZERO, @scanner[0]]
36
+ when @scanner.scan(/ /)
37
+ @q << [:SPACE, @scanner[0]]
38
+ when @scanner.scan(/\^/)
39
+ @q << [:CIRCUMFLEX, @scanner[0]]
40
+ when @scanner.scan(/\[/)
41
+ @q << [:OPENING_SQUARE_BRACKET, @scanner[0]]
42
+ when @scanner.scan(/\]/)
43
+ @q << [:CLOSING_SQUARE_BRACKET, @scanner[0]]
44
+ when @scanner.scan(/\(/)
45
+ @q << [:OPENING_PARENTHESIS, @scanner[0]]
46
+ when @scanner.scan(/\)/)
47
+ @q << [:CLOSING_PARENTHESIS, @scanner[0]]
48
+ when @scanner.scan(/,/)
49
+ @q << [:COMMA, @scanner[0]]
50
+ when @scanner.scan(/;/)
51
+ @q << [:SEMICOLON, @scanner[0]]
52
+ when @scanner.scan(/=/)
53
+ @q << [:EQUAL, @scanner[0]]
54
+ when @scanner.scan(/\./)
55
+ @q << [:DOT, @scanner[0]]
56
+ when @scanner.scan(/:/)
57
+ @q << [:COLON, @scanner[0]]
58
+ when @scanner.scan(/~/)
59
+ @q << [:TILDE, @scanner[0]]
60
+ when @scanner.scan(/@/)
61
+ @q << [:ATMARK, @scanner[0]]
62
+ when @scanner.scan(/\//)
63
+ @q << [:SOLIDUS, @scanner[0]]
64
+ when @scanner.scan(/!/)
65
+ @q << [:EXCLAMATION_MARK, @scanner[0]]
66
+ when @scanner.scan(UNICODE_CHARACTER_EXCLUDING_SPECIAL_CHARS_AND_SPACE_AND_DOT_AND_COLON_AND_TILDE_AND_ATMARK_AND_SOLIDUS_AND_EXCLAMATION_MARK_PATTERN)
67
+ @q << [:UNICODE_CHARACTER_EXCLUDING_SPECIAL_CHARS_AND_SPACE_AND_DOT_AND_COLON_AND_TILDE_AND_ATMARK_AND_SOLIDUS_AND_EXCLAMATION_MARK, @scanner[0]]
68
+ else
69
+ raise 'unexpected character'
70
+ end
71
+ end
72
+ @q << [false, false]
73
+
74
+ do_parse
75
+ end
76
+
77
+ def next_token
78
+ @q.shift
79
+ end
80
+ end
81
+
82
+ def EPUB::CFI(string)
83
+ EPUB::Parser::CFI.parse('epubcfi(' + string + ')')
84
+ end
@@ -0,0 +1,187 @@
1
+ # EPUB::Parser::CFI is preferred but cannot be used.
2
+ # Racc automatically declares module EPUB::Parser
3
+ # but EPUB::Parser has already been declared as a class.
4
+ class EPUB::CFIParser
5
+ rule
6
+
7
+ fragment : path range_zero_or_one
8
+ {
9
+ if val[1]
10
+ result = CFI::Range.from_parent_and_start_and_end(val[0], *val[1])
11
+ else
12
+ result = CFI::Location.new(val[0])
13
+ end
14
+ }
15
+
16
+ range_zero_or_one : range
17
+ |
18
+
19
+ path : step local_path
20
+ {
21
+ path, redirected_path = *val[1]
22
+ path.steps.unshift val[0]
23
+ result = val[1]
24
+ }
25
+
26
+ range : COMMA local_path COMMA local_path
27
+ {result = [val[1], val[3]]}
28
+
29
+ local_path : step_zero_or_more redirected_path
30
+ {result = [CFI::Path.new(val[0])] + val[1]}
31
+ | step_zero_or_more offset_zero_or_one
32
+ {result = [CFI::Path.new(val[0], val[1])]}
33
+
34
+ step_zero_or_more : step_zero_or_more step
35
+ {result = val[0] + [val[1]]}
36
+ | step
37
+ {result = [val[0]]}
38
+ |
39
+ {result = []}
40
+
41
+ redirected_path : EXCLAMATION_MARK offset
42
+ {result = [CFI::Path.new([], val[1])]}
43
+ | EXCLAMATION_MARK path
44
+ {result = val[1]}
45
+
46
+ step : SOLIDUS integer assertion_part_zero_or_one
47
+ {
48
+ assertion = val[2] ? CFI::IDAssertion.new(val[2][0], val[2][2]) : nil
49
+ result = CFI::Step.new(val[1].to_i, assertion)
50
+ }
51
+
52
+ offset_zero_or_one : offset
53
+ |
54
+
55
+ offset : COLON integer assertion_part_zero_or_one
56
+ {
57
+ assertion = val[2] ? CFI::TextLocationAssertion.new(*val[2]) : nil
58
+ result = CFI::CharacterOffset.new(val[1].to_i, assertion)
59
+ }
60
+ | spatial_offset assertion_part_zero_or_one
61
+ {result = CFI::TemporalSpatialOffset.new(nil, val[0][0].to_f, val[0][1].to_f, val[2])}
62
+ | TILDE number spatial_offset_zero_or_one assertion_part_zero_or_one
63
+ {
64
+ x = val[2] ? val[2][0].to_f : nil
65
+ y = val[2] ? val[2][1].to_f : nil
66
+ result = CFI::TemporalSpatialOffset.new(val[1].to_f, x, y, val[3])
67
+ }
68
+
69
+ spatial_offset_zero_or_one : spatial_offset
70
+ |
71
+
72
+ spatial_offset : ATMARK number COLON number
73
+ {result = [val[1], val[3]]}
74
+
75
+ assertion_part_zero_or_one : opening_square_bracket assertion closing_square_bracket
76
+ {result = val[1]}
77
+ |
78
+
79
+ number : DIGIT_NON_ZERO digit_zero_or_more fractional_portion_zero_or_one
80
+ {result = val.join}
81
+ | ZERO fractional_portion_zero_or_one
82
+ {result = val.join}
83
+
84
+ fractional_portion_zero_or_one : fractional_portion
85
+ |
86
+
87
+ fractional_portion : DOT digit_zero_or_more DIGIT_NON_ZERO
88
+ {result = val.join}
89
+ | DOT DIGIT_NON_ZERO
90
+ {result = val.join}
91
+
92
+ integer : ZERO
93
+ | DIGIT_NON_ZERO digit_zero_or_more
94
+ {result = val.join}
95
+
96
+ digit_zero_or_more : digit_zero_or_more digit
97
+ {result = val.join}
98
+ | digit
99
+ |
100
+
101
+ assertion : value_csv_one_or_two parameter_zero_or_more
102
+ {result = [val[0][0], val[0][1], val[1]]} # Cannot tell whether this is an ID assertion or a text location assertion when val[0]'s length is 1. That can be decided from context.
103
+ | COMMA value parameter_zero_or_more
104
+ {result = [nil, val[1], val[2]]}
105
+ | parameter parameter_zero_or_more
106
+ {result = [nil, nil, val[0].merge(val[1])]} # Cannot tell whether this is an ID assertion or a text location assertion when val[0]'s length is 1. That can be decided from context. In the EPUB CFI 3.0.1 spec, only the side-bias parameter is defined, so we can say it is a text location assertion if the assertion has parameters. But if the spec is extended and other parameter definitions are added, we might no longer be able to say so.
107
+
108
+ value_csv_one_or_two : value COMMA value
109
+ {result = [val[0], val[2]]}
110
+ | value
111
+ {result = [val[0]]}
112
+
113
+ parameter_zero_or_more : parameter_zero_or_more parameter
114
+ {result = val[0].merge(val[1])}
115
+ | parameter
116
+ {result = val[0]}
117
+ |
118
+ {result = {}}
119
+
120
+ parameter : SEMICOLON value_no_space EQUAL csv
121
+ {result = {val[1] => val[3]}}
122
+
123
+ csv : csv COMMA value
124
+ {result = val[0] + [val[2]]}
125
+ | value
126
+ {result = [val[0]]}
127
+
128
+ value : string_escaped_special_chars
129
+ {result = val[0]}
130
+
131
+ value_no_space: string_escaped_special_chars_excluding_space
132
+
133
+ escaped_special_chars : CIRCUMFLEX CIRCUMFLEX
134
+ {result = val[1]}
135
+ | CIRCUMFLEX square_brackets
136
+ {result = val[1]}
137
+ | CIRCUMFLEX parentheses
138
+ {result = val[1]}
139
+ | CIRCUMFLEX COMMA
140
+ {result = val[1]}
141
+ | CIRCUMFLEX SEMICOLON
142
+ {result = val[1]}
143
+ | CIRCUMFLEX EQUAL
144
+ {result = val[1]}
145
+
146
+ character_escaped_special : character_excluding_special_chars
147
+ | escaped_special_chars
148
+
149
+ string_escaped_special_chars : string_escaped_special_chars character_escaped_special
150
+ {result = val.join}
151
+ | character_escaped_special
152
+ {result = val[0]}
153
+
154
+ string_escaped_special_chars_excluding_space : string_escaped_special_chars_excluding_space character_escaped_special_excluding_space
155
+ | character_escaped_special_excluding_space
156
+
157
+ character_escaped_special_excluding_space : character_excluding_special_chars_and_space
158
+ | escaped_special_chars
159
+
160
+ digit : ZERO
161
+ | DIGIT_NON_ZERO
162
+
163
+ square_brackets : opening_square_bracket
164
+ | closing_square_bracket
165
+
166
+ opening_square_bracket : OPENING_SQUARE_BRACKET
167
+
168
+ closing_square_bracket : CLOSING_SQUARE_BRACKET
169
+
170
+ parentheses : OPENING_PARENTHESIS
171
+ | CLOSING_PARENTHESIS
172
+
173
+ character_excluding_special_chars : character_excluding_special_chars_and_space
174
+ | SPACE
175
+
176
+ character_excluding_special_chars_and_space : character_excluding_special_chars_and_space_and_dot_and_colon_and_tilde_and_atmark_and_solidus_and_exclamation_mark
177
+ | DOT
178
+ | COLON
179
+ | TILDE
180
+ | ATMARK
181
+ | SOLIDUS
182
+ | EXCLAMATION_MARK
183
+
184
+ character_excluding_special_chars_and_space_and_dot_and_colon_and_tilde_and_atmark_and_solidus_and_exclamation_mark : UNICODE_CHARACTER_EXCLUDING_SPECIAL_CHARS_AND_SPACE_AND_DOT_AND_COLON_AND_TILDE_AND_ATMARK_AND_SOLIDUS_AND_EXCLAMATION_MARK
185
+ | digit
186
+
187
+ end
@@ -81,7 +81,7 @@ module EPUB
81
81
  end
82
82
  item.text = extract_attribute(a_or_span, 'title').to_s if item.text.nil? || item.text.empty?
83
83
  end
84
- item.href = Addressable::URI.parse(extract_attribute(a_or_span, 'href'))
84
+ item.href = extract_attribute(a_or_span, 'href')
85
85
  item.item = @item.manifest.items.find {|it| it.href.request_uri == item.href.request_uri}
86
86
  end
87
87
  item.items = element.xpath('./xhtml:ol[1]/xhtml:li', EPUB::NAMESPACES).map {|li| parse_navigation_item(li)}
@@ -1,7 +1,6 @@
1
1
  require 'strscan'
2
2
  require 'zipruby'
3
3
  require 'nokogiri'
4
- require 'addressable/uri'
5
4
  require 'epub/publication'
6
5
  require 'epub/constants'
7
6
 
@@ -61,7 +60,7 @@ module EPUB
61
60
  metadata.rights = extract_model(elem, id_map, './dc:rights')
62
61
  metadata.metas = extract_refinee(elem, id_map, './opf:meta', :Meta, %w[property id scheme])
63
62
  metadata.links = extract_refinee(elem, id_map, './opf:link', :Link, %w[id media-type]) {|link, e|
64
- link.href = Addressable::URI.parse(extract_attribute(e, 'href'))
63
+ link.href = extract_attribute(e, 'href')
65
64
  link.rel = Set.new(extract_attribute(e, 'rel').split(nil))
66
65
  }
67
66
 
@@ -85,7 +84,7 @@ module EPUB
85
84
  %w[id media-type media-overlay].each do |attr|
86
85
  item.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
87
86
  end
88
- item.href = Addressable::URI.parse(extract_attribute(e, 'href'))
87
+ item.href = extract_attribute(e, 'href')
89
88
  fallback = extract_attribute(e, 'fallback')
90
89
  fallback_map[fallback] = item if fallback
91
90
  properties = extract_attribute(e, 'properties')
@@ -127,7 +126,7 @@ module EPUB
127
126
  %w[type title].each do |attr|
128
127
  reference.__send__ "#{attr}=", extract_attribute(ref, attr)
129
128
  end
130
- reference.href = Addressable::URI.parse(extract_attribute(ref, 'href'))
129
+ reference.href = extract_attribute(ref, 'href')
131
130
  guide << reference
132
131
  end
133
132
 
@@ -1,5 +1,5 @@
1
1
  module EPUB
2
2
  class Parser
3
- VERSION = "0.2.2"
3
+ VERSION = "0.2.3"
4
4
  end
5
5
  end
@@ -21,7 +21,12 @@ module EPUB
21
21
  class Reference
22
22
  TYPES = %w[cover title-page toc index glossary acknowledgements bibliography colophon copyright-page dedication epigraph foreword loi lot notes preface text]
23
23
  attr_accessor :guide,
24
- :type, :title, :href
24
+ :type, :title
25
+ attr_reader :href
26
+
27
+ def href=(iri)
28
+ @href = iri.kind_of?(Addressable::URI) ? iri : Addressable::URI.parse(iri)
29
+ end
25
30
 
26
31
  def item
27
32
  return @item if @item
@@ -1,4 +1,5 @@
1
1
  require 'set'
2
+ require 'addressable/uri'
2
3
  require 'rchardet'
3
4
  require 'epub/constants'
4
5
  require 'epub/parser/content_document'
@@ -64,6 +65,8 @@ module EPUB
64
65
  end
65
66
 
66
67
  class Item
68
+ DUMMY_ROOT_IRI = Addressable::URI.parse('http://example.net/').freeze
69
+
67
70
  include Inspector
68
71
 
69
72
  # @!attribute [rw] manifest
@@ -82,8 +85,8 @@ module EPUB
82
85
  # @!attribute [rw] fallback
83
86
  # @return [Item] Returns the value of attribute fallback
84
87
  attr_accessor :manifest,
85
- :id, :href, :media_type, :fallback, :media_overlay
86
- attr_reader :properties
88
+ :id, :media_type, :fallback, :media_overlay
89
+ attr_reader :properties, :href
87
90
 
88
91
  def initialize
89
92
  @properties = Set.new
@@ -93,18 +96,31 @@ module EPUB
93
96
  @properties = props.kind_of?(Set) ? props : Set.new(props)
94
97
  end
95
98
 
99
+ def href=(iri)
100
+ @href = iri.kind_of?(Addressable::URI) ? iri : Addressable::URI.parse(iri)
101
+ end
102
+
96
103
  # @todo Handle circular fallback chain
97
104
  def fallback_chain
98
105
  @fallback_chain ||= traverse_fallback_chain([])
99
106
  end
100
107
 
101
108
  # full path in archive
102
- def entry_name
103
- dummy_root_iri = Addressable::URI.parse('http://example.net/') # FIXME: Use constant
109
+ # @return [Addressable::URI]
110
+ def full_path
111
+ return @full_path if @full_path
104
112
  rootfile = manifest.package.book.ocf.container.rootfile.full_path
105
- en = Addressable::URI.unencode((dummy_root_iri + rootfile + href).normalize.request_uri)
106
- en.slice!(0) if en.start_with? '/'
107
- en
113
+ path = DUMMY_ROOT_IRI + rootfile + href
114
+ path.scheme = nil
115
+ path.host = nil
116
+ path.path = path.path[1..-1]
117
+ @full_path = path
118
+ end
119
+
120
+ # full path in archive
121
+ # @return [String]
122
+ def entry_name
123
+ Addressable::URI.unencode(full_path)
108
124
  end
109
125
 
110
126
  def read
@@ -178,7 +194,7 @@ module EPUB
178
194
  # @note Algorithm stolen from Rack::Utils#clean_path_info
179
195
  def find_item_by_relative_iri(iri)
180
196
  raise ArgumentError, "Not relative: #{iri.inspect}" unless iri.relative?
181
- raise ArgumentError, "Start with slash: #{iri.inspect}" if iri.to_s.start_with? Addressable::URI::SLASH
197
+ raise ArgumentError, "Start with slash: #{iri.inspect}" if iri.path.start_with? Addressable::URI::SLASH
182
198
  target_href = href + iri
183
199
  segments = target_href.to_s.split(Addressable::URI::SLASH)
184
200
  clean_segments = []