peregrin 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,204 @@
1
+ class Peregrin::Outliner
2
+
3
+ REGEXES = {
4
+ :section_root => /^BLOCKQUOTE|BODY|DETAILS|FIELDSET|FIGURE|TD$/i,
5
+ :section_content => /^ARTICLE|ASIDE|NAV|SECTION$/i,
6
+ :heading => /^H[1-6]|HGROUP$/i
7
+ }
8
+
9
+ class Utils
10
+
11
+ def self.section_root?(el)
12
+ element_name_is?(el, REGEXES[:section_root])
13
+ end
14
+
15
+
16
+ def self.section_content?(el)
17
+ element_name_is?(el, REGEXES[:section_content])
18
+ end
19
+
20
+
21
+ def self.heading?(el)
22
+ element_name_is?(el, REGEXES[:heading])
23
+ end
24
+
25
+
26
+ def self.named?(el, name)
27
+ element_name_is?(el, /^#{name}$/)
28
+ end
29
+
30
+
31
+ def self.heading_rank(el)
32
+ raise "Not a heading: #{el.inspect}" unless heading?(el)
33
+ if named?(el, 'HGROUP')
34
+ 1.upto(6) { |n| return n if el.at_css("h#{n}") }
35
+ return 6 #raise "Heading not found in HGROUP: #{el.inspect}"
36
+ else
37
+ el.name.reverse.to_i
38
+ end
39
+ end
40
+
41
+
42
+ def self.element_name_is?(el, pattern)
43
+ return false unless el
44
+ return false unless el.respond_to?(:name)
45
+ return false if el.name.nil? || el.name.empty?
46
+ el.name.upcase.match(pattern) ? true : false
47
+ end
48
+
49
+ end
50
+
51
+
52
+ class Section
53
+
54
+ attr_accessor :sections, :heading, :container, :node
55
+
56
+
57
+ def initialize(node = nil)
58
+ self.node = node
59
+ self.sections = []
60
+ end
61
+
62
+
63
+ def append(subsection)
64
+ subsection.container = self
65
+ sections.push(subsection)
66
+ end
67
+
68
+
69
+ def empty?
70
+ heading_text.nil? && sections.all? { |sxn| sxn.empty? }
71
+ end
72
+
73
+
74
+ def heading_text
75
+ return nil unless Utils.heading?(heading)
76
+ h = heading
77
+ h = h.at_css("h#{Utils.heading_rank(h)}") if Utils.named?(h, 'HGROUP')
78
+ return nil unless h && !h.content.strip.empty?
79
+ h.content.strip
80
+ end
81
+
82
+
83
+ def heading_rank
84
+ # FIXME: some doubt as to whether 1 is the sensible default
85
+ Utils.heading?(heading) ? Utils.heading_rank(heading) : 1
86
+ end
87
+
88
+ end
89
+
90
+
91
+
92
+ def initialize(doc)
93
+ @document = doc
94
+ end
95
+
96
+
97
+ def process(from)
98
+ @outlinee = nil
99
+ @outlines = {}
100
+ @section = Section.new
101
+ @stack = []
102
+ walk(from)
103
+ end
104
+
105
+
106
+ def walk(node)
107
+ return unless node
108
+ enter_node(node)
109
+ node.children.each { |ch| walk(ch) }
110
+ exit_node(node)
111
+ end
112
+
113
+
114
+ def enter_node(node)
115
+ return if Utils.heading?(@stack.last)
116
+
117
+ if Utils.section_content?(node) || Utils.section_root?(node)
118
+ @stack.push(@outlinee) unless @outlinee.nil?
119
+ @outlinee = node
120
+ @section = Section.new(node)
121
+ @outlines[@outlinee] = Section.new(node)
122
+ @outlines[@outlinee].sections = [@section]
123
+ return
124
+ end
125
+
126
+ return if @outlinee.nil?
127
+
128
+ if Utils.heading?(node)
129
+ node_rank = Utils.heading_rank(node)
130
+ if !@section.heading
131
+ @section.heading = node
132
+ elsif node_rank <= @outlines[@outlinee].sections.last.heading_rank
133
+ @section = Section.new
134
+ @section.heading = node
135
+ @outlines[@outlinee].sections.push(@section)
136
+ else
137
+ candidate = @section
138
+ while true
139
+ if node_rank > candidate.heading_rank
140
+ @section = Section.new
141
+ candidate.append(@section)
142
+ @section.heading = node
143
+ break
144
+ end
145
+ candidate = candidate.container
146
+ end
147
+ end
148
+ @stack.push(node)
149
+ end
150
+ end
151
+
152
+
153
+ def exit_node(node)
154
+ if Utils.heading?(@stack.last)
155
+ @stack.pop if @stack.last == node
156
+ return
157
+ end
158
+
159
+ if Utils.section_content?(node) && !@stack.empty?
160
+ @outlinee = @stack.pop
161
+ @section = @outlines[@outlinee].sections.last
162
+ @outlines[node].sections.each { |s| @section.append(s) }
163
+ return
164
+ end
165
+
166
+ if Utils.section_root?(node) && !@stack.empty?
167
+ @outlinee = @stack.pop
168
+ @section = @outlines[@outlinee].sections.last
169
+ while @section.sections.any?
170
+ @section = @section.sections.last
171
+ end
172
+ return
173
+ end
174
+
175
+ if Utils.section_content?(node) || Utils.section_root?(node)
176
+ @section = @outlines[@outlinee].sections.first
177
+ return
178
+ end
179
+ end
180
+
181
+
182
+ def to_html
183
+ curse = lambda { |section, is_root|
184
+ below = section.sections.collect { |ch|
185
+ ch_out = curse.call(ch, false).strip
186
+ (ch_out.nil? || ch_out.empty?) ? "" : "<li>#{ch_out}</li>"
187
+ }.join.strip
188
+ below = (below.nil? || below.empty?) ? "" : "<ol>#{below}</ol>\n"
189
+ if is_root
190
+ below
191
+ else
192
+ heading = block_given? ? yield(section, below) : section.heading_text
193
+ "#{heading}#{below}"
194
+ end
195
+ }
196
+ curse.call(result_root, true)
197
+ end
198
+
199
+
200
+ def result_root
201
+ @outlines[@outlinee]
202
+ end
203
+
204
+ end
@@ -0,0 +1,16 @@
1
+ # Books have metadata. Each unit of metadata (each metadatum?) is a 'property'
2
+ # of the book.
3
+ #
4
+ # A property has a key, a value and an optional set of attributes.
5
+ #
6
class Peregrin::Property

  attr_accessor :key, :value, :attributes

  # key::        the property's key
  # value::      the property's value
  # attributes:: optional attributes of the property; defaults to an empty
  #              Hash
  def initialize(key, value, attributes = {})
    @key, @value, @attributes = key, value, attributes
  end

end
@@ -0,0 +1,24 @@
1
+ # Any file that is a part of the book but not one of its linear sections (ie,
2
+ # Components) is a Resource.
3
+ #
4
+ # Resources can potentially be quite large, so as far as possible we don't
5
+ # store their contents in memory.
6
+ #
7
class Peregrin::Resource

  attr_accessor :src, :attributes
  attr_writer :media_type

  # src::        path of the resource
  # media_type:: optional media type; when omitted it is looked up with
  #              MIME::Types.of from the basename of +src+
  # attributes:: optional attributes; defaults to an empty Hash
  #
  # If the media type (given or looked up) is an Array, only its first
  # element is kept.
  def initialize(src, media_type = nil, attributes = {})
    @src = src
    detected = media_type || MIME::Types.of(File.basename(@src))
    detected = detected.first if detected.kind_of?(Array)
    @media_type = detected
    @attributes = attributes
  end


  # The media type as a String, or nil when none is known.
  def media_type
    return nil unless @media_type
    @media_type.to_s
  end

end
@@ -0,0 +1,5 @@
1
# Version of the gem; printed by the command-line usage banner.
module Peregrin
  VERSION = "1.1.1"
end
@@ -0,0 +1,11 @@
1
# Convenience additions to Zip::Archive.
class Zip::Archive

  # Opens the entry at +path+ with fopen and calls +read+ on it.
  # NOTE(review): the return value is whatever fopen returns for the block —
  # presumably the entry's contents; confirm against the zipruby docs.
  def read(path)
    fopen(path) do |entry|
      entry.read
    end
  end

  # Returns the first entry whose name equals +path+, or nil if there is
  # none. Note that this replaces the block-taking +find+ this class would
  # otherwise have.
  def find(path)
    detect do |entry|
      entry.name == path
    end
  end

end
data/lib/peregrin.rb ADDED
@@ -0,0 +1,139 @@
1
+ module Peregrin
2
+
3
+ # Required libraries
4
+ require 'fileutils'
5
+ require 'uri'
6
+ require 'zipruby'
7
+ require 'nokogiri'
8
+ require 'mime/types'
9
+
10
+ # Require libs in this directory
11
+ [
12
+ "peregrin/version",
13
+ "peregrin/zip_patch",
14
+ "peregrin/book",
15
+ "peregrin/resource",
16
+ "peregrin/component",
17
+ "peregrin/chapter",
18
+ "peregrin/property",
19
+ "peregrin/componentizer",
20
+ "peregrin/outliner",
21
+ "formats/epub",
22
+ "formats/zhook",
23
+ "formats/ochook"
24
+ ].each { |lib|
25
+ require lib
26
+ }
27
+
28
+
29
+ class Main
30
+
31
+ def self.run(args)
32
+ if args.size == 1
33
+ src = args.first
34
+ validate(src) and inspect(src)
35
+ elsif args.size == 2
36
+ src, dest = args
37
+ validate(src) and convert(src, dest) and inspect(dest)
38
+ else
39
+ usage
40
+ end
41
+ end
42
+
43
+
44
+ def self.usage
45
+ puts "Peregrin [http://ochook.org/peregrin]"
46
+ puts "Version: #{VERSION}"
47
+ puts "A tool for inspecting Zhooks, Ochooks and EPUB ebooks,"
48
+ puts "and converting between them."
49
+ puts ""
50
+ puts "Usage: peregrin srcpath [destpath]"
51
+ puts ""
52
+ puts "If one path given, validates ebook at that path and outputs analysis."
53
+ puts "If two paths given, converts from srcpath to destpath and outputs "
54
+ puts "analysis of converted ebook."
55
+ end
56
+
57
+
58
+ def self.validate(path)
59
+ klass = format_for_path(path)
60
+ klass.validate(path)
61
+ true
62
+ rescue UnknownFileFormat => e
63
+ exit_with("Unknown file format: #{path}")
64
+ rescue => e
65
+ exit_with("Invalid #{klass::FORMAT}: #{path}", "Reason - #{e}")
66
+ end
67
+
68
+
69
+ def self.convert(src_path, dest_path, src_klass = nil, dest_klass = nil)
70
+ src_klass ||= format_for_path(src_path)
71
+ dest_klass ||= format_for_path(dest_path)
72
+
73
+ src_ook = src_klass.read(src_path)
74
+
75
+ # FIXME: how do we do these options? User-specified? Dest-format-specified?
76
+ options = {}
77
+ options[:componentize] = true if dest_klass == Peregrin::Epub
78
+ book = src_ook.to_book(options)
79
+
80
+ dest_ook = dest_klass.new(book)
81
+ dest_ook.write(dest_path)
82
+ validate(dest_path)
83
+ end
84
+
85
+
86
+ def self.inspect(path)
87
+ klass = format_for_path(path)
88
+ ook = klass.read(path)
89
+ book = ook.to_book
90
+ puts "[#{klass::FORMAT}]"
91
+ puts "\nCover\n #{book.cover.src}"
92
+ puts "\nComponents [#{book.components.size}]"
93
+ book.components.each { |cmpt| puts " #{cmpt.src}" }
94
+ puts "\nResources [#{book.resources.size}]"
95
+ book.resources.each { |res| puts " #{res.src}" }
96
+ puts "\nChapters"
97
+ book.chapters.each { |chp| print_chapter_title(chp, "- ") }
98
+ puts "\nProperties [#{book.properties.size}]"
99
+ book.properties.each { |property|
100
+ puts " #{property.key}: #{property.value}" unless property.value.empty?
101
+ }
102
+ true
103
+ end
104
+
105
+
106
+ private
107
+
108
+ def self.format_for_path(path)
109
+ return Peregrin::Zhook if File.extname(path) == ".zhook"
110
+ return Peregrin::Epub if File.extname(path) == ".epub"
111
+ return Peregrin::Ochook if File.directory?(path) || !File.exists?(path)
112
+ raise UnknownFileFormat.new(path)
113
+ end
114
+
115
+
116
+ def self.exit_with(*remarks)
117
+ remarks.each { |rm| puts(rm) }
118
+ false
119
+ end
120
+
121
+
122
+ def self.print_chapter_title(chp, padd)
123
+ puts "#{padd}#{chp.title}"
124
+ chp.children.each { |ch|
125
+ print_chapter_title(ch, " "+padd)
126
+ }
127
+ end
128
+
129
+
130
+
131
+ class UnknownFileFormat < RuntimeError
132
+ def initialize(path = nil)
133
+ @page = path
134
+ end
135
+ end
136
+
137
+ end
138
+
139
+ end
@@ -0,0 +1,80 @@
1
+ require 'test_helper'
2
+
3
# Round-trips fixtures between the Epub, Ochook and Zhook formats and checks
# that each converted ebook validates in its destination format.
class Peregrin::Tests::ConversionTest < Test::Unit::TestCase

  def test_epub_to_ochook
    fixture = 'test/fixtures/epubs/strunk.epub'
    output = 'test/output/conversions/epub_to_ochook'
    conversion_test(Peregrin::Epub, Peregrin::Ochook, fixture, output)
  end


  def test_epub_to_zhook
    fixture = 'test/fixtures/epubs/strunk.epub'
    output = 'test/output/conversions/epub_to_zhook.zhook'
    conversion_test(Peregrin::Epub, Peregrin::Zhook, fixture, output)
  end


  def test_ochook_to_epub
    fixture = 'test/fixtures/ochooks/basic'
    output = 'test/output/conversions/ochook_to_epub.epub'
    conversion_test(
      Peregrin::Ochook, Peregrin::Epub, fixture, output, :componentize => true
    )
  end


  def test_ochook_to_zhook
    fixture = 'test/fixtures/ochooks/basic'
    output = 'test/output/conversions/ochook_to_zhook.zhook'
    conversion_test(
      Peregrin::Ochook, Peregrin::Zhook, fixture, output, :componentize => true
    )
    # The root element of the converted index must carry no 'manifest'.
    assert_nil(@dest_ook.send(:index).root['manifest'])
  end


  def test_zhook_to_epub
    fixture = 'test/fixtures/zhooks/flat.zhook'
    output = 'test/output/conversions/zhook_to_epub.epub'
    conversion_test(
      Peregrin::Zhook, Peregrin::Epub, fixture, output, :componentize => true
    )
  end


  def test_zhook_to_ochook
    fixture = 'test/fixtures/zhooks/flat.zhook'
    output = 'test/output/conversions/zhook_to_ochook'
    conversion_test(
      Peregrin::Zhook, Peregrin::Ochook, fixture, output, :componentize => true
    )
  end


  private

    # Reads +src+ with +src_klass+, converts it to a book (passing
    # +to_book_options+ along), writes it to +dest+ with +dest_klass+ and
    # asserts that the written ebook validates. The source and destination
    # objects are kept in @src_ook / @dest_ook for follow-up assertions.
    def conversion_test(src_klass, dest_klass, src, dest, to_book_options = {})
      @src_ook = src_klass.read(src)
      book = @src_ook.to_book(to_book_options)
      @dest_ook = dest_klass.new(book)
      FileUtils.mkdir_p(File.dirname(dest))
      @dest_ook.write(dest)
      assert_nothing_raised do
        dest_klass.validate(dest)
      end
    end

end
@@ -0,0 +1,159 @@
1
+ require 'test_helper'
2
+
3
# Tests for reading, writing and validating EPUBs, using the Strunk
# fixtures under test/fixtures/epubs.
class Peregrin::Tests::EpubTest < Test::Unit::TestCase

  # A fairly trivial book-in, book-out test.
  def test_book_to_book
    epub = Peregrin::Epub.new(strunk_book)
    book = epub.to_book
    assert_equal(22, book.components.length)
    assert_equal(6, book.chapters.length)
    assert_equal("William Strunk Jr.", book.property_for('creator'))
  end


  def test_write_to_epub
    epub = Peregrin::Epub.new(strunk_book)
    epub.write('test/output/strunk_test.epub')
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    assert(File.exist?('test/output/strunk_test.epub'))
    assert_nothing_raised {
      Peregrin::Epub.validate("test/output/strunk_test.epub")
    }
  end


  def test_heading_depth
    epub = Peregrin::Epub.new(strunk_book)
    assert_equal(2, epub.send(:heading_depth))
  end


  def test_epub_validation
    assert_nothing_raised {
      Peregrin::Epub.validate("test/fixtures/epubs/strunk.epub")
    }
  end


  def test_extracting_metadata
    epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")
    assert_equal("The Elements of Style", epub.to_book.property_for('title'))
  end


  def test_extracting_components
    epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")
    book = epub.to_book
    assert_equal(
      %w[
        cover.xml title.xml about.xml
        main0.xml main1.xml main2.xml main3.xml main4.xml main5.xml
        main6.xml main7.xml main8.xml main9.xml main10.xml main11.xml
        main12.xml main13.xml main14.xml main15.xml main16.xml main17.xml
        main18.xml main19.xml main20.xml main21.xml
        similar.xml feedbooks.xml
      ],
      book.components.collect { |cmpt| cmpt.src }
    )
    assert_equal(
      %w[
        css/page.css css/feedbooks.css css/title.css css/about.css
        css/main.css
        images/logo-feedbooks-tiny.png images/logo-feedbooks.png
        images/cover.png
      ],
      book.resources.collect { |res| res.src }
    )
  end


  def test_extracting_contents
    epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")
    assert_equal(2, epub.send(:heading_depth))
  end


  def test_extracting_cover
    # Cover image referenced from metadata
    epub = Peregrin::Epub.read("test/fixtures/epubs/covers/cover_in_meta.epub")
    assert_equal("cover.png", epub.to_book.cover.src)

    # First image in a component listed in the guide as 'cover'
    epub = Peregrin::Epub.read("test/fixtures/epubs/covers/cover_in_guide.epub")
    assert_equal("cover.png", epub.to_book.cover.src)

    # A component with the id of 'cover-image'.
    epub = Peregrin::Epub.read(
      "test/fixtures/epubs/covers/cover-image_in_manifest.epub"
    )
    assert_equal("cover.png", epub.to_book.cover.src)

    # First image in component with the id of 'cover'.
    epub = Peregrin::Epub.read(
      "test/fixtures/epubs/covers/cover_in_manifest.epub"
    )
    assert_equal("cover.png", epub.to_book.cover.src)

    # First image in first component.
    epub = Peregrin::Epub.read(
      "test/fixtures/epubs/covers/cover_in_first_cmpt.epub"
    )
    assert_equal("cover.png", epub.to_book.cover.src)
  end


  def test_read_epub_to_write_epub
    epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")
    epub.write("test/output/strunk_test2.epub")
    assert(File.exist?('test/output/strunk_test2.epub'))
    assert_nothing_raised {
      Peregrin::Epub.validate("test/output/strunk_test2.epub")
    }
  end


  protected

    # Builds a Book from the unpacked Strunk fixture: components
    # main0.xml..main21.xml, six chapters (the second with one child), the
    # title and creator properties and one stylesheet resource. Resource
    # contents are read lazily from the fixture directory.
    def strunk_book
      book = Peregrin::Book.new
      0.upto(21) { |i|
        path = "main#{i}.xml"
        book.add_component(
          path,
          IO.read("test/fixtures/epubs/strunk/OPS/#{path}")
        )
      }
      # pos is bumped once per chapter (and child), in order of appearance.
      pos = 0
      book.add_chapter("Chapter 1 - Introductory", pos+=1, "main0.xml")
      chp = book.add_chapter(
        "Chapter 2 - Elementary Rules of Usage",
        pos+=1,
        "main1.xml"
      )
      # The misspelling "nounds" is fixture data and is deliberately left
      # untouched; no assertion visible in this file depends on it.
      chp.add_child(
        "1. Form the possessive singular of nounds with 's",
        pos+=1,
        "main1.xml#section_98344"
      )
      book.add_chapter(
        "Chapter 3 - Elementary Principles of Composition",
        pos+=1,
        "main9.xml"
      )
      book.add_chapter(
        "Chapter 4 - A Few Matters of Form",
        pos+=1,
        "main19.xml"
      )
      book.add_chapter(
        "Chapter 5 - Words and Expressions Commonly Misused",
        pos+=1,
        "main20.xml"
      )
      book.add_chapter(
        "Chapter 6 - Words Commonly Misspelled",
        pos+=1,
        "main21.xml"
      )
      book.add_property("title", "The Elements of Style")
      book.add_property("creator", "William Strunk Jr.")
      book.add_resource("css/main.css")
      book.read_resource_proc = lambda { |resource|
        IO.read("test/fixtures/epubs/strunk/OPS/#{resource.src}")
      }
      book
    end

end