peregrin 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,204 @@
1
module Peregrin

  # Walks a tree of element nodes and builds a nested outline of sections and
  # headings, in the manner of the HTML5 outline algorithm.
  #
  # Nodes are duck-typed: the code calls #name, #children, #content and
  # #at_css on them.
  # NOTE(review): presumably these are Nokogiri nodes -- confirm with callers.
  #
  # Typical use:
  #
  #   outliner = Peregrin::Outliner.new(doc)
  #   outliner.process(doc.at_css('body'))
  #   outliner.to_html
  #
  class Outliner

    # Element-name classifiers. Each alternation is grouped so that the
    # anchors apply to every alternative; ungrouped, /^A|B|C$/ matches any
    # name that merely contains B (e.g. TBODY would count as BODY).
    REGEXES = {
      :section_root => /\A(?:BLOCKQUOTE|BODY|DETAILS|FIELDSET|FIGURE|TD)\z/i,
      :section_content => /\A(?:ARTICLE|ASIDE|NAV|SECTION)\z/i,
      :heading => /\A(?:H[1-6]|HGROUP)\z/i
    }.freeze


    # Stateless predicates and helpers for classifying elements by name.
    class Utils

      # True if el is named as one of the sectioning roots.
      def self.section_root?(el)
        element_name_is?(el, REGEXES[:section_root])
      end


      # True if el is named as one of the sectioning content elements.
      def self.section_content?(el)
        element_name_is?(el, REGEXES[:section_content])
      end


      # True if el is H1..H6 or HGROUP.
      def self.heading?(el)
        element_name_is?(el, REGEXES[:heading])
      end


      # True if el's upcased name is exactly +name+. The name is interpolated
      # into a case-sensitive pattern, so pass it in upper case.
      def self.named?(el, name)
        element_name_is?(el, /\A#{name}\z/)
      end


      # Returns the rank (1..6) of a heading element. For an HGROUP this is
      # the rank of the highest-ranked heading found inside it, or 6 if it
      # contains none.
      #
      # Raises RuntimeError if el is not a heading.
      def self.heading_rank(el)
        raise "Not a heading: #{el.inspect}" unless heading?(el)
        if named?(el, 'HGROUP')
          1.upto(6) { |n| return n if el.at_css("h#{n}") }
          6
        else
          # The name is "h" followed by a single digit at this point.
          el.name[1..-1].to_i
        end
      end


      # True if el responds to #name and its non-empty name matches pattern.
      # The name is upcased before matching. Returns false for nil.
      def self.element_name_is?(el, pattern)
        return false unless el && el.respond_to?(:name)
        name = el.name
        return false if name.nil? || name.empty?
        name.upcase.match(pattern) ? true : false
      end

    end


    # One entry in the outline: an optional heading element, the node that
    # opened the section (if any), its subsections and its parent section.
    class Section

      attr_accessor :sections, :heading, :container, :node


      def initialize(node = nil)
        self.node = node
        self.sections = []
      end


      # Adds subsection as the last child and records self as its container.
      def append(subsection)
        subsection.container = self
        sections.push(subsection)
      end


      # True if neither this section nor any descendant has heading text.
      def empty?
        heading_text.nil? && sections.all? { |sxn| sxn.empty? }
      end


      # The stripped text of the heading, or nil if there is no heading or
      # its text is blank. For an HGROUP the text comes from its
      # highest-ranked child heading.
      def heading_text
        return nil unless Utils.heading?(heading)
        h = heading
        h = h.at_css("h#{Utils.heading_rank(h)}") if Utils.named?(h, 'HGROUP')
        return nil unless h
        text = h.content.strip
        text.empty? ? nil : text
      end


      # Rank of this section's heading; 1 when there is no heading.
      def heading_rank
        # FIXME: some doubt as to whether 1 is the sensible default
        Utils.heading?(heading) ? Utils.heading_rank(heading) : 1
      end

    end



    def initialize(doc)
      @document = doc
    end


    # Resets all state and walks the tree rooted at +from+, building the
    # outline that #result_root and #to_html expose.
    def process(from)
      @outlinee = nil
      @outlines = {}
      @section = Section.new
      @stack = []
      walk(from)
    end


    # Depth-first traversal: enter the node, visit its children, exit it.
    def walk(node)
      return unless node
      enter_node(node)
      node.children.each { |ch| walk(ch) }
      exit_node(node)
    end


    def enter_node(node)
      # A heading on top of the stack means we are inside that heading;
      # its descendants do not affect the outline.
      return if Utils.heading?(@stack.last)

      if Utils.section_content?(node) || Utils.section_root?(node)
        # Start a fresh outline for this element, remembering the outlinee
        # we were in so exit_node can return to it.
        @stack.push(@outlinee) unless @outlinee.nil?
        @outlinee = node
        @section = Section.new(node)
        @outlines[@outlinee] = Section.new(node)
        @outlines[@outlinee].sections = [@section]
        return
      end

      return if @outlinee.nil?

      if Utils.heading?(node)
        node_rank = Utils.heading_rank(node)
        if !@section.heading
          # The current section has no heading yet: this one becomes it.
          @section.heading = node
        elsif node_rank <= @outlines[@outlinee].sections.last.heading_rank
          # Same or higher rank than the last top-level section: start a
          # new top-level section in the current outline.
          @section = Section.new
          @section.heading = node
          @outlines[@outlinee].sections.push(@section)
        else
          # Climb from the current section until one with a higher-ranked
          # heading is found, then nest the new section inside it.
          candidate = @section
          loop do
            if node_rank > candidate.heading_rank
              @section = Section.new
              candidate.append(@section)
              @section.heading = node
              break
            end
            candidate = candidate.container
          end
        end
        # Marks that we are now inside this heading (see the guard above).
        @stack.push(node)
      end
    end


    def exit_node(node)
      if Utils.heading?(@stack.last)
        # Only leaving the heading itself pops it; its descendants are ignored.
        @stack.pop if @stack.last == node
        return
      end

      if Utils.section_content?(node) && !@stack.empty?
        # Return to the enclosing outlinee and graft this element's sections
        # onto the enclosing outline's last section.
        @outlinee = @stack.pop
        @section = @outlines[@outlinee].sections.last
        @outlines[node].sections.each { |s| @section.append(s) }
        return
      end

      if Utils.section_root?(node) && !@stack.empty?
        # Return to the enclosing outlinee; the root's own outline is not
        # merged. Continue in the deepest last section of the outer outline.
        @outlinee = @stack.pop
        @section = @outlines[@outlinee].sections.last
        while @section.sections.any?
          @section = @section.sections.last
        end
        return
      end

      if Utils.section_content?(node) || Utils.section_root?(node)
        # Leaving the outermost outlinee (nothing left on the stack).
        @section = @outlines[@outlinee].sections.first
        return
      end
    end


    # Renders the outline as nested <ol>/<li> markup.
    #
    # If a block is given it is called with (section, rendered_subsections)
    # and its result is used in place of the section's heading text.
    #
    # Returns "" if no outline has been built.
    #
    # NOTE(review): heading text is interpolated without HTML escaping.
    def to_html
      root = result_root
      return "" if root.nil?
      curse = lambda { |section, is_root|
        below = section.sections.collect { |ch|
          ch_out = curse.call(ch, false).strip
          ch_out.empty? ? "" : "<li>#{ch_out}</li>"
        }.join.strip
        below = below.empty? ? "" : "<ol>#{below}</ol>\n"
        if is_root
          below
        else
          heading = block_given? ? yield(section, below) : section.heading_text
          "#{heading}#{below}"
        end
      }
      curse.call(root, true)
    end


    # The Section holding the outline of the current outlinee, or nil if
    # #process has not been run or found no sectioning element.
    def result_root
      @outlines && @outlines[@outlinee]
    end

  end

end
@@ -0,0 +1,16 @@
1
# A Property is one unit of a book's metadata.
#
# Each property carries a key, a value, and an optional hash of attributes
# (empty unless one is supplied).
#
class Peregrin::Property

  attr_accessor :key, :value, :attributes

  # key        - the property's name
  # value      - the property's value
  # attributes - optional extra attributes; defaults to a new empty hash
  def initialize(key, value, attributes = {})
    @key, @value, @attributes = key, value, attributes
  end

end
@@ -0,0 +1,24 @@
1
# A Resource is any file that belongs to the book without being one of its
# linear sections (ie, Components).
#
# Because resources may be large, their contents are kept out of memory as
# far as possible; only the path, media type and attributes are held here.
#
class Peregrin::Resource

  attr_accessor :src, :attributes
  attr_writer :media_type

  # src        - path of the resource
  # media_type - optional media type; when omitted it is looked up from the
  #              basename of src via MIME::Types
  # attributes - optional hash of extra attributes
  def initialize(src, media_type = nil, attributes = {})
    @src = src
    detected = media_type || MIME::Types.of(File.basename(@src))
    # A lookup yields a list of candidates; keep only the first.
    @media_type = detected.kind_of?(Array) ? detected.first : detected
    @attributes = attributes
  end


  # The media type as a String, or nil when none is known.
  def media_type
    return nil unless @media_type
    @media_type.to_s
  end

end
@@ -0,0 +1,5 @@
1
module Peregrin

  # Version string of this release. Frozen so that it cannot be mutated
  # in place by code that reads it.
  VERSION = "1.1.1".freeze

end
@@ -0,0 +1,11 @@
1
# Convenience additions to Zip::Archive.
class Zip::Archive

  # Opens the entry at +path+ inside the archive and returns whatever its
  # #read gives back.
  def read(path)
    fopen(path) do |entry|
      entry.read
    end
  end

  # Returns the first entry whose name equals +path+, or nil if none does.
  #
  # This is implemented with #detect because this method itself is named
  # #find; calling #find here would recurse.
  def find(path)
    detect do |entry|
      entry.name == path
    end
  end

end
data/lib/peregrin.rb ADDED
@@ -0,0 +1,139 @@
1
+ module Peregrin
2
+
3
+ # Required libraries
4
+ require 'fileutils'
5
+ require 'uri'
6
+ require 'zipruby'
7
+ require 'nokogiri'
8
+ require 'mime/types'
9
+
10
+ # Require libs in this directory
11
+ [
12
+ "peregrin/version",
13
+ "peregrin/zip_patch",
14
+ "peregrin/book",
15
+ "peregrin/resource",
16
+ "peregrin/component",
17
+ "peregrin/chapter",
18
+ "peregrin/property",
19
+ "peregrin/componentizer",
20
+ "peregrin/outliner",
21
+ "formats/epub",
22
+ "formats/zhook",
23
+ "formats/ochook"
24
+ ].each { |lib|
25
+ require lib
26
+ }
27
+
28
+
29
# Command-line entry point: validates, inspects and converts ebooks.
class Main

  # Raised when no format can be determined for a path.
  class UnknownFileFormat < RuntimeError

    # The path whose format could not be determined (may be nil).
    attr_reader :path

    def initialize(path = nil)
      @path = path
      super(path ? "Unknown file format: #{path}" : "Unknown file format")
    end

  end


  # Dispatches on the number of arguments:
  #   one  - validate the ebook at that path and print an analysis of it
  #   two  - validate the source, convert it to the destination, and print
  #          an analysis of the result
  #   else - print usage
  #
  # Each step runs only if the previous one returned a truthy value.
  def self.run(args)
    case args.size
    when 1
      src = args.first
      validate(src) && inspect(src)
    when 2
      src, dest = args
      validate(src) && convert(src, dest) && inspect(dest)
    else
      usage
    end
  end


  # Prints a short description of the tool and how to invoke it.
  def self.usage
    puts "Peregrin [http://ochook.org/peregrin]"
    puts "Version: #{VERSION}"
    puts "A tool for inspecting Zhooks, Ochooks and EPUB ebooks,"
    puts "and converting between them."
    puts ""
    puts "Usage: peregrin srcpath [destpath]"
    puts ""
    puts "If one path given, validates ebook at that path and outputs analysis."
    puts "If two paths given, converts from srcpath to destpath and outputs "
    puts "analysis of converted ebook."
  end


  # Validates the ebook at +path+ using the format class chosen for it.
  #
  # Returns true on success. On any failure prints a message and returns
  # false instead of raising.
  def self.validate(path)
    klass = format_for_path(path)
    klass.validate(path)
    true
  rescue UnknownFileFormat
    exit_with("Unknown file format: #{path}")
  rescue => e
    # klass is still nil if the failure happened while choosing the format.
    label = klass ? klass::FORMAT : "file"
    exit_with("Invalid #{label}: #{path}", "Reason - #{e}")
  end


  # Reads the ebook at src_path, writes it to dest_path in the destination
  # format, and validates the result. The format classes are derived from
  # the paths unless given explicitly.
  #
  # Returns the result of validating dest_path.
  def self.convert(src_path, dest_path, src_klass = nil, dest_klass = nil)
    src_klass ||= format_for_path(src_path)
    dest_klass ||= format_for_path(dest_path)

    src_ook = src_klass.read(src_path)

    # FIXME: how do we do these options? User-specified? Dest-format-specified?
    options = {}
    options[:componentize] = true if dest_klass == Peregrin::Epub
    book = src_ook.to_book(options)

    dest_ook = dest_klass.new(book)
    dest_ook.write(dest_path)
    validate(dest_path)
  end


  # Prints an analysis (cover, components, resources, chapters, properties)
  # of the ebook at +path+ and returns true.
  #
  # This method shares its name with Module#inspect, so when called without
  # a path it defers to the standard implementation and returns the class's
  # usual inspect string; otherwise anything that inspects this class
  # would raise ArgumentError.
  def self.inspect(path = nil)
    return super() if path.nil?

    klass = format_for_path(path)
    ook = klass.read(path)
    book = ook.to_book
    cover = book.cover
    puts "[#{klass::FORMAT}]"
    puts "\nCover\n #{cover ? cover.src : nil}"
    puts "\nComponents [#{book.components.size}]"
    book.components.each { |cmpt| puts " #{cmpt.src}" }
    puts "\nResources [#{book.resources.size}]"
    book.resources.each { |res| puts " #{res.src}" }
    puts "\nChapters"
    book.chapters.each { |chp| print_chapter_title(chp, "- ") }
    puts "\nProperties [#{book.properties.size}]"
    book.properties.each { |property|
      value = property.value
      next if value.nil? || value.empty?
      puts " #{property.key}: #{value}"
    }
    true
  end


  # The helpers below are internal. They were preceded by a bare `private`,
  # which has no effect on `def self.` methods, so they have always been
  # publicly callable and remain so.

  # Chooses the format class for +path+: Zhook or Epub by file extension,
  # Ochook for a directory or a path that does not exist yet.
  #
  # Raises UnknownFileFormat for an existing non-directory with any other
  # extension.
  def self.format_for_path(path)
    return Peregrin::Zhook if File.extname(path) == ".zhook"
    return Peregrin::Epub if File.extname(path) == ".epub"
    return Peregrin::Ochook if File.directory?(path) || !File.exist?(path)
    raise UnknownFileFormat.new(path)
  end


  # Prints each remark on its own line and returns false.
  def self.exit_with(*remarks)
    remarks.each { |rm| puts(rm) }
    false
  end


  # Prints the chapter's title prefixed with +padd+, then each of its
  # children with one more leading space.
  def self.print_chapter_title(chp, padd)
    puts "#{padd}#{chp.title}"
    chp.children.each { |ch|
      print_chapter_title(ch, " "+padd)
    }
  end

end
138
+
139
+ end
@@ -0,0 +1,80 @@
1
+ require 'test_helper'
2
+
3
# Round-trips fixture books between each pair of formats and checks that
# the written result validates as the destination format.
class Peregrin::Tests::ConversionTest < Test::Unit::TestCase

  def test_epub_to_ochook
    source = 'test/fixtures/epubs/strunk.epub'
    target = 'test/output/conversions/epub_to_ochook'
    conversion_test(Peregrin::Epub, Peregrin::Ochook, source, target)
  end


  def test_epub_to_zhook
    source = 'test/fixtures/epubs/strunk.epub'
    target = 'test/output/conversions/epub_to_zhook.zhook'
    conversion_test(Peregrin::Epub, Peregrin::Zhook, source, target)
  end


  def test_ochook_to_epub
    source = 'test/fixtures/ochooks/basic'
    target = 'test/output/conversions/ochook_to_epub.epub'
    conversion_test(
      Peregrin::Ochook, Peregrin::Epub, source, target, :componentize => true
    )
  end


  def test_ochook_to_zhook
    source = 'test/fixtures/ochooks/basic'
    target = 'test/output/conversions/ochook_to_zhook.zhook'
    conversion_test(
      Peregrin::Ochook, Peregrin::Zhook, source, target, :componentize => true
    )
    # The written index must not carry a 'manifest' attribute on its root.
    assert_nil(@dest_ook.send(:index).root['manifest'])
  end


  def test_zhook_to_epub
    source = 'test/fixtures/zhooks/flat.zhook'
    target = 'test/output/conversions/zhook_to_epub.epub'
    conversion_test(
      Peregrin::Zhook, Peregrin::Epub, source, target, :componentize => true
    )
  end


  def test_zhook_to_ochook
    source = 'test/fixtures/zhooks/flat.zhook'
    target = 'test/output/conversions/zhook_to_ochook'
    conversion_test(
      Peregrin::Zhook, Peregrin::Ochook, source, target, :componentize => true
    )
  end


  private

    # Reads src with src_klass, converts it to a book using to_book_options,
    # writes it to dest with dest_klass (creating dest's parent directory
    # first), and asserts that validating dest raises nothing.
    #
    # Leaves the source and destination objects in @src_ook and @dest_ook
    # for further assertions by the calling test.
    def conversion_test(src_klass, dest_klass, src, dest, to_book_options = {})
      @src_ook = src_klass.read(src)
      book = @src_ook.to_book(to_book_options)
      @dest_ook = dest_klass.new(book)
      FileUtils.mkdir_p(File.dirname(dest))
      @dest_ook.write(dest)
      assert_nothing_raised do
        dest_klass.validate(dest)
      end
    end

end
@@ -0,0 +1,159 @@
1
+ require 'test_helper'
2
+
3
# Tests for reading, writing and inspecting EPUBs, using the Strunk
# ("The Elements of Style") fixtures.
class Peregrin::Tests::EpubTest < Test::Unit::TestCase

  # A fairly trivial book-in, book-out test.
  def test_book_to_book
    epub = Peregrin::Epub.new(strunk_book)
    book = epub.to_book
    assert_equal(22, book.components.length)
    assert_equal(6, book.chapters.length)
    assert_equal("William Strunk Jr.", book.property_for('creator'))
  end


  def test_write_to_epub
    epub = Peregrin::Epub.new(strunk_book)
    epub.write('test/output/strunk_test.epub')
    # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
    assert(File.exist?('test/output/strunk_test.epub'))
    assert_nothing_raised {
      Peregrin::Epub.validate("test/output/strunk_test.epub")
    }
  end


  def test_heading_depth
    epub = Peregrin::Epub.new(strunk_book)
    assert_equal(2, epub.send(:heading_depth))
  end


  def test_epub_validation
    assert_nothing_raised {
      Peregrin::Epub.validate("test/fixtures/epubs/strunk.epub")
    }
  end


  def test_extracting_metadata
    epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")
    assert_equal("The Elements of Style", epub.to_book.property_for('title'))
  end


  def test_extracting_components
    epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")
    book = epub.to_book
    # cover/title/about, then main0.xml..main21.xml, then the two trailers.
    expected_components =
      ["cover.xml", "title.xml", "about.xml"] +
      (0..21).map { |i| "main#{i}.xml" } +
      ["similar.xml", "feedbooks.xml"]
    assert_equal(
      expected_components,
      book.components.collect { |cmpt| cmpt.src }
    )
    assert_equal(
      [
        "css/page.css",
        "css/feedbooks.css",
        "css/title.css",
        "css/about.css",
        "css/main.css",
        "images/logo-feedbooks-tiny.png",
        "images/logo-feedbooks.png",
        "images/cover.png"
      ],
      book.resources.collect { |res| res.src }
    )
  end


  def test_extracting_contents
    epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")
    assert_equal(2, epub.send(:heading_depth))
  end


  # Each fixture declares its cover in a different way; all of them should
  # resolve to the same image.
  def test_extracting_cover
    [
      # Cover image referenced from metadata
      "test/fixtures/epubs/covers/cover_in_meta.epub",
      # First image in a component listed in the guide as 'cover'
      "test/fixtures/epubs/covers/cover_in_guide.epub",
      # A component with the id of 'cover-image'.
      "test/fixtures/epubs/covers/cover-image_in_manifest.epub",
      # First image in component with the id of 'cover'.
      "test/fixtures/epubs/covers/cover_in_manifest.epub",
      # First image in first component.
      "test/fixtures/epubs/covers/cover_in_first_cmpt.epub"
    ].each { |fixture|
      epub = Peregrin::Epub.read(fixture)
      assert_equal("cover.png", epub.to_book.cover.src, fixture)
    }
  end


  def test_read_epub_to_write_epub
    epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")
    epub.write("test/output/strunk_test2.epub")
    # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
    assert(File.exist?('test/output/strunk_test2.epub'))
    assert_nothing_raised {
      Peregrin::Epub.validate("test/output/strunk_test2.epub")
    }
  end


  protected

    # Builds a Book from the unpacked Strunk fixture: 22 components
    # (main0.xml..main21.xml), six top-level chapters (the second with one
    # child), title and creator properties, and one stylesheet resource
    # whose content is read from the fixture directory on demand.
    def strunk_book
      book = Peregrin::Book.new
      0.upto(21) { |i|
        path = "main#{i}.xml"
        book.add_component(
          path,
          IO.read("test/fixtures/epubs/strunk/OPS/#{path}")
        )
      }
      # pos is incremented for every chapter and child, in document order.
      pos = 0
      book.add_chapter(
        "Chapter 1 - Introductory",
        pos+=1,
        "main0.xml"
      )
      chp = book.add_chapter(
        "Chapter 2 - Elementary Rules of Usage",
        pos+=1,
        "main1.xml"
      )
      chp.add_child(
        "1. Form the possessive singular of nounds with 's",
        pos+=1,
        "main1.xml#section_98344"
      )
      book.add_chapter(
        "Chapter 3 - Elementary Principles of Composition",
        pos+=1,
        "main9.xml"
      )
      book.add_chapter(
        "Chapter 4 - A Few Matters of Form",
        pos+=1,
        "main19.xml"
      )
      book.add_chapter(
        "Chapter 5 - Words and Expressions Commonly Misused",
        pos+=1,
        "main20.xml"
      )
      book.add_chapter(
        "Chapter 6 - Words Commonly Misspelled",
        pos+=1,
        "main21.xml"
      )
      book.add_property("title", "The Elements of Style")
      book.add_property("creator", "William Strunk Jr.")
      book.add_resource("css/main.css")
      book.read_resource_proc = lambda { |resource|
        IO.read("test/fixtures/epubs/strunk/OPS/#{resource.src}")
      }
      book
    end

end