peregrin 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +20 -0
- data/README.md +148 -0
- data/bin/peregrin +6 -0
- data/lib/formats/epub.rb +553 -0
- data/lib/formats/ochook.rb +113 -0
- data/lib/formats/zhook.rb +394 -0
- data/lib/peregrin/book.rb +87 -0
- data/lib/peregrin/chapter.rb +31 -0
- data/lib/peregrin/component.rb +12 -0
- data/lib/peregrin/componentizer.rb +118 -0
- data/lib/peregrin/outliner.rb +204 -0
- data/lib/peregrin/property.rb +16 -0
- data/lib/peregrin/resource.rb +24 -0
- data/lib/peregrin/version.rb +5 -0
- data/lib/peregrin/zip_patch.rb +11 -0
- data/lib/peregrin.rb +139 -0
- data/test/conversion_test.rb +80 -0
- data/test/formats/epub_test.rb +159 -0
- data/test/formats/ochook_test.rb +104 -0
- data/test/formats/zhook_test.rb +219 -0
- data/test/test_helper.rb +16 -0
- data/test/utils/componentizer_test.rb +78 -0
- data/test/utils/outliner_test.rb +49 -0
- metadata +135 -0
@@ -0,0 +1,394 @@
|
|
1
|
+
# Reads, writes and converts books in the Zhook format: a zip archive with a
# ".zhook" extension that contains an index.html and a cover.png alongside
# any other resources.
#
class Peregrin::Zhook

  FORMAT = "Zhook"

  FILE_EXT = ".zhook"
  INDEX_PATH = "index.html"
  COVER_PATH = "cover.png"
  BODY_XPATH = '/html/body'
  HEAD_XPATH = '/html/head'

  # Raises an exception if file at path is not a valid Zhook. Otherwise
  # returns true.
  #
  # Raises (all subclasses of ValidationError):
  #   FileNotFound       - path is not a regular file
  #   WrongExtension     - path does not end in FILE_EXT
  #   NotAZipArchive     - the archive could not be opened
  #   MissingIndexHTML   - the archive has no index.html
  #   MissingCoverPNG    - the archive has no cover.png
  #   IndexHTMLRootHasId - the root element of index.html has an id attribute
  #
  def self.validate(path)
    raise FileNotFound.new(path) unless File.file?(path)
    raise WrongExtension.new(path) unless File.extname(path) == FILE_EXT
    begin
      zf = Zip::Archive.open(path)
    rescue
      raise NotAZipArchive.new(path)
    end

    raise MissingIndexHTML.new(path) unless zf.find(INDEX_PATH)
    raise MissingCoverPNG.new(path) unless zf.find(COVER_PATH)

    doc = Nokogiri::HTML::Document.parse(zf.read(INDEX_PATH), nil, 'UTF-8')
    raise IndexHTMLRootHasId.new(path) if doc.root['id']

    # Explicit success value, as promised by the comment above.
    true
  ensure
    # zf is nil when we failed before (or while) opening the archive.
    zf.close if zf
  end


  # Unzips the file at path, generates a simple book object, passes to new.
  #
  # The index.html becomes the book's single component; every other
  # non-directory entry becomes a resource. Resource contents are not read
  # here: the book is given a proc that re-opens the archive on demand.
  #
  def self.read(path)
    validate(path)
    book = Peregrin::Book.new
    Zip::Archive.open(path) { |zf|
      book.add_component(INDEX_PATH, zf.read(INDEX_PATH))
      zf.each { |entry|
        ze = entry.name
        book.add_resource(ze) unless ze == INDEX_PATH || entry.directory?
      }
    }
    book.read_resource_proc = lambda { |resource|
      Zip::Archive.open(path) { |zipfile|
        zipfile.read(resource.src)
      }
    }

    extract_properties_from_index(book)

    new(book)
  end


  # Stitches together components of the internal book.
  #
  # After construction the book has one component (the index), meta tags that
  # mirror its properties, an outline in book.chapters, and a cover resource
  # (an existing one, or a newly registered COVER_PATH resource).
  #
  def initialize(book)
    @book = book

    if @book.components.length > 1
      stitch_components(@book)
    end

    consolidate_properties(@book)

    @book.chapters = outline_book(index)

    @book.cover ||= (
      @book.resources.detect { |r| r.src == COVER_PATH } ||
      @book.add_resource(COVER_PATH)
    )
  end


  # Writes the internal book object to a .zhook file at the given path.
  # Any existing file at path is removed first. Returns path.
  #
  def write(path)
    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
    File.unlink(path) if File.exist?(path)
    Zip::Archive.open(path, Zip::CREATE) { |zipfile|
      zipfile.add_buffer(INDEX_PATH, htmlize(index))
      @book.resources.each { |resource|
        zipfile.add_buffer(resource.src, @book.read_resource(resource))
      }
      # A cover stored under another name still has to be present as cover.png.
      unless @book.cover.src == COVER_PATH
        zipfile.add_buffer(COVER_PATH, to_png_data(@book.cover))
      end
    }
    path
  end


  # Returns a deep clone of the internal book object.
  #
  # options[:componentize] - when truthy, the index is split into one
  #   component per componentizable node, rel links are added to every
  #   component, and "toc.html", "loi.html" (only if there are figures with
  #   ids) and "cover.html" components are generated. Otherwise the clone
  #   holds a single component containing the whole index.
  #
  def to_book(options = {})
    bk = @book.deep_clone

    # XPath => URI mapping tools
    cmpt_xpaths = []

    boilerplate_rel_links =
      '<link rel="start" href="cover.html" />' +
      '<link rel="contents" href="toc.html" />'

    # Componentizing.
    if options[:componentize]
      componentizer = Peregrin::Componentizer.new(index)
      componentizer.process(index.root.at_css('body'))
      bk.components = componentizer.component_xpaths.collect { |xpath|
        # The xpath must be registered before its URI can be looked up.
        cmpt_xpaths.push(xpath)
        cmpt_doc = componentizer.generate_component(xpath)
        Peregrin::Component.new(uri_for_xpath(xpath, cmpt_xpaths), cmpt_doc)
      }

      # Add rel links and convert to html string
      first_path = bk.components.first.src
      last_path = bk.components.last.src
      boilerplate_rel_links +=
        '<link rel="first" href="'+first_path+'" />' +
        '<link rel="last" href="'+last_path+'" />'
      bk.components.each_with_index { |cmpt, i|
        head = cmpt.contents.at_xpath(HEAD_XPATH)
        prev_path = bk.components[i-1].src if (i-1) >= 0
        next_path = bk.components[i+1].src if (i+1) < bk.components.size
        head.add_child(boilerplate_rel_links)
        head.add_child('<link rel="prev" href="'+prev_path+'" />') if prev_path
        head.add_child('<link rel="next" href="'+next_path+'" />') if next_path
        cmpt.contents = htmlize(cmpt.contents)
      }
    else
      cmpt_xpaths.push(BODY_XPATH)
      bk.components.clear
      bk.add_component(uri_for_xpath(BODY_XPATH), htmlize(index))
    end

    # Outlining.
    bk.chapters = outline_book(index, cmpt_xpaths)

    if options[:componentize]
      # Table of Contents
      doc = Nokogiri::HTML::Builder.new(:encoding => 'UTF-8') { |html|
        curse = lambda { |children|
          # Chapters that cannot be linked to (nor any descendant) are skipped.
          parts = children.collect { |chp|
            chp.empty_leaf? ? nil : [chp.title, chp.src, chp.children]
          }.compact

          if parts.any?
            html.ol {
              parts.each { |part|
                html.li {
                  html.a(part[0], :href => part[1])
                  curse.call(part[2]) if part[2].any?
                }
              }
            }
          end
        }
        curse.call(bk.chapters)
      }.doc
      if doc.root
        toc_doc = componentizer.generate_document(doc.root)
        toc_doc.at_xpath(HEAD_XPATH).add_child(boilerplate_rel_links)
        bk.add_component(
          "toc.html",
          htmlize(toc_doc),
          nil,
          :linear => "no",
          :guide => "Table of Contents",
          :guide_type => "toc"
        )
      end

      # List of Illustrations
      figures = index.css('figure[id], div.figure[id]')
      if figures.any?
        doc = Nokogiri::HTML::Builder.new(:encoding => 'UTF-8') { |html|
          html.ol {
            figures.each { |fig|
              next unless (caption = fig.at_css('figcaption, .figcaption'))
              # Walk up from the figure to the nearest node that is a component.
              n = fig
              while n && n.respond_to?(:parent)
                break if (cmpt_uri = uri_for_xpath(n.path, cmpt_xpaths))
                n = n.parent
              end
              next unless cmpt_uri
              html.li {
                html.a(caption.content, :href => "#{cmpt_uri}##{fig['id']}")
              }
            }
          }
        }.doc
        loi_doc = componentizer.generate_document(doc.root)
        loi_doc.at_xpath(HEAD_XPATH).add_child(boilerplate_rel_links)
        bk.add_component(
          "loi.html",
          htmlize(loi_doc),
          nil,
          :linear => "no",
          :guide => "List of Illustrations",
          :guide_type => "loi"
        )
      end

      # Cover
      doc = Nokogiri::HTML::Builder.new(:encoding => 'UTF-8') { |html|
        html.div(:id => "cover") {
          html.img(:src => bk.cover.src, :alt => bk.property_for("title"))
        }
      }.doc
      cover_doc = componentizer.generate_document(doc.root)
      cover_doc.at_xpath(HEAD_XPATH).add_child(boilerplate_rel_links)
      # The cover goes first, ahead of the content components.
      bk.components.unshift(
        Peregrin::Component.new(
          "cover.html",
          htmlize(cover_doc),
          nil,
          :linear => "no",
          :guide => "Cover",
          :guide_type => "cover"
        )
      )
    end

    bk
  end


  protected

    # The parsed index document: the contents of the book's first component,
    # parsed once and memoized.
    #
    def index
      @index_document ||= Nokogiri::HTML::Document.parse(
        @book.components.first.contents
      )
    end


    # Takes a book with multiple components and joins them together,
    # by creating article elements from every body element and appending them
    # to the body of the first component.
    #
    def stitch_components(book)
      node = Nokogiri::XML::Node.new('article', index)
      bdy = index.at_xpath(BODY_XPATH)
      head = index.at_xpath(HEAD_XPATH)
      bdy.children.each { |ch|
        node.add_child(ch)
      }
      bdy.add_child(node)

      # The first component is already represented by the index document.
      book.components.shift
      while (cmpt = book.components.shift)
        str = cmpt.contents
        doc = Nokogiri::HTML::Document.parse(str)
        art = doc.at_xpath(BODY_XPATH)
        art.name = 'article'
        bdy.add_child(art)

        # Import all other unique elements from the head, like link & meta tags.
        if (dhead = doc.at_xpath(HEAD_XPATH))
          dhead.children.each { |foreign_child|
            next if foreign_child.name.downcase == "title"
            next if head.children.any? { |index_child|
              index_child.to_s == foreign_child.to_s
            }
            head.add_child(foreign_child.dup)
          }
        end
      end
      book.components.clear
      book.add_component(uri_for_xpath(BODY_XPATH), htmlize(index))
    end


    # Takes the properties out of the book and ensures that there are matching
    # meta tags in the index document. Existing meta[name] tags in the head
    # are removed first.
    #
    def consolidate_properties(book)
      head = index.at_xpath(HEAD_XPATH)
      head.css('meta[name]').each { |meta| meta.remove }
      book.properties.each { |property|
        # FIXME: handle properties with attributes?
        meta = Nokogiri::XML::Node.new('meta', index)
        meta['name'] = property.key
        meta['content'] = property.value
        head.add_child(meta)
      }
    end


    # Builds the chapter tree for doc using a Peregrin::Outliner (created on
    # first use and then reused). Each chapter's src is the URI of the nearest
    # enclosing component listed in cmpt_xpaths, plus "#id" when the section
    # heading has a non-empty id.
    #
    def outline_book(doc, cmpt_xpaths = [BODY_XPATH])
      unless defined?(@outliner) && @outliner
        @outliner = Peregrin::Outliner.new(doc)
        @outliner.process(doc.at_css('body'))
      end

      i = 0
      curse = lambda { |sxn|
        chapter = Peregrin::Chapter.new(sxn.heading_text, i+=1)

        # identify any relevant child sections
        children = sxn.sections.collect { |ch|
          curse.call(ch) unless ch.empty?
        }.compact
        chapter.children = children if children.any?

        # Find the component parent
        n = sxn.node || sxn.heading
        while n && n.respond_to?(:parent)
          break if (cmpt_uri = uri_for_xpath(n.path, cmpt_xpaths))
          n = n.parent
        end

        if cmpt_uri
          # get URI for section
          sid = sxn.heading['id'] if sxn.heading
          cmpt_uri += "#"+sid if sid && !sid.empty?
          chapter.src = cmpt_uri
        end

        chapter
      }

      result = curse.call(@outliner.result_root).children
      # Unwrap levels that consist of a single untitled chapter.
      while result && result.length == 1 && result.first.title.nil?
        result = result.first.children
      end
      result
    end


    # Maps a component xpath to its file name: the first entry of cmpt_xpaths
    # is "index.html", later entries are "partNNN.html" (NNN being the
    # zero-padded position). Returns nil if xpath is not in cmpt_xpaths.
    #
    def uri_for_xpath(xpath, cmpt_xpaths = [BODY_XPATH])
      i = cmpt_xpaths.index(xpath)
      return nil if i.nil?
      (i == 0) ? "index.html" : "part#{"%03d" % i}.html"
    end


    # Serializes the document's root element with an HTML5 doctype in front.
    #
    def htmlize(doc)
      "<!DOCTYPE html>\n"+doc.root.to_html
    end


    # Returns PNG data for the resource, or nil if resource is nil.
    # A resource whose src ends in ".png" is returned as-is; anything else is
    # piped through the external `convert` utility.
    #
    # Raises ConvertUtilityMissing if `which convert` finds nothing.
    #
    def to_png_data(resource)
      return if resource.nil?
      if File.extname(resource.src) == ".png"
        return @book.read_resource(resource)
      else
        # Backticks return a String, which is always truthy - so the output
        # has to be checked for emptiness to detect a missing utility.
        raise ConvertUtilityMissing if `which convert`.strip.empty?
        out = nil
        IO.popen("convert - png:-", "r+") { |io|
          io.write(@book.read_resource(resource))
          io.close_write
          out = io.read
        }
        out
      end
    end


    # Copies every meta[name] tag in the head of the book's first component
    # into the book's properties.
    #
    # NB: being a class method, this is not affected by "protected" above.
    #
    def self.extract_properties_from_index(book)
      doc = Nokogiri::HTML::Document.parse(
        book.components.first.contents
      )
      doc.css('html head meta[name]').each { |meta|
        name = meta['name']
        content = meta['content']
        book.add_property(name, content)
      }
    end


  # Base class for everything raised by Zhook.validate. Remembers the path
  # of the offending file.
  #
  class ValidationError < ::RuntimeError

    attr_reader :path

    def initialize(path = nil)
      @path = path
    end

  end

  class FileNotFound < ValidationError; end
  class WrongExtension < ValidationError; end
  class NotAZipArchive < ValidationError; end
  class MissingIndexHTML < ValidationError; end
  class MissingCoverPNG < ValidationError; end
  class IndexHTMLRootHasId < ValidationError; end

  # Raised by to_png_data when the cover needs converting to PNG and the
  # `convert` utility cannot be found.
  class ConvertUtilityMissing < RuntimeError; end

end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# The format-neutral representation of a book: its components, chapters,
# properties, resources and cover, plus a hook for reading resource data.
#
class Peregrin::Book

  # Unique identifier for this book
  attr_accessor :identifier

  # An array of Components
  attr_accessor :components

  # A tree of Chapters. Top-level chapters in this array, each with
  # children arrays.
  attr_accessor :chapters

  # An array of Properties.
  attr_accessor :properties

  # An array of Resources.
  attr_accessor :resources

  # A Resource that is used for the book cover.
  attr_accessor :cover

  # A proc that is called with a resource and returns its contents
  # (see read_resource).
  attr_writer :read_resource_proc


  def initialize
    @components = []
    @chapters = []
    @properties = []
    @resources = []
  end


  # Components followed by resources, as one new array.
  #
  def all_files
    @components + @resources
  end


  # Creates a Component from args, appends it to components and returns it.
  #
  def add_component(*args)
    @components.push(Peregrin::Component.new(*args)).last
  end


  # Creates a Resource from args, appends it to resources and returns it.
  #
  def add_resource(*args)
    @resources.push(Peregrin::Resource.new(*args)).last
  end


  # Creates a Chapter from args, appends it to the top-level chapters and
  # returns it.
  #
  def add_chapter(*args)
    @chapters.push(Peregrin::Chapter.new(*args)).last
  end


  # Creates a Property from args, appends it to properties and returns it.
  #
  def add_property(*args)
    @properties.push(Peregrin::Property.new(*args)).last
  end


  # The value of the first property whose key equals key.to_s, or nil if
  # there is no such property.
  #
  def property_for(key)
    key = key.to_s
    prop = @properties.detect { |p| p.key == key }
    prop ? prop.value : nil
  end


  # Hands resource_path to the read_resource_proc untouched and returns
  # whatever the proc returns. Returns nil if no proc has been set.
  #
  def read_resource(resource_path)
    @read_resource_proc.call(resource_path) if @read_resource_proc
  end


  # Writes the data returned by read_resource to the file at dest_path.
  #
  def copy_resource_to(resource_path, dest_path)
    # Binary mode: resources may be images, which must be written verbatim.
    File.open(dest_path, 'wb') { |f|
      f << read_resource(resource_path)
    }
  end


  # Returns a copy of the book made by marshalling, so that nothing is
  # shared with the original except the read_resource_proc.
  #
  # Raises whatever Marshal raises if the book holds something that cannot
  # be dumped; the original book is left intact in that case.
  #
  def deep_clone
    @read_resource_proc ||= nil
    tmp = @read_resource_proc
    # Procs cannot be marshalled, so the proc is detached while dumping...
    @read_resource_proc = nil
    begin
      copy = Marshal.load(Marshal.dump(self))
    ensure
      # ...and always re-attached, even if the dump failed.
      @read_resource_proc = tmp
    end
    copy.read_resource_proc = tmp
    copy
  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# A chapter is one heading-bearing section of a book. Sections nest, so
# chapters form a tree through their children arrays.
#
# TODO: flag whether a chapter is linkable?
#
class Peregrin::Chapter

  attr_accessor :title, :src, :children, :position

  # title - heading text; carriage returns and newlines each become a space.
  # pos   - position of the chapter, stored as an integer.
  # src   - URI the chapter links to, if any.
  #
  def initialize(title, pos, src = nil)
    if title
      @title = title.tr("\r\n", ' ')
    end
    @src = src
    @position = pos.to_i
    @children = []
  end


  # Builds a new chapter, appends it to this chapter's children and
  # returns the new chapter.
  #
  def add_child(child_title, child_pos, child_src = nil)
    Peregrin::Chapter.new(child_title, child_pos, child_src).tap { |chp|
      children.push(chp)
    }
  end


  # True when neither this chapter nor anything beneath it can be linked to.
  # Such chapters are typically left out of a Table of Contents.
  #
  def empty_leaf?
    return false unless src.nil?
    children.all? { |descendant| descendant.empty_leaf? }
  end

end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# One section of the book's linear text: a Resource that also carries its
# contents.
#
class Peregrin::Component < Peregrin::Resource

  attr_accessor :contents

  # Stores contents, then hands src, media_type and attributes on to
  # Resource.
  #
  def initialize(src, contents = nil, media_type = nil, attributes = {})
    self.contents = contents
    super(src, media_type, attributes)
  end

end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# Works out which nodes of a document can stand alone as components, and
# generates a standalone document for each of them.
#
class Peregrin::Componentizer

  # XPaths of the componentizable nodes found by the last call to process.
  attr_reader :component_xpaths


  def initialize(doc)
    @document = doc
    @component_xpaths = []
  end


  # Build a list of xpaths for nodes that can be turned into standalone
  # components. Nodes that would be left empty once their componentizable
  # children are taken out are dropped from the list.
  #
  # Returns the list (the same array that component_xpaths returns).
  #
  def process(from)
    @component_xpaths = []
    walk(from)
    # reject! answers nil when nothing was removed, so it is not used as the
    # return value.
    @component_xpaths.reject! { |xpath| emptied?(xpath) }
    @component_xpaths
  end


  # Generates the standalone document for the component at xpath.
  #
  # Raises a RuntimeError if xpath is not in component_xpaths.
  #
  def generate_component(xpath)
    raise "Not a component: #{xpath}" unless @component_xpaths.include?(xpath)
    node = @document.at_xpath(xpath)
    generate_document(node)
  end


  # Creates a new document with the same root and head nodes, but with
  # a body that just contains the given node.
  #
  def generate_document(node)
    # Clean up the "shell" document.
    @shell_document ||= @document.dup
    bdy = @shell_document.at_xpath('/html/body')
    bdy.children.remove

    # Create a deep clone of the node. Remove any children of it that are
    # componentizable in their own right.
    ndup = node.dup
    # All matches are collected before any is unlinked: the match is made by
    # path, and paths are computed before the clone's children change.
    nested = node.children.collect { |ch|
      next unless component_xpaths.include?(ch.path)
      dpath = ch.path.sub(/^#{Regexp.escape(node.path)}/, ndup.path)
      ndup.children.detect { |dch| dch.path == dpath }
    }.compact
    nested.each { |dch| dch.unlink }

    # Append the node to the body of the shell (or replace the body, if
    # the node is a body itself).
    if node.name.downcase == "body"
      bdy.replace(ndup)
    else
      bdy.add_child(ndup)
    end

    # The shell is reused on the next call, so callers get a copy.
    @shell_document.dup
  end


  # Writes the componentizable node at the given xpath to the given
  # filesystem path.
  #
  # If you provide a block, you get the new document object,
  # and you are expected to return the string containing its HTML form --
  # in this way you can tweak the HTML output. Default is simply: doc.to_html
  #
  # Returns the string that was written.
  #
  def write_component(xpath, path, &blk)
    new_doc = generate_component(xpath)
    out = block_given? ? blk.call(new_doc) : new_doc.to_html
    File.open(path, 'w') { |f| f.write(out) }
    out
  end


  protected

    # The recursive method for walking the tree - checks if the current node
    # is a component, then checks each child of the current node.
    #
    def walk(node)
      return unless componentizable?(node)
      @component_xpaths.push(node.path)
      node.children.each { |c| walk(c) }
    end


    # True if the node meets the criteria for being componentizable:
    # 1) Is a body or article element (or a div.article)?
    # 2) Are all subsequent siblings also componentizable?
    #
    def componentizable?(node)
      current = node
      loop do
        return false unless component_element?(current)
        current = current.next
        break unless current
      end
      true
    end


    # True if the node is a body or article element, or a div whose class
    # attribute contains the word "article".
    #
    def component_element?(node)
      tag = node.name.downcase
      return true if %w[body article].include?(tag)
      return false unless tag == "div" && node['class']
      node['class'].match(/\barticle\b/) ? true : false
    end


    # True if all children are either componentizable or blank text nodes.
    #
    def emptied?(xpath)
      node = @document.at_xpath(xpath)
      node.children.all? { |ch|
        @component_xpaths.include?(ch.path) ||
        (ch.text? && ch.content.strip.empty?)
      }
    end

end
|